Repository: JuliaFolds2/OhMyThreads.jl Branch: master Commit: fee46873b185 Files: 54 Total size: 222.3 KB Directory structure: gitextract_49advobv/ ├── .JuliaFormatter.toml ├── .github/ │ ├── dependabot.yml │ └── workflows/ │ ├── changelog.yml │ ├── ci.yml │ ├── compathelper.yml │ ├── documentation.yml │ ├── downgrade_CI.yml │ └── tagbot.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── Project.toml ├── README.md ├── docs/ │ ├── Project.toml │ ├── build_docs.jl │ ├── make.jl │ └── src/ │ ├── basics.md │ ├── index.md │ ├── literate/ │ │ ├── Project.toml │ │ ├── boxing/ │ │ │ ├── Project.toml │ │ │ ├── boxing.jl │ │ │ └── boxing.md │ │ ├── falsesharing/ │ │ │ ├── Project.toml │ │ │ ├── falsesharing.jl │ │ │ └── falsesharing.md │ │ ├── integration/ │ │ │ ├── Project.toml │ │ │ ├── integration.jl │ │ │ └── integration.md │ │ ├── juliaset/ │ │ │ ├── Project.toml │ │ │ ├── juliaset.jl │ │ │ └── juliaset.md │ │ ├── mc/ │ │ │ ├── Project.toml │ │ │ ├── mc.jl │ │ │ └── mc.md │ │ ├── tls/ │ │ │ ├── Project.toml │ │ │ ├── tls.jl │ │ │ └── tls.md │ │ └── tomarkdown.sh │ ├── refs/ │ │ ├── api.md │ │ ├── experimental.md │ │ └── internal.md │ └── translation.md ├── ext/ │ └── MarkdownExt.jl ├── src/ │ ├── OhMyThreads.jl │ ├── experimental.jl │ ├── functions.jl │ ├── implementation.jl │ ├── macro_impl.jl │ ├── macros.jl │ ├── schedulers.jl │ ├── tools.jl │ └── types.jl └── test/ ├── Aqua.jl └── runtests.jl ================================================ FILE CONTENTS ================================================ ================================================ FILE: .JuliaFormatter.toml ================================================ style = "sciml" ================================================ FILE: .github/dependabot.yml ================================================ # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates version: 2 updates: - package-ecosystem: "github-actions" directory: "/" # Location of package 
manifests schedule: interval: "monthly" ================================================ FILE: .github/workflows/changelog.yml ================================================ name: changelog on: pull_request: types: [opened, synchronize, reopened, ready_for_review, labeled, unlabeled] jobs: # Enforces the update of a changelog file on every pull request # Can be skipped with the `Skip-Changelog` label changelog: runs-on: ubuntu-latest steps: - uses: dangoslen/changelog-enforcer@v3 ================================================ FILE: .github/workflows/ci.yml ================================================ name: CI on: - push - pull_request jobs: test: name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: version: - '1.10' - 'pre' os: - ubuntu-latest - windows-latest arch: - x64 include: - os: macos-latest arch: aarch64 version: '1.10' - os: macos-latest arch: aarch64 version: 'pre' steps: - uses: actions/checkout@v6 - uses: julia-actions/setup-julia@v3 with: version: ${{ matrix.version }} arch: ${{ matrix.arch }} - uses: julia-actions/cache@v2 - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 env: JULIA_NUM_THREADS: 4,2 - uses: julia-actions/julia-processcoverage@v1 - uses: codecov/codecov-action@v5 with: files: lcov.info ================================================ FILE: .github/workflows/compathelper.yml ================================================ name: CompatHelper on: schedule: - cron: 0 0 * * * workflow_dispatch: permissions: contents: write pull-requests: write jobs: CompatHelper: runs-on: ubuntu-latest steps: - name: Check if Julia is already available in the PATH id: julia_in_path run: which julia continue-on-error: true - name: Install Julia, but only if it is not already available in the PATH uses: julia-actions/setup-julia@v3 with: version: '1' arch: ${{ runner.arch }} if: steps.julia_in_path.outcome != 
'success' - name: "Add the General registry via Git" run: | import Pkg ENV["JULIA_PKG_SERVER"] = "" Pkg.Registry.add("General") shell: julia --color=yes {0} - name: "Install CompatHelper" run: | import Pkg name = "CompatHelper" uuid = "aa819f21-2bde-4658-8897-bab36330d9b7" version = "3" Pkg.add(; name, uuid, version) shell: julia --color=yes {0} - name: "Run CompatHelper" run: | import CompatHelper CompatHelper.main() shell: julia --color=yes {0} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }} # COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }} ================================================ FILE: .github/workflows/documentation.yml ================================================ name: Documentation on: push: branches: - master tags: '*' paths: - 'docs/**' - 'src/**' pull_request: paths: - 'docs/**' - 'src/**' concurrency: # Skip intermediate builds: always. # Cancel intermediate builds: always. group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true jobs: build: permissions: contents: write statuses: write runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - uses: julia-actions/setup-julia@latest with: version: '1' - name: Build and deploy env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # For authentication with GitHub Actions token DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} # For authentication with SSH deploy key run: julia docs/build_docs.jl ================================================ FILE: .github/workflows/downgrade_CI.yml ================================================ name: Downgrade on: pull_request: branches: - master paths-ignore: - 'docs/**' push: branches: - master paths-ignore: - 'docs/**' jobs: test: runs-on: ubuntu-latest strategy: matrix: version: ['1'] steps: - uses: actions/checkout@v6 - uses: julia-actions/setup-julia@v3 with: version: ${{ matrix.version }} - uses: cjdoris/julia-downgrade-compat-action@v1 with: skip: Pkg,TOML,Test,Markdown - uses: 
julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 ================================================ FILE: .github/workflows/tagbot.yml ================================================ name: TagBot on: issue_comment: types: - created workflow_dispatch: inputs: lookback: default: 3 permissions: actions: read checks: read contents: write deployments: read issues: read discussions: read packages: read pages: read pull-requests: read repository-projects: read security-events: read statuses: read jobs: TagBot: if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot' runs-on: ubuntu-latest steps: - uses: JuliaRegistries/TagBot@v1 with: token: ${{ secrets.GITHUB_TOKEN }} # Edit the following line to reflect the actual name of the GitHub Secret containing your private key ssh: ${{ secrets.DOCUMENTER_KEY }} # ssh: ${{ secrets.NAME_OF_MY_SSH_PRIVATE_KEY_SECRET }} ================================================ FILE: .gitignore ================================================ docs/build Manifest.toml .vscode *~ ================================================ FILE: CHANGELOG.md ================================================ OhMyThreads.jl Changelog ========================= Unreleased ------------ - ![Enhancement][badge-enhancement] `SerialScheduler` now accepts and ignores arguments passed to it to make switching schedulers easier [#162][gh-pr-162]. Version 0.8.3 ------------ - ![Enhancement][badge-enhancement] The overhead of `tmapreduce` in the serial case was reduced a bit. Sentinel values in scheduler kwarg internals were replaced by `nothing` [#148][gh-pr-148] Version 0.8.2 ------------ - ![Feature][badge-feature] Added a `minchunksize` chunking argument for schedulers, so that they can specify a lower bound on the size of chunks which are worth parallelizing. For example, `treduce(+, 1:10; minchunksize=100)` will run serially, but `treduce(+, 1:1000000; minchunksize=100)` will be parallelized [#145][gh-pr-145]. 
- ![Enhancement][badge-enhancement] Operations on collections with only one 'chunk' no longer spawn an unnecessary task. That means operations like `treduce(+, 1:10; minchunksize=100)` will have less overhead [#145][gh-pr-145]. Version 0.8.1 ------------ - ![Feature][badge-feature] Added a `@localize` macro which turns `@localize x y expr` into `let x=x, y=y; expr end` ([#142][gh-pr-142]) - ![INFO][badge-info] The error message for captured variables now has a longer error hint that displays when the `Markdown` package is loaded (e.g. in the REPL.) ([#142][gh-pr-142]) Version 0.8.0 ------------- - ![BREAKING][badge-breaking] We now detect and throw errors if an `OhMyThreads` parallel function is passed a closure containing a `Box`ed variable. This behaviour can be disabled with the new `@allow_boxed_captures` macro, and re-enabled with `@disallow_boxed_captures`. ([#141][gh-pr-141]) - ![INFO][badge-info] Scheduler chunking info is no longer directly available via `getproperty`. This was never a public interface, but it's possible some users relied upon it [#135][gh-pr-135]. Version 0.7.0 ------------- - ![BREAKING][badge-breaking] We now use ChunkSplitters version 3.0. The function `OhMyThreads.chunks` has been renamed to `OhMyThreads.index_chunks`. The new functions `index_chunks` and `chunks` (different from the old one with the same name!) are now exported. See ChunkSplitters.jl for more information. - ![BREAKING][badge-breaking] If you provide a `chunks` or `index_chunks` as input we now disable the internal chunking without a warning. Previously, we did show a warning unless you had set `chunking=false`. In contrast, we now throw an error when you set any incompatible chunking related keyword arguments. - ![Deprecation][badge-deprecation] The `split` options `:batch` and `:scatter` are now deprecated (they still work but will be dropped at some point). Use `:consecutive` and `:roundrobin`, respectively, instead.
- ![Enhancement][badge-enhancement] The `split` keyword argument can now also be a `<: OhMyThreads.Split`. Compared to providing a `Symbol`, the former can potentially give better performance. For example, you can replace `:consecutive` by `OhMyThreads.Consecutive()` and `:roundrobin` by `OhMyThreads.RoundRobin()`. - ![Feature][badge-feature] `ChannelLike` is a new public (but not exported) type. `ChannelLike(itr)` provides a way to iterate over `itr` in a concurrency safe manner similar to `Channel`. See the docstring for more details. ([#121][gh-pr-121]) - ![Enhancement][badge-enhancement] `ChannelLike` is used internally for the `GreedyScheduler` when `chunking=true`. This improves performance overall but it is especially noticeable when the number of chunks is large. ([#121][gh-pr-121]) Version 0.6.2 ------------- - ![Enhancement][badge-enhancement] Added API support for `enumerate(chunks(...))`. Best used in combination with `chunking=false` Version 0.6.1 ------------- Version 0.6.0 ------------- - ![BREAKING][badge-breaking] Drop support for Julia < 1.10. Version 0.5.3 ------------- - ![Enhancement][badge-enhancement] For the special/fake "macros" like, e.g., `@set`, support the verbose form `OhMyThreads.@set` within a `@tasks` for-loop (#107). Version 0.5.2 ------------- - ![Enhancement][badge-enhancement] For empty input (e.g. `Float64[]` or `11:10`) behavior is now aligned with the serial functions in `Base`. Version 0.5.1 ------------- - ![Feature][badge-feature] Within a parallel `@tasks` block one can now mark a region with `@one_by_one`. This region will be run by one task at a time ("critical region"). - ![Feature][badge-feature] Within a `@tasks` block one can now mark a region with `@only_one`. This region will be run by a single parallel task only (other tasks will skip over it). - ![Experimental][badge-experimental] Added tentative support for `@barrier` in `@tasks` blocks. See `?OhMyThreads.Tools.@barrier` for more information.
Note that this feature is experimental and **not** part of the public API (i.e. doesn't fall under SemVer). - ![Info][badge-info] Compat bounds for [BangBang.jl](https://github.com/JuliaFolds2/BangBang.jl) have been relaxed to include v0.3.40 Version 0.5.0 ------------- - ![Feature][badge-feature] The parallel functions (e.g. tmapreduce etc.) now support `scheduler::Symbol` besides `scheduler::Scheduler`. To configure the selected scheduler (e.g. set `nchunks` etc.) one may now pass keyword arguments directly into the parallel functions (they will get passed on to the scheduler constructor). Example: `tmapreduce(sin, +, 1:10; chunksize=2, scheduler=:static)`. Analogous support has been added to the macro API: (Most) settings (`@set name = value`) will now be passed on to the parallel functions as keyword arguments (which then forward them to the scheduler constructor). Note that, to avoid ambiguity, we don't support this feature for `scheduler::Scheduler` but only for `scheduler::Symbol`. - ![Feature][badge-feature] Added a `SerialScheduler` that can be used to turn off any multithreading. - ![Feature][badge-feature] Added `OhMyThreads.WithTaskLocals` that represents a closure over `TaskLocalValues`, but can have those values materialized as an optimization (using `OhMyThreads.promise_task_local`) - ![Feature][badge-feature] In the case `nchunks > nthreads()`, the `StaticScheduler` now distributes chunks in a round-robin fashion (instead of either implicitly decreasing `nchunks` to `nthreads()` or throwing an error). - ![Feature][badge-feature] `@set init = ...` may now be used to specify an initial value for a reduction (only has an effect in conjunction with `@set reducer=...` and triggers a warning otherwise). - ![Enhancement][badge-enhancement] `SerialScheduler` and `DynamicScheduler` now support the keyword argument `ntasks` as an alias for `nchunks`.
- ![Enhancement][badge-enhancement] Made `@tasks` use `OhMyThreads.WithTaskLocals` automatically as an optimization. - ![Enhancement][badge-enhancement] Uses of `@local` within `@tasks` no-longer require users to declare the type of the task local value, it can be inferred automatically if a type is not provided. - ![Enhancement][badge-enhancement] Made `using OhMyThreads: ...` more explicit in examples in the documentation and docstrings. - ![BREAKING][badge-breaking] The `DynamicScheduler` (default) and the `StaticScheduler` now support a `chunksize` argument to specify the desired size of chunks instead of the number of chunks (`nchunks`). Note that `chunksize` and `nchunks` are mutually exclusive. (This is unlikely to break existing code but technically could because the type parameter has changed from `Bool` to `ChunkingMode`.) - ![BREAKING][badge-breaking] The greedy scheduler now supports chunking (similar to the static and dynamic scheduler). You can opt into it with, e.g., `chunking=true`. (This is unlikely to break existing code but technically could because we introduced a new type parameter for `GreedyScheduler`.) - ![Breaking][badge-breaking] `DynamicScheduler` and `StaticScheduler` don't support `nchunks=0` or `chunksize=0` any longer. Instead, chunking can now be turned off via an explicit new keyword argument `chunking=false`. - ![BREAKING][badge-breaking] Within a `@tasks` block, task-local values must from now on be defined via `@local` instead of `@init` (renamed). - ![BREAKING][badge-breaking] The (already deprecated) `SpawnAllScheduler` has been dropped. - ![BREAKING][badge-breaking] The default value for `ntasks`/`nchunks` for `DynamicScheduler` has been changed from `2*nthreads()` to `nthreads()`. With the new value we now align with `@threads :dynamic`. The old value wasn't giving good load balancing anyways and choosing a higher value penalizes uniform use cases even more. To get the old behavior, set `nchunks=2*nthreads()`. 
- ![Bugfix][badge-bugfix] When using the `GreedyScheduler` in combination with `tmapreduce` (or functions that build upon it) there could be non-deterministic errors in some cases (small input collection, not much work per element, see [#82](https://github.com/JuliaFolds2/OhMyThreads.jl/issues/82)). These cases should be fixed now. - ![Bugfix][badge-bugfix] We now handle empty collections as input in `tmapreduce` and `tforeach` explicitly ([#86](https://github.com/JuliaFolds2/OhMyThreads.jl/issues/86)). Our general philosophy is to try to match the behavior of the serial `Base` functions. Version 0.4.6 ------------- - ![Feature][badge-feature] Introduction of macro API (`@tasks`) that transforms for loops into corresponding `tforeach`, `tmapreduce`, and `tmap` calls. This new API enables us to facilitate certain patterns, like defining task local values. Version 0.4.5 ------------- - ![Enhancement][badge-enhancement] Improved the thread-safe storage section of the documentation. Version 0.4.4 ------------- - ![Bugfix][badge-bugfix] Fixed a type specification bug that could occur when passing a `Chunk` into, say, `tmapreduce`. Version 0.4.3 ------------- - ![Feature][badge-feature] Forward (but don't export) the macros `@fetch` and `@fetchfrom` from StableTasks.jl (v0.1.5), which are analogous to the same-named macros in Distributed.jl. Version 0.4.2 ------------- - ![Feature][badge-feature] `DynamicScheduler` now supports `nchunks=0`, which turns off internal chunking entirely. - ![Deprecation][badge-deprecation] `SpawnAllScheduler` is now deprecated in favor of `DynamicScheduler(; nchunks=0)`. - ![Feature][badge-feature] Partial support for passing in a `ChunkSplitters.Chunk` when using `DynamicScheduler` (default). In this case, one should generally use `DynamicScheduler(; nchunks=0)`, i.e. turn off internal chunking. - ![Feature][badge-feature] `StaticScheduler` now supports `nchunks=0`, which turns off internal chunking entirely.
Only works for input that has `<= nthreads()` elements. Version 0.4.1 ------------- - ![Feature][badge-feature] Added a new, simple `SpawnAllScheduler` that spawns a task per input element (can be a lot of tasks!). - ![Info][badge-info] Added downgrade_CI which makes sure the testsuite works on the oldest versions of dependencies. Version 0.4.0 ------------- - ![BREAKING][badge-breaking] Instead of taking keyword arguments `schedule`, `nchunks`, `split` directly, we now use `Scheduler` structs to specify scheduling options ([#22](https://github.com/JuliaFolds2/OhMyThreads.jl/issues/22)). The latter can be provided to all API functions via the new `scheduler` keyword argument. - ![BREAKING][badge-breaking] The default scheduler (`DynamicScheduler`) now, by default, creates `2*nthreads()` tasks to provide load-balancing by default. The old behavior can be restored with `DynamicScheduler(; nchunks=nthreads())`. - ![Enhancement][badge-enhancement] We reject unsupported keyword arguments early and give a more helpful error message. Version 0.3.1 ------------- - ![Bugfix][badge-bugfix] The documented Public API wasn't updated in 0.3.0 and thus out of sync with the actual API. Fixed in this version. Version 0.3.0 ------------- - ![BREAKING][badge-breaking] We don't (re-)export `chunks` anymore. Use `OhMyThreads.chunks` instead. - ![Feature][badge-feature] We now provide `OhMyThreads.TaskLocalValue` (from [TaskLocalValue.jl](https://github.com/vchuravy/TaskLocalValues.jl)) as a nice solution for task-local values. See the corresponding page in the documentation ([#25][gh-issue-25]). - ![Enhancement][badge-enhancement] Added a few missing `@views`. - ![Enhancement][badge-enhancement] Added three examples to the docs: monte carlo, julia set, and trapezoidal integration. - ![Enhancement][badge-enhancement] Improved all docstrings of the exported API functions. Keyword options are now only shown in the extended help (e.g. `??tmap`) ([#27][gh-issue-27]).
- ![Enhancement][badge-enhancement] Added a translation page that hopefully helps with the Base.Threads → OhMyThreads.jl transition ([#24][gh-issue-24]). Version 0.2.1 ------------- - ![Enhancement][badge-enhancement] Basic documentation. - ![Enhancement][badge-enhancement] Making `ChunkSplitters` available internally. Version 0.2.0 ------------- - Initial version. [badge-breaking]: https://img.shields.io/badge/BREAKING-red.svg [badge-deprecation]: https://img.shields.io/badge/Deprecation-orange.svg [badge-feature]: https://img.shields.io/badge/Feature-green.svg [badge-experimental]: https://img.shields.io/badge/Experimental-yellow.svg [badge-enhancement]: https://img.shields.io/badge/Enhancement-blue.svg [badge-bugfix]: https://img.shields.io/badge/Bugfix-purple.svg [badge-fix]: https://img.shields.io/badge/Fix-purple.svg [badge-info]: https://img.shields.io/badge/Info-gray.svg [gh-issue-27]: https://github.com/JuliaFolds2/OhMyThreads.jl/issues/27 [gh-issue-24]: https://github.com/JuliaFolds2/OhMyThreads.jl/issues/24 [gh-issue-25]: https://github.com/JuliaFolds2/OhMyThreads.jl/issues/25 [gh-pr-5]: https://github.com/JuliaFolds2/OhMyThreads.jl/pull/5 [gh-pr-121]: https://github.com/JuliaFolds2/OhMyThreads.jl/pull/121 [gh-pr-135]: https://github.com/JuliaFolds2/OhMyThreads.jl/pull/135 [gh-pr-141]: https://github.com/JuliaFolds2/OhMyThreads.jl/pull/141 [gh-pr-142]: https://github.com/JuliaFolds2/OhMyThreads.jl/pull/142 [gh-pr-145]: https://github.com/JuliaFolds2/OhMyThreads.jl/pull/145 [gh-pr-148]: https://github.com/JuliaFolds2/OhMyThreads.jl/pull/148 [gh-pr-162]: https://github.com/JuliaFolds2/OhMyThreads.jl/pull/162 ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2024 Mason Protter Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: Project.toml ================================================ name = "OhMyThreads" uuid = "67456a42-1dca-4109-a031-0a68de7e3ad5" authors = ["Carsten Bauer ", "Mason Protter "] version = "0.8.5" [deps] BangBang = "198e06fe-97b7-11e9-32a5-e1d131e6ad66" ChunkSplitters = "ae650224-84b6-46f8-82ea-d812ca08434e" ScopedValues = "7e506255-f358-4e82-b7e4-beb19740aa63" StableTasks = "91464d47-22a1-43fe-8b7f-2d57ee82463f" TaskLocalValues = "ed4db957-447d-4319-bfb6-7fa9ae7ecf34" [weakdeps] Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a" [extensions] MarkdownExt = "Markdown" [compat] Aqua = "0.8" BangBang = "0.3.40, 0.4" ChunkSplitters = "3.1" Markdown = "1" ScopedValues = "1.3" StableTasks = "0.1.5" TaskLocalValues = "0.1" Test = "1" julia = "1.10" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] test = ["Test", "Aqua"] ================================================ FILE: README.md ================================================ # OhMyThreads [docs-dev-img]: https://img.shields.io/badge/docs-dev-blue.svg [docs-dev-url]: https://JuliaFolds2.github.io/OhMyThreads.jl/dev [docs-stable-img]: 
https://img.shields.io/badge/docs-stable-blue.svg [docs-stable-url]: https://JuliaFolds2.github.io/OhMyThreads.jl/stable [ci-img]: https://github.com/JuliaFolds2/OhMyThreads.jl/actions/workflows/ci.yml/badge.svg [ci-url]: https://github.com/JuliaFolds2/OhMyThreads.jl/actions/workflows/ci.yml [cov-img]: https://codecov.io/gh/JuliaFolds2/OhMyThreads.jl/branch/master/graph/badge.svg [cov-url]: https://codecov.io/gh/JuliaFolds2/OhMyThreads.jl [lifecycle-img]: https://img.shields.io/badge/lifecycle-maturing-orange.svg [code-style-img]: https://img.shields.io/badge/code%20style-blue-4495d1.svg [code-style-url]: https://github.com/invenia/BlueStyle [aqua-img]: https://raw.githubusercontent.com/JuliaTesting/Aqua.jl/master/badge.svg [aqua-url]: https://github.com/JuliaTesting/Aqua.jl *Simple Multithreading in Julia* | **Documentation** | **Build Status** | **Quality** | |:-------------------------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------:| | [![][docs-stable-img]][docs-stable-url] [![][docs-dev-img]][docs-dev-url] | [![][ci-img]][ci-url] [![][cov-img]][cov-url] | ![][lifecycle-img] [![][aqua-img]][aqua-url] | [OhMyThreads.jl](https://github.com/JuliaFolds2/OhMyThreads.jl/) is meant to be a simple, unambitious package that provides user-friendly ways of doing [task-based](https://docs.julialang.org/en/v1/base/parallel/) multithreaded calculations in Julia. Most importantly, with a focus on [data parallelism](https://en.wikipedia.org/wiki/Data_parallelism), it provides an [API of higher-order functions](https://juliafolds2.github.io/OhMyThreads.jl/stable/refs/api/#Functions) (e.g. `tmapreduce`) as well as a [macro API](https://juliafolds2.github.io/OhMyThreads.jl/stable/refs/api/#Macros) `@tasks for ... end` (conceptually similar to `@threads`). 
## Example ```julia using OhMyThreads: tmapreduce, @tasks using BenchmarkTools: @btime using Base.Threads: nthreads # Variant 1: function API function mc_parallel(N; ntasks=nthreads()) M = tmapreduce(+, 1:N; ntasks) do i rand()^2 + rand()^2 < 1.0 end pi = 4 * M / N return pi end # Variant 2: macro API function mc_parallel_macro(N; ntasks=nthreads()) M = @tasks for i in 1:N @set begin reducer=+ ntasks=ntasks end rand()^2 + rand()^2 < 1.0 end pi = 4 * M / N return pi end N = 100_000_000 mc_parallel(N) # gives, e.g., 3.14159924 @btime mc_parallel($N; ntasks=1) # use a single task (and hence a single thread) @btime mc_parallel($N) # using all threads @btime mc_parallel_macro($N) # using all threads ``` With 5 threads, timings might be something like this: ``` 417.282 ms (14 allocations: 912 bytes) 83.578 ms (38 allocations: 3.08 KiB) 83.573 ms (38 allocations: 3.08 KiB) ``` (Check out the full [Parallel Monte Carlo](https://juliafolds2.github.io/OhMyThreads.jl/stable/literate/mc/mc/) example if you like.) ## Documentation For more information, please check out the [documentation](https://JuliaFolds2.github.io/OhMyThreads.jl/stable) of the latest release (or the [development version](https://JuliaFolds2.github.io/OhMyThreads.jl/dev) if you're curious). 
================================================ FILE: docs/Project.toml ================================================ [deps] Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" DocumenterInterLinks = "d12716ef-a0f6-4df4-a9f1-a5a34e75c656" DocumenterTools = "35a29f4d-8980-5a13-9543-d66fff28ecb8" [compat] Documenter = "1.3" DocumenterInterLinks = "1" DocumenterTools = "0.1" ================================================ FILE: docs/build_docs.jl ================================================ cd(@__DIR__) println("--- :julia: Instantiating project") using Pkg Pkg.activate("..") Pkg.instantiate() Pkg.activate(".") Pkg.instantiate() push!(LOAD_PATH, joinpath(@__DIR__, "..")) deleteat!(LOAD_PATH, 2) println("+++ :julia: Building documentation") include("make.jl") ================================================ FILE: docs/make.jl ================================================ using Documenter using DocumenterInterLinks using OhMyThreads const ci = get(ENV, "CI", "") == "true" links = InterLinks( "ChunkSplitters" => ( "https://juliafolds2.github.io/ChunkSplitters.jl/stable/", "https://juliafolds2.github.io/ChunkSplitters.jl/stable/objects.inv", joinpath(@__DIR__, "inventories", "ChunkSplitters.toml") ), ); @info "Generating Documenter.jl site" makedocs(; sitename = "OhMyThreads.jl", authors = "Carsten Bauer, Mason Protter", modules = [OhMyThreads], checkdocs = :exports, doctest = false, pages = [ "OhMyThreads" => "index.md", "Examples" => [ "Parallel Monte Carlo" => "literate/mc/mc.md", "Julia Set" => "literate/juliaset/juliaset.md", "Trapezoidal Integration" => "literate/integration/integration.md" ], "Translation Guide" => "translation.md", "Boxed Variables" => "literate/boxing/boxing.md", "Thread-Safe Storage" => "literate/tls/tls.md", "False Sharing" => "literate/falsesharing/falsesharing.md", # "Explanations" => [ # "Task-Based Multithreading" => "explain/taskbasedmt.md", # ], "API" => [ "Public API" => "refs/api.md", "Experimental" => 
"refs/experimental.md", "Internal" => "refs/internal.md" ] ], repo = "https://github.com/JuliaFolds2/OhMyThreads.jl/blob/{commit}{path}#{line}", format = Documenter.HTML(repolink = "https://github.com/JuliaFolds2/OhMyThreads.jl"; collapselevel = 1), plugins = [links],) if ci @info "Deploying documentation to GitHub" deploydocs(; repo = "github.com/JuliaFolds2/OhMyThreads.jl.git", devbranch = "master", push_preview = true) end ================================================ FILE: docs/src/basics.md ================================================ # Basics This section is still in preparation. For now, you might want to take a look at the [translation guide](@ref TG) and the examples. ================================================ FILE: docs/src/index.md ================================================ # OhMyThreads.jl [OhMyThreads.jl](https://github.com/JuliaFolds2/OhMyThreads.jl/) is meant to be a simple, unambitious package that provides user-friendly ways of doing [task-based](https://docs.julialang.org/en/v1/base/parallel/) multithreaded calculations in Julia. Most importantly, with a focus on [data parallelism](https://en.wikipedia.org/wiki/Data_parallelism), it provides an [API of higher-order functions](https://juliafolds2.github.io/OhMyThreads.jl/stable/refs/api/#Functions) (e.g. `tmapreduce`) as well as a [macro API](https://juliafolds2.github.io/OhMyThreads.jl/stable/refs/api/#Macros) `@tasks for ... end` (conceptually similar to `@threads`). ## Quick Start The package is registered. Hence, you can simply use ``` ] add OhMyThreads ``` to add the package to your Julia environment. 
### Basic example ```julia using OhMyThreads: tmapreduce, @tasks using BenchmarkTools: @btime using Base.Threads: nthreads # Variant 1: function API function mc_parallel(N; ntasks=nthreads()) M = tmapreduce(+, 1:N; ntasks) do i rand()^2 + rand()^2 < 1.0 end pi = 4 * M / N return pi end # Variant 2: macro API function mc_parallel_macro(N; ntasks=nthreads()) M = @tasks for i in 1:N @set begin reducer=+ ntasks=ntasks end rand()^2 + rand()^2 < 1.0 end pi = 4 * M / N return pi end N = 100_000_000 mc_parallel(N) # gives, e.g., 3.14159924 @btime mc_parallel($N; ntasks=1) # use a single task (and hence a single thread) @btime mc_parallel($N) # using all threads @btime mc_parallel_macro($N) # using all threads ``` With 5 threads, timings might be something like this: ``` 417.282 ms (14 allocations: 912 bytes) 83.578 ms (38 allocations: 3.08 KiB) 83.573 ms (38 allocations: 3.08 KiB) ``` (Check out the full [Parallel Monte Carlo](@ref) example if you like.) ## No Transducers Unlike most [JuliaFolds2](https://github.com/JuliaFolds2) packages, OhMyThreads.jl is not built off of [Transducers.jl](https://github.com/JuliaFolds2/Transducers.jl), nor is it a building block for Transducers.jl. Rather, it is meant to be a simpler, more maintainable, and more accessible alternative to high-level packages like, e.g., [ThreadsX.jl](https://github.com/tkf/ThreadsX.jl) or [Folds.jl](https://github.com/JuliaFolds2/Folds.jl). ## Acknowledgements The idea for this package came from [Carsten Bauer](https://github.com/carstenbauer) and [Mason Protter](https://github.com/MasonProtter). Check out the [list of contributors](https://github.com/JuliaFolds2/OhMyThreads.jl/graphs/contributors) for more information. 
================================================ FILE: docs/src/literate/Project.toml ================================================ [deps] Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306" [compat] Literate = "2.16" ================================================ FILE: docs/src/literate/boxing/Project.toml ================================================ [deps] OhMyThreads = "67456a42-1dca-4109-a031-0a68de7e3ad5" ================================================ FILE: docs/src/literate/boxing/boxing.jl ================================================ #==================================== # Boxed Variables All multithreading in julia is built around the idea of passing around and executing functions, but often these functions "enclose" data from an outer local scope, making them what's called a "closure". ## Boxed variables causing race conditions Julia allows functions which capture variables to re-bind those variables to different values, but doing so can cause subtle race conditions in multithreaded code. Consider the following example: ====================================# let out = zeros(Int, 10) Threads.@threads for i in 1:10 A = i sleep(1/100) out[i] = A end A = 1 out end #==================================== You may have expected that to return `[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]`, but the nonsense result is caused by `A` actually being a shared mutable container here which all the parallel tasks are accessing and mutating in parallel, giving unpredictable results. OhMyThreads.jl tries to protect users from this surprising behaviour: ====================================# using OhMyThreads try let ## this throws an error! 
out = tmap(1:10) do i A = i sleep(1/100) A end A = 1 out end catch e; ## Show the error Base.showerror(stdout, e) end #==================================== In this case, we could fix the race condition by marking `A` as local: ====================================# let out = tmap(1:10) do i local A = i # Note the use of `local` sleep(1/100) A end A = 1 out end #==================================== If you really desire to bypass this error, you can use the `@allow_boxed_captures` macro ====================================# @allow_boxed_captures let out = tmap(1:10) do i A = i sleep(1/100) A end A = 1 out end #==================================== ## Non-race condition boxed variables Any re-binding of captured variables can cause boxing, even when that boxing isn't strictly necessary, like the following example where we do not rebind `A` in the loop: ====================================# try let A = 1 if rand(Bool) ## Rebind A, it's now boxed! A = 2 end @tasks for i in 1:2 @show A end end catch e; println("Yup, that errored!") end #==================================== This comes down to how julia parses and lowers code. To avoid this, you can use an inner `let` block to localize `A` to the loop: ====================================# let A = 1 if rand(Bool) A = 2 end let A = A # This stops A from being boxed! @tasks for i in 1:2 @show A end end end #==================================== OhMyThreads provides a macro `@localize` to automate this process: ====================================# let A = 1 if rand(Bool) A = 2 end ## This stops A from being boxed!
@localize A @tasks for i in 1:2 @show A end end ================================================ FILE: docs/src/literate/boxing/boxing.md ================================================ ```@meta EditURL = "boxing.jl" ``` # Boxed Variables All multithreading in julia is built around the idea of passing around and executing functions, but often these functions "enclose" data from an outer local scope, making them what's called a "closure". ## Boxed variables causing race conditions Julia allows functions which capture variables to re-bind those variables to different values, but doing so can cause subtle race conditions in multithreaded code. Consider the following example: ````julia let out = zeros(Int, 10) Threads.@threads for i in 1:10 A = i sleep(1/100) out[i] = A end A = 1 out end ```` ```` 10-element Vector{Int64}: 5 4 6 4 5 4 5 4 5 4 ```` You may have expected that to return `[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]`, but the nonsense result is caused by `A` actually being a shared mutable container here which all the parallel tasks are accessing and mutating in parallel, giving unpredictable results. OhMyThreads.jl tries to protect users from this surprising behaviour: ````julia using OhMyThreads try let # this throws an error! out = tmap(1:10) do i A = i sleep(1/100) A end A = 1 out end catch e; # Show the error Base.showerror(stdout, e) end ```` ```` Attempted to capture and modify outer local variable: A See https://juliafolds2.github.io/OhMyThreads.jl/stable/literate/boxing/boxing/ for a fuller explanation. Hint ---- Capturing boxed variables can be not only slow, but also cause surprising and incorrect results. • If you meant for these variables to be local to each loop iteration and not depend on a variable from an outer scope, you should mark them as local inside the closure. • If you meant to reference a variable from the outer scope, but do not want access to it to be boxed, you can wrap uses of it in a let block, like e.g. 
function foo(x, N) rand(Bool) && x = 1 # This rebinding of x causes it to be boxed ... let x = x # ... Unless we localize it here with the let block @tasks for i in 1:N f(x) end end end • OhMyThreads.jl provides a @localize macro that automates the above let block, i.e. @localize x f(x) is the same as let x=x; f(x) end • If these variables are being re-bound inside a @one_by_one or @only_one block, consider using a mutable Ref instead of re-binding the variable. This error can be bypassed with the @allow_boxed_captures macro. ```` In this case, we could fix the race condition by marking `A` as local: ````julia let out = tmap(1:10) do i local A = i # Note the use of `local` sleep(1/100) A end A = 1 out end ```` ```` 10-element Vector{Int64}: 1 2 3 4 5 6 7 8 9 10 ```` If you really desire to bypass this error, you can use the `@allow_boxed_captures` macro ````julia @allow_boxed_captures let out = tmap(1:10) do i A = i sleep(1/100) A end A = 1 out end ```` ```` 10-element Vector{Int64}: 3 2 3 2 3 2 3 2 3 3 ```` ## Non-race condition boxed variables Any re-binding of captured variables can cause boxing, even when that boxing isn't strictly necessary, like the following example where we do not rebind `A` in the loop: ````julia try let A = 1 if rand(Bool) # Rebind A, it's now boxed! A = 2 end @tasks for i in 1:2 @show A end end catch e; println("Yup, that errored!") end ```` ```` Yup, that errored! ```` This comes down to how julia parses and lowers code. To avoid this, you can use an inner `let` block to localize `A` to the loop: ````julia let A = 1 if rand(Bool) A = 2 end let A = A # This stops A from being boxed! @tasks for i in 1:2 @show A end end end ```` ```` A = 1 A = 1 ```` OhMyThreads provides a macro `@localize` to automate this process: ````julia let A = 1 if rand(Bool) A = 2 end # This stops A from being boxed!
@localize A @tasks for i in 1:2 @show A end end ```` ```` A = 2 A = 2 ```` --- *This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* ================================================ FILE: docs/src/literate/falsesharing/Project.toml ================================================ [deps] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" OhMyThreads = "67456a42-1dca-4109-a031-0a68de7e3ad5" ThreadPinning = "811555cd-349b-4f26-b7bc-1f208b848042" ================================================ FILE: docs/src/literate/falsesharing/falsesharing.jl ================================================ # # [False Sharing](@id FalseSharing) # # *False Sharing* is a very common but subtle performance issue that comes up again and # again when writing parallel code manually. For this reason, we shall discuss what it is # about and how to avoid it. # # For simplicity, let's focus on a specific example: parallel summation. # # ## Baseline: sequential summation # # To establish a baseline, that we can later compare against, we define some fake data, # which we'll sum up, and benchmark Julia's built-in, non-parallel `sum` function. using Base.Threads: nthreads using BenchmarkTools using ThreadPinning #hide pinthreads(:cores) #hide data = rand(1_000_000 * nthreads()); @btime sum($data); # # ## The problematic parallel implementation # # A conceptually simple (and valid) approach to parallelizing the summation is to divide # the full computation into parts. Specifically, the idea is to divide the data into chunks, # compute the partial sums of these chunks in parallel, and finally sum up the partial # results. (Note that we will not concern ourselves with potential minor or # catastrophic numerical errors due to potential rearrangements of terms in the summation here.) 
# # A common, manual implementation of this idea might look like this: using OhMyThreads: @spawn, index_chunks function parallel_sum_falsesharing(data; nchunks = nthreads()) psums = zeros(eltype(data), nchunks) @sync for (c, idcs) in enumerate(index_chunks(data; n = nchunks)) @spawn begin for i in idcs psums[c] += data[i] end end end return sum(psums) end # The code is pretty straightforward: We allocate space for the results of the partial sums # (`psums`) and, on `nchunks` many tasks, add up the data elements of each partial sum in # parallel. More importantly, and in this context perhaps surprisingly, the code is also # **correct** in the sense that it produces the desired result. using Test @test sum(data) ≈ parallel_sum_falsesharing(data) # This is just a reflection of the fact that there is no logical sharing of data - because # each parallel task modifies a different element of `psums` - implying the absence of # race conditions. # # What's the issue then?! Well, the sole purpose of parallelization is to reduce runtime. # So let's see how well we're doing in this respect. nthreads() # @btime parallel_sum_falsesharing($data); # A **slowdown**?! Clearly, that's the opposite of what we tried to achieve! # # ## The issue: False sharing # # Although our parallel summation above is semantically correct, it has a # big **performance issue**: *False sharing*. To understand false sharing, we have to think # a little bit about how computers work. Specifically, we need to realize that processors # cache memory in lines (rather than individual elements) and that caches of different processors # are kept coherent. # When two (or more) different CPU cores operate on independent data elements that **fall # into the same cache line** (i.e. they are part of the same memory address region) # the **cache coherency mechanism leads to costly synchronization** between cores.
# In our case, this happens despite the fact that different parallel tasks # (on different CPU cores) *logically* don't care about the rest of the data in the cache line # at all. # ![](false_sharing.svg) # Given these insights, we can come up with a few workarounds that mitigate the issue. # The most prominent is probably padding, where one simply adds sufficiently many unused # zeros to `psums` such that different partial sum counters don't fall into the same cache # line. However, let's discuss a more fundamental, more efficient, and more elegant solution. # # ## Task-local parallel summation # # The key mistake in `parallel_sum_falsesharing` above is the non-local modification of # (implicitly) shared state (cache lines of `psums`) very frequently (in the innermost loop). # We can simply avoid this by making the code more task-local. To this end, we introduce a # **task-local accumulator variable**, which we use to perform the task-local partial sums. # Only at the very end do we communicate the result to the main thread, e.g. by writing it # into `psums` (once!). function parallel_sum_tasklocal(data; nchunks = nthreads()) psums = zeros(eltype(data), nchunks) @sync for (c, idcs) in enumerate(index_chunks(data; n = nchunks)) @spawn begin local s = zero(eltype(data)) for i in idcs s += data[i] end psums[c] = s end end return sum(psums) end @test sum(data) ≈ parallel_sum_tasklocal(data) @btime parallel_sum_tasklocal($data); # Finally, there is a speed up! 🎉 # # Two comments are in order. # # First, we note that the only role that `psums` plays is # as a temporary storage for the results from the parallel tasks to be able to sum them # up eventually. We could get rid of it entirely by using a `Threads.Atomic` instead which # would get updated via `Threads.atomic_add!` from each task directly. However, # for our discussion, this is a detail and we won't discuss it further. 
# # Secondly, while keeping the general idea, we can drastically simplify the above code by # using `map` and reusing the built-in (sequential) `sum` function on each parallel task: function parallel_sum_map(data; nchunks = nthreads()) ts = map(index_chunks(data, n = nchunks)) do idcs @spawn @views sum(data[idcs]) end return sum(fetch.(ts)) end @test sum(data) ≈ parallel_sum_map(data) @btime parallel_sum_map($data); # This implementation is conceptually # clearer in that there is no explicit modification of shared state, i.e. no `psums[c] = s`, # anywhere at all. We can't run into false sharing if we don't modify shared state 😉. # # Note that since we use the built-in `sum` function, which is highly optimized, we might see # better runtimes due to other effects - like SIMD and the absence of bounds checks - compared # to the simple for-loop accumulation in `parallel_sum_tasklocal` above. # # ## Parallel summation with OhMyThreads # # Finally, all of the above is abstracted away for you if you simply use [`treduce`](@ref) # to implement the parallel summation. It also only takes a single line and function call. using OhMyThreads: treduce @test sum(data) ≈ treduce(+, data; ntasks = nthreads()) @btime treduce($+, $data; ntasks = $nthreads()); ================================================ FILE: docs/src/literate/falsesharing/falsesharing.md ================================================ ```@meta EditURL = "falsesharing.jl" ``` # [False Sharing](@id FalseSharing) *False Sharing* is a very common but subtle performance issue that comes up again and again when writing parallel code manually. For this reason, we shall discuss what it is about and how to avoid it. For simplicity, let's focus on a specific example: parallel summation. ## Baseline: sequential summation To establish a baseline, that we can later compare against, we define some fake data, which we'll sum up, and benchmark Julia's built-in, non-parallel `sum` function.
````julia using Base.Threads: nthreads using BenchmarkTools data = rand(1_000_000 * nthreads()); @btime sum($data); ```` ```` 2.327 ms (0 allocations: 0 bytes) ```` ## The problematic parallel implementation A conceptually simple (and valid) approach to parallelizing the summation is to divide the full computation into parts. Specifically, the idea is to divide the data into chunks, compute the partial sums of these chunks in parallel, and finally sum up the partial results. (Note that we will not concern ourselves with potential minor or catastrophic numerical errors due to potential rearrangements of terms in the summation here.) A common, manual implementation of this idea might look like this: ````julia using OhMyThreads: @spawn, index_chunks function parallel_sum_falsesharing(data; nchunks = nthreads()) psums = zeros(eltype(data), nchunks) @sync for (c, idcs) in enumerate(index_chunks(data; n = nchunks)) @spawn begin for i in idcs psums[c] += data[i] end end end return sum(psums) end ```` ```` parallel_sum_falsesharing (generic function with 1 method) ```` The code is pretty straightforward: We allocate space for the results of the partial sums (`psums`) and, on `nchunks` many tasks, add up the data elements of each partial sum in parallel. More importantly, and in this context perhaps surprisingly, the code is also **correct** in the sense that it produces the desired result. ````julia using Test @test sum(data) ≈ parallel_sum_falsesharing(data) ```` ```` Test Passed ```` This is just a reflection of the fact that there is no logical sharing of data - because each parallel task modifies a different element of `psums` - implying the absence of race conditions. What's the issue then?! Well, the sole purpose of parallelization is to reduce runtime. So let's see how well we're doing in this respect.
````julia nthreads() ```` ```` 10 ```` ````julia @btime parallel_sum_falsesharing($data); ```` ```` 52.919 ms (221 allocations: 18.47 KiB) ```` A (huge) **slowdown**?! Clearly, that's the opposite of what we tried to achieve! ## The issue: False sharing Although our parallel summation above is semantically correct, it has a big **performance issue**: *False sharing*. To understand false sharing, we have to think a little bit about how computers work. Specifically, we need to realize that processors cache memory in lines (rather than individual elements) and that caches of different processors are kept coherent. When two (or more) different CPU cores operate on independent data elements that **fall into the same cache line** (i.e. they are part of the same memory address region) the **cache coherency mechanism leads to costly synchronization** between cores. In our case, this happens despite the fact that different parallel tasks (on different CPU cores) *logically* don't care about the rest of the data in the cache line at all. ![](false_sharing.svg) Given these insights, we can come up with a few workarounds that mitigate the issue. The most prominent is probably padding, where one simply adds sufficiently many unused zeros to `psums` such that different partial sum counters don't fall into the same cache line. However, let's discuss a more fundamental, more efficient, and more elegant solution. ## Task-local parallel summation The key mistake in `parallel_sum_falsesharing` above is the non-local modification of (implicitly) shared state (cache lines of `psums`) very frequently (in the innermost loop). We can simply avoid this by making the code more task-local. To this end, we introduce a **task-local accumulator variable**, which we use to perform the task-local partial sums. Only at the very end do we communicate the result to the main thread, e.g. by writing it into `psums` (once!). 
````julia function parallel_sum_tasklocal(data; nchunks = nthreads()) psums = zeros(eltype(data), nchunks) @sync for (c, idcs) in enumerate(index_chunks(data; n = nchunks)) @spawn begin local s = zero(eltype(data)) for i in idcs s += data[i] end psums[c] = s end end return sum(psums) end @test sum(data) ≈ parallel_sum_tasklocal(data) @btime parallel_sum_tasklocal($data); ```` ```` 1.120 ms (221 allocations: 18.55 KiB) ```` Finally, there is a speed up! 🎉 Two comments are in order. First, we note that the only role that `psums` plays is as a temporary storage for the results from the parallel tasks to be able to sum them up eventually. We could get rid of it entirely by using a `Threads.Atomic` instead which would get updated via `Threads.atomic_add!` from each task directly. However, for our discussion, this is a detail and we won't discuss it further. Secondly, while keeping the general idea, we can drastically simplify the above code by using `map` and reusing the built-in (sequential) `sum` function on each parallel task: ````julia function parallel_sum_map(data; nchunks = nthreads()) ts = map(index_chunks(data, n = nchunks)) do idcs @spawn @views sum(data[idcs]) end return sum(fetch.(ts)) end @test sum(data) ≈ parallel_sum_map(data) @btime parallel_sum_map($data); ```` ```` 893.396 μs (64 allocations: 5.72 KiB) ```` This implementation is conceptually clearer in that there is no explicit modification of shared state, i.e. no `psums[c] = s`, anywhere at all. We can't run into false sharing if we don't modify shared state 😉. Note that since we use the built-in `sum` function, which is highly optimized, we might see better runtimes due to other effects - like SIMD and the absence of bounds checks - compared to the simple for-loop accumulation in `parallel_sum_tasklocal` above. ## Parallel summation with OhMyThreads Finally, all of the above is abstracted away for you if you simply use [`treduce`](@ref) to implement the parallel summation.
It also only takes a single line and function call. ````julia using OhMyThreads: treduce @test sum(data) ≈ treduce(+, data; ntasks = nthreads()) @btime treduce($+, $data; ntasks = $nthreads()); ```` ```` 899.097 μs (68 allocations: 5.92 KiB) ```` --- *This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* ================================================ FILE: docs/src/literate/integration/Project.toml ================================================ [deps] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" OhMyThreads = "67456a42-1dca-4109-a031-0a68de7e3ad5" ================================================ FILE: docs/src/literate/integration/integration.jl ================================================ # # Trapezoidal Integration # # In this example, we want to parallelize the computation of a simple numerical integral # via the trapezoidal rule. The latter is given by # # $\int_{a}^{b}f(x)\,dx \approx h \sum_{i=1}^{N}\frac{f(x_{i-1})+f(x_{i})}{2}.$ # # The function to be integrated is the following. f(x) = 4 * √(1 - x^2) # The analytic result of the definite integral (from 0 to 1) is known to be $\pi$. # # ## Sequential # # Naturally, we implement the trapezoidal rule as a straightforward, sequential `for` loop. function trapezoidal(a, b, n; h = (b - a) / n) y = (f(a) + f(b)) / 2.0 for i in 1:(n - 1) x = a + i * h y = y + f(x) end return y * h end # Let's compute the integral of `f` above and see if we get the expected result. # For simplicity, we choose `N`, the number of panels used to discretize the integration # interval, as a multiple of the number of available Julia threads. using Base.Threads: nthreads N = nthreads() * 1_000_000 # Calling `trapezoidal` we do indeed find the (approximate) value of $\pi$. trapezoidal(0, 1, N) ≈ π # ## Parallel # # Our strategy is the following: Divide the integration interval among the available # Julia threads. 
On each thread, use the sequential trapezoidal rule to compute the partial # integral. # It is straightforward to implement this strategy with `tmapreduce`. The `map` part # is, essentially, the application of `trapezoidal` and the reduction operator is chosen to # be `+` to sum up the local integrals. using OhMyThreads function trapezoidal_parallel(a, b, N) n = N ÷ nthreads() h = (b - a) / N return tmapreduce(+, 1:nthreads()) do i local α = a + (i - 1) * n * h local β = α + n * h trapezoidal(α, β, n; h) end end ## or equivalently ## ## function trapezoidal_parallel(a, b, N) ## n = N ÷ nthreads() ## h = (b - a) / N ## @tasks for i in 1:nthreads() ## @set reducer=+ ## local α = a + (i - 1) * n * h ## local β = α + n * h ## trapezoidal(α, β, n; h) ## end ## end # First, we check the correctness of our parallel implementation. trapezoidal_parallel(0, 1, N) ≈ π # Then, we benchmark and compare the performance of the sequential and parallel versions. using BenchmarkTools @btime trapezoidal(0, 1, $N); @btime trapezoidal_parallel(0, 1, $N); # Because the problem is trivially parallel - all threads do the same thing and don't need # to communicate - we expect an ideal speedup of (close to) the number of available threads. nthreads() ================================================ FILE: docs/src/literate/integration/integration.md ================================================ ```@meta EditURL = "integration.jl" ``` # Trapezoidal Integration In this example, we want to parallelize the computation of a simple numerical integral via the trapezoidal rule. The latter is given by $\int_{a}^{b}f(x)\,dx \approx h \sum_{i=1}^{N}\frac{f(x_{i-1})+f(x_{i})}{2}.$ The function to be integrated is the following. ````julia f(x) = 4 * √(1 - x^2) ```` ```` f (generic function with 1 method) ```` The analytic result of the definite integral (from 0 to 1) is known to be $\pi$. ## Sequential Naturally, we implement the trapezoidal rule as a straightforward, sequential `for` loop.
````julia function trapezoidal(a, b, n; h = (b - a) / n) y = (f(a) + f(b)) / 2.0 for i in 1:(n - 1) x = a + i * h y = y + f(x) end return y * h end ```` ```` trapezoidal (generic function with 1 method) ```` Let's compute the integral of `f` above and see if we get the expected result. For simplicity, we choose `N`, the number of panels used to discretize the integration interval, as a multiple of the number of available Julia threads. ````julia using Base.Threads: nthreads N = nthreads() * 1_000_000 ```` ```` 10000000 ```` Calling `trapezoidal` we do indeed find the (approximate) value of $\pi$. ````julia trapezoidal(0, 1, N) ≈ π ```` ```` true ```` ## Parallel Our strategy is the following: Divide the integration interval among the available Julia threads. On each thread, use the sequential trapezoidal rule to compute the partial integral. It is straightforward to implement this strategy with `tmapreduce`. The `map` part is, essentially, the application of `trapezoidal` and the reduction operator is chosen to be `+` to sum up the local integrals. ````julia using OhMyThreads function trapezoidal_parallel(a, b, N) n = N ÷ nthreads() h = (b - a) / N return tmapreduce(+, 1:nthreads()) do i local α = a + (i - 1) * n * h # the local keywords aren't necessary but good practice local β = α + n * h trapezoidal(α, β, n; h) end end # or equivalently # # function trapezoidal_parallel(a, b, N) # n = N ÷ nthreads() # h = (b - a) / N # @tasks for i in 1:nthreads() # @set reducer=+ # local α = a + (i - 1) * n * h # local β = α + n * h # trapezoidal(α, β, n; h) # end # end ```` ```` trapezoidal_parallel (generic function with 1 method) ```` First, we check the correctness of our parallel implementation. ````julia trapezoidal_parallel(0, 1, N) ≈ π ```` ```` true ```` Then, we benchmark and compare the performance of the sequential and parallel versions. 
````julia using BenchmarkTools @btime trapezoidal(0, 1, $N); @btime trapezoidal_parallel(0, 1, $N); ```` ```` 24.348 ms (0 allocations: 0 bytes) 2.457 ms (69 allocations: 6.05 KiB) ```` Because the problem is trivially parallel - all threads do the same thing and don't need to communicate - we expect an ideal speedup of (close to) the number of available threads. ````julia nthreads() ```` ```` 10 ```` --- *This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* ================================================ FILE: docs/src/literate/juliaset/Project.toml ================================================ [deps] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" DisplayAs = "0b91fe84-8a4c-11e9-3e1d-67c38462b6d6" OhMyThreads = "67456a42-1dca-4109-a031-0a68de7e3ad5" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" ================================================ FILE: docs/src/literate/juliaset/juliaset.jl ================================================ # # Julia Set # # In this example, we will compute an image of the # [Julia set](https://en.wikipedia.org/wiki/Julia_set) in parallel. We will explore # the `scheduler` and `ntasks` options that can be used to get load balancing. # # The value of a single pixel of the Julia set, which corresponds to a point in the # complex number plane, can be computed by the following iteration procedure. function _compute_pixel(i, j, n; max_iter = 255, c = -0.79 + 0.15 * im) x = -2.0 + (j - 1) * 4.0 / (n - 1) y = -2.0 + (i - 1) * 4.0 / (n - 1) z = x + y * im iter = max_iter for k in 1:max_iter if abs2(z) > 4.0 iter = k - 1 break end z = z^2 + c end return iter end # Note that the value of the pixel is the number of performed iterations for the # corresponding complex input number. Hence, the computational **workload is non-uniform**. # ## Sequential computation # # In our naive implementation, we just loop over the dimensions of the image matrix and call # the pixel kernel above.
function compute_juliaset_sequential!(img) N = size(img, 1) for j in 1:N for i in 1:N img[i, j] = _compute_pixel(i, j, N) end end return img end N = 2000 img = zeros(Int, N, N) compute_juliaset_sequential!(img); # Let's look at the result using Plots using DisplayAs #hide p = heatmap(img) DisplayAs.PNG(p) #hide # ## Parallelization # # The Julia set computation above is a `map!` operation: We apply some function to each # element of the array. Hence, we can use `tmap!` for parallelization. We use # `CartesianIndices` to map between linear and two-dimensional cartesian indices. using OhMyThreads: tmap! function compute_juliaset_parallel!(img; kwargs...) N = size(img, 1) cart = CartesianIndices(img) tmap!(img, eachindex(img); kwargs...) do idx c = cart[idx] _compute_pixel(c[1], c[2], N) end return img end ## or alternatively ## ## function compute_juliaset_parallel!(img; kwargs...) ## N = size(img, 1) ## cart = CartesianIndices(img) ## @tasks for idx in eachindex(img) ## c = cart[idx] ## img[idx] = _compute_pixel(c[1], c[2], N) ## end ## return img ## end N = 2000 img = zeros(Int, N, N) compute_juliaset_parallel!(img); p = heatmap(img) DisplayAs.PNG(p) #hide # ## Benchmark # # Let's benchmark the variants above. using BenchmarkTools using Base.Threads: nthreads N = 2000 img = zeros(Int, N, N) @show nthreads() @btime compute_juliaset_sequential!($img) samples=10 evals=3; @btime compute_juliaset_parallel!($img) samples=10 evals=3; # As hoped, the parallel implementation is much faster! # ### Dynamic vs static scheduling # # As stated above, the per-pixel computation is non-uniform. Hence, we do benefit from # the load balancing of the default dynamic scheduler. The latter divides the overall # workload into tasks that can then be dynamically distributed among threads to adjust the # per-thread load. 
We can try to fine tune and improve the load balancing further by # increasing the `ntasks` parameter of the scheduler, that is, creating more tasks with # smaller per-task workload. using OhMyThreads: DynamicScheduler @btime compute_juliaset_parallel!($img; ntasks=N, scheduler=:dynamic) samples=10 evals=3; # Note that while this turns out to be a bit faster, it comes at the expense of many more # allocations. # # To quantify the impact of load balancing we can opt out of dynamic scheduling and use the # `StaticScheduler` instead. The latter doesn't provide any form of load balancing. using OhMyThreads: StaticScheduler @btime compute_juliaset_parallel!($img; scheduler=:static) samples=10 evals=3; ================================================ FILE: docs/src/literate/juliaset/juliaset.md ================================================ ```@meta EditURL = "juliaset.jl" ``` # Julia Set In this example, we will compute an image of the [Julia set](https://en.wikipedia.org/wiki/Julia_set) in parallel. We will explore the `scheduler` and `ntasks` options that can be used to get load balancing. The value of a single pixel of the Julia set, which corresponds to a point in the complex number plane, can be computed by the following iteration procedure. ````julia function _compute_pixel(i, j, n; max_iter = 255, c = -0.79 + 0.15 * im) x = -2.0 + (j - 1) * 4.0 / (n - 1) y = -2.0 + (i - 1) * 4.0 / (n - 1) z = x + y * im iter = max_iter for k in 1:max_iter if abs2(z) > 4.0 iter = k - 1 break end z = z^2 + c end return iter end ```` ```` _compute_pixel (generic function with 1 method) ```` Note that the value of the pixel is the number of performed iterations for the corresponding complex input number. Hence, the computational **workload is non-uniform**. ## Sequential computation In our naive implementation, we just loop over the dimensions of the image matrix and call the pixel kernel above.
````julia function compute_juliaset_sequential!(img) N = size(img, 1) for j in 1:N for i in 1:N img[i, j] = _compute_pixel(i, j, N) end end return img end N = 2000 img = zeros(Int, N, N) compute_juliaset_sequential!(img); ```` Let's look at the result ````julia using Plots p = heatmap(img) ```` ![](juliaset-8.png) ## Parallelization The Julia set computation above is a `map!` operation: We apply some function to each element of the array. Hence, we can use `tmap!` for parallelization. We use `CartesianIndices` to map between linear and two-dimensional cartesian indices. ````julia using OhMyThreads: tmap! function compute_juliaset_parallel!(img; kwargs...) N = size(img, 1) cart = CartesianIndices(img) tmap!(img, eachindex(img); kwargs...) do idx c = cart[idx] _compute_pixel(c[1], c[2], N) end return img end # or alternatively # # function compute_juliaset_parallel!(img; kwargs...) # N = size(img, 1) # cart = CartesianIndices(img) # @tasks for idx in eachindex(img) # c = cart[idx] # img[idx] = _compute_pixel(c[1], c[2], N) # end # return img # end N = 2000 img = zeros(Int, N, N) compute_juliaset_parallel!(img); p = heatmap(img) ```` ![](juliaset-10.png) ## Benchmark Let's benchmark the variants above. ````julia using BenchmarkTools using Base.Threads: nthreads N = 2000 img = zeros(Int, N, N) @show nthreads() @btime compute_juliaset_sequential!($img) samples=10 evals=3; @btime compute_juliaset_parallel!($img) samples=10 evals=3; ```` ```` nthreads() = 10 131.295 ms (0 allocations: 0 bytes) 31.422 ms (68 allocations: 6.09 KiB) ```` As hoped, the parallel implementation is much faster! ### Dynamic vs static scheduling As stated above, the per-pixel computation is non-uniform. Hence, we do benefit from the load balancing of the default dynamic scheduler. The latter divides the overall workload into tasks that can then be dynamically distributed among threads to adjust the per-thread load. 
We can try to fine tune and improve the load balancing further by increasing the `ntasks` parameter of the scheduler, that is, creating more tasks with smaller per-task workload. ````julia using OhMyThreads: DynamicScheduler @btime compute_juliaset_parallel!($img; ntasks=N, scheduler=:dynamic) samples=10 evals=3; ```` ```` 17.438 ms (12018 allocations: 1.11 MiB) ```` Note that while this turns out to be a bit faster, it comes at the expense of many more allocations. To quantify the impact of load balancing we can opt out of dynamic scheduling and use the `StaticScheduler` instead. The latter doesn't provide any form of load balancing. ````julia using OhMyThreads: StaticScheduler @btime compute_juliaset_parallel!($img; scheduler=:static) samples=10 evals=3; ```` ```` 30.097 ms (73 allocations: 6.23 KiB) ```` --- *This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* ================================================ FILE: docs/src/literate/mc/Project.toml ================================================ [deps] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" OhMyThreads = "67456a42-1dca-4109-a031-0a68de7e3ad5" ================================================ FILE: docs/src/literate/mc/mc.jl ================================================ # # Parallel Monte Carlo # # Calculate the value of $\pi$ through parallel direct Monte Carlo. # # A unit circle is inscribed inside a square with side length 2 (from -1 to 1). # The area of the circle is $\pi$, the area of the square is 4, and the ratio is $\pi/4$. # This means that, if you throw $N$ darts randomly at the square, approximately $M=N\pi/4$ # of those darts will land inside the unit circle. # # Throw darts randomly at the square and count how many of them ($M$) landed inside of # the unit circle. Approximate $\pi \approx 4M/N$.
# # ## Sequential implementation: function mc(N) M = 0 # number of darts that landed in the circle for i in 1:N if rand()^2 + rand()^2 < 1.0 M += 1 end end pi = 4 * M / N return pi end N = 100_000_000 mc(N) # ## Parallelization with `tmapreduce` # # To parallelize the Monte Carlo simulation, we use [`tmapreduce`](@ref) with `+` as the reduction # operator. For the map part, we take `1:N` as our input collection and "throw one dart" per # element. using OhMyThreads function mc_parallel(N; kwargs...) M = tmapreduce(+, 1:N; kwargs...) do i rand()^2 + rand()^2 < 1.0 end pi = 4 * M / N return pi end ## or alternatively ## ## function mc_parallel(N) ## M = @tasks for _ in 1:N ## @set reducer = + ## rand()^2 + rand()^2 < 1.0 ## end ## pi = 4 * M / N ## return pi ## end mc_parallel(N) # Let's run a quick benchmark. using BenchmarkTools using Base.Threads: nthreads @assert nthreads() > 1 # make sure we have multiple Julia threads @show nthreads() # print out the number of threads @btime mc($N) samples=10 evals=3; @btime mc_parallel($N) samples=10 evals=3; # ### Static scheduling # # Because the workload is highly uniform, it makes sense to also try the `StaticScheduler` # and compare the performance of static and dynamic scheduling (with default parameters). using OhMyThreads: StaticScheduler @btime mc_parallel($N; scheduler=:dynamic) samples=10 evals=3; # default @btime mc_parallel($N; scheduler=:static) samples=10 evals=3; # ## Manual parallelization # # First, using the `index_chunks` function, we divide the iteration interval `1:N` into # `nthreads()` parts. Then, we apply a regular (sequential) `map` to spawn a Julia task # per chunk. Each task will locally and independently perform a sequential Monte Carlo # simulation. Finally, we fetch the results and compute the average estimate for $\pi$. 
using OhMyThreads: @spawn, index_chunks function mc_parallel_manual(N; nchunks = nthreads()) tasks = map(index_chunks(1:N; n = nchunks)) do idcs @spawn mc(length(idcs)) end pi = sum(fetch, tasks) / nchunks return pi end mc_parallel_manual(N) # And this is the performance: @btime mc_parallel_manual($N) samples=10 evals=3; # It is faster than `mc_parallel` above because the task-local computation # `mc(length(idcs))` is faster than the implicit task-local computation within # `tmapreduce` (which itself is a `mapreduce`). idcs = first(index_chunks(1:N; n = nthreads())) @btime mapreduce($+, $idcs) do i rand()^2 + rand()^2 < 1.0 end samples=10 evals=3; @btime mc($(length(idcs))) samples=10 evals=3; ================================================ FILE: docs/src/literate/mc/mc.md ================================================ ```@meta EditURL = "mc.jl" ``` # Parallel Monte Carlo Calculate the value of $\pi$ through parallel direct Monte Carlo. A unit circle is inscribed inside a square with side length 2 (from -1 to 1). The area of the circle is $\pi$, the area of the square is 4, and the ratio is $\pi/4$. This means that, if you throw $N$ darts randomly at the square, approximately $M=N\pi/4$ of those darts will land inside the unit circle. Throw darts randomly at the square and count how many of them ($M$) landed inside of the unit circle. Approximate $\pi \approx 4M/N$. ## Sequential implementation: ````julia function mc(N) M = 0 # number of darts that landed in the circle for i in 1:N if rand()^2 + rand()^2 < 1.0 M += 1 end end pi = 4 * M / N return pi end N = 100_000_000 mc(N) ```` ```` 3.14171236 ```` ## Parallelization with `tmapreduce` To parallelize the Monte Carlo simulation, we use [`tmapreduce`](@ref) with `+` as the reduction operator. For the map part, we take `1:N` as our input collection and "throw one dart" per element. ````julia using OhMyThreads function mc_parallel(N; kwargs...) M = tmapreduce(+, 1:N; kwargs...)
do i rand()^2 + rand()^2 < 1.0 end pi = 4 * M / N return pi end # or alternatively # # function mc_parallel(N) # M = @tasks for _ in 1:N # @set reducer = + # rand()^2 + rand()^2 < 1.0 # end # pi = 4 * M / N # return pi # end mc_parallel(N) ```` ```` 3.14156496 ```` Let's run a quick benchmark. ````julia using BenchmarkTools using Base.Threads: nthreads @assert nthreads() > 1 # make sure we have multiple Julia threads @show nthreads() # print out the number of threads @btime mc($N) samples=10 evals=3; @btime mc_parallel($N) samples=10 evals=3; ```` ```` nthreads() = 10 301.636 ms (0 allocations: 0 bytes) 41.864 ms (68 allocations: 5.81 KiB) ```` ### Static scheduling Because the workload is highly uniform, it makes sense to also try the `StaticScheduler` and compare the performance of static and dynamic scheduling (with default parameters). ````julia using OhMyThreads: StaticScheduler @btime mc_parallel($N; scheduler=:dynamic) samples=10 evals=3; # default @btime mc_parallel($N; scheduler=:static) samples=10 evals=3; ```` ```` 41.839 ms (68 allocations: 5.81 KiB) 41.838 ms (68 allocations: 5.81 KiB) ```` ## Manual parallelization First, using the `index_chunks` function, we divide the iteration interval `1:N` into `nthreads()` parts. Then, we apply a regular (sequential) `map` to spawn a Julia task per chunk. Each task will locally and independently perform a sequential Monte Carlo simulation. Finally, we fetch the results and compute the average estimate for $\pi$. 
````julia using OhMyThreads: @spawn, index_chunks function mc_parallel_manual(N; nchunks = nthreads()) tasks = map(index_chunks(1:N; n = nchunks)) do idcs @spawn mc(length(idcs)) end pi = sum(fetch, tasks) / nchunks return pi end mc_parallel_manual(N) ```` ```` 3.14180504 ```` And this is the performance: ````julia @btime mc_parallel_manual($N) samples=10 evals=3; ```` ```` 30.224 ms (65 allocations: 5.70 KiB) ```` It is faster than `mc_parallel` above because the task-local computation `mc(length(idcs))` is faster than the implicit task-local computation within `tmapreduce` (which itself is a `mapreduce`). ````julia idcs = first(index_chunks(1:N; n = nthreads())) @btime mapreduce($+, $idcs) do i rand()^2 + rand()^2 < 1.0 end samples=10 evals=3; @btime mc($(length(idcs))) samples=10 evals=3; ```` ```` 41.750 ms (0 allocations: 0 bytes) 30.148 ms (0 allocations: 0 bytes) ```` --- *This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* ================================================ FILE: docs/src/literate/tls/Project.toml ================================================ [deps] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" OhMyThreads = "67456a42-1dca-4109-a031-0a68de7e3ad5" ThreadPinning = "811555cd-349b-4f26-b7bc-1f208b848042" ================================================ FILE: docs/src/literate/tls/tls.jl ================================================ # # [Thread-Safe Storage](@id TSS) # # For some programs, it can be useful or even necessary to allocate and (re-)use memory in # your parallel code (e.g. your computation might require temporary buffers). # The following section demonstrates common issues that can arise in such a scenario and, # by means of a simple example, explains techniques to handle such cases safely. 
# Specifically, we'll discuss (1) how task-local storage (TLS) can be used efficiently and # (2) how channels can be used to organize per-task buffer allocation in a thread-safe # manner. # # # ## Test case (sequential) # # Let's say that we are given two arrays of matrices, `As` and `Bs`, and let's # further assume that our goal is to compute the total sum of all pairwise matrix products. # We can readily implement a (sequential) function that performs the necessary computations. using LinearAlgebra: mul!, BLAS BLAS.set_num_threads(1) # for simplicity, we turn off OpenBLAS multithreading using ThreadPinning #hide pinthreads(:cores) #hide function matmulsums(As, Bs) N = size(first(As), 1) C = Matrix{Float64}(undef, N, N) map(As, Bs) do A, B mul!(C, A, B) sum(C) end end # Here, we use `map` to perform the desired operation for each pair of matrices, # `A` and `B`. However, the crucial point for our discussion is that we want to use the # in-place matrix multiplication `LinearAlgebra.mul!` in conjunction with a pre-allocated # temporary buffer, the output matrix `C`. This is to avoid the temporary allocation per # "iteration" (i.e. per matrix pair) that we would get with `C = A*B`. # # For later comparison, we generate some random input data and store the result. As = [rand(256, 16) for _ in 1:768] Bs = [rand(16, 256) for _ in 1:768] res = matmulsums(As, Bs); # ## How to not parallelize # # The key idea for creating a parallel version of `matmulsums` is to replace the `map` by # OhMyThreads' parallel [`tmap`](@ref) function. However, because we re-use `C`, this isn't # entirely trivial. Someone new to parallel computing might be tempted to parallelize # `matmulsums` like this: using OhMyThreads: tmap function matmulsums_race(As, Bs) N = size(first(As), 1) C = Matrix{Float64}(undef, N, N) tmap(As, Bs) do A, B mul!(C, A, B) sum(C) end end # Unfortunately, this doesn't produce the correct result.
res_race = matmulsums_race(As, Bs) res ≈ res_race # In fact, it doesn't even always produce the same result (check for yourself)! # The reason is that there is a race condition: different parallel # tasks are trying to use the shared variable `C` simultaneously leading to # non-deterministic behavior. Let's see how we can fix this. # # ### The naive (and inefficient) fix # # A simple solution for the race condition issue above is to move the allocation of `C` # into the body of the parallel `tmap`: function matmulsums_naive(As, Bs) N = size(first(As), 1) tmap(As, Bs) do A, B C = Matrix{Float64}(undef, N, N) mul!(C, A, B) sum(C) end end # In this case, a separate `C` will be allocated for each iteration such that parallel tasks # no longer mutate shared state. Hence, we'll get the desired result. res_naive = matmulsums_naive(As, Bs) res ≈ res_naive # However, this variant is obviously inefficient because it is no better than just writing # `C = A*B` and thus leads to one allocation per matrix pair. We need a different way of # allocating and re-using `C` for an efficient parallel version. # ## [Task-local storage](@id TLS) # # ### The manual (and cumbersome) way # # We've seen that we can't allocate `C` once up-front (→ race condition) and also shouldn't # allocate it within the `tmap` (→ one allocation per iteration). Instead, we can assign a # separate "C" on each parallel task once and then use this task-local "C" for all # iterations (i.e. matrix pairs) for which this task is responsible. # Before we learn how to do this more conveniently, let's implement this idea of a # task-local temporary buffer (for each parallel task) manually. 
using OhMyThreads: index_chunks, @spawn using Base.Threads: nthreads function matmulsums_manual(As, Bs) N = size(first(As), 1) tasks = map(index_chunks(As; n = 2 * nthreads())) do idcs @spawn begin local C = Matrix{Float64}(undef, N, N) map(idcs) do i A = As[i] B = Bs[i] mul!(C, A, B) sum(C) end end end mapreduce(fetch, vcat, tasks) end res_manual = matmulsums_manual(As, Bs) res ≈ res_manual # We note that this is rather cumbersome and you might not # want to write it (repeatedly). But let's take a closer look and see what's happening here. # First, we divide the number of matrix pairs into `2 * nthreads()` chunks. Then, for each of # those chunks, we spawn a parallel task that (1) allocates a task-local `C` matrix (and a # `results` vector) and (2) performs the actual computations using these pre-allocated # buffers. Finally, we `fetch` the results of the tasks and combine them. This variant works # just fine and the good news is that we can get the same behavior with less manual work. # # ### [The shortcut: `TaskLocalValue`](@id TLV) # # The desire for task-local storage is quite natural with task-based multithreading. For # this reason, Julia supports this out of the box with # [`Base.task_local_storage`](https://docs.julialang.org/en/v1/base/parallel/#Base.task_local_storage-Tuple{Any}). # But instead of using this directly (which you could), we will use a convenience wrapper # around it called [`TaskLocalValue`](https://github.com/vchuravy/TaskLocalValues.jl). # This allows us to express the idea from above in few lines of code: using OhMyThreads: TaskLocalValue function matmulsums_tlv(As, Bs; kwargs...) N = size(first(As), 1) tlv = TaskLocalValue{Matrix{Float64}}(() -> Matrix{Float64}(undef, N, N)) tmap(As, Bs; kwargs...) 
do A, B C = tlv[] mul!(C, A, B) sum(C) end end res_tlv = matmulsums_tlv(As, Bs) res ≈ res_tlv # Here, `TaskLocalValue{Matrix{Float64}}(() -> Matrix{Float64}(undef, N, N))` creates a # task-local value - essentially a reference to a value in the task-local storage - that # behaves like this: The first time the task-local value is accessed from a task (`tlv[]`) # it is initialized according to the provided anonymous function. Afterwards, every # following query (from the same task!) will simply lookup and return the task-local value. # This solves our issues above and leads to $O(\textrm{parallel tasks})$ # (instead of $O(\textrm{iterations})$) allocations. # # Note that if you use our `@tasks` macro API, there is built-in support for task-local # values via `@local`. # using OhMyThreads: @tasks function matmulsums_tlv_macro(As, Bs; kwargs...) N = size(first(As), 1) @tasks for i in eachindex(As, Bs) @set collect = true @local C = Matrix{Float64}(undef, N, N) mul!(C, As[i], Bs[i]) sum(C) end end res_tlv_macro = matmulsums_tlv_macro(As, Bs) res ≈ res_tlv_macro # Here, `@local` expands to a pattern similar to the `TaskLocalValue` one above, although it automatically # infers that the object's type is `Matrix{Float64}`, and it carries some optimizations (see # [`OhMyThreads.WithTaskLocals`](@ref)) which can make accessing task local values more efficient in # loops which take on the order of 100ns to complete. # # # ### Benchmark # # The whole point of parallelization is increasing performance, so let's benchmark and # compare the performance of the variants that we've discussed so far.
using BenchmarkTools @show nthreads() @btime matmulsums($As, $Bs); sleep(2) #hide @btime matmulsums_naive($As, $Bs); sleep(2) #hide @btime matmulsums_manual($As, $Bs); sleep(2) #hide @btime matmulsums_tlv($As, $Bs); sleep(2) #hide @btime matmulsums_tlv_macro($As, $Bs); # As we can see, `matmulsums_tlv` (and `matmulsums_tlv_macro`) isn't only convenient # but also efficient: It allocates much less memory than `matmulsums_naive` and is about on # par with the manual implementation. # # # ## Per-thread allocation # # The task-local solution above has one potential caveat: If we spawn many parallel tasks # (e.g. for load-balancing reasons) we need just as many task-local buffers. This can # clearly be suboptimal because only `nthreads()` tasks can run simultaneously. Hence, one # buffer per thread should actually suffice. # Of course, this raises the question of how to organize a pool of "per-thread" buffers # such that each running task always has exclusive (temporary) access to a buffer (we need # to make sure to avoid races). # # ### The naive (and incorrect) approach # A naive approach to implementing this idea is to pre-allocate an array of buffers # and then to use the `threadid()` to select a buffer for a running task. # using Base.Threads: threadid function matmulsums_perthread_incorrect(As, Bs) N = size(first(As), 1) Cs = [Matrix{Float64}(undef, N, N) for _ in 1:nthreads()] tmap(As, Bs) do A, B C = Cs[threadid()] mul!(C, A, B) sum(C) end end; # This approach is [**wrong**](https://julialang.org/blog/2023/07/PSA-dont-use-threadid/). The first issue is that `threadid()` # doesn't necessarily start at 1 (and thus might return a value `> nthreads()`), in which # case `Cs[threadid()]` would be an out-of-bounds access attempt. This might be surprising # but is a simple consequence of the ordering of different kinds of Julia threads: If Julia # is started with a non-zero number of interactive threads, e.g. 
`--threads 5,2`, the # interactive threads come first (look at `Threads.threadpool.(1:Threads.maxthreadid())`). # [Starting in julia v1.12, julia will launch with at least one interactive thread](https://github.com/JuliaLang/julia/pull/57087), # and so the above code will error by default. # # But even if we account for this offset there is another, more fundamental problem, namely # **task-migration**. By default, all spawned parallel tasks are "non-sticky" and can # dynamically migrate between different Julia threads (loosely speaking, at any point in time). # This means nothing other than that **`threadid()` is not necessarily constant for a task**! # For example, imagine that task A starts on thread 4, loads the # buffer `Cs[4]`, but then gets paused, migrated, and continues execution on, say, thread 5. # Afterwards, while task A is performing `mul!(Cs[4], ...)`, a different task B might start on # (the now available) thread 4 and also read and use `Cs[4]`. This would lead to a race # condition because both tasks are mutating the same buffer. # (Note that, in practice, this - most likely 😉 - doesn't happen for the very simple example # above, but you can't rely on it!) # # ### The quick (and non-recommended) fix # # A simple solution for the task-migration issue is to opt-out of dynamic scheduling with # `scheduler=:static` (or `scheduler=StaticScheduler()`). This scheduler statically # assigns tasks to threads upfront without any dynamic rescheduling # (the tasks are sticky and won't migrate). # # We'll also need to switch from `nthreads` to `maxthreadid`, since that can be greater than # `nthreads`, as described above. # num_to_store() = isdefined(Threads, :maxthreadid) ? Threads.maxthreadid() : Threads.nthreads() function matmulsums_perthread_static(As, Bs) N = size(first(As), 1) Cs = [Matrix{Float64}(undef, N, N) for _ in 1:num_to_store()] ## Note!!! ## This code is *incorrect* if used with a non-static scheduler.
this ## isn't just true in OhMyThreads but also applies to `Threads.@threads` ## You *must* use `Threads.@threads :static` or `scheduler = :static` to ## avoid race-conditions caused by task migration. tmap(As, Bs; scheduler = :static) do A, B C = Cs[threadid()] mul!(C, A, B) sum(C) end end ## non uniform workload As_nu = [rand(256, isqrt(i)^2) for i in 1:768]; Bs_nu = [rand(isqrt(i)^2, 256) for i in 1:768]; res_nu = matmulsums(As_nu, Bs_nu); res_pt_static = matmulsums_perthread_static(As_nu, Bs_nu) res_nu ≈ res_pt_static # However, this approach has serious shortcomings. # # 1. It can easily be broken if someone doesn't know that the `scheduler = :static` # option is required for correctness, and removes it in a refactor. # 2. It makes the parallel code non-composable: If we call other multithreaded functions # within the `tmap` or if our parallel `matmulsums_perthread_static` itself gets called # from another parallel region we will likely oversubscribe the Julia threads and get subpar # performance. # 3. It can waste memory by creating too many temporary storage slots since `maxthreadid()` # can give an over-estimate of the number of slots needed for the computation. # # While the above pattern might be the easiest to migrate to from the incorrect pattern, # we do not recommend it. We instead urge you to use task-local-storages, or the `Channel` # based techniques described below: # # ### The safe way: `Channel` # # Instead of storing the pre-allocated buffers in an array, we can put them into a `Channel` # which internally ensures that parallel access is safe. In this scenario, we simply `take!` # a buffer from the channel whenever we need it and `put!` it back after our computation is # done. # function matmulsums_perthread_channel(As, Bs; nbuffers = nthreads(), kwargs...) N = size(first(As), 1) chnl = Channel{Matrix{Float64}}(nbuffers) foreach(1:nbuffers) do _ put!(chnl, Matrix{Float64}(undef, N, N)) end tmap(As, Bs; kwargs...) 
do A, B C = take!(chnl) mul!(C, A, B) result = sum(C) put!(chnl, C) result end end res_pt_channel = matmulsums_perthread_channel(As_nu, Bs_nu) res_nu ≈ res_pt_channel # # ### Benchmark # # Let's benchmark the variants above and compare them to the task-local implementation. # We want to look at both `ntasks = nthreads()` and `ntasks > nthreads()`, the latter # of which gives us dynamic load balancing. # ## no load balancing because ntasks == nthreads() @btime matmulsums_tlv($As_nu, $Bs_nu); @btime matmulsums_perthread_static($As_nu, $Bs_nu); @btime matmulsums_perthread_channel($As_nu, $Bs_nu); ## load balancing because ntasks > nthreads() @btime matmulsums_tlv($As_nu, $Bs_nu; ntasks = 2 * nthreads()); @btime matmulsums_perthread_channel($As_nu, $Bs_nu; ntasks = 2 * nthreads()); @btime matmulsums_tlv($As_nu, $Bs_nu; ntasks = 10 * nthreads()); @btime matmulsums_perthread_channel($As_nu, $Bs_nu; ntasks = 10 * nthreads()); # # Note that the runtime of `matmulsums_perthread_channel` improves with increasing number # of chunks/tasks (due to load balancing) while the amount of allocated memory doesn't # increase much. Contrast this with the drastic memory increase with `matmulsums_tlv`. # # ### Another safe way based on `Channel` # # Above, we chose to put a limited number of buffers (e.g. `nthreads()`) into the channel # and then spawn many tasks (one per input element). Sometimes it can make sense to flip # things around and put the (many) input elements into a channel and only spawn # a limited number of tasks (e.g. `nthreads()`) with task-local buffers. 
# using OhMyThreads: tmapreduce function matmulsums_perthread_channel_flipped(As, Bs; ntasks = nthreads()) N = size(first(As), 1) chnl = Channel{Int}(length(As); spawn = true) do chnl for i in 1:length(As) put!(chnl, i) end end tmapreduce(vcat, 1:ntasks; chunking=false) do _ # we turn chunking off local C = Matrix{Float64}(undef, N, N) map(chnl) do i # implicitly takes the values from the channel (parallel safe) A = As[i] B = Bs[i] mul!(C, A, B) sum(C) end end end; # Note that one caveat of this approach is that the input → task assignment, and thus the # order of the output, is **non-deterministic**. For this reason, we sort the output to check # for correctness. res_channel_flipped = matmulsums_perthread_channel_flipped(As_nu, Bs_nu) sort(res_nu) ≈ sort(res_channel_flipped) # Quick benchmark: @btime matmulsums_perthread_channel_flipped($As_nu, $Bs_nu); @btime matmulsums_perthread_channel_flipped($As_nu, $Bs_nu; ntasks = 2 * nthreads()); @btime matmulsums_perthread_channel_flipped($As_nu, $Bs_nu; ntasks = 10 * nthreads()); # In addition, OhMyThreads provides an iterator-wrapper type # [`OhMyThreads.ChannelLike`](@ref) which can be used in place of a `Channel`. If # the number of elements is large this can be more efficient since there is no # need to copy the elements into the `Channel`. Concretely, in the example above, # we could replace `Channel() do .. end` with # `OhMyThreads.ChannelLike(1:length(As))`. # ### Bumper.jl (only for the brave) # # If you are bold and want to cut down temporary allocations even more you can # give [Bumper.jl](https://github.com/MasonProtter/Bumper.jl) a try. Essentially, it # allows you to *bring your own stacks*, that is, task-local bump allocators which you can # dynamically allocate memory to, and reset them at the end of a code block, just like # Julia's stack. # Be warned though that Bumper.jl is (1) a rather young package with (likely) some bugs # and (2) can easily lead to segfaults when used incorrectly. 
If you can live with the # risk, Bumper.jl is especially useful for cases where we don't know ahead of time how large # a matrix to pre-allocate, and even more useful if we want to do many intermediate # allocations on the task, not just one. For our example, this isn't the case but let's # nonetheless see how one would use Bumper.jl here. using Bumper function matmulsums_bumper(As, Bs) tmap(As, Bs) do A, B @no_escape begin # promising that no memory will escape N = size(A, 1) C = @alloc(Float64, N, N) # from bump allocator (fake "stack") mul!(C, A, B) sum(C) end end end res_bumper = matmulsums_bumper(As, Bs); sort(res) ≈ sort(res_bumper) @btime matmulsums_bumper($As, $Bs); # Note that the benchmark is lying here about the total memory allocation, # because it doesn't show the allocation of the task-local bump allocators themselves # (the reason is that `SlabBuffer` uses `malloc` directly). ================================================ FILE: docs/src/literate/tls/tls.md ================================================ ```@meta EditURL = "tls.jl" ``` # [Thread-Safe Storage](@id TSS) For some programs, it can be useful or even necessary to allocate and (re-)use memory in your parallel code (e.g. your computation might require temporary buffers). The following section demonstrates common issues that can arise in such a scenario and, by means of a simple example, explains techniques to handle such cases safely. Specifically, we'll discuss (1) how task-local storage (TLS) can be used efficiently and (2) how channels can be used to organize per-task buffer allocation in a thread-safe manner. ## Test case (sequential) Let's say that we are given two arrays of matrices, `As` and `Bs`, and let's further assume that our goal is to compute the total sum of all pairwise matrix products. We can readily implement a (sequential) function that performs the necessary computations.
````julia using LinearAlgebra: mul!, BLAS BLAS.set_num_threads(1) # for simplicity, we turn off OpenBLAS multithreading function matmulsums(As, Bs) N = size(first(As), 1) C = Matrix{Float64}(undef, N, N) map(As, Bs) do A, B mul!(C, A, B) sum(C) end end ```` ```` matmulsums (generic function with 1 method) ```` Here, we use `map` to perform the desired operation for each pair of matrices, `A` and `B`. However, the crucial point for our discussion is that we want to use the in-place matrix multiplication `LinearAlgebra.mul!` in conjunction with a pre-allocated temporary buffer, the output matrix `C`. This is to avoid the temporary allocation per "iteration" (i.e. per matrix pair) that we would get with `C = A*B`. For later comparison, we generate some random input data and store the result. ````julia As = [rand(256, 16) for _ in 1:768] Bs = [rand(16, 256) for _ in 1:768] res = matmulsums(As, Bs); ```` ## How to not parallelize The key idea for creating a parallel version of `matmulsums` is to replace the `map` by OhMyThreads' parallel [`tmap`](@ref) function. However, because we re-use `C`, this isn't entirely trivial. Someone new to parallel computing might be tempted to parallelize `matmulsums` like this: ````julia using OhMyThreads: tmap function matmulsums_race(As, Bs) N = size(first(As), 1) C = Matrix{Float64}(undef, N, N) tmap(As, Bs) do A, B mul!(C, A, B) sum(C) end end ```` ```` matmulsums_race (generic function with 1 method) ```` Unfortunately, this doesn't produce the correct result. ````julia res_race = matmulsums_race(As, Bs) res ≈ res_race ```` ```` false ```` In fact, it doesn't even always produce the same result (check for yourself)! The reason is that there is a race condition: different parallel tasks are trying to use the shared variable `C` simultaneously leading to non-deterministic behavior. Let's see how we can fix this. 
### The naive (and inefficient) fix A simple solution for the race condition issue above is to move the allocation of `C` into the body of the parallel `tmap`: ````julia function matmulsums_naive(As, Bs) N = size(first(As), 1) tmap(As, Bs) do A, B C = Matrix{Float64}(undef, N, N) mul!(C, A, B) sum(C) end end ```` ```` matmulsums_naive (generic function with 1 method) ```` In this case, a separate `C` will be allocated for each iteration such that parallel tasks no longer mutate shared state. Hence, we'll get the desired result. ````julia res_naive = matmulsums_naive(As, Bs) res ≈ res_naive ```` ```` true ```` However, this variant is obviously inefficient because it is no better than just writing `C = A*B` and thus leads to one allocation per matrix pair. We need a different way of allocating and re-using `C` for an efficient parallel version. ## [Task-local storage](@id TLS) ### The manual (and cumbersome) way We've seen that we can't allocate `C` once up-front (→ race condition) and also shouldn't allocate it within the `tmap` (→ one allocation per iteration). Instead, we can assign a separate "C" on each parallel task once and then use this task-local "C" for all iterations (i.e. matrix pairs) for which this task is responsible. Before we learn how to do this more conveniently, let's implement this idea of a task-local temporary buffer (for each parallel task) manually. ````julia using OhMyThreads: index_chunks, @spawn using Base.Threads: nthreads function matmulsums_manual(As, Bs) N = size(first(As), 1) tasks = map(index_chunks(As; n = 2 * nthreads())) do idcs @spawn begin local C = Matrix{Float64}(undef, N, N) map(idcs) do i A = As[i] B = Bs[i] mul!(C, A, B) sum(C) end end end mapreduce(fetch, vcat, tasks) end res_manual = matmulsums_manual(As, Bs) res ≈ res_manual ```` ```` true ```` We note that this is rather cumbersome and you might not want to write it (repeatedly). But let's take a closer look and see what's happening here. 
First, we divide the number of matrix pairs into `2 * nthreads()` chunks. Then, for each of those chunks, we spawn a parallel task that (1) allocates a task-local `C` matrix (and a `results` vector) and (2) performs the actual computations using these pre-allocated buffers. Finally, we `fetch` the results of the tasks and combine them. This variant works just fine and the good news is that we can get the same behavior with less manual work. ### [The shortcut: `TaskLocalValue`](@id TLV) The desire for task-local storage is quite natural with task-based multithreading. For this reason, Julia supports this out of the box with [`Base.task_local_storage`](https://docs.julialang.org/en/v1/base/parallel/#Base.task_local_storage-Tuple{Any}). But instead of using this directly (which you could), we will use a convenience wrapper around it called [`TaskLocalValue`](https://github.com/vchuravy/TaskLocalValues.jl). This allows us to express the idea from above in few lines of code: ````julia using OhMyThreads: TaskLocalValue function matmulsums_tlv(As, Bs; kwargs...) N = size(first(As), 1) tlv = TaskLocalValue{Matrix{Float64}}(() -> Matrix{Float64}(undef, N, N)) tmap(As, Bs; kwargs...) do A, B C = tlv[] mul!(C, A, B) sum(C) end end res_tlv = matmulsums_tlv(As, Bs) res ≈ res_tlv ```` ```` true ```` Here, `TaskLocalValue{Matrix{Float64}}(() -> Matrix{Float64}(undef, N, N))` creates a task-local value - essentially a reference to a value in the task-local storage - that behaves like this: The first time the task-local value is accessed from a task (`tlv[]`) it is initialized according to the provided anonymous function. Afterwards, every following query (from the same task!) will simply lookup and return the task-local value. This solves our issues above and leads to $O(\textrm{parallel tasks})$ (instead of $O(\textrm{iterations})$) allocations. Note that if you use our `@tasks` macro API, there is built-in support for task-local values via `@local`.
````julia using OhMyThreads: @tasks function matmulsums_tlv_macro(As, Bs; kwargs...) N = size(first(As), 1) @tasks for i in eachindex(As, Bs) @set collect = true @local C = Matrix{Float64}(undef, N, N) mul!(C, As[i], Bs[i]) sum(C) end end res_tlv_macro = matmulsums_tlv_macro(As, Bs) res ≈ res_tlv_macro ```` ```` true ```` Here, `@local` expands to a pattern similar to the `TaskLocalValue` one above, although automatically infers that the object's type is `Matrix{Float64}`, and it carries some optimizations (see [`OhMyThreads.WithTaskLocals`](@ref)) which can make accessing task local values more efficient in loops which take on the order of 100ns to complete. ### Benchmark The whole point of parallelization is increasing performance, so let's benchmark and compare the performance of the variants that we've discussed so far. ````julia using BenchmarkTools @show nthreads() @btime matmulsums($As, $Bs); @btime matmulsums_naive($As, $Bs); @btime matmulsums_manual($As, $Bs); @btime matmulsums_tlv($As, $Bs); @btime matmulsums_tlv_macro($As, $Bs); ```` ```` nthreads() = 6 50.439 ms (6 allocations: 518.14 KiB) 39.387 ms (2467 allocations: 384.09 MiB) 9.743 ms (165 allocations: 6.05 MiB) 9.749 ms (962 allocations: 3.05 MiB) 9.859 ms (199 allocations: 3.04 MiB) ```` As we can see, `matmulsums_tlv` (and `matmulsums_tlv_macro`) isn't only convenient but also efficient: It allocates much less memory than `matmulsums_naive` and is about on par with the manual implementation. ## Per-thread allocation The task-local solution above has one potential caveat: If we spawn many parallel tasks (e.g. for load-balancing reasons) we need just as many task-local buffers. This can clearly be suboptimal because only `nthreads()` tasks can run simultaneously. Hence, one buffer per thread should actually suffice. 
Of course, this raises the question of how to organize a pool of "per-thread" buffers such that each running task always has exclusive (temporary) access to a buffer (we need to make sure to avoid races). ### The naive (and incorrect) approach A naive approach to implementing this idea is to pre-allocate an array of buffers and then to use the `threadid()` to select a buffer for a running task. ````julia using Base.Threads: threadid function matmulsums_perthread_incorrect(As, Bs) N = size(first(As), 1) Cs = [Matrix{Float64}(undef, N, N) for _ in 1:nthreads()] tmap(As, Bs) do A, B C = Cs[threadid()] mul!(C, A, B) sum(C) end end; ```` This approach is [**wrong**](https://julialang.org/blog/2023/07/PSA-dont-use-threadid/). The first issue is that `threadid()` doesn't necessarily start at 1 (and thus might return a value `> nthreads()`), in which case `Cs[threadid()]` would be an out-of-bounds access attempt. This might be surprising but is a simple consequence of the ordering of different kinds of Julia threads: If Julia is started with a non-zero number of interactive threads, e.g. `--threads 5,2`, the interactive threads come first (look at `Threads.threadpool.(1:Threads.maxthreadid())`). [Starting in julia v1.12, julia will launch with at least one interactive thread](https://github.com/JuliaLang/julia/pull/57087), and so the above code will error by default. But even if we account for this offset there is another, more fundamental problem, namely **task-migration**. By default, all spawned parallel tasks are "non-sticky" and can dynamically migrate between different Julia threads (loosely speaking, at any point in time). This means nothing other than that **`threadid()` is not necessarily constant for a task**! For example, imagine that task A starts on thread 4, loads the buffer `Cs[4]`, but then gets paused, migrated, and continues execution on, say, thread 5.
Afterwards, while task A is performing `mul!(Cs[4], ...)`, a different task B might start on (the now available) thread 4 and also read and use `Cs[4]`. This would lead to a race condition because both tasks are mutating the same buffer. (Note that, in practice, this - most likely 😉 - doesn't happen for the very simple example above, but you can't rely on it!) ### The quick (and non-recommended) fix A simple solution for the task-migration issue is to opt-out of dynamic scheduling with `scheduler=:static` (or `scheduler=StaticScheduler()`). This scheduler statically assigns tasks to threads upfront without any dynamic rescheduling (the tasks are sticky and won't migrate). We'll also need to switch from `nthreads` to `maxthreadid`, since that can be greater than `nthreads`, as described above. ````julia num_to_store() = isdefined(Threads, :maxthreadid) ? Threads.maxthreadid() : Threads.nthreads() function matmulsums_perthread_static(As, Bs) N = size(first(As), 1) Cs = [Matrix{Float64}(undef, N, N) for _ in 1:num_to_store()] # Note!!! # This code is *incorrect* if used with a non-static scheduler. this # isn't just true in OhMyThreads but also applies to `Threads.@threads` # You *must* use `Threads.@threads :static` or `scheduler = :static` to # avoid race-conditions caused by task migration. tmap(As, Bs; scheduler = :static) do A, B C = Cs[threadid()] mul!(C, A, B) sum(C) end end # non uniform workload As_nu = [rand(256, isqrt(i)^2) for i in 1:768]; Bs_nu = [rand(isqrt(i)^2, 256) for i in 1:768]; res_nu = matmulsums(As_nu, Bs_nu); res_pt_static = matmulsums_perthread_static(As_nu, Bs_nu) res_nu ≈ res_pt_static ```` ```` true ```` However, this approach has serious shortcomings. 1. It can easily be broken if someone doesn't know that the `scheduler = :static` option is required for correctness, and removes it in a refactor. 2. 
It makes the parallel code non-composable: If we call other multithreaded functions within the `tmap` or if our parallel `matmulsums_perthread_static` itself gets called from another parallel region we will likely oversubscribe the Julia threads and get subpar performance. 3. It can waste memory by creating too many temporary storage slots since `maxthreadid()` can give an over-estimate of the number of slots needed for the computation. While the above pattern might be the easiest to migrate to from the incorrect pattern, we do not recommend it. We instead urge you to use task-local-storages, or the `Channel` based techniques described below: ### The safe way: `Channel` Instead of storing the pre-allocated buffers in an array, we can put them into a `Channel` which internally ensures that parallel access is safe. In this scenario, we simply `take!` a buffer from the channel whenever we need it and `put!` it back after our computation is done. ````julia function matmulsums_perthread_channel(As, Bs; nbuffers = nthreads(), kwargs...) N = size(first(As), 1) chnl = Channel{Matrix{Float64}}(nbuffers) foreach(1:nbuffers) do _ put!(chnl, Matrix{Float64}(undef, N, N)) end tmap(As, Bs; kwargs...) do A, B C = take!(chnl) mul!(C, A, B) result = sum(C) put!(chnl, C) result end end res_pt_channel = matmulsums_perthread_channel(As_nu, Bs_nu) res_nu ≈ res_pt_channel ```` ```` true ```` ### Benchmark Let's benchmark the variants above and compare them to the task-local implementation. We want to look at both `ntasks = nthreads()` and `ntasks > nthreads()`, the latter of which gives us dynamic load balancing. 
````julia # no load balancing because ntasks == nthreads() @btime matmulsums_tlv($As_nu, $Bs_nu); @btime matmulsums_perthread_static($As_nu, $Bs_nu); @btime matmulsums_perthread_channel($As_nu, $Bs_nu); # load balancing because ntasks > nthreads() @btime matmulsums_tlv($As_nu, $Bs_nu; ntasks = 2 * nthreads()); @btime matmulsums_perthread_channel($As_nu, $Bs_nu; ntasks = 2 * nthreads()); @btime matmulsums_tlv($As_nu, $Bs_nu; ntasks = 10 * nthreads()); @btime matmulsums_perthread_channel($As_nu, $Bs_nu; ntasks = 10 * nthreads()); ```` ```` 212.200 ms (962 allocations: 3.05 MiB) 212.014 ms (191 allocations: 4.04 MiB) 211.336 ms (190 allocations: 3.04 MiB) 168.835 ms (1136 allocations: 6.05 MiB) 169.097 ms (334 allocations: 3.04 MiB) 130.469 ms (2530 allocations: 30.17 MiB) 131.037 ms (1487 allocations: 3.14 MiB) ```` Note that the runtime of `matmulsums_perthread_channel` improves with increasing number of chunks/tasks (due to load balancing) while the amount of allocated memory doesn't increase much. Contrast this with the drastic memory increase with `matmulsums_tlv`. ### Another safe way based on `Channel` Above, we chose to put a limited number of buffers (e.g. `nthreads()`) into the channel and then spawn many tasks (one per input element). Sometimes it can make sense to flip things around and put the (many) input elements into a channel and only spawn a limited number of tasks (e.g. `nthreads()`) with task-local buffers. 
````julia using OhMyThreads: tmapreduce function matmulsums_perthread_channel_flipped(As, Bs; ntasks = nthreads()) N = size(first(As), 1) chnl = Channel{Int}(length(As); spawn = true) do chnl for i in 1:length(As) put!(chnl, i) end end tmapreduce(vcat, 1:ntasks; chunking=false) do _ # we turn chunking off local C = Matrix{Float64}(undef, N, N) map(chnl) do i # implicitly takes the values from the channel (parallel safe) A = As[i] B = Bs[i] mul!(C, A, B) sum(C) end end end; ```` Note that one caveat of this approach is that the input → task assignment, and thus the order of the output, is **non-deterministic**. For this reason, we sort the output to check for correctness. ````julia res_channel_flipped = matmulsums_perthread_channel_flipped(As_nu, Bs_nu) sort(res_nu) ≈ sort(res_channel_flipped) ```` ```` true ```` Quick benchmark: ````julia @btime matmulsums_perthread_channel_flipped($As_nu, $Bs_nu); @btime matmulsums_perthread_channel_flipped($As_nu, $Bs_nu; ntasks = 2 * nthreads()); @btime matmulsums_perthread_channel_flipped($As_nu, $Bs_nu; ntasks = 10 * nthreads()); ```` ```` 137.431 ms (133 allocations: 3.04 MiB) 126.854 ms (211 allocations: 6.06 MiB) 127.647 ms (836 allocations: 30.29 MiB) ```` In addition, OhMyThreads provides an iterator-wrapper type [`OhMyThreads.ChannelLike`](@ref) which can be used in place of a `Channel`. If the number of elements is large this can be more efficient since there is no need to copy the elements into the `Channel`. Concretely, in the example above, we could replace `Channel() do .. end` with `OhMyThreads.ChannelLike(1:length(As))`. ### Bumper.jl (only for the brave) If you are bold and want to cut down temporary allocations even more you can give [Bumper.jl](https://github.com/MasonProtter/Bumper.jl) a try. Essentially, it allows you to *bring your own stacks*, that is, task-local bump allocators which you can dynamically allocate memory to, and reset them at the end of a code block, just like Julia's stack. 
Be warned though that Bumper.jl (1) is a rather young package with (likely) some bugs and (2) can easily lead to segfaults when used incorrectly. If you can live with the risk, Bumper.jl is especially useful for cases where we don't know ahead of time how large a matrix to pre-allocate, and even more useful if we want to do many intermediate allocations on the task, not just one. For our example, this isn't the case but let's nonetheless see how one would use Bumper.jl here. ````julia using Bumper function matmulsums_bumper(As, Bs) tmap(As, Bs) do A, B @no_escape begin # promising that no memory will escape N = size(A, 1) C = @alloc(Float64, N, N) # from bump allocator (fake "stack") mul!(C, A, B) sum(C) end end end res_bumper = matmulsums_bumper(As, Bs); sort(res) ≈ sort(res_bumper) @btime matmulsums_bumper($As, $Bs); ```` ```` 9.439 ms (198 allocations: 39.25 KiB) ```` Note that the benchmark is lying here about the total memory allocation, because it doesn't show the allocation of the task-local bump allocators themselves (the reason is that `SlabBuffer` uses `malloc` directly). --- *This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*
================================================ FILE: docs/src/literate/tomarkdown.sh ================================================
#!/usr/bin/env sh
#=
julia --project -t 10 $0 $@
exit
# =#
# The header above is a sh/Julia polyglot: the shell sees comment lines plus the
# `julia ...` and `exit` commands, while Julia sees one `#= ... =#` block comment.

const reporoot = joinpath(@__DIR__, "../../..")
const repourl = "https://github.com/JuliaFolds2/OhMyThreads.jl/blob/main/docs"

using Literate
using Pkg

# Select the example directories to compile: either every directory in the current
# working directory ("all") or exactly the ones named on the command line.
if length(ARGS) == 0
    println("Error: Please provide the names of the folders that you want to compile to markdown. " *
            "Alternatively, you can pass \"all\" as the first argument to compile them all.")
    # Usage error: report failure to the calling shell instead of exiting with status 0.
    exit(1)
else
    if first(ARGS) == "all"
        dirs = filter(isdir, readdir())
    else
        dirs = ARGS
    end
end

@show dirs

# Each directory carries its own Project.toml: activate and instantiate it, then run
# every `.jl` file in it through Literate (executing the code) to produce markdown.
for d in dirs
    println("directory: ", d)
    cd(d) do
        Pkg.activate(".")
        Pkg.resolve()
        Pkg.instantiate()

        jlfiles = filter(endswith(".jl"), readdir())
        for f in jlfiles
            Literate.markdown(
                f,
                repo_root_url = repourl,
                execute=true;
                # config=Dict("image_formats" => [(MIME"image/png", ".png")])
            )
        end
    end
end
================================================ FILE: docs/src/refs/api.md ================================================
```@meta CollapsedDocStrings = true ``` # [Public API](@id API) ## Exported ### Macros ```@docs @tasks @set @local @only_one @one_by_one @allow_boxed_captures @disallow_boxed_captures @localize ``` ### Functions ```@docs tmapreduce treduce tmap tmap! tforeach tcollect treducemap ``` ### Schedulers ```@docs Scheduler DynamicScheduler StaticScheduler GreedyScheduler SerialScheduler ``` ## Re-exported | | | |------------------------|---------------------------------------------------------------------| | `OhMyThreads.chunks` | see [`ChunkSplitters.chunks`](@extref) | | `OhMyThreads.index_chunks` | see [`ChunkSplitters.index_chunks`](@extref) | ## Public but not exported | | | |------------------------|---------------------------------------------------------------------| | `OhMyThreads.@spawn` | see [`StableTasks.@spawn`](https://github.com/JuliaFolds2/StableTasks.jl) | | `OhMyThreads.@spawnat` | see [`StableTasks.@spawnat`](https://github.com/JuliaFolds2/StableTasks.jl) | | `OhMyThreads.@fetch` | see [`StableTasks.@fetch`](https://github.com/JuliaFolds2/StableTasks.jl) | | `OhMyThreads.@fetchfrom` | see [`StableTasks.@fetchfrom`](https://github.com/JuliaFolds2/StableTasks.jl) | | `OhMyThreads.TaskLocalValue` | see [TaskLocalValues.TaskLocalValue](https://github.com/vchuravy/TaskLocalValues.jl) | | `OhMyThreads.Split` | see [`ChunkSplitters.Split`](@extref) | |
`OhMyThreads.Consecutive` | see [`ChunkSplitters.Consecutive`](@extref) | | `OhMyThreads.RoundRobin` | see [`ChunkSplitters.RoundRobin`](@extref) | ```@docs OhMyThreads.WithTaskLocals OhMyThreads.promise_task_local OhMyThreads.ChannelLike ``` ================================================ FILE: docs/src/refs/experimental.md ================================================ ```@meta CollapsedDocStrings = true ``` # Experimental !!! warning **Everything on this page is experimental and might be changed or dropped at any point!** ## References ```@autodocs Modules = [OhMyThreads, OhMyThreads.Experimental] Public = false Pages = ["OhMyThreads.jl", "experimental.jl"] ``` ================================================ FILE: docs/src/refs/internal.md ================================================ ```@meta CollapsedDocStrings = true ``` # Internal !!! warning **Everything on this page is internal and might be changed or dropped at any point!** ## References ```@autodocs Modules = [OhMyThreads, OhMyThreads.Tools] Public = false Pages = ["OhMyThreads.jl", "tools.jl"] ``` ================================================ FILE: docs/src/translation.md ================================================ # [Translation Guide](@id TG) This page tries to give a general overview of how to translate patterns written with the built-in tools of [Base.Threads](https://docs.julialang.org/en/v1/base/multi-threading/) using the [OhMyThreads.jl API](@ref API). Note that this should be seen as a rough guide and (intentionally) isn't supposed to replace a systematic introduction to OhMyThreads.jl.
## Basics ### `@threads` ```julia # Base.Threads using Base.Threads: @threads @threads for i in 1:10 println(i) end ``` ```julia # OhMyThreads using OhMyThreads: @tasks @tasks for i in 1:10 println(i) end # or using OhMyThreads: tforeach tforeach(1:10) do i println(i) end ``` #### `:static` scheduling ```julia # Base.Threads using Base.Threads: @threads @threads :static for i in 1:10 println(i) end ``` ```julia # OhMyThreads using OhMyThreads: @tasks @tasks for i in 1:10 @set scheduler=:static println(i) end # or using OhMyThreads: tforeach tforeach(1:10; scheduler=:static) do i println(i) end ``` ### `@spawn` ```julia # Base.Threads using Base.Threads: @spawn @sync for i in 1:10 @spawn println(i) end ``` ```julia # OhMyThreads using OhMyThreads: @tasks @tasks for i in 1:10 @set chunking=false println(i) end # or using OhMyThreads: tforeach tforeach(1:10; chunking=false) do i println(i) end # or using OhMyThreads: @spawn @sync for i in 1:10 @spawn println(i) end ``` ## Reduction No built-in feature in Base.Threads. ```julia # Base.Threads: basic manual implementation using Base.Threads: @spawn data = rand(10) chunks_itr = Iterators.partition(data, length(data) ÷ nthreads()) tasks = map(chunks_itr) do chunk @spawn reduce(+, chunk) end reduce(+, fetch.(tasks)) ``` ```julia # OhMyThreads using OhMyThreads: @tasks data = rand(10) @tasks for x in data @set reducer=+ end # or using OhMyThreads: treduce treduce(+, data) ``` ## Mutation !!! warning Parallel mutation of non-local state, like writing to a shared array, can be the source of correctness errors (e.g. race conditions) and big performance issues (e.g. [false sharing](https://en.wikipedia.org/wiki/False_sharing#:~:text=False%20sharing%20is%20an%20inherent,is%20limited%20to%20RAM%20caches.)). You should carefully consider whether this is necessary or whether the use of [thread-safe storage](@ref TSS) is the better option. 
**We don't recommend using the examples in this section for anything serious!** ```julia # Base.Threads using Base.Threads: @threads data = rand(10) @threads for i in eachindex(data) data[i] = calc(i) end ``` ```julia # OhMyThreads using OhMyThreads: @tasks data = rand(10) @tasks for i in eachindex(data) data[i] = calc(i) end # or using OhMyThreads: tforeach tforeach(eachindex(data)) do i data[i] = calc(i) end # or using OhMyThreads: tmap! tmap!(data, eachindex(data)) do i calc(i) end ``` ## Parallel initialization !!! warning Parallel mutation of non-local state, like writing to a shared array, can be the source of correctness errors (e.g. race conditions) and big performance issues (e.g. [false sharing](https://en.wikipedia.org/wiki/False_sharing#:~:text=False%20sharing%20is%20an%20inherent,is%20limited%20to%20RAM%20caches.)). You should carefully consider whether this is necessary or whether the use of [thread-safe storage](@ref TSS) is the better option. **We don't recommend using the examples in this section for anything serious!** ```julia # Base.Threads using Base.Threads: @threads data = Vector{Float64}(undef, 10) @threads for i in eachindex(data) data[i] = calc(i) end ``` ```julia # OhMyThreads using OhMyThreads: @tasks data = @tasks for i in 1:10 @set collect=true calc(i) end # or using OhMyThreads: tmap data = tmap(i->calc(i), 1:10) # or using OhMyThreads: tcollect data = tcollect(calc(i) for i in 1:10) ``` ================================================ FILE: ext/MarkdownExt.jl ================================================ module MarkdownExt using Markdown: Markdown, @md_str, term using OhMyThreads.Implementation: BoxedVariableError function __init__() if isdefined(Base.Experimental, :register_error_hint) Base.Experimental.register_error_hint(BoxedVariableError) do io, bve println(io) println(io) term(io, md""" #### Hint Capturing boxed variables can be not only slow, but also cause surprising and incorrect results. 
* If you meant for these variables to be local to each loop iteration and not depend on a variable from an outer scope, you should mark them as `local` inside the closure. * If you meant to reference a variable from the outer scope, but do not want access to it to be boxed, you can wrap uses of it in a let block, like e.g. ```julia function foo(x, N) rand(Bool) && (x = 1) # This rebinding of x causes it to be boxed ... let x = x # ... Unless we localize it here with the let block @tasks for i in 1:N f(x) end end end ``` * OhMyThreads.jl provides a `@localize` macro that automates the above `let` block, i.e. `@localize x f(x)` is the same as `let x=x; f(x) end` * If these variables are being re-bound inside a `@one_by_one` or `@only_one` block, consider using a mutable `Ref` instead of re-binding the variable. This error can be bypassed with the `@allow_boxed_captures` macro. """) end end end end ================================================ FILE: src/OhMyThreads.jl ================================================ module OhMyThreads using StableTasks: StableTasks for mac in Symbol.(["@spawn", "@spawnat", "@fetch", "@fetchfrom"]) @eval const $mac = getproperty(StableTasks, $(QuoteNode(mac))) end using ChunkSplitters: ChunkSplitters const index_chunks = ChunkSplitters.index_chunks const chunks = ChunkSplitters.chunks const Split = ChunkSplitters.Split const Consecutive = ChunkSplitters.Consecutive const RoundRobin = ChunkSplitters.RoundRobin export chunks, index_chunks using TaskLocalValues: TaskLocalValues const TaskLocalValue = TaskLocalValues.TaskLocalValue using ScopedValues: ScopedValues, ScopedValue, @with include("types.jl") include("functions.jl") include("macros.jl") include("tools.jl") include("schedulers.jl") using .Schedulers: Scheduler, DynamicScheduler, StaticScheduler, GreedyScheduler, SerialScheduler include("implementation.jl") include("experimental.jl") export @tasks, @set, @local, @one_by_one, @only_one, @allow_boxed_captures,
@disallow_boxed_captures, @localize export treduce, tmapreduce, treducemap, tmap, tmap!, tforeach, tcollect export Scheduler, DynamicScheduler, StaticScheduler, GreedyScheduler, SerialScheduler end # module OhMyThreads ================================================ FILE: src/experimental.jl ================================================ module Experimental """ @barrier This can be used inside a `@tasks for ... end` to synchronize `n` parallel tasks. Specifically, a task can only pass the `@barrier` if `n-1` other tasks have reached it as well. The value of `n` is determined from `@set ntasks=...`, which is required if one wants to use `@barrier`. Because this feature is experimental, it is required to load `@barrier` explicitly, e.g. via `using OhMyThreads.Experimental: @barrier`. **WARNING:** It is the responsibility of the user to ensure that the right number of tasks actually reach the barrier. Otherwise, a **deadlock** can occur. In particular, if the number of iterations is not a multiple of `n`, the last few iterations (remainder) will be run by fewer than `n` tasks which will never be able to pass a `@barrier`. ## Example ```julia using OhMyThreads: @tasks using OhMyThreads.Experimental: @barrier # works @tasks for i in 1:20 @set ntasks = 20 sleep(i * 0.2) println(i, ": before") @barrier println(i, ": after") end # wrong - deadlock! @tasks for i in 1:22 # niterations % ntasks != 0 @set ntasks = 20 println(i, ": before") @barrier println(i, ": after") end ``` """ macro barrier(args...) error("The @barrier macro may only be used inside of a @tasks block.") end end # Experimental ================================================ FILE: src/functions.jl ================================================ """ tmapreduce(f, op, A::AbstractArray...; [scheduler::Union{Scheduler, Symbol} = :dynamic], [outputtype::Type = Any], [init]) A multithreaded function like `Base.mapreduce`.
Perform a reduction over `A`, applying a single-argument function `f` to each element, and then combining them with the two-argument function `op`. Note that `op` **must** be an [associative](https://en.wikipedia.org/wiki/Associative_property) function, in the sense that `op(a, op(b, c)) ≈ op(op(a, b), c)`. If `op` is not (approximately) associative, you will get undefined results. ## Example: ``` using OhMyThreads: tmapreduce tmapreduce(√, +, [1, 2, 3, 4, 5]) ``` is the parallelized version of `sum(√, [1, 2, 3, 4, 5])` in the form ``` (√1 + √2) + (√3 + √4) + √5 ``` ## Keyword arguments: - `scheduler::Union{Scheduler, Symbol}` (default `:dynamic`): determines how the computation is divided into parallel tasks and how these are scheduled. See [`Scheduler`](@ref) for more information on the available schedulers. - `outputtype::Type` (default `Any`): will work as the asserted output type of parallel calculations. We use [StableTasks.jl](https://github.com/JuliaFolds2/StableTasks.jl) to make setting this option unnecessary, but if you experience problems with type stability, you may be able to recover it with this keyword argument. - `init`: initial value of the reduction. Will be forwarded to `mapreduce` for the task-local sequential parts of the calculation. In addition, `tmapreduce` accepts **all keyword arguments that are supported by the selected scheduler**. They will simply be passed on to the corresponding `Scheduler` constructor. Example: ``` tmapreduce(√, +, [1, 2, 3, 4, 5]; chunksize=2, scheduler=:static) ``` However, to avoid ambiguity, this is currently **only supported for `scheduler::Symbol`** (but not for `scheduler::Scheduler`). """ function tmapreduce end """ treducemap(op, f, A::AbstractArray...; [scheduler::Union{Scheduler, Symbol} = :dynamic], [outputtype::Type = Any], [init]) Like `tmapreduce` except the order of the `f` and `op` arguments are switched. This is sometimes convenient with `do`-block notation. 
Perform a reduction over `A`, applying a single-argument function `f` to each element, and then combining them with the two-argument function `op`. Note that `op` **must** be an [associative](https://en.wikipedia.org/wiki/Associative_property) function, in the sense that `op(a, op(b, c)) ≈ op(op(a, b), c)`. If `op` is not (approximately) associative, you will get undefined results. ## Example: ``` using OhMyThreads: treducemap treducemap(+, √, [1, 2, 3, 4, 5]) ``` is the parallelized version of `sum(√, [1, 2, 3, 4, 5])` in the form ``` (√1 + √2) + (√3 + √4) + √5 ``` ## Keyword arguments: - `scheduler::Union{Scheduler, Symbol}` (default `:dynamic`): determines how the computation is divided into parallel tasks and how these are scheduled. See [`Scheduler`](@ref) for more information on the available schedulers. - `outputtype::Type` (default `Any`): will work as the asserted output type of parallel calculations. We use [StableTasks.jl](https://github.com/JuliaFolds2/StableTasks.jl) to make setting this option unnecessary, but if you experience problems with type stability, you may be able to recover it with this keyword argument. - `init`: initial value of the reduction. Will be forwarded to `mapreduce` for the task-local sequential parts of the calculation. In addition, `treducemap` accepts **all keyword arguments that are supported by the selected scheduler**. They will simply be passed on to the corresponding `Scheduler` constructor. Example: ``` treducemap(+, √, [1, 2, 3, 4, 5]; chunksize=2, scheduler=:static) ``` However, to avoid ambiguity, this is currently **only supported for `scheduler::Symbol`** (but not for `scheduler::Scheduler`). """ function treducemap end """ treduce(op, A::AbstractArray...; [scheduler::Union{Scheduler, Symbol} = :dynamic], [outputtype::Type = Any], [init]) A multithreaded function like `Base.reduce`. Perform a reduction over `A` using the two-argument function `op`. 
Note that `op` **must** be an [associative](https://en.wikipedia.org/wiki/Associative_property) function, in the sense that `op(a, op(b, c)) ≈ op(op(a, b), c)`. If `op` is not (approximately) associative, you will get undefined results. ## Example: ``` using OhMyThreads: treduce treduce(+, [1, 2, 3, 4, 5]) ``` is the parallelized version of `sum([1, 2, 3, 4, 5])` in the form ``` (1 + 2) + (3 + 4) + 5 ``` ## Keyword arguments: - `scheduler::Union{Scheduler, Symbol}` (default `:dynamic`): determines how the computation is divided into parallel tasks and how these are scheduled. See [`Scheduler`](@ref) for more information on the available schedulers. - `outputtype::Type` (default `Any`): will work as the asserted output type of parallel calculations. We use [StableTasks.jl](https://github.com/JuliaFolds2/StableTasks.jl) to make setting this option unnecessary, but if you experience problems with type stability, you may be able to recover it with this keyword argument. - `init`: initial value of the reduction. Will be forwarded to `mapreduce` for the task-local sequential parts of the calculation. In addition, `treduce` accepts **all keyword arguments that are supported by the selected scheduler**. They will simply be passed on to the corresponding `Scheduler` constructor. Example: ``` treduce(+, [1, 2, 3, 4, 5]; chunksize=2, scheduler=:static) ``` However, to avoid ambiguity, this is currently **only supported for `scheduler::Symbol`** (but not for `scheduler::Scheduler`). """ function treduce end """ tforeach(f, A::AbstractArray...; [scheduler::Union{Scheduler, Symbol} = :dynamic]) :: Nothing A multithreaded function like `Base.foreach`. Apply `f` to each element of `A` on multiple parallel tasks, and return `nothing`. I.e. 
it is the parallel equivalent of ``` for x in A f(x) end ``` ## Example: ``` using OhMyThreads: tforeach tforeach(1:10) do i println(i^2) end ``` ## Keyword arguments: - `scheduler::Union{Scheduler, Symbol}` (default `:dynamic`): determines how the computation is divided into parallel tasks and how these are scheduled. See [`Scheduler`](@ref) for more information on the available schedulers. In addition, `tforeach` accepts **all keyword arguments that are supported by the selected scheduler**. They will simply be passed on to the corresponding `Scheduler` constructor. Example: ``` tforeach(1:10; chunksize=2, scheduler=:static) do i println(i^2) end ``` However, to avoid ambiguity, this is currently **only supported for `scheduler::Symbol`** (but not for `scheduler::Scheduler`). """ function tforeach end """ tmap(f, [OutputElementType], A::AbstractArray...; [scheduler::Union{Scheduler, Symbol} = :dynamic]) A multithreaded function like `Base.map`. Create a new container `similar` to `A` and fills it in parallel such that the `i`th element is equal to `f(A[i])`. The optional argument `OutputElementType` will select a specific element type for the returned container, and will generally incur fewer allocations than the version where `OutputElementType` is not specified. ## Example: ``` using OhMyThreads: tmap tmap(sin, 1:10) ``` ## Keyword arguments: - `scheduler::Union{Scheduler, Symbol}` (default `:dynamic`): determines how the computation is divided into parallel tasks and how these are scheduled. See [`Scheduler`](@ref) for more information on the available schedulers. In addition, `tmap` accepts **all keyword arguments that are supported by the selected scheduler**. They will simply be passed on to the corresponding `Scheduler` constructor. Example: ``` tmap(sin, 1:10; chunksize=2, scheduler=:static) ``` However, to avoid ambiguity, this is currently **only supported for `scheduler::Symbol`** (but not for `scheduler::Scheduler`). 
""" function tmap end """ tmap!(f, out, A::AbstractArray...; [scheduler::Union{Scheduler, Symbol} = :dynamic]) A multithreaded function like `Base.map!`. In parallel on multiple tasks, this function assigns each element of `out[i] = f(A[i])` for each index `i` of `A` and `out`. ## Keyword arguments: - `scheduler::Union{Scheduler, Symbol}` (default `:dynamic`): determines how the computation is divided into parallel tasks and how these are scheduled. See [`Scheduler`](@ref) for more information on the available schedulers. In addition, `tmap!` accepts **all keyword arguments that are supported by the selected scheduler**. They will simply be passed on to the corresponding `Scheduler` constructor. However, to avoid ambiguity, this is currently **only supported for `scheduler::Symbol`** (but not for `scheduler::Scheduler`). """ function tmap! end """ tcollect([OutputElementType], gen::Union{AbstractArray, Generator{<:AbstractArray}}; [scheduler::Union{Scheduler, Symbol} = :dynamic]) A multithreaded function like `Base.collect`. Essentially just calls `tmap` on the generator function and inputs. The optional argument `OutputElementType` will select a specific element type for the returned container, and will generally incur fewer allocations than the version where `OutputElementType` is not specified. ## Example: ``` using OhMyThreads: tcollect tcollect(sin(i) for i in 1:10) ``` ## Keyword arguments: - `scheduler::Union{Scheduler, Symbol}` (default `:dynamic`): determines how the computation is divided into parallel tasks and how these are scheduled. See [`Scheduler`](@ref) for more information on the available schedulers. In addition, `tcollect` accepts **all keyword arguments that are supported by the selected scheduler**. They will simply be passed on to the corresponding `Scheduler` constructor. 
Example: ``` tcollect(sin(i) for i in 1:10; chunksize=2, scheduler=:static) ``` However, to avoid ambiguity, this is currently **only supported for `scheduler::Symbol`** (but not for `scheduler::Scheduler`). """ function tcollect end
================================================ FILE: src/implementation.jl ================================================
module Implementation

import OhMyThreads: treduce, tmapreduce, treducemap, tforeach, tmap, tmap!, tcollect
using OhMyThreads: @spawn, @spawnat, WithTaskLocals, promise_task_local, ChannelLike,
                   allowing_boxed_captures
using OhMyThreads.Tools: nthtid
using OhMyThreads: Scheduler,
                   DynamicScheduler, StaticScheduler, GreedyScheduler,
                   SerialScheduler
using OhMyThreads.Schedulers: chunksplitter_mode, chunking_enabled,
                              nchunks, chunksize, chunksplit, minchunksize, has_chunksplit,
                              has_minchunksize,
                              chunkingargs_to_kwargs,
                              chunking_mode, ChunkingMode, NoChunking,
                              FixedSize, FixedCount, scheduler_from_symbol, NotGiven,
                              isgiven, threadpool as get_threadpool
using Base: @propagate_inbounds
using Base.Threads: nthreads, @threads
using BangBang: append!!
using ChunkSplitters: ChunkSplitters, index_chunks, Consecutive
using ChunkSplitters.Internals: AbstractChunks, IndexChunks

# Accepted types for the `scheduler` keyword; `NotGiven` marks "not provided by the caller".
const MaybeScheduler = Union{NotGiven, Scheduler, Symbol, Val}

include("macro_impl.jl")

# Split `arg` into index chunks according to the chunking settings of `sched`.
# Requires that chunking is enabled for `sched` (checked with `@assert`). The keyword
# arguments for `index_chunks` are derived by `chunkingargs_to_kwargs`, and the result is
# type-asserted to the concrete `IndexChunks` type so callers get an inferable return type.
@inline function _index_chunks(sched, arg)
    C = chunking_mode(sched)
    @assert chunking_enabled(sched)
    kwargs = chunkingargs_to_kwargs(sched, arg)
    return index_chunks(arg; kwargs...)::IndexChunks{typeof(arg), chunksplitter_mode(C)}
end

# Turn the user-facing `scheduler` argument (plus extra keyword arguments) into a scheduler
# object, which is the value of the `if` expression and hence the return value:
# - a `Scheduler` instance is used as is; combining it with keyword arguments is an error
#   (see `scheduler_and_kwargs_err`),
# - a `Symbol` is resolved via `scheduler_from_symbol`, forwarding the keyword arguments,
# - anything else (e.g. `NotGiven`) falls back to `DynamicScheduler(; kwargs...)`.
function _scheduler_from_userinput(scheduler::MaybeScheduler; kwargs...)
    if scheduler isa Scheduler
        isempty(kwargs) || scheduler_and_kwargs_err(; kwargs...)
        _scheduler = scheduler
    elseif scheduler isa Symbol
        _scheduler = scheduler_from_symbol(scheduler; kwargs...)
    else # default fallback
        _scheduler = DynamicScheduler(; kwargs...)
    end
end

function _check_chunks_incompatible_kwargs(; kwargs...)
ks = keys(kwargs) if :ntasks in ks || :nchunks in ks || :chunksize in ks || :split in ks error("You've provided `chunks` or `index_chunks` as input and, at the same time, " * "chunking related keyword arguments (e.g. `ntasks`, `chunksize`, or `split`). " * "This isn't supported. " * "Set the chunking options directly in the `chunks` or `index_chunks` call or drop the latter.") end if :chunking in ks for (k, v) in kwargs if k == :chunking && v == true error("You've provided `chunks` or `index_chunks` as input and, at the same time, " * "have set chunking=true. This isn't supported.") end end end return nothing end function has_multiple_chunks(scheduler, coll) C = chunking_mode(scheduler) if C == NoChunking || coll isa Union{AbstractChunks, ChunkSplitters.Internals.Enumerate} length(coll) > 1 elseif C == FixedCount if !has_minchunksize(scheduler) mcs = 1 else mcs = max(min(minchunksize(scheduler), length(coll)), 1) end min(length(coll) ÷ mcs, nchunks(scheduler)) > 1 elseif C == FixedSize length(coll) ÷ chunksize(scheduler) > 1 else throw(ArgumentError("Unknown chunking mode: $C.")) end end # we can inline this function because we use @noinline on the main function # it can save some time in cases where we do not hit the main function (e.g. when # fallback to mapreduce without any threading) @inline function tmapreduce(f, op, Arrs...; scheduler::MaybeScheduler = NotGiven(), outputtype::Type = Any, init = NotGiven(), kwargs...) mapreduce_kwargs = isgiven(init) ? (; init) : (;) _scheduler = _scheduler_from_userinput(scheduler; kwargs...) A = first(Arrs) if A isa AbstractChunks || A isa ChunkSplitters.Internals.Enumerate _check_chunks_incompatible_kwargs(; kwargs...) end if _scheduler isa SerialScheduler || !has_multiple_chunks(_scheduler, first(Arrs)) # empty input collection → align with Base.mapreduce behavior mapreduce(f, op, Arrs...; mapreduce_kwargs...) 
else @noinline _tmapreduce(f, op, Arrs, outputtype, _scheduler, mapreduce_kwargs) end end @noinline function scheduler_and_kwargs_err(; kwargs...) kwargstr = join(string.(keys(kwargs)), ", ") throw(ArgumentError("Providing an explicit scheduler as well as direct keyword arguments (e.g. $(kwargstr)) is currently not supported.")) end treducemap(op, f, A...; kwargs...) = tmapreduce(f, op, A...; kwargs...) # DynamicScheduler: AbstractArray/Generic function _tmapreduce(f, op, Arrs, ::Type{OutputType}, scheduler::DynamicScheduler, mapreduce_kwargs)::OutputType where {OutputType} threadpool = get_threadpool(scheduler) check_all_have_same_indices(Arrs) throw_if_boxed_captures(f, op) if chunking_enabled(scheduler) tasks = map(_index_chunks(scheduler, first(Arrs))) do inds args = map(A -> view(A, inds), Arrs) # Note, calling `promise_task_local` here is only safe because we're assuming that # Base.mapreduce isn't going to magically try to do multithreading on us... @spawn threadpool mapreduce(promise_task_local(f), promise_task_local(op), args...; $mapreduce_kwargs...) end mapreduce(fetch, promise_task_local(op), tasks) else tasks = map(eachindex(first(Arrs))) do i args = map(A -> @inbounds(A[i]), Arrs) @spawn threadpool promise_task_local(f)(args...) end mapreduce(fetch, promise_task_local(op), tasks; mapreduce_kwargs...) end end # DynamicScheduler: AbstractChunks function _tmapreduce(f, op, Arrs::Union{Tuple{AbstractChunks{T}}, Tuple{ChunkSplitters.Internals.Enumerate{T}}}, ::Type{OutputType}, scheduler::DynamicScheduler, mapreduce_kwargs)::OutputType where {OutputType, T} threadpool = get_threadpool(scheduler) throw_if_boxed_captures(f, op) tasks = map(only(Arrs)) do idcs @spawn threadpool promise_task_local(f)(idcs) end mapreduce(fetch, promise_task_local(op), tasks; mapreduce_kwargs...) 
end # StaticScheduler: AbstractArray/Generic function _tmapreduce(f, op, Arrs, ::Type{OutputType}, scheduler::StaticScheduler, mapreduce_kwargs)::OutputType where {OutputType} nt = nthreads() check_all_have_same_indices(Arrs) throw_if_boxed_captures(f, op) if chunking_enabled(scheduler) tasks = map(enumerate(_index_chunks(scheduler, first(Arrs)))) do (c, inds) tid = @inbounds nthtid(mod1(c, nt)) args = map(A -> view(A, inds), Arrs) # Note, calling `promise_task_local` here is only safe because we're assuming that # Base.mapreduce isn't going to magically try to do multithreading on us... @spawnat tid mapreduce(promise_task_local(f), promise_task_local(op), args...; mapreduce_kwargs...) end # Note, calling `promise_task_local` here is only safe because we're assuming that # Base.mapreduce isn't going to magically try to do multithreading on us... mapreduce(fetch, promise_task_local(op), tasks) else tasks = map(enumerate(eachindex(first(Arrs)))) do (c, i) tid = @inbounds nthtid(mod1(c, nt)) args = map(A -> @inbounds(A[i]), Arrs) @spawnat tid promise_task_local(f)(args...) end # Note, calling `promise_task_local` here is only safe because we're assuming that # Base.mapreduce isn't going to magically try to do multithreading on us... mapreduce(fetch, promise_task_local(op), tasks; mapreduce_kwargs...) end end # StaticScheduler: AbstractChunks function _tmapreduce(f, op, Arrs::Tuple{AbstractChunks{T}}, # we don't support multiple chunks for now ::Type{OutputType}, scheduler::StaticScheduler, mapreduce_kwargs)::OutputType where {OutputType, T} check_all_have_same_indices(Arrs) throw_if_boxed_captures(f, op) chnks = only(Arrs) nt = nthreads() tasks = map(enumerate(chnks)) do (c, idcs) tid = @inbounds nthtid(mod1(c, nt)) # Note, calling `promise_task_local` here is only safe because we're assuming that # Base.mapreduce isn't going to magically try to do multithreading on us... 
@spawnat tid promise_task_local(f)(idcs) end # Note, calling `promise_task_local` here is only safe because we're assuming that # Base.mapreduce isn't going to magically try to do multithreading on us... mapreduce(fetch, promise_task_local(op), tasks; mapreduce_kwargs...) end # NOTE: once v1.12 releases we should switch this to wait(t; throw=false) wait_nothrow(t) = Base._wait(t) """ empty_collection_error(task) Check if a task failed due to an empty collection error. """ function empty_collection_error end @static if VERSION < v"1.11.0-" function empty_collection_error(task) task.result isa MethodError && task.result.f == Base.mapreduce_empty end else function empty_collection_error(task) task.result isa ArgumentError && task.result.msg == "reducing over an empty collection is not allowed; consider supplying `init` to the reducer" end end # GreedyScheduler w/o chunking function _tmapreduce(f, op, Arrs, ::Type{OutputType}, scheduler::GreedyScheduler{NoChunking}, mapreduce_kwargs)::OutputType where {OutputType} ntasks_desired = scheduler.ntasks if Base.IteratorSize(first(Arrs)) isa Base.SizeUnknown ntasks = ntasks_desired ch_len = 0 else check_all_have_same_indices(Arrs) ntasks = min(length(first(Arrs)), ntasks_desired) ch_len = length(first(Arrs)) end throw_if_boxed_captures(f, op) # TODO: Use ChannelLike for iterators that support it. Dispatch on IndexLinear? ch = Channel{Tuple{eltype.(Arrs)...}}(ch_len; spawn = true) do ch for args in zip(Arrs...) put!(ch, args) end end tasks = map(1:ntasks) do _ # Note, calling `promise_task_local` here is only safe because we're assuming that # Base.mapreduce isn't going to magically try to do multithreading on us... @spawn mapreduce(promise_task_local(op), ch; mapreduce_kwargs...) do args promise_task_local(f)(args...) 
end end # Doing this because of https://github.com/JuliaFolds2/OhMyThreads.jl/issues/82 # The idea is that if the channel gets fully consumed before a task gets started up, # then if the user does not supply an `init` kwarg, we'll get an error. # Current way of dealing with this is just filtering out `mapreduce_empty` method # errors. This may not be the most stable way of dealing with things, e.g. if the # name of the function throwing the error changes this could break, so long term # we may want to try a different design. filtered_tasks = filter(tasks) do stabletask task = stabletask.t istaskdone(task) || wait_nothrow(task) if empty_collection_error(task) false else true end end # Note, calling `promise_task_local` here is only safe because we're assuming that # Base.mapreduce isn't going to magically try to do multithreading on us... mapreduce(fetch, promise_task_local(op), filtered_tasks; mapreduce_kwargs...) end # GreedyScheduler w/ chunking function _tmapreduce(f, op, Arrs, ::Type{OutputType}, scheduler::GreedyScheduler, mapreduce_kwargs)::OutputType where {OutputType} if Base.IteratorSize(first(Arrs)) isa Base.SizeUnknown throw(ArgumentError("SizeUnkown iterators in combination with a greedy scheduler and chunking are currently not supported.")) end check_all_have_same_indices(Arrs) throw_if_boxed_captures(f, op) chnks = _index_chunks(scheduler, first(Arrs)) ntasks_desired = scheduler.ntasks ntasks = min(length(chnks), ntasks_desired) # ChunkSplitters.IndexChunks support everything needed for ChannelLike ch = ChannelLike(chnks) tasks = map(1:ntasks) do _ # Note, calling `promise_task_local` here is only safe because we're assuming that # Base.mapreduce isn't going to magically try to do multithreading on us... @spawn mapreduce(promise_task_local(op), ch; mapreduce_kwargs...) do inds args = map(A -> view(A, inds), Arrs) mapreduce(promise_task_local(f), promise_task_local(op), args...) 
end end # Doing this because of https://github.com/JuliaFolds2/OhMyThreads.jl/issues/82 # The idea is that if the channel gets fully consumed before a task gets started up, # then if the user does not supply an `init` kwarg, we'll get an error. # Current way of dealing with this is just filtering out `mapreduce_empty` method # errors. This may not be the most stable way of dealing with things, e.g. if the # name of the function throwing the error changes this could break, so long term # we may want to try a different design. filtered_tasks = filter(tasks) do stabletask task = stabletask.t istaskdone(task) || wait_nothrow(task) if empty_collection_error(task) false else true end end # Note, calling `promise_task_local` here is only safe because we're assuming that # Base.mapreduce isn't going to magically try to do multithreading on us... mapreduce(fetch, promise_task_local(op), filtered_tasks; mapreduce_kwargs...) end function check_all_have_same_indices(Arrs) let A = first(Arrs), Arrs = Arrs[2:end] if !all(B -> eachindex(A) == eachindex(B), Arrs) error("The indices of the input arrays must match the indices of the output array.") end end end struct BoxedVariableError <: Exception vars::Vector{Symbol} end function Base.showerror(io::IO, bve::BoxedVariableError) boxed_fields = join(bve.vars, ", ") suffix = length(bve.vars) > 1 ? 
"s" : "" print(io, "Attempted to capture and modify outer local variable$(suffix): ") printstyled(io, boxed_fields; color=:red) print(io, "\n\nSee https://juliafolds2.github.io/OhMyThreads.jl/stable/literate/boxing/boxing/ for a fuller explanation.") if isdefined(Base.Experimental, :show_error_hints) Base.Experimental.show_error_hints(io, bve) end end function throw_if_boxed_captures(f) if allowing_boxed_captures[] return nothing end T = typeof(f) if any(FT -> FT <: Core.Box, fieldtypes(T)) boxed_fields = [fieldname(T, i) for i in 1:fieldcount(T) if fieldtype(T,i) <: Core.Box] throw(BoxedVariableError(boxed_fields)) end for i ∈ 1:fieldcount(T) # recurse into nested captured functions. if fieldtype(T, i) <: Function f_inner = getfield(f, i) if f !== f_inner # don't recurse into self! throw_if_boxed_captures(getfield(f, i)) end end end end function throw_if_boxed_captures(f, fs...) throw_if_boxed_captures(f) throw_if_boxed_captures(fs...) end #------------------------------------------------------------- function treduce(op, A...; kwargs...) tmapreduce(identity, op, A...; kwargs...) end #------------------------------------------------------------- function tforeach(f, A...; kwargs...)::Nothing tmapreduce(f, (l, r) -> l, A...; kwargs..., init = nothing, outputtype = Nothing) end #------------------------------------------------------------- function maybe_rewrap(g::G, f::F) where {G, F} g(f) end """ maybe_rewrap(g, f) takes a closure `g(f)` and if `f` is a `WithTaskLocals`, we're going to unwrap `f` and delegate its `TaskLocalValues` to `g`. This should always be equivalent to just calling `g(f)`. """ function maybe_rewrap(g::G, f::WithTaskLocals{F}) where {G, F} (; inner_func, tasklocals) = f WithTaskLocals(vals -> g(inner_func(vals)), tasklocals) end #------------------------------------------------------------ function tmap(f, ::Type{T}, A::AbstractArray, _Arrs::AbstractArray...; kwargs...) where {T} Arrs = (A, _Arrs...) tmap!(f, similar(A, T), Arrs...; kwargs...) 
end

# `tmap` without an explicit output element type.
#
# Builds the scheduler from `scheduler`/`kwargs`, rejects configurations that are not
# supported here (greedy scheduling, non-consecutive chunk splitting, chunking keyword
# arguments combined with pre-chunked input) and then either falls back to `Base.map`
# (serial scheduler or empty input) or dispatches to the matching `_tmap` method.
function tmap(f,
        A::Union{AbstractArray, AbstractChunks, ChunkSplitters.Internals.Enumerate},
        _Arrs::AbstractArray...;
        scheduler::MaybeScheduler = NotGiven(),
        kwargs...)
    _scheduler = _scheduler_from_userinput(scheduler; kwargs...)

    if _scheduler isa GreedyScheduler
        error("Greedy scheduler isn't supported with `tmap` unless you provide an `OutputElementType` argument, since the greedy schedule requires a commutative reducing operator.")
    end
    if chunking_enabled(_scheduler) && has_chunksplit(_scheduler) &&
       chunksplit(_scheduler) != Consecutive()
        error("Only `split == Consecutive()` is supported because the parallel operation isn't commutative. (Scheduler: $_scheduler)")
    end
    if (A isa AbstractChunks || A isa ChunkSplitters.Internals.Enumerate)
        _check_chunks_incompatible_kwargs(; kwargs...)
        # The input is already chunked, so swap in a non-chunking scheduler of the
        # same kind.
        if chunking_enabled(_scheduler)
            if _scheduler isa DynamicScheduler
                _scheduler = DynamicScheduler(;
                    threadpool = get_threadpool(_scheduler),
                    chunking = false)
            elseif _scheduler isa StaticScheduler
                _scheduler = StaticScheduler(; chunking = false)
            else
                error("Can't disable chunking for this scheduler?! Shouldn't be reached.",
                    _scheduler)
            end
        end
    end

    Arrs = (A, _Arrs...)
    if _scheduler isa SerialScheduler || isempty(A)
        # empty input collection → align with Base.map behavior
        # `kwargs` only configure the scheduler (already consumed above). `Base.map`
        # takes no keyword arguments, so forwarding them would throw as soon as any
        # scheduling option (e.g. `ntasks`) is combined with an empty or serial input.
        map(f, Arrs...)
    else
        check_all_have_same_indices(Arrs)
        @noinline _tmap(_scheduler, f, A, _Arrs...)
    end
end

# w/o chunking (DynamicScheduler{NoChunking}): AbstractArray
function _tmap(scheduler::DynamicScheduler{NoChunking},
        f,
        A::AbstractArray,
        _Arrs::AbstractArray...;)
    threadpool = get_threadpool(scheduler)
    Arrs = (A, _Arrs...)
    throw_if_boxed_captures(f)
    tasks = map(eachindex(A)) do i
        @spawn threadpool begin
            args = map(A -> A[i], Arrs)
            promise_task_local(f)(args...)
        end
    end
    v = map(fetch, tasks)
    reshape(v, size(A)...)
end

# w/o chunking (DynamicScheduler{NoChunking}): AbstractChunks
function _tmap(scheduler::DynamicScheduler{NoChunking},
        f,
        A::Union{AbstractChunks, ChunkSplitters.Internals.Enumerate},
        _Arrs::AbstractArray...)
    threadpool = get_threadpool(scheduler)
    throw_if_boxed_captures(f)
    tasks = map(A) do idcs
        @spawn threadpool promise_task_local(f)(idcs)
    end
    map(fetch, tasks)
end

# w/o chunking (StaticScheduler{NoChunking}): AbstractChunks
function _tmap(scheduler::StaticScheduler{NoChunking},
        f,
        A::AbstractChunks,
        _Arrs::AbstractArray...)
    nt = nthreads()
    throw_if_boxed_captures(f)
    tasks = map(enumerate(A)) do (c, idcs)
        tid = @inbounds nthtid(mod1(c, nt))
        @spawnat tid promise_task_local(f)(idcs)
    end
    map(fetch, tasks)
end

# w/o chunking (StaticScheduler{NoChunking}): AbstractArray
#
# Spawns one task per element, assigning the tasks round-robin via
# `nthtid(mod1(c, nt))`, and returns the fetched results reshaped to `size(A)`.
function _tmap(scheduler::StaticScheduler{NoChunking},
        f,
        A::AbstractArray,
        _Arrs::AbstractArray...;)
    Arrs = (A, _Arrs...)
    nt = nthreads()
    throw_if_boxed_captures(f)
    # Enumerate the *indices* of `A`, not its elements: `c` is the running counter
    # used for the thread assignment, `i` is the index applied to every input array.
    # (`enumerate(A)` would bind `i` to an element value and `A[i]` would then index
    # with that value, which is only correct when values and indices coincide.)
    tasks = map(enumerate(eachindex(A))) do (c, i)
        tid = @inbounds nthtid(mod1(c, nt))
        @spawnat tid begin
            args = map(A -> A[i], Arrs)
            promise_task_local(f)(args...)
        end
    end
    v = map(fetch, tasks)
    reshape(v, size(A)...)
end

# w/ chunking
function _tmap(scheduler::Scheduler,
        f,
        A::AbstractArray,
        _Arrs::AbstractArray...)
    Arrs = (A, _Arrs...)
    idcs = collect(_index_chunks(scheduler, A))
    reduction_f = append!!
    mapping_f = maybe_rewrap(f) do f
        (inds) -> begin
            args = map(A -> @view(A[inds]), Arrs)
            map(f, args...)
        end
    end
    v = tmapreduce(mapping_f, reduction_f, idcs; scheduler)
    reshape(v, size(A)...)
end

@propagate_inbounds function tmap!(f,
        out,
        A::AbstractArray,
        _Arrs::AbstractArray...;
        scheduler::MaybeScheduler = NotGiven(),
        kwargs...)
    _scheduler = _scheduler_from_userinput(scheduler; kwargs...)

    Arrs = (A, _Arrs...)
    if _scheduler isa SerialScheduler
        map!(f, out, Arrs...)
else @boundscheck check_all_have_same_indices((out, Arrs...)) throw_if_boxed_captures(f) mapping_f = maybe_rewrap(f) do f function mapping_function(i) args = map(A -> @inbounds(A[i]), Arrs) res = f(args...) out[i] = res end end @noinline tforeach(mapping_f, eachindex(out); scheduler = _scheduler) out end end #------------------------------------------------------------- function tcollect(::Type{T}, gen::Base.Generator{<:AbstractArray}; kwargs...) where {T} tmap(gen.f, T, gen.iter; kwargs...) end tcollect(gen::Base.Generator{<:AbstractArray}; kwargs...) = tmap(gen.f, gen.iter; kwargs...) tcollect(::Type{T}, A; kwargs...) where {T} = tmap(identity, T, A; kwargs...) tcollect(A; kwargs...) = tmap(identity, A; kwargs...) end # module Implementation ================================================ FILE: src/macro_impl.jl ================================================ using OhMyThreads.Tools: OnlyOneRegion, try_enter! using OhMyThreads.Tools: SimpleBarrier using OhMyThreads: OhMyThreads function _is_special_macro_expr(arg; lookfor = ("@set", "@local", "@only_one", "@one_by_one", "@barrier")) if !(arg isa Expr) return false end lookfor_symbols = Symbol.(lookfor) if arg.head == :macrocall if arg.args[1] isa Symbol && arg.args[1] in lookfor_symbols # support, e.g., @set return true elseif arg.args[1] isa Expr && arg.args[1].head == Symbol(".") # support, e.g., OhMyThreads.@set x = arg.args[1] if x.args[1] == Symbol("OhMyThreads") && x.args[2] isa QuoteNode && x.args[2].value in lookfor_symbols return true end end end return false end function tasks_macro(forex; __module__) if forex.head != :for throw(ErrorException("Expected a for loop after `@tasks`.")) else if forex.args[1].head != :(=) # this'll catch cases like # @tasks for _ ∈ 1:10, _ ∈ 1:10 # body # end throw(ErrorException("`@tasks` currently only supports a single threaded loop, got $(forex.args[1])")) end it = forex.args[1] itvar = it.args[1] itrng = it.args[2] forbody = forex.args[2] end settings = Settings() # 
Escape everything in the loop body that is not used in conjuction with one of our # "macros", e.g. @set or @local. Code inside of these macro blocks will be escaped by # the respective "macro" handling functions below. for i in findall(!_is_special_macro_expr, forbody.args) forbody.args[i] = esc(forbody.args[i]) end locals_before, locals_names = _maybe_handle_atlocal_block!(forbody.args) tls_names = isnothing(locals_before) ? [] : map(x -> x.args[1], locals_before) _maybe_handle_atset_block!(settings, forbody.args) setup_onlyone_blocks = _maybe_handle_atonlyone_blocks!(forbody.args) setup_onebyone_blocks = _maybe_handle_atonebyone_blocks!(forbody.args) if isdefined(__module__, Symbol("@barrier")) if __module__.var"@barrier" != OhMyThreads.Experimental.var"@barrier" error("There seems to be a macro `@barrier` around which isn't `OhMyThreads.Experimental.@barrier`. This isn't supported.") end setup_barriers = _maybe_handle_atbarriers!(forbody.args, settings) else setup_barriers = nothing end itrng = esc(itrng) itvar = esc(itvar) make_mapping_function = if isempty(tls_names) :(local function mapping_function($itvar,) $(forbody) end) else :(local mapping_function = WithTaskLocals(($(tls_names...),)) do ($(locals_names...),) function mapping_function_local($itvar,) $(forbody) end end) end q = if isgiven(settings.reducer) quote $setup_onlyone_blocks $setup_onebyone_blocks $setup_barriers $make_mapping_function tmapreduce(mapping_function, $(settings.reducer), $(itrng)) end elseif isgiven(settings.collect) maybe_warn_useless_init(settings) quote $setup_onlyone_blocks $setup_onebyone_blocks $setup_barriers $make_mapping_function tmap(mapping_function, $(itrng)) end else maybe_warn_useless_init(settings) quote $setup_onlyone_blocks $setup_onebyone_blocks $setup_barriers $make_mapping_function tforeach(mapping_function, $(itrng)) end end # insert keyword arguments into the function call kwexpr = :($(Expr(:parameters))) if isgiven(settings.scheduler) push!(kwexpr.args, 
Expr(:kw, :scheduler, settings.scheduler)) end if isgiven(settings.init) push!(kwexpr.args, Expr(:kw, :init, settings.init)) end for (k, v) in settings.kwargs push!(kwexpr.args, Expr(:kw, k, v)) end insert!(q.args[10].args, 2, kwexpr) # wrap everything in a let ... end block # and, potentially, define the `TaskLocalValue`s. result = :(let end) push!(result.args[2].args, q) if !isnothing(locals_before) for x in locals_before push!(result.args[1].args, x) end end result end function maybe_warn_useless_init(settings) isgiven(settings.init) && @warn("The @set init = ... settings won't have any effect because no reduction is performed.") end Base.@kwdef mutable struct Settings scheduler::Union{Expr, QuoteNode, NotGiven} = NotGiven() reducer::Union{Expr, Symbol, NotGiven} = NotGiven() collect::Union{Bool, NotGiven} = NotGiven() init::Union{Expr, Symbol, NotGiven} = NotGiven() kwargs::Dict{Symbol, Any} = Dict{Symbol, Any}() end function _maybe_handle_atlocal_block!(args) locals_before = nothing local_inner = nothing tlsidx = findfirst(args) do arg _is_special_macro_expr(arg; lookfor = (Symbol("@local"),)) end if !isnothing(tlsidx) locals_before, local_inner = _unfold_atlocal_block(args[tlsidx].args[3]) deleteat!(args, tlsidx) end return locals_before, local_inner end function _unfold_atlocal_block(ex) locals_before = Expr[] locals_names = Expr[] if ex.head == :(=) localb, localn = _atlocal_assign_to_exprs(ex) push!(locals_before, localb) push!(locals_names, localn) elseif ex.head == :block tlsexprs = filter(x -> x isa Expr, ex.args) # skip LineNumberNode for x in tlsexprs localb, localn = _atlocal_assign_to_exprs(x) push!(locals_before, localb) push!(locals_names, localn) end else throw(ErrorException("Wrong usage of @local. You must either provide a typed assignment or multiple typed assignments in a `begin ... 
end` block.")) end return locals_before, locals_names end #= If the TLS doesn't have a declared return type, we're going to use `CC.return_type` to get it automatically. This would normally be non-kosher, but it's okay here for three reasons: 1) The task local value *only* exists within the function being called, meaning that the worldage is frozen for the full lifetime of the TLV, so and `eval` can't change the outcome or cause incorrect inference. 2) We do not allow users to *write* to the task local value, they can only retrieve its value, so there's no potential problems from the type being maximally narrow and then them trying to write a value of another type to it 3) the task local value is not user-observable. we never let the user inspect its type, unless they themselves are using `code____` tools to inspect the generated code, hence if inference changes and gives a more or less precise type, there's no observable semantic changes, just performance increases or decreases. =# function _atlocal_assign_to_exprs(ex) left_ex = ex.args[1] tls_def = esc(ex.args[2]) @gensym tl_storage if Base.isexpr(left_ex, :(::)) tls_sym = esc(left_ex.args[1]) tls_type = esc(left_ex.args[2]) local_before = :($(tl_storage) = TaskLocalValue{$tls_type}(() -> $(tls_def))) else tls_sym = esc(left_ex) local_before = :($(tl_storage) = let f = () -> $(tls_def) TaskLocalValue{Core.Compiler.return_type(f, Tuple{})}(f) end) end local_name = :($(tls_sym)) return local_before, local_name end function _maybe_handle_atset_block!(settings, args) idcs = findall(args) do arg _is_special_macro_expr(arg; lookfor = (Symbol("@set"),)) end isnothing(idcs) && return # no @set block found for i in idcs ex = args[i].args[3] if ex.head == :(=) _handle_atset_single_assign!(settings, ex) elseif ex.head == :block exprs = filter(x -> x isa Expr, ex.args) # skip LineNumberNode _handle_atset_single_assign!.(Ref(settings), exprs) else throw(ErrorException("Wrong usage of @set. 
You must either provide an assignment or multiple assignments in a `begin ... end` block.")) end end deleteat!(args, idcs) # check incompatible settings if isgiven(settings.collect) && settings.collect && isgiven(settings.reducer) throw(ArgumentError("Specifying both collect and reducer isn't supported.")) end end function _handle_atset_single_assign!(settings, ex) if ex.head != :(=) throw(ErrorException("Wrong usage of @set. Expected assignment, e.g. `scheduler = StaticScheduler()`.")) end sym = ex.args[1] def = ex.args[2] if hasfield(Settings, sym) if sym == :collect && !(def isa Bool) throw(ArgumentError("Setting collect can only be true or false.")) #TODO support specifying the OutputElementType end def = def isa Bool ? def : esc(def) setfield!(settings, sym, def) else # push!(settings.kwargs, sym => esc(def)) settings.kwargs[sym] = esc(def) end end function _maybe_handle_atonlyone_blocks!(args) idcs = findall(args) do arg _is_special_macro_expr(arg; lookfor = (Symbol("@only_one"),)) end isnothing(idcs) && return # no @only_one blocks setup_onlyone_blocks = quote end for i in idcs body = args[i].args[3] @gensym onlyone init_onlyone_ex = :($(onlyone) = Tools.OnlyOneRegion()) push!(setup_onlyone_blocks.args, init_onlyone_ex) args[i] = quote Tools.try_enter!($(onlyone)) do $(esc(body)) end end end return setup_onlyone_blocks end function _maybe_handle_atonebyone_blocks!(args) idcs = findall(args) do arg _is_special_macro_expr(arg; lookfor = (Symbol("@one_by_one"),)) end isnothing(idcs) && return # no @one_by_one blocks setup_onebyone_blocks = quote end for i in idcs body = args[i].args[3] @gensym onebyone init_lock_ex = :($(onebyone) = Base.ReentrantLock()) push!(setup_onebyone_blocks.args, init_lock_ex) args[i] = quote lock($(onebyone)) do $(esc(body)) end end end return setup_onebyone_blocks end function _maybe_handle_atbarriers!(args, settings) idcs = findall(args) do arg _is_special_macro_expr(arg; lookfor = (Symbol("@barrier"),)) end isnothing(idcs) && return 
# no @barrier found setup_barriers = quote end for i in idcs !haskey(settings.kwargs, :ntasks) && throw(ErrorException("When using `@barrier`, the number of tasks must be " * "specified explicitly, e.g. via `@set ntasks=...`. ")) ntasks = settings.kwargs[:ntasks] @gensym barrier push!(setup_barriers.args, :($(barrier) = $(SimpleBarrier)($ntasks))) args[i] = :($(esc(:wait))($(barrier))) end return setup_barriers end ================================================ FILE: src/macros.jl ================================================ """ @tasks for ... end A macro to parallelize a `for` loop by spawning a set of tasks that can be run in parallel. The policy of how many tasks to spawn and how to distribute the iteration space among the tasks (and more) can be configured via `@set` statements in the loop body. Supports reductions (`@set reducer=`) and collecting the results (`@set collect=true`). Under the hood, the `for` loop is translated into corresponding parallel [`tforeach`](@ref), [`tmapreduce`](@ref), or [`tmap`](@ref) calls. See also: [`@set`](@ref), [`@local`](@ref) ## Examples ```julia using OhMyThreads: @tasks ``` ```julia @tasks for i in 1:3 println(i) end ``` ```julia @tasks for x in rand(10) @set reducer=+ sin(x) end ``` ```julia @tasks for i in 1:5 @set collect=true i^2 end ``` ```julia @tasks for i in 1:100 @set ntasks=4*nthreads() # non-uniform work... end ``` ```julia @tasks for i in 1:5 @set scheduler=:static println("i=", i, " → ", threadid()) end ``` ```julia @tasks for i in 1:100 @set begin scheduler=:static chunksize=10 end println("i=", i, " → ", threadid()) end ``` """ macro tasks(args...) Implementation.tasks_macro(args...; __module__) end """ @set name = value This can be used inside a `@tasks for ... end` block to specify settings for the parallel execution of the loop. Multiple settings are supported, either as separate `@set` statements or via `@set begin ... end`. ## Settings * `reducer` (e.g. 
`reducer=+`): Indicates that a reduction should be performed with the provided binary function. See [`tmapreduce`](@ref) for more information. * `collect` (e.g. `collect=true`): Indicates that results should be collected (similar to `map`). All other settings will be passed on to the underlying parallel functions (e.g. [tmapreduce](@ref)) as keyword arguments. Hence, you may provide whatever these functions accept as keyword arguments. Among others, this includes * `scheduler` (e.g. `scheduler=:static`): Can be either a [`Scheduler`](@ref) or a `Symbol` (e.g. `:dynamic`, `:static`, `:serial`, or `:greedy`). * `init` (e.g. `init=0.0`): Initial value to be used in a reduction (requires `reducer=...`). Settings like `ntasks`, `chunksize`, and `split` etc. can be used to tune the scheduling policy (if the selected scheduler supports it). Note that the assignment is hoisted above the loop body which means that the scope is *not* the scope of the loop (even though it looks like it) but rather the scope *surrounding* the loop body. (`@macroexpand` is a useful tool to inspect the generated code of the `@tasks` block.) """ macro set(args...) error("The @set macro may only be used inside of a @tasks block.") end @eval begin """ @local name = value @local name::T = value Can be used inside a `@tasks for ... end` block to specify [task-local values](@ref TLS) (TLV) via explicitly typed assignments. These values will be allocated once per task (rather than once per iteration) and can be re-used between different task-local iterations. There can only be a single `@local` block in a `@tasks for ... end` block. To specify multiple TLVs, use `@local begin ... end`. Compared to regular assignments, there are some limitations though, e.g. TLVs can't reference each other. 
## Examples ```julia using OhMyThreads: @tasks using OhMyThreads.Tools: taskid @tasks for i in 1:10 @set begin scheduler=:dynamic ntasks=2 end @local x = zeros(3) # TLV x .+= 1 println(taskid(), " -> ", x) end ``` ```julia @tasks for i in 1:10 @local begin x = rand(Int, 3) M = rand(3, 3) end # ... end ``` Task local variables created by `@local` are by default constrained to their inferred type, but if you need to, you can specify a different type during declaration: ```julia @tasks for i in 1:10 @local x::Vector{Float64} = some_hard_to_infer_setup_function() # ... end ``` The right hand side of the assignment is hoisted outside of the loop body and captured as a closure used to initialize the task local value. This means that the scope of the closure is *not* the scope of the loop (even though it looks like it) but rather the scope *surrounding* the loop body. (`@macroexpand` is a useful tool to inspect the generated code of the `@tasks` block.) """ macro $(Symbol("local"))(args...) error("The @local macro may only be used inside of a @tasks block.") end end """ @only_one begin ... end This can be used inside a `@tasks for ... end` block to mark a region of code to be executed by only one of the parallel tasks (all other tasks skip over this region). ## Example ```julia using OhMyThreads: @tasks @tasks for i in 1:10 @set ntasks = 10 println(i, ": before") @only_one begin println(i, ": only printed by a single task") sleep(1) end println(i, ": after") end ``` """ macro only_one(args...) error("The @only_one macro may only be used inside of a @tasks block.") end """ @one_by_one begin ... end This can be used inside a `@tasks for ... end` block to mark a region of code to be executed by one parallel task at a time (i.e. exclusive access). The order may be arbitrary and non-deterministic. 
## Example ```julia using OhMyThreads: @tasks @tasks for i in 1:10 @set ntasks = 10 println(i, ": before") @one_by_one begin println(i, ": one task at a time") sleep(0.5) end println(i, ": after") end ``` """ macro one_by_one(args...) error("The @one_by_one macro may only be used inside of a @tasks block.") end const allowing_boxed_captures = ScopedValue(false) """ @allow_boxed_captures expr By default, OhMyThreads.jl will detect and error on multithreaded code which references local variables which are 'boxed' -- something that happens if the variable could be re-bound in multiple scopes. This process can cause very subtle bugs in multithreaded code by creating silent race conditions, e.g. ```julia let function wrong() tmap(1:10) do i A = i # define A for the first time (lexically) sleep(rand()/10) A # user is trying to reference local A only end end @show wrong() A = 1 # boxed! this hoists "A" to the same variable as in `wrong` but presumably the user wanted a new one end ``` In this example, you might expect to get `[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]`, but you would actually observe incorrect results because `A` is 'boxed'. The fix for this would be to write something like ```julia let function right() tmap(1:10) do i local A = i sleep(rand()/10) A end end @show right() A = 1 end ``` However, if you are really sure you want to bypass OhMyThreads's error mechanism, you can use `@allow_boxed_captures` to wrap code you believe is okay, e.g. ```julia-repl julia> let A = 1 @allow_boxed_captures tmap(1:10) do i A = i sleep(rand()/10) A # race condition! end end 10-element Vector{Int64}: 4 2 7 2 2 8 6 8 7 2 ``` This is a dynamically scoped construct, so this effect will apply to *all* nested code inside of `expr`. See also `@disallow_boxed_captures` """ macro allow_boxed_captures(ex) quote @with allowing_boxed_captures => true $(esc(ex)) end end """ @disallow_boxed_captures expr Disable the effect of `@allow_boxed_captures` for any code in `expr`.
This is a dynamically scoped construct, so this effect will apply to *all* nested code inside of `expr`. See also `@allow_boxed_captures` """ macro disallow_boxed_captures(ex) quote @with allowing_boxed_captures => false $(esc(ex)) end end """ @localize args... expr Writing ``` @localize x y z expr ``` is equivalent to writing ``` let x=x, y=y, z=z expr end ``` This is useful for avoiding the boxing of captured variables when working with closures. See https://juliafolds2.github.io/OhMyThreads.jl/stable/literate/boxing/boxing/ for more information about boxed variables. """ macro localize(args...) syms = args[1:end-1] ex = args[end] letargs = map(syms) do sym if !(sym isa Symbol) throw(ArgumentError("All but the final argument to `@localize` must be symbols! Got $sym")) end :($sym = $sym) end esc(:(let $(letargs...) $ex end)) end ================================================ FILE: src/schedulers.jl ================================================ module Schedulers using Base.Threads: nthreads using ChunkSplitters: Split, Consecutive, RoundRobin, ChunkSplitters # Used to indicate that a keyword argument has not been set by the user. # We don't use Nothing because `nothing` may sometimes be a valid user input (e.g.
# for init)
struct NotGiven end

# `isgiven(x)` is `false` only for the `NotGiven()` sentinel and `true` for any other value.
isgiven(::NotGiven) = false
isgiven(::T) where {T} = true

const MaybeInteger = Union{Integer, NotGiven}

# Placeholder split type stored in `ChunkingArgs` when chunking is disabled.
struct NoSplit <: Split end

# Normalize the user-facing `split` option: `Split` objects pass through unchanged,
# symbols are mapped to the corresponding ChunkSplitters split object.
# Throws an `ArgumentError` for any other symbol.
_parse_split(split::Split) = split
function _parse_split(split::Symbol)
    split in (:consecutive, :batch) && return Consecutive()
    split in (:roundrobin, :scatter) && return RoundRobin()
    # Name the offending value and the accepted ones so the error is actionable.
    throw(ArgumentError("You've provided an unsupported value for `split`: $(repr(split)). " *
                        "Supported symbols are :consecutive, :batch, :roundrobin and :scatter."))
end

# Lowercased type name of a split as a `Symbol` (used when printing schedulers).
_splitid(x::Type{<:Split}) = nameof(x) |> string |> lowercase |> Symbol
_splitid(x::Split) = _splitid(typeof(x))

"""
Supertype for all available schedulers:

* [`DynamicScheduler`](@ref): default dynamic scheduler
* [`StaticScheduler`](@ref): low-overhead static scheduler
* [`GreedyScheduler`](@ref): greedy load-balancing scheduler
* [`SerialScheduler`](@ref): serial (non-parallel) execution
"""
abstract type Scheduler end

#! A subtype of Scheduler (let's call it `S`) **must** implement:
#   - `from_symbol(::Val{:symbol})` returning exactly `S` for the given symbol.
#     (e.g. `from_symbol(::Val{:dynamic}) = DynamicScheduler`)
#  To enable chunking, S **must** implement:
#   - `chunking_args(::S)::ChunkingArgs` returning the chunking arguments of the scheduler.
#     It usually is a field of the scheduler, and use the constructor
#     `ChunkingArgs` to create it (see below).
#  And can optionally implement:
#   - `default_nchunks(::Type{S})` returning the default number of chunks for the scheduler.
#     if chunking is enabled. Default is `Threads.nthreads(:default)`.

# Fallback for symbols no scheduler has registered; the more specific
# `from_symbol(::Val{:name})` methods take precedence over it.
from_symbol(::Val{S}) where {S} = throw(ArgumentError("unknown scheduler symbol: $(repr(S))"))

# Construct a scheduler from its symbol (e.g. `:dynamic`), forwarding all keyword
# arguments to the scheduler's keyword constructor.
scheduler_from_symbol(s::Symbol; kwargs...) = scheduler_from_symbol(Val(s); kwargs...)
function scheduler_from_symbol(v::Val; kwargs...)
    sched = from_symbol(v)
    return sched(; kwargs...)
end

"""
    ChunkingMode

A trait type to indicate the chunking mode of a scheduler.
The following subtypes are available:

* `NoChunking`: no chunking is used
* `FixedCount`: the number of chunks is fixed
* `FixedSize`: the size of each chunk is fixed
"""
abstract type ChunkingMode end
struct NoChunking <: ChunkingMode end
struct FixedCount <: ChunkingMode end
struct FixedSize <: ChunkingMode end

# Map the chunking-mode traits of this module onto the ChunkSplitters-internal ones.
# There is deliberately no method for `NoChunking`.
chunksplitter_mode(::Type{FixedCount}) = ChunkSplitters.Internals.FixedCount
chunksplitter_mode(::Type{FixedSize}) = ChunkSplitters.Internals.FixedSize

"""
    ChunkingArgs{C, S <: Split}(n::Union{Int, Nothing}, size::Union{Int, Nothing}, minsize::Union{Int, Nothing}, split::S)
    ChunkingArgs(Sched::Type{<:Scheduler}; n = nothing, size = nothing, minsize = nothing, split::Union{Symbol, Split}, chunking)

Stores all the information needed for chunking.

The type parameter `C` is the chunking mode (`NoChunking`, `FixedSize`, or `FixedCount`).

The `chunking` keyword argument is a boolean and if `false`, all other arguments are ignored
and `C = NoChunking`. Otherwise, if neither `n` nor `size` is given, `n` defaults to
`default_nchunks(Sched)`; giving both throws an `ArgumentError`.

Once the object is created, use the `has_fieldname(object)` function (e.g. `has_size(object)`)
to know if the field is effectively used.
"""
struct ChunkingArgs{C, S <: Split}
    n::Union{Int, Nothing}
    size::Union{Int, Nothing}
    minsize::Union{Int, Nothing}
    split::S
end

function ChunkingArgs(::Type{NoChunking})
    # All fields are `nothing` and the split is the `NoSplit` placeholder.
    ChunkingArgs{NoChunking, NoSplit}(nothing, nothing, nothing, NoSplit())
end
function ChunkingArgs(
        Sched::Type{<:Scheduler};
        n = nothing,
        size = nothing,
        minsize = nothing,
        split::Union{Symbol, Split},
        chunking
)
    # With chunking disabled every other argument is ignored (not even validated).
    chunking || return ChunkingArgs(NoChunking)

    if isnothing(n) && isnothing(size)
        # Neither given: fall back to the scheduler-specific default chunk count.
        n = default_nchunks(Sched)
    elseif !isnothing(n) && !isnothing(size)
        throw(ArgumentError("nchunks and chunksize are mutually exclusive"))
    end
    # Exactly one of `n` / `size` is set at this point; it determines the mode.
    chunking_mode = isnothing(n) ? FixedSize : FixedCount
    split = _parse_split(split)
    return ChunkingArgs{chunking_mode, typeof(split)}(n, size, minsize, split)
end

# The chunking mode is carried in the first type parameter.
chunking_mode(::ChunkingArgs{C}) where {C} = C
# `has_*` predicates: whether the corresponding setting is effectively in use.
has_n(ca::ChunkingArgs) = !isnothing(ca.n)
has_size(ca::ChunkingArgs) = !isnothing(ca.size)
has_split(::ChunkingArgs{C, S}) where {C, S} = S !== NoSplit
has_minsize(ca::ChunkingArgs) = !isnothing(ca.minsize)
chunking_enabled(ca::ChunkingArgs) = chunking_mode(ca) != NoChunking

# Convert the chunking arguments into a named tuple `(; n, size, minsize, split)`.
# A given `minsize` is capped at `length(arg)`.
function chunkingargs_to_kwargs(ca::ChunkingArgs, arg)
    minsize = !has_minsize(ca) ? nothing : min(ca.minsize, length(arg))
    return (; ca.n, ca.size, minsize, ca.split)
end

# Human-readable one-line summary of the chunking settings (used by the `show` methods).
_chunkingstr(ca::ChunkingArgs{NoChunking}) = "none"
function _chunkingstr(ca::ChunkingArgs{FixedCount})
    str = "fixed count ($(ca.n)), split :$(_splitid(ca.split))"
    if has_minsize(ca)
        str = str * ", minimum chunk size $(ca.minsize)"
    end
    str
end
function _chunkingstr(ca::ChunkingArgs{FixedSize})
    # Unlike the fixed-count variant, a minimum chunk size is not reported here.
    str = "fixed size ($(ca.size)), split :$(_splitid(ca.split))"
    str
end

# Link between a scheduler and its chunking arguments
# The first and only the first method must be overloaded for each scheduler
# that supports chunking.
# Fallback: a scheduler that does not overload `chunking_args` has chunking disabled.
chunking_args(::Scheduler) = ChunkingArgs(NoChunking)

# Field accessors, forwarded to the scheduler's `ChunkingArgs`.
nchunks(sched::Scheduler) = chunking_args(sched).n
chunksize(sched::Scheduler) = chunking_args(sched).size
chunksplit(sched::Scheduler) = chunking_args(sched).split
minchunksize(sched::Scheduler) = chunking_args(sched).minsize

# Predicates, forwarded to the scheduler's `ChunkingArgs`.
has_nchunks(sched::Scheduler) = has_n(chunking_args(sched))
has_chunksize(sched::Scheduler) = has_size(chunking_args(sched))
has_chunksplit(sched::Scheduler) = has_split(chunking_args(sched))
has_minchunksize(sched::Scheduler) = has_minsize(chunking_args(sched))

function chunkingargs_to_kwargs(sched::Scheduler, arg)
    chunkingargs_to_kwargs(chunking_args(sched), arg)
end
chunking_mode(sched::Scheduler) = chunking_mode(chunking_args(sched))
chunking_enabled(sched::Scheduler) = chunking_enabled(chunking_args(sched))
_chunkingstr(sched::Scheduler) = _chunkingstr(chunking_args(sched))

"""
    default_nchunks(::Type{<:Scheduler})

Hardcoded default number of chunks, if not provided by the user. Can depend on the kind of scheduler.
The generic fallback is `nthreads(:default)`.
"""
function default_nchunks end
default_nchunks(::Type{<:Scheduler}) = nthreads(:default)

"""
    DynamicScheduler (aka :dynamic)

The default dynamic scheduler. Divides the given collection into chunks and
then spawns a task per chunk to perform the requested operation in parallel.
The tasks are assigned to threads by Julia's dynamic scheduler and are non-sticky, that is,
they can migrate between threads.

Generally preferred since it is flexible, can provide load balancing, and is composable
with other multithreaded code.

## Keyword arguments:

- `nchunks::Integer` or `ntasks::Integer` (default `nthreads(:default)`, regardless of `threadpool`):
    * Determines the number of chunks (and thus also the number of parallel tasks).
    * Increasing `nchunks` can help with [load balancing](https://en.wikipedia.org/wiki/Load_balancing_(computing)), but at the expense of creating more overhead. For `nchunks <= nthreads()` there are not enough chunks for any load balancing.
    * Setting `nchunks < nthreads()` is an effective way to use only a subset of the available threads.
    * `nchunks` and `ntasks` are aliases; providing both throws an `ArgumentError`.
- `chunksize::Integer` (default not set)
    * Specifies the desired chunk size (instead of the number of chunks).
    * The options `chunksize` and `nchunks`/`ntasks` are **mutually exclusive** (only one of them may be set).
- `minchunksize::Union{Integer, Nothing}` (default `nothing`)
    * Sets a lower bound on the size of chunks. This argument takes priority over `nchunks`, so `treduce(+, 1:10; nchunks=10, minchunksize=5)` will only operate on `2` chunks for example.
- `split::Union{Symbol, OhMyThreads.Split}` (default `OhMyThreads.Consecutive()`):
    * Determines how the collection is divided into chunks (if chunking=true). By default, each chunk consists of contiguous elements and order is maintained.
    * See [ChunkSplitters.jl](https://github.com/JuliaFolds2/ChunkSplitters.jl) for more details and available options. We also allow users to pass `:consecutive` in place of `Consecutive()`, and `:roundrobin` in place of `RoundRobin()`
    * Beware that for `split=OhMyThreads.RoundRobin()` the order of elements isn't maintained and a reducer function must not only be associative but also **commutative**!
- `chunking::Bool` (default `true`):
    * Controls whether input elements are grouped into chunks (`true`) or not (`false`).
    * For `chunking=false`, the arguments `nchunks`/`ntasks`, `chunksize`, and `split` are ignored and input elements are regarded as "chunks" as is. Hence, there will be one parallel task spawned per input element. Note that, depending on the input, this **might spawn many(!) tasks** and can be costly!
- `threadpool::Symbol` (default `:default`):
    * Possible options are `:default` and `:interactive`.
    * The high-priority pool `:interactive` should be used very carefully since tasks on this threadpool should not be allowed to run for a long time without `yield`ing as it can interfere with [heartbeat](https://en.wikipedia.org/wiki/Heartbeat_(computing)) processes.
"""
struct DynamicScheduler{C <: ChunkingMode, S <: Split, threadpool} <: Scheduler
    chunking_args::ChunkingArgs{C, S}

    # Inner constructor: validates the threadpool symbol and stores it as the third
    # type parameter (there is no `threadpool` field).
    function DynamicScheduler(threadpool::Symbol, ca::ChunkingArgs)
        if !(threadpool in (:default, :interactive))
            throw(ArgumentError("threadpool must be either :default or :interactive"))
        end
        new{chunking_mode(ca), typeof(ca.split), threadpool}(ca)
    end
end

function DynamicScheduler(;
        threadpool::Symbol = :default,
        nchunks = nothing,
        ntasks = nothing, # "alias" for nchunks
        chunksize = nothing,
        split::Union{Split, Symbol} = Consecutive(),
        minchunksize = nothing,
        chunking::Bool = true
)
    # Resolve the `ntasks` alias; giving both spellings is rejected.
    if !isnothing(ntasks)
        if !isnothing(nchunks)
            throw(ArgumentError("For the dynamic scheduler, nchunks and ntasks are aliases and only one may be provided"))
        end
        nchunks = ntasks
    end
    ca = ChunkingArgs(DynamicScheduler;
        n = nchunks, size = chunksize, minsize = minchunksize, split, chunking)
    return DynamicScheduler(threadpool, ca)
end
from_symbol(::Val{:dynamic}) = DynamicScheduler
chunking_args(sched::DynamicScheduler) = sched.chunking_args
# The threadpool is read back from the type parameter.
threadpool(::DynamicScheduler{C, S, T}) where {C, S, T} = T

# Multi-line `text/plain` display: scheduler name, chunking summary, threadpool.
function Base.show(io::IO, mime::MIME{Symbol("text/plain")}, s::DynamicScheduler)
    print(io, "DynamicScheduler", "\n")
    cstr = _chunkingstr(s.chunking_args)
    println(io, "├ Chunking: ", cstr)
    print(io, "└ Threadpool: ", threadpool(s))
end

"""
    StaticScheduler (aka :static)

A static low-overhead scheduler. Divides the given collection into chunks and
then spawns a task per chunk to perform the requested operation in parallel.
The tasks are statically assigned to threads up front and are made *sticky*, that is,
they are guaranteed to stay on the assigned threads (**no task migration**).

Can sometimes be more performant than `DynamicScheduler` when the workload is (close to)
uniform and, because of the lower overhead, for small workloads.
Isn't well composable with other multithreaded code though.
## Keyword arguments:

- `nchunks::Integer` or `ntasks::Integer` (default `nthreads()`):
    * Determines the number of chunks (and thus also the number of parallel tasks).
    * Setting `nchunks < nthreads()` is an effective way to use only a subset of the available threads.
    * For `nchunks > nthreads()` the chunks will be distributed to the available threads in a round-robin fashion.
    * `nchunks` and `ntasks` are aliases; providing both throws an `ArgumentError`.
- `chunksize::Integer` (default not set)
    * Specifies the desired chunk size (instead of the number of chunks).
    * The options `chunksize` and `nchunks`/`ntasks` are **mutually exclusive** (only one of them may be set).
- `minchunksize::Union{Integer, Nothing}` (default `nothing`)
    * Sets a lower bound on the size of chunks. This argument takes priority over `nchunks`, so `treduce(+, 1:10; nchunks=10, minchunksize=5)` will only operate on `2` chunks for example.
- `chunking::Bool` (default `true`):
    * Controls whether input elements are grouped into chunks (`true`) or not (`false`).
    * For `chunking=false`, the arguments `nchunks`/`ntasks`, `chunksize`, and `split` are ignored and input elements are regarded as "chunks" as is. Hence, there will be one parallel task spawned per input element. Note that, depending on the input, this **might spawn many(!) tasks** and can be costly!
- `split::Union{Symbol, OhMyThreads.Split}` (default `OhMyThreads.Consecutive()`):
    * Determines how the collection is divided into chunks. By default, each chunk consists of contiguous elements and order is maintained.
    * See [ChunkSplitters.jl](https://github.com/JuliaFolds2/ChunkSplitters.jl) for more details and available options. We also allow users to pass `:consecutive` in place of `Consecutive()`, and `:roundrobin` in place of `RoundRobin()`
    * Beware that for `split=OhMyThreads.RoundRobin()` the order of elements isn't maintained and a reducer function must not only be associative but also **commutative**!
"""
struct StaticScheduler{C <: ChunkingMode, S <: Split} <: Scheduler
    chunking_args::ChunkingArgs{C, S}
end

function StaticScheduler(;
        nchunks = nothing,
        ntasks = nothing, # "alias" for nchunks
        chunksize = nothing,
        minchunksize = nothing,
        split::Union{Split, Symbol} = Consecutive(),
        chunking::Bool = true
)
    # Resolve the `ntasks` alias; giving both spellings is rejected.
    if !isnothing(ntasks)
        if !isnothing(nchunks)
            throw(ArgumentError("For the static scheduler, nchunks and ntasks are aliases and only one may be provided"))
        end
        nchunks = ntasks
    end
    ca = ChunkingArgs(StaticScheduler;
        n = nchunks, size = chunksize, minsize = minchunksize, split, chunking)
    return StaticScheduler(ca)
end
from_symbol(::Val{:static}) = StaticScheduler
chunking_args(sched::StaticScheduler) = sched.chunking_args

# Multi-line `text/plain` display. The threadpool line is a fixed string: this
# scheduler has no threadpool setting.
function Base.show(io::IO, mime::MIME{Symbol("text/plain")}, s::StaticScheduler)
    print(io, "StaticScheduler", "\n")
    cstr = _chunkingstr(s.chunking_args)
    println(io, "├ Chunking: ", cstr)
    print(io, "└ Threadpool: default")
end

"""
    GreedyScheduler (aka :greedy)

A greedy dynamic scheduler. The elements are put into a shared workqueue and dynamic,
non-sticky, tasks are spawned to process the elements of the queue with each task taking a new
element from the queue as soon as the previous one is done.

Note that elements are processed in a non-deterministic order, and thus a potential reducing
function **must** be [commutative](https://en.wikipedia.org/wiki/Commutative_property) in
addition to being associative, or you could get incorrect results!

Can be a good choice for load-balancing slower, uneven computations, but does carry
some additional overhead.

## Keyword arguments:

- `ntasks::Integer` (default `nthreads()`):
    * Determines the number of parallel tasks to be spawned. Must be positive.
    * Setting `ntasks < nthreads()` is an effective way to use only a subset of the available threads.
- `chunking::Bool` (default `false`):
    * Controls whether input elements are grouped into chunks (`true`) or not (`false`) before being put into the shared workqueue.
This can improve the performance especially if there are many iterations each of which are computationally cheap.
    * If `nchunks` or `chunksize` are explicitly specified, `chunking` will be automatically set to `true`.
- `nchunks::Integer` (default `10 * nthreads()`):
    * Determines the number of chunks (that will eventually be put into the shared workqueue).
    * Increasing `nchunks` can help with [load balancing](https://en.wikipedia.org/wiki/Load_balancing_(computing)). For `nchunks <= nthreads()` there are not enough chunks for any load balancing.
- `chunksize::Integer` (default not set)
    * Specifies the desired chunk size (instead of the number of chunks).
    * The options `chunksize` and `nchunks` are **mutually exclusive** (only one of them may be set).
- `minchunksize::Union{Integer, Nothing}` (default `nothing`)
    * Sets a lower bound on the size of chunks. This argument takes priority over `nchunks`, so `treduce(+, 1:10; nchunks=10, minchunksize=5)` will only operate on `2` chunks for example.
- `split::Union{Symbol, OhMyThreads.Split}` (default `OhMyThreads.RoundRobin()`):
    * Determines how the collection is divided into chunks (if chunking=true).
    * See [ChunkSplitters.jl](https://github.com/JuliaFolds2/ChunkSplitters.jl) for more details and available options. We also allow users to pass `:consecutive` in place of `Consecutive()`, and `:roundrobin` in place of `RoundRobin()`
"""
struct GreedyScheduler{C <: ChunkingMode, S <: Split} <: Scheduler
    ntasks::Int
    chunking_args::ChunkingArgs{C, S}

    # Inner constructor: rejects non-positive task counts.
    function GreedyScheduler(ntasks::Integer, ca::ChunkingArgs)
        ntasks > 0 || throw(ArgumentError("ntasks must be a positive integer"))
        return new{chunking_mode(ca), typeof(ca.split)}(ntasks, ca)
    end
end

function GreedyScheduler(;
        ntasks::Integer = nthreads(),
        nchunks = nothing,
        chunksize = nothing,
        minchunksize = nothing,
        split::Union{Split, Symbol} = RoundRobin(),
        chunking::Bool = false
)
    # Explicitly passing `nchunks` or `chunksize` switches chunking on, overriding
    # `chunking = false`.
    if !(isnothing(nchunks) && isnothing(chunksize))
        chunking = true
    end
    ca = ChunkingArgs(GreedyScheduler;
        n = nchunks, size = chunksize, minsize = minchunksize, split, chunking)
    return GreedyScheduler(ntasks, ca)
end
from_symbol(::Val{:greedy}) = GreedyScheduler
chunking_args(sched::GreedyScheduler) = sched.chunking_args
# Scheduler-specific default chunk count (ten times the generic default).
default_nchunks(::Type{GreedyScheduler}) = 10 * nthreads(:default)

# Multi-line `text/plain` display: scheduler name, task count, chunking summary and a
# fixed threadpool line.
function Base.show(io::IO, mime::MIME{Symbol("text/plain")}, s::GreedyScheduler)
    print(io, "GreedyScheduler", "\n")
    println(io, "├ Num. tasks: ", s.ntasks)
    cstr = _chunkingstr(s)
    println(io, "├ Chunking: ", cstr)
    print(io, "└ Threadpool: default")
end

"""
    SerialScheduler (aka :serial)

A scheduler for turning off any multithreading and running the code in serial. It aims to
make parallel functions like, e.g., `tmapreduce(sin, +, 1:100)` behave like their serial
counterparts, e.g., `mapreduce(sin, +, 1:100)`.

Note that `SerialScheduler` has no arguments and will ignore any that are passed to it. This is
to make it easier to switch to the serial scheduler without having to change the rest of the code.
"""
struct SerialScheduler <: Scheduler
    # Dummy constructor to allow ignoring settings for other schedulers
    SerialScheduler(; _...)
= new()
end
from_symbol(::Val{:serial}) = SerialScheduler

end # module

================================================
FILE: src/tools.jl
================================================
module Tools

using Base.Threads: nthreads

"""
    nthtid(n)

Returns the thread id of the `n`th Julia thread in the `:default` threadpool.

Throws a `BoundsError` unless `1 <= n <= nthreads(:default)`. As usual for `@boundscheck`,
the check is skipped when the call is inlined into an `@inbounds` context.
"""
@inline function nthtid(n)
    @static if VERSION < v"1.9"
        # A bare comparison inside `@boundscheck` is evaluated and discarded, so the
        # check has to throw explicitly to have any effect.
        @boundscheck if !(1 <= n <= nthreads())
            throw(BoundsError(Base.OneTo(nthreads()), n))
        end
        return n
    else
        @boundscheck if !(1 <= n <= nthreads(:default))
            throw(BoundsError(Base.OneTo(nthreads(:default)), n))
        end
        return n + Threads.threadpoolsize(:interactive) # default threads after interactive threads
    end
end

"""
    taskid() :: UInt

Return a `UInt` identifier for the current running [Task](https://docs.julialang.org/en/v1/base/parallel/#Core.Task). This identifier will be unique so long as references to the task it came from still exist.
"""
taskid() = objectid(current_task())

"""
May be used to mark a region in parallel code to be executed by a single task only
(all other tasks shall skip over it).

See [`try_enter!`](@ref) and [`reset!`](@ref).
"""
mutable struct OnlyOneRegion
    # The task that claimed the region, or `nothing` while it is unclaimed.
    @atomic task::Union{Task, Nothing}
    OnlyOneRegion() = new(nothing)
end

"""
    try_enter!(f, s::OnlyOneRegion)

When called from multiple parallel tasks (on a shared `s::OnlyOneRegion`) only a single
task will execute `f`. The task that claimed the region executes `f` again on every later
call, until the region is [`reset!`](@ref). Always returns `nothing`.

## Example

```julia
using OhMyThreads: @tasks
using OhMyThreads.Tools: OnlyOneRegion, try_enter!

only_one = OnlyOneRegion()

@tasks for i in 1:10
    @set ntasks = 10

    println(i, ": before")
    try_enter!(only_one) do
        println(i, ": only printed by a single task")
        sleep(1)
    end
    println(i, ": after")
end
```
"""
function try_enter!(f, s::OnlyOneRegion)
    ct = current_task()
    t = @atomic :monotonic s.task
    # Fast path: the region is already claimed by a different task.
    if !isnothing(t) && ct != t
        return
    end
    # Either this task already owns the region, or it is still unclaimed and this task
    # wins the atomic `nothing => ct` swap.
    if ct == t || (@atomicreplace s.task nothing=>ct).success
        f()
    end
    return
end

"""
Reset the `OnlyOneRegion` (so that it can be used again).
"""
function reset!(s::OnlyOneRegion)
    @atomic s.task = nothing
    return
end

"""
    SimpleBarrier(n::Integer)

Simple reusable barrier for `n` parallel tasks.
Given `b = SimpleBarrier(n)` and `n` parallel tasks, each task that calls
`wait(b)` will block until the other `n-1` tasks have called `wait(b)` as well.

## Example
```
n = nthreads()
barrier = SimpleBarrier(n)
@sync for i in 1:n
    @spawn begin
        println("A")
        wait(barrier) # synchronize all tasks
        println("B")
        wait(barrier) # synchronize all tasks (reusable)
        println("C")
    end
end
```
"""
mutable struct SimpleBarrier
    const n::Int64              # number of tasks that have to arrive
    const c::Threads.Condition  # condition (and its lock) the waiting tasks block on
    cnt::Int64                  # tasks that have arrived in the current round

    function SimpleBarrier(n::Integer)
        new(n, Threads.Condition(), 0)
    end
end

function Base.wait(b::SimpleBarrier)
    lock(b.c)
    try
        b.cnt += 1
        if b.cnt == b.n
            # Last task to arrive: reset the counter (this is what makes the barrier
            # reusable) and wake up everyone waiting on the condition.
            b.cnt = 0
            notify(b.c)
        else
            wait(b.c)
        end
    finally
        unlock(b.c)
    end
end

end # Tools

================================================
FILE: src/types.jl
================================================
"""
    struct WithTaskLocals{F, TLVs <: Tuple{Vararg{TaskLocalValue}}} <: Function

This callable function-like object is meant to represent a function which closes over some
[`TaskLocalValues`](https://github.com/vchuravy/TaskLocalValues.jl). That is, if you do

```
TLV{T} = TaskLocalValue{T}
f = WithTaskLocals((TLV{Int}(() -> 1), TLV{Int}(() -> 2))) do (x, y)
    z -> (x + y)/z
end
```
then that is equivalent to
```
g = let x = TLV{Int}(() -> 1), y = TLV{Int}(() -> 2)
    z -> let x = x[], y=y[]
        (x + y)/z
    end
end
```
however, the main difference is that you can call [`promise_task_local`](@ref) on a
`WithTaskLocals` closure in order to turn it into something equivalent to
```
let x=x[], y=y[]
    z -> (x + y)/z
end
```
which doesn't have the overhead of accessing the `task_local_storage` each time the closure is called.
This of course will lose the safety advantages of `TaskLocalValue`, so you should never do
`f_local = promise_task_local(f)` and then pass `f_local` to some unknown function, because if that
unknown function calls `f_local` on a new task, you'll hit a race condition.
"""
struct WithTaskLocals{F, TLVs <: Tuple{Vararg{TaskLocalValue}}} <: Function
    inner_func::F     # called with the tuple of unwrapped task-local values
    tasklocals::TLVs  # the `TaskLocalValue`s that get unwrapped
end

"""
    promise_task_local(f) = f
    promise_task_local(f::WithTaskLocals) = f.inner_func(map(x -> x[], f.tasklocals))

Take a `WithTaskLocals` closure, grab the `TaskLocalValue`s, and pass them to the closure. That is,
it turns a `WithTaskLocals` closure from the equivalent of
```
TLV{T} = TaskLocalValue{T}
let x = TLV{Int}(() -> 1), y = TLV{Int}(() -> 2)
    z -> let x = x[], y=y[]
        (x + y)/z
    end
end
```
into the equivalent of
```
let x = TLV{Int}(() -> 1), y = TLV{Int}(() -> 2)
    let x = x[], y = y[]
        z -> (x + y)/z
    end
end
```
which doesn't have the overhead of accessing the `task_local_storage` each time the closure is called.
This of course will lose the safety advantages of `TaskLocalValue`, so you should never do
`f_local = promise_task_local(f)` and then pass `f_local` to some unknown function, because if that
unknown function calls `f_local` on a new task, you'll hit a race condition.

Any argument that is not a `WithTaskLocals` is returned unchanged.
"""
function promise_task_local(f::WithTaskLocals{F}) where {F}
    # Unwrap every task-local value on the calling task and hand the resulting tuple
    # to the stored function.
    f.inner_func(map(x -> x[], f.tasklocals))
end
promise_task_local(f::Any) = f

# Calling a `WithTaskLocals` directly unwraps the task-local values on every call and
# then forwards all arguments.
function (f::WithTaskLocals{F})(args...; kwargs...) where {F}
    promise_task_local(f)(args...; kwargs...)
end

"""
    ChannelLike(itr)

This struct wraps an indexable object such that it can be iterated by concurrent tasks in a
safe manner similar to a `Channel`.

`ChannelLike(itr)` is conceptually similar to:
```julia
Channel{eltype(itr)}(length(itr)) do ch
    foreach(i -> put!(ch, i), itr)
end
```
i.e. creating a channel, `put!`ing all elements of `itr` into it and closing it. The advantage
is that `ChannelLike` doesn't copy the data.
# Examples
```julia
ch = OhMyThreads.ChannelLike(1:5)

@sync for taskid in 1:2
    Threads.@spawn begin
        for i in ch
            println("Task #\$taskid processing item \$i")
            sleep(1 / i)
        end
    end
end

# output

Task #1 processing item 1
Task #2 processing item 2
Task #2 processing item 3
Task #2 processing item 4
Task #1 processing item 5
```

Note that `ChannelLike` is stateful (just like a `Channel`), so you can't iterate over it twice.
`length` and `eltype` are forwarded to the wrapped object, so `length` does not shrink as
elements are consumed.

The wrapped iterator must support `firstindex(itr)::Int`, `lastindex(itr)::Int` and
`getindex(itr, ::Int)`.
"""
mutable struct ChannelLike{T}
    const itr::T
    # Index handed out most recently; shared by all iterating tasks.
    @atomic idx::Int
    function ChannelLike(itr::T) where {T}
        # Start one before the first index so that the first increment yields `firstindex(itr)`.
        return new{T}(itr, firstindex(itr) - 1)
    end
end

Base.length(ch::ChannelLike) = length(ch.itr)
Base.eltype(ch::ChannelLike) = eltype(ch.itr)

# The iteration state is always `nothing`; the position lives in the atomic `idx` field.
function Base.iterate(ch::ChannelLike, ::Nothing = nothing)
    # Atomically advance the shared counter and take the updated value as this
    # caller's index.
    this = @atomic ch.idx += 1
    if this <= lastindex(ch.itr)
        # The counter starts at `firstindex(itr) - 1` and only increases, so `this`
        # is at least `firstindex(itr)` here.
        return (@inbounds(ch.itr[this]), nothing)
    else
        return nothing
    end
end

================================================
FILE: test/Aqua.jl
================================================
using Aqua

@testset "Aqua.jl" begin
    Aqua.test_all(
        OhMyThreads;
        # ambiguities=(exclude=[SomePackage.some_function], broken=true),
        # stale_deps=(ignore=[:SomePackage],),
        deps_compat=(ignore=[:Test],),
        # piracies=false,
        persistent_tasks=false,
    )
end

================================================
FILE: test/runtests.jl
================================================
using Test, OhMyThreads
using OhMyThreads: TaskLocalValue, WithTaskLocals, @fetch, promise_task_local
using OhMyThreads: Consecutive, RoundRobin
using OhMyThreads.Experimental: @barrier
using OhMyThreads.Implementation: BoxedVariableError

@info "Testing with $(Threads.nthreads(:default)),$(Threads.nthreads(:interactive)) threads."
include("Aqua.jl") sets_to_test = [(~ = isapprox, f = sin ∘ *, op = +, itrs = (rand(ComplexF64, 10, 10), rand(-10:10, 10, 10)), init = complex(0.0)) (~ = isapprox, f = cos, op = max, itrs = (1:100000,), init = 0.0) (~ = (==), f = round, op = vcat, itrs = (randn(1000),), init = Float64[]) (~ = (==), f = last, op = *, itrs = ([1 => "a", 2 => "b", 3 => "c", 4 => "d", 5 => "e"],), init = "")] ChunkedGreedy(; kwargs...) = GreedyScheduler(; kwargs...) @testset "Basics" begin for (; ~, f, op, itrs, init) in sets_to_test @testset "f=$f, op=$op, itrs::$(typeof(itrs))" begin @testset for sched in ( StaticScheduler, DynamicScheduler, GreedyScheduler, DynamicScheduler{OhMyThreads.Schedulers.NoChunking}, SerialScheduler, ChunkedGreedy) @testset for split in (Consecutive(), RoundRobin(), :consecutive, :roundrobin) for nchunks in (1, 2, 6) for minchunksize ∈ (nothing, 1, 3) if sched == GreedyScheduler scheduler = sched(; ntasks = nchunks, minchunksize) elseif sched == DynamicScheduler{OhMyThreads.Schedulers.NoChunking} scheduler = DynamicScheduler(; chunking = false) elseif sched == SerialScheduler scheduler = SerialScheduler(; nchunks) else scheduler = sched(; nchunks, split, minchunksize) end kwargs = (; scheduler) if (split in (RoundRobin(), :roundrobin) || sched ∈ (GreedyScheduler, ChunkedGreedy)) || op ∉ (vcat, *) # scatter and greedy only works for commutative operators! else mapreduce_f_op_itr = mapreduce(f, op, itrs...) @test tmapreduce(f, op, itrs...; init, kwargs...) ~ mapreduce_f_op_itr @test treducemap(op, f, itrs...; init, kwargs...) ~ mapreduce_f_op_itr @test treduce(op, f.(itrs...); init, kwargs...) ~ mapreduce_f_op_itr end split in (RoundRobin(), :roundrobin) && continue map_f_itr = map(f, itrs...) @test all(tmap(f, Any, itrs...; kwargs...) .~ map_f_itr) @test all(tcollect(Any, (f(x...) for x in collect(zip(itrs...))); kwargs...) .~ map_f_itr) @test all(tcollect(Any, f.(itrs...); kwargs...) 
.~ map_f_itr) RT = Core.Compiler.return_type(f, Tuple{eltype.(itrs)...}) @test tmap(f, RT, itrs...; kwargs...) ~ map_f_itr @test tcollect(RT, (f(x...) for x in collect(zip(itrs...))); kwargs...) ~ map_f_itr @test tcollect(RT, f.(itrs...); kwargs...) ~ map_f_itr if sched ∉ (GreedyScheduler, ChunkedGreedy) @test tmap(f, itrs...; kwargs...) ~ map_f_itr @test tcollect((f(x...) for x in collect(zip(itrs...))); kwargs...) ~ map_f_itr @test tcollect(f.(itrs...); kwargs...) ~ map_f_itr end end end end end end end end; @testset "ChunkSplitters.Chunk" begin x = rand(100) chnks = OhMyThreads.index_chunks(x; n = Threads.nthreads()) for scheduler in ( DynamicScheduler(), DynamicScheduler(; chunking = false), StaticScheduler(; chunking = false)) @testset "$scheduler" begin @test tmap(x -> sin.(x), chnks; scheduler) ≈ map(x -> sin.(x), chnks) @test tmapreduce(x -> sin.(x), vcat, chnks; scheduler) ≈ mapreduce(x -> sin.(x), vcat, chnks) @test tcollect(chnks; scheduler) == collect(chnks) @test treduce(vcat, chnks; scheduler) == reduce(vcat, chnks) @test isnothing(tforeach(x -> sin.(x), chnks; scheduler)) end end # enumerate(chunks) data = 1:100 @test tmapreduce(+, enumerate(OhMyThreads.index_chunks(data; n=5)); chunking=false) do (i, idcs) [i, sum(@view(data[idcs]))] end == [sum(1:5), sum(data)] @test tmapreduce(+, enumerate(OhMyThreads.index_chunks(data; size=5)); chunking=false) do (i, idcs) [i, sum(@view(data[idcs]))] end == [sum(1:20), sum(data)] @test tmap(enumerate(OhMyThreads.index_chunks(data; n=5)); chunking=false) do (i, idcs) [i, idcs] end == [[1, 1:20], [2, 21:40], [3, 41:60], [4, 61:80], [5, 81:100]] end; @testset "macro API" begin # basic @test @tasks(for i in 1:3 i end) |> isnothing # reduction @test @tasks(for i in 1:3 @set reducer = (+) i end) == 6 # scheduler settings for sched in (StaticScheduler(), DynamicScheduler(), GreedyScheduler()) @test @tasks(for i in 1:3 @set scheduler = sched i end) |> isnothing end # scheduler settings as symbols @test @tasks(for i in 
1:3 @set scheduler = :static i end) |> isnothing @test @tasks(for i in 1:3 @set scheduler = :dynamic i end) |> isnothing @test @tasks(for i in 1:3 @set scheduler = :greedy i end) |> isnothing # @set begin ... end @test @tasks(for i in 1:10 @set begin scheduler = StaticScheduler() reducer = (+) end i end) == 55 # multiple @set @test @tasks(for i in 1:10 @set scheduler = StaticScheduler() i @set reducer = (+) end) == 55 # @set init @test @tasks(for i in 1:10 @set begin reducer = (+) init = 0.0 end i end) === 55.0 @test @tasks(for i in 1:10 @set begin reducer = (+) init = 0.0 * im end i end) === (55.0 + 0.0im) # top-level "kwargs" @test @tasks(for i in 1:3 @set scheduler = :static @set ntasks = 1 i end) |> isnothing @test @tasks(for i in 1:3 @set scheduler = :static @set nchunks = 2 i end) |> isnothing @test @tasks(for i in 1:3 @set scheduler = :dynamic @set chunksize = 2 i end) |> isnothing @test @tasks(for i in 1:3 @set scheduler = :dynamic @set chunking = false i end) |> isnothing @test @tasks(for i in 1:4 @set minchunksize=2 i end) |> isnothing @test_throws ArgumentError @tasks(for i in 1:3 @set scheduler = DynamicScheduler() @set chunking = false i end) @test_throws MethodError @tasks(for i in 1:3 @set scheduler = :dynamic @set asd = 123 i end) # TaskLocalValue ntd = 2 * Threads.nthreads() ptrs = Vector{Ptr{Nothing}}(undef, ntd) tids = Vector{UInt64}(undef, ntd) tid() = OhMyThreads.Tools.taskid() @test @tasks(for i in 1:ntd @local C::Vector{Float64} = rand(3) @set scheduler = :static ptrs[i] = pointer_from_objref(C) tids[i] = tid() end) |> isnothing # check that different iterations of a task # have access to the same C (same pointer) for t in unique(tids) @test allequal(ptrs[findall(==(t), tids)]) end # TaskLocalValue (another fundamental check) @test @tasks(for i in 1:ntd @local x::Ref{Int64} = Ref(0) @set reducer = (+) @set scheduler = :static x[] += 1 x[] end) == 1.5 * ntd # if a new x would be allocated per iteration, we'd get ntd here. 
# TaskLocalValue (begin ... end block), inferred TLV type
    @test @inferred (() -> @tasks for i in 1:10
        @local begin
            C = fill(4, 3, 3)
            x = fill(5.0, 3)
        end
        @set reducer = (+)
        sum(C * x)
    end)() == 1800

    # hygiene / escaping
    var = 3
    sched = StaticScheduler()
    sched_sym = :static
    data = rand(10)
    red = (a, b) -> a + b
    n = 2
    @test @tasks(for d in data
        @set scheduler = sched
        @set reducer = red
        var * d
    end) ≈ var * sum(data)
    @test @tasks(for d in data
        @set scheduler = sched_sym
        @set ntasks = n
        @set reducer = red
        var * d
    end) ≈ var * sum(data)

    struct SingleInt
        x::Int
    end
    # The operator is written as `(+)` so that it can never be read as the
    # start of an expression continuing onto the following line.
    @test @tasks(for _ in 1:10
        @local C = SingleInt(var)
        @set reducer = (+)
        C.x
    end) == 10 * var

    # enumerate(chunks)
    let data = collect(1:100)
        @test @tasks(for (i, idcs) in enumerate(OhMyThreads.index_chunks(data; n=5))
            @set reducer = (+)
            @set chunking = false
            [i, sum(@view(data[idcs]))]
        end) == [sum(1:5), sum(data)]
        @test @tasks(for (i, idcs) in enumerate(OhMyThreads.index_chunks(data; size=5))
            @set reducer = (+)
            [i, sum(@view(data[idcs]))]
        end) == [sum(1:20), sum(data)]
        @test @tasks(for (i, idcs) in enumerate(OhMyThreads.index_chunks(1:100; n=5))
            @set chunking=false
            @set collect=true
            [i, idcs]
        end) == [[1, 1:20], [2, 21:40], [3, 41:60], [4, 61:80], [5, 81:100]]
    end
end;

# Calls a `WithTaskLocals` closure directly, from freshly spawned tasks, and
# through `promise_task_local`, and checks the counters observed in each case.
@testset "WithTaskLocals" begin
    let x = TaskLocalValue{Base.RefValue{Int}}(() -> Ref{Int}(0)),
        y = TaskLocalValue{Base.RefValue{Int}}(() -> Ref{Int}(0))
        # Equivalent to
        # function f()
        #    x[][] += 1
        #    y[][] += 1
        #    x[][], y[][]
        # end
        f = WithTaskLocals((x, y)) do (x, y)
            function ()
                x[] += 1
                y[] += 1
                x[], y[]
            end
        end
        # Make sure we can call `f` like a regular function
        @test f() == (1, 1)
        @test f() == (2, 2)
        @test @fetch(f()) == (1, 1)
        # Acceptable use of promise_task_local
        @test @fetch(promise_task_local(f)()) == (1, 1)
        # Acceptable use of promise_task_local
        @test promise_task_local(f)() == (3, 3)
        # Acceptable use of promise_task_local
        @test @fetch(promise_task_local(f)()) == (1, 1)
        # Acceptable use of promise_task_local
        g() = @fetch((promise_task_local(f)();
                      promise_task_local(f)();
                      f()))
        @test g() == (3, 3)
        @test g() == (3, 3)

        h = promise_task_local(f)
        # Unacceptable use of `promise_task_local`
        # This is essentially testing that if you use `promise_task_local`, then pass that to another task,
        # you could get data races, since we here have a different thread writing to another thread's value.
        @test @fetch(h()) == (4, 4)
        @test @fetch(h()) == (5, 5)
    end
end;

# Checks which chunking mode each scheduler constructor reports for the
# different combinations of `nchunks`, `chunksize` and `chunking`.
@testset "chunking mode + chunksize option" begin
    @test OhMyThreads.Schedulers.chunking_mode(SerialScheduler()) ==
          OhMyThreads.Schedulers.NoChunking
    for sched in (DynamicScheduler, StaticScheduler, GreedyScheduler)
        @test sched() isa sched
        @test sched(; chunksize = 2) isa sched

        @test OhMyThreads.Schedulers.chunking_mode(sched(; chunksize = 2)) ==
              OhMyThreads.Schedulers.FixedSize
        @test OhMyThreads.Schedulers.chunking_mode(sched(; nchunks = 2)) ==
              OhMyThreads.Schedulers.FixedCount
        @test OhMyThreads.Schedulers.chunking_mode(sched(; chunking = false)) ==
              OhMyThreads.Schedulers.NoChunking
        if sched != GreedyScheduler
            # For (Dynamic|Static)Scheduler `chunking = false` disables all chunking
            # arguments
            @test OhMyThreads.Schedulers.chunking_mode(sched(;
                nchunks = 2, chunksize = 4, chunking = false)) ==
                  OhMyThreads.Schedulers.NoChunking
            @test OhMyThreads.Schedulers.chunking_mode(sched(;
                nchunks = nothing, chunksize = nothing, split = :whatever,
                chunking = false)) ==
                  OhMyThreads.Schedulers.NoChunking
            @test OhMyThreads.Schedulers.chunking_enabled(sched(;
                nchunks = nothing, chunksize = nothing, chunking = false)) == false
            @test OhMyThreads.Schedulers.chunking_enabled(sched(;
                nchunks = 2, chunksize = 4, chunking = false)) == false
        else
            # For GreedyScheduler `nchunks` or `chunksize` overrides `chunking = false`
            @test OhMyThreads.Schedulers.chunking_mode(sched(;
                nchunks = 2, chunking = false)) ==
                  OhMyThreads.Schedulers.FixedCount
            @test OhMyThreads.Schedulers.chunking_mode(sched(;
                chunksize = 2, chunking = false)) ==
                  OhMyThreads.Schedulers.FixedSize
            @test OhMyThreads.Schedulers.chunking_enabled(sched(;
                nchunks = 2, chunking = false)) == true
            @test OhMyThreads.Schedulers.chunking_enabled(sched(;
                chunksize = 4, chunking = false)) == true
        end
        @test OhMyThreads.Schedulers.chunking_enabled(sched(; chunksize = 2)) == true
        @test OhMyThreads.Schedulers.chunking_enabled(sched(; nchunks = 2)) == true
        @test_throws ArgumentError sched(; nchunks = 2, chunksize = 3)
        @test_throws ArgumentError sched(; nchunks = 2, split = :whatever)
        let scheduler = sched(; chunksize = 2, split = :batch)
            @test tmapreduce(sin, +, 1:10; scheduler, init=0.0) ≈ mapreduce(sin, +, 1:10)
            @test treduce(+, 1:10; scheduler, init=0.0) ≈ reduce(+, 1:10)
            @test tmap(sin, Float64, 1:10; scheduler) ≈ map(sin, 1:10)
            @test isnothing(tforeach(sin, 1:10; scheduler))
        end
    end
end;

# Scheduler options passed directly as keyword arguments to `tmapreduce`,
# with no scheduler, a scheduler instance, or a scheduler symbol.
@testset "top-level kwargs" begin
    res_tmr = mapreduce(sin, +, 1:10000)

    # scheduler not given
    @test tmapreduce(sin, +, 1:10000; ntasks = 2) ≈ res_tmr
    @test tmapreduce(sin, +, 1:10000; nchunks = 2) ≈ res_tmr
    @test tmapreduce(sin, +, 1:10000; split = RoundRobin()) ≈ res_tmr
    @test tmapreduce(sin, +, 1:10000; chunksize = 2) ≈ res_tmr
    @test tmapreduce(sin, +, 1:10000; chunking = false) ≈ res_tmr
    @test tmapreduce(sin, +, 1:10000; minchunksize=10) ≈ res_tmr
    @test tmapreduce(sin, +, 1:10; minchunksize=10) == mapreduce(sin, +, 1:10)

    # scheduler isa Scheduler
    @test tmapreduce(sin, +, 1:10000; scheduler = StaticScheduler()) ≈ res_tmr
    @test_throws ArgumentError tmapreduce(
        sin, +, 1:10000; ntasks = 2, scheduler = DynamicScheduler())
    @test_throws ArgumentError tmapreduce(
        sin, +, 1:10000; chunksize = 2, scheduler = DynamicScheduler())
    @test_throws ArgumentError tmapreduce(
        sin, +, 1:10000; split = RoundRobin(), scheduler = StaticScheduler())
    @test_throws ArgumentError tmapreduce(
        sin, +, 1:10000; ntasks = 3, scheduler = SerialScheduler())

    # scheduler isa Symbol
    for s in (:dynamic, :static, :serial, :greedy)
        @test tmapreduce(sin, +, 1:10000; scheduler = s, init = 0.0) ≈ res_tmr
    end
    for s in (:dynamic, :static, :greedy)
        @test tmapreduce(sin, +, 1:10000; ntasks = 2, scheduler = s, init = 0.0) ≈ res_tmr
    end
    for s in (:dynamic, :static)
        @test tmapreduce(sin, +, 1:10000; chunksize = 2, scheduler = s) ≈ res_tmr
        @test tmapreduce(sin, +, 1:10000; chunking = false, scheduler = s) ≈ res_tmr
        @test tmapreduce(sin, +, 1:10000; nchunks = 3, scheduler = s) ≈ res_tmr
        @test tmapreduce(sin, +, 1:10000; ntasks = 3, scheduler = s) ≈ res_tmr
        # Only the call itself is under test: it has to throw, so there is no
        # result that could be compared against `res_tmr`.
        @test_throws ArgumentError tmapreduce(
            sin, +, 1:10000; ntasks = 3, nchunks = 2, scheduler = s)
    end
    @test_throws ArgumentError tmapreduce(sin, +, 1:10000; scheduler = :whatever)
    @test_throws ArgumentError tmapreduce(
        sin, +, 1:10000; threadpool = :whatever, chunking = false)
end;

# The threaded functions applied to empty inputs, compared with their Base
# counterparts (or with the expected error type for `min` without `init`).
@testset "empty collections" begin
    # The error type expected from reducing an empty collection with `min`
    # is selected by Julia version.
    @static if VERSION < v"1.11.0-"
        err = MethodError
    else
        err = ArgumentError
    end
    for empty_coll in (11:9, Float64[])
        for f in (sin, x -> im * x, identity)
            for op in (+, *, min)
                # mapreduce
                for init in (0.0, 0, 0.0 * im, 0.0f0)
                    @test tmapreduce(f, op, empty_coll; init) == init
                end
                # foreach
                @test tforeach(f, empty_coll) |> isnothing
                # reduce
                if op != min
                    @test treduce(op, empty_coll) == reduce(op, empty_coll)
                else
                    @test_throws err treduce(op, empty_coll)
                end
                # map
                @test tmap(f, empty_coll) == map(f, empty_coll)
                # collect
                @test tcollect(empty_coll) == collect(empty_coll)
            end
        end
    end
end;

# for testing @one_by_one region
# A resource that records whether it is currently held; `acquire` throws if
# two callers ever overlap.
mutable struct SingleAccessOnly
    in_use::Bool
    const lck::ReentrantLock
    SingleAccessOnly() = new(false, ReentrantLock())
end

# Run `f()` while holding `o`.
# Throws `ErrorException("Already in use!")` if `o` is already marked as in
# use on entry, and `ErrorException("Conflict!")` if the mark has vanished by
# the time `f` is done. The lock only guards the flag updates, not `f` itself.
function acquire(f, o::SingleAccessOnly)
    lock(o.lck) do
        o.in_use && throw(ErrorException("Already in use!"))
        o.in_use = true
    end
    try
        f()
    finally
        lock(o.lck) do
            !o.in_use && throw(ErrorException("Conflict!"))
            o.in_use = false
        end
    end
end

@testset "regions" begin
    @testset "@one_by_one" begin
        sao = SingleAccessOnly()
        # NOTE: a name placed after `catch` is a variable that receives the
        # exception, not a type filter, so a bare `catch` is used throughout:
        # any exception makes the test fail.
        try
            @tasks for i in 1:10
                @set ntasks = 10
                @one_by_one begin
                    acquire(sao) do
                        sleep(0.01)
                    end
                end
            end
        catch
            @test false
        else
            @test true
        end

        # test escaping
        let x = Ref(0)
            y = Ref(0)
            @tasks for i in 1:10
                @set ntasks = 10

                y[] += 1 # not safe (race condition)
                @one_by_one begin
                    x[] += 1 # parallel-safe because inside of one_by_one region
                    acquire(sao) do
                        sleep(0.01)
                    end
                end
            end
            @test x[] == 10
        end

        test_f = () -> begin
            x = Ref(0)
            y = Ref(0)
            @tasks for i in 1:10
                @set ntasks = 10

                y[] += 1 # not safe (race condition)
                @one_by_one begin
                    x[] += 1 # parallel-safe because inside of one_by_one region
                    acquire(sao) do
                        sleep(0.01)
                    end
                end
            end
            return x[]
        end
        @test test_f() == 10
    end

    @testset "@only_one" begin
        let x = Ref(0)
            y = Ref(0)
            try
                @tasks for i in 1:10
                    @set ntasks = 10

                    y[] += 1 # not safe (race condition)
                    @only_one begin
                        x[] += 1 # parallel-safe because only a single task will execute this
                    end
                end
                @test x[] == 1 # only a single task should have incremented x
            catch
                # any exception counts as a failure
                @test false
            end
        end

        let x = Ref(0)
            y = Ref(0)
            try
                @tasks for i in 1:10
                    @set ntasks = 2

                    y[] += 1 # not safe (race condition)
                    @only_one begin
                        x[] += 1 # parallel-safe because only a single task will execute this
                    end
                end
                @test x[] == 5 # a single task should have incremented x 5 times
            catch
                # any exception counts as a failure
                @test false
            end
        end

        test_f = () -> begin
            x = Ref(0)
            y = Ref(0)
            @tasks for i in 1:10
                @set ntasks = 2

                y[] += 1 # not safe (race condition)
                @only_one begin
                    x[] += 1 # parallel-safe because only a single task will execute this
                end
            end
            return x[]
        end
        @test test_f() == 5
    end

    @testset "@only_one + @one_by_one" begin
        x = Ref(0)
        y = Ref(0)
        try
            @tasks for i in 1:10
                @set ntasks = 10

                @only_one begin
                    x[] += 1 # parallel-safe
                end

                @one_by_one begin
                    y[] += 1 # parallel-safe
                end
            end
            @test x[] == 1 && y[] == 10
        catch
            # any exception counts as a failure
            @test false
        end
    end
end;

@testset "@barrier" begin
    @test (@tasks for i in 1:20
        @set ntasks = 20
        @barrier
    end) |> isnothing

    # Expanding a `@tasks` loop that uses `@barrier` without `@set ntasks`
    # is expected to raise an error.
    @test try
        @macroexpand @tasks for i in 1:20
            @barrier
        end
        false
    catch
        true
    end

    @test try
        x = Threads.Atomic{Int64}(0)
        y = Threads.Atomic{Int64}(0)
        @tasks for i in 1:20
            @set ntasks = 20

            Threads.atomic_add!(x, 1)
            @barrier
            if x[] < 20 && y[] > 0
                # x hasn't reached 20 yet and y is already > 0
                error("shouldn't happen")
            end
            Threads.atomic_add!(y, 1)
        end
        true
    catch
        # any exception (including the `error` above) yields a failing test
        false
    end

    @test try
        x = Threads.Atomic{Int64}(0)
        y = Threads.Atomic{Int64}(0)
        @tasks for i in 1:20
            @set ntasks = 20

            Threads.atomic_add!(x, 1)
            @barrier
            Threads.atomic_add!(x, 1)
            @barrier
            if x[] < 40 && y[] > 0
                # x hasn't reached 40 yet and y is already > 0
                error("shouldn't happen")
            end
            Threads.atomic_add!(y, 1)
        end
        true
    catch
        # any exception (including the `error` above) yields a failing test
        false
    end
end

# The special macros also have to work when written module-qualified
# (`OhMyThreads.@set`, `OhMyThreads.@local`, ...).
@testset "verbose special macro usage" begin
    # OhMyThreads.@set
    @test @tasks(for i in 1:3
        OhMyThreads.@set reducer = (+)
        i
    end) == 6
    @test @tasks(for i in 1:3
        OhMyThreads.@set begin
            reducer = (+)
        end
        i
    end) == 6
    # OhMyThreads.@local
    ntd = 2 * Threads.nthreads()
    @test @tasks(for i in 1:ntd
        OhMyThreads.@local x::Ref{Int64} = Ref(0)
        OhMyThreads.@set begin
            reducer = (+)
            scheduler = :static
        end
        x[] += 1
        x[]
    end) == @tasks(for i in 1:ntd
        @local x::Ref{Int64} = Ref(0)
        @set begin
            reducer = (+)
            scheduler = :static
        end
        x[] += 1
        x[]
    end)
    # OhMyThreads.@only_one
    let x = Ref(0)
        y = Ref(0)
        try
            @tasks for i in 1:10
                OhMyThreads.@set ntasks = 10

                y[] += 1 # not safe (race condition)
                OhMyThreads.@only_one begin
                    x[] += 1 # parallel-safe because only a single task will execute this
                end
            end
            @test x[] == 1 # only a single task should have incremented x
        catch
            # any exception counts as a failure
            @test false
        end
    end
    # OhMyThreads.@one_by_one
    test_f = () -> begin
        sao = SingleAccessOnly()
        x = Ref(0)
        y = Ref(0)
        @tasks for i in 1:10
            OhMyThreads.@set ntasks = 10

            y[] += 1 # not safe (race condition)
            OhMyThreads.@one_by_one begin
                x[] += 1 # parallel-safe because inside of one_by_one region
                acquire(sao) do
                    sleep(0.01)
                end
            end
        end
        return x[]
    end
    @test test_f() == 10
end

# NOTE(review): the expected strings below are whitespace-sensitive and are
# reproduced exactly as they appear in the reviewed source. If the original
# literals contained line breaks or indentation, restore them - TODO confirm
# against the scheduler `show` methods.
@testset "show schedulers" begin
    nt = Threads.nthreads(:default)
    @test repr("text/plain", DynamicScheduler()) ==
          """ DynamicScheduler ├ Chunking: fixed count ($nt), split :consecutive └ Threadpool: default"""

    @test repr(
        "text/plain", DynamicScheduler(; chunking = false, threadpool = :interactive)) ==
          """ DynamicScheduler ├ Chunking: none └ Threadpool: interactive"""

    @test repr("text/plain", StaticScheduler()) ==
          """StaticScheduler ├ Chunking: fixed count ($nt), split :consecutive └ Threadpool: default"""

    @test repr("text/plain", StaticScheduler(; chunksize = 2, split = :scatter)) ==
          """ StaticScheduler ├ Chunking: fixed size (2), split :roundrobin └ Threadpool: default"""

    @test repr("text/plain", GreedyScheduler(; chunking = true)) ==
          """ GreedyScheduler ├ Num. tasks: $nt ├ Chunking: fixed count ($(10 * nt)), split :roundrobin └ Threadpool: default"""
end

# These tests only run when more than one thread is available.
if Threads.nthreads() > 1
    @testset "Boxing detection and error" begin
        let
            f1() = tmap(1:10) do i
                A = i
                sleep(rand()/10)
                A
            end
            f2() = tmap(1:10) do i
                local A = i
                sleep(rand()/10)
                A
            end
            @test f1() == 1:10
            @test f2() == 1:10
        end
        let
            f1() = tmap(1:10) do i
                A = i
                sleep(rand()/10)
                A
            end
            f2() = tmap(1:10) do i
                local A = i
                sleep(rand()/10)
                A
            end
            @test_throws BoxedVariableError f1()
            @test f2() == 1:10

            A = 1 # Cause spooky action-at-a-distance by making A outer-local to the whole let block!
        end
        let
            A = 1
            f1() = tmap(1:10) do i
                A = 1
            end
            # Throws even though the redefinition is 'harmless'. Only the call
            # is checked: it throws, so no return value exists to compare.
            @test_throws BoxedVariableError f1()

            @allow_boxed_captures begin
                f2() = tmap(1:10) do i
                    A = 1
                end
                @test f2() == ones(10)
            end

            # Can nest allow and disallow because they're scoped values!
            function f3()
                @disallow_boxed_captures begin
                    tmap(1:10) do i
                        A = 1
                    end
                end
            end
            @allow_boxed_captures begin
                @test_throws BoxedVariableError f3()
            end
        end
        @testset "@localize" begin
            A = 1
            if false
                A = 2
            end
            ## This stops A from being boxed!
            v = @localize A tmap(1:2) do _
                A
            end
            @test v == [1, 1]
        end
    end
end

# TODO: way more testing, and tests that are easier to deal with