Repository: JuliaFolds2/OhMyThreads.jl
Branch: master
Commit: fee46873b185
Files: 54
Total size: 222.3 KB

Directory structure:
gitextract_49advobv/

├── .JuliaFormatter.toml
├── .github/
│   ├── dependabot.yml
│   └── workflows/
│       ├── changelog.yml
│       ├── ci.yml
│       ├── compathelper.yml
│       ├── documentation.yml
│       ├── downgrade_CI.yml
│       └── tagbot.yml
├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── Project.toml
├── README.md
├── docs/
│   ├── Project.toml
│   ├── build_docs.jl
│   ├── make.jl
│   └── src/
│       ├── basics.md
│       ├── index.md
│       ├── literate/
│       │   ├── Project.toml
│       │   ├── boxing/
│       │   │   ├── Project.toml
│       │   │   ├── boxing.jl
│       │   │   └── boxing.md
│       │   ├── falsesharing/
│       │   │   ├── Project.toml
│       │   │   ├── falsesharing.jl
│       │   │   └── falsesharing.md
│       │   ├── integration/
│       │   │   ├── Project.toml
│       │   │   ├── integration.jl
│       │   │   └── integration.md
│       │   ├── juliaset/
│       │   │   ├── Project.toml
│       │   │   ├── juliaset.jl
│       │   │   └── juliaset.md
│       │   ├── mc/
│       │   │   ├── Project.toml
│       │   │   ├── mc.jl
│       │   │   └── mc.md
│       │   ├── tls/
│       │   │   ├── Project.toml
│       │   │   ├── tls.jl
│       │   │   └── tls.md
│       │   └── tomarkdown.sh
│       ├── refs/
│       │   ├── api.md
│       │   ├── experimental.md
│       │   └── internal.md
│       └── translation.md
├── ext/
│   └── MarkdownExt.jl
├── src/
│   ├── OhMyThreads.jl
│   ├── experimental.jl
│   ├── functions.jl
│   ├── implementation.jl
│   ├── macro_impl.jl
│   ├── macros.jl
│   ├── schedulers.jl
│   ├── tools.jl
│   └── types.jl
└── test/
    ├── Aqua.jl
    └── runtests.jl

================================================
FILE CONTENTS
================================================

================================================
FILE: .JuliaFormatter.toml
================================================
style = "sciml"

================================================
FILE: .github/dependabot.yml
================================================
# Dependabot configuration. Option reference:
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
version: 2
updates:
  # Monthly update checks for the `github-actions` ecosystem.
  - package-ecosystem: github-actions
    # Location of package manifests
    directory: "/"
    schedule:
      interval: monthly


================================================
FILE: .github/workflows/changelog.yml
================================================
name: changelog

on:
  pull_request:
    types:
      - opened
      - synchronize
      - reopened
      - ready_for_review
      - labeled
      - unlabeled

jobs:
  # Requires every pull request to update the changelog file.
  # Adding the `Skip-Changelog` label to the PR bypasses the check.
  changelog:
    runs-on: ubuntu-latest
    steps:
      - uses: dangoslen/changelog-enforcer@v3

================================================
FILE: .github/workflows/ci.yml
================================================
name: CI

# Triggered by every push and every pull request.
on: [push, pull_request]

jobs:
  test:
    name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
    runs-on: ${{ matrix.os }}
    strategy:
      # Keep the remaining matrix jobs running when one of them fails.
      fail-fast: false
      matrix:
        # Versions are quoted so that '1.10' is not read as the float 1.1.
        version: ['1.10', 'pre']
        os: [ubuntu-latest, windows-latest]
        arch: [x64]
        # macOS jobs are listed explicitly because they use aarch64 instead of x64.
        include:
          - version: '1.10'
            os: macos-latest
            arch: aarch64
          - version: 'pre'
            os: macos-latest
            arch: aarch64
    steps:
      - uses: actions/checkout@v6
      - uses: julia-actions/setup-julia@v3
        with:
          version: ${{ matrix.version }}
          arch: ${{ matrix.arch }}
      - uses: julia-actions/cache@v2
      - uses: julia-actions/julia-buildpkg@v1
      - uses: julia-actions/julia-runtest@v1
        env:
          # Passed to Julia as a string of the form "N,M".
          # NOTE(review): presumably 4 default-pool and 2 interactive-pool threads — confirm against the Julia docs.
          JULIA_NUM_THREADS: '4,2'
      # Convert the coverage data and upload the resulting lcov.info to Codecov.
      - uses: julia-actions/julia-processcoverage@v1
      - uses: codecov/codecov-action@v5
        with:
          files: lcov.info


================================================
FILE: .github/workflows/compathelper.yml
================================================
name: CompatHelper

on:
  # Runs on the cron schedule below and can also be started manually.
  schedule:
    - cron: '0 0 * * *'
  workflow_dispatch:

permissions:
  contents: write
  pull-requests: write

jobs:
  CompatHelper:
    runs-on: ubuntu-latest
    steps:
      # Probe for an existing Julia installation. A failure here does not fail
      # the job; the next step inspects this step's outcome.
      - name: Check if Julia is already available in the PATH
        id: julia_in_path
        continue-on-error: true
        run: which julia
      - name: Install Julia, but only if it is not already available in the PATH
        if: steps.julia_in_path.outcome != 'success'
        uses: julia-actions/setup-julia@v3
        with:
          version: '1'
          arch: ${{ runner.arch }}
      # The remaining steps are Julia scripts executed via `shell: julia ...`.
      - name: Add the General registry via Git
        shell: julia --color=yes {0}
        run: |
          import Pkg
          ENV["JULIA_PKG_SERVER"] = ""
          Pkg.Registry.add("General")
      - name: Install CompatHelper
        shell: julia --color=yes {0}
        run: |
          import Pkg
          name = "CompatHelper"
          uuid = "aa819f21-2bde-4658-8897-bab36330d9b7"
          version = "3"
          Pkg.add(; name, uuid, version)
      - name: Run CompatHelper
        shell: julia --color=yes {0}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # The DOCUMENTER_KEY secret is used as CompatHelper's private key.
          COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }}
          # COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }}
        run: |
          import CompatHelper
          CompatHelper.main()


================================================
FILE: .github/workflows/documentation.yml
================================================
name: Documentation

# Triggers: pushes to `master` and to any tag, plus pull requests.
# The `paths` filters limit runs to changes under `docs/` or `src/`.
on:
  push:
    branches:
      - master
    tags: '*'
    paths:
      - 'docs/**'
      - 'src/**'
  pull_request:
    paths:
      - 'docs/**'
      - 'src/**'

concurrency:
  # One group per workflow and ref; a newer run cancels the one in progress.
  # Skip intermediate builds: always.
  # Cancel intermediate builds: always.
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  build:
    permissions:
      contents: write
      statuses: write
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      # Pinned to the same major version as the other workflows rather than the
      # floating `latest` ref, so the action version is explicit and consistent.
      - uses: julia-actions/setup-julia@v3
        with:
          version: '1'
      - name: Build and deploy
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # For authentication with GitHub Actions token
          DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} # For authentication with SSH deploy key
        run: julia docs/build_docs.jl

================================================
FILE: .github/workflows/downgrade_CI.yml
================================================
name: Downgrade

# Runs for pushes to and pull requests against `master`;
# paths under `docs/` are excluded via `paths-ignore`.
on:
  push:
    branches: [master]
    paths-ignore: ['docs/**']
  pull_request:
    branches: [master]
    paths-ignore: ['docs/**']

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        version:
          - '1'
    steps:
      - uses: actions/checkout@v6
      - uses: julia-actions/setup-julia@v3
        with:
          version: ${{ matrix.version }}
      # Runs the downgrade-compat action before build and test;
      # the packages listed under `skip` are passed to it as exclusions.
      - uses: cjdoris/julia-downgrade-compat-action@v1
        with:
          skip: Pkg,TOML,Test,Markdown
      - uses: julia-actions/julia-buildpkg@v1
      - uses: julia-actions/julia-runtest@v1


================================================
FILE: .github/workflows/tagbot.yml
================================================
name: TagBot

on:
  issue_comment:
    types: [created]
  workflow_dispatch:
    inputs:
      lookback:
        default: 3

# Only `contents` is granted write access; every other listed scope is read-only.
permissions:
  actions: read
  checks: read
  contents: write
  deployments: read
  issues: read
  discussions: read
  packages: read
  pages: read
  pull-requests: read
  repository-projects: read
  security-events: read
  statuses: read

jobs:
  TagBot:
    # Run for manual dispatches, or when the triggering actor is JuliaTagBot.
    if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot'
    runs-on: ubuntu-latest
    steps:
      - uses: JuliaRegistries/TagBot@v1
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          # Edit the following line to reflect the actual name of the GitHub Secret containing your private key
          ssh: ${{ secrets.DOCUMENTER_KEY }}
          # ssh: ${{ secrets.NAME_OF_MY_SSH_PRIVATE_KEY_SECRET }}

================================================
FILE: .gitignore
================================================
docs/build
Manifest.toml
.vscode
*~


================================================
FILE: CHANGELOG.md
================================================
OhMyThreads.jl Changelog
=========================

Unreleased
------------
- ![Enhancement][badge-enhancement] `SerialScheduler` now accepts and ignores arguments passed to it to make switching schedulers easier [#162][gh-pr-162].

Version 0.8.3
------------
- ![Enhancement][badge-enhancement] The overhead of `tmapreduce` in the serial case was reduced a bit. Sentinel values in scheduler kwarg internals were replaced by `nothing` [#148][gh-pr-148]

Version 0.8.2
------------
- ![Feature][badge-feature] Added a `minchunksize` chunking argument for schedulers, so that they can specify a lower bound on the size of chunks which are worth parallelizing. For example, `treduce(+, 1:10; minchunksize=100)` will run serially, but `treduce(+, 1:1000000; minchunksize=100)` will be parallelized [#145][gh-pr-145].
- ![Enhancement][badge-enhancement] Operations on collections with only one 'chunk' no longer spawn an unnecessary task. That means operations like `treduce(+, 1:10; minchunksize=100)` will have less overhead [#145][gh-pr-145].

Version 0.8.1
------------
- ![Feature][badge-feature] Added a `@localize` macro which turns `@localize x y expr` into `let x=x, y=y; expr end` ([#142][gh-pr-142])
- ![INFO][badge-info] The error message for captured variables now has a longer error hint that displays when the `Markdown` package is loaded (e.g. in the REPL.) ([#142][gh-pr-142])

Version 0.8.0
-------------
- ![BREAKING][badge-breaking] We now detect and throw errors if an `OhMyThreads` parallel function is passed a closure containing a `Box`ed variable. This behaviour can be disabled with the new `@allow_boxed_captures` macro, and re-enabled with `@disallow_boxed_captures`. ([#141][gh-pr-141])
- ![INFO][badge-info] Scheduler chunking info is no longer directly available via `getproperty`. This was never a public interface, but it's possible some users relied upon it [#135][gh-pr-135].

Version 0.7.0
-------------
- ![BREAKING][badge-breaking] We now use ChunkSplitters version 3.0. The function `OhMyThreads.chunks` has been renamed to `OhMyThreads.index_chunks`. The new functions `index_chunks` and `chunks` (different from the old one with the same name!) are now exported. See ChunkSplitters.jl for more information.
- ![BREAKING][badge-breaking] If you provide a `chunks` or `index_chunks` as input we now disable the internal chunking without a warning. Previously, we did show a warning unless you had set `chunking=false`. In contrast, we now throw an error when you set any incompatible chunking related keyword arguments.
- ![Deprecation][badge-deprecation] The `split` options `:batch` and `:scatter` are now deprecated (they still work but will be dropped at some point). Use `:consecutive` and `:roundrobin`, respectively, instead.
- ![Enhancement][badge-enhancement] The `split` keyword argument can now also be a `<: OhMyThreads.Split`. Compared to providing a `Symbol`, the former can potentially give better performance. For example, you can replace `:consecutive` by `OhMyThreads.Consecutive()` and `:roundrobin` by `OhMyThreads.RoundRobin()`.
- ![Feature][badge-feature] `ChannelLike` is a new public (but not exported) type. `ChannelLike(itr)` provides a way to iterate over `itr` in a concurrency-safe manner similar to `Channel`. See the docstring for more details. ([#121][gh-pr-121])
- ![Enhancement][badge-enhancement] `ChannelLike` is used internally for the `GreedyScheduler` when `chunking=true`. This improves performance overall but it is especially noticeable when the number of chunks is large. ([#121][gh-pr-121])

Version 0.6.2
-------------
- ![Enhancement][badge-enhancement] Added API support for `enumerate(chunks(...))`. Best used in combination with `chunking=false`

Version 0.6.1
-------------

Version 0.6.0
-------------
- ![BREAKING][badge-breaking] Drop support for Julia < 1.10.

Version 0.5.3
-------------
- ![Enhancement][badge-enhancement] For the special/fake "macros" like, e.g., `@set`, support the verbose form `OhMyThreads.@set` within a `@tasks` for-loop (#107).

Version 0.5.2
-------------
- ![Enhancement][badge-enhancement] For empty input (e.g. `Float64[]` or `11:10`) behavior is now aligned with the serial functions in `Base`.

Version 0.5.1
-------------
- ![Feature][badge-feature] Within a parallel `@tasks` block one can now mark a region with `@one_by_one`. This region will be run by one task at a time ("critical region").
- ![Feature][badge-feature] Within a `@tasks` block one can now mark a region with `@only_one`. This region will be run by a single parallel task only (other tasks will skip over it).
- ![Experimental][badge-experimental] Added tentative support for `@barrier` in `@tasks` blocks. See `?OhMyThreads.Tools.@barrier` for more information. Note that this feature is experimental and **not** part of the public API (i.e. doesn't fall under SemVer).
- ![Info][badge-info] Compat bounds for [BangBang.jl](https://github.com/JuliaFolds2/BangBang.jl) have been relaxed to include v0.3.40

Version 0.5.0
-------------

- ![Feature][badge-feature] The parallel functions (e.g. tmapreduce etc.) now support `scheduler::Symbol` besides `scheduler::Scheduler`. To configure the selected scheduler (e.g. set `nchunks` etc.) one may now pass keyword arguments directly into the parallel functions (they will get passed on to the scheduler constructor). Example: `tmapreduce(sin, +, 1:10; chunksize=2, scheduler=:static)`. Analogous support has been added to the macro API: (Most) settings (`@set name = value`) will now be passed on to the parallel functions as keyword arguments (which then forward them to the scheduler constructor). Note that, to avoid ambiguity, we don't support this feature for `scheduler::Scheduler` but only for `scheduler::Symbol`.
- ![Feature][badge-feature] Added a `SerialScheduler` that can be used to turn off any multithreading.
- ![Feature][badge-feature] Added `OhMyThreads.WithTaskLocals` that represents a closure over `TaskLocalValues`, but can have those values materialized as an optimization (using `OhMyThreads.promise_task_local`)
- ![Feature][badge-feature] In the case `nchunks > nthreads()`, the `StaticScheduler` now distributes chunks in a round-robin fashion (instead of either implicitly decreasing `nchunks` to `nthreads()` or throwing an error).
- ![Feature][badge-feature] `@set init = ...` may now be used to specify an initial value for a reduction (only has an effect in conjunction with `@set reducer=...` and triggers a warning otherwise).
- ![Enhancement][badge-enhancement] `SerialScheduler` and `DynamicScheduler` now support the keyword argument `ntasks` as an alias for `nchunks`.
- ![Enhancement][badge-enhancement] Made `@tasks` use `OhMyThreads.WithTaskLocals` automatically as an optimization.
- ![Enhancement][badge-enhancement] Uses of `@local` within `@tasks` no longer require users to declare the type of the task-local value; it can be inferred automatically if a type is not provided.
- ![Enhancement][badge-enhancement] Made `using OhMyThreads: ...` more explicit in examples in the documentation and docstrings.
- ![BREAKING][badge-breaking] The `DynamicScheduler` (default) and the `StaticScheduler` now support a `chunksize` argument to specify the desired size of chunks instead of the number of chunks (`nchunks`). Note that `chunksize` and `nchunks` are mutually exclusive. (This is unlikely to break existing code but technically could because the type parameter has changed from `Bool` to `ChunkingMode`.)
- ![BREAKING][badge-breaking] The greedy scheduler now supports chunking (similar to the static and dynamic scheduler). You can opt into it with, e.g., `chunking=true`. (This is unlikely to break existing code but technically could because we introduced a new type parameter for `GreedyScheduler`.)
- ![Breaking][badge-breaking] `DynamicScheduler` and `StaticScheduler` don't support `nchunks=0` or `chunksize=0` any longer. Instead, chunking can now be turned off via an explicit new keyword argument `chunking=false`.
- ![BREAKING][badge-breaking] Within a `@tasks` block, task-local values must from now on be defined via `@local` instead of `@init` (renamed).
- ![BREAKING][badge-breaking] The (already deprecated) `SpawnAllScheduler` has been dropped.
- ![BREAKING][badge-breaking] The default value for `ntasks`/`nchunks` for `DynamicScheduler` has been changed from `2*nthreads()` to `nthreads()`. With the new value we now align with `@threads :dynamic`. The old value wasn't giving good load balancing anyways and choosing a higher value penalizes uniform use cases even more. To get the old behavior, set `nchunks=2*nthreads()`.
- ![Bugfix][badge-bugfix] When using the `GreedyScheduler` in combination with `tmapreduce` (or functions that build upon it) there could be non-deterministic errors in some cases (small input collection, not much work per element, see [#82](https://github.com/JuliaFolds2/OhMyThreads.jl/issues/82)). These cases should be fixed now.
- ![Bugfix][badge-bugfix] We now handle empty collections as input in `tmapreduce` and `tforeach` explicitly ([#86](https://github.com/JuliaFolds2/OhMyThreads.jl/issues/86)). Our general philosophy is to try to match the behavior of the serial `Base` functions.

Version 0.4.6
-------------

- ![Feature][badge-feature] Introduction of macro API (`@tasks`) that transforms for loops into corresponding `tforeach`, `tmapreduce`, and `tmap` calls. This new API enables us to facilitate certain patterns, like defining task local values.

Version 0.4.5
-------------

- ![Enhancement][badge-enhancement] Improved the thread-safe storage section of the documentation.

Version 0.4.4
-------------

- ![Bugfix][badge-bugfix] Fixed a type specification bug that could occur when passing a `Chunk` into, say, `tmapreduce`.

Version 0.4.3
-------------

- ![Feature][badge-feature] Forward (but don't export) the macros `@fetch` and `@fetchfrom` from StableTasks.jl (v0.1.5), which are analogous to the same-named macros in Distributed.jl.

Version 0.4.2
-------------

- ![Feature][badge-feature] `DynamicScheduler` now supports `nchunks=0`, which turns off internal chunking entirely.
- ![Deprecation][badge-deprecation] `SpawnAllScheduler` is now deprecated in favor of `DynamicScheduler(; nchunks=0)`.
- ![Feature][badge-feature] Partial support for passing in a `ChunkSplitters.Chunk` when using `DynamicScheduler` (default). In this case, one should generally use `DynamicScheduler(; nchunks=0)`, i.e. turn off internal chunking.
- ![Feature][badge-feature] `StaticScheduler` now supports `nchunks=0`, which turns off internal chunking entirely. Only works for input that has `<= nthreads()` elements.

Version 0.4.1
-------------

- ![Feature][badge-feature] Added a new, simple `SpawnAllScheduler` that spawns a task per input element (can be a lot of tasks!).
- ![Info][badge-info] Added downgrade_CI which makes sure the test suite works on the oldest versions of dependencies.

Version 0.4.0
-------------

- ![BREAKING][badge-breaking] Instead of taking keyword arguments `schedule`, `nchunks`, `split` directly, we now use `Scheduler` structs to specify scheduling options ([#22](https://github.com/JuliaFolds2/OhMyThreads.jl/issues/22)). The latter can be provided to all API functions via the new `scheduler` keyword argument.
- ![BREAKING][badge-breaking] The default scheduler (`DynamicScheduler`) now, by default, creates `2*nthreads()` tasks to provide load-balancing by default. The old behavior can be restored with `DynamicScheduler(; nchunks=nthreads())`.
- ![Enhancement][badge-enhancement] We reject unsupported keyword arguments early and give a more helpful error message.

Version 0.3.1
-------------

- ![Bugfix][badge-bugfix] The documented Public API wasn't updated in 0.3.0 and thus out of sync with the actual API. Fixed in this version.

Version 0.3.0
-------------

- ![BREAKING][badge-breaking] We don't (re-)export `chunks` anymore. Use `OhMyThreads.chunks` instead.
- ![Feature][badge-feature] We now provide `OhMyThreads.TaskLocalValue` (from [TaskLocalValue.jl](https://github.com/vchuravy/TaskLocalValues.jl)) as a nice solution for task-local values. See the corresponding page in the documentation ([#25][gh-issue-25]).
- ![Enhancement][badge-enhancement] Added a few missing `@views`.
- ![Enhancement][badge-enhancement] Added three examples to the docs: Monte Carlo, Julia set, and trapezoidal integration.
- ![Enhancement][badge-enhancement] Improved all docstrings of the exported API functions. Keyword options are now only shown in the extended help (e.g. `??tmap`) ([#27][gh-issue-27]).
- ![Enhancement][badge-enhancement] Added a translation page that hopefully helps with the Base.Threads → OhMyThreads.jl transition ([#24][gh-issue-24]).

Version 0.2.1
-------------

- ![Enhancement][badge-enhancement] Basic documentation.
- ![Enhancement][badge-enhancement] Making `ChunkSplitters` available internally.

Version 0.2.0
-------------

- Initial version.

[badge-breaking]: https://img.shields.io/badge/BREAKING-red.svg
[badge-deprecation]: https://img.shields.io/badge/Deprecation-orange.svg
[badge-feature]: https://img.shields.io/badge/Feature-green.svg
[badge-experimental]: https://img.shields.io/badge/Experimental-yellow.svg
[badge-enhancement]: https://img.shields.io/badge/Enhancement-blue.svg
[badge-bugfix]: https://img.shields.io/badge/Bugfix-purple.svg
[badge-fix]: https://img.shields.io/badge/Fix-purple.svg
[badge-info]: https://img.shields.io/badge/Info-gray.svg

[gh-issue-27]: https://github.com/JuliaFolds2/OhMyThreads.jl/issues/27
[gh-issue-24]: https://github.com/JuliaFolds2/OhMyThreads.jl/issues/24
[gh-issue-25]: https://github.com/JuliaFolds2/OhMyThreads.jl/issues/25

[gh-pr-5]: https://github.com/JuliaFolds2/OhMyThreads.jl/pull/5
[gh-pr-121]: https://github.com/JuliaFolds2/OhMyThreads.jl/pull/121
[gh-pr-135]: https://github.com/JuliaFolds2/OhMyThreads.jl/pull/135
[gh-pr-141]: https://github.com/JuliaFolds2/OhMyThreads.jl/pull/141
[gh-pr-142]: https://github.com/JuliaFolds2/OhMyThreads.jl/pull/142
[gh-pr-145]: https://github.com/JuliaFolds2/OhMyThreads.jl/pull/145
[gh-pr-148]: https://github.com/JuliaFolds2/OhMyThreads.jl/pull/148
[gh-pr-162]: https://github.com/JuliaFolds2/OhMyThreads.jl/pull/162


================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2024 Mason Protter

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: Project.toml
================================================
name = "OhMyThreads"
uuid = "67456a42-1dca-4109-a031-0a68de7e3ad5"
authors = ["Carsten Bauer <mail@carstenbauer.eu>", "Mason Protter <mason.protter@icloud.com>"]
version = "0.8.5"

[deps]
BangBang = "198e06fe-97b7-11e9-32a5-e1d131e6ad66"
ChunkSplitters = "ae650224-84b6-46f8-82ea-d812ca08434e"
ScopedValues = "7e506255-f358-4e82-b7e4-beb19740aa63"
StableTasks = "91464d47-22a1-43fe-8b7f-2d57ee82463f"
TaskLocalValues = "ed4db957-447d-4319-bfb6-7fa9ae7ecf34"

[weakdeps]
Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"

[extensions]
MarkdownExt = "Markdown"

[compat]
Aqua = "0.8"
BangBang = "0.3.40, 0.4"
ChunkSplitters = "3.1"
Markdown = "1"
ScopedValues = "1.3"
StableTasks = "0.1.5"
TaskLocalValues = "0.1"
Test = "1"
julia = "1.10"

[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test", "Aqua"]


================================================
FILE: README.md
================================================
# OhMyThreads

[docs-dev-img]: https://img.shields.io/badge/docs-dev-blue.svg
[docs-dev-url]: https://JuliaFolds2.github.io/OhMyThreads.jl/dev

[docs-stable-img]: https://img.shields.io/badge/docs-stable-blue.svg
[docs-stable-url]: https://JuliaFolds2.github.io/OhMyThreads.jl/stable

[ci-img]: https://github.com/JuliaFolds2/OhMyThreads.jl/actions/workflows/ci.yml/badge.svg
[ci-url]: https://github.com/JuliaFolds2/OhMyThreads.jl/actions/workflows/ci.yml

[cov-img]: https://codecov.io/gh/JuliaFolds2/OhMyThreads.jl/branch/master/graph/badge.svg
[cov-url]: https://codecov.io/gh/JuliaFolds2/OhMyThreads.jl

[lifecycle-img]: https://img.shields.io/badge/lifecycle-maturing-orange.svg

[code-style-img]: https://img.shields.io/badge/code%20style-blue-4495d1.svg
[code-style-url]: https://github.com/invenia/BlueStyle

[aqua-img]: https://raw.githubusercontent.com/JuliaTesting/Aqua.jl/master/badge.svg
[aqua-url]: https://github.com/JuliaTesting/Aqua.jl

<!--
![Lifecycle](https://img.shields.io/badge/lifecycle-maturing-blue.svg)
![Lifecycle](https://img.shields.io/badge/lifecycle-stable-green.svg)
![Lifecycle](https://img.shields.io/badge/lifecycle-retired-orange.svg)
![Lifecycle](https://img.shields.io/badge/lifecycle-archived-red.svg)
![Lifecycle](https://img.shields.io/badge/lifecycle-dormant-blue.svg)
![Lifecycle](https://img.shields.io/badge/lifecycle-experimental-orange.svg)
-->

*Simple Multithreading in Julia*

| **Documentation**                                                               | **Build Status**                                                                                |  **Quality**                                                                                |
|:-------------------------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------:|
| [![][docs-stable-img]][docs-stable-url] [![][docs-dev-img]][docs-dev-url] | [![][ci-img]][ci-url] [![][cov-img]][cov-url] | ![][lifecycle-img] [![][aqua-img]][aqua-url] |

[OhMyThreads.jl](https://github.com/JuliaFolds2/OhMyThreads.jl/) is meant to be a simple, unambitious package that provides user-friendly ways of doing [task-based](https://docs.julialang.org/en/v1/base/parallel/) multithreaded calculations in Julia. Most importantly, with a
focus on [data parallelism](https://en.wikipedia.org/wiki/Data_parallelism), it provides an [API of higher-order functions](https://juliafolds2.github.io/OhMyThreads.jl/stable/refs/api/#Functions) (e.g. `tmapreduce`) as well as a [macro API](https://juliafolds2.github.io/OhMyThreads.jl/stable/refs/api/#Macros) `@tasks for ... end` (conceptually similar to `@threads`).

## Example

```julia
using OhMyThreads: tmapreduce, @tasks
using BenchmarkTools: @btime
using Base.Threads: nthreads

# Variant 1: function API
function mc_parallel(N; ntasks=nthreads())
    M = tmapreduce(+, 1:N; ntasks) do i
        rand()^2 + rand()^2 < 1.0
    end
    pi = 4 * M / N
    return pi
end

# Variant 2: macro API
function mc_parallel_macro(N; ntasks=nthreads())
    M = @tasks for i in 1:N
        @set begin
            reducer=+
            ntasks=ntasks
        end
        rand()^2 + rand()^2 < 1.0
    end
    pi = 4 * M / N
    return pi
end

N = 100_000_000
mc_parallel(N) # gives, e.g., 3.14159924

@btime mc_parallel($N; ntasks=1) # use a single task (and hence a single thread)
@btime mc_parallel($N)           # using all threads
@btime mc_parallel_macro($N)     # using all threads
```

With 5 threads, timings might be something like this:

```
417.282 ms (14 allocations: 912 bytes)
83.578 ms (38 allocations: 3.08 KiB)
83.573 ms (38 allocations: 3.08 KiB)
```

(Check out the full [Parallel Monte Carlo](https://juliafolds2.github.io/OhMyThreads.jl/stable/literate/mc/mc/) example if you like.)

## Documentation

For more information, please check out the [documentation](https://JuliaFolds2.github.io/OhMyThreads.jl/stable) of the latest release (or the [development version](https://JuliaFolds2.github.io/OhMyThreads.jl/dev) if you're curious).


================================================
FILE: docs/Project.toml
================================================
[deps]
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
DocumenterInterLinks = "d12716ef-a0f6-4df4-a9f1-a5a34e75c656"
DocumenterTools = "35a29f4d-8980-5a13-9543-d66fff28ecb8"

[compat]
Documenter = "1.3"
DocumenterInterLinks = "1"
DocumenterTools = "0.1"


================================================
FILE: docs/build_docs.jl
================================================
# Driver script for the documentation build: instantiates the required
# environments, adjusts LOAD_PATH, and then runs `make.jl`.

# Work relative to the directory that contains this script.
cd(@__DIR__)
println("--- :julia: Instantiating project")
using Pkg
# Instantiate the project in the parent directory first ...
Pkg.activate("..")
Pkg.instantiate()
# ... then activate and instantiate the environment in this directory,
# which stays active for the rest of the script.
Pkg.activate(".")
Pkg.instantiate()
# Append the parent directory to LOAD_PATH.
push!(LOAD_PATH, joinpath(@__DIR__, ".."))
# Remove the second LOAD_PATH entry.
# NOTE(review): with Julia's default LOAD_PATH this is presumably the shared
# "@v#.#" environment — confirm; a custom JULIA_LOAD_PATH would change which
# entry is removed.
deleteat!(LOAD_PATH, 2)
println("+++ :julia: Building documentation")
include("make.jl")


================================================
FILE: docs/make.jl
================================================
using Documenter
using DocumenterInterLinks
using OhMyThreads

# True only when the environment variable `CI` is exactly "true".
const ci = get(ENV, "CI", "") == "true"

# Inventory sources handed to DocumenterInterLinks for the ChunkSplitters docs:
# the docs root URL, the remote `objects.inv`, and a local TOML inventory path.
chunksplitters_sources = (
    "https://juliafolds2.github.io/ChunkSplitters.jl/stable/",
    "https://juliafolds2.github.io/ChunkSplitters.jl/stable/objects.inv",
    joinpath(@__DIR__, "inventories", "ChunkSplitters.toml")
)
links = InterLinks("ChunkSplitters" => chunksplitters_sources)

# Navigation tree of the generated site.
example_pages = [
    "Parallel Monte Carlo" => "literate/mc/mc.md",
    "Julia Set" => "literate/juliaset/juliaset.md",
    "Trapezoidal Integration" => "literate/integration/integration.md"
]
api_pages = [
    "Public API" => "refs/api.md",
    "Experimental" => "refs/experimental.md",
    "Internal" => "refs/internal.md"
]
site_pages = [
    "OhMyThreads" => "index.md",
    "Examples" => example_pages,
    "Translation Guide" => "translation.md",
    "Boxed Variables" => "literate/boxing/boxing.md",
    "Thread-Safe Storage" => "literate/tls/tls.md",
    "False Sharing" => "literate/falsesharing/falsesharing.md",
    # "Explanations" => [
    #     "Task-Based Multithreading" => "explain/taskbasedmt.md",
    # ],
    "API" => api_pages
]

# HTML output settings.
html_format = Documenter.HTML(;
    repolink = "https://github.com/JuliaFolds2/OhMyThreads.jl",
    collapselevel = 1)

@info "Generating Documenter.jl site"
makedocs(;
    sitename = "OhMyThreads.jl",
    authors = "Carsten Bauer, Mason Protter",
    modules = [OhMyThreads],
    checkdocs = :exports,
    doctest = false,
    pages = site_pages,
    repo = "https://github.com/JuliaFolds2/OhMyThreads.jl/blob/{commit}{path}#{line}",
    format = html_format,
    plugins = [links])

# Deployment only happens when the CI flag above is set.
if ci
    @info "Deploying documentation to GitHub"
    deploydocs(;
        repo = "github.com/JuliaFolds2/OhMyThreads.jl.git",
        devbranch = "master",
        push_preview = true)
end


================================================
FILE: docs/src/basics.md
================================================
# Basics

This section is still in preparation. For now, you might want to take a look at the [translation guide](@ref TG) and the examples.

================================================
FILE: docs/src/index.md
================================================
# OhMyThreads.jl

[OhMyThreads.jl](https://github.com/JuliaFolds2/OhMyThreads.jl/) is meant to be a simple, unambitious package that provides user-friendly ways of doing [task-based](https://docs.julialang.org/en/v1/base/parallel/) multithreaded calculations in Julia. Most importantly, with a
focus on [data parallelism](https://en.wikipedia.org/wiki/Data_parallelism), it provides an [API of higher-order functions](https://juliafolds2.github.io/OhMyThreads.jl/stable/refs/api/#Functions) (e.g. `tmapreduce`) as well as a [macro API](https://juliafolds2.github.io/OhMyThreads.jl/stable/refs/api/#Macros) `@tasks for ... end` (conceptually similar to `@threads`).

## Quick Start

The package is registered. Hence, you can simply use
```
] add OhMyThreads
```
to add the package to your Julia environment.

### Basic example

```julia
using OhMyThreads: tmapreduce, @tasks
using BenchmarkTools: @btime
using Base.Threads: nthreads

# Variant 1: function API
function mc_parallel(N; ntasks=nthreads())
    M = tmapreduce(+, 1:N; ntasks) do i
        rand()^2 + rand()^2 < 1.0
    end
    pi = 4 * M / N
    return pi
end

# Variant 2: macro API
function mc_parallel_macro(N; ntasks=nthreads())
    M = @tasks for i in 1:N
        @set begin
            reducer=+
            ntasks=ntasks
        end
        rand()^2 + rand()^2 < 1.0
    end
    pi = 4 * M / N
    return pi
end

N = 100_000_000
mc_parallel(N) # gives, e.g., 3.14159924

@btime mc_parallel($N; ntasks=1) # use a single task (and hence a single thread)
@btime mc_parallel($N)           # using all threads
@btime mc_parallel_macro($N)     # using all threads
```

With 5 threads, timings might be something like this:

```
417.282 ms (14 allocations: 912 bytes)
83.578 ms (38 allocations: 3.08 KiB)
83.573 ms (38 allocations: 3.08 KiB)
```

(Check out the full [Parallel Monte Carlo](@ref) example if you like.)

## No Transducers

Unlike most [JuliaFolds2](https://github.com/JuliaFolds2) packages, OhMyThreads.jl is not built off of [Transducers.jl](https://github.com/JuliaFolds2/Transducers.jl), nor is it a building block for Transducers.jl. Rather, it is meant to be a simpler, more maintainable, and more accessible alternative to high-level packages like, e.g., [ThreadsX.jl](https://github.com/tkf/ThreadsX.jl) or [Folds.jl](https://github.com/JuliaFolds2/Folds.jl).

## Acknowledgements

The idea for this package came from [Carsten Bauer](https://github.com/carstenbauer) and [Mason Protter](https://github.com/MasonProtter). Check out the [list of contributors](https://github.com/JuliaFolds2/OhMyThreads.jl/graphs/contributors) for more information.


================================================
FILE: docs/src/literate/Project.toml
================================================
[deps]
Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"

[compat]
Literate = "2.16"


================================================
FILE: docs/src/literate/boxing/Project.toml
================================================
[deps]
OhMyThreads = "67456a42-1dca-4109-a031-0a68de7e3ad5"


================================================
FILE: docs/src/literate/boxing/boxing.jl
================================================
#====================================
# Boxed Variables

All multithreading in julia is built around the idea of passing around
and executing functions, but often these functions "enclose" data from
an outer local scope, making them what's called a "closure".

## Boxed variables causing race conditions

Julia allows functions which capture variables to re-bind those variables
to different values, but doing so can cause subtle race conditions in
multithreaded code.

Consider the following example:
====================================#

let out = zeros(Int, 10)
    Threads.@threads for i in 1:10
        ## `A` is assigned again after the loop (`A = 1` below), so this writes to the
        ## single `A` of the enclosing `let` rather than to a per-iteration variable.
        A = i
        sleep(1/100)
        out[i] = A
    end
    A = 1
    out
end

#====================================
You may have expected that to return `[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]`,
but the nonsense result is caused by `A` actually being a shared mutable
container here which all the parallel tasks are accessing and mutating
in parallel, giving unpredictable results.

OhMyThreads.jl tries to protect users from this surprising behaviour:
====================================#
using OhMyThreads

try
    let
        ## this throws an error!
        out = tmap(1:10) do i
            A = i
            sleep(1/100)
            A
        end
        ## This later assignment puts `A` into the scope of the `let`, so the closure
        ## above captures and modifies that outer variable.
        A = 1
        out
    end
catch e;
    ## Show the error
    Base.showerror(stdout, e)
end

#====================================
In this case, we could fix the race condition by marking `A` as local:
====================================#

let
    out = tmap(1:10) do i
        local A = i # Note the use of `local`
        sleep(1/100)
        A
    end
    ## This `A` belongs to the `let` block; it is a different variable from the
    ## `local A` declared inside the closure.
    A = 1
    out
end

#====================================
If you really desire to bypass this error, you can use the
`@allow_boxed_captures` macro
====================================#

## Same code as the erroring example above, with the boxed-capture check bypassed.
@allow_boxed_captures let
    out = tmap(1:10) do i
        ## `A` is still the one variable shared by all tasks, so the values
        ## returned here are unpredictable.
        A = i
        sleep(1/100)
        A
    end
    A = 1
    out
end

#====================================
## Non-race condition boxed variables

Any re-binding of captured variables can cause boxing, even when that boxing isn't strictly necessary, like the following example where we do not rebind `A` in the loop:
====================================#
try
    let A = 1
        if rand(Bool)
            ## Rebind A, it's now boxed!
            A = 2
        end
        ## `A` is only read inside the loop, but it is a boxed capture nonetheless.
        @tasks for i in 1:2
            @show A
        end
    end
catch e;
    ## The error is only acknowledged here; its message is not printed.
    println("Yup, that errored!")
end
#====================================
This comes down to how julia parses and lowers code. To avoid this, you can use an inner `let` block to localize `A` to the loop:
====================================#

let A = 1
    ## This (conditional) rebinding is what causes the outer `A` to be boxed.
    if rand(Bool)
        A = 2
    end
    let A = A # This stops A from being boxed!
        ## The `A` used below is the new binding of the inner `let`, which is never reassigned.
        @tasks for i in 1:2
            @show A
        end
    end
end

#====================================
OhMyThreads provides a macro `@localize` to automate this process:
====================================#

let A = 1
    ## This (conditional) rebinding is what causes the outer `A` to be boxed.
    if rand(Bool)
        A = 2
    end
    ## This stops A from being boxed!
    ## (`@localize A expr` is shorthand for `let A = A; expr end`.)
    @localize A @tasks for i in 1:2
        @show A
    end
end


================================================
FILE: docs/src/literate/boxing/boxing.md
================================================
```@meta
EditURL = "boxing.jl"
```

# Boxed Variables

All multithreading in julia is built around the idea of passing around
and executing functions, but often these functions "enclose" data from
an outer local scope, making them what's called a "closure".

## Boxed variables causing race conditions

Julia allows functions which capture variables to re-bind those variables
to different values, but doing so can cause subtle race conditions in
multithreaded code.

Consider the following example:

````julia
let out = zeros(Int, 10)
    Threads.@threads for i in 1:10
        A = i
        sleep(1/100)
        out[i] = A
    end
    A = 1
    out
end
````

````
10-element Vector{Int64}:
 5
 4
 6
 4
 5
 4
 5
 4
 5
 4
````

You may have expected that to return `[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]`,
but the nonsense result is caused by `A` actually being a shared mutable
container here which all the parallel tasks are accessing and mutating
in parallel, giving unpredictable results.

OhMyThreads.jl tries to protect users from this surprising behaviour:

````julia
using OhMyThreads

try
    let
        # this throws an error!
        out = tmap(1:10) do i
            A = i
            sleep(1/100)
            A
        end
        A = 1
        out
    end
catch e;
    # Show the error
    Base.showerror(stdout, e)
end
````

````
Attempted to capture and modify outer local variable: A

See https://juliafolds2.github.io/OhMyThreads.jl/stable/literate/boxing/boxing/ for a fuller explanation.

  Hint
  ----

  Capturing boxed variables can be not only slow, but also cause surprising
  and incorrect results.

    •  If you meant for these variables to be local to each loop
       iteration and not depend on a variable from an outer scope, you
       should mark them as local inside the closure.

    •  If you meant to reference a variable from the outer scope, but do
       not want access to it to be boxed, you can wrap uses of it in a
       let block, like e.g.

  function foo(x, N)
      rand(Bool) && x = 1 # This rebinding of x causes it to be boxed ...
      let x = x # ... Unless we localize it here with the let block 
          @tasks for i in 1:N
              f(x)    
          end
      end
  end

    •  OhMyThreads.jl provides a @localize macro that automates the above
       let block, i.e. @localize x f(x) is the same as let x=x; f(x) end

    •  If these variables are being re-bound inside a @one_by_one or
       @only_one block, consider using a mutable Ref instead of
       re-binding the variable.

  This error can be bypassed with the @allow_boxed_captures macro.
````

In this case, we could fix the race condition by marking `A` as local:

````julia
let
    out = tmap(1:10) do i
        local A = i # Note the use of `local`
        sleep(1/100)
        A
    end
    A = 1
    out
end
````

````
10-element Vector{Int64}:
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
````

If you really desire to bypass this error, you can use the
`@allow_boxed_captures` macro

````julia
@allow_boxed_captures let
    out = tmap(1:10) do i
        A = i
        sleep(1/100)
        A
    end
    A = 1
    out
end
````

````
10-element Vector{Int64}:
 3
 2
 3
 2
 3
 2
 3
 2
 3
 3
````

## Non-race condition boxed variables

Any re-binding of captured variables can cause boxing, even when that boxing isn't strictly necessary, like the following example where we do not rebind `A` in the loop:

````julia
try
    let A = 1
        if rand(Bool)
            # Rebind A, it's now boxed!
            A = 2
        end
        @tasks for i in 1:2
            @show A
        end
    end
catch e;
    println("Yup, that errored!")
end
````

````
Yup, that errored!

````

This comes down to how julia parses and lowers code. To avoid this, you can use an inner `let` block to localize `A` to the loop:

````julia
let A = 1
    if rand(Bool)
        A = 2
    end
    let A = A # This stops A from being boxed!
        @tasks for i in 1:2
            @show A
        end
    end
end
````

````
A = 1
A = 1

````

OhMyThreads provides a macro `@localize` to automate this process:

````julia
let A = 1
    if rand(Bool)
        A = 2
    end
    # This stops A from being boxed!
    @localize A @tasks for i in 1:2
        @show A
    end
end
````

````
A = 2
A = 2

````

---

*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*


================================================
FILE: docs/src/literate/falsesharing/Project.toml
================================================
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
OhMyThreads = "67456a42-1dca-4109-a031-0a68de7e3ad5"
ThreadPinning = "811555cd-349b-4f26-b7bc-1f208b848042"


================================================
FILE: docs/src/literate/falsesharing/falsesharing.jl
================================================
# # [False Sharing](@id FalseSharing)
#
# *False Sharing* is a very common but subtle performance issue that comes up again and
# again when writing parallel code manually. For this reason, we shall discuss what it is
# about and how to avoid it.
#
# For simplicity, let's focus on a specific example: parallel summation.
#
# ## Baseline: sequential summation
#
# To establish a baseline, that we can later compare against, we define some fake data,
# which we'll sum up, and benchmark Julia's built-in, non-parallel `sum` function.

using Base.Threads: nthreads
using BenchmarkTools
using ThreadPinning #hide
pinthreads(:cores) #hide

data = rand(1_000_000 * nthreads());
@btime sum($data);

#
# ## The problematic parallel implementation
#
# A conceptually simple (and valid) approach to parallelizing the summation is to divide
# the full computation into parts. Specifically, the idea is to divide the data into chunks,
# compute the partial sums of these chunks in parallel, and finally sum up the partial
# results. (Note that we will not concern ourselves with potential minor or
# catastrophic numerical errors due to potential rearrangements of terms in the summation here.)
#
# A common, manual implementation of this idea might look like this:

using OhMyThreads: @spawn, index_chunks

function parallel_sum_falsesharing(data; nchunks = nthreads())
    ## one partial-sum slot per chunk, all stored next to each other in one array
    psums = zeros(eltype(data), nchunks)
    ## one task per chunk of indices; `@sync` waits for all of them
    @sync for (c, idcs) in enumerate(index_chunks(data; n = nchunks))
        @spawn begin
            for i in idcs
                ## every iteration reads and writes `psums[c]`, i.e. the shared
                ## array is updated in the innermost loop by all tasks
                psums[c] += data[i]
            end
        end
    end
    return sum(psums)
end

# The code is pretty straightforward: We allocate space for the results of the partial sums
# (`psums`) and, on `nchunks` many tasks, add up the data elements of each partial sum in
# parallel. More importantly, and in this context perhaps surprisingly, the code is also
# **correct** in the sense that it produces the desired result.

using Test
@test sum(data) ≈ parallel_sum_falsesharing(data)

# This is just a reflection of the fact that there is no logical sharing of data - because
# each parallel task modifies a different element of `psums` - implying the absence of
# race conditions.
#
# What's the issue then?! Well, the sole purpose of parallelization is to reduce runtime.
# So let's see how well we're doing in this respect.

nthreads()

#

@btime parallel_sum_falsesharing($data);

# A **slowdown**?! Clearly, that's the opposite of what we tried to achieve!

#
# ## The issue: False sharing
#
# Although our parallel summation above is semantically correct, it has a
# big **performance issue**: *False sharing*. To understand false sharing, we have to think
# a little bit about how computers work. Specifically, we need to realize that processors
# cache memory in lines (rather than individual elements) and that caches of different processors
# are kept coherent.
# When two (or more) different CPU cores operate on independent data elements that **fall
# into the same cache line** (i.e. they are part of the same memory address region)
# the **cache coherency mechanism leads to costly synchronization** between cores.

# In our case, this happens despite the fact that different parallel tasks
# (on different CPU cores) *logically* don't care about the rest of the data in the cache line
# at all.

# ![](false_sharing.svg)

# Given these insights, we can come up with a few workarounds that mitigate the issue.
# The most prominent is probably padding, where one simply adds sufficiently many unused
# zeros to `psums` such that different partial sum counters don't fall into the same cache
# line. However, let's discuss a more fundamental, more efficient, and more elegant solution.

#
# ## Task-local parallel summation
#
# The key mistake in `parallel_sum_falsesharing` above is the non-local modification of
# (implicitly) shared state (cache lines of `psums`) very frequently (in the innermost loop).
# We can simply avoid this by making the code more task-local. To this end, we introduce a
# **task-local accumulator variable**, which we use to perform the task-local partial sums.
# Only at the very end do we communicate the result to the main thread, e.g. by writing it
# into `psums` (once!).

function parallel_sum_tasklocal(data; nchunks = nthreads())
    T = eltype(data)
    psums = zeros(T, nchunks)
    chunks = index_chunks(data; n = nchunks)
    @sync for (c, chunk_idcs) in enumerate(chunks)
        @spawn begin
            ## accumulate into a variable that is private to this task ...
            local s = zero(T)
            for idx in chunk_idcs
                s += data[idx]
            end
            ## ... and touch the shared array exactly once per task
            psums[c] = s
        end
    end
    return sum(psums)
end

@test sum(data) ≈ parallel_sum_tasklocal(data)
@btime parallel_sum_tasklocal($data);

# Finally, there is a speed up! 🎉
#
# Two comments are in order.
#
# First, we note that the only role that `psums` plays is
# as a temporary storage for the results from the parallel tasks to be able to sum them
# up eventually. We could get rid of it entirely by using a `Threads.Atomic` instead which
# would get updated via `Threads.atomic_add!` from each task directly. However,
# for our discussion, this is a detail and we won't discuss it further.
#
# Secondly, while keeping the general idea, we can drastically simplify the above code by
# using `map` and reusing the built-in (sequential) `sum` function on each parallel task:

function parallel_sum_map(data; nchunks = nthreads())
    ## spawn one task per chunk; each sums a view (no copy) of its part of `data`
    tasks = map(index_chunks(data; n = nchunks)) do idcs
        @spawn sum(view(data, idcs))
    end
    ## collect the per-task results and add them up
    partial_sums = fetch.(tasks)
    return sum(partial_sums)
end

@test sum(data) ≈ parallel_sum_map(data)
@btime parallel_sum_map($data);

# This implementation is conceptually
# clearer in that there is no explicit modification of shared state, i.e. no `psums[c] = s`,
# anywhere at all. We can't run into false sharing if we don't modify shared state 😉.
#
# Note that since we use the built-in `sum` function, which is highly optimized, we might see
# better runtimes due to other effects - like SIMD and the absence of bounds checks - compared
# to the simple for-loop accumulation in `parallel_sum_tasklocal` above.

#
# ## Parallel summation with OhMyThreads
#
# Finally, all of the above is abstracted away for you if you simply use [`treduce`](@ref)
# to implement the parallel summation. It also only takes a single line and function call.

using OhMyThreads: treduce

@test sum(data) ≈ treduce(+, data; ntasks = nthreads())
@btime treduce($+, $data; ntasks = $nthreads());


================================================
FILE: docs/src/literate/falsesharing/falsesharing.md
================================================
```@meta
EditURL = "falsesharing.jl"
```

# [False Sharing](@id FalseSharing)

*False Sharing* is a very common but subtle performance issue that comes up again and
again when writing parallel code manually. For this reason, we shall discuss what it is
about and how to avoid it.

For simplicity, let's focus on a specific example: parallel summation.

## Baseline: sequential summation

To establish a baseline, that we can later compare against, we define some fake data,
which we'll sum up, and benchmark Julia's built-in, non-parallel `sum` function.

````julia
using Base.Threads: nthreads
using BenchmarkTools

data = rand(1_000_000 * nthreads());
@btime sum($data);
````

````
  2.327 ms (0 allocations: 0 bytes)

````

## The problematic parallel implementation

A conceptually simple (and valid) approach to parallelizing the summation is to divide
the full computation into parts. Specifically, the idea is to divide the data into chunks,
compute the partial sums of these chunks in parallel, and finally sum up the partial
results. (Note that we will not concern ourselves with potential minor or
catastrophic numerical errors due to potential rearrangements of terms in the summation here.)

A common, manual implementation of this idea might look like this:

````julia
using OhMyThreads: @spawn, index_chunks

function parallel_sum_falsesharing(data; nchunks = nthreads())
    psums = zeros(eltype(data), nchunks)
    @sync for (c, idcs) in enumerate(index_chunks(data; n = nchunks))
        @spawn begin
            for i in idcs
                psums[c] += data[i]
            end
        end
    end
    return sum(psums)
end
````

````
parallel_sum_falsesharing (generic function with 1 method)
````

The code is pretty straightforward: We allocate space for the results of the partial sums
(`psums`) and, on `nchunks` many tasks, add up the data elements of each partial sum in
parallel. More importantly, and in this context perhaps surprisingly, the code is also
**correct** in the sense that it produces the desired result.

````julia
using Test
@test sum(data) ≈ parallel_sum_falsesharing(data)
````

````
Test Passed
````

This is just a reflection of the fact that there is no logical sharing of data - because
each parallel task modifies a different element of `psums` - implying the absence of
race conditions.

What's the issue then?! Well, the sole purpose of parallelization is to reduce runtime.
So let's see how well we're doing in this respect.

````julia
nthreads()
````

````
10
````

````julia
@btime parallel_sum_falsesharing($data);
````

````
  52.919 ms (221 allocations: 18.47 KiB)

````

A (huge) **slowdown**?! Clearly, that's the opposite of what we tried to achieve!

## The issue: False sharing

Although our parallel summation above is semantically correct, it has a
big **performance issue**: *False sharing*. To understand false sharing, we have to think
a little bit about how computers work. Specifically, we need to realize that processors
cache memory in lines (rather than individual elements) and that caches of different processors
are kept coherent.
When two (or more) different CPU cores operate on independent data elements that **fall
into the same cache line** (i.e. they are part of the same memory address region)
the **cache coherency mechanism leads to costly synchronization** between cores.

In our case, this happens despite the fact that different parallel tasks
(on different CPU cores) *logically* don't care about the rest of the data in the cache line
at all.

![](false_sharing.svg)

Given these insights, we can come up with a few workarounds that mitigate the issue.
The most prominent is probably padding, where one simply adds sufficiently many unused
zeros to `psums` such that different partial sum counters don't fall into the same cache
line. However, let's discuss a more fundamental, more efficient, and more elegant solution.

## Task-local parallel summation

The key mistake in `parallel_sum_falsesharing` above is the non-local modification of
(implicitly) shared state (cache lines of `psums`) very frequently (in the innermost loop).
We can simply avoid this by making the code more task-local. To this end, we introduce a
**task-local accumulator variable**, which we use to perform the task-local partial sums.
Only at the very end do we communicate the result to the main thread, e.g. by writing it
into `psums` (once!).

````julia
function parallel_sum_tasklocal(data; nchunks = nthreads())
    psums = zeros(eltype(data), nchunks)
    @sync for (c, idcs) in enumerate(index_chunks(data; n = nchunks))
        @spawn begin
            local s = zero(eltype(data))
            for i in idcs
                s += data[i]
            end
            psums[c] = s
        end
    end
    return sum(psums)
end

@test sum(data) ≈ parallel_sum_tasklocal(data)
@btime parallel_sum_tasklocal($data);
````

````
  1.120 ms (221 allocations: 18.55 KiB)

````

Finally, there is a speed up! 🎉

Two comments are in order.

First, we note that the only role that `psums` plays is
as a temporary storage for the results from the parallel tasks to be able to sum them
up eventually. We could get rid of it entirely by using a `Threads.Atomic` instead which
would get updated via `Threads.atomic_add!` from each task directly. However,
for our discussion, this is a detail and we won't discuss it further.

Secondly, while keeping the general idea, we can drastically simplify the above code by
using `map` and reusing the built-in (sequential) `sum` function on each parallel task:

````julia
function parallel_sum_map(data; nchunks = nthreads())
    ts = map(index_chunks(data, n = nchunks)) do idcs
        @spawn @views sum(data[idcs])
    end
    return sum(fetch.(ts))
end

@test sum(data) ≈ parallel_sum_map(data)
@btime parallel_sum_map($data);
````

````
  893.396 μs (64 allocations: 5.72 KiB)

````

This implementation is conceptually
clearer in that there is no explicit modification of shared state, i.e. no `psums[c] = s`,
anywhere at all. We can't run into false sharing if we don't modify shared state 😉.

Note that since we use the built-in `sum` function, which is highly optimized, we might see
better runtimes due to other effects - like SIMD and the absence of bounds checks - compared
to the simple for-loop accumulation in `parallel_sum_tasklocal` above.

## Parallel summation with OhMyThreads

Finally, all of the above is abstracted away for you if you simply use [`treduce`](@ref)
to implement the parallel summation. It also only takes a single line and function call.

````julia
using OhMyThreads: treduce

@test sum(data) ≈ treduce(+, data; ntasks = nthreads())
@btime treduce($+, $data; ntasks = $nthreads());
````

````
  899.097 μs (68 allocations: 5.92 KiB)

````

---

*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*


================================================
FILE: docs/src/literate/integration/Project.toml
================================================
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
OhMyThreads = "67456a42-1dca-4109-a031-0a68de7e3ad5"


================================================
FILE: docs/src/literate/integration/integration.jl
================================================
# # Trapezoidal Integration
#
# In this example, we want to parallelize the computation of a simple numerical integral
# via the trapezoidal rule. The latter is given by
#
# $\int_{a}^{b}f(x)\,dx \approx h \sum_{i=1}^{N}\frac{f(x_{i-1})+f(x_{i})}{2}.$
#
# The function to be integrated is the following.

## Integrand of the example; real-valued only for `-1 <= x <= 1`.
f(x) = 4 * sqrt(1 - x^2)

# The analytic result of the definite integral (from 0 to 1) is known to be $\pi$.
#
# ## Sequential
#
# Naturally, we implement the trapezoidal rule as a straightforward, sequential `for` loop.

function trapezoidal(a, b, n; h = (b - a) / n)
    ## the two end points enter with weight 1/2, all interior points with weight 1
    acc = (f(a) + f(b)) / 2.0
    for k in 1:(n - 1)
        acc += f(a + k * h)
    end
    return h * acc
end

# Let's compute the integral of `f` above and see if we get the expected result.
# For simplicity, we choose `N`, the number of panels used to discretize the integration
# interval, as a multiple of the number of available Julia threads.

using Base.Threads: nthreads

N = nthreads() * 1_000_000

# Calling `trapezoidal` we do indeed find the (approximate) value of $\pi$.

trapezoidal(0, 1, N) ≈ π

# ## Parallel
#
# Our strategy is the following: Divide the integration interval among the available
# Julia threads. On each thread, use the sequential trapezoidal rule to compute the partial
# integral.
# It is straightforward to implement this strategy with `tmapreduce`. The `map` part
# is, essentially, the application of `trapezoidal` and the reduction operator is chosen to
# be `+` to sum up the local integrals.

using OhMyThreads

function trapezoidal_parallel(a, b, N)
    nt = nthreads()
    n = N ÷ nt          # panels handled by each task
    h = (b - a) / N     # global panel width
    ## one sub-interval [lo, hi] per thread; the partial integrals are summed with `+`
    return tmapreduce(+, 1:nt) do i
        local lo = a + (i - 1) * n * h
        local hi = lo + n * h
        trapezoidal(lo, hi, n; h)
    end
end

## or equivalently
##
## function trapezoidal_parallel(a, b, N)
##     n = N ÷ nthreads()
##     h = (b - a) / N
##     @tasks for i in 1:nthreads()
##         @set reducer=+
##         local α = a + (i - 1) * n * h
##         local β = α + n * h
##         trapezoidal(α, β, n; h)
##     end
## end

# First, we check the correctness of our parallel implementation.
trapezoidal_parallel(0, 1, N) ≈ π

# Then, we benchmark and compare the performance of the sequential and parallel versions.

using BenchmarkTools
@btime trapezoidal(0, 1, $N);
@btime trapezoidal_parallel(0, 1, $N);

# Because the problem is trivially parallel - all threads do the same thing and don't need
# to communicate - we expect an ideal speedup of (close to) the number of available threads.

nthreads()


================================================
FILE: docs/src/literate/integration/integration.md
================================================
```@meta
EditURL = "integration.jl"
```

# Trapezoidal Integration

In this example, we want to parallelize the computation of a simple numerical integral
via the trapezoidal rule. The latter is given by

$\int_{a}^{b}f(x)\,dx \approx h \sum_{i=1}^{N}\frac{f(x_{i-1})+f(x_{i})}{2}.$

The function to be integrated is the following.

````julia
f(x) = 4 * √(1 - x^2)
````

````
f (generic function with 1 method)
````

The analytic result of the definite integral (from 0 to 1) is known to be $\pi$.

## Sequential

Naturally, we implement the trapezoidal rule as a straightforward, sequential `for` loop.

````julia
function trapezoidal(a, b, n; h = (b - a) / n)
    y = (f(a) + f(b)) / 2.0
    for i in 1:(n - 1)
        x = a + i * h
        y = y + f(x)
    end
    return y * h
end
````

````
trapezoidal (generic function with 1 method)
````

Let's compute the integral of `f` above and see if we get the expected result.
For simplicity, we choose `N`, the number of panels used to discretize the integration
interval, as a multiple of the number of available Julia threads.

````julia
using Base.Threads: nthreads

N = nthreads() * 1_000_000
````

````
10000000
````

Calling `trapezoidal` we do indeed find the (approximate) value of $\pi$.

````julia
trapezoidal(0, 1, N) ≈ π
````

````
true
````

## Parallel

Our strategy is the following: Divide the integration interval among the available
Julia threads. On each thread, use the sequential trapezoidal rule to compute the partial
integral.
It is straightforward to implement this strategy with `tmapreduce`. The `map` part
is, essentially, the application of `trapezoidal` and the reduction operator is chosen to
be `+` to sum up the local integrals.

````julia
using OhMyThreads

function trapezoidal_parallel(a, b, N)
    n = N ÷ nthreads()
    h = (b - a) / N
    return tmapreduce(+, 1:nthreads()) do i
        local α = a + (i - 1) * n * h # the local keywords aren't necessary but good practice
        local β = α + n * h
        trapezoidal(α, β, n; h)
    end
end

# or equivalently
#
# function trapezoidal_parallel(a, b, N)
#     n = N ÷ nthreads()
#     h = (b - a) / N
#     @tasks for i in 1:nthreads()
#         @set reducer=+
#         local α = a + (i - 1) * n * h
#         local β = α + n * h
#         trapezoidal(α, β, n; h)
#     end
# end
````

````
trapezoidal_parallel (generic function with 1 method)
````

First, we check the correctness of our parallel implementation.

````julia
trapezoidal_parallel(0, 1, N) ≈ π
````

````
true
````

Then, we benchmark and compare the performance of the sequential and parallel versions.

````julia
using BenchmarkTools
@btime trapezoidal(0, 1, $N);
@btime trapezoidal_parallel(0, 1, $N);
````

````
  24.348 ms (0 allocations: 0 bytes)
  2.457 ms (69 allocations: 6.05 KiB)

````

Because the problem is trivially parallel - all threads do the same thing and don't need
to communicate - we expect an ideal speedup of (close to) the number of available threads.

````julia
nthreads()
````

````
10
````

---

*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*


================================================
FILE: docs/src/literate/juliaset/Project.toml
================================================
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
DisplayAs = "0b91fe84-8a4c-11e9-3e1d-67c38462b6d6"
OhMyThreads = "67456a42-1dca-4109-a031-0a68de7e3ad5"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"


================================================
FILE: docs/src/literate/juliaset/juliaset.jl
================================================
# # Julia Set
#
# In this example, we will compute an image of the
# [Julia set](https://en.wikipedia.org/wiki/Julia_set) in parallel. We will explore
# the `scheduler` and `ntasks` options that can be used to get load balancing.
#
# The value of a single pixel of the Julia set, which corresponds to a point in the
# complex number plane, can be computed by the following iteration procedure.

function _compute_pixel(i, j, n; max_iter = 255, c = -0.79 + 0.15 * im)
    ## map the pixel indices (i, j) of an n×n grid onto the square [-2, 2] × [-2, 2]
    re_part = -2.0 + (j - 1) * 4.0 / (n - 1)
    im_part = -2.0 + (i - 1) * 4.0 / (n - 1)
    z = re_part + im_part * im

    ## iterate z -> z^2 + c and report how many steps were done before |z|^2 exceeded 4
    for k in 1:max_iter
        abs2(z) > 4.0 && return k - 1
        z = z^2 + c
    end
    ## never exceeded the bound within the iteration budget
    return max_iter
end

# Note that the value of the pixel is the number of performed iterations for the
# corresponding complex input number. Hence, the computational **workload is non-uniform**.

# ## Sequential computation
#
# In our naive implementation, we just loop over the dimensions of the image matrix and call
# the pixel kernel above.

function compute_juliaset_sequential!(img)
    N = size(img, 1)
    ## `j` (columns) is the outer loop and `i` (rows) the inner one
    for j in 1:N, i in 1:N
        img[i, j] = _compute_pixel(i, j, N)
    end
    return img
end

N = 2000
img = zeros(Int, N, N)
compute_juliaset_sequential!(img);

# Let's look at the result

using Plots
using DisplayAs #hide
p = heatmap(img)
DisplayAs.PNG(p) #hide

# ## Parallelization
#
# The Julia set computation above is a `map!` operation: We apply some function to each
# element of the array. Hence, we can use `tmap!` for parallelization. We use
# `CartesianIndices` to map between linear and two-dimensional cartesian indices.

using OhMyThreads: tmap!

## Parallel counterpart of the sequential version: fills `img` in place via `tmap!` and
## returns it. All keyword arguments are forwarded to `tmap!`.
function compute_juliaset_parallel!(img; kwargs...)
    N = size(img, 1)
    cart = CartesianIndices(img)
    ## translate the linear index into (row, column) before calling the pixel kernel
    function pixel_value(idx)
        pos = cart[idx]
        return _compute_pixel(pos[1], pos[2], N)
    end
    tmap!(pixel_value, img, eachindex(img); kwargs...)
    return img
end

## or alternatively
##
## function compute_juliaset_parallel!(img; kwargs...)
##     N = size(img, 1)
##     cart = CartesianIndices(img)
##     @tasks for idx in eachindex(img)
##         c = cart[idx]
##         img[idx] = _compute_pixel(c[1], c[2], N)
##     end
##     return img
## end

N = 2000
img = zeros(Int, N, N)
compute_juliaset_parallel!(img);
p = heatmap(img)
DisplayAs.PNG(p) #hide

# ## Benchmark
#
# Let's benchmark the variants above.

using BenchmarkTools
using Base.Threads: nthreads

N = 2000
img = zeros(Int, N, N)

@show nthreads()

@btime compute_juliaset_sequential!($img) samples=10 evals=3;
@btime compute_juliaset_parallel!($img) samples=10 evals=3;

# As hoped, the parallel implementation is much faster!

# ### Dynamic vs static scheduling
#
# As stated above, the per-pixel computation is non-uniform. Hence, we do benefit from
# the load balancing of the default dynamic scheduler. The latter divides the overall
# workload into tasks that can then be dynamically distributed among threads to adjust the
# per-thread load. We can try to fine tune and improve the load balancing further by
# increasing the `ntasks` parameter of the scheduler, that is, creating more tasks with
# smaller per-task workload.

using OhMyThreads: DynamicScheduler

@btime compute_juliaset_parallel!($img; ntasks=N, scheduler=:dynamic) samples=10 evals=3;

# Note that while this turns out to be a bit faster, it comes at the expense of many more
# allocations.
#
# To quantify the impact of load balancing we can opt out of dynamic scheduling and use the
# `StaticScheduler` instead. The latter doesn't provide any form of load balancing.

using OhMyThreads: StaticScheduler

@btime compute_juliaset_parallel!($img; scheduler=:static) samples=10 evals=3;


================================================
FILE: docs/src/literate/juliaset/juliaset.md
================================================
```@meta
EditURL = "juliaset.jl"
```

# Julia Set

In this example, we will compute an image of the
[Julia set](https://en.wikipedia.org/wiki/Julia_set) in parallel. We will explore
the `scheduler` and `ntasks` options that can be used to get load balancing.

The value of a single pixel of the Julia set, which corresponds to a point in the
complex number plane, can be computed by the following iteration procedure.

````julia
function _compute_pixel(i, j, n; max_iter = 255, c = -0.79 + 0.15 * im)
    x = -2.0 + (j - 1) * 4.0 / (n - 1)
    y = -2.0 + (i - 1) * 4.0 / (n - 1)

    z = x + y * im
    iter = max_iter
    for k in 1:max_iter
        if abs2(z) > 4.0
            iter = k - 1
            break
        end
        z = z^2 + c
    end
    return iter
end
````

````
_compute_pixel (generic function with 1 method)
````

Note that the value of the pixel is the number of performed iterations for the
corresponding complex input number. Hence, the computational **workload is non-uniform**.

## Sequential computation

In our naive implementation, we just loop over the dimensions of the image matrix and call
the pixel kernel above.

````julia
function compute_juliaset_sequential!(img)
    N = size(img, 1)
    for j in 1:N
        for i in 1:N
            img[i, j] = _compute_pixel(i, j, N)
        end
    end
    return img
end

N = 2000
img = zeros(Int, N, N)
compute_juliaset_sequential!(img);
````

Let's look at the result

````julia
using Plots
p = heatmap(img)
````
![](juliaset-8.png)

## Parallelization

The Julia set computation above is a `map!` operation: We apply some function to each
element of the array. Hence, we can use `tmap!` for parallelization. We use
`CartesianIndices` to map between linear and two-dimensional cartesian indices.

````julia
using OhMyThreads: tmap!

function compute_juliaset_parallel!(img; kwargs...)
    N = size(img, 1)
    cart = CartesianIndices(img)
    tmap!(img, eachindex(img); kwargs...) do idx
        c = cart[idx]
        _compute_pixel(c[1], c[2], N)
    end
    return img
end

# or alternatively
#
# function compute_juliaset_parallel!(img; kwargs...)
#     N = size(img, 1)
#     cart = CartesianIndices(img)
#     @tasks for idx in eachindex(img)
#         c = cart[idx]
#         img[idx] = _compute_pixel(c[1], c[2], N)
#     end
#     return img
# end

N = 2000
img = zeros(Int, N, N)
compute_juliaset_parallel!(img);
p = heatmap(img)
````
![](juliaset-10.png)

## Benchmark

Let's benchmark the variants above.

````julia
using BenchmarkTools
using Base.Threads: nthreads

N = 2000
img = zeros(Int, N, N)

@show nthreads()

@btime compute_juliaset_sequential!($img) samples=10 evals=3;
@btime compute_juliaset_parallel!($img) samples=10 evals=3;
````

````
nthreads() = 10
  131.295 ms (0 allocations: 0 bytes)
  31.422 ms (68 allocations: 6.09 KiB)

````

As hoped, the parallel implementation is much faster!

### Dynamic vs static scheduling

As stated above, the per-pixel computation is non-uniform. Hence, we do benefit from
the load balancing of the default dynamic scheduler. The latter divides the overall
workload into tasks that can then be dynamically distributed among threads to adjust the
per-thread load. We can try to fine tune and improve the load balancing further by
increasing the `ntasks` parameter of the scheduler, that is, creating more tasks with
smaller per-task workload.

````julia
using OhMyThreads: DynamicScheduler

@btime compute_juliaset_parallel!($img; ntasks=N, scheduler=:dynamic) samples=10 evals=3;
````

````
  17.438 ms (12018 allocations: 1.11 MiB)

````

Note that while this turns out to be a bit faster, it comes at the expense of many more
allocations.

To quantify the impact of load balancing we can opt out of dynamic scheduling and use the
`StaticScheduler` instead. The latter doesn't provide any form of load balancing.

````julia
using OhMyThreads: StaticScheduler

@btime compute_juliaset_parallel!($img; scheduler=:static) samples=10 evals=3;
````

````
  30.097 ms (73 allocations: 6.23 KiB)

````

---

*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*


================================================
FILE: docs/src/literate/mc/Project.toml
================================================
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
OhMyThreads = "67456a42-1dca-4109-a031-0a68de7e3ad5"


================================================
FILE: docs/src/literate/mc/mc.jl
================================================
# # Parallel Monte Carlo
#
# Calculate the value of $\pi$ through parallel direct Monte Carlo.
#
# A unit circle is inscribed inside a square with side length 2 (from -1 to 1).
# The area of the circle is $\pi$, the area of the square is 4, and the ratio is $\pi/4$.
# This means that, if you throw $N$ darts randomly at the square, approximately $M=N\pi/4$
# of those darts will land inside the unit circle.
#
# Throw darts randomly at a unit square and count how many of them ($M$) landed inside of
# a unit circle. Approximate $\pi \approx 4M/N$.
#
# ## Sequential implementation:

## Sequential Monte Carlo estimate of π from `N` random darts.
## Each dart consists of two `rand()` draws; it counts as a hit if the sum of their
## squares is below 1. The estimate is `4 * M / N`.
function mc(N)
    M = 0 # darts that ended up inside the circle
    for _ in 1:N
        x, y = rand(), rand()
        M += (x^2 + y^2 < 1.0) # adding a `Bool` increments by 0 or 1
    end
    return 4 * M / N
end

N = 100_000_000

mc(N)

# ## Parallelization with `tmapreduce`
#
# To parallelize the Monte Carlo simulation, we use [`tmapreduce`](@ref) with `+` as the reduction
# operator. For the map part, we take `1:N` as our input collection and "throw one dart" per
# element.

using OhMyThreads

## Parallel Monte Carlo estimate of π: one dart per element of `1:N`, hits are summed up
## with `tmapreduce`. All keyword arguments are forwarded to `tmapreduce`.
function mc_parallel(N; kwargs...)
    ## the element itself is irrelevant, it only triggers one throw
    dart_hits(_) = rand()^2 + rand()^2 < 1.0
    M = tmapreduce(dart_hits, +, 1:N; kwargs...)
    return 4 * M / N
end

## or alternatively
##
## function mc_parallel(N)
##     M = @tasks for _ in 1:N
##         @set reducer = +
##         rand()^2 + rand()^2 < 1.0
##     end
##     pi = 4 * M / N
##     return pi
## end

mc_parallel(N)

# Let's run a quick benchmark.

using BenchmarkTools
using Base.Threads: nthreads

@assert nthreads() > 1 # make sure we have multiple Julia threads
@show nthreads()       # print out the number of threads

@btime mc($N) samples=10 evals=3;
@btime mc_parallel($N) samples=10 evals=3;

# ### Static scheduling
#
# Because the workload is highly uniform, it makes sense to also try the `StaticScheduler`
# and compare the performance of static and dynamic scheduling (with default parameters).

using OhMyThreads: StaticScheduler

@btime mc_parallel($N; scheduler=:dynamic) samples=10 evals=3; # default
@btime mc_parallel($N; scheduler=:static) samples=10 evals=3;

# ## Manual parallelization
#
# First, using the `index_chunks` function, we divide the iteration interval `1:N` into
# `nthreads()` parts. Then, we apply a regular (sequential) `map` to spawn a Julia task
# per chunk. Each task will locally and independently perform a sequential Monte Carlo
# simulation. Finally, we fetch the results and compute the average estimate for $\pi$.

using OhMyThreads: @spawn, index_chunks

## Manually parallelized Monte Carlo estimate of π.
##
## `1:N` is split with `index_chunks` into (up to) `nchunks` parts and one task is spawned
## per part, each running the sequential `mc` on its share of the darts.
##
## The partial estimates are combined as an average *weighted by chunk size*. Dividing the
## plain sum of estimates by `nchunks` would only be right if exactly `nchunks` chunks of
## identical size were produced; the weighted form does not rely on either assumption and
## is equivalent to `4 * (total hits) / N`.
function mc_parallel_manual(N; nchunks = nthreads())
    tasks = map(index_chunks(1:N; n = nchunks)) do idcs
        len = length(idcs)
        ## scale the chunk's estimate by the number of darts it is based on
        @spawn len * mc(len)
    end
    pi = sum(fetch, tasks) / N
    return pi
end

mc_parallel_manual(N)

# And this is the performance:

@btime mc_parallel_manual($N) samples=10 evals=3;

# It is faster than `mc_parallel` above because the task-local computation
# `mc(length(idcs))` is faster than the implicit task-local computation within
# `tmapreduce` (which itself is a `mapreduce`).

idcs = first(index_chunks(1:N; n = nthreads()))

@btime mapreduce($+, $idcs) do i
    rand()^2 + rand()^2 < 1.0
end samples=10 evals=3;

@btime mc($(length(idcs))) samples=10 evals=3;


================================================
FILE: docs/src/literate/mc/mc.md
================================================
```@meta
EditURL = "mc.jl"
```

# Parallel Monte Carlo

Calculate the value of $\pi$ through parallel direct Monte Carlo.

A unit circle is inscribed inside a square with side length 2 (from -1 to 1).
The area of the circle is $\pi$, the area of the square is 4, and the ratio is $\pi/4$.
This means that, if you throw $N$ darts randomly at the square, approximately $M=N\pi/4$
of those darts will land inside the unit circle.

Throw darts randomly at a unit square and count how many of them ($M$) landed inside of
a unit circle. Approximate $\pi \approx 4M/N$.

## Sequential implementation:

````julia
function mc(N)
    M = 0 # number of darts that landed in the circle
    for i in 1:N
        if rand()^2 + rand()^2 < 1.0
            M += 1
        end
    end
    pi = 4 * M / N
    return pi
end

N = 100_000_000

mc(N)
````

````
3.14171236
````

## Parallelization with `tmapreduce`

To parallelize the Monte Carlo simulation, we use [`tmapreduce`](@ref) with `+` as the reduction
operator. For the map part, we take `1:N` as our input collection and "throw one dart" per
element.

````julia
using OhMyThreads

function mc_parallel(N; kwargs...)
    M = tmapreduce(+, 1:N; kwargs...) do i
        rand()^2 + rand()^2 < 1.0
    end
    pi = 4 * M / N
    return pi
end

# or alternatively
#
# function mc_parallel(N)
#     M = @tasks for _ in 1:N
#         @set reducer = +
#         rand()^2 + rand()^2 < 1.0
#     end
#     pi = 4 * M / N
#     return pi
# end

mc_parallel(N)
````

````
3.14156496
````

Let's run a quick benchmark.

````julia
using BenchmarkTools
using Base.Threads: nthreads

@assert nthreads() > 1 # make sure we have multiple Julia threads
@show nthreads()       # print out the number of threads

@btime mc($N) samples=10 evals=3;
@btime mc_parallel($N) samples=10 evals=3;
````

````
nthreads() = 10
  301.636 ms (0 allocations: 0 bytes)
  41.864 ms (68 allocations: 5.81 KiB)

````

### Static scheduling

Because the workload is highly uniform, it makes sense to also try the `StaticScheduler`
and compare the performance of static and dynamic scheduling (with default parameters).

````julia
using OhMyThreads: StaticScheduler

@btime mc_parallel($N; scheduler=:dynamic) samples=10 evals=3; # default
@btime mc_parallel($N; scheduler=:static) samples=10 evals=3;
````

````
  41.839 ms (68 allocations: 5.81 KiB)
  41.838 ms (68 allocations: 5.81 KiB)

````

## Manual parallelization

First, using the `index_chunks` function, we divide the iteration interval `1:N` into
`nthreads()` parts. Then, we apply a regular (sequential) `map` to spawn a Julia task
per chunk. Each task will locally and independently perform a sequential Monte Carlo
simulation. Finally, we fetch the results and compute the average estimate for $\pi$.

````julia
using OhMyThreads: @spawn, index_chunks

function mc_parallel_manual(N; nchunks = nthreads())
    tasks = map(index_chunks(1:N; n = nchunks)) do idcs
        @spawn mc(length(idcs))
    end
    pi = sum(fetch, tasks) / nchunks
    return pi
end

mc_parallel_manual(N)
````

````
3.14180504
````

And this is the performance:

````julia
@btime mc_parallel_manual($N) samples=10 evals=3;
````

````
  30.224 ms (65 allocations: 5.70 KiB)

````

It is faster than `mc_parallel` above because the task-local computation
`mc(length(idcs))` is faster than the implicit task-local computation within
`tmapreduce` (which itself is a `mapreduce`).

````julia
idcs = first(index_chunks(1:N; n = nthreads()))

@btime mapreduce($+, $idcs) do i
    rand()^2 + rand()^2 < 1.0
end samples=10 evals=3;

@btime mc($(length(idcs))) samples=10 evals=3;
````

````
  41.750 ms (0 allocations: 0 bytes)
  30.148 ms (0 allocations: 0 bytes)

````

---

*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*


================================================
FILE: docs/src/literate/tls/Project.toml
================================================
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e"
OhMyThreads = "67456a42-1dca-4109-a031-0a68de7e3ad5"
ThreadPinning = "811555cd-349b-4f26-b7bc-1f208b848042"


================================================
FILE: docs/src/literate/tls/tls.jl
================================================
# # [Thread-Safe Storage](@id TSS)
#
# For some programs, it can be useful or even necessary to allocate and (re-)use memory in
# your parallel code (e.g. your computation might require temporary buffers).
# The following section demonstrates common issues that can arise in such a scenario and,
# by means of a simple example, explains techniques to handle such cases safely.
# Specifically, we'll discuss (1) how task-local storage (TLS) can be used efficiently and
# (2) how channels can be used to organize per-task buffer allocation in a thread-safe
# manner.
#
#
# ## Test case (sequential)
#
# Let's say that we are given two arrays of matrices, `As` and `Bs`, and let's
# further assume that our goal is to compute the total sum of all pairwise matrix products.
# We can readily implement a (sequential) function that performs the necessary computations.
using LinearAlgebra: mul!, BLAS
BLAS.set_num_threads(1) #  for simplicity, we turn off OpenBLAS multithreading
using ThreadPinning #hide
pinthreads(:cores) #hide

## Sequential reference implementation: for every pair `(A, B)` compute `sum(A * B)`.
## The output matrix `C` is allocated once (square, sized by the row count of the first
## `A`) and overwritten in every iteration.
function matmulsums(As, Bs)
    N = size(first(As), 1)
    C = Matrix{Float64}(undef, N, N) # scratch buffer reused by all iterations
    return map(As, Bs) do A, B
        ## `mul!` stores the product in `C` and hands `C` back
        sum(mul!(C, A, B))
    end
end

# Here, we use `map` to perform the desired operation for each pair of matrices,
# `A` and `B`. However, the crucial point for our discussion is that we want to use the
# in-place matrix multiplication `LinearAlgebra.mul!` in conjunction with a pre-allocated
# temporary buffer, the output matrix `C`. This is to avoid the temporary allocation per
# "iteration" (i.e. per matrix pair) that we would get with `C = A*B`.
#
# For later comparison, we generate some random input data and store the result.

As = [rand(256, 16) for _ in 1:768]
Bs = [rand(16, 256) for _ in 1:768]

res = matmulsums(As, Bs);

# ## How to not parallelize
#
# The key idea for creating a parallel version of `matmulsums` is to replace the `map` by
# OhMyThreads' parallel [`tmap`](@ref) function. However, because we re-use `C`, this isn't
# entirely trivial. Someone new to parallel computing might be tempted to parallelize
# `matmulsums` like this:
using OhMyThreads: tmap

## WARNING: intentionally incorrect, kept only to demonstrate the pitfall.
## A single buffer `C` is allocated once and then written to by every invocation of the
## closure handed to `tmap`, so tasks that run at the same time overwrite each other's
## intermediate product (race condition).
function matmulsums_race(As, Bs)
    N = size(first(As), 1)
    C = Matrix{Float64}(undef, N, N)
    tmap(As, Bs) do A, B
        mul!(C, A, B)
        sum(C)
    end
end

# Unfortunately, this doesn't produce the correct result.

res_race = matmulsums_race(As, Bs)
res ≈ res_race

# In fact, it doesn't even always produce the same result (check for yourself)!
# The reason is that there is a race condition: different parallel
# tasks are trying to use the shared variable `C` simultaneously leading to
# non-deterministic behavior. Let's see how we can fix this.
#
# ### The naive (and inefficient) fix
#
# A simple solution for the race condition issue above is to move the allocation of `C`
# into the body of the parallel `tmap`:

## Race-free but wasteful variant: every matrix pair gets its own freshly allocated
## output matrix, so no state is shared between parallel tasks.
function matmulsums_naive(As, Bs)
    N = size(first(As), 1)
    return tmap(As, Bs) do A, B
        ## new `C` for each pair → one allocation per iteration
        C = Matrix{Float64}(undef, N, N)
        sum(mul!(C, A, B))
    end
end

# In this case, a separate `C` will be allocated for each iteration such that parallel tasks
# no longer mutate shared state. Hence, we'll get the desired result.

res_naive = matmulsums_naive(As, Bs)
res ≈ res_naive

# However, this variant is obviously inefficient because it is no better than just writing
# `C = A*B` and thus leads to one allocation per matrix pair. We need a different way of
# allocating and re-using `C` for an efficient parallel version.

# ## [Task-local storage](@id TLS)
#
# ### The manual (and cumbersome) way
#
# We've seen that we can't allocate `C` once up-front (→ race condition) and also shouldn't
# allocate it within the `tmap` (→ one allocation per iteration). Instead, we can assign a
# separate "C" on each parallel task once and then use this task-local "C" for all
# iterations (i.e. matrix pairs) for which this task is responsible.
# Before we learn how to do this more conveniently, let's implement this idea of a
# task-local temporary buffer (for each parallel task) manually.
using OhMyThreads: index_chunks, @spawn
using Base.Threads: nthreads

## Hand-written task-local buffers: the indices of `As` are split into
## `2 * nthreads()` chunks, one task is spawned per chunk, and each task allocates its own
## `C` that it reuses for all pairs of its chunk. The per-task result vectors are
## concatenated at the end.
function matmulsums_manual(As, Bs)
    N = size(first(As), 1)
    nparts = 2 * nthreads()
    tasks = map(index_chunks(As; n = nparts)) do idcs
        @spawn begin
            local C = Matrix{Float64}(undef, N, N) # owned by this task only
            map(i -> sum(mul!(C, As[i], Bs[i])), idcs)
        end
    end
    return mapreduce(fetch, vcat, tasks)
end

res_manual = matmulsums_manual(As, Bs)
res ≈ res_manual

# We note that this is rather cumbersome and you might not
# want to write it (repeatedly). But let's take a closer look and see what's happening here.
# First, we divide the number of matrix pairs into `2 * nthreads()` chunks. Then, for each of
# those chunks, we spawn a parallel task that (1) allocates a task-local `C` matrix (and a
# `results` vector) and (2) performs the actual computations using these pre-allocated
# buffers. Finally, we `fetch` the results of the tasks and combine them. This variant works
# just fine and the good news is that we can get the same behavior with less manual work.
#
# ### [The shortcut: `TaskLocalValue`](@id TLV)
#
# The desire for task-local storage is quite natural with task-based multithreading. For
# this reason, Julia supports this out of the box with
# [`Base.task_local_storage`](https://docs.julialang.org/en/v1/base/parallel/#Base.task_local_storage-Tuple{Any}).
# But instead of using this directly (which you could), we will use a convenience wrapper
# around it called [`TaskLocalValue`](https://github.com/vchuravy/TaskLocalValues.jl).
# This allows us to express the idea from above in few lines of code:
using OhMyThreads: TaskLocalValue

## Parallel variant that obtains its output buffer from a `TaskLocalValue`.
## All keyword arguments are forwarded to `tmap`.
function matmulsums_tlv(As, Bs; kwargs...)
    N = size(first(As), 1)
    ## the anonymous function is the initializer used to create a task's buffer
    tlv = TaskLocalValue{Matrix{Float64}}(() -> Matrix{Float64}(undef, N, N))
    tmap(As, Bs; kwargs...) do A, B
        C = tlv[] # buffer of the task that is executing this iteration
        mul!(C, A, B)
        sum(C)
    end
end

res_tlv = matmulsums_tlv(As, Bs)
res ≈ res_tlv

# Here, `TaskLocalValue{Matrix{Float64}}(() -> Matrix{Float64}(undef, N, N))` creates a
# task-local value - essentially a reference to a value in the task-local storage - that
# behaves like this: The first time the task-local value is accessed from a task (`tlv[]`)
# it is initialized according to the provided anonymous function. Afterwards, every
# following query (from the same task!) will simply lookup and return the task-local value.
# This solves our issues above and leads to $O(\textrm{parallel tasks})$
# (instead of $O(\textrm{iterations})$) allocations.
#
# Note that if you use our `@tasks` macro API, there is built-in support for task-local
# values via `@local`.
#

using OhMyThreads: @tasks

## `@tasks` formulation of the task-local-buffer approach: iterates over the shared
## indices of `As` and `Bs` and declares the buffer `C` via `@local`.
## NOTE(review): `kwargs` are accepted but not used anywhere in the body — confirm whether
## they should be dropped or wired into `@set` options.
function matmulsums_tlv_macro(As, Bs; kwargs...)
    N = size(first(As), 1)
    @tasks for i in eachindex(As, Bs)
        ## presumably makes the loop return the collected per-iteration values — confirm
        @set collect = true
        @local C = Matrix{Float64}(undef, N, N)
        mul!(C, As[i], Bs[i])
        sum(C)
    end
end

res_tlv_macro = matmulsums_tlv_macro(As, Bs)
res ≈ res_tlv_macro

# Here, `@local` expands to a pattern similar to the `TaskLocalValue` one above, although it automatically
# infers that the object's type is `Matrix{Float64}`, and it carries some optimizations (see
# [`OhMyThreads.WithTaskLocals`](@ref)) which can make accessing task local values more efficient in
# loops which take on the order of 100ns to complete.
#
#
# ### Benchmark
#
# The whole point of parallelization is increasing performance, so let's benchmark and
# compare the performance of the variants that we've discussed so far.

using BenchmarkTools

@show nthreads()

@btime matmulsums($As, $Bs);
sleep(2) #hide
@btime matmulsums_naive($As, $Bs);
sleep(2) #hide
@btime matmulsums_manual($As, $Bs);
sleep(2) #hide
@btime matmulsums_tlv($As, $Bs);
sleep(2) #hide
@btime matmulsums_tlv_macro($As, $Bs);

# As we can see, `matmulsums_tlv` (and `matmulsums_tlv_macro`) isn't only convenient
# but also efficient: It allocates much less memory than `matmulsums_naive` and is about on
# par with the manual implementation.
#
#
# ## Per-thread allocation
#
# The task-local solution above has one potential caveat: If we spawn many parallel tasks
# (e.g. for load-balancing reasons) we need just as many task-local buffers. This can
# clearly be suboptimal because only `nthreads()` tasks can run simultaneously. Hence, one
# buffer per thread should actually suffice.
# Of course, this raises the question of how to organize a pool of "per-thread" buffers
# such that each running task always has exclusive (temporary) access to a buffer (we need
# to make sure to avoid races).
#
# ### The naive (and incorrect) approach
# A naive approach to implementing this idea is to pre-allocate an array of buffers
# and then to use the `threadid()` to select a buffer for a running task.
#
using Base.Threads: threadid

## WARNING: intentionally incorrect, shown only as a counter-example.
## Allocates `nthreads()` buffers and picks one by `threadid()` inside a `tmap` that uses
## the default scheduler.
function matmulsums_perthread_incorrect(As, Bs)
    N = size(first(As), 1)
    Cs = [Matrix{Float64}(undef, N, N) for _ in 1:nthreads()]
    tmap(As, Bs) do A, B
        C = Cs[threadid()] # index is not guaranteed to lie within 1:nthreads()
        mul!(C, A, B)
        sum(C)
    end
end;

# This approach is [**wrong**](https://julialang.org/blog/2023/07/PSA-dont-use-threadid/). The first issue is that `threadid()`
# doesn't necessarily start at 1 (and thus might return a value `> nthreads()`), in which
# case `Cs[threadid()]` would be an out-of-bounds access attempt. This might be surprising
# but is a simple consequence of the ordering of different kinds of Julia threads: If Julia
# is started with a non-zero number of interactive threads, e.g. `--threads 5,2`, the
# interactive threads come first (look at `Threads.threadpool.(1:Threads.maxthreadid())`).
# [Starting in julia v1.12, julia will launch with at least one interactive thread](https://github.com/JuliaLang/julia/pull/57087),
# and so the above code will error by default.
#
# But even if we account for this offset there is another, more fundamental problem, namely
# **task-migration**. By default, all spawned parallel tasks are "non-sticky" and can
# dynamically migrate between different Julia threads (loosely speaking, at any point in time).
# This means nothing other than that **`threadid()` is not necessarily constant for a task**!
# For example, imagine that task A starts on thread 4, loads the
# buffer `Cs[4]`, but then gets paused, migrated, and continues execution on, say, thread 5.
# Afterwards, while task A is performing `mul!(Cs[4], ...)`, a different task B might start on
# (the now available) thread 4 and also read and use `Cs[4]`. This would lead to a race
# condition because both tasks are mutating the same buffer.
# (Note that, in practice, this - most likely 😉 - doesn't happen for the very simple example
# above, but you can't rely on it!)
#
# ### The quick (and non-recommended) fix
#
# A simple solution for the task-migration issue is to opt-out of dynamic scheduling with
# `scheduler=:static` (or `scheduler=StaticScheduler()`). This scheduler statically
# assigns tasks to threads upfront without any dynamic rescheduling
# (the tasks are sticky and won't migrate).
#
# We'll also need to switch from `nthreads` to `maxthreadid`, since that can be greater than
# `nthreads`, as described above.
#
## Number of buffer slots to allocate: `Threads.maxthreadid()` where that function is
## defined, otherwise `Threads.nthreads()`.
function num_to_store()
    if isdefined(Threads, :maxthreadid)
        return Threads.maxthreadid()
    end
    return Threads.nthreads()
end

## Per-thread buffers selected via `threadid()`, combined with the static scheduler.
function matmulsums_perthread_static(As, Bs)
    N = size(first(As), 1)
    ## one buffer per slot reported by `num_to_store()`
    Cs = map(_ -> Matrix{Float64}(undef, N, N), 1:num_to_store())
    ## Note!!!
    ## Indexing by `threadid()` is *incorrect* with a non-static scheduler. This holds
    ## for OhMyThreads as well as for `Threads.@threads`: you *must* use
    ## `scheduler = :static` (or `Threads.@threads :static`) to avoid race conditions
    ## caused by task migration.
    return tmap(As, Bs; scheduler = :static) do A, B
        sum(mul!(Cs[threadid()], A, B))
    end
end

## non uniform workload
As_nu = [rand(256, isqrt(i)^2) for i in 1:768];
Bs_nu = [rand(isqrt(i)^2, 256) for i in 1:768];
res_nu = matmulsums(As_nu, Bs_nu);

res_pt_static = matmulsums_perthread_static(As_nu, Bs_nu)
res_nu ≈ res_pt_static

# However, this approach has serious shortcomings.
#
# 1. It can easily be broken if someone doesn't know that the `scheduler = :static`
# option is required for correctness, and removes it in a refactor.
# 2. It makes the parallel code  non-composable: If we call other multithreaded functions
# within the `tmap` or if our parallel `matmulsums_perthread_static` itself gets called
# from another parallel region we will likely oversubscribe the Julia threads and get subpar
# performance.
# 3. It can waste memory by creating too many temporary storage slots since `maxthreadid()`
# can give an over-estimate of the number of slots needed for the computation.
#
# While the above pattern might be the easiest to migrate to from the incorrect pattern,
# we do not recommend it. We instead urge you to use task-local-storages, or the `Channel`
# based techniques described below:
#
# ### The safe way: `Channel`
#
# Instead of storing the pre-allocated buffers in an array, we can put them into a `Channel`
# which internally ensures that parallel access is safe. In this scenario, we simply `take!`
# a buffer from the channel whenever we need it and `put!` it back after our computation is
# done.
#
## Buffer pool based on a `Channel`.
##
## `nbuffers` matrices are pre-allocated and put into a channel. Every iteration takes a
## buffer out of the channel, uses it, and puts it back afterwards. Remaining keyword
## arguments are forwarded to `tmap`.
function matmulsums_perthread_channel(As, Bs; nbuffers = nthreads(), kwargs...)
    N = size(first(As), 1)
    chnl = Channel{Matrix{Float64}}(nbuffers)
    foreach(1:nbuffers) do _
        put!(chnl, Matrix{Float64}(undef, N, N))
    end
    tmap(As, Bs; kwargs...) do A, B
        C = take!(chnl)
        try
            mul!(C, A, B)
            sum(C) # value of the `try` block, i.e. the result of this iteration
        finally
            ## Always hand the buffer back, even if the computation throws. Otherwise the
            ## pool shrinks with every failure and other tasks may end up waiting on
            ## `take!` for a buffer that never returns.
            put!(chnl, C)
        end
    end
end

res_pt_channel = matmulsums_perthread_channel(As_nu, Bs_nu)
res_nu ≈ res_pt_channel

#
# ### Benchmark
#
# Let's benchmark the variants above and compare them to the task-local implementation.
# We want to look at both `ntasks = nthreads()` and `ntasks > nthreads()`, the latter
# of which gives us dynamic load balancing.
#

## no load balancing because ntasks == nthreads()
@btime matmulsums_tlv($As_nu, $Bs_nu);
@btime matmulsums_perthread_static($As_nu, $Bs_nu);
@btime matmulsums_perthread_channel($As_nu, $Bs_nu);

## load balancing because ntasks > nthreads()
@btime matmulsums_tlv($As_nu, $Bs_nu; ntasks = 2 * nthreads());
@btime matmulsums_perthread_channel($As_nu, $Bs_nu; ntasks = 2 * nthreads());

@btime matmulsums_tlv($As_nu, $Bs_nu; ntasks = 10 * nthreads());
@btime matmulsums_perthread_channel($As_nu, $Bs_nu; ntasks = 10 * nthreads());

#
# Note that the runtime of `matmulsums_perthread_channel` improves with increasing number
# of chunks/tasks (due to load balancing) while the amount of allocated memory doesn't
# increase much. Contrast this with the drastic memory increase with `matmulsums_tlv`.
#
# ### Another safe way based on `Channel`
#
# Above, we chose to put a limited number of buffers (e.g. `nthreads()`) into the channel
# and then spawn many tasks (one per input element). Sometimes it can make sense to flip
# things around and put the (many) input elements into a channel and only spawn
# a limited number of tasks (e.g. `nthreads()`) with task-local buffers.
#
using OhMyThreads: tmapreduce

## "Flipped" channel pattern: the work items (indices into `As`/`Bs`) are put into a
## channel, and `ntasks` parallel tasks, each with its own buffer `C`, consume indices from
## it. The per-task result vectors are concatenated with `vcat`.
function matmulsums_perthread_channel_flipped(As, Bs; ntasks = nthreads())
    N = size(first(As), 1)
    ## producer: feeds every index of `As` into the channel
    chnl = Channel{Int}(length(As); spawn = true) do chnl
        for i in 1:length(As)
            put!(chnl, i)
        end
    end
    tmapreduce(vcat, 1:ntasks; chunking=false) do _ # we turn chunking off
        local C = Matrix{Float64}(undef, N, N)
        map(chnl) do i # implicitly takes the values from the channel (parallel safe)
            A = As[i]
            B = Bs[i]
            mul!(C, A, B)
            sum(C)
        end
    end
end;

# Note that one caveat of this approach is that the input → task assignment, and thus the
# order of the output, is **non-deterministic**. For this reason, we sort the output to check
# for correctness.

res_channel_flipped = matmulsums_perthread_channel_flipped(As_nu, Bs_nu)
sort(res_nu) ≈ sort(res_channel_flipped)

# Quick benchmark:

@btime matmulsums_perthread_channel_flipped($As_nu, $Bs_nu);
@btime matmulsums_perthread_channel_flipped($As_nu, $Bs_nu; ntasks = 2 * nthreads());
@btime matmulsums_perthread_channel_flipped($As_nu, $Bs_nu; ntasks = 10 * nthreads());

# In addition, OhMyThreads provides an iterator-wrapper type
# [`OhMyThreads.ChannelLike`](@ref) which can be used in place of a `Channel`. If
# the number of elements is large this can be more efficient since there is no
# need to copy the elements into the `Channel`. Concretely, in the example above,
# we could replace `Channel() do .. end` with
# `OhMyThreads.ChannelLike(1:length(As))`.

# ### Bumper.jl (only for the brave)
#
# If you are bold and want to cut down temporary allocations even more you can
# give [Bumper.jl](https://github.com/MasonProtter/Bumper.jl) a try. Essentially, it
# allows you to *bring your own stacks*, that is, task-local bump allocators which you can
# dynamically allocate memory to, and reset them at the end of a code block, just like
# Julia's stack.
# Be warned though that Bumper.jl is (1) a rather young package with (likely) some bugs
# and (2) can easily lead to segfaults when used incorrectly. If you can live with the
# risk, Bumper.jl is especially useful for cases where we don't know ahead of time how large
# a matrix to pre-allocate, and even more useful if we want to do many intermediate
# allocations on the task, not just one. For our example, this isn't the case but let's
# nonetheless see how one would use Bumper.jl here.

using Bumper

function matmulsums_bumper(As, Bs)
    tmap(As, Bs) do A, B
        @no_escape begin # promising that no memory will escape
            N = size(A, 1)
            C = @alloc(Float64, N, N) # from bump allocator (fake "stack")
            mul!(C, A, B)
            sum(C) # last expression of the block: the sum, not the buffer `C`
        end
    end
end

res_bumper = matmulsums_bumper(As, Bs);
sort(res) ≈ sort(res_bumper)

@btime matmulsums_bumper($As, $Bs);

# Note that the benchmark is lying here about the total memory allocation,
# because it doesn't show the allocation of the task-local bump allocators themselves
# (the reason is that `SlabBuffer` uses `malloc` directly).


================================================
FILE: docs/src/literate/tls/tls.md
================================================
```@meta
EditURL = "tls.jl"
```

# [Thread-Safe Storage](@id TSS)

For some programs, it can be useful or even necessary to allocate and (re-)use memory in
your parallel code (e.g. your computation might require temporary buffers).
The following section demonstrates common issues that can arise in such a scenario and,
by means of a simple example, explains techniques to handle such cases safely.
Specifically, we'll discuss (1) how task-local storage (TLS) can be used efficiently and
(2) how channels can be used to organize per-task buffer allocation in a thread-safe
manner.


## Test case (sequential)

Let's say that we are given two arrays of matrices, `As` and `Bs`, and let's
further assume that our goal is to compute the total sum of all pairwise matrix products.
We can readily implement a (sequential) function that performs the necessary computations.

````julia
using LinearAlgebra: mul!, BLAS
BLAS.set_num_threads(1) #  for simplicity, we turn off OpenBLAS multithreading

# Sequential reference implementation: for each pair `(A, B)` taken from `As` and `Bs`,
# compute the product into a shared buffer and return the sum of its entries.
function matmulsums(As, Bs)
    N = size(first(As), 1)
    C = Matrix{Float64}(undef, N, N) # single output buffer, re-used for every pair
    map(As, Bs) do A, B
        mul!(C, A, B) # in-place product: overwrites `C` instead of allocating `A * B`
        sum(C)
    end
end
````

````
matmulsums (generic function with 1 method)
````

Here, we use `map` to perform the desired operation for each pair of matrices,
`A` and `B`. However, the crucial point for our discussion is that we want to use the
in-place matrix multiplication `LinearAlgebra.mul!` in conjunction with a pre-allocated
temporary buffer, the output matrix `C`. This is to avoid the temporary allocation per
"iteration" (i.e. per matrix pair) that we would get with `C = A*B`.

For later comparison, we generate some random input data and store the result.

````julia
As = [rand(256, 16) for _ in 1:768]
Bs = [rand(16, 256) for _ in 1:768]

res = matmulsums(As, Bs);
````

## How to not parallelize

The key idea for creating a parallel version of `matmulsums` is to replace the `map` by
OhMyThreads' parallel [`tmap`](@ref) function. However, because we re-use `C`, this isn't
entirely trivial. Someone new to parallel computing might be tempted to parallelize
`matmulsums` like this:

````julia
using OhMyThreads: tmap

# Deliberately broken variant: same body as `matmulsums`, but with `map` replaced by the
# parallel `tmap`, so the single buffer `C` is shared by all tasks.
function matmulsums_race(As, Bs)
    N = size(first(As), 1)
    C = Matrix{Float64}(undef, N, N) # one buffer, captured by the closure below
    tmap(As, Bs) do A, B
        mul!(C, A, B) # parallel tasks write into the same `C` ...
        sum(C)        # ... so this may read entries written by another task
    end
end
````

````
matmulsums_race (generic function with 1 method)
````

Unfortunately, this doesn't produce the correct result.

````julia
res_race = matmulsums_race(As, Bs)
res ≈ res_race
````

````
false
````

In fact, it doesn't even always produce the same result (check for yourself)!
The reason is that there is a race condition: different parallel
tasks are trying to use the shared variable `C` simultaneously leading to
non-deterministic behavior. Let's see how we can fix this.

### The naive (and inefficient) fix

A simple solution for the race condition issue above is to move the allocation of `C`
into the body of the parallel `tmap`:

````julia
# Race-free but allocation-heavy variant: every call of the closure gets its own `C`.
function matmulsums_naive(As, Bs)
    N = size(first(As), 1)
    tmap(As, Bs) do A, B
        C = Matrix{Float64}(undef, N, N) # fresh buffer per matrix pair (one allocation each)
        mul!(C, A, B)
        sum(C)
    end
end
````

````
matmulsums_naive (generic function with 1 method)
````

In this case, a separate `C` will be allocated for each iteration such that parallel tasks
no longer mutate shared state. Hence, we'll get the desired result.

````julia
res_naive = matmulsums_naive(As, Bs)
res ≈ res_naive
````

````
true
````

However, this variant is obviously inefficient because it is no better than just writing
`C = A*B` and thus leads to one allocation per matrix pair. We need a different way of
allocating and re-using `C` for an efficient parallel version.

## [Task-local storage](@id TLS)

### The manual (and cumbersome) way

We've seen that we can't allocate `C` once up-front (→ race condition) and also shouldn't
allocate it within the `tmap` (→ one allocation per iteration). Instead, we can assign a
separate "C" on each parallel task once and then use this task-local "C" for all
iterations (i.e. matrix pairs) for which this task is responsible.
Before we learn how to do this more conveniently, let's implement this idea of a
task-local temporary buffer (for each parallel task) manually.

````julia
using OhMyThreads: index_chunks, @spawn
using Base.Threads: nthreads

# Hand-written task-local buffers: split the indices of `As` into `2 * nthreads()` chunks,
# spawn one task per chunk, and let each task re-use its own `C` for all of its indices.
function matmulsums_manual(As, Bs)
    N = size(first(As), 1)
    tasks = map(index_chunks(As; n = 2 * nthreads())) do idcs
        @spawn begin
            local C = Matrix{Float64}(undef, N, N) # allocated once per spawned task
            map(idcs) do i
                A = As[i]
                B = Bs[i]

                mul!(C, A, B)
                sum(C)
            end
        end
    end
    # Wait for every task and concatenate the per-chunk result vectors.
    mapreduce(fetch, vcat, tasks)
end

res_manual = matmulsums_manual(As, Bs)
res ≈ res_manual
````

````
true
````

We note that this is rather cumbersome and you might not
want to write it (repeatedly). But let's take a closer look and see what's happening here.
First, we divide the number of matrix pairs into `2 * nthreads()` chunks. Then, for each of
those chunks, we spawn a parallel task that (1) allocates a task-local `C` matrix (and a
`results` vector) and (2) performs the actual computations using these pre-allocated
buffers. Finally, we `fetch` the results of the tasks and combine them. This variant works
just fine and the good news is that we can get the same behavior with less manual work.

### [The shortcut: `TaskLocalValue`](@id TLV)

The desire for task-local storage is quite natural with task-based multithreading. For
this reason, Julia supports this out of the box with
[`Base.task_local_storage`](https://docs.julialang.org/en/v1/base/parallel/#Base.task_local_storage-Tuple{Any}).
But instead of using this directly (which you could), we will use a convenience wrapper
around it called [`TaskLocalValue`](https://github.com/vchuravy/TaskLocalValues.jl).
This allows us to express the idea from above in few lines of code:

````julia
using OhMyThreads: TaskLocalValue

# Task-local buffers via `TaskLocalValue`. Any keyword arguments are forwarded to `tmap`.
function matmulsums_tlv(As, Bs; kwargs...)
    N = size(first(As), 1)
    # Only the initializer is stored here; no matrix is allocated by this line itself.
    tlv = TaskLocalValue{Matrix{Float64}}(() -> Matrix{Float64}(undef, N, N))
    tmap(As, Bs; kwargs...) do A, B
        C = tlv[] # look up (or, on first access from a task, create) this task's buffer
        mul!(C, A, B)
        sum(C)
    end
end

res_tlv = matmulsums_tlv(As, Bs)
res ≈ res_tlv
````

````
true
````

Here, `TaskLocalValue{Matrix{Float64}}(() -> Matrix{Float64}(undef, N, N))` creates a
task-local value - essentially a reference to a value in the task-local storage - that
behaves like this: The first time the task-local value is accessed from a task (`tlv[]`)
it is initialized according to the provided anonymous function. Afterwards, every
following query (from the same task!) will simply lookup and return the task-local value.
This solves our issues above and leads to $O(\textrm{parallel tasks})$
(instead of $O(\textrm{iterations})$) allocations.

Note that if you use our `@tasks` macro API, there is built-in support for task-local
values via `@local`.

````julia
using OhMyThreads: @tasks

# Same idea as `matmulsums_tlv`, written with the `@tasks` macro API.
# NOTE(review): `kwargs` is accepted but not used anywhere in this body.
function matmulsums_tlv_macro(As, Bs; kwargs...)
    N = size(first(As), 1)
    @tasks for i in eachindex(As, Bs)
        @set collect = true # collect the value of each iteration into the result
        @local C = Matrix{Float64}(undef, N, N) # task-local buffer
        mul!(C, As[i], Bs[i])
        sum(C)
    end
end

res_tlv_macro = matmulsums_tlv_macro(As, Bs)
res ≈ res_tlv_macro
````

````
true
````

Here, `@local` expands to a pattern similar to the `TaskLocalValue` one above, although it automatically
infers that the object's type is `Matrix{Float64}`, and it carries some optimizations (see
[`OhMyThreads.WithTaskLocals`](@ref)) which can make accessing task local values more efficient in
loops which take on the order of 100ns to complete.


### Benchmark

The whole point of parallelization is increasing performance, so let's benchmark and
compare the performance of the variants that we've discussed so far.

````julia
using BenchmarkTools

@show nthreads()

@btime matmulsums($As, $Bs);
@btime matmulsums_naive($As, $Bs);
@btime matmulsums_manual($As, $Bs);
@btime matmulsums_tlv($As, $Bs);
@btime matmulsums_tlv_macro($As, $Bs);
````

````
nthreads() = 6
  50.439 ms (6 allocations: 518.14 KiB)
  39.387 ms (2467 allocations: 384.09 MiB)
  9.743 ms (165 allocations: 6.05 MiB)
  9.749 ms (962 allocations: 3.05 MiB)
  9.859 ms (199 allocations: 3.04 MiB)

````

As we can see, `matmulsums_tlv` (and `matmulsums_tlv_macro`) isn't only convenient
but also efficient: It allocates much less memory than `matmulsums_naive` and is about on
par with the manual implementation.


## Per-thread allocation

The task-local solution above has one potential caveat: If we spawn many parallel tasks
(e.g. for load-balancing reasons) we need just as many task-local buffers. This can
clearly be suboptimal because only `nthreads()` tasks can run simultaneously. Hence, one
buffer per thread should actually suffice.
Of course, this raises the question of how to organize a pool of "per-thread" buffers
such that each running task always has exclusive (temporary) access to a buffer (we need
to make sure to avoid races).

### The naive (and incorrect) approach
A naive approach to implementing this idea is to pre-allocate an array of buffers
and then to use the `threadid()` to select a buffer for a running task.

````julia
using Base.Threads: threadid

# Deliberately incorrect: selects a buffer by `threadid()`. Shown only to illustrate the
# pitfalls explained in the surrounding text.
function matmulsums_perthread_incorrect(As, Bs)
    N = size(first(As), 1)
    Cs = [Matrix{Float64}(undef, N, N) for _ in 1:nthreads()] # one buffer per `1:nthreads()`
    tmap(As, Bs) do A, B
        C = Cs[threadid()] # index is read once here; nothing ties the task to that thread
        mul!(C, A, B)
        sum(C)
    end
end;
````

This approach is [**wrong**](https://julialang.org/blog/2023/07/PSA-dont-use-threadid/). The first issue is that `threadid()`
doesn't necessarily start at 1 (and thus might return a value `> nthreads()`), in which
case `Cs[threadid()]` would be an out-of-bounds access attempt. This might be surprising
but is a simple consequence of the ordering of different kinds of Julia threads: If Julia
is started with a non-zero number of interactive threads, e.g. `--threads 5,2`, the
interactive threads come first (look at `Threads.threadpool.(1:Threads.maxthreadid())`).
[Starting in julia v1.12, julia will launch with one interactive thread](https://github.com/JuliaLang/julia/pull/57087),
and so the above code will error by default.

But even if we account for this offset there is another, more fundamental problem, namely
**task-migration**. By default, all spawned parallel tasks are "non-sticky" and can
dynamically migrate between different Julia threads (loosely speaking, at any point in time).
This means nothing other than that **`threadid()` is not necessarily constant for a task**!
For example, imagine that task A starts on thread 4, loads the
buffer `Cs[4]`, but then gets paused, migrated, and continues execution on, say, thread 5.
Afterwards, while task A is performing `mul!(Cs[4], ...)`, a different task B might start on
(the now available) thread 4 and also read and use `Cs[4]`. This would lead to a race
condition because both tasks are mutating the same buffer.
(Note that, in practice, this - most likely 😉 - doesn't happen for the very simple example
above, but you can't rely on it!)

### The quick (and non-recommended) fix

A simple solution for the task-migration issue is to opt-out of dynamic scheduling with
`scheduler=:static` (or `scheduler=StaticScheduler()`). This scheduler statically
assigns tasks to threads upfront without any dynamic rescheduling
(the tasks are sticky and won't migrate).

We'll also need to switch from `nthreads` to `maxthreadid`, since that can be greater than
`nthreads`, as described above.

````julia
# Number of buffer slots to allocate: `Threads.maxthreadid()` where that function exists,
# otherwise fall back to `Threads.nthreads()`.
num_to_store() = isdefined(Threads, :maxthreadid) ? Threads.maxthreadid() : Threads.nthreads()

# Per-thread buffers indexed by `threadid()`; relies on `scheduler = :static`.
function matmulsums_perthread_static(As, Bs)
    N = size(first(As), 1)
    Cs = [Matrix{Float64}(undef, N, N) for _ in 1:num_to_store()]
    # Note!!!
    # This code is *incorrect* if used with a non-static scheduler. This
    # isn't just true in OhMyThreads but also applies to `Threads.@threads`.
    # You *must* use `Threads.@threads :static` or `scheduler = :static` to
    # avoid race-conditions caused by task migration.
    tmap(As, Bs; scheduler = :static) do A, B
        C = Cs[threadid()]
        mul!(C, A, B)
        sum(C)
    end
end

# non uniform workload
As_nu = [rand(256, isqrt(i)^2) for i in 1:768];
Bs_nu = [rand(isqrt(i)^2, 256) for i in 1:768];
res_nu = matmulsums(As_nu, Bs_nu);

res_pt_static = matmulsums_perthread_static(As_nu, Bs_nu)
res_nu ≈ res_pt_static
````

````
true
````

However, this approach has serious shortcomings.

1. It can easily be broken if someone doesn't know that the `scheduler = :static`
option is required for correctness, and removes it in a refactor.
2. It makes the parallel code  non-composable: If we call other multithreaded functions
within the `tmap` or if our parallel `matmulsums_perthread_static` itself gets called
from another parallel region we will likely oversubscribe the Julia threads and get subpar
performance.
3. It can waste memory by creating too many temporary storage slots since `maxthreadid()`
can give an over-estimate of the number of slots needed for the computation.

While the above pattern might be the easiest to migrate to from the incorrect pattern,
we do not recommend it. We instead urge you to use task-local-storages, or the `Channel`
based techniques described below:

### The safe way: `Channel`

Instead of storing the pre-allocated buffers in an array, we can put them into a `Channel`
which internally ensures that parallel access is safe. In this scenario, we simply `take!`
a buffer from the channel whenever we need it and `put!` it back after our computation is
done.

````julia
# Buffer pool based on a `Channel`: `nbuffers` matrices are created up front and handed
# out to the parallel closure calls. Remaining keyword arguments are forwarded to `tmap`.
function matmulsums_perthread_channel(As, Bs; nbuffers = nthreads(), kwargs...)
    N = size(first(As), 1)
    chnl = Channel{Matrix{Float64}}(nbuffers)
    # Fill the pool with `nbuffers` pre-allocated matrices.
    foreach(1:nbuffers) do _
        put!(chnl, Matrix{Float64}(undef, N, N))
    end
    tmap(As, Bs; kwargs...) do A, B
        C = take!(chnl) # borrow a buffer from the pool
        mul!(C, A, B)
        result = sum(C) # read the result before handing the buffer back
        put!(chnl, C)   # return the buffer to the pool
        result
    end
end

res_pt_channel = matmulsums_perthread_channel(As_nu, Bs_nu)
res_nu ≈ res_pt_channel
````

````
true
````

### Benchmark

Let's benchmark the variants above and compare them to the task-local implementation.
We want to look at both `ntasks = nthreads()` and `ntasks > nthreads()`, the latter
of which gives us dynamic load balancing.

````julia
# no load balancing because ntasks == nthreads()
@btime matmulsums_tlv($As_nu, $Bs_nu);
@btime matmulsums_perthread_static($As_nu, $Bs_nu);
@btime matmulsums_perthread_channel($As_nu, $Bs_nu);

# load balancing because ntasks > nthreads()
@btime matmulsums_tlv($As_nu, $Bs_nu; ntasks = 2 * nthreads());
@btime matmulsums_perthread_channel($As_nu, $Bs_nu; ntasks = 2 * nthreads());

@btime matmulsums_tlv($As_nu, $Bs_nu; ntasks = 10 * nthreads());
@btime matmulsums_perthread_channel($As_nu, $Bs_nu; ntasks = 10 * nthreads());
````

````
  212.200 ms (962 allocations: 3.05 MiB)
  212.014 ms (191 allocations: 4.04 MiB)
  211.336 ms (190 allocations: 3.04 MiB)
  168.835 ms (1136 allocations: 6.05 MiB)
  169.097 ms (334 allocations: 3.04 MiB)
  130.469 ms (2530 allocations: 30.17 MiB)
  131.037 ms (1487 allocations: 3.14 MiB)

````

Note that the runtime of `matmulsums_perthread_channel` improves with increasing number
of chunks/tasks (due to load balancing) while the amount of allocated memory doesn't
increase much. Contrast this with the drastic memory increase with `matmulsums_tlv`.

### Another safe way based on `Channel`

Above, we chose to put a limited number of buffers (e.g. `nthreads()`) into the channel
and then spawn many tasks (one per input element). Sometimes it can make sense to flip
things around and put the (many) input elements into a channel and only spawn
a limited number of tasks (e.g. `nthreads()`) with task-local buffers.

````julia
using OhMyThreads: tmapreduce

# Flipped variant: the input indices go into a `Channel`, and `ntasks` workers, each with
# its own buffer, consume them. Which worker handles which index is not fixed.
function matmulsums_perthread_channel_flipped(As, Bs; ntasks = nthreads())
    N = size(first(As), 1)
    # Producer: puts the indices `1:length(As)` into the channel.
    chnl = Channel{Int}(length(As); spawn = true) do chnl
        for i in 1:length(As)
            put!(chnl, i)
        end
    end
    # One closure call per element of `1:ntasks`; the per-worker vectors are joined with `vcat`.
    tmapreduce(vcat, 1:ntasks; chunking=false) do _ # we turn chunking off
        local C = Matrix{Float64}(undef, N, N) # one buffer per worker
        map(chnl) do i # implicitly takes the values from the channel (parallel safe)
            A = As[i]
            B = Bs[i]
            mul!(C, A, B)
            sum(C)
        end
    end
end;
````

Note that one caveat of this approach is that the input → task assignment, and thus the
order of the output, is **non-deterministic**. For this reason, we sort the output to check
for correctness.

````julia
res_channel_flipped = matmulsums_perthread_channel_flipped(As_nu, Bs_nu)
sort(res_nu) ≈ sort(res_channel_flipped)
````

````
true
````

Quick benchmark:

````julia
@btime matmulsums_perthread_channel_flipped($As_nu, $Bs_nu);
@btime matmulsums_perthread_channel_flipped($As_nu, $Bs_nu; ntasks = 2 * nthreads());
@btime matmulsums_perthread_channel_flipped($As_nu, $Bs_nu; ntasks = 10 * nthreads());
````

````
  137.431 ms (133 allocations: 3.04 MiB)
  126.854 ms (211 allocations: 6.06 MiB)
  127.647 ms (836 allocations: 30.29 MiB)

````

In addition, OhMyThreads provides an iterator-wrapper type
[`OhMyThreads.ChannelLike`](@ref) which can be used in place of a `Channel`. If
the number of elements is large this can be more efficient since there is no
need to copy the elements into the `Channel`. Concretely, in the example above,
we could replace `Channel() do .. end` with
`OhMyThreads.ChannelLike(1:length(As))`.

### Bumper.jl (only for the brave)

If you are bold and want to cut down temporary allocations even more you can
give [Bumper.jl](https://github.com/MasonProtter/Bumper.jl) a try. Essentially, it
allows you to *bring your own stacks*, that is, task-local bump allocators which you can
dynamically allocate memory to, and reset them at the end of a code block, just like
Julia's stack.
Be warned though that Bumper.jl is (1) a rather young package with (likely) some bugs
and (2) can easily lead to segfaults when used incorrectly. If you can live with the
risk, Bumper.jl is especially useful for cases where we don't know ahead of time how
large a matrix to pre-allocate, and even more useful if we want to do many intermediate
allocations on the task, not just one. For our example, this isn't the case but let's
nonetheless see how one would use Bumper.jl here.

````julia
using Bumper

# Variant using Bumper.jl: the temporary matrix is requested with `@alloc` inside a
# `@no_escape` block instead of being created with `Matrix{Float64}(undef, N, N)`.
function matmulsums_bumper(As, Bs)
    tmap(As, Bs) do A, B
        @no_escape begin # promising that no memory will escape
            N = size(A, 1)
            C = @alloc(Float64, N, N) # from bump allocator (fake "stack")
            mul!(C, A, B)
            sum(C) # last expression of the block: the sum, not the buffer `C`
        end
    end
end

res_bumper = matmulsums_bumper(As, Bs);
sort(res) ≈ sort(res_bumper)

@btime matmulsums_bumper($As, $Bs);
````

````
  9.439 ms (198 allocations: 39.25 KiB)

````

Note that the benchmark is lying here about the total memory allocation,
because it doesn't show the allocation of the task-local bump allocators themselves
(the reason is that `SlabBuffer` uses `malloc` directly).

---

*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*


================================================
FILE: docs/src/literate/tomarkdown.sh
================================================
#!/usr/bin/env sh
#=
julia --project -t 10 "$0" "$@"
exit
# =#

# Polyglot script: `sh` executes the lines above (which Julia reads as a block comment)
# and re-runs this very file with Julia, forwarding all arguments. `"$0"` and `"$@"`
# are quoted so that paths and arguments containing spaces are passed through intact.
# Everything below is Julia code.
#
# Usage: tomarkdown.sh <folder>...   or   tomarkdown.sh all
# For every given folder (or, with "all", every directory in the current working
# directory) the folder's project is activated and each `.jl` file in it is passed to
# `Literate.markdown` with `execute = true`.

const reporoot = joinpath(@__DIR__, "../../..")
const repourl = "https://github.com/JuliaFolds2/OhMyThreads.jl/blob/main/docs"

using Literate
using Pkg

if isempty(ARGS)
    # Usage error: report on stderr and exit with a non-zero status so that calling
    # scripts / CI can detect the failure.
    println(stderr,
        "Error: Please provide the names of the folders that you want to compile to markdown. " *
        "Alternatively, you can pass \"all\" as the first argument to compile them all.")
    exit(1)
end

# "all" selects every directory in the current working directory.
dirs = first(ARGS) == "all" ? filter(isdir, readdir()) : ARGS
@show dirs

for d in dirs
    println("directory: ", d)
    cd(d) do
        # Each example folder carries its own environment.
        Pkg.activate(".")
        Pkg.resolve()
        Pkg.instantiate()
        for f in filter(endswith(".jl"), readdir())
            Literate.markdown(
                f;
                repo_root_url = repourl,
                execute = true
                # config=Dict("image_formats" => [(MIME"image/png", ".png")])
            )
        end
    end
end


================================================
FILE: docs/src/refs/api.md
================================================
```@meta
CollapsedDocStrings = true
```

# [Public API](@id API)

## Exported

### Macros
```@docs
@tasks
@set
@local
@only_one
@one_by_one
@allow_boxed_captures
@disallow_boxed_captures
@localize
```

### Functions

```@docs
tmapreduce
treduce
tmap
tmap!
tforeach
tcollect
treducemap
```

### Schedulers

```@docs
Scheduler
DynamicScheduler
StaticScheduler
GreedyScheduler
SerialScheduler
```

## Re-exported

|                        |                                                                     |
|------------------------|---------------------------------------------------------------------|
| `OhMyThreads.chunks`   | see [`ChunkSplitters.chunks`](@extref) |
| `OhMyThreads.index_chunks`   | see [`ChunkSplitters.index_chunks`](@extref) |

## Public but not exported

|                        |                                                                     |
|------------------------|---------------------------------------------------------------------|
| `OhMyThreads.@spawn`   | see [`StableTasks.@spawn`](https://github.com/JuliaFolds2/StableTasks.jl) |
| `OhMyThreads.@spawnat` | see [`StableTasks.@spawnat`](https://github.com/JuliaFolds2/StableTasks.jl) |
| `OhMyThreads.@fetch`   | see [`StableTasks.@fetch`](https://github.com/JuliaFolds2/StableTasks.jl) |
| `OhMyThreads.@fetchfrom` | see [`StableTasks.@fetchfrom`](https://github.com/JuliaFolds2/StableTasks.jl) |
| `OhMyThreads.TaskLocalValue`   | see [TaskLocalValues.TaskLocalValue](https://github.com/vchuravy/TaskLocalValues.jl) |
| `OhMyThreads.Split`   | see [`ChunkSplitters.Split`](@extref) |
| `OhMyThreads.Consecutive`   | see [`ChunkSplitters.Consecutive`](@extref) |
| `OhMyThreads.RoundRobin`   | see [`ChunkSplitters.RoundRobin`](@extref) |


```@docs
OhMyThreads.WithTaskLocals
OhMyThreads.promise_task_local
OhMyThreads.ChannelLike
```


================================================
FILE: docs/src/refs/experimental.md
================================================
```@meta
CollapsedDocStrings = true
```

# Experimental

!!! warning
    **Everything on this page is experimental and might be changed or dropped at any point!**

## References

```@autodocs
Modules = [OhMyThreads, OhMyThreads.Experimental]
Public = false
Pages   = ["OhMyThreads.jl", "experimental.jl"]
```


================================================
FILE: docs/src/refs/internal.md
================================================
```@meta
CollapsedDocStrings = true
```

# Internal

!!! warning
    **Everything on this page is internal and might be changed or dropped at any point!**

## References

```@autodocs
Modules = [OhMyThreads, OhMyThreads.Tools]
Public = false
Pages   = ["OhMyThreads.jl", "tools.jl"]
```


================================================
FILE: docs/src/translation.md
================================================
# [Translation Guide](@id TG)

This page tries to give a general overview of how to translate patterns written with the built-in tools of [Base.Threads](https://docs.julialang.org/en/v1/base/multi-threading/) using the [OhMyThreads.jl API](@ref API). Note that this should be seen as a rough guide and (intentionally) isn't supposed to replace a systematic introduction into OhMyThreads.jl.


## Basics

### `@threads`

```julia
# Base.Threads
using Base.Threads: @threads

@threads for i in 1:10
    println(i)
end
```

```julia
# OhMyThreads
using OhMyThreads: @tasks

@tasks for i in 1:10
    println(i)
end

# or
using OhMyThreads: tforeach

tforeach(1:10) do i
    println(i)
end
```

#### `:static` scheduling

```julia
# Base.Threads
using Base.Threads: @threads

@threads :static for i in 1:10
    println(i)
end
```

```julia
# OhMyThreads
using OhMyThreads: @tasks

@tasks for i in 1:10
    @set scheduler=:static
    println(i)
end

# or
using OhMyThreads: tforeach

tforeach(1:10; scheduler=:static) do i
    println(i)
end
```

### `@spawn`

```julia
# Base.Threads
using Base.Threads: @spawn

@sync for i in 1:10
    @spawn println(i)
end
```

```julia
# OhMyThreads
using OhMyThreads: @tasks

@tasks for i in 1:10
    @set chunking=false
    println(i)
end

# or
using OhMyThreads: tforeach

tforeach(1:10; chunking=false) do i
    println(i)
end

# or
using OhMyThreads: @spawn

@sync for i in 1:10
    @spawn println(i)
end
```

## Reduction

No built-in feature in Base.Threads.

```julia
# Base.Threads: basic manual implementation
using Base.Threads: @spawn

data = rand(10)
chunks_itr = Iterators.partition(data, length(data) ÷ nthreads())
tasks = map(chunks_itr) do chunk
    @spawn reduce(+, chunk)
end
reduce(+, fetch.(tasks))
```

```julia
# OhMyThreads
using OhMyThreads: @tasks
data = rand(10)

@tasks for x in data
    @set reducer=+
end

# or
using OhMyThreads: treduce

treduce(+, data)
```

## Mutation

!!! warning
    Parallel mutation of non-local state, like writing to a shared array, can be the source of correctness errors (e.g. race conditions) and big performance issues (e.g. [false sharing](https://en.wikipedia.org/wiki/False_sharing#:~:text=False%20sharing%20is%20an%20inherent,is%20limited%20to%20RAM%20caches.)). You should carefully consider whether this is necessary or whether the use of [thread-safe storage](@ref TSS) is the better option. **We don't recommend using the examples in this section for anything serious!**

```julia
# Base.Threads
using Base.Threads: @threads
data = rand(10)

@threads for i in eachindex(data)
    data[i] = calc(i)
end
```

```julia
# OhMyThreads
using OhMyThreads: @tasks
data = rand(10)

@tasks for i in eachindex(data)
    data[i] = calc(i)
end

# or
using OhMyThreads: tforeach

tforeach(eachindex(data)) do i
    data[i] = calc(i)
end

# or
using OhMyThreads: tmap!

tmap!(data, eachindex(data)) do i
    calc(i)
end
```

## Parallel initialization

!!! warning
    Parallel mutation of non-local state, like writing to a shared array, can be the source of correctness errors (e.g. race conditions) and big performance issues (e.g. [false sharing](https://en.wikipedia.org/wiki/False_sharing#:~:text=False%20sharing%20is%20an%20inherent,is%20limited%20to%20RAM%20caches.)). You should carefully consider whether this is necessary or whether the use of [thread-safe storage](@ref TSS) is the better option. **We don't recommend using the examples in this section for anything serious!**

```julia
# Base.Threads
using Base.Threads: @threads

data = Vector{Float64}(undef, 10)
@threads for i in eachindex(data)
    data[i] = calc(i)
end
```

```julia
# OhMyThreads
using OhMyThreads: @tasks

data = @tasks for i in 1:10
    @set collect=true
    calc(i)
end

# or
using OhMyThreads: tmap

data = tmap(i->calc(i), 1:10)

# or
using OhMyThreads: tcollect

data = tcollect(calc(i) for i in 1:10)
```


================================================
FILE: ext/MarkdownExt.jl
================================================
module MarkdownExt

using Markdown: Markdown, @md_str, term
using OhMyThreads.Implementation: BoxedVariableError

# Runs when this module is loaded: registers an error hint for `BoxedVariableError`
# that prints a Markdown-rendered explanation of how to avoid (or deliberately allow)
# boxed variable captures.
function __init__()
    # `register_error_hint` lives in `Base.Experimental`; only use it if it is defined.
    if isdefined(Base.Experimental, :register_error_hint)
        Base.Experimental.register_error_hint(BoxedVariableError) do io, bve
            # Two blank lines to separate the hint from the error message itself.
            println(io)
            println(io)
            # NOTE: the example below must be valid Julia. Assignment binds looser than
            # `&&`, so the conditional rebinding needs parentheses: `cond && (x = 1)`.
            term(io, md"""
#### Hint
Capturing boxed variables can be not only slow, but also cause surprising and incorrect results.

* If you meant for these variables to be local to each loop iteration and not depend on a variable from an outer scope, you should mark them as `local` inside the closure.
* If you meant to reference a variable from the outer scope, but do not want access to it to be boxed, you can wrap uses of it in a let block, like e.g.
```julia
function foo(x, N)
    rand(Bool) && (x = 1) # This rebinding of x causes it to be boxed ...
    let x = x # ... Unless we localize it here with the let block
        @tasks for i in 1:N
            f(x)
        end
    end
end
```
* OhMyThreads.jl provides a `@localize` macro that automates the above `let` block, i.e. `@localize x f(x)` is the same as `let x=x; f(x) end`
* If these variables are being re-bound inside a `@one_by_one` or `@only_one` block, consider using a mutable `Ref` instead of re-binding the variable.

This error can be bypassed with the `@allow_boxed_captures` macro.
    """)
        end
    end
end


end


================================================
FILE: src/OhMyThreads.jl
================================================
module OhMyThreads

# Bind the StableTasks macros under the same names in this module, so that they are
# reachable as e.g. `OhMyThreads.@spawn` (they are not exported below).
using StableTasks: StableTasks
for mac in Symbol.(["@spawn", "@spawnat", "@fetch", "@fetchfrom"])
    @eval const $mac = getproperty(StableTasks, $(QuoteNode(mac)))
end

# Aliases for ChunkSplitters functionality; only `chunks` and `index_chunks` are exported.
using ChunkSplitters: ChunkSplitters
const index_chunks = ChunkSplitters.index_chunks
const chunks = ChunkSplitters.chunks
const Split = ChunkSplitters.Split
const Consecutive = ChunkSplitters.Consecutive
const RoundRobin = ChunkSplitters.RoundRobin
export chunks, index_chunks

# Alias for `TaskLocalValues.TaskLocalValue` (not exported).
using TaskLocalValues: TaskLocalValues
const TaskLocalValue = TaskLocalValues.TaskLocalValue

using ScopedValues: ScopedValues, ScopedValue, @with

# The order of the includes below is significant: later files may refer to definitions
# made by earlier ones (e.g. `schedulers.jl` defines the `Schedulers` submodule used in
# the `using .Schedulers` statement that follows it).
include("types.jl")
include("functions.jl")
include("macros.jl")

include("tools.jl")
include("schedulers.jl")
using .Schedulers: Scheduler, DynamicScheduler, StaticScheduler, GreedyScheduler,
                   SerialScheduler
include("implementation.jl")
include("experimental.jl")

# Exported API: macros, functions, and scheduler types.
export @tasks, @set, @local, @one_by_one, @only_one, @allow_boxed_captures, @disallow_boxed_captures, @localize
export treduce, tmapreduce, treducemap, tmap, tmap!, tforeach, tcollect
export Scheduler, DynamicScheduler, StaticScheduler, GreedyScheduler, SerialScheduler

end # module OhMyThreads


================================================
FILE: src/experimental.jl
================================================
module Experimental

"""
    @barrier

This can be used inside a `@tasks for ... end` to synchronize `n` parallel tasks.
Specifically, a task can only pass the `@barrier` if `n-1` other tasks have reached it
as well. The value of `n` is determined from `@set ntasks=...`, which
is required if one wants to use `@barrier`.

Because this feature is experimental, it is required to load `@barrier` explicitly, e.g. via
`using OhMyThreads.Experimental: @barrier`.

**WARNING:** It is the responsibility of the user to ensure that the right number of tasks
actually reach the barrier. Otherwise, a **deadlock** can occur. In particular, if the
number of iterations is not a multiple of `n`, the last few iterations (remainder) will be
run by fewer than `n` tasks which will never be able to pass a `@barrier`.

## Example

```julia
using OhMyThreads: @tasks

# works
@tasks for i in 1:20
    @set ntasks = 20

    sleep(i * 0.2)
    println(i, ": before")
    @barrier
    println(i, ": after")
end

# wrong - deadlock!
@tasks for i in 1:22 # niterations % ntasks != 0
    @set ntasks = 20

    println(i, ": before")
    @barrier
    println(i, ": after")
end
```
"""
macro barrier(args...)
    # Expanding this macro directly always raises the error below.
    # NOTE(review): presumably `@tasks` handles `@barrier` occurrences in its body itself,
    # so this definition is only reached outside of a `@tasks` block - confirm in macro_impl.jl.
    error("The @barrier macro may only be used inside of a @tasks block.")
end

end # Experimental


================================================
FILE: src/functions.jl
================================================
"""
    tmapreduce(f, op, A::AbstractArray...;
               [scheduler::Union{Scheduler, Symbol} = :dynamic],
               [outputtype::Type = Any],
               [init])

A multithreaded function like `Base.mapreduce`. Perform a reduction over `A`, applying the
function `f` to each element, and then combining the results with the two-argument
function `op`. If several arrays are passed, they must have the same indices and `f` is
called with one element from each of them.

Note that `op` **must** be an
[associative](https://en.wikipedia.org/wiki/Associative_property) function, in the sense
that `op(a, op(b, c)) ≈ op(op(a, b), c)`. If `op` is not (approximately) associative, you
will get undefined results.

## Example:

```julia
using OhMyThreads: tmapreduce

tmapreduce(√, +, [1, 2, 3, 4, 5])
```

is the parallelized version of `sum(√, [1, 2, 3, 4, 5])`, evaluated, for example, in the form

```julia
(√1 + √2) + (√3 + √4) + √5
```

## Keyword arguments:

- `scheduler::Union{Scheduler, Symbol}` (default `:dynamic`): determines how the computation is divided into parallel tasks and how these are scheduled. See [`Scheduler`](@ref) for more information on the available schedulers.
- `outputtype::Type` (default `Any`): will work as the asserted output type of parallel calculations. We use [StableTasks.jl](https://github.com/JuliaFolds2/StableTasks.jl) to make setting this option unnecessary, but if you experience problems with type stability, you may be able to recover it with this keyword argument.
- `init`: initial value of the reduction. Will be forwarded to `mapreduce` for the task-local sequential parts of the calculation. Because it may be used once per task, `init` should be a neutral element of `op`.

In addition, `tmapreduce` accepts **all keyword arguments that are supported by the selected
scheduler**. They will simply be passed on to the corresponding `Scheduler` constructor. Example:
```julia
tmapreduce(√, +, [1, 2, 3, 4, 5]; chunksize=2, scheduler=:static)
```
However, to avoid ambiguity, this is currently **only supported for `scheduler::Symbol`**
(but not for `scheduler::Scheduler`).
"""
function tmapreduce end

"""
    treducemap(op, f, A::AbstractArray...;
               [scheduler::Union{Scheduler, Symbol} = :dynamic],
               [outputtype::Type = Any],
               [init])

Like `tmapreduce` except that the order of the `f` and `op` arguments is switched. This is
sometimes convenient with `do`-block notation. Perform a reduction over `A`, applying the
function `f` to each element, and then combining the results with the two-argument
function `op`.

Note that `op` **must** be an
[associative](https://en.wikipedia.org/wiki/Associative_property) function, in the sense
that `op(a, op(b, c)) ≈ op(op(a, b), c)`. If `op` is not (approximately) associative, you
will get undefined results.

## Example:

```julia
using OhMyThreads: treducemap

treducemap(+, √, [1, 2, 3, 4, 5])
```

is the parallelized version of `sum(√, [1, 2, 3, 4, 5])`, evaluated, for example, in the form

```julia
(√1 + √2) + (√3 + √4) + √5
```

## Keyword arguments:

- `scheduler::Union{Scheduler, Symbol}` (default `:dynamic`): determines how the computation is divided into parallel tasks and how these are scheduled. See [`Scheduler`](@ref) for more information on the available schedulers.
- `outputtype::Type` (default `Any`): will work as the asserted output type of parallel calculations. We use [StableTasks.jl](https://github.com/JuliaFolds2/StableTasks.jl) to make setting this option unnecessary, but if you experience problems with type stability, you may be able to recover it with this keyword argument.
- `init`: initial value of the reduction. Will be forwarded to `mapreduce` for the task-local sequential parts of the calculation. Because it may be used once per task, `init` should be a neutral element of `op`.

In addition, `treducemap` accepts **all keyword arguments that are supported by the selected
scheduler**. They will simply be passed on to the corresponding `Scheduler` constructor. Example:
```julia
treducemap(+, √, [1, 2, 3, 4, 5]; chunksize=2, scheduler=:static)
```
However, to avoid ambiguity, this is currently **only supported for `scheduler::Symbol`**
(but not for `scheduler::Scheduler`).
"""
function treducemap end

"""
    treduce(op, A::AbstractArray...;
            [scheduler::Union{Scheduler, Symbol} = :dynamic],
            [outputtype::Type = Any],
            [init])

A multithreaded function like `Base.reduce`. Perform a reduction over `A` using the
two-argument function `op`.

Note that `op` **must** be an
[associative](https://en.wikipedia.org/wiki/Associative_property) function, in the sense
that `op(a, op(b, c)) ≈ op(op(a, b), c)`. If `op` is not (approximately) associative, you
will get undefined results.

## Example:

```julia
using OhMyThreads: treduce

treduce(+, [1, 2, 3, 4, 5])
```

is the parallelized version of `sum([1, 2, 3, 4, 5])`, evaluated, for example, in the form

```julia
(1 + 2) + (3 + 4) + 5
```

## Keyword arguments:

- `scheduler::Union{Scheduler, Symbol}` (default `:dynamic`): determines how the computation is divided into parallel tasks and how these are scheduled. See [`Scheduler`](@ref) for more information on the available schedulers.
- `outputtype::Type` (default `Any`): will work as the asserted output type of parallel calculations. We use [StableTasks.jl](https://github.com/JuliaFolds2/StableTasks.jl) to make setting this option unnecessary, but if you experience problems with type stability, you may be able to recover it with this keyword argument.
- `init`: initial value of the reduction. Will be forwarded to `mapreduce` for the task-local sequential parts of the calculation. Because it may be used once per task, `init` should be a neutral element of `op`.

In addition, `treduce` accepts **all keyword arguments that are supported by the selected
scheduler**. They will simply be passed on to the corresponding `Scheduler` constructor. Example:
```julia
treduce(+, [1, 2, 3, 4, 5]; chunksize=2, scheduler=:static)
```
However, to avoid ambiguity, this is currently **only supported for `scheduler::Symbol`**
(but not for `scheduler::Scheduler`).
"""
function treduce end

"""
    tforeach(f, A::AbstractArray...;
             [scheduler::Union{Scheduler, Symbol} = :dynamic]) :: Nothing

A multithreaded function like `Base.foreach`. Apply `f` to each element of `A` on
multiple parallel tasks, and return `nothing`. That is, it is the parallel equivalent of

```julia
for x in A
    f(x)
end
```

## Example:

```julia
using OhMyThreads: tforeach

tforeach(1:10) do i
    println(i^2)
end
```

## Keyword arguments:

- `scheduler::Union{Scheduler, Symbol}` (default `:dynamic`): determines how the computation is divided into parallel tasks and how these are scheduled. See [`Scheduler`](@ref) for more information on the available schedulers.

In addition, `tforeach` accepts **all keyword arguments that are supported by the selected
scheduler**. They will simply be passed on to the corresponding `Scheduler` constructor. Example:
```julia
tforeach(1:10; chunksize=2, scheduler=:static) do i
    println(i^2)
end
```
However, to avoid ambiguity, this is currently **only supported for `scheduler::Symbol`**
(but not for `scheduler::Scheduler`).
"""
function tforeach end

"""
    tmap(f, [OutputElementType], A::AbstractArray...;
         [scheduler::Union{Scheduler, Symbol} = :dynamic])

A multithreaded function like `Base.map`. Creates a new container `similar` to `A` and fills
it in parallel such that the `i`th element is equal to `f(A[i])`.

The optional argument `OutputElementType` will select a specific element type for the
returned container, and will generally incur fewer allocations than the version where
`OutputElementType` is not specified.

Without `OutputElementType`, the `GreedyScheduler` is not supported and, if chunking is
enabled, only `split = Consecutive()` is allowed.

## Example:

```julia
using OhMyThreads: tmap

tmap(sin, 1:10)
```

## Keyword arguments:

- `scheduler::Union{Scheduler, Symbol}` (default `:dynamic`): determines how the computation is divided into parallel tasks and how these are scheduled. See [`Scheduler`](@ref) for more information on the available schedulers.

In addition, `tmap` accepts **all keyword arguments that are supported by the selected
scheduler**. They will simply be passed on to the corresponding `Scheduler` constructor. Example:
```julia
tmap(sin, 1:10; chunksize=2, scheduler=:static)
```
However, to avoid ambiguity, this is currently **only supported for `scheduler::Symbol`**
(but not for `scheduler::Scheduler`).
"""
function tmap end

"""
    tmap!(f, out, A::AbstractArray...;
          [scheduler::Union{Scheduler, Symbol} = :dynamic])

A multithreaded function like `Base.map!`. In parallel on multiple tasks, this function
sets `out[i] = f(A[i])` for each index `i` of `A` and `out`, and returns `out`.
The indices of `out` must match the indices of the input arrays.

## Example:

```julia
using OhMyThreads: tmap!

out = zeros(10)
tmap!(sin, out, 1:10)
```

## Keyword arguments:

- `scheduler::Union{Scheduler, Symbol}` (default `:dynamic`): determines how the computation is divided into parallel tasks and how these are scheduled. See [`Scheduler`](@ref) for more information on the available schedulers.

In addition, `tmap!` accepts **all keyword arguments that are supported by the selected
scheduler**. They will simply be passed on to the corresponding `Scheduler` constructor.
However, to avoid ambiguity, this is currently **only supported for `scheduler::Symbol`**
(but not for `scheduler::Scheduler`).
"""
function tmap! end

"""
    tcollect([OutputElementType], gen::Union{AbstractArray, Generator{<:AbstractArray}};
             [scheduler::Union{Scheduler, Symbol} = :dynamic])

A multithreaded function like `Base.collect`. Essentially just calls `tmap` on the
generator function and inputs (or `tmap(identity, ...)` if the input is not a generator).

The optional argument `OutputElementType` will select a specific element type for the
returned container, and will generally incur fewer allocations than the version where
`OutputElementType` is not specified.

## Example:

```julia
using OhMyThreads: tcollect

tcollect(sin(i) for i in 1:10)
```

## Keyword arguments:

- `scheduler::Union{Scheduler, Symbol}` (default `:dynamic`): determines how the computation is divided into parallel tasks and how these are scheduled. See [`Scheduler`](@ref) for more information on the available schedulers.

In addition, `tcollect` accepts **all keyword arguments that are supported by the selected
scheduler**. They will simply be passed on to the corresponding `Scheduler` constructor. Example:
```julia
tcollect(sin(i) for i in 1:10; chunksize=2, scheduler=:static)
```
However, to avoid ambiguity, this is currently **only supported for `scheduler::Symbol`**
(but not for `scheduler::Scheduler`).
"""
function tcollect end


================================================
FILE: src/implementation.jl
================================================
module Implementation

import OhMyThreads: treduce, tmapreduce, treducemap, tforeach, tmap, tmap!, tcollect
using OhMyThreads: @spawn, @spawnat, WithTaskLocals, promise_task_local, ChannelLike,
                   allowing_boxed_captures
using OhMyThreads.Tools: nthtid
using OhMyThreads: Scheduler,
                   DynamicScheduler, StaticScheduler, GreedyScheduler,
                   SerialScheduler
using OhMyThreads.Schedulers: chunksplitter_mode, chunking_enabled,
                              nchunks, chunksize, chunksplit, minchunksize, has_chunksplit,
                              has_minchunksize, chunkingargs_to_kwargs,
                              chunking_mode, ChunkingMode, NoChunking,
                              FixedSize, FixedCount, scheduler_from_symbol, NotGiven,
                              isgiven, threadpool as get_threadpool
using Base: @propagate_inbounds
using Base.Threads: nthreads, @threads
using BangBang: append!!
using ChunkSplitters: ChunkSplitters, index_chunks, Consecutive
using ChunkSplitters.Internals: AbstractChunks, IndexChunks

const MaybeScheduler = Union{NotGiven, Scheduler, Symbol, Val}

include("macro_impl.jl")

# Split `arg` into index chunks according to the chunking settings of `sched`.
# Chunking must be enabled for `sched`. The result is type-asserted to a concrete
# `IndexChunks` type derived from the scheduler's chunking mode.
@inline function _index_chunks(sched, arg)
    mode = chunking_mode(sched)
    @assert chunking_enabled(sched)
    split_kwargs = chunkingargs_to_kwargs(sched, arg)
    chunked = index_chunks(arg; split_kwargs...)
    return chunked::IndexChunks{typeof(arg), chunksplitter_mode(mode)}
end

# Turn the user-facing `scheduler` argument into a concrete scheduler object.
#   * a `Scheduler` instance is returned as is (extra keyword arguments are rejected),
#   * a `Symbol` is resolved via `scheduler_from_symbol`, forwarding `kwargs`,
#   * anything else (e.g. `NotGiven()`) yields a `DynamicScheduler` built from `kwargs`.
function _scheduler_from_userinput(scheduler::MaybeScheduler; kwargs...)
    if scheduler isa Scheduler
        isempty(kwargs) || scheduler_and_kwargs_err(; kwargs...)
        return scheduler
    end
    scheduler isa Symbol && return scheduler_from_symbol(scheduler; kwargs...)
    # default fallback
    return DynamicScheduler(; kwargs...)
end

# Reject chunking-related keyword arguments when the input already is a `chunks` /
# `index_chunks` object. Throws an `ErrorException` if any of `ntasks`, `nchunks`,
# `chunksize`, or `split` is present, or if `chunking == true`; returns `nothing` otherwise.
function _check_chunks_incompatible_kwargs(; kwargs...)
    given = keys(kwargs)
    if any(in(given), (:ntasks, :nchunks, :chunksize, :split))
        error("You've provided `chunks` or `index_chunks` as input and, at the same time, " *
              "chunking related keyword arguments (e.g. `ntasks`, `chunksize`, or `split`). " *
              "This isn't supported. " *
              "Set the chunking options directly in the `chunks` or `index_chunks` call or drop the latter.")
    end
    if :chunking in given && kwargs[:chunking] == true
        error("You've provided `chunks` or `index_chunks` as input and, at the same time, " *
              "have set chunking=true. This isn't supported.")
    end
    return nothing
end

# Return `true` if processing `coll` with `scheduler` would involve more than one chunk
# (i.e. more than one parallel task). Callers use this to fall back to a serial
# code path when there is nothing to parallelize.
#
# Throws an `ArgumentError` for an unknown chunking mode.
function has_multiple_chunks(scheduler, coll)
    C = chunking_mode(scheduler)
    if C == NoChunking || coll isa Union{AbstractChunks, ChunkSplitters.Internals.Enumerate}
        # Every element (or every pre-made chunk) is its own unit of work.
        length(coll) > 1
    elseif C == FixedCount
        if !has_minchunksize(scheduler)
            mcs = 1
        else
            # Clamp the minimal chunk size to `1:length(coll)` to avoid dividing by zero.
            mcs = max(min(minchunksize(scheduler), length(coll)), 1)
        end
        min(length(coll) ÷ mcs, nchunks(scheduler)) > 1
    elseif C == FixedSize
        # Chunks of a fixed size cover the collection with `cld(length, size)` chunks
        # (the last one may be shorter), so ceiling division is required here. Floor
        # division would, e.g., report a single chunk for 3 elements with chunksize 2.
        cld(length(coll), chunksize(scheduler)) > 1
    else
        throw(ArgumentError("Unknown chunking mode: $C."))
    end
end

# Entry point of the threaded map-reduce. It is marked `@inline` while the actual
# threaded implementation is called with `@noinline`, so that the cheap serial fallback
# (plain `mapreduce`) does not have to pay for the threaded code path.
@inline function tmapreduce(f, op, Arrs...;
        scheduler::MaybeScheduler = NotGiven(),
        outputtype::Type = Any,
        init = NotGiven(),
        kwargs...)
    # Only forward `init` to `mapreduce` if the user actually provided it.
    mapreduce_kwargs = if isgiven(init)
        (; init)
    else
        (;)
    end
    _scheduler = _scheduler_from_userinput(scheduler; kwargs...)

    A = first(Arrs)
    if A isa Union{AbstractChunks, ChunkSplitters.Internals.Enumerate}
        _check_chunks_incompatible_kwargs(; kwargs...)
    end

    run_serially = _scheduler isa SerialScheduler || !has_multiple_chunks(_scheduler, A)
    if run_serially
        # serial scheduler, or at most one chunk (this includes empty input
        # collections) → behave exactly like Base.mapreduce
        mapreduce(f, op, Arrs...; mapreduce_kwargs...)
    else
        @noinline _tmapreduce(f, op, Arrs, outputtype, _scheduler, mapreduce_kwargs)
    end
end

# Throw an `ArgumentError` that lists the offending keyword names. Used when a scheduler
# object and scheduler keyword arguments are passed at the same time.
@noinline function scheduler_and_kwargs_err(; kwargs...)
    names = join(keys(kwargs), ", ")
    message = "Providing an explicit scheduler as well as direct keyword arguments (e.g. $(names)) is currently not supported."
    throw(ArgumentError(message))
end

# Same as `tmapreduce`, but with the reducing operator `op` as the first argument.
function treducemap(op, f, A...; kwargs...)
    return tmapreduce(f, op, A...; kwargs...)
end


# DynamicScheduler: AbstractArray/Generic
#
# Spawns one task per chunk (chunking enabled) or one task per element (chunking
# disabled) on the scheduler's threadpool and reduces the fetched task results with `op`.
# The result is converted to / asserted as `OutputType` via the return-type annotation.
function _tmapreduce(f,
        op,
        Arrs,
        ::Type{OutputType},
        scheduler::DynamicScheduler,
        mapreduce_kwargs)::OutputType where {OutputType}
    threadpool = get_threadpool(scheduler)
    check_all_have_same_indices(Arrs)
    throw_if_boxed_captures(f, op)
    if chunking_enabled(scheduler)
        # One task per index chunk; each task runs a sequential `mapreduce` over views of
        # all input arrays restricted to its chunk.
        tasks = map(_index_chunks(scheduler, first(Arrs))) do inds

            args = map(A -> view(A, inds), Arrs)
            # Note, calling `promise_task_local` here is only safe because we're assuming that
            # Base.mapreduce isn't going to magically try to do multithreading on us...
            @spawn threadpool mapreduce(promise_task_local(f), promise_task_local(op),
                                        args...; $mapreduce_kwargs...)
        end
        # `mapreduce_kwargs` (i.e. `init`) was already applied inside every task, so it is
        # not passed again when combining the per-chunk results.
        mapreduce(fetch, promise_task_local(op), tasks)
    else
        # One task per element; the tasks only apply `f`, the reduction (including
        # `init`, if given) happens here on the calling task.
        tasks = map(eachindex(first(Arrs))) do i
            args = map(A -> @inbounds(A[i]), Arrs)
            @spawn threadpool promise_task_local(f)(args...)
        end
        mapreduce(fetch, promise_task_local(op), tasks; mapreduce_kwargs...)
    end
end

# DynamicScheduler: AbstractChunks
#
# The input already is a single `chunks` / `index_chunks` object (or an enumerated one):
# spawn one task per chunk, call `f` on the chunk itself, and reduce the task results
# with `op` (including `init`, if given) on the calling task.
function _tmapreduce(f,
        op,
        Arrs::Union{Tuple{AbstractChunks{T}}, Tuple{ChunkSplitters.Internals.Enumerate{T}}},
        ::Type{OutputType},
        scheduler::DynamicScheduler,
        mapreduce_kwargs)::OutputType where {OutputType, T}
    threadpool = get_threadpool(scheduler)
    throw_if_boxed_captures(f, op)
    # `only` asserts that exactly one chunks object was passed.
    tasks = map(only(Arrs)) do idcs
        @spawn threadpool promise_task_local(f)(idcs)
    end
    mapreduce(fetch, promise_task_local(op), tasks; mapreduce_kwargs...)
end

# StaticScheduler: AbstractArray/Generic
#
# Like the DynamicScheduler method, but every task is spawned with `@spawnat` on a thread
# id chosen round-robin: the `c`-th unit of work uses `nthtid(mod1(c, nt))`, where `nt`
# is the number of threads.
# NOTE(review): `nthtid` is defined elsewhere; presumably it maps a 1-based slot number
# to a thread id - confirm in `OhMyThreads.Tools`.
function _tmapreduce(f,
        op,
        Arrs,
        ::Type{OutputType},
        scheduler::StaticScheduler,
        mapreduce_kwargs)::OutputType where {OutputType}
    nt = nthreads()
    check_all_have_same_indices(Arrs)
    throw_if_boxed_captures(f, op)
    if chunking_enabled(scheduler)
        # One task per index chunk; each task reduces its chunk sequentially.
        tasks = map(enumerate(_index_chunks(scheduler, first(Arrs)))) do (c, inds)
            tid = @inbounds nthtid(mod1(c, nt))
            args = map(A -> view(A, inds), Arrs)
            # Note, calling `promise_task_local` here is only safe because we're assuming that
            # Base.mapreduce isn't going to magically try to do multithreading on us...
            @spawnat tid mapreduce(promise_task_local(f), promise_task_local(op), args...;
                mapreduce_kwargs...)
        end
        # Note, calling `promise_task_local` here is only safe because we're assuming that
        # Base.mapreduce isn't going to magically try to do multithreading on us...
        # (`init` was already applied per chunk, hence no `mapreduce_kwargs` here.)
        mapreduce(fetch, promise_task_local(op), tasks)
    else
        # One task per element; `c` is the running counter, `i` the actual index.
        tasks = map(enumerate(eachindex(first(Arrs)))) do (c, i)
            tid = @inbounds nthtid(mod1(c, nt))
            args = map(A -> @inbounds(A[i]), Arrs)
            @spawnat tid promise_task_local(f)(args...)
        end
        # Note, calling `promise_task_local` here is only safe because we're assuming that
        # Base.mapreduce isn't going to magically try to do multithreading on us...
        mapreduce(fetch, promise_task_local(op), tasks; mapreduce_kwargs...)
    end
end

# StaticScheduler: AbstractChunks
#
# The input already is a single `chunks` / `index_chunks` object: spawn one task per
# chunk, with thread ids chosen round-robin via `nthtid(mod1(c, nt))`, call `f` on the
# chunk itself, and reduce the task results with `op` on the calling task.
function _tmapreduce(f,
        op,
        Arrs::Tuple{AbstractChunks{T}}, # we don't support multiple chunks for now
        ::Type{OutputType},
        scheduler::StaticScheduler,
        mapreduce_kwargs)::OutputType where {OutputType, T}
    # With a single-element tuple there is nothing to compare, so this check always passes.
    check_all_have_same_indices(Arrs)
    throw_if_boxed_captures(f, op)
    chnks = only(Arrs)
    nt = nthreads()
    tasks = map(enumerate(chnks)) do (c, idcs)
        tid = @inbounds nthtid(mod1(c, nt))
        # Note, calling `promise_task_local` here is only safe because we're assuming that
        # Base.mapreduce isn't going to magically try to do multithreading on us...
        @spawnat tid promise_task_local(f)(idcs)
    end
    # Note, calling `promise_task_local` here is only safe because we're assuming that
    # Base.mapreduce isn't going to magically try to do multithreading on us...
    mapreduce(fetch, promise_task_local(op), tasks; mapreduce_kwargs...)
end

# Block until task `t` has finished without rethrowing the task's exception.
# NOTE: once v1.12 releases we should switch this to wait(t; throw=false)
function wait_nothrow(t)
    return Base._wait(t)
end


"""
    empty_collection_error(task)

Check if a task failed due to an empty collection error.

The check inspects `task.result`: before Julia 1.11 it looks for a `MethodError` of
`Base.mapreduce_empty`, from 1.11 on for the `ArgumentError` with Base's
"reducing over an empty collection" message.
"""
function empty_collection_error(task)
    failure = task.result
    @static if VERSION < v"1.11.0-"
        return failure isa MethodError && failure.f == Base.mapreduce_empty
    else
        return failure isa ArgumentError &&
               failure.msg ==
               "reducing over an empty collection is not allowed; consider supplying `init` to the reducer"
    end
end

# GreedyScheduler w/o chunking
#
# All elements are pushed into a `Channel` by a producer task; up to `scheduler.ntasks`
# consumer tasks each run a `mapreduce` over that channel, i.e. they take elements from
# the channel until it is exhausted. The per-task results are then reduced with `op`.
function _tmapreduce(f,
        op,
        Arrs,
        ::Type{OutputType},
        scheduler::GreedyScheduler{NoChunking},
        mapreduce_kwargs)::OutputType where {OutputType}
    ntasks_desired = scheduler.ntasks
    if Base.IteratorSize(first(Arrs)) isa Base.SizeUnknown
        # Length is unknown: use the requested number of tasks and a channel of size 0.
        ntasks = ntasks_desired
        ch_len = 0
    else
        check_all_have_same_indices(Arrs)
        # Never spawn more tasks than there are elements.
        ntasks = min(length(first(Arrs)), ntasks_desired)
        ch_len = length(first(Arrs))
    end
    throw_if_boxed_captures(f, op)
    # TODO: Use ChannelLike for iterators that support it. Dispatch on IndexLinear?
    # Producer: feeds one tuple of elements (one per input collection) at a time.
    ch = Channel{Tuple{eltype.(Arrs)...}}(ch_len; spawn = true) do ch
        for args in zip(Arrs...)
            put!(ch, args)
        end
    end
    tasks = map(1:ntasks) do _
        # Note, calling `promise_task_local` here is only safe because we're assuming that
        # Base.mapreduce isn't going to magically try to do multithreading on us...
        @spawn mapreduce(promise_task_local(op), ch; mapreduce_kwargs...) do args
            promise_task_local(f)(args...)
        end
    end
    # Doing this because of https://github.com/JuliaFolds2/OhMyThreads.jl/issues/82
    # The idea is that if the channel gets fully consumed before a task gets started up,
    # then if the user does not supply an `init` kwarg, we'll get an error.
    # Current way of dealing with this is just filtering out `mapreduce_empty` method
    # errors. This may not be the most stable way of dealing with things, e.g. if the
    # name of the function throwing the error changes this could break, so long term
    # we may want to try a different design.
    filtered_tasks = filter(tasks) do stabletask
        task = stabletask.t
        # Wait for completion without rethrowing, so the failure can be inspected.
        istaskdone(task) || wait_nothrow(task)
        if empty_collection_error(task)
            false
        else
            true
        end
    end
    # Note, calling `promise_task_local` here is only safe because we're assuming that
    # Base.mapreduce isn't going to magically try to do multithreading on us...
    mapreduce(fetch, promise_task_local(op), filtered_tasks; mapreduce_kwargs...)
end

# GreedyScheduler w/ chunking
#
# The input is split into index chunks which are served through a `ChannelLike`; up to
# `scheduler.ntasks` tasks each run a `mapreduce` over it, i.e. they take chunks until
# none are left, reducing every chunk sequentially. The per-task results are then
# reduced with `op`.
#
# Throws an `ArgumentError` for iterators of unknown size, since these cannot be chunked.
function _tmapreduce(f,
        op,
        Arrs,
        ::Type{OutputType},
        scheduler::GreedyScheduler,
        mapreduce_kwargs)::OutputType where {OutputType}
    if Base.IteratorSize(first(Arrs)) isa Base.SizeUnknown
        throw(ArgumentError("SizeUnknown iterators in combination with a greedy scheduler and chunking are currently not supported."))
    end
    check_all_have_same_indices(Arrs)
    throw_if_boxed_captures(f, op)
    chnks = _index_chunks(scheduler, first(Arrs))
    ntasks_desired = scheduler.ntasks
    # Never spawn more tasks than there are chunks.
    ntasks = min(length(chnks), ntasks_desired)

    # ChunkSplitters.IndexChunks support everything needed for ChannelLike
    ch = ChannelLike(chnks)

    tasks = map(1:ntasks) do _
        # Note, calling `promise_task_local` here is only safe because we're assuming that
        # Base.mapreduce isn't going to magically try to do multithreading on us...
        @spawn mapreduce(promise_task_local(op), ch; mapreduce_kwargs...) do inds
            args = map(A -> view(A, inds), Arrs)
            mapreduce(promise_task_local(f), promise_task_local(op), args...)
        end
    end
    # Doing this because of https://github.com/JuliaFolds2/OhMyThreads.jl/issues/82
    # The idea is that if the channel gets fully consumed before a task gets started up,
    # then if the user does not supply an `init` kwarg, we'll get an error.
    # Current way of dealing with this is just filtering out `mapreduce_empty` method
    # errors. This may not be the most stable way of dealing with things, e.g. if the
    # name of the function throwing the error changes this could break, so long term
    # we may want to try a different design.
    filtered_tasks = filter(tasks) do stabletask
        task = stabletask.t
        # Wait for completion without rethrowing, so the failure can be inspected.
        istaskdone(task) || wait_nothrow(task)
        !empty_collection_error(task)
    end
    # Note, calling `promise_task_local` here is only safe because we're assuming that
    # Base.mapreduce isn't going to magically try to do multithreading on us...
    mapreduce(fetch, promise_task_local(op), filtered_tasks; mapreduce_kwargs...)
end

# Verify that every collection in `Arrs` has the same `eachindex` as the first one.
# Throws an `ErrorException` on the first mismatch and returns `nothing` otherwise.
function check_all_have_same_indices(Arrs)
    reference = first(Arrs)
    for other in Arrs[2:end]
        if eachindex(reference) != eachindex(other)
            error("The indices of the input arrays must match the indices of the output array.")
        end
    end
    return nothing
end

# Exception thrown when a function passed to a threaded operation captures boxed
# variables; `vars` holds the names of the offending captured fields.
struct BoxedVariableError <: Exception
    vars::Vector{Symbol}
end

# Print the error message: the variable names (in red, if the IO supports color),
# a link to the documentation, and any registered error hints.
function Base.showerror(io::IO, bve::BoxedVariableError)
    plural = length(bve.vars) > 1 ? "s" : ""
    print(io, "Attempted to capture and modify outer local variable", plural, ": ")
    printstyled(io, join(bve.vars, ", "); color=:red)
    print(io, "\n\nSee https://juliafolds2.github.io/OhMyThreads.jl/stable/literate/boxing/boxing/ for a fuller explanation.")
    # `show_error_hints` is not available on all Julia versions.
    if isdefined(Base.Experimental, :show_error_hints)
        Base.Experimental.show_error_hints(io, bve)
    end
end

# Throw a `BoxedVariableError` if `f` (or a function captured by `f`, recursively) has a
# field of type `Core.Box`. Does nothing if `allowing_boxed_captures[]` is `true`.
function throw_if_boxed_captures(f)
    allowing_boxed_captures[] && return nothing
    FT = typeof(f)
    nf = fieldcount(FT)
    if any(i -> fieldtype(FT, i) <: Core.Box, 1:nf)
        boxed = [fieldname(FT, i) for i in 1:nf if fieldtype(FT, i) <: Core.Box]
        throw(BoxedVariableError(boxed))
    end
    for i in 1:nf
        # recurse into nested captured functions.
        fieldtype(FT, i) <: Function || continue
        inner = getfield(f, i)
        # don't recurse into self!
        inner === f && continue
        throw_if_boxed_captures(inner)
    end
end

# Check several functions in turn, in argument order.
function throw_if_boxed_captures(f, fs...)
    foreach(throw_if_boxed_captures, (f, fs...))
end

#-------------------------------------------------------------

# Threaded reduction: a `tmapreduce` with `identity` as the mapping function.
treduce(op, A...; kwargs...) = tmapreduce(identity, op, A...; kwargs...)

#-------------------------------------------------------------

# Threaded `foreach`: run `f` on every element via `tmapreduce` and discard the results.
# The reduction starts from `init = nothing` and always keeps its left argument.
function tforeach(f, A...; kwargs...)::Nothing
    keep_left = (l, r) -> l
    return tmapreduce(f, keep_left, A...; kwargs..., init = nothing, outputtype = Nothing)
end

#-------------------------------------------------------------

# Generic fallback: nothing to unwrap, just apply `g` to `f`.
maybe_rewrap(g::G, f::F) where {G, F} = g(f)

"""
    maybe_rewrap(g, f)

takes a closure `g(f)` and if `f` is a `WithTaskLocals`, we're going
to unwrap `f` and delegate its `TaskLocalValues` to `g`.

This should always be equivalent to just calling `g(f)`.
"""
function maybe_rewrap(g::G, f::WithTaskLocals{F}) where {G, F}
    inner = f.inner_func
    locals = f.tasklocals
    # Build a new wrapper that first materializes the inner function from the task-local
    # values and then hands it to `g`.
    return WithTaskLocals(vals -> g(inner(vals)), locals)
end

#------------------------------------------------------------

# `tmap` with an explicit output element type `T`: allocate the output with
# `similar(A, T)` and fill it in parallel via `tmap!`.
function tmap(f, ::Type{T}, A::AbstractArray, _Arrs::AbstractArray...; kwargs...) where {T}
    out = similar(A, T)
    return tmap!(f, out, A, _Arrs...; kwargs...)
end

# `tmap` without an explicit output element type.
#
# Resolves the scheduler from `scheduler`/`kwargs`, validates it (no `GreedyScheduler`,
# only `Consecutive()` splitting), and dispatches to the matching `_tmap` method. For a
# `SerialScheduler` or an empty input it falls back to `Base.map`.
#
# Throws an `ErrorException` for unsupported scheduler configurations or if chunking
# keyword arguments are combined with a `chunks`/`index_chunks` input.
function tmap(f,
        A::Union{AbstractArray, AbstractChunks, ChunkSplitters.Internals.Enumerate},
        _Arrs::AbstractArray...;
        scheduler::MaybeScheduler = NotGiven(),
        kwargs...)
    _scheduler = _scheduler_from_userinput(scheduler; kwargs...)

    if _scheduler isa GreedyScheduler
        error("Greedy scheduler isn't supported with `tmap` unless you provide an `OutputElementType` argument, since the greedy schedule requires a commutative reducing operator.")
    end
    if chunking_enabled(_scheduler) && has_chunksplit(_scheduler) &&
       chunksplit(_scheduler) != Consecutive()
        error("Only `split == Consecutive()` is supported because the parallel operation isn't commutative. (Scheduler: $_scheduler)")
    end
    if (A isa AbstractChunks || A isa ChunkSplitters.Internals.Enumerate)
        _check_chunks_incompatible_kwargs(; kwargs...)
        # The input is already chunked, so the scheduler itself must not chunk again.
        if chunking_enabled(_scheduler)
            if _scheduler isa DynamicScheduler
                _scheduler = DynamicScheduler(;
                    threadpool = get_threadpool(_scheduler),
                    chunking = false)
            elseif _scheduler isa StaticScheduler
                _scheduler = StaticScheduler(; chunking = false)
            else
                error("Can't disable chunking for this scheduler?! Shouldn't be reached.",
                    _scheduler)
            end
        end
    end

    Arrs = (A, _Arrs...)
    if _scheduler isa SerialScheduler || isempty(A)
        # serial scheduler or empty input collection → align with Base.map behavior.
        # `kwargs` are scheduler options that have already been consumed above; they must
        # not be forwarded because `Base.map` does not accept keyword arguments.
        map(f, Arrs...)
    else
        check_all_have_same_indices(Arrs)
        @noinline _tmap(_scheduler, f, A, _Arrs...)
    end
end

# w/o chunking (DynamicScheduler{NoChunking}): AbstractArray
#
# Spawn one task per index on the scheduler's threadpool, fetch all results, and give
# the result the shape of `A`.
function _tmap(scheduler::DynamicScheduler{NoChunking},
        f,
        A::AbstractArray,
        _Arrs::AbstractArray...)
    threadpool = get_threadpool(scheduler)
    inputs = (A, _Arrs...)
    throw_if_boxed_captures(f)
    tasks = map(eachindex(A)) do idx
        @spawn threadpool begin
            # one element from each input array
            elems = map(arr -> arr[idx], inputs)
            promise_task_local(f)(elems...)
        end
    end
    results = map(fetch, tasks)
    return reshape(results, size(A)...)
end

# w/o chunking (DynamicScheduler{NoChunking}): AbstractChunks
#
# Spawn one task per chunk on the scheduler's threadpool; `f` receives the chunk itself.
function _tmap(scheduler::DynamicScheduler{NoChunking},
        f,
        A::Union{AbstractChunks, ChunkSplitters.Internals.Enumerate},
        _Arrs::AbstractArray...)
    threadpool = get_threadpool(scheduler)
    throw_if_boxed_captures(f)
    tasks = map(A) do chunk
        @spawn threadpool promise_task_local(f)(chunk)
    end
    return map(fetch, tasks)
end

# w/o chunking (StaticScheduler{NoChunking}): AbstractChunks
#
# Spawn one task per chunk with `@spawnat`, choosing the thread id round-robin via
# `nthtid(mod1(counter, nthreads()))`; `f` receives the chunk itself.
function _tmap(scheduler::StaticScheduler{NoChunking},
        f,
        A::AbstractChunks,
        _Arrs::AbstractArray...)
    nt = nthreads()
    throw_if_boxed_captures(f)
    tasks = map(enumerate(A)) do (counter, chunk)
        tid = @inbounds nthtid(mod1(counter, nt))
        @spawnat tid promise_task_local(f)(chunk)
    end
    return map(fetch, tasks)
end

# w/o chunking (StaticScheduler{NoChunking}): AbstractArray
#
# Spawn one task per index with `@spawnat`, choosing the thread id round-robin via
# `nthtid(mod1(c, nt))`, fetch all results, and give the result the shape of `A`.
function _tmap(scheduler::StaticScheduler{NoChunking},
        f,
        A::AbstractArray,
        _Arrs::AbstractArray...;)
    Arrs = (A, _Arrs...)
    nt = nthreads()
    throw_if_boxed_captures(f)
    # Iterate over the *indices* of `A` (`c` is the running counter, `i` the index).
    # Enumerating `A` itself would yield its elements, which must not be used to index
    # into the input arrays.
    tasks = map(enumerate(eachindex(A))) do (c, i)
        tid = @inbounds nthtid(mod1(c, nt))
        @spawnat tid begin
            args = map(A -> A[i], Arrs)
            promise_task_local(f)(args...)
        end
    end
    v = map(fetch, tasks)
    reshape(v, size(A)...)
end

# w/ chunking
#
# Split the indices of `A` into chunks, map `f` over views of every chunk, and
# concatenate the per-chunk results (in order) with `append!!` through `tmapreduce`.
# The result is given the shape of `A`.
function _tmap(scheduler::Scheduler,
        f,
        A::AbstractArray,
        _Arrs::AbstractArray...)
    inputs = (A, _Arrs...)
    index_ranges = collect(_index_chunks(scheduler, A))
    chunk_mapper = maybe_rewrap(f) do g
        function map_chunk(inds)
            # views of the current chunk, one per input array
            views = map(arr -> @view(arr[inds]), inputs)
            return map(g, views...)
        end
    end
    combined = tmapreduce(chunk_mapper, append!!, index_ranges; scheduler)
    return reshape(combined, size(A)...)
end

# Threaded `map!`: store `f` applied to the `i`-th elements of the input arrays in
# `out[i]` and return `out`.
#
# The scheduler is resolved from `scheduler`/`kwargs`. With a `SerialScheduler` this is
# a plain `Base.map!`; otherwise the work is distributed with `tforeach` over
# `eachindex(out)`.
@propagate_inbounds function tmap!(f,
        out,
        A::AbstractArray,
        _Arrs::AbstractArray...;
        scheduler::MaybeScheduler = NotGiven(),
        kwargs...)
    _scheduler = _scheduler_from_userinput(scheduler; kwargs...)

    Arrs = (A, _Arrs...)
    if _scheduler isa SerialScheduler
        map!(f, out, Arrs...)
    else
        # `out` and all inputs must share the same indices; the check is wrapped in
        # `@boundscheck` because the element accesses below use `@inbounds`.
        @boundscheck check_all_have_same_indices((out, Arrs...))
        throw_if_boxed_captures(f)
        # If `f` carries task-local values, `maybe_rewrap` delegates them to the
        # wrapping closure created here.
        mapping_f = maybe_rewrap(f) do f
            function mapping_function(i)
                args = map(A -> @inbounds(A[i]), Arrs)
                res = f(args...)
                out[i] = res
            end
        end
        # Pass the already constructed scheduler object (without `kwargs`).
        @noinline tforeach(mapping_f, eachindex(out); scheduler = _scheduler)
        out
    end
end

#-------------------------------------------------------------

# Generators over arrays: forward the generator's function and iterator to `tmap`
# (with or without an explicit output element type `T`).
function tcollect(::Type{T}, gen::Base.Generator{<:AbstractArray}; kwargs...) where {T}
    return tmap(gen.f, T, gen.iter; kwargs...)
end

function tcollect(gen::Base.Generator{<:AbstractArray}; kwargs...)
    return tmap(gen.f, gen.iter; kwargs...)
end

# Any other input: map `identity` over it.
function tcollect(::Type{T}, A; kwargs...) where {T}
    return tmap(identity, T, A; kwargs...)
end

function tcollect(A; kwargs...)
    return tmap(identity, A; kwargs...)
end

end # module Implementation


================================================
FILE: src/macro_impl.jl
================================================
using OhMyThreads.Tools: OnlyOneRegion, try_enter!
using OhMyThreads.Tools: SimpleBarrier
using OhMyThreads: OhMyThreads

# Returns `true` if `arg` is a macro call expression whose macro name is one of
# `lookfor`, either unqualified (`@set`) or qualified with `OhMyThreads.`
# (`OhMyThreads.@set`). Everything else, including non-`Expr` input, gives `false`.
function _is_special_macro_expr(arg;
        lookfor = ("@set", "@local", "@only_one", "@one_by_one", "@barrier"))
    arg isa Expr || return false
    wanted = Symbol.(lookfor)
    arg.head == :macrocall || return false

    macro_name = arg.args[1]
    if macro_name isa Symbol
        # unqualified use, e.g. @set
        return macro_name in wanted
    end
    if macro_name isa Expr && macro_name.head == Symbol(".")
        # qualified use, e.g. OhMyThreads.@set
        return macro_name.args[1] == Symbol("OhMyThreads") &&
               macro_name.args[2] isa QuoteNode &&
               macro_name.args[2].value in wanted
    end
    return false
end

# Implementation of `@tasks`: rewrites a `for` loop expression into a `let` block
# that defines a mapping function from the loop body and calls `tmapreduce`,
# `tmap`, or `tforeach` on the loop's iterator.
#
# Arguments:
# - `forex`: the expression following `@tasks`; must be a `for` loop with a single
#   iteration specification.
# - `__module__`: the module the macro is expanded in; only used to look up `@barrier`.
#
# Throws an `ErrorException` if `forex` is not a `for` loop, if its iteration
# specification is not a single assignment, or if a foreign `@barrier` macro is found.
function tasks_macro(forex; __module__)
    if forex.head != :for
        throw(ErrorException("Expected a for loop after `@tasks`."))
    else
        if forex.args[1].head != :(=)
            # this'll catch cases like
            # @tasks for _ ∈ 1:10, _ ∈ 1:10
            #     body
            # end
            throw(ErrorException("`@tasks` currently only supports a single threaded loop, got $(forex.args[1])"))
        end
        it = forex.args[1]
        itvar = it.args[1]
        itrng = it.args[2]
        forbody = forex.args[2]
    end

    settings = Settings()

    # Escape everything in the loop body that is not used in conjunction with one of our
    # "macros", e.g. @set or @local. Code inside of these macro blocks will be escaped by
    # the respective "macro" handling functions below.
    for i in findall(!_is_special_macro_expr, forbody.args)
        forbody.args[i] = esc(forbody.args[i])
    end

    # Each handler below mutates `forbody.args` in place (removing or replacing the
    # special macro calls) and returns setup code / collected information.
    locals_before, locals_names = _maybe_handle_atlocal_block!(forbody.args)
    tls_names = isnothing(locals_before) ? [] : map(x -> x.args[1], locals_before)
    _maybe_handle_atset_block!(settings, forbody.args)
    setup_onlyone_blocks = _maybe_handle_atonlyone_blocks!(forbody.args)
    setup_onebyone_blocks = _maybe_handle_atonebyone_blocks!(forbody.args)
    # `@barrier` is only processed if a macro of that name is visible in the calling module.
    if isdefined(__module__, Symbol("@barrier"))
        if __module__.var"@barrier" != OhMyThreads.Experimental.var"@barrier"
            error("There seems to be a macro `@barrier` around which isn't `OhMyThreads.Experimental.@barrier`. This isn't supported.")
        end
        setup_barriers = _maybe_handle_atbarriers!(forbody.args, settings)
    else
        setup_barriers = nothing
    end

    itrng = esc(itrng)
    itvar = esc(itvar)

    # Without task-local values the body becomes a plain local function; otherwise it is
    # wrapped in `WithTaskLocals` so the task-local storages are unpacked into the
    # user-chosen names.
    make_mapping_function = if isempty(tls_names)
        :(local function mapping_function($itvar,)
            $(forbody)
        end)

    else
        :(local mapping_function = WithTaskLocals(($(tls_names...),)) do ($(locals_names...),)
            function mapping_function_local($itvar,)
                $(forbody)
            end
        end)
    end
    # Pick the parallel function: reducer given -> `tmapreduce`, collect given -> `tmap`,
    # otherwise `tforeach`.
    q = if isgiven(settings.reducer)
        quote
            $setup_onlyone_blocks
            $setup_onebyone_blocks
            $setup_barriers
            $make_mapping_function
            tmapreduce(mapping_function, $(settings.reducer),
                $(itrng))
        end
    elseif isgiven(settings.collect)
        maybe_warn_useless_init(settings)
        quote
            $setup_onlyone_blocks
            $setup_onebyone_blocks
            $setup_barriers
            $make_mapping_function
            tmap(mapping_function, $(itrng))
        end
    else
        maybe_warn_useless_init(settings)
        quote
            $setup_onlyone_blocks
            $setup_onebyone_blocks
            $setup_barriers
            $make_mapping_function
            tforeach(mapping_function, $(itrng))
        end
    end

    # insert keyword arguments into the function call
    kwexpr = :($(Expr(:parameters)))
    if isgiven(settings.scheduler)
        push!(kwexpr.args, Expr(:kw, :scheduler, settings.scheduler))
    end
    if isgiven(settings.init)
        push!(kwexpr.args, Expr(:kw, :init, settings.init))
    end
    for (k, v) in settings.kwargs
        push!(kwexpr.args, Expr(:kw, k, v))
    end
    # NOTE: positional and fragile. This relies on each `quote` block above holding
    # exactly five statements (each preceded by a line-number node), so that
    # `q.args[10]` is the trailing `tmapreduce`/`tmap`/`tforeach` call.
    insert!(q.args[10].args, 2, kwexpr)

    # wrap everything in a let ... end block
    # and, potentially, define the `TaskLocalValue`s.
    result = :(let
    end)
    push!(result.args[2].args, q)
    if !isnothing(locals_before)
        for x in locals_before
            push!(result.args[1].args, x)
        end
    end

    result
end

# Warns (when called, i.e. during macro processing) if an `init` setting was given;
# callers invoke this only on the code paths that perform no reduction.
function maybe_warn_useless_init(settings)
    init_was_set = isgiven(settings.init)
    return init_was_set &&
           @warn("The @set init = ... settings won't have any effect because no reduction is performed.")
end

# Settings collected from `@set` statements while processing a `@tasks` loop body.
# Every named setting defaults to `NotGiven()`; `kwargs` starts out empty.
Base.@kwdef mutable struct Settings
    # (escaped) scheduler expression; passed as the `scheduler` keyword argument
    scheduler::Union{Expr, QuoteNode, NotGiven} = NotGiven()
    # (escaped) reducer expression; if given, a `tmapreduce` call is generated
    reducer::Union{Expr, Symbol, NotGiven} = NotGiven()
    # literal `true`/`false`; if given (and no reducer), a `tmap` call is generated
    collect::Union{Bool, NotGiven} = NotGiven()
    # (escaped) initial value expression; passed as the `init` keyword argument
    init::Union{Expr, Symbol, NotGiven} = NotGiven()
    # all other `name = value` settings, forwarded verbatim as keyword arguments
    kwargs::Dict{Symbol, Any} = Dict{Symbol, Any}()
end

# Looks for the first `@local` macro call in `args`. If there is one, it is removed
# from `args` and the result of unfolding its argument is returned (storage
# definitions and local names); otherwise `(nothing, nothing)` is returned.
function _maybe_handle_atlocal_block!(args)
    is_atlocal(arg) = _is_special_macro_expr(arg; lookfor = (Symbol("@local"),))
    tlsidx = findfirst(is_atlocal, args)
    # no @local block present
    isnothing(tlsidx) && return nothing, nothing

    locals_before, local_inner = _unfold_atlocal_block(args[tlsidx].args[3])
    deleteat!(args, tlsidx)
    return locals_before, local_inner
end

# Turns the argument of an `@local` macro call into two parallel vectors:
# the task-local storage definitions and the (escaped) local names.
#
# `ex` must be a single assignment or a `begin ... end` block of assignments.
# Anything else (including non-`Expr` input such as a bare symbol, or a
# non-assignment statement inside the block) throws an `ErrorException`
# describing the correct usage.
function _unfold_atlocal_block(ex)
    usage_error() = throw(ErrorException("Wrong usage of @local. You must either provide a typed assignment or multiple typed assignments in a `begin ... end` block."))

    assignments = if Base.isexpr(ex, :(=))
        Any[ex]
    elseif Base.isexpr(ex, :block)
        filter(x -> x isa Expr, ex.args) # skip LineNumberNode
    else
        # also reached for non-`Expr` input, where `ex.head` would fail obscurely
        usage_error()
    end

    locals_before = Expr[]
    locals_names = Expr[]
    for assignment in assignments
        # every statement of the block must itself be an assignment
        Base.isexpr(assignment, :(=)) || usage_error()
        localb, localn = _atlocal_assign_to_exprs(assignment)
        push!(locals_before, localb)
        push!(locals_names, localn)
    end
    return locals_before, locals_names
end

#=
If the TLS doesn't have a declared return type, we're going to use `CC.return_type` to get it
automatically. This would normally be non-kosher, but it's okay here for three reasons:
1) The task local value *only* exists within the function being called, meaning that the worldage
is frozen for the full lifetime of the TLV, so an `eval` can't change the outcome or cause incorrect inference.
2) We do not allow users to *write* to the task local value, they can only retrieve its value, so there's no
potential problems from the type being maximally narrow and then them trying to write a value of another type to it
3) the task local value is not user-observable. we never let the user inspect its type, unless they themselves are
using `code____` tools to inspect the generated code, hence if inference changes and gives a more or less precise
type, there's no observable semantic changes, just performance increases or decreases.
=#
# Converts one `@local` assignment (`name = value` or `name::T = value`) into
#   1. an expression defining a freshly named `TaskLocalValue` storage whose
#      initializer closure evaluates the (escaped) right-hand side, and
#   2. the escaped name chosen by the user.
# With a type annotation that type is used for the storage; without one the type
# is taken from `Core.Compiler.return_type` of the initializer closure.
function _atlocal_assign_to_exprs(ex)
    lhs = ex.args[1]
    init_ex = esc(ex.args[2])
    storage = gensym("tl_storage")
    if Base.isexpr(lhs, :(::))
        name_ex = esc(lhs.args[1])
        type_ex = esc(lhs.args[2])
        storage_def = :($(storage) = TaskLocalValue{$type_ex}(() -> $(init_ex)))
    else
        name_ex = esc(lhs)
        storage_def = :($(storage) = let f = () -> $(init_ex)
            TaskLocalValue{Core.Compiler.return_type(f, Tuple{})}(f)
        end)
    end
    return storage_def, name_ex
end

# Processes every `@set` macro call in `args`: records the settings in `settings`
# and removes the calls from `args`.
#
# Each `@set` must be followed by a single assignment or a `begin ... end` block
# of assignments; otherwise an `ErrorException` describing the usage is thrown.
# Throws an `ArgumentError` if both `collect = true` and a reducer were specified.
function _maybe_handle_atset_block!(settings, args)
    # `findall` returns an empty vector (never `nothing`) when there is no `@set`,
    # in which case the loop and the `deleteat!` below are no-ops.
    idcs = findall(args) do arg
        _is_special_macro_expr(arg; lookfor = (Symbol("@set"),))
    end
    for i in idcs
        macro_args = args[i].args
        # args[1] is the macro name, args[2] the line number node, args[3] the
        # user-provided expression (missing for a bare `@set`).
        ex = length(macro_args) >= 3 ? macro_args[3] : nothing
        if Base.isexpr(ex, :(=))
            _handle_atset_single_assign!(settings, ex)
        elseif Base.isexpr(ex, :block)
            for assignment in ex.args
                assignment isa Expr || continue # skip LineNumberNode
                _handle_atset_single_assign!(settings, assignment)
            end
        else
            # also reached for a missing or non-`Expr` argument (e.g. `@set reducer`)
            throw(ErrorException("Wrong usage of @set. You must either provide an assignment or multiple assignments in a `begin ... end` block."))
        end
    end
    deleteat!(args, idcs)
    # check incompatible settings
    if isgiven(settings.collect) && settings.collect && isgiven(settings.reducer)
        throw(ArgumentError("Specifying both collect and reducer isn't supported."))
    end
end

# Records a single `name = value` setting in `settings`.
#
# - `collect` must be a literal `true`/`false` and is stored as is.
# - The other named settings (fields of `Settings`) store the escaped value
#   expression. This includes literal Booleans such as `init = false`, which
#   must be escaped too because those fields cannot hold a raw `Bool`.
# - Everything else is stored (escaped) in `settings.kwargs`.
#
# Throws an `ErrorException` if `ex` is not an assignment to a plain name and an
# `ArgumentError` if `collect` is not a literal Boolean.
function _handle_atset_single_assign!(settings, ex)
    if !Base.isexpr(ex, :(=))
        throw(ErrorException("Wrong usage of @set. Expected assignment, e.g. `scheduler = StaticScheduler()`."))
    end
    sym = ex.args[1]
    def = ex.args[2]
    if !(sym isa Symbol)
        # e.g. `@set a.b = 1`; `hasfield` and the kwargs dictionary need a Symbol
        throw(ErrorException("Wrong usage of @set. Expected assignment, e.g. `scheduler = StaticScheduler()`."))
    end
    if sym === :collect
        if !(def isa Bool)
            throw(ArgumentError("Setting collect can only be true or false."))
            #TODO support specifying the OutputElementType
        end
        settings.collect = def
    elseif sym !== :kwargs && hasfield(Settings, sym)
        # `kwargs` is the internal storage field, not a user-facing setting
        setfield!(settings, sym, esc(def))
    else
        settings.kwargs[sym] = esc(def)
    end
end

# Replaces every `@only_one` macro call in `args` by a `Tools.try_enter!` call
# on a dedicated `Tools.OnlyOneRegion` wrapping the (escaped) body.
#
# Returns a block expression containing one region definition per `@only_one`
# (an empty block if there is none).
function _maybe_handle_atonlyone_blocks!(args)
    # `findall` returns an empty vector (never `nothing`) if nothing matches.
    idcs = findall(args) do arg
        _is_special_macro_expr(arg; lookfor = (Symbol("@only_one"),))
    end
    setup_onlyone_blocks = quote end
    for i in idcs
        body = args[i].args[3]
        @gensym onlyone
        init_onlyone_ex = :($(onlyone) = Tools.OnlyOneRegion())
        push!(setup_onlyone_blocks.args, init_onlyone_ex)
        args[i] = quote
            Tools.try_enter!($(onlyone)) do
                $(esc(body))
            end
        end
    end
    return setup_onlyone_blocks
end

# Replaces every `@one_by_one` macro call in `args` by a `lock(...) do ... end`
# call on a dedicated `Base.ReentrantLock` wrapping the (escaped) body.
#
# Returns a block expression containing one lock definition per `@one_by_one`
# (an empty block if there is none).
function _maybe_handle_atonebyone_blocks!(args)
    # `findall` returns an empty vector (never `nothing`) if nothing matches.
    idcs = findall(args) do arg
        _is_special_macro_expr(arg; lookfor = (Symbol("@one_by_one"),))
    end
    setup_onebyone_blocks = quote end
    for i in idcs
        body = args[i].args[3]
        @gensym onebyone
        init_lock_ex = :($(onebyone) = Base.ReentrantLock())
        push!(setup_onebyone_blocks.args, init_lock_ex)
        args[i] = quote
            lock($(onebyone)) do
                $(esc(body))
            end
        end
    end
    return setup_onebyone_blocks
end

# Replaces every `@barrier` macro call in `args` by a `wait` on a dedicated
# `SimpleBarrier` constructed with the `ntasks` setting.
#
# Returns a block expression containing one barrier definition per `@barrier`
# (an empty block if there is none). Throws an `ErrorException` if a `@barrier`
# is present but `ntasks` has not been set via `@set`.
function _maybe_handle_atbarriers!(args, settings)
    # `findall` returns an empty vector (never `nothing`) if nothing matches.
    idcs = findall(args) do arg
        _is_special_macro_expr(arg; lookfor = (Symbol("@barrier"),))
    end
    setup_barriers = quote end
    for i in idcs
        !haskey(settings.kwargs, :ntasks) &&
            throw(ErrorException("When using `@barrier`, the number of tasks must be " *
                                 "specified explicitly, e.g. via `@set ntasks=...`. "))
        ntasks = settings.kwargs[:ntasks]
        @gensym barrier
        push!(setup_barriers.args, :($(barrier) = $(SimpleBarrier)($ntasks)))
        args[i] = :($(esc(:wait))($(barrier)))
    end
    return setup_barriers
end


================================================
FILE: src/macros.jl
================================================
"""
    @tasks for ... end

A macro to parallelize a `for` loop by spawning a set of tasks that can be run in parallel.
The policy of how many tasks to spawn and how to distribute the iteration space among the
tasks (and more) can be configured via `@set` statements in the loop body.

Supports reductions (`@set reducer=<reducer function>`) and collecting the results
(`@set collect=true`).

Under the hood, the `for` loop is translated into corresponding parallel
[`tforeach`](@ref), [`tmapreduce`](@ref), or [`tmap`](@ref) calls.

See also: [`@set`](@ref), [`@local`](@ref)

## Examples

```julia
using OhMyThreads: @tasks
```

```julia
@tasks for i in 1:3
    println(i)
end
```

```julia
@tasks for x in rand(10)
    @set reducer=+
    sin(x)
end
```

```julia
@tasks for i in 1:5
    @set collect=true
    i^2
end
```

```julia
@tasks for i in 1:100
    @set ntasks=4*nthreads()
    # non-uniform work...
end
```

```julia
@tasks for i in 1:5
    @set scheduler=:static
    println("i=", i, " → ", threadid())
end
```

```julia
@tasks for i in 1:100
    @set begin
        scheduler=:static
        chunksize=10
    end
    println("i=", i, " → ", threadid())
end
```
"""
macro tasks(args...)
    # all of the work happens in `Implementation.tasks_macro`
    return Implementation.tasks_macro(args...; __module__ = __module__)
end

"""
    @set name = value

This can be used inside a `@tasks for ... end` block to specify settings for the parallel
execution of the loop.

Multiple settings are supported, either as separate `@set` statements or via
`@set begin ... end`.

## Settings

* `reducer` (e.g. `reducer=+`): Indicates that a reduction should be performed with the provided binary function. See [`tmapreduce`](@ref) for more information.
* `collect` (e.g. `collect=true`): Indicates that results should be collected (similar to `map`).

All other settings will be passed on to the underlying parallel functions (e.g. [`tmapreduce`](@ref))
as keyword arguments. Hence, you may provide whatever these functions accept as
keyword arguments. Among others, this includes

* `scheduler` (e.g. `scheduler=:static`): Can be either a [`Scheduler`](@ref) or a `Symbol` (e.g. `:dynamic`, `:static`, `:serial`, or `:greedy`).
* `init` (e.g. `init=0.0`): Initial value to be used in a reduction (requires `reducer=...`).

Settings like `ntasks`, `chunksize`, and `split` etc. can be used to tune the scheduling policy (if the selected scheduler supports it).

Note that the assignment is hoisted above the loop body which means that the scope is *not*
the scope of the loop (even though it looks like it) but rather the scope *surrounding* the
loop body. (`@macroexpand` is a useful tool to inspect the generated code of the `@tasks`
block.)
"""
macro set(args...)
    # Only meaningful as a marker inside `@tasks`; expanding it on its own is an error.
    throw(ErrorException("The @set macro may only be used inside of a @tasks block."))
end

@eval begin
    """
        @local name = value

        @local name::T = value

    Can be used inside a `@tasks for ... end` block to specify
    [task-local values](@ref TLS) (TLV) via explicitly typed assignments.
    These values will be allocated once per task
    (rather than once per iteration) and can be re-used between different task-local iterations.

    There can only be a single `@local` block in a `@tasks for ... end` block. To specify
    multiple TLVs, use `@local begin ... end`. Compared to regular assignments, there are some
    limitations though, e.g. TLVs can't reference each other.

    ## Examples

    ```julia
    using OhMyThreads: @tasks
    using OhMyThreads.Tools: taskid

    @tasks for i in 1:10
        @set begin
            scheduler=:dynamic
            ntasks=2
        end
        @local x = zeros(3) # TLV

        x .+= 1
        println(taskid(), " -> ", x)
    end
    ```

    ```julia
    @tasks for i in 1:10
        @local begin
            x = rand(Int, 3)
            M = rand(3, 3)
        end
        # ...
    end
    ```

    Task local variables created by `@local` are by default constrained to their inferred type,
    but if you need to, you can specify a different type during declaration:
    ```julia
    @tasks for i in 1:10
        @local x::Vector{Float64} = some_hard_to_infer_setup_function()
        # ...
    end
    ```

    The right hand side of the assignment is hoisted outside of the loop body and captured
    as a closure used to initialize the task local value. This means that the scope of the
    closure is *not* the scope of the loop (even though it looks like it) but rather the
    scope *surrounding* the loop body. (`@macroexpand` is a useful tool to inspect the
    generated code of the `@tasks` block.)
    """
    macro $(Symbol("local"))(args...)
        # Only meaningful as a marker inside `@tasks`; expanding it on its own is an error.
        throw(ErrorException("The @local macro may only be used inside of a @tasks block."))
    end
end

"""
    @only_one begin ... end

This can be used inside a `@tasks for ... end` block to mark a region of code to be
executed by only one of the parallel tasks (all other tasks skip over this region).

## Example

```julia
using OhMyThreads: @tasks

@tasks for i in 1:10
    @set ntasks = 10

    println(i, ": before")
    @only_one begin
        println(i, ": only printed by a single task")
        sleep(1)
    end
    println(i, ": after")
end
```
"""
macro only_one(args...)
    # Only meaningful as a marker inside `@tasks`; expanding it on its own is an error.
    throw(ErrorException("The @only_one macro may only be used inside of a @tasks block."))
end

"""
    @one_by_one begin ... end

This can be used inside a `@tasks for ... end` block to mark a region of code to be
executed by one parallel task at a time (i.e. exclusive access). The order may be arbitrary
and non-deterministic.

## Example

```julia
using OhMyThreads: @tasks

@tasks for i in 1:10
    @set ntasks = 10

    println(i, ": before")
    @one_by_one begin
        println(i, ": one task at a time")
        sleep(0.5)
    end
    println(i, ": after")
end
```
"""
macro one_by_one(args...)
    # Only meaningful as a marker inside `@tasks`; expanding it on its own is an error.
    throw(ErrorException("The @one_by_one macro may only be used inside of a @tasks block."))
end


# Scoped flag, `false` by default. `@allow_boxed_captures` / `@disallow_boxed_captures`
# set it to `true` / `false` for the dynamic extent of the wrapped expression via `@with`.
const allowing_boxed_captures = ScopedValue(false)

"""
    @allow_boxed_captures expr

By default, OhMyThreads.jl will detect and error on multithreaded code which references local variables
which are 'boxed' -- something that happens if the variable could be re-bound in multiple scopes. This
process can cause very subtle bugs in multithreaded code by creating silent race conditions, e.g.

```julia
let
    function wrong()
        tmap(1:10) do i
            A = i # define A for the first time (lexically)
            sleep(rand()/10)
            A # user is trying to reference local A only
        end
    end
    @show wrong()
    A = 1 # boxed! this hoists "A" to the same variable as in `wrong` but presumably the user wanted a new one
end
```
In this example, you might expect to get `[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]`, but you would actually observe
incorrect results because `A` is 'boxed'. The fix for this would be to write something like
```julia
let
    function right()
        tmap(1:10) do i
            local A = i
            sleep(rand()/10)
            A 
        end
    end
    @show right()
    A = 1
end
```

However, if you are really sure you want to bypass OhMyThreads's error mechanism, you can use
`@allow_boxed_captures` to wrap code you believe is okay, e.g.

```julia-repl
julia> let A = 1 
           @allow_boxed_captures tmap(1:10) do i
               A = i
               sleep(rand()/10)
               A # race condition!
           end
       end
10-element Vector{Int64}:
 4
 2
 7
 2
 2
 8
 6
 8
 7
 2
```

This is a dynamically scoped construct, so this effect will apply to *all* nested code inside of `expr`.

See also `@disallow_boxed_captures`
"""
macro allow_boxed_captures(ex)
    # Run the user's expression with the scoped flag set to `true`.
    user_code = esc(ex)
    return quote
        @with allowing_boxed_captures => true $user_code
    end
end

"""
    @disallow_boxed_captures expr

Disable the effect of `@allow_boxed_captures` for any code in `expr`.

This is a dynamically scoped construct, so this effect will apply to *all* nested code inside of `expr`.

See also `@allow_boxed_captures`
"""
macro disallow_boxed_captures(ex)
    # Run the user's expression with the scoped flag set to `false`.
    user_code = esc(ex)
    return quote
        @with allowing_boxed_captures => false $user_code
    end
end

"""
    @localize args... expr

Writing
```
@localize x y z expr
```
is equivalent to writing
```
let x=x, y=y, z=z
    expr
end
```
This is useful for avoiding the boxing of captured variables when working with closures.

See https://juliafolds2.github.io/OhMyThreads.jl/stable/literate/boxing/boxing/ for more information about boxed variables.
"""
macro localize(args...)
    # Everything but the last argument names a variable to rebind; the last
    # argument is the expression to wrap.
    syms = args[1:end-1]
    ex = args[end]
    bindings = Expr[]
    for sym in syms
        sym isa Symbol ||
            throw(ArgumentError("All but the final argument to `@localize` must be symbols! Got $sym"))
        push!(bindings, :($sym = $sym))
    end
    wrapped = :(let $(bindings...)
        $ex
    end)
    return esc(wrapped)
end


================================================
FILE: src/schedulers.jl
================================================
module Schedulers

using Base.Threads: nthreads
using ChunkSplitters: Split, Consecutive, RoundRobin, ChunkSplitters

# Used to indicate that a keyword argument has not been set by the user.
# We don't use `Nothing` because `nothing` may sometimes be a valid user input (e.g. for `init`).
struct NotGiven end

# `isgiven(x)` is `false` for the `NotGiven` sentinel and `true` for any other value.
function isgiven(::NotGiven)
    return false
end
function isgiven(::Any)
    return true
end

const MaybeInteger = Union{Integer, NotGiven}

# Placeholder split type used when no chunking takes place.
struct NoSplit <: Split end

# Normalizes the user-provided `split`: `Split` objects pass through unchanged,
# symbols are translated (`:consecutive`/`:batch` and `:roundrobin`/`:scatter`).
function _parse_split(split::Split)
    return split
end
function _parse_split(split::Symbol)
    if split in (:consecutive, :batch)
        return Consecutive()
    elseif split in (:roundrobin, :scatter)
        return RoundRobin()
    end
    throw(ArgumentError("You've provided an unsupported value for `split`"))
end

# Lowercased type name of a split (type or instance) as a `Symbol`.
function _splitid(x::Type{<:Split})
    return Symbol(lowercase(string(nameof(x))))
end
function _splitid(x::Split)
    return _splitid(typeof(x))
end

"""
Supertype for all available schedulers:

* [`DynamicScheduler`](@ref): default dynamic scheduler
* [`StaticScheduler`](@ref): low-overhead static scheduler
* [`GreedyScheduler`](@ref): greedy load-balancing scheduler
* [`SerialScheduler`](@ref): serial (non-parallel) execution
"""
abstract type Scheduler end
#! A subtype of `Scheduler` (let's call it `S`) **must** implement:
#   - `from_symbol(::Val{:symbol})`, returning exactly `S` for the given symbol
#     (e.g. `from_symbol(::Val{:dynamic}) = DynamicScheduler`).

# To enable chunking, `S` **must** implement:
#   - `chunking_args(::S)::ChunkingArgs`, returning the chunking arguments of the scheduler.
#     These are usually stored in a field of the scheduler; use the `ChunkingArgs`
#     constructor to create them (see below).

# `S` can optionally implement:
#   - `default_nchunks(::Type{S})`, returning the default number of chunks for the scheduler
#     if chunking is enabled. The default is `Threads.nthreads(:default)`.

# Fallback for symbols that no scheduler has registered.
function from_symbol(::Val)
    throw(ArgumentError("unkown scheduler symbol"))
end

# Constructs the scheduler registered for symbol `s`, forwarding all keyword
# arguments to its constructor.
function scheduler_from_symbol(s::Symbol; kwargs...)
    return scheduler_from_symbol(Val(s); kwargs...)
end
function scheduler_from_symbol(v::Val; kwargs...)
    scheduler_type = from_symbol(v)
    return scheduler_type(; kwargs...)
end

"""
    ChunkingMode

A trait type to indicate the chunking mode of a scheduler. The following subtypes are available:

* `NoChunking`: no chunking is used
* `FixedCount`: the number of chunks is fixed
* `FixedSize`: the size of each chunk is fixed
"""
abstract type ChunkingMode end

struct NoChunking <: ChunkingMode end
struct FixedCount <: ChunkingMode end
struct FixedSize <: ChunkingMode end

# Maps our chunking-mode types to the corresponding ChunkSplitters.jl internals.
function chunksplitter_mode(::Type{FixedCount})
    return ChunkSplitters.Internals.FixedCount
end
function chunksplitter_mode(::Type{FixedSize})
    return ChunkSplitters.Internals.FixedSize
end

"""
    ChunkingArgs{C, S <: Split}(n::Union{Int, Nothing}, size::Union{Int, Nothing}, minsize::Union{Int, Nothing}, split::S)
    ChunkingArgs(Sched::Type{<:Scheduler}; n = nothing, size = nothing, minsize = nothing, split::Union{Symbol, Split}; chunking)

Stores all the information needed for chunking. The type parameter `C` is the chunking mode
(`NoChunking`, `FixedSize`, or `FixedCount`). The `chunking` keyword argument is a boolean
and if `false`, everything is skipped and `C = NoChunking`.

Once the object is created, use the `has_fieldname(object)` function (e.g. `has_size(object)`)
to know if the field is effectively used.
"""
struct ChunkingArgs{C, S <: Split}
    n::Union{Int, Nothing}
    size::Union{Int, Nothing}
    minsize::Union{Int, Nothing}
    split::S
end

# Chunking arguments representing "no chunking at all".
function ChunkingArgs(::Type{NoChunking})
    return ChunkingArgs{NoChunking, NoSplit}(nothing, nothing, nothing, NoSplit())
end

function ChunkingArgs(
        Sched::Type{<:Scheduler};
        n = nothing,
        size = nothing,
        minsize = nothing,
        split::Union{Symbol, Split},
        chunking
)
    if chunking
        count_given = !isnothing(n)
        size_given = !isnothing(size)
        if count_given && size_given
            throw(ArgumentError("nchunks and chunksize are mutually exclusive"))
        elseif !count_given && !size_given
            # neither given: fall back to the scheduler's default number of chunks
            n = default_nchunks(Sched)
        end
        # a chunk count (explicit or default) selects FixedCount, otherwise FixedSize
        mode = isnothing(n) ? FixedSize : FixedCount
        parsed_split = _parse_split(split)
        return ChunkingArgs{mode, typeof(parsed_split)}(n, size, minsize, parsed_split)
    else
        return ChunkingArgs(NoChunking)
    end
end

# Accessors / predicates on `ChunkingArgs`: the chunking mode is the first type
# parameter; a field counts as "used" if it is not `nothing` (for `split`: if the
# split type is not `NoSplit`).
function chunking_mode(::ChunkingArgs{C}) where {C}
    return C
end
has_n(ca::ChunkingArgs) = !(ca.n === nothing)
has_size(ca::ChunkingArgs) = !(ca.size === nothing)
function has_split(::ChunkingArgs{C, S}) where {C, S}
    return !(S === NoSplit)
end
has_minsize(ca::ChunkingArgs) = !(ca.minsize === nothing)
function chunking_enabled(ca::ChunkingArgs)
    return chunking_mode(ca) != NoChunking
end

# Converts `ca` into a named tuple `(; n, size, minsize, split)`. A given
# `minsize` is capped at `length(arg)`.
function chunkingargs_to_kwargs(ca::ChunkingArgs, arg)
    minsize = has_minsize(ca) ? min(ca.minsize, length(arg)) : nothing
    return (; n = ca.n, size = ca.size, minsize = minsize, split = ca.split)
end

# Human-readable description of the chunking configuration (used by `show`).
function _chunkingstr(ca::ChunkingArgs{NoChunking})
    return "none"
end
function _chunkingstr(ca::ChunkingArgs{FixedCount})
    description = "fixed count ($(ca.n)), split :$(_splitid(ca.split))"
    has_minsize(ca) || return description
    return description * ", minimum chunk size  $(ca.minsize)"
end
function _chunkingstr(ca::ChunkingArgs{FixedSize})
    return "fixed size ($(ca.size)), split :$(_splitid(ca.split))"
end

# Link between a scheduler and its chunking arguments
# The first and only the first method must be overloaded for each scheduler
# that supports chunking.
function chunking_args(::Scheduler)
    return ChunkingArgs(NoChunking)
end

# Field accessors, forwarded to the scheduler's chunking arguments.
function nchunks(sched::Scheduler)
    return chunking_args(sched).n
end
function chunksize(sched::Scheduler)
    return chunking_args(sched).size
end
function chunksplit(sched::Scheduler)
    return chunking_args(sched).split
end
function minchunksize(sched::Scheduler)
    return chunking_args(sched).minsize
end

# Predicates, forwarded to the scheduler's chunking arguments.
function has_nchunks(sched::Scheduler)
    return has_n(chunking_args(sched))
end
function has_chunksize(sched::Scheduler)
    return has_size(chunking_args(sched))
end
function has_chunksplit(sched::Scheduler)
    return has_split(chunking_args(sched))
end
function has_minchunksize(sched::Scheduler)
    return has_minsize(chunking_args(sched))
end

function chunkingargs_to_kwargs(sched::Scheduler, arg)
    ca = chunking_args(sched)
    return chunkingargs_to_kwargs(ca, arg)
end

function chunking_mode(sched::Scheduler)
    return chunking_mode(chunking_args(sched))
end
function chunking_enabled(sched::Scheduler)
    return chunking_enabled(chunking_args(sched))
end
function _chunkingstr(sched::Scheduler)
    return _chunkingstr(chunking_args(sched))
end

"""
    default_nchunks(::Type{<:Scheduler})

Hardcoded default number of chunks, if not provided by the user. Can depend on the
kind of scheduler.
"""
function default_nchunks end
# generic fallback: number of threads in the `:default` threadpool
function default_nchunks(::Type{<:Scheduler})
    return nthreads(:default)
end

"""
    DynamicScheduler (aka :dynamic)

The default dynamic scheduler. Divides the given collection into chunks and
then spawns a task per chunk to perform the requested operation in parallel.
The tasks are assigned to threads by Julia's dynamic scheduler and are non-sticky, that is,
they can migrate between threads.

Generally preferred since it is flexible, can provide load balancing, and is composable
with other multithreaded code.

## Keyword arguments:

- `nchunks::Integer` or `ntasks::Integer` (default `nthreads(threadpool)`):
    * Determines the number of chunks (and thus also the number of parallel tasks).
    * Increasing `nchunks` can help with [load balancing](https://en.wikipedia.org/wiki/Load_balancing_(computing)), but at the expense of creating more overhead. For `nchunks <= nthreads()` there are not enough chunks for any load balancing.
    * Setting `nchunks < nthreads()` is an effective way to use only a subset of the available threads.
- `chunksize::Integer` (default not set)
    * Specifies the desired chunk size (instead of the number of chunks).
    * The options `chunksize` and `nchunks`/`ntasks` are **mutually exclusive** (only one may be a positive integer).
- `minchunksize::Union{Integer, Nothing}` (default `nothing`)
    * Sets a lower bound on the size of chunks. This argument takes priority over `nchunks`, so `treduce(+, 1:10; nchunks=10, minchunksize=5)` will only operate on `2` chunks for example.
- `split::Union{Symbol, OhMyThreads.Split}` (default `OhMyThreads.Consecutive()`):
    * Determines how the collection is divided into chunks (if chunking=true). By default, each chunk consists of contiguous elements and order is maintained.
    * See [ChunkSplitters.jl](https://github.com/JuliaFolds2/ChunkSplitters.jl) for more details and available options. We also allow users to pass `:consecutive` in place of `Consecutive()`, and `:roundrobin` in place of `RoundRobin()`
    * Beware that for `split=OhMyThreads.RoundRobin()` the order of elements isn't maintained and a reducer function must not only be associative but also **commutative**!
- `chunking::Bool` (default `true`):
    * Controls whether input elements are grouped into chunks (`true`) or not (`false`).
    * For `chunking=false`, the arguments `nchunks`/`ntasks`, `chunksize`, and `split` are ignored and input elements are regarded as "chunks" as is. Hence, there will be one parallel task spawned per input element. Note that, depending on the input, this **might spawn many(!) tasks** and can be costly!
- `threadpool::Symbol` (default `:default`):
    * Possible options are `:default` and `:interactive`.
    * The high-priority pool `:interactive` should be used very carefully since tasks on this threadpool should not be allowed to run for a long time without `yield`ing as it can interfere with [heartbeat](https://en.wikipedia.org/wiki/Heartbeat_(computing)) processes.
"""
struct DynamicScheduler{C <: ChunkingMode, S <: Split, threadpool} <: Scheduler
    chunking_args::ChunkingArgs{C, S}

    # The threadpool symbol is validated and stored as the third type parameter.
    function DynamicScheduler(threadpool::Symbol, ca::ChunkingArgs)
        threadpool in (:default, :interactive) ||
            throw(ArgumentError("threadpool must be either :default or :interactive"))
        return new{chunking_mode(ca), typeof(ca.split), threadpool}(ca)
    end
end

function DynamicScheduler(;
        threadpool::Symbol = :default,
        nchunks = nothing,
        ntasks = nothing, # "alias" for nchunks
        chunksize = nothing,
        split::Union{Split, Symbol} = Consecutive(),
        minchunksize = nothing,
        chunking::Bool = true
)
    # `ntasks` and `nchunks` mean the same thing; at most one of them may be set.
    if !isnothing(ntasks) && !isnothing(nchunks)
        throw(ArgumentError("For the dynamic scheduler, nchunks and ntasks are aliases and only one may be provided"))
    end
    requested_chunks = isnothing(ntasks) ? nchunks : ntasks
    chunk_args = ChunkingArgs(DynamicScheduler;
        n = requested_chunks, size = chunksize, minsize = minchunksize,
        split = split, chunking = chunking)
    return DynamicScheduler(threadpool, chunk_args)
end

function from_symbol(::Val{:dynamic})
    return DynamicScheduler
end
function chunking_args(sched::DynamicScheduler)
    return sched.chunking_args
end
function threadpool(::DynamicScheduler{C, S, T}) where {C, S, T}
    return T
end

# Multi-line plain-text display: scheduler name, chunking summary, threadpool.
function Base.show(io::IO, mime::MIME{Symbol("text/plain")}, s::DynamicScheduler)
    println(io, "DynamicScheduler")
    chunking_summary = _chunkingstr(s.chunking_args)
    println(io, "├ Chunking: ", chunking_summary)
    print(io, "└ Threadpool: ", threadpool(s))
end

"""
    StaticScheduler (aka :static)

A static low-overhead scheduler. Divides the given collection into chunks and
then spawns a task per chunk to perform the requested operation in parallel.
The tasks are statically assigned to threads up front and are made *sticky*, that is,
they are guaranteed to stay on the assigned threads (**no task migration**).

Can sometimes be more performant than `DynamicScheduler` when the workload is (close to)
uniform and, because of the lower overhead, for small workloads.
Isn't well composable with other multithreaded code though.

## Keyword arguments:

- `nchunks::Integer` or `ntasks::Integer` (default `nthreads()`):
    * Determines the number of chunks (and thus also the number of parallel tasks).
    * Setting `nchunks < nthreads()` is an effective way to use only a subset of the available threads.
    * For `nchunks > nthreads()` the chunks will be distributed to the available threads in a round-robin fashion.
- `chunksize::Integer` (default not set)
    * Specifies the desired chunk size (instead of the number of chunks).
    * The options `chunksize` and `nchunks`/`ntasks` are **mutually exclusive** (only one may be non-zero).
- `minchunksize::Union{Integer, Nothing}` (default `nothing`)
    * Sets a lower bound on the size of chunks. This argument takes priority over `nchunks`, so `treduce(+, 1:10; nchunks=10, minchunksize=5)` will only operate on `2` chunks for example.
- `chunking::Bool` (default `true`):
    * Controls whether input elements are grouped into chunks (`true`) or not (`false`).
    * For `chunking=false`, the arguments `nchunks`/`ntasks`, `chunksize`, and `split` are ignored and input elements are regarded as "chunks" as is. Hence, there will be one parallel task spawned per input element. Note that, depending on the input, this **might spawn many(!) tasks** and can be costly!
- `split::Union{Symbol, OhMyThreads.Split}` (default `OhMyThreads.Consecutive()`):
    * Determines how the collection is divided into chunks. By default, each chunk consists of contiguous elements and order is maintained.
    * See [ChunkSplitters.jl](https://github.com/JuliaFolds2/ChunkSplitters.jl) for more details and available options. We also allow users to pass `:consecutive` in place of `Consecutive()`, and `:roundrobin` in place of `RoundRobin()`
    * Beware that for `split=OhMyThreads.RoundRobin()` the order of elements isn't maintained and a reducer function must not only be associative but also **commutative**!
"""
struct StaticScheduler{C <: ChunkingMode, S <: Split} <: Scheduler
    # Chunking configuration; its type parameters `C` and `S` are mirrored in the
    # type parameters of the scheduler itself.
    chunking_args::ChunkingArgs{C, S}
end

# Keyword constructor. `ntasks` and `nchunks` are two names for the same setting, so
# supplying both is rejected; whichever one is given is forwarded to `ChunkingArgs`
# as `n`, together with the remaining chunking options.
function StaticScheduler(;
        nchunks = nothing,
        ntasks = nothing, # "alias" for nchunks
        chunksize = nothing,
        minchunksize = nothing,
        split::Union{Split, Symbol} = Consecutive(),
        chunking::Bool = true
)
    if !isnothing(ntasks) && !isnothing(nchunks)
        throw(ArgumentError("For the static scheduler, nchunks and ntasks are aliases and only one may be provided"))
    end
    # Prefer `ntasks` when it was given, otherwise fall back to `nchunks`.
    n_requested = isnothing(ntasks) ? nchunks : ntasks
    cargs = ChunkingArgs(StaticScheduler;
        n = n_requested,
        size = chunksize,
        minsize = minchunksize,
        split = split,
        chunking = chunking)
    return StaticScheduler(cargs)
end
# Resolves the symbol `:static` (wrapped in a `Val`) to the `StaticScheduler` type.
from_symbol(::Val{:static}) = StaticScheduler
# Accessor for the chunking configuration stored in a `StaticScheduler`.
chunking_args(sched::StaticScheduler) = sched.chunking_args

# Multi-line `text/plain` display of a `StaticScheduler`: the scheduler name, a summary
# of its chunking configuration, and the (fixed) threadpool.
function Base.show(io::IO, mime::MIME{Symbol("text/plain")}, s::StaticScheduler)
    println(io, "StaticScheduler")
    print(io, "├ Chunking: ", _chunkingstr(s.chunking_args), '\n')
    print(io, "└ Threadpool: default")
end

"""
    GreedyScheduler (aka :greedy)

A greedy dynamic scheduler. The elements are put into a shared workqueue and dynamic,
non-sticky, tasks are spawned to process the elements of the queue with each task taking a new
element from the queue as soon as the previous one is done.

Note that elements are processed in a non-deterministic order, and thus a potential reducing
function **must** be [commutative](https://en.wikipedia.org/wiki/Commutative_property) in
addition to being associative, or you could get incorrect results!

Can be a good choice for load-balancing slower, uneven computations, but does carry
some additional overhead.

## Keyword arguments:

- `ntasks::Int` (default `nthreads()`):
    * Determines the number of parallel tasks to be spawned.
    * Setting `ntasks < nthreads()` is an effective way to use only a subset of the available threads.
- `chunking::Bool` (default `false`):
    * Controls whether input elements are grouped into chunks (`true`) or not (`false`) before being put into the shared workqueue. This can improve performance, especially if there are many iterations, each of which is computationally cheap.
    * If `nchunks` or `chunksize` are explicitly specified, `chunking` will be automatically set to `true`.
- `nchunks::Integer` (default `10 * nthreads()`):
    * Determines the number of chunks (that will eventually be put into the shared workqueue).
    * Increasing `nchunks` can help with [load balancing](https://en.wikipedia.org/wiki/Load_balancing_(computing)). For `nchunks <= nthreads()` there are not enough chunks for any load balancing.
- `chunksize::Integer` (default not set)
    * Specifies the desired chunk size (instead of the number of chunks).
    * The options `chunksize` and `nchunks` are **mutually exclusive** (only one may be a positive integer).
- `minchunksize::Union{Integer, Nothing}` (default `nothing`)
    * Sets a lower bound on the size of chunks. This argument takes priority over `nchunks`, so `treduce(+, 1:10; nchunks=10, minchunksize=5)` will only operate on `2` chunks for example.
- `split::Union{Symbol, OhMyThreads.Split}` (default `OhMyThreads.RoundRobin()`):
    * Determines how the collection is divided into chunks (if chunking=true).
    * See [ChunkSplitters.jl](https://github.com/JuliaFolds2/ChunkSplitters.jl) for more details and available options. We also allow users to pass `:consecutive` in place of `Consecutive()`, and `:roundrobin` in place of `RoundRobin()`
"""
struct GreedyScheduler{C <: ChunkingMode, S <: Split} <: Scheduler
    # Number of parallel tasks; guaranteed positive by the inner constructor.
    ntasks::Int
    # Chunking configuration for the input collection.
    chunking_args::ChunkingArgs{C, S}

    # Inner constructor: validates `ntasks` and derives both type parameters from the
    # given chunking arguments.
    function GreedyScheduler(ntasks::Integer, ca::ChunkingArgs)
        if ntasks <= 0
            throw(ArgumentError("ntasks must be a positive integer"))
        end
        mode = chunking_mode(ca)
        split_type = typeof(ca.split)
        return new{mode, split_type}(ntasks, ca)
    end
end

# Keyword constructor. Chunking is off by default, but it is switched on automatically
# as soon as `nchunks` or `chunksize` is given explicitly.
function GreedyScheduler(;
        ntasks::Integer = nthreads(),
        nchunks = nothing,
        chunksize = nothing,
        minchunksize = nothing,
        split::Union{Split, Symbol} = RoundRobin(),
        chunking::Bool = false
)
    chunk_option_given = !isnothing(nchunks) || !isnothing(chunksize)
    cargs = ChunkingArgs(GreedyScheduler;
        n = nchunks,
        size = chunksize,
        minsize = minchunksize,
        split = split,
        chunking = chunking || chunk_option_given)
    return GreedyScheduler(ntasks, cargs)
end
# Resolves the symbol `:greedy` (wrapped in a `Val`) to the `GreedyScheduler` type.
from_symbol(::Val{:greedy}) = GreedyScheduler
# Accessor for the chunking configuration stored in a `GreedyScheduler`.
chunking_args(sched::GreedyScheduler) = sched.chunking_args
# Default number of chunks for the greedy scheduler: ten per thread of the `:default`
# threadpool.
default_nchunks(::Type{GreedyScheduler}) = 10 * nthreads(:default)

# Multi-line `text/plain` display of a `GreedyScheduler`: the scheduler name, the number
# of tasks, a summary of its chunking configuration, and the (fixed) threadpool.
function Base.show(io::IO, mime::MIME{Symbol("text/plain")}, s::GreedyScheduler)
    print(io, "GreedyScheduler", "\n")
    println(io, "├ Num. tasks: ", s.ntasks)
    # Pass the chunking arguments (not the scheduler itself), exactly as the
    # `StaticScheduler` display method does.
    cstr = _chunkingstr(s.chunking_args)
    println(io, "├ Chunking: ", cstr)
    print(io, "└ Threadpool: default")
end

"""
    SerialScheduler (aka :serial)

A scheduler for turning off any multithreading and running the code in serial. It aims to
make parallel functions like, e.g., `tmapreduce(sin, +, 1:100)` behave like their serial
counterparts, e.g., `mapreduce(sin, +, 1:100)`.

Note that `SerialScheduler` has no arguments and will ignore any that are passed
to it. This is to make it easier to switch to the serial scheduler without
having to change the rest of the code.
"""
struct SerialScheduler <: Scheduler
    # Accepts and discards arbitrary keyword arguments so that settings meant for other
    # schedulers can be left in place when switching to serial execution.
    function SerialScheduler(; _...)
        return new()
    end
end
# Resolves the symbol `:serial` (wrapped in a `Val`) to the `SerialScheduler` type.
from_symbol(::Val{:serial}) = SerialScheduler

end # module


================================================
FILE: src/tools.jl
================================================
module Tools

using Base.Threads: nthreads

"""
    nthtid(n)

Returns the thread id of the `n`th Julia thread in the `:default` threadpool.

Throws an `ArgumentError` if `n` is outside `1:nthreads(:default)` (`1:nthreads()` on
Julia versions before 1.9). The range check is a `@boundscheck` block and is therefore
elided when the call is inlined into an `@inbounds` context.
"""
@inline function nthtid(n)
    @static if VERSION < v"1.9"
        # No threadpools before Julia 1.9: thread ids coincide with `1:nthreads()`.
        @boundscheck (1 <= n <= nthreads()) ||
                     throw(ArgumentError("thread index $n is out of range 1:$(nthreads())"))
        return n
    else
        @boundscheck (1 <= n <= nthreads(:default)) ||
                     throw(ArgumentError("thread index $n is out of range 1:$(nthreads(:default))"))
        return n + Threads.threadpoolsize(:interactive) # default threads after interactive threads
    end
end

"""
    taskid() :: UInt

Return a `UInt` that identifies the currently running [Task](https://docs.julialang.org/en/v1/base/parallel/#Core.Task). The identifier stays unique for as long as references to the task it was obtained from exist.
"""
function taskid()
    return objectid(current_task())
end

"""
    OnlyOneRegion()

Marks a region in parallel code that is to be executed by a single task only
(all other tasks shall skip over it).

See [`try_enter!`](@ref) and [`reset!`](@ref).
"""
mutable struct OnlyOneRegion
    # The task that has claimed the region, or `nothing` while it is unclaimed.
    @atomic task::Union{Task, Nothing}

    function OnlyOneRegion()
        return new(nothing)
    end
end

"""
    try_enter!(f, s::OnlyOneRegion)

Run `f` in at most one of the parallel tasks that call this function on a shared
`s::OnlyOneRegion`. The task that claims the region runs `f` (also on repeated calls);
every other task returns immediately without running it. Always returns `nothing`.

## Example

```julia
using OhMyThreads: @tasks
using OhMyThreads.Tools: OnlyOneRegion, try_enter!

only_one = OnlyOneRegion()

@tasks for i in 1:10
    @set ntasks = 10

    println(i, ": before")
    try_enter!(only_one) do
        println(i, ": only printed by a single task")
        sleep(1)
    end
    println(i, ": after")
end
```
"""
function try_enter!(f, s::OnlyOneRegion)
    me = current_task()
    owner = @atomic :monotonic s.task
    if isnothing(owner)
        # Unclaimed so far: compete for ownership; only the task whose
        # compare-and-swap succeeds gets to run `f`.
        claimed = (@atomicreplace s.task nothing=>me).success
        claimed && f()
    elseif owner == me
        # This task already owns the region.
        f()
    end
    return
end

"""
    reset!(s::OnlyOneRegion)

Clear the owner of the `OnlyOneRegion` so that it can be claimed (and thus used) again.
"""
function reset!(s::OnlyOneRegion)
    @atomic s.task = nothing
    return nothing
end

"""
    SimpleBarrier(n::Integer)

Simple reusable barrier for `n` parallel tasks.

Given `b = SimpleBarrier(n)` and `n` parallel tasks, every task calling `wait(b)`
blocks until the remaining `n-1` tasks have called `wait(b)` too.

## Example
```julia
n = nthreads()
barrier = SimpleBarrier(n)
@sync for i in 1:n
    @spawn begin
        println("A")
        wait(barrier) # synchronize all tasks
        println("B")
        wait(barrier) # synchronize all tasks (reusable)
        println("C")
    end
end
```
"""
mutable struct SimpleBarrier
    # Number of tasks that have to arrive before the barrier opens.
    const n::Int64
    # Condition (with its lock) on which arriving tasks wait.
    const c::Threads.Condition
    # Number of tasks that have arrived in the current round.
    cnt::Int64

    SimpleBarrier(n::Integer) = new(n, Threads.Condition(), 0)
end

# Block the calling task until `b.n` tasks have called `wait(b)`. The last task to
# arrive resets the arrival counter (which makes the barrier reusable) and wakes up all
# waiting tasks instead of waiting itself.
function Base.wait(b::SimpleBarrier)
    cond = b.c
    lock(cond)
    try
        arrived = (b.cnt += 1)
        if arrived != b.n
            wait(cond)
        else
            b.cnt = 0
            notify(cond)
        end
    finally
        unlock(cond)
    end
end

end # Tools


================================================
FILE: src/types.jl
================================================
"""
    struct WithTaskLocals{F, TLVs <: Tuple{Vararg{TaskLocalValue}}} <: Function

This callable function-like object is meant to represent a function which closes over some
[`TaskLocalValues`](https://github.com/vchuravy/TaskLocalValues.jl). That is, if you do

```
TLV{T} = TaskLocalValue{T}
f = WithTaskLocals((TLV{Int}(() -> 1), TLV{Int}(() -> 2))) do (x, y)
    z -> (x + y)/z
end
```
then that is equivalent to
```
g = let x = TLV{Int}(() -> 1), y = TLV{Int}(() -> 2)
    z -> let x = x[], y=y[]
        (x + y)/z
    end
end
```
however, the main difference is that you can call [`promise_task_local`](@ref) on a
`WithTaskLocals` closure in order to turn it into something equivalent to
```
let x=x[], y=y[]
    z -> (x + y)/z
end
```
which doesn't have the overhead of accessing the `task_local_storage` each time the closure is called.
This of course will lose the safety advantages of `TaskLocalValue`, so you should never do
`f_local = promise_task_local(f)` and then pass `f_local` to some unknown function, because if that
unknown function calls `f_local` on a new task, you'll hit a race condition.
"""
struct WithTaskLocals{F, TLVs <: Tuple{Vararg{TaskLocalValue}}} <: Function
    # Function that is applied to the tuple of unwrapped task-local values (see
    # `promise_task_local`); its result is what eventually gets called.
    inner_func::F
    # The task-local values that are unwrapped (via `x[]`) and handed to `inner_func`.
    tasklocals::TLVs
end

"""
    promise_task_local(f) = f
    promise_task_local(f::WithTaskLocals) = f.inner_func(map(x -> x[], f.tasklocals))

Take a `WithTaskLocals` closure, grab the `TaskLocalValue`s, and pass them to the closure. That is,
it turns a `WithTaskLocals` closure from the equivalent of
```
TLV{T} = TaskLocalValue{T}
let x = TLV{Int}(() -> 1), y = TLV{Int}(() -> 2)
    z -> let x = x[], y=y[]
        (x + y)/z
    end
end
```
into the equivalent of
```
let x = TLV{Int}(() -> 1), y = TLV{Int}(() -> 2)
    let x = x[], y = y[]
        z -> (x + y)/z
    end
end
```
which doesn't have the overhead of accessing the `task_local_storage` each time the closure is called.
This of course will lose the safety advantages of `TaskLocalValue`, so you should never do
`f_local = promise_task_local(f)` and then pass `f_local` to some unknown function, because if that
unknown function calls `f_local` on a new task, you'll hit a race condition.

Any argument that is not a `WithTaskLocals` is returned unchanged.
"""
function promise_task_local(f::WithTaskLocals{F}) where {F}
    # Unwrap every task-local value once and hand the resulting tuple to the inner
    # function.
    # NOTE(review): the otherwise unused `where {F}` presumably forces specialization on
    # the wrapped function type — confirm before removing it.
    f.inner_func(map(x -> x[], f.tasklocals))
end
# Fallback: anything else is already "task local" and is passed through as is.
promise_task_local(f::Any) = f

# Calling a `WithTaskLocals` object unwraps its task-local values (on every call) and
# forwards all positional and keyword arguments to the resulting function.
(f::WithTaskLocals{F})(args...; kwargs...) where {F} = promise_task_local(f)(args...; kwargs...)

"""
    ChannelLike(itr)

This struct wraps an indexable object such that it can be iterated by concurrent tasks in a
safe manner similar to a `Channel`.

`ChannelLike(itr)` is conceptually similar to:
```julia
Channel{eltype(itr)}(length(itr)) do ch
    foreach(i -> put!(ch, i), itr)
end
```
i.e. creating a channel, `put!`ing all elements of `itr` into it and closing it. The
advantage is that `ChannelLike` doesn't copy the data.

# Examples
```julia
ch = OhMyThreads.ChannelLike(1:5)

@sync for taskid in 1:2
    Threads.@spawn begin
        for i in ch
            println("Task #\$taskid processing item \$i")
            sleep(1 / i)
        end
    end
end

# output

Task #1 processing item 1
Task #2 processing item 2
Task #2 processing item 3
Task #2 processing item 4
Task #1 processing item 5
```

Note that `ChannelLike` is stateful (just like a `Channel`), so you can't iterate over it
twice.

The wrapped iterator must support `firstindex(itr)::Int`, `lastindex(itr)::Int` and
`getindex(itr, ::Int)`.
"""
mutable struct ChannelLike{T}
    # The wrapped indexable collection.
    const itr::T
    # Index of the element handed out most recently; advanced atomically by `iterate`.
    @atomic idx::Int

    # Start one position before the first index, so that the first increment lands on
    # `firstindex(itr)`.
    ChannelLike(itr::T) where {T} = new{T}(itr, firstindex(itr) - 1)
end

# Length and element type are forwarded from the wrapped collection. Note that `length`
# reports the size of the wrapped collection, not the number of remaining elements.
Base.length(ch::ChannelLike) = length(ch.itr)
Base.eltype(ch::ChannelLike) = eltype(ch.itr)

# Hand out the next element. The index is advanced atomically, so concurrent tasks
# iterating over the same `ChannelLike` each receive distinct elements. The iteration
# state is always `nothing` because all state lives in `ch.idx`.
function Base.iterate(ch::ChannelLike, ::Nothing = nothing)
    claimed = @atomic ch.idx += 1
    claimed > lastindex(ch.itr) && return nothing
    return (@inbounds(ch.itr[claimed]), nothing)
end


================================================
FILE: test/Aqua.jl
================================================
using Aqua

# Run Aqua.jl's automated package-quality checks against OhMyThreads.
@testset "Aqua.jl" begin
  Aqua.test_all(
    OhMyThreads;
    # ambiguities=(exclude=[SomePackage.some_function], broken=true),
    # stale_deps=(ignore=[:SomePackage],),
    # `Test` is exempt from the compat-entry check.
    deps_compat=(ignore=[:Test],),
    # piracies=false,
    # The persistent-tasks check is switched off.
    persistent_tasks=false,
  )
end


================================================
FILE: test/runtests.jl
================================================
using Test, OhMyThreads
using OhMyThreads: TaskLocalValue, WithTaskLocals, @fetch, promise_task_local
using OhMyThreads: Consecutive, RoundRobin
using OhMyThreads.Experimental: @barrier
using OhMyThreads.Implementation: BoxedVariableError

# Report the thread configuration (default, interactive) used for this test run.
@info "Testing with $(Threads.nthreads(:default)),$(Threads.nthreads(:interactive)) threads."

# Package-quality checks live in a separate file.
include("Aqua.jl")

# Test cases for the "Basics" testset. Each entry provides:
#   ~    : comparison used to check a result against the serial reference
#   f    : function mapped over the inputs
#   op   : reduction operator
#   itrs : tuple of input collections (splatted into the calls)
#   init : initial value for the reductions
sets_to_test = [(~ = isapprox, f = sin ∘ *, op = +,
                    itrs = (rand(ComplexF64, 10, 10), rand(-10:10, 10, 10)),
                    init = complex(0.0))
                (~ = isapprox, f = cos, op = max, itrs = (1:100000,), init = 0.0)
                (~ = (==), f = round, op = vcat, itrs = (randn(1000),), init = Float64[])
                (~ = (==), f = last, op = *,
                    itrs = ([1 => "a", 2 => "b", 3 => "c", 4 => "d", 5 => "e"],),
                    init = "")]

# Builds a `GreedyScheduler` too, but is a distinct callable: in the "Basics" testset it
# does not match `sched == GreedyScheduler` and is therefore constructed with the chunking
# keyword arguments (`nchunks`, `split`, `minchunksize`).
ChunkedGreedy(; kwargs...) = GreedyScheduler(; kwargs...)

# Compares the threaded map/reduce/collect functions against their serial counterparts
# for every combination of test case, scheduler, split, number of chunks and minimum
# chunk size.
@testset "Basics" begin
    for (; ~, f, op, itrs, init) in sets_to_test
        @testset "f=$f, op=$op, itrs::$(typeof(itrs))" begin
            @testset for sched in (
                StaticScheduler, DynamicScheduler, GreedyScheduler,
                DynamicScheduler{OhMyThreads.Schedulers.NoChunking},
                SerialScheduler, ChunkedGreedy)
                @testset for split in (Consecutive(), RoundRobin(), :consecutive, :roundrobin)
                    for nchunks in (1, 2, 6)
                        for minchunksize ∈ (nothing, 1, 3)
                            if sched == GreedyScheduler
                                scheduler = sched(; ntasks = nchunks, minchunksize)
                            elseif sched == DynamicScheduler{OhMyThreads.Schedulers.NoChunking}
                                scheduler = DynamicScheduler(; chunking = false)
                            elseif sched == SerialScheduler
                                scheduler = SerialScheduler(; nchunks)
                            else
                                scheduler = sched(; nchunks, split, minchunksize)
                            end
                            kwargs = (; scheduler)

                            # Round-robin splitting and the greedy schedulers do not
                            # preserve the order of elements, so they only work for
                            # commutative operators. `vcat` and `*` (string
                            # concatenation) are not commutative, hence those
                            # combinations (and only those) are skipped.
                            unordered = split in (RoundRobin(), :roundrobin) ||
                                        sched ∈ (GreedyScheduler, ChunkedGreedy)
                            noncommutative = op ∈ (vcat, *)
                            if !(unordered && noncommutative)
                                mapreduce_f_op_itr = mapreduce(f, op, itrs...)
                                @test tmapreduce(f, op, itrs...; init, kwargs...) ~ mapreduce_f_op_itr
                                @test treducemap(op, f, itrs...; init, kwargs...) ~ mapreduce_f_op_itr
                                @test treduce(op, f.(itrs...); init, kwargs...) ~ mapreduce_f_op_itr
                            end

                            # The map/collect checks below rely on element order.
                            split in (RoundRobin(), :roundrobin) && continue
                            map_f_itr = map(f, itrs...)
                            @test all(tmap(f, Any, itrs...; kwargs...) .~ map_f_itr)
                            @test all(tcollect(Any, (f(x...) for x in collect(zip(itrs...))); kwargs...) .~ map_f_itr)
                            @test all(tcollect(Any, f.(itrs...); kwargs...) .~ map_f_itr)

                            RT = Core.Compiler.return_type(f, Tuple{eltype.(itrs)...})

                            @test tmap(f, RT, itrs...; kwargs...) ~ map_f_itr
                            @test tcollect(RT, (f(x...) for x in collect(zip(itrs...))); kwargs...) ~ map_f_itr
                            @test tcollect(RT, f.(itrs...); kwargs...) ~ map_f_itr

                            # Without an explicit output type, only the non-greedy
                            # schedulers are exercised.
                            if sched ∉ (GreedyScheduler, ChunkedGreedy)
                                @test tmap(f, itrs...; kwargs...) ~ map_f_itr
                                @test tcollect((f(x...) for x in collect(zip(itrs...))); kwargs...) ~ map_f_itr
                                @test tcollect(f.(itrs...); kwargs...) ~ map_f_itr
                            end
                        end
                    end
                end
            end
        end
    end
end;

# Checks that pre-chunked inputs (`index_chunks`, optionally wrapped in `enumerate`) can
# be passed directly to the threaded functions.
@testset "ChunkSplitters.Chunk" begin
    x = rand(100)
    chnks = OhMyThreads.index_chunks(x; n = Threads.nthreads())
    for scheduler in (
        DynamicScheduler(),
        DynamicScheduler(; chunking = false),
        StaticScheduler(; chunking = false))
        @testset "$scheduler" begin
            @test tmap(x -> sin.(x), chnks; scheduler) ≈ map(x -> sin.(x), chnks)
            @test tmapreduce(x -> sin.(x), vcat, chnks; scheduler) ≈
                  mapreduce(x -> sin.(x), vcat, chnks)
            @test tcollect(chnks; scheduler) == collect(chnks)
            @test treduce(vcat, chnks; scheduler) == reduce(vcat, chnks)
            @test isnothing(tforeach(x -> sin.(x), chnks; scheduler))
        end
    end

    # enumerate(chunks)
    # Each result is `[chunk index, chunk sum]`; reducing with `+` therefore yields
    # `[sum of chunk indices, sum of all data]`.
    data = 1:100
    @test tmapreduce(+, enumerate(OhMyThreads.index_chunks(data; n=5)); chunking=false) do (i, idcs)
        [i, sum(@view(data[idcs]))]
    end == [sum(1:5), sum(data)]
    @test tmapreduce(+, enumerate(OhMyThreads.index_chunks(data; size=5)); chunking=false) do (i, idcs)
        [i, sum(@view(data[idcs]))]
    end == [sum(1:20), sum(data)]
    @test tmap(enumerate(OhMyThreads.index_chunks(data; n=5)); chunking=false) do (i, idcs)
        [i, idcs]
    end == [[1, 1:20], [2, 21:40], [3, 41:60], [4, 61:80], [5, 81:100]]
end;

# Exercises the `@tasks` macro together with its `@set` (settings) and `@local`
# (task-local values) helper macros.
@testset "macro API" begin
    # basic
    @test @tasks(for i in 1:3
        i
    end) |> isnothing

    # reduction
    @test @tasks(for i in 1:3
        @set reducer = (+)
        i
    end) == 6

    # scheduler settings
    for sched in (StaticScheduler(), DynamicScheduler(), GreedyScheduler())
        @test @tasks(for i in 1:3
            @set scheduler = sched
            i
        end) |> isnothing
    end
    # scheduler settings as symbols
    @test @tasks(for i in 1:3
        @set scheduler = :static
        i
    end) |> isnothing
    @test @tasks(for i in 1:3
        @set scheduler = :dynamic
        i
    end) |> isnothing
    @test @tasks(for i in 1:3
        @set scheduler = :greedy
        i
    end) |> isnothing

    # @set begin ... end
    @test @tasks(for i in 1:10
        @set begin
            scheduler = StaticScheduler()
            reducer = (+)
        end
        i
    end) == 55
    # multiple @set
    @test @tasks(for i in 1:10
        @set scheduler = StaticScheduler()
        i
        @set reducer = (+)
    end) == 55
    # @set init
    @test @tasks(for i in 1:10
        @set begin
            reducer = (+)
            init = 0.0
        end
        i
    end) === 55.0
    @test @tasks(for i in 1:10
        @set begin
            reducer = (+)
            init = 0.0 * im
        end
        i
    end) === (55.0 + 0.0im)

    # top-level "kwargs"
    @test @tasks(for i in 1:3
        @set scheduler = :static
        @set ntasks = 1
        i
    end) |> isnothing
    @test @tasks(for i in 1:3
        @set scheduler = :static
        @set nchunks = 2
        i
    end) |> isnothing
    @test @tasks(for i in 1:3
        @set scheduler = :dynamic
        @set chunksize = 2
        i
    end) |> isnothing
    @test @tasks(for i in 1:3
        @set scheduler = :dynamic
        @set chunking = false
        i
    end) |> isnothing
    @test @tasks(for i in 1:4
        @set minchunksize=2
        i
    end) |> isnothing
    # an already constructed scheduler combined with a scheduler option is rejected
    @test_throws ArgumentError @tasks(for i in 1:3
        @set scheduler = DynamicScheduler()
        @set chunking = false
        i
    end)
    # unknown settings are rejected
    @test_throws MethodError @tasks(for i in 1:3
        @set scheduler = :dynamic
        @set asd = 123
        i
    end)

    # TaskLocalValue
    ntd = 2 * Threads.nthreads()
    ptrs = Vector{Ptr{Nothing}}(undef, ntd)
    tids = Vector{UInt64}(undef, ntd)
    tid() = OhMyThreads.Tools.taskid()
    @test @tasks(for i in 1:ntd
        @local C::Vector{Float64} = rand(3)
        @set scheduler = :static
        ptrs[i] = pointer_from_objref(C)
        tids[i] = tid()
    end) |> isnothing
    # check that different iterations of a task
    # have access to the same C (same pointer)
    for t in unique(tids)
        @test allequal(ptrs[findall(==(t), tids)])
    end
    # TaskLocalValue (another fundamental check)
    @test @tasks(for i in 1:ntd
        @local x::Ref{Int64} = Ref(0)
        @set reducer = (+)
        @set scheduler = :static
        x[] += 1
        x[]
    end) == 1.5 * ntd # if a new x would be allocated per iteration, we'd get ntd here.
    # TaskLocalValue (begin ... end block), inferred TLV type
    @test @inferred (() -> @tasks for i in 1:10
        @local begin
            C = fill(4, 3, 3)
            x = fill(5.0, 3)
        end
        @set reducer = (+)
        sum(C * x)
    end)() == 1800

    # hygiene / escaping
    # (variables from the enclosing scope must be usable inside the loop body and in
    # `@set` expressions)
    var = 3
    sched = StaticScheduler()
    sched_sym = :static
    data = rand(10)
    red = (a, b) -> a + b
    n = 2
    @test @tasks(for d in data
        @set scheduler = sched
        @set reducer = red
        var * d
    end) ≈ var * sum(data)
    @test @tasks(for d in data
        @set scheduler = sched_sym
        @set ntasks = n
        @set reducer = red
        var * d
    end) ≈ var * sum(data)

    # task-local value of a user-defined type
    struct SingleInt
        x::Int
    end
    @test @tasks(for _ in 1:10
        @local C = SingleInt(var)
        @set reducer = +
        C.x
    end) == 10 * var

    # enumerate(chunks)
    let data = collect(1:100)
        @test @tasks(for (i, idcs) in enumerate(OhMyThreads.index_chunks(data; n=5))
                         @set reducer = +
                             @set chunking = false
                         [i, sum(@view(data[idcs]))]
                     end) == [sum(1:5), sum(data)]
        @test @tasks(for (i, idcs) in enumerate(OhMyThreads.index_chunks(data; size=5))
                         @set reducer = +
                             [i, sum(@view(data[idcs]))]
                     end) == [sum(1:20), sum(data)]
        @test @tasks(for (i, idcs) in enumerate(OhMyThreads.index_chunks(1:100; n=5))
                         @set chunking=false
                         @set collect=true
                         [i, idcs]
                     end) == [[1, 1:20], [2, 21:40], [3, 41:60], [4, 61:80], [5, 81:100]]
    end
end;

# Checks that `WithTaskLocals` closures see fresh task-local values on every new task,
# and demonstrates what `promise_task_local` does (and does not) guarantee.
@testset "WithTaskLocals" begin
    let x = TaskLocalValue{Base.RefValue{Int}}(() -> Ref{Int}(0)),
        y = TaskLocalValue{Base.RefValue{Int}}(() -> Ref{Int}(0))
        # Equivalent to
        # function f()
        #    x[][] += 1
        #    y[][] += 1
        #    x[][], y[][]
        # end
        f = WithTaskLocals((x, y)) do (x, y)
            function ()
                x[] += 1
                y[] += 1
                x[], y[]
            end
        end
        # Make sure we can call `f` like a regular function
        @test f() == (1, 1)
        @test f() == (2, 2)
        # A new task starts from fresh task-local values
        @test @fetch(f()) == (1, 1)
        # Acceptable use of promise_task_local
        @test @fetch(promise_task_local(f)()) == (1, 1)
        # Acceptable use of promise_task_local
        @test promise_task_local(f)() == (3, 3)
        # Acceptable use of promise_task_local
        @test @fetch(promise_task_local(f)()) == (1, 1)
        # Acceptable use of promise_task_local
        g() = @fetch((promise_task_local(f)(); promise_task_local(f)(); f()))
        @test g() == (3, 3)
        @test g() == (3, 3)

        h = promise_task_local(f)
        # Unacceptable use of `promise_task_local`
        # This is essentially testing that if you use `promise_task_local`, then pass that to another task,
        # you could get data races, since we here have a different thread writing to another thread's value.
        @test @fetch(h()) == (4, 4)
        @test @fetch(h()) == (5, 5)
    end
end;

# Checks which chunking mode each scheduler ends up in for the various combinations of
# `nchunks`, `chunksize` and `chunking`, and that invalid combinations are rejected.
@testset "chunking mode + chunksize option" begin
    @test OhMyThreads.Schedulers.chunking_mode(SerialScheduler()) ==
          OhMyThreads.Schedulers.NoChunking
    for sched in (DynamicScheduler, StaticScheduler, GreedyScheduler)
        @test sched() isa sched
        @test sched(; chunksize = 2) isa sched

        @test OhMyThreads.Schedulers.chunking_mode(sched(; chunksize = 2)) ==
              OhMyThreads.Schedulers.FixedSize
        @test OhMyThreads.Schedulers.chunking_mode(sched(; nchunks = 2)) ==
              OhMyThreads.Schedulers.FixedCount
        @test OhMyThreads.Schedulers.chunking_mode(sched(; chunking = false)) ==
              OhMyThreads.Schedulers.NoChunking
        if sched != GreedyScheduler
            # For (Dynamic|Static)Scheduler `chunking = false` disables all chunking
            # arguments
            @test OhMyThreads.Schedulers.chunking_mode(sched(;
                nchunks = 2, chunksize = 4, chunking = false)) ==
                  OhMyThreads.Schedulers.NoChunking
            @test OhMyThreads.Schedulers.chunking_mode(sched(;
                nchunks = nothing, chunksize = nothing, split = :whatever, chunking = false)) ==
                  OhMyThreads.Schedulers.NoChunking
            @test OhMyThreads.Schedulers.chunking_enabled(sched(;
                nchunks = nothing, chunksize = nothing, chunking = false)) == false
            @test OhMyThreads.Schedulers.chunking_enabled(sched(;
                nchunks = 2, chunksize = 4, chunking = false)) == false
        else
            # For GreedyScheduler `nchunks` or `chunksize` overrides `chunking = false`
            @test OhMyThreads.Schedulers.chunking_mode(sched(;
                nchunks = 2, chunking = false)) ==
                  OhMyThreads.Schedulers.FixedCount
            @test OhMyThreads.Schedulers.chunking_mode(sched(;
                chunksize = 2, chunking = false)) ==
                  OhMyThreads.Schedulers.FixedSize
            @test OhMyThreads.Schedulers.chunking_enabled(sched(;
                nchunks = 2, chunking = false)) == true
            @test OhMyThreads.Schedulers.chunking_enabled(sched(;
                chunksize = 4, chunking = false)) == true
        end
        @test OhMyThreads.Schedulers.chunking_enabled(sched(; chunksize = 2)) == true
        @test OhMyThreads.Schedulers.chunking_enabled(sched(; nchunks = 2)) == true
        # `nchunks` and `chunksize` are mutually exclusive; unknown split symbols are
        # rejected when chunking is enabled
        @test_throws ArgumentError sched(; nchunks = 2, chunksize = 3)
        @test_throws ArgumentError sched(; nchunks = 2, split = :whatever)

        # fixed chunk size: results must agree with the serial functions
        let scheduler = sched(; chunksize = 2, split = :batch)
            @test tmapreduce(sin, +, 1:10; scheduler, init=0.0) ≈ mapreduce(sin, +, 1:10)
            @test treduce(+, 1:10; scheduler, init=0.0) ≈ reduce(+, 1:10)
            @test tmap(sin, Float64, 1:10; scheduler) ≈ map(sin, 1:10)
            @test isnothing(tforeach(sin, 1:10; scheduler))
        end
    end
end;

# Checks that scheduler options can be passed directly as keyword arguments to the
# threaded functions, and which combinations with an explicit `scheduler` are rejected.
@testset "top-level kwargs" begin
    res_tmr = mapreduce(sin, +, 1:10000)

    # scheduler not given
    @test tmapreduce(sin, +, 1:10000; ntasks = 2) ≈ res_tmr
    @test tmapreduce(sin, +, 1:10000; nchunks = 2) ≈ res_tmr
    @test tmapreduce(sin, +, 1:10000; split = RoundRobin()) ≈ res_tmr
    @test tmapreduce(sin, +, 1:10000; chunksize = 2) ≈ res_tmr
    @test tmapreduce(sin, +, 1:10000; chunking = false) ≈ res_tmr
    @test tmapreduce(sin, +, 1:10000; minchunksize=10) ≈ res_tmr
    @test tmapreduce(sin, +, 1:10; minchunksize=10) == mapreduce(sin, +, 1:10)

    # scheduler isa Scheduler
    # (scheduler options may not be combined with an already constructed scheduler)
    @test tmapreduce(sin, +, 1:10000; scheduler = StaticScheduler()) ≈ res_tmr
    @test_throws ArgumentError tmapreduce(
        sin, +, 1:10000; ntasks = 2, scheduler = DynamicScheduler())
    @test_throws ArgumentError tmapreduce(
        sin, +, 1:10000; chunksize = 2, scheduler = DynamicScheduler())
    @test_throws ArgumentError tmapreduce(
        sin, +, 1:10000; split = RoundRobin(), scheduler = StaticScheduler())
    @test_throws ArgumentError tmapreduce(
        sin, +, 1:10000; ntasks = 3, scheduler = SerialScheduler())

    # scheduler isa Symbol
    for s in (:dynamic, :static, :serial, :greedy)
        @test tmapreduce(sin, +, 1:10000; scheduler = s, init = 0.0) ≈ res_tmr
    end
    for s in (:dynamic, :static, :greedy)
        @test tmapreduce(sin, +, 1:10000; ntasks = 2, scheduler = s, init = 0.0) ≈ res_tmr
    end
    for s in (:dynamic, :static)
        @test tmapreduce(sin, +, 1:10000; chunksize = 2, scheduler = s) ≈ res_tmr
        @test tmapreduce(sin, +, 1:10000; chunking = false, scheduler = s) ≈ res_tmr
        @test tmapreduce(sin, +, 1:10000; nchunks = 3, scheduler = s) ≈ res_tmr
        @test tmapreduce(sin, +, 1:10000; ntasks = 3, scheduler = s) ≈ res_tmr
        # `ntasks` and `nchunks` are aliases, so giving both must throw
        @test_throws ArgumentError tmapreduce(
            sin, +, 1:10000; ntasks = 3, nchunks = 2, scheduler = s)
    end
    @test_throws ArgumentError tmapreduce(sin, +, 1:10000; scheduler = :whatever)
    @test_throws ArgumentError tmapreduce(
        sin, +, 1:10000; threadpool = :whatever, chunking = false)
end;

# Checks that the threaded functions behave like their serial counterparts on empty
# inputs (an empty range and an empty vector).
@testset "empty collections" begin
    # The exception type expected from reducing an empty collection with `min`
    # differs between Julia versions.
    @static if VERSION < v"1.11.0-"
        err = MethodError
    else
        err = ArgumentError
    end
    for empty_coll in (11:9, Float64[])
        for f in (sin, x -> im * x, identity)
            for op in (+, *, min)
                # mapreduce
                for init in (0.0, 0, 0.0 * im, 0.0f0)
                    @test tmapreduce(f, op, empty_coll; init) == init
                end
                # foreach
                @test tforeach(f, empty_coll) |> isnothing
                # reduce
                if op != min
                    @test treduce(op, empty_coll) == reduce(op, empty_coll)
                else
                    @test_throws err treduce(op, empty_coll)
                end
                # map
                @test tmap(f, empty_coll) == map(f, empty_coll)
                # collect
                @test tcollect(empty_coll) == collect(empty_coll)
            end
        end
    end
end;

# Helper for testing the @one_by_one region: a resource that detects (and reports via
# an `ErrorException`) when it is used by more than one party at the same time.
mutable struct SingleAccessOnly
    # `true` while some caller is inside `acquire`.
    in_use::Bool
    # Guards reads and writes of `in_use`.
    const lck::ReentrantLock

    function SingleAccessOnly()
        return new(false, ReentrantLock())
    end
end

# Runs `f()` while marking `o` as in use and returns its result. Throws if `o` is
# already in use on entry, or if it is unexpectedly no longer marked as in use on exit.
function acquire(f, o::SingleAccessOnly)
    @lock o.lck begin
        if o.in_use
            throw(ErrorException("Already in use!"))
        end
        o.in_use = true
    end
    try
        return f()
    finally
        @lock o.lck begin
            if !o.in_use
                throw(ErrorException("Conflict!"))
            end
            o.in_use = false
        end
    end
end

# Tests for the `@one_by_one` and `@only_one` regions inside `@tasks` loops.
#
# A bare `catch` is used throughout: `catch ErrorException` would NOT filter by
# exception type; it binds the caught exception to a new local variable named
# `ErrorException`. Every exception escaping the `try` body fails the test.
@testset "regions" begin
    @testset "@one_by_one" begin
        sao = SingleAccessOnly()

        # `acquire` throws when `sao` is entered while it is already in use, so
        # this only passes if the tasks go through the region one at a time.
        try
            @tasks for i in 1:10
                @set ntasks = 10
                @one_by_one begin
                    acquire(sao) do
                        sleep(0.01)
                    end
                end
            end
        catch
            @test false
        else
            @test true
        end

        # test escaping
        let
            x = Ref(0)
            y = Ref(0)
            @tasks for i in 1:10
                @set ntasks = 10

                y[] += 1 # not safe (race condition)
                @one_by_one begin
                    x[] += 1 # parallel-safe because inside of one_by_one region
                    acquire(sao) do
                        sleep(0.01)
                    end
                end
            end
            @test x[] == 10
        end

        # same as above, but with the loop living inside a function body
        test_f = () -> begin
            x = Ref(0)
            y = Ref(0)
            @tasks for i in 1:10
                @set ntasks = 10

                y[] += 1 # not safe (race condition)
                @one_by_one begin
                    x[] += 1 # parallel-safe because inside of one_by_one region
                    acquire(sao) do
                        sleep(0.01)
                    end
                end
            end
            return x[]
        end
        @test test_f() == 10
    end

    @testset "@only_one" begin
        # one task per iteration: exactly one increment is expected
        let
            x = Ref(0)
            y = Ref(0)
            try
                @tasks for i in 1:10
                    @set ntasks = 10

                    y[] += 1 # not safe (race condition)
                    @only_one begin
                        x[] += 1 # parallel-safe because only a single task will execute this
                    end
                end
                @test x[] == 1 # only a single task should have incremented x
            catch
                @test false
            end
        end

        # two tasks with five iterations each
        let
            x = Ref(0)
            y = Ref(0)
            try
                @tasks for i in 1:10
                    @set ntasks = 2

                    y[] += 1 # not safe (race condition)
                    @only_one begin
                        x[] += 1 # parallel-safe because only a single task will execute this
                    end
                end
                @test x[] == 5 # a single task should have incremented x 5 times
            catch
                @test false
            end
        end

        # same as above, but with the loop living inside a function body
        test_f = () -> begin
            x = Ref(0)
            y = Ref(0)
            @tasks for i in 1:10
                @set ntasks = 2

                y[] += 1 # not safe (race condition)
                @only_one begin
                    x[] += 1 # parallel-safe because only a single task will execute this
                end
            end
            return x[]
        end
        @test test_f() == 5
    end

    @testset "@only_one + @one_by_one" begin
        x = Ref(0)
        y = Ref(0)
        try
            @tasks for i in 1:10
                @set ntasks = 10

                @only_one begin
                    x[] += 1 # parallel-safe
                end

                @one_by_one begin
                    y[] += 1 # parallel-safe
                end
            end
            @test x[] == 1 && y[] == 10
        catch
            @test false
        end
    end
end;

# Tests for `@barrier` inside `@tasks` loops.
#
# A bare `catch` is used throughout: `catch ErrorException` would NOT filter by
# exception type; it binds the caught exception to a new local variable named
# `ErrorException`.
@testset "@barrier" begin
    # a loop that contains only a barrier runs through and returns `nothing`
    @test (@tasks for i in 1:20
        @set ntasks = 20
        @barrier
    end) |> isnothing

    # expanding `@barrier` without an `@set ntasks = ...` in the loop must throw
    @test try
        @macroexpand @tasks for i in 1:20
            @barrier
        end
        false
    catch
        true
    end

    # single barrier: no task may pass it before all 20 have incremented x
    @test try
        x = Threads.Atomic{Int64}(0)
        y = Threads.Atomic{Int64}(0)
        @tasks for i in 1:20
            @set ntasks = 20

            Threads.atomic_add!(x, 1)
            @barrier
            if x[] < 20 && y[] > 0 # x hasn't reached 20 yet and y is already > 0
                error("shouldn't happen")
            end
            Threads.atomic_add!(y, 1)
        end
        true
    catch
        false
    end

    # two barriers: every task increments x twice, so x must be 40 past the second one
    @test try
        x = Threads.Atomic{Int64}(0)
        y = Threads.Atomic{Int64}(0)
        @tasks for i in 1:20
            @set ntasks = 20

            Threads.atomic_add!(x, 1)
            @barrier
            Threads.atomic_add!(x, 1)
            @barrier
            if x[] < 40 && y[] > 0 # x hasn't reached 40 yet and y is already > 0
                error("shouldn't happen")
            end
            Threads.atomic_add!(y, 1)
        end
        true
    catch
        false
    end
end

# The special macros must also work when written module-qualified
# (`OhMyThreads.@set`, `OhMyThreads.@local`, ...) inside a `@tasks` loop.
@testset "verbose special macro usage" begin
    # OhMyThreads.@set
    @test @tasks(for i in 1:3
        OhMyThreads.@set reducer = (+)
        i
    end) == 6
    @test @tasks(for i in 1:3
        OhMyThreads.@set begin
            reducer = (+)
        end
        i
    end) == 6
    # OhMyThreads.@local
    # the qualified and the unqualified spelling must produce the same result
    ntd = 2 * Threads.nthreads()
    @test @tasks(for i in 1:ntd
        OhMyThreads.@local x::Ref{Int64} = Ref(0)
        OhMyThreads.@set begin
            reducer = (+)
            scheduler = :static
        end
        x[] += 1
        x[]
    end) == @tasks(for i in 1:ntd
        @local x::Ref{Int64} = Ref(0)
        @set begin
            reducer = (+)
            scheduler = :static
        end
        x[] += 1
        x[]
    end)
    # OhMyThreads.@only_one
    let
        x = Ref(0)
        y = Ref(0)
        # A bare `catch` is used on purpose: `catch ErrorException` would not filter
        # by type, it would bind the exception to a local named `ErrorException`.
        try
            @tasks for i in 1:10
                OhMyThreads.@set ntasks = 10

                y[] += 1 # not safe (race condition)
                OhMyThreads.@only_one begin
                    x[] += 1 # parallel-safe because only a single task will execute this
                end
            end
            @test x[] == 1 # only a single task should have incremented x
        catch
            @test false
        end
    end
    # OhMyThreads.@one_by_one
    test_f = () -> begin
        sao = SingleAccessOnly()
        x = Ref(0)
        y = Ref(0)
        @tasks for i in 1:10
            OhMyThreads.@set ntasks = 10

            y[] += 1 # not safe (race condition)
            OhMyThreads.@one_by_one begin
                x[] += 1 # parallel-safe because inside of one_by_one region
                acquire(sao) do
                    sleep(0.01)
                end
            end
        end
        return x[]
    end
    @test test_f() == 10
end

# Checks the multi-line `text/plain` rendering of the scheduler types.
@testset "show schedulers" begin
    nt = Threads.nthreads(:default)

    # `text/plain` representation of a scheduler as a String
    plaintext(sched) = repr("text/plain", sched)

    dynamic_default = """
        DynamicScheduler
        ├ Chunking: fixed count ($nt), split :consecutive
        └ Threadpool: default"""
    @test plaintext(DynamicScheduler()) == dynamic_default

    dynamic_unchunked = """
        DynamicScheduler
        ├ Chunking: none
        └ Threadpool: interactive"""
    @test plaintext(DynamicScheduler(; chunking = false, threadpool = :interactive)) ==
          dynamic_unchunked

    static_default = """
        StaticScheduler
        ├ Chunking: fixed count ($nt), split :consecutive
        └ Threadpool: default"""
    @test plaintext(StaticScheduler()) == static_default

    # `split = :scatter` is expected to be displayed as `:roundrobin`
    static_sized = """
        StaticScheduler
        ├ Chunking: fixed size (2), split :roundrobin
        └ Threadpool: default"""
    @test plaintext(StaticScheduler(; chunksize = 2, split = :scatter)) == static_sized

    greedy_chunked = """
        GreedyScheduler
        ├ Num. tasks: $nt
        ├ Chunking: fixed count ($(10 * nt)), split :roundrobin
        └ Threadpool: default"""
    @test plaintext(GreedyScheduler(; chunking = true)) == greedy_chunked
end

# Tests for the detection of boxed captured variables.
# These tests only run when Julia was started with more than one thread.
# NOTE: the outcome of these tests depends on exactly where `A` is assigned
# relative to the closures, so do not move or rename the assignments.
if Threads.nthreads() > 1
    @testset "Boxing detection and error" begin
        # Baseline: `A` is only ever assigned inside the closures, so both the implicit
        # (`A = i`) and the explicit (`local A = i`) form are expected to work.
        let
            f1() = tmap(1:10) do i
                A = i
                sleep(rand()/10)
                A
            end
            f2() = tmap(1:10) do i
                local A = i
                sleep(rand()/10)
                A
            end

            @test f1() == 1:10
            @test f2() == 1:10
        end

        # Same closures as above, but `A` is also assigned at the end of the enclosing
        # `let` block. `f1` is now expected to throw, while `f2` (which declares
        # `local A`) is expected to keep working.
        let
            f1() = tmap(1:10) do i
                A = i
                sleep(rand()/10)
                A
            end
            f2() = tmap(1:10) do i
                local A = i
                sleep(rand()/10)
                A
            end

            @test_throws BoxedVariableError f1()
            @test f2() == 1:10

            A = 1 # Cause spooky action-at-a-distance by making A outer-local to the whole let block!
        end

        # `A` is defined before the closures and re-assigned inside of them.
        let
            A = 1
            f1() = tmap(1:10) do i
                A = 1
            end
            @test_throws BoxedVariableError f1() == ones(10) # Throws even though the redefinition is 'harmless'

            # Inside `@allow_boxed_captures` the same closure is expected to run without error.
            @allow_boxed_captures begin
                f2() = tmap(1:10) do i
                    A = 1
                end
                @test f2() == ones(10)
            end

            # Can nest allow and disallow because they're scoped values!
            function f3()
                @disallow_boxed_captures begin
                    tmap(1:10) do i
                    A = 1
                    end
                end
            end
            # The `@disallow_boxed_captures` inside `f3` is expected to take precedence
            # over the `@allow_boxed_captures` around the call.
            @allow_boxed_captures begin
                @test_throws BoxedVariableError f3() == ones(10)
            end
        end
        @testset "@localize" begin
            A = 1
            # The assignment in the never-executed branch still counts as a second
            # assignment to `A`.
            if false
                A = 2
            end
            ## This stops A from being boxed!
            v = @localize A tmap(1:2) do _
                A
            end
            @test v == [1, 1]
        end
    end
end

# TODO: add way more tests, and make the tests easier to deal with