[
  {
    "path": ".JuliaFormatter.toml",
    "content": "style = \"sciml\""
  },
  {
    "path": ".github/dependabot.yml",
    "content": "# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates\nversion: 2\nupdates:\n  - package-ecosystem: \"github-actions\"\n    directory: \"/\" # Location of package manifests\n    schedule:\n      interval: \"monthly\"\n"
  },
  {
    "path": ".github/workflows/changelog.yml",
    "content": "name: changelog\non:\n  pull_request:\n    types: [opened, synchronize, reopened, ready_for_review, labeled, unlabeled]\n\njobs:\n  # Enforces the update of a changelog file on every pull request\n  # Can be skipped with the `Skip-Changelog` label\n  changelog:\n    runs-on: ubuntu-latest\n    steps:\n    - uses: dangoslen/changelog-enforcer@v3"
  },
  {
    "path": ".github/workflows/ci.yml",
    "content": "name: CI\non:\n  - push\n  - pull_request\njobs:\n  test:\n    name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}\n    runs-on: ${{ matrix.os }}\n    strategy:\n      fail-fast: false\n      matrix:\n        version:\n          - '1.10'\n          - 'pre'\n        os:\n          - ubuntu-latest\n          - windows-latest\n        arch:\n          - x64\n        include:\n          - os: macos-latest\n            arch: aarch64\n            version: '1.10'\n          - os: macos-latest\n            arch: aarch64\n            version: 'pre'\n    steps:\n      - uses: actions/checkout@v6\n      - uses: julia-actions/setup-julia@v3\n        with:\n          version: ${{ matrix.version }}\n          arch: ${{ matrix.arch }}\n      - uses: julia-actions/cache@v2\n      - uses: julia-actions/julia-buildpkg@v1\n      - uses: julia-actions/julia-runtest@v1\n        env:\n          JULIA_NUM_THREADS: 4,2\n      - uses: julia-actions/julia-processcoverage@v1\n      - uses: codecov/codecov-action@v5\n        with:\n          files: lcov.info\n"
  },
  {
    "path": ".github/workflows/compathelper.yml",
    "content": "name: CompatHelper\non:\n  schedule:\n    - cron: 0 0 * * *\n  workflow_dispatch:\npermissions:\n  contents: write\n  pull-requests: write\njobs:\n  CompatHelper:\n    runs-on: ubuntu-latest\n    steps:\n      - name: Check if Julia is already available in the PATH\n        id: julia_in_path\n        run: which julia\n        continue-on-error: true\n      - name: Install Julia, but only if it is not already available in the PATH\n        uses: julia-actions/setup-julia@v3\n        with:\n          version: '1'\n          arch: ${{ runner.arch }}\n        if: steps.julia_in_path.outcome != 'success'\n      - name: \"Add the General registry via Git\"\n        run: |\n          import Pkg\n          ENV[\"JULIA_PKG_SERVER\"] = \"\"\n          Pkg.Registry.add(\"General\")\n        shell: julia --color=yes {0}\n      - name: \"Install CompatHelper\"\n        run: |\n          import Pkg\n          name = \"CompatHelper\"\n          uuid = \"aa819f21-2bde-4658-8897-bab36330d9b7\"\n          version = \"3\"\n          Pkg.add(; name, uuid, version)\n        shell: julia --color=yes {0}\n      - name: \"Run CompatHelper\"\n        run: |\n          import CompatHelper\n          CompatHelper.main()\n        shell: julia --color=yes {0}\n        env:\n          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}\n          COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }}\n          # COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }}\n"
  },
  {
    "path": ".github/workflows/documentation.yml",
    "content": "name: Documentation\n\non:\n  push:\n    branches:\n      - master\n    tags: '*'\n    paths:\n      - 'docs/**'\n      - 'src/**'\n  pull_request:\n    paths:\n      - 'docs/**'\n      - 'src/**'\n\nconcurrency:\n  # Skip intermediate builds: always.\n  # Cancel intermediate builds: always.\n  group: ${{ github.workflow }}-${{ github.ref }}\n  cancel-in-progress: true\n\njobs:\n  build:\n    permissions:\n      contents: write\n      statuses: write\n    runs-on: ubuntu-latest\n    steps:\n      - uses: actions/checkout@v6\n      - uses: julia-actions/setup-julia@latest\n        with:\n          version: '1'\n      - name: Build and deploy\n        env:\n          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # For authentication with GitHub Actions token\n          DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} # For authentication with SSH deploy key\n        run: julia docs/build_docs.jl"
  },
  {
    "path": ".github/workflows/downgrade_CI.yml",
    "content": "name: Downgrade\non:\n  pull_request:\n    branches:\n      - master\n    paths-ignore:\n      - 'docs/**'\n  push:\n    branches:\n      - master\n    paths-ignore:\n      - 'docs/**'\njobs:\n  test:\n    runs-on: ubuntu-latest\n    strategy:\n      matrix:\n        version: ['1']\n    steps:\n      - uses: actions/checkout@v6\n      - uses: julia-actions/setup-julia@v3\n        with:\n          version: ${{ matrix.version }}\n      - uses: cjdoris/julia-downgrade-compat-action@v1\n        with:\n          skip: Pkg,TOML,Test,Markdown\n      - uses: julia-actions/julia-buildpkg@v1\n      - uses: julia-actions/julia-runtest@v1\n"
  },
  {
    "path": ".github/workflows/tagbot.yml",
    "content": "name: TagBot\non:\n  issue_comment:\n    types:\n      - created\n  workflow_dispatch:\n    inputs:\n      lookback:\n        default: 3\npermissions:\n  actions: read\n  checks: read\n  contents: write\n  deployments: read\n  issues: read\n  discussions: read\n  packages: read\n  pages: read\n  pull-requests: read\n  repository-projects: read\n  security-events: read\n  statuses: read\njobs:\n  TagBot:\n    if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot'\n    runs-on: ubuntu-latest\n    steps:\n      - uses: JuliaRegistries/TagBot@v1\n        with:\n          token: ${{ secrets.GITHUB_TOKEN }}\n          # Edit the following line to reflect the actual name of the GitHub Secret containing your private key\n          ssh: ${{ secrets.DOCUMENTER_KEY }}\n          # ssh: ${{ secrets.NAME_OF_MY_SSH_PRIVATE_KEY_SECRET }}"
  },
  {
    "path": ".gitignore",
    "content": "docs/build\nManifest.toml\n.vscode\n*~\n"
  },
  {
    "path": "CHANGELOG.md",
    "content": "OhMyThreads.jl Changelog\n=========================\n\nUnreleased\n------------\n- ![Enhancement][badge-enhancement] `SerialScheduler` now accepts and ignores arguments passed to it to make switching schedulers easier [#162][gh-pr-162].\n\nVersion 0.8.3\n------------\n- ![Enhancement][badge-enhancement] The overhead of `tmapreduce` in the serial case was reduced a bit. Sentinel values in scheduler kwarg internals were replaced by `nothing` [#148][gh-pr-148]\n\nVersion 0.8.2\n------------\n- ![Feature][badge-feature] Added a `minchunksize` chunking argument for schedulers, so that they can specify a lower bound on the size of chunks which are worth parallelizing. For example, `treduce(+, 1:10; minchunksize=100)` will run serially, but `treduce(+, 1:1000000; minchunksize=100)` will be parallelized [#145][gh-pr-145].\n- ![Enhancement][badge-enhancement] Operations on collections with only one 'chunk' no longer spawn an unnecessary task. That means operations like `treduce(+, 1:10; minchunksize=100)` will have less overhead [#145][gh-pr-145].\n\nVersion 0.8.1\n------------\n- ![Feature][badge-feature] Added a `@localize` macro which turns `@localize x y expr` into `let x=x, y=y; expr end` ([#142][gh-pr-142])\n- ![INFO][badge-info] The error messafe for captured variables now has a longer error hint that displays when the `Markdown` package is loaded (e.g. in the REPL.) ([#142][gh-pr-142])\n\nVersion 0.8.0\n-------------\n- ![BREAKING][badge-breaking] We now detect and throw errors if an `OhMyThreads` parallel function is passed a closure containing a `Box`ed variable. This behaviour can be disabled with the new `@allow_boxed_captures` macro, and re-enabled with `@disallow_boxed_captures`. ([#141][gh-pr-141])\n- ![INFO][badge-info] Schedulder chunking info is no longer directly available via `getproperty`. 
This was never a public interface, but it's possible some users relied upon it [#135][gh-pr-135].\n\nVersion 0.7.0\n-------------\n- ![BREAKING][badge-breaking] We now use ChunkSplitters version 3.0. The function `OhMyThreads.chunks` has been renamed to `OhMyThreads.index_chunks`. The new functions `index_chunks` and `chunks` (different from the old one with the same name!) are now exported. See ChunkSplitters.jl for more information.\n- ![BREAKING][badge-breaking] If you provide a `chunks` or `index_chunks` as input we now disable the internal chunking without a warning. Previously, we did show a warning unless you had set `chunking=false`. In contrast, we now throw an error when you set any incompatible chunking related keyword arguments.\n- ![Deprecation][badge-deprecation] The `split` options `:batch` and `:scatter` are now deprecated (they still work but will be dropped at some point). Use `:consecutive` and `:roundrobin`, respectively, instead.\n- ![Enhancement][badge-enhancement] The `split` keyword argument can now also be a `<: OhMyThreads.Split`. Compared to providing a `Symbol`, the former can potentially give better performance. For example, you can replace `:consecutive` by `OhMyThreads.Consecutive()` and `:roundrobin` by `OhMyThreads.RoundRobin()`.\n- ![Feature][badge-feature] `ChannelLike` is a new public (but not exported) type. `ChannelLike(itr)` provides a way to iterate over `itr` in a concurrency-safe manner similar to `Channel`. See the docstring for more details. ([#121][gh-pr-121])\n- ![Enhancement][badge-enhancement] `ChannelLike` is used internally for the `GreedyScheduler` when `chunking=true`. This improves performance overall but it is especially noticeable when the number of chunks is large. ([#121][gh-pr-121])\n\nVersion 0.6.2\n-------------\n- ![Enhancement][badge-enhancement] Added API support for `enumerate(chunks(...))`. 
Best used in combination with `chunking=false`\n\nVersion 0.6.1\n-------------\n\nVersion 0.6.0\n-------------\n- ![BREAKING][badge-breaking] Drop support for Julia < 1.10.\n\nVersion 0.5.3\n-------------\n- ![Enhancement][badge-enhancement] For the special/fake \"macros\" like, e.g., `@set`, support the verbose form `OhMyThreads.@set` within a `@tasks` for-loop (#107).\n\nVersion 0.5.2\n-------------\n- ![Enhancement][badge-enhancement] For empty input (e.g. `Float64[]` or `11:10`) behavior is now aligned with the serial functions in `Base`.\n\nVersion 0.5.1\n-------------\n- ![Feature][badge-feature] Within a parallel `@tasks` block one can now mark a region with `@one_by_one`. This region will be run by one task at a time (\"critical region\").\n- ![Feature][badge-feature] Within a `@tasks` block one can now mark a region with `@only_one`. This region will be run by a single parallel task only (other tasks will skip over it).\n- ![Experimental][badge-experimental] Added tentative support for `@barrier` in `@tasks` blocks. See `?OhMyThreads.Tools.@barrier` for more information. Note that this feature is experimental and **not** part of the public API (i.e. doesn't fall under SemVer).\n- ![Info][badge-info] Compat bounds for [BangBang.jl](https://github.com/JuliaFolds2/BangBang.jl) have been relaxed to include v0.3.40\n\nVersion 0.5.0\n-------------\n\n- ![Feature][badge-feature] The parallel functions (e.g. tmapreduce etc.) now support `scheduler::Symbol` besides `scheduler::Scheduler`. To configure the selected scheduler (e.g. set `nchunks` etc.) one may now pass keyword arguments directly into the parallel functions (they will get passed on to the scheduler constructor). Example: `tmapreduce(sin, +, 1:10; chunksize=2, scheduler=:static)`. Analogous support has been added to the macro API: (Most) settings (`@set name = value`) will now be passed on to the parallel functions as keyword arguments (which then forward them to the scheduler constructor). 
Note that, to avoid ambiguity, we don't support this feature for `scheduler::Scheduler` but only for `scheduler::Symbol`.\n- ![Feature][badge-feature] Added a `SerialScheduler` that can be used to turn off any multithreading.\n- ![Feature][badge-feature] Added `OhMyThreads.WithTaskLocals` that represents a closure over `TaskLocalValues`, but can have those values materialized as an optimization (using `OhMyThreads.promise_task_local`)\n- ![Feature][badge-feature] In the case `nchunks > nthreads()`, the `StaticScheduler` now distributes chunks in a round-robin fashion (instead of either implicitly decreasing `nchunks` to `nthreads()` or throwing an error).\n- ![Feature][badge-feature] `@set init = ...` may now be used to specify an initial value for a reduction (only has an effect in conjunction with `@set reducer=...` and triggers a warning otherwise).\n- ![Enhancement][badge-enhancement] `SerialScheduler` and `DynamicScheduler` now support the keyword argument `ntasks` as an alias for `nchunks`.\n- ![Enhancement][badge-enhancement] Made `@tasks` use `OhMyThreads.WithTaskLocals` automatically as an optimization.\n- ![Enhancement][badge-enhancement] Uses of `@local` within `@tasks` no longer require users to declare the type of the task-local value; it can be inferred automatically if a type is not provided.\n- ![Enhancement][badge-enhancement] Made `using OhMyThreads: ...` more explicit in examples in the documentation and docstrings.\n- ![BREAKING][badge-breaking] The `DynamicScheduler` (default) and the `StaticScheduler` now support a `chunksize` argument to specify the desired size of chunks instead of the number of chunks (`nchunks`). Note that `chunksize` and `nchunks` are mutually exclusive. (This is unlikely to break existing code but technically could because the type parameter has changed from `Bool` to `ChunkingMode`.)\n- ![BREAKING][badge-breaking] The greedy scheduler now supports chunking (similar to the static and dynamic scheduler). 
You can opt into it with, e.g., `chunking=true`. (This is unlikely to break existing code but technically could because we introduced a new type parameter for `GreedyScheduler`.)\n- ![Breaking][badge-breaking] `DynamicScheduler` and `StaticScheduler` don't support `nchunks=0` or `chunksize=0` any longer. Instead, chunking can now be turned off via an explicit new keyword argument `chunking=false`.\n- ![BREAKING][badge-breaking] Within a `@tasks` block, task-local values must from now on be defined via `@local` instead of `@init` (renamed).\n- ![BREAKING][badge-breaking] The (already deprecated) `SpawnAllScheduler` has been dropped.\n- ![BREAKING][badge-breaking] The default value for `ntasks`/`nchunks` for `DynamicScheduler` has been changed from `2*nthreads()` to `nthreads()`. With the new value we now align with `@threads :dynamic`. The old value wasn't giving good load balancing anyway and choosing a higher value penalizes uniform use cases even more. To get the old behavior, set `nchunks=2*nthreads()`.\n- ![Bugfix][badge-bugfix] When using the `GreedyScheduler` in combination with `tmapreduce` (or functions that build upon it) there could be non-deterministic errors in some cases (small input collection, not much work per element, see [#82](https://github.com/JuliaFolds2/OhMyThreads.jl/issues/82)). These cases should be fixed now.\n- ![Bugfix][badge-bugfix] We now handle empty collections as input in `tmapreduce` and `tforeach` explicitly ([#86](https://github.com/JuliaFolds2/OhMyThreads.jl/issues/86)). Our general philosophy is to try to match the behavior of the serial `Base` functions.\n\nVersion 0.4.6\n-------------\n\n- ![Feature][badge-feature] Introduction of macro API (`@tasks`) that transforms for loops into corresponding `tforeach`, `tmapreduce`, and `tmap` calls. 
This new API enables us to facilitate certain patterns, like defining task local values.\n\nVersion 0.4.5\n-------------\n\n- ![Enhancement][badge-enhancement] Improved the thread-safe storage section of the documentation.\n\nVersion 0.4.4\n-------------\n\n- ![Bugfix][badge-bugfix] Fixed a type specification bug that could occur when passing a `Chunk` into, say, `tmapreduce`.\n\nVersion 0.4.3\n-------------\n\n- ![Feature][badge-feature] Forward (but don't export) the macros `@fetch` and `@fetchfrom` from StableTasks.jl (v0.1.5), which are analogous to the same-named macros in Distributed.jl.\n\nVersion 0.4.2\n-------------\n\n- ![Feature][badge-feature] `DynamicScheduler` now supports `nchunks=0`, which turns off internal chunking entirely.\n- ![Deprecation][badge-deprecation] `SpawnAllScheduler` is now deprecated in favor of `DynamicScheduler(; nchunks=0)`.\n- ![Feature][badge-feature] Partial support for passing in a `ChunkSplitters.Chunk` when using `DynamicScheduler` (default). In this case, one should generally use `DynamicScheduler(; nchunks=0)`, i.e. turn off internal chunking.\n- ![Feature][badge-feature] `StaticScheduler` now supports `nchunks=0`, which turns off internal chunking entirely. Only works for input that has `<= nthreads()` elements.\n\nVersion 0.4.1\n-------------\n\n- ![Feature][badge-feature] Added a new, simple `SpawnAllScheduler` that spawns a task per input element (can be a lot of tasks!).\n- ![Info][badge-info] Added downgrade_CI which makes sure the test suite works on the oldest versions of dependencies.\n\nVersion 0.4.0\n-------------\n\n- ![BREAKING][badge-breaking] Instead of taking keyword arguments `schedule`, `nchunks`, `split` directly, we now use `Scheduler` structs to specify scheduling options ([#22](https://github.com/JuliaFolds2/OhMyThreads.jl/issues/22)). 
The latter can be provided to all API functions via the new `scheduler` keyword argument.\n- ![BREAKING][badge-breaking] The default scheduler (`DynamicScheduler`) now, by default, creates `2*nthreads()` tasks to provide load-balancing by default. The old behavior can be restored with `DynamicScheduler(; nchunks=nthreads())`.\n- ![Enhancement][badge-enhancement] We reject unsupported keyword arguments early and give a more helpful error message.\n\nVersion 0.3.1\n-------------\n\n- ![Bugfix][badge-bugfix] The documented Public API wasn't updated in 0.3.0 and thus out of sync with the actual API. Fixed in this version.\n\nVersion 0.3.0\n-------------\n\n- ![BREAKING][badge-breaking] We don't (re-)export `chunks` anymore. Use `OhMyThreads.chunks` instead.\n- ![Feature][badge-feature] We now provide `OhMyThreads.TaskLocalValue` (from [TaskLocalValues.jl](https://github.com/vchuravy/TaskLocalValues.jl)) as a nice solution for task-local values. See the corresponding page in the documentation ([#25][gh-issue-25]).\n- ![Enhancement][badge-enhancement] Added a few missing `@views`.\n- ![Enhancement][badge-enhancement] Added three examples to the docs: Monte Carlo, Julia set, and trapezoidal integration.\n- ![Enhancement][badge-enhancement] Improved all docstrings of the exported API functions. Keyword options are now only shown in the extended help (e.g. 
`??tmap`) ([#27][gh-issue-27]).\n- ![Enhancement][badge-enhancement] Added a translation page that hopefully helps with the Base.Threads → OhMyThreads.jl transition ([#24][gh-issue-24]).\n\nVersion 0.2.1\n-------------\n\n- ![Enhancement][badge-enhancement] Basic documentation.\n- ![Enhancement][badge-enhancement] Making `ChunkSplitters` available internally.\n\nVersion 0.2.0\n-------------\n\n- Initial version.\n\n[badge-breaking]: https://img.shields.io/badge/BREAKING-red.svg\n[badge-deprecation]: https://img.shields.io/badge/Deprecation-orange.svg\n[badge-feature]: https://img.shields.io/badge/Feature-green.svg\n[badge-experimental]: https://img.shields.io/badge/Experimental-yellow.svg\n[badge-enhancement]: https://img.shields.io/badge/Enhancement-blue.svg\n[badge-bugfix]: https://img.shields.io/badge/Bugfix-purple.svg\n[badge-fix]: https://img.shields.io/badge/Fix-purple.svg\n[badge-info]: https://img.shields.io/badge/Info-gray.svg\n\n[gh-issue-27]: https://github.com/JuliaFolds2/OhMyThreads.jl/issues/27\n[gh-issue-24]: https://github.com/JuliaFolds2/OhMyThreads.jl/issues/24\n[gh-issue-25]: https://github.com/JuliaFolds2/OhMyThreads.jl/issues/25\n\n[gh-pr-5]: https://github.com/JuliaFolds2/OhMyThreads.jl/pull/5\n[gh-pr-121]: https://github.com/JuliaFolds2/OhMyThreads.jl/pull/121\n[gh-pr-135]: https://github.com/JuliaFolds2/OhMyThreads.jl/pull/135\n[gh-pr-141]: https://github.com/JuliaFolds2/OhMyThreads.jl/pull/141\n[gh-pr-142]: https://github.com/JuliaFolds2/OhMyThreads.jl/pull/142\n[gh-pr-145]: https://github.com/JuliaFolds2/OhMyThreads.jl/pull/145\n[gh-pr-148]: https://github.com/JuliaFolds2/OhMyThreads.jl/pull/148\n[gh-pr-162]: https://github.com/JuliaFolds2/OhMyThreads.jl/pull/162\n"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2024 Mason Protter\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "Project.toml",
    "content": "name = \"OhMyThreads\"\nuuid = \"67456a42-1dca-4109-a031-0a68de7e3ad5\"\nauthors = [\"Carsten Bauer <mail@carstenbauer.eu>\", \"Mason Protter <mason.protter@icloud.com>\"]\nversion = \"0.8.5\"\n\n[deps]\nBangBang = \"198e06fe-97b7-11e9-32a5-e1d131e6ad66\"\nChunkSplitters = \"ae650224-84b6-46f8-82ea-d812ca08434e\"\nScopedValues = \"7e506255-f358-4e82-b7e4-beb19740aa63\"\nStableTasks = \"91464d47-22a1-43fe-8b7f-2d57ee82463f\"\nTaskLocalValues = \"ed4db957-447d-4319-bfb6-7fa9ae7ecf34\"\n\n[weakdeps]\nMarkdown = \"d6f4376e-aef5-505a-96c1-9c027394607a\"\n\n[extensions]\nMarkdownExt = \"Markdown\"\n\n[compat]\nAqua = \"0.8\"\nBangBang = \"0.3.40, 0.4\"\nChunkSplitters = \"3.1\"\nMarkdown = \"1\"\nScopedValues = \"1.3\"\nStableTasks = \"0.1.5\"\nTaskLocalValues = \"0.1\"\nTest = \"1\"\njulia = \"1.10\"\n\n[extras]\nAqua = \"4c88cf16-eb10-579e-8560-4a9242c79595\"\nTest = \"8dfed614-e22c-5e08-85e1-65c5234f0b40\"\n\n[targets]\ntest = [\"Test\", \"Aqua\"]\n"
  },
  {
    "path": "README.md",
    "content": "# OhMyThreads\n\n[docs-dev-img]: https://img.shields.io/badge/docs-dev-blue.svg\n[docs-dev-url]: https://JuliaFolds2.github.io/OhMyThreads.jl/dev\n\n[docs-stable-img]: https://img.shields.io/badge/docs-stable-blue.svg\n[docs-stable-url]: https://JuliaFolds2.github.io/OhMyThreads.jl/stable\n\n[ci-img]: https://github.com/JuliaFolds2/OhMyThreads.jl/actions/workflows/ci.yml/badge.svg\n[ci-url]: https://github.com/JuliaFolds2/OhMyThreads.jl/actions/workflows/ci.yml\n\n[cov-img]: https://codecov.io/gh/JuliaFolds2/OhMyThreads.jl/branch/master/graph/badge.svg\n[cov-url]: https://codecov.io/gh/JuliaFolds2/OhMyThreads.jl\n\n[lifecycle-img]: https://img.shields.io/badge/lifecycle-maturing-orange.svg\n\n[code-style-img]: https://img.shields.io/badge/code%20style-blue-4495d1.svg\n[code-style-url]: https://github.com/invenia/BlueStyle\n\n[aqua-img]: https://raw.githubusercontent.com/JuliaTesting/Aqua.jl/master/badge.svg\n[aqua-url]: https://github.com/JuliaTesting/Aqua.jl\n\n<!--\n![Lifecycle](https://img.shields.io/badge/lifecycle-maturing-blue.svg)\n![Lifecycle](https://img.shields.io/badge/lifecycle-stable-green.svg)\n![Lifecycle](https://img.shields.io/badge/lifecycle-retired-orange.svg)\n![Lifecycle](https://img.shields.io/badge/lifecycle-archived-red.svg)\n![Lifecycle](https://img.shields.io/badge/lifecycle-dormant-blue.svg)\n![Lifecycle](https://img.shields.io/badge/lifecycle-experimental-orange.svg)\n-->\n\n*Simple Multithreading in Julia*\n\n| **Documentation**                                                               | **Build Status**                                                                                |  **Quality**                                                                                
|\n|:-------------------------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------:|\n| [![][docs-stable-img]][docs-stable-url] [![][docs-dev-img]][docs-dev-url] | [![][ci-img]][ci-url] [![][cov-img]][cov-url] | ![][lifecycle-img] [![][aqua-img]][aqua-url] |\n\n[OhMyThreads.jl](https://github.com/JuliaFolds2/OhMyThreads.jl/) is meant to be a simple, unambitious package that provides user-friendly ways of doing [task-based](https://docs.julialang.org/en/v1/base/parallel/) multithreaded calculations in Julia. Most importantly, with a\nfocus on [data parallelism](https://en.wikipedia.org/wiki/Data_parallelism), it provides an [API of higher-order functions](https://juliafolds2.github.io/OhMyThreads.jl/stable/refs/api/#Functions) (e.g. `tmapreduce`) as well as a [macro API](https://juliafolds2.github.io/OhMyThreads.jl/stable/refs/api/#Macros) `@tasks for ... 
end` (conceptually similar to `@threads`).\n\n## Example\n\n```julia\nusing OhMyThreads: tmapreduce, @tasks\nusing BenchmarkTools: @btime\nusing Base.Threads: nthreads\n\n# Variant 1: function API\nfunction mc_parallel(N; ntasks=nthreads())\n    M = tmapreduce(+, 1:N; ntasks) do i\n        rand()^2 + rand()^2 < 1.0\n    end\n    pi = 4 * M / N\n    return pi\nend\n\n# Variant 2: macro API\nfunction mc_parallel_macro(N; ntasks=nthreads())\n    M = @tasks for i in 1:N\n        @set begin\n            reducer=+\n            ntasks=ntasks\n        end\n        rand()^2 + rand()^2 < 1.0\n    end\n    pi = 4 * M / N\n    return pi\nend\n\nN = 100_000_000\nmc_parallel(N) # gives, e.g., 3.14159924\n\n@btime mc_parallel($N; ntasks=1) # use a single task (and hence a single thread)\n@btime mc_parallel($N)           # using all threads\n@btime mc_parallel_macro($N)     # using all threads\n```\n\nWith 5 threads, timings might be something like this:\n\n```\n417.282 ms (14 allocations: 912 bytes)\n83.578 ms (38 allocations: 3.08 KiB)\n83.573 ms (38 allocations: 3.08 KiB)\n```\n\n(Check out the full [Parallel Monte Carlo](https://juliafolds2.github.io/OhMyThreads.jl/stable/literate/mc/mc/) example if you like.)\n\n## Documentation\n\nFor more information, please check out the [documentation](https://JuliaFolds2.github.io/OhMyThreads.jl/stable) of the latest release (or the [development version](https://JuliaFolds2.github.io/OhMyThreads.jl/dev) if you're curious).\n\n"
  },
  {
    "path": "docs/Project.toml",
    "content": "[deps]\nDocumenter = \"e30172f5-a6a5-5a46-863b-614d45cd2de4\"\nDocumenterInterLinks = \"d12716ef-a0f6-4df4-a9f1-a5a34e75c656\"\nDocumenterTools = \"35a29f4d-8980-5a13-9543-d66fff28ecb8\"\n\n[compat]\nDocumenter = \"1.3\"\nDocumenterInterLinks = \"1\"\nDocumenterTools = \"0.1\"\n"
  },
  {
    "path": "docs/build_docs.jl",
    "content": "cd(@__DIR__)\nprintln(\"--- :julia: Instantiating project\")\nusing Pkg\nPkg.activate(\"..\")\nPkg.instantiate()\nPkg.activate(\".\")\nPkg.instantiate()\npush!(LOAD_PATH, joinpath(@__DIR__, \"..\"))\ndeleteat!(LOAD_PATH, 2)\nprintln(\"+++ :julia: Building documentation\")\ninclude(\"make.jl\")\n"
  },
  {
    "path": "docs/make.jl",
    "content": "using Documenter\nusing DocumenterInterLinks\nusing OhMyThreads\n\nconst ci = get(ENV, \"CI\", \"\") == \"true\"\n\nlinks = InterLinks(\n    \"ChunkSplitters\" => (\n        \"https://juliafolds2.github.io/ChunkSplitters.jl/stable/\",\n        \"https://juliafolds2.github.io/ChunkSplitters.jl/stable/objects.inv\",\n        joinpath(@__DIR__, \"inventories\", \"ChunkSplitters.toml\")\n    ),\n);\n\n@info \"Generating Documenter.jl site\"\nmakedocs(;\n    sitename = \"OhMyThreads.jl\",\n    authors = \"Carsten Bauer, Mason Protter\",\n    modules = [OhMyThreads],\n    checkdocs = :exports,\n    doctest = false,\n    pages = [\n        \"OhMyThreads\" => \"index.md\",\n        \"Examples\" => [\n            \"Parallel Monte Carlo\" => \"literate/mc/mc.md\",\n            \"Julia Set\" => \"literate/juliaset/juliaset.md\",\n            \"Trapezoidal Integration\" => \"literate/integration/integration.md\"\n        ],\n        \"Translation Guide\" => \"translation.md\",\n        \"Boxed Variables\" => \"literate/boxing/boxing.md\",\n        \"Thread-Safe Storage\" => \"literate/tls/tls.md\",\n        \"False Sharing\" => \"literate/falsesharing/falsesharing.md\",\n        # \"Explanations\" => [\n        #     \"Task-Based Multithreading\" => \"explain/taskbasedmt.md\",\n        # ],\n        \"API\" => [\n            \"Public API\" => \"refs/api.md\",\n            \"Experimental\" => \"refs/experimental.md\",\n            \"Internal\" => \"refs/internal.md\"\n        ]\n    ],\n    repo = \"https://github.com/JuliaFolds2/OhMyThreads.jl/blob/{commit}{path}#{line}\",\n    format = Documenter.HTML(repolink = \"https://github.com/JuliaFolds2/OhMyThreads.jl\"; collapselevel = 1),\n    plugins = [links],)\n\nif ci\n    @info \"Deploying documentation to GitHub\"\n    deploydocs(;\n        repo = \"github.com/JuliaFolds2/OhMyThreads.jl.git\",\n        devbranch = \"master\",\n        push_preview = true)\nend\n"
  },
  {
    "path": "docs/src/basics.md",
    "content": "# Basics\n\nThis section is still in preparation. For now, you might want to take a look at the [translation guide](@ref TG) and the examples."
  },
  {
    "path": "docs/src/index.md",
    "content": "# OhMyThreads.jl\n\n[OhMyThreads.jl](https://github.com/JuliaFolds2/OhMyThreads.jl/) is meant to be a simple, unambitious package that provides user-friendly ways of doing [task-based](https://docs.julialang.org/en/v1/base/parallel/) multithreaded calculations in Julia. Most importantly, with a\nfocus on [data parallelism](https://en.wikipedia.org/wiki/Data_parallelism), it provides an [API of higher-order functions](https://juliafolds2.github.io/OhMyThreads.jl/stable/refs/api/#Functions) (e.g. `tmapreduce`) as well as a [macro API](https://juliafolds2.github.io/OhMyThreads.jl/stable/refs/api/#Macros) `@tasks for ... end` (conceptually similar to `@threads`).\n\n## Quick Start\n\nThe package is registered. Hence, you can simply use\n```\n] add OhMyThreads\n```\nto add the package to your Julia environment.\n\n### Basic example\n\n```julia\nusing OhMyThreads: tmapreduce, @tasks\nusing BenchmarkTools: @btime\nusing Base.Threads: nthreads\n\n# Variant 1: function API\nfunction mc_parallel(N; ntasks=nthreads())\n    M = tmapreduce(+, 1:N; ntasks) do i\n        rand()^2 + rand()^2 < 1.0\n    end\n    pi = 4 * M / N\n    return pi\nend\n\n# Variant 2: macro API\nfunction mc_parallel_macro(N; ntasks=nthreads())\n    M = @tasks for i in 1:N\n        @set begin\n            reducer=+\n            ntasks=ntasks\n        end\n        rand()^2 + rand()^2 < 1.0\n    end\n    pi = 4 * M / N\n    return pi\nend\n\nN = 100_000_000\nmc_parallel(N) # gives, e.g., 3.14159924\n\n@btime mc_parallel($N; ntasks=1) # use a single task (and hence a single thread)\n@btime mc_parallel($N)           # using all threads\n@btime mc_parallel_macro($N)     # using all threads\n```\n\nWith 5 threads, timings might be something like this:\n\n```\n417.282 ms (14 allocations: 912 bytes)\n83.578 ms (38 allocations: 3.08 KiB)\n83.573 ms (38 allocations: 3.08 KiB)\n```\n\n(Check out the full [Parallel Monte Carlo](@ref) example if you like.)\n\n## No Transducers\n\nUnlike most 
[JuliaFolds2](https://github.com/JuliaFolds2) packages, OhMyThreads.jl is not built off of [Transducers.jl](https://github.com/JuliaFolds2/Transducers.jl), nor is it a building block for Transducers.jl. Rather, it is meant to be a simpler, more maintainable, and more accessible alternative to high-level packages like, e.g., [ThreadsX.jl](https://github.com/tkf/ThreadsX.jl) or [Folds.jl](https://github.com/JuliaFolds2/Folds.jl).\n\n## Acknowledgements\n\nThe idea for this package came from [Carsten Bauer](https://github.com/carstenbauer) and [Mason Protter](https://github.com/MasonProtter). Check out the [list of contributors](https://github.com/JuliaFolds2/OhMyThreads.jl/graphs/contributors) for more information.\n"
  },
  {
    "path": "docs/src/literate/Project.toml",
    "content": "[deps]\nLiterate = \"98b081ad-f1c9-55d3-8b20-4c87d4299306\"\n\n[compat]\nLiterate = \"2.16\"\n"
  },
  {
    "path": "docs/src/literate/boxing/Project.toml",
    "content": "[deps]\nOhMyThreads = \"67456a42-1dca-4109-a031-0a68de7e3ad5\"\n"
  },
  {
    "path": "docs/src/literate/boxing/boxing.jl",
    "content": "#====================================\n# Boxed Variables\n\nAll multithreading in julia is built around the idea of passing around\nand executing functions, but often these functions \"enclose\" data from\nan outer local scope, making them what's called a \"closure\".\n\n## Boxed variables causing race conditions\n\nJulia allows functions which capture variables to re-bind those variables\nto different values, but doing so can cause subtle race conditions in\nmultithreaded code.\n\nConsider the following example:\n====================================#\n\nlet out = zeros(Int, 10)\n    Threads.@threads for i in 1:10\n        A = i\n        sleep(1/100)\n        out[i] = A\n    end\n    A = 1\n    out\nend\n\n#====================================\nYou may have expected that to return `[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]`,\nbut the nonsense result is caused by `A` actually being a shared mutable\ncontainer here which all the parallel tasks are accessing and mutating\nin parallel, giving unpredictable results.\n\nOhMyThreads.jl tries to protect users from this surprising behaviour:\n====================================#\nusing OhMyThreads\n\ntry\n    let\n        ## this throws an error!\n        out = tmap(1:10) do i\n            A = i\n            sleep(1/100)\n            A\n        end\n        A = 1\n        out\n    end\ncatch e;\n    ## Show the error\n    Base.showerror(stdout, e)\nend\n\n#====================================\nIn this case, we could fix the race conditon by marking `A` as local:\n====================================#\n\nlet\n    out = tmap(1:10) do i\n        local A = i # Note the use of `local`\n        sleep(1/100)\n        A\n    end\n    A = 1\n    out\nend\n\n#====================================\nIf you really desire to bypass this error, you can use the\n`@allow_boxed_captures` macro\n====================================#\n\n@allow_boxed_captures let\n    out = tmap(1:10) do i\n        A = i\n        sleep(1/100)\n        A\n 
   end\n    A = 1\n    out\nend\n\n#====================================\n## Non-race conditon boxed variables\n\nAny re-binding of captured variables can cause boxing, even when that boxing isn't strictly necessary, like the following example where we do not rebind `A` in the loop:\n====================================#\ntry\n    let A = 1\n        if rand(Bool)\n            ## Rebind A, it's now boxed!\n            A = 2\n        end\n        @tasks for i in 1:2\n            @show A\n        end\n    end\ncatch e;\n    println(\"Yup, that errored!\")\nend\n#====================================\nThis comes down to how julia parses and lowers code. To avoid this, you can use an inner `let` block to localize `A` to the loop:\n====================================#\n\nlet A = 1\n    if rand(Bool)\n        A = 2\n    end\n    let A = A # This stops A from being boxed!\n        @tasks for i in 1:2\n            @show A\n        end\n    end\nend\n\n#====================================\nOhMyThreads provides a macro `@localize` to automate this process:\n====================================#\n\nlet A = 1\n    if rand(Bool)\n        A = 2\n    end\n    ## This stops A from being boxed!\n    @localize A @tasks for i in 1:2\n        @show A\n    end\nend\n\n"
  },
  {
    "path": "docs/src/literate/boxing/boxing.md",
    "content": "```@meta\nEditURL = \"boxing.jl\"\n```\n\n# Boxed Variables\n\nAll multithreading in julia is built around the idea of passing around\nand executing functions, but often these functions \"enclose\" data from\nan outer local scope, making them what's called a \"closure\".\n\n## Boxed variables causing race conditions\n\nJulia allows functions which capture variables to re-bind those variables\nto different values, but doing so can cause subtle race conditions in\nmultithreaded code.\n\nConsider the following example:\n\n````julia\nlet out = zeros(Int, 10)\n    Threads.@threads for i in 1:10\n        A = i\n        sleep(1/100)\n        out[i] = A\n    end\n    A = 1\n    out\nend\n````\n\n````\n10-element Vector{Int64}:\n 5\n 4\n 6\n 4\n 5\n 4\n 5\n 4\n 5\n 4\n````\n\nYou may have expected that to return `[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]`,\nbut the nonsense result is caused by `A` actually being a shared mutable\ncontainer here which all the parallel tasks are accessing and mutating\nin parallel, giving unpredictable results.\n\nOhMyThreads.jl tries to protect users from this surprising behaviour:\n\n````julia\nusing OhMyThreads\n\ntry\n    let\n        # this throws an error!\n        out = tmap(1:10) do i\n            A = i\n            sleep(1/100)\n            A\n        end\n        A = 1\n        out\n    end\ncatch e;\n    # Show the error\n    Base.showerror(stdout, e)\nend\n````\n\n````\nAttempted to capture and modify outer local variable: A\n\nSee https://juliafolds2.github.io/OhMyThreads.jl/stable/literate/boxing/boxing/ for a fuller explanation.\n\n  Hint\n  ----\n\n  Capturing boxed variables can be not only slow, but also cause surprising\n  and incorrect results.\n\n    •  If you meant for these variables to be local to each loop\n       iteration and not depend on a variable from an outer scope, you\n       should mark them as local inside the closure.\n\n    •  If you meant to reference a variable from the outer scope, but do\n       
not want access to it to be boxed, you can wrap uses of it in a\n       let block, like e.g.\n\n  function foo(x, N)\n      rand(Bool) && x = 1 # This rebinding of x causes it to be boxed ...\n      let x = x # ... Unless we localize it here with the let block \n          @tasks for i in 1:N\n              f(x)    \n          end\n      end\n  end\n\n    •  OhMyThreads.jl provides a @localize macro that automates the above\n       let block, i.e. @localize x f(x) is the same as let x=x; f(x) end\n\n    •  If these variables are being re-bound inside a @one_by_one or\n       @only_one block, consider using a mutable Ref instead of\n       re-binding the variable.\n\n  This error can be bypassed with the @allow_boxed_captures macro.\n````\n\nIn this case, we could fix the race conditon by marking `A` as local:\n\n````julia\nlet\n    out = tmap(1:10) do i\n        local A = i # Note the use of `local`\n        sleep(1/100)\n        A\n    end\n    A = 1\n    out\nend\n````\n\n````\n10-element Vector{Int64}:\n  1\n  2\n  3\n  4\n  5\n  6\n  7\n  8\n  9\n 10\n````\n\nIf you really desire to bypass this error, you can use the\n`@allow_boxed_captures` macro\n\n````julia\n@allow_boxed_captures let\n    out = tmap(1:10) do i\n        A = i\n        sleep(1/100)\n        A\n    end\n    A = 1\n    out\nend\n````\n\n````\n10-element Vector{Int64}:\n 3\n 2\n 3\n 2\n 3\n 2\n 3\n 2\n 3\n 3\n````\n\n## Non-race conditon boxed variables\n\nAny re-binding of captured variables can cause boxing, even when that boxing isn't strictly necessary, like the following example where we do not rebind `A` in the loop:\n\n````julia\ntry\n    let A = 1\n        if rand(Bool)\n            # Rebind A, it's now boxed!\n            A = 2\n        end\n        @tasks for i in 1:2\n            @show A\n        end\n    end\ncatch e;\n    println(\"Yup, that errored!\")\nend\n````\n\n````\nYup, that errored!\n\n````\n\nThis comes down to how julia parses and lowers code. 
To avoid this, you can use an inner `let` block to localize `A` to the loop:\n\n````julia\nlet A = 1\n    if rand(Bool)\n        A = 2\n    end\n    let A = A # This stops A from being boxed!\n        @tasks for i in 1:2\n            @show A\n        end\n    end\nend\n````\n\n````\nA = 1\nA = 1\n\n````\n\nOhMyThreads provides a macro `@localize` to automate this process:\n\n````julia\nlet A = 1\n    if rand(Bool)\n        A = 2\n    end\n    # This stops A from being boxed!\n    @localize A @tasks for i in 1:2\n        @show A\n    end\nend\n````\n\n````\nA = 2\nA = 2\n\n````\n\n---\n\n*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*\n\n"
  },
  {
    "path": "docs/src/literate/falsesharing/Project.toml",
    "content": "[deps]\nBenchmarkTools = \"6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf\"\nOhMyThreads = \"67456a42-1dca-4109-a031-0a68de7e3ad5\"\nThreadPinning = \"811555cd-349b-4f26-b7bc-1f208b848042\"\n"
  },
  {
    "path": "docs/src/literate/falsesharing/falsesharing.jl",
    "content": "# # [False Sharing](@id FalseSharing)\n#\n# *False Sharing* is a very common but subtle performance issue that comes up again and\n# again when writing parallel code manually. For this reason, we shall discuss what it is\n# about and how to avoid it.\n#\n# For simplicity, let's focus on a specific example: parallel summation.\n#\n# ## Baseline: sequential summation\n#\n# To establish a baseline, that we can later compare against, we define some fake data,\n# which we'll sum up, and benchmark Julia's built-in, non-parallel `sum` function.\n\nusing Base.Threads: nthreads\nusing BenchmarkTools\nusing ThreadPinning #hide\npinthreads(:cores) #hide\n\ndata = rand(1_000_000 * nthreads());\n@btime sum($data);\n\n#\n# ## The problematic parallel implementation\n#\n# A conceptually simple (and valid) approach to parallelizing the summation is to divide\n# the full computation into parts. Specifically, the idea is to divide the data into chunks,\n# compute the partial sums of these chunks in parallel, and finally sum up the partial\n# results. (Note that we will not concern ourselves with potential minor or\n# catastrophic numerical errors due to potential rearrangements of terms in the summation here.)\n#\n# A common, manual implementation of this idea might look like this:\n\nusing OhMyThreads: @spawn, index_chunks\n\nfunction parallel_sum_falsesharing(data; nchunks = nthreads())\n    psums = zeros(eltype(data), nchunks)\n    @sync for (c, idcs) in enumerate(index_chunks(data; n = nchunks))\n        @spawn begin\n            for i in idcs\n                psums[c] += data[i]\n            end\n        end\n    end\n    return sum(psums)\nend\n\n# The code is pretty straightforward: We allocate space for the results of the partial sums\n# (`psums`) and, on `nchunks` many tasks, add up the data elements of each partial sum in\n# parallel. 
More importantly, and in this context perhaps surprisingly, the code is also\n# **correct** in the sense that it produces the desired result.\n\nusing Test\n@test sum(data) ≈ parallel_sum_falsesharing(data)\n\n# This is just a reflection of the fact that there is no logical sharing of data - because\n# each parallel tasks modifies a different element of `psums` - implying the absence of\n# race conditions.\n#\n# What's the issue then?! Well, the sole purpose of parallelization is to reduce runtime.\n# So let's see how well we're doing in this respect.\n\nnthreads()\n\n#\n\n@btime parallel_sum_falsesharing($data);\n\n# A **slowdown**?! Clearly, that's the opposite of what we tried to achieve!\n\n#\n# ## The issue: False sharing\n#\n# Although our parallel summation above is semantically correct, it has a\n# big **performance issue**: *False sharing*. To understand false sharing, we have to think\n# a little bit about how computers work. Specifically, we need to realize that processors\n# cache memory in lines (rather than individual elements) and that caches of different processors\n# are kept coherent.\n# When two (or more) different CPU cores operate on independent data elements that **fall\n# into the same cache line** (i.e. they are part of the same memory address region)\n# the **cache coherency mechanism leads to costly synchronization** between cores.\n\n# In our case, this happens despite the fact that different parallel tasks\n# (on different CPU cores) *logically* don't care about the rest of the data in the cache line\n# at all.\n\n# ![](false_sharing.svg)\n\n# Given these insights, we can come up with a few workarounds that mitigate the issue.\n# The most prominent is probably padding, where one simply adds sufficiently many unused\n# zeros to `psums` such that different partial sum counters don't fall into the same cache\n# line. 
However, let's discuss a more fundamental, more efficient, and more elegant solution.\n\n#\n# ## Task-local parallel summation\n#\n# The key mistake in `parallel_sum_falsesharing` above is the non-local modification of\n# (implicitly) shared state (cache lines of `psums`) very frequently (in the innermost loop).\n# We can simply avoid this by making the code more task-local. To this end, we introduce a\n# **task-local accumulator variable**, which we use to perform the task-local partial sums.\n# Only at the very end do we communicate the result to the main thread, e.g. by writing it\n# into `psums` (once!).\n\nfunction parallel_sum_tasklocal(data; nchunks = nthreads())\n    psums = zeros(eltype(data), nchunks)\n    @sync for (c, idcs) in enumerate(index_chunks(data; n = nchunks))\n        @spawn begin\n            local s = zero(eltype(data))\n            for i in idcs\n                s += data[i]\n            end\n            psums[c] = s\n        end\n    end\n    return sum(psums)\nend\n\n@test sum(data) ≈ parallel_sum_tasklocal(data)\n@btime parallel_sum_tasklocal($data);\n\n# Finally, there is a speed up! 🎉\n#\n# Two comments are in order.\n#\n# First, we note that the only role that `psums` plays is\n# as a temporary storage for the results from the parallel tasks to be able to sum them\n# up eventually. We could get rid of it entirely by using a `Threads.Atomic` instead which\n# would get updated via `Threads.atomic_add!` from each task directly. 
However,\n# for our discussion, this is a detail and we won't discuss it further.\n#\n# Secondly, while keeping the general idea, we can drastically simplify the above code by\n# using `map` and reusing the built-in (sequential) `sum` function on each parallel task:\n\nfunction parallel_sum_map(data; nchunks = nthreads())\n    ts = map(index_chunks(data, n = nchunks)) do idcs\n        @spawn @views sum(data[idcs])\n    end\n    return sum(fetch.(ts))\nend\n\n@test sum(data) ≈ parallel_sum_map(data)\n@btime parallel_sum_map($data);\n\n# This implementation is conceptually\n# clearer in that there is no explicit modification of shared state, i.e. no `psums[c] = s`,\n# anywhere at all. We can't run into false sharing if we don't modify shared state 😉.\n#\n# Note that since we use the built-in `sum` function, which is highly optimized, we might see\n# better runtimes due to other effects - like SIMD and the absence of bounds checks - compared\n# to the simple for-loop accumulation in `parallel_sum_tasklocal` above.\n\n#\n# ## Parallel summation with OhMyThreads\n#\n# Finally, all of the above is abstracted away for you if you simply use [`treduce`](@ref)\n# to implement the parallel summation. It also only takes a single line and function call.\n\nusing OhMyThreads: treduce\n\n@test sum(data) ≈ treduce(+, data; ntasks = nthreads())\n@btime treduce($+, $data; ntasks = $nthreads());\n"
  },
  {
    "path": "docs/src/literate/falsesharing/falsesharing.md",
    "content": "```@meta\nEditURL = \"falsesharing.jl\"\n```\n\n# [False Sharing](@id FalseSharing)\n\n*False Sharing* is a very common but subtle performance issue that comes up again and\nagain when writing parallel code manually. For this reason, we shall discuss what it is\nabout and how to avoid it.\n\nFor simplicity, let's focus on a specific example: parallel summation.\n\n## Baseline: sequential summation\n\nTo establish a baseline, that we can later compare against, we define some fake data,\nwhich we'll sum up, and benchmark Julia's built-in, non-parallel `sum` function.\n\n````julia\nusing Base.Threads: nthreads\nusing BenchmarkTools\n\ndata = rand(1_000_000 * nthreads());\n@btime sum($data);\n````\n\n````\n  2.327 ms (0 allocations: 0 bytes)\n\n````\n\n## The problematic parallel implementation\n\nA conceptually simple (and valid) approach to parallelizing the summation is to divide\nthe full computation into parts. Specifically, the idea is to divide the data into chunks,\ncompute the partial sums of these chunks in parallel, and finally sum up the partial\nresults. (Note that we will not concern ourselves with potential minor or\ncatastrophic numerical errors due to potential rearrangements of terms in the summation here.)\n\nA common, manual implementation of this idea might look like this:\n\n````julia\nusing OhMyThreads: @spawn, index_chunks\n\nfunction parallel_sum_falsesharing(data; nchunks = nthreads())\n    psums = zeros(eltype(data), nchunks)\n    @sync for (c, idcs) in enumerate(index_chunks(data; n = nchunks))\n        @spawn begin\n            for i in idcs\n                psums[c] += data[i]\n            end\n        end\n    end\n    return sum(psums)\nend\n````\n\n````\nparallel_sum_falsesharing (generic function with 1 method)\n````\n\nThe code is pretty straightforward: We allocate space for the results of the partial sums\n(`psums`) and, on `nchunks` many tasks, add up the data elements of each partial sum in\nparallel. 
More importantly, and in this context perhaps surprisingly, the code is also\n**correct** in the sense that it produces the desired result.\n\n````julia\nusing Test\n@test sum(data) ≈ parallel_sum_falsesharing(data)\n````\n\n````\nTest Passed\n````\n\nThis is just a reflection of the fact that there is no logical sharing of data - because\neach parallel tasks modifies a different element of `psums` - implying the absence of\nrace conditions.\n\nWhat's the issue then?! Well, the sole purpose of parallelization is to reduce runtime.\nSo let's see how well we're doing in this respect.\n\n````julia\nnthreads()\n````\n\n````\n10\n````\n\n````julia\n@btime parallel_sum_falsesharing($data);\n````\n\n````\n  52.919 ms (221 allocations: 18.47 KiB)\n\n````\n\nA (huge) **slowdown**?! Clearly, that's the opposite of what we tried to achieve!\n\n## The issue: False sharing\n\nAlthough our parallel summation above is semantically correct, it has a\nbig **performance issue**: *False sharing*. To understand false sharing, we have to think\na little bit about how computers work. Specifically, we need to realize that processors\ncache memory in lines (rather than individual elements) and that caches of different processors\nare kept coherent.\nWhen two (or more) different CPU cores operate on independent data elements that **fall\ninto the same cache line** (i.e. they are part of the same memory address region)\nthe **cache coherency mechanism leads to costly synchronization** between cores.\n\nIn our case, this happens despite the fact that different parallel tasks\n(on different CPU cores) *logically* don't care about the rest of the data in the cache line\nat all.\n\n![](false_sharing.svg)\n\nGiven these insights, we can come up with a few workarounds that mitigate the issue.\nThe most prominent is probably padding, where one simply adds sufficiently many unused\nzeros to `psums` such that different partial sum counters don't fall into the same cache\nline. 
However, let's discuss a more fundamental, more efficient, and more elegant solution.\n\n## Task-local parallel summation\n\nThe key mistake in `parallel_sum_falsesharing` above is the non-local modification of\n(implicitly) shared state (cache lines of `psums`) very frequently (in the innermost loop).\nWe can simply avoid this by making the code more task-local. To this end, we introduce a\n**task-local accumulator variable**, which we use to perform the task-local partial sums.\nOnly at the very end do we communicate the result to the main thread, e.g. by writing it\ninto `psums` (once!).\n\n````julia\nfunction parallel_sum_tasklocal(data; nchunks = nthreads())\n    psums = zeros(eltype(data), nchunks)\n    @sync for (c, idcs) in enumerate(index_chunks(data; n = nchunks))\n        @spawn begin\n            local s = zero(eltype(data))\n            for i in idcs\n                s += data[i]\n            end\n            psums[c] = s\n        end\n    end\n    return sum(psums)\nend\n\n@test sum(data) ≈ parallel_sum_tasklocal(data)\n@btime parallel_sum_tasklocal($data);\n````\n\n````\n  1.120 ms (221 allocations: 18.55 KiB)\n\n````\n\nFinally, there is a speed up! 🎉\n\nTwo comments are in order.\n\nFirst, we note that the only role that `psums` plays is\nas a temporary storage for the results from the parallel tasks to be able to sum them\nup eventually. We could get rid of it entirely by using a `Threads.Atomic` instead which\nwould get updated via `Threads.atomic_add!` from each task directly. 
However,\nfor our discussion, this is a detail and we won't discuss it further.\n\nSecondly, while keeping the general idea, we can drastically simplify the above code by\nusing `map` and reusing the built-in (sequential) `sum` function on each parallel task:\n\n````julia\nfunction parallel_sum_map(data; nchunks = nthreads())\n    ts = map(index_chunks(data, n = nchunks)) do idcs\n        @spawn @views sum(data[idcs])\n    end\n    return sum(fetch.(ts))\nend\n\n@test sum(data) ≈ parallel_sum_map(data)\n@btime parallel_sum_map($data);\n````\n\n````\n  893.396 μs (64 allocations: 5.72 KiB)\n\n````\n\nThis implementation is conceptually\nclearer in that there is no explicit modification of shared state, i.e. no `psums[c] = s`,\nanywhere at all. We can't run into false sharing if we don't modify shared state 😉.\n\nNote that since we use the built-in `sum` function, which is highly optimized, we might see\nbetter runtimes due to other effects - like SIMD and the absence of bounds checks - compared\nto the simple for-loop accumulation in `parallel_sum_tasklocal` above.\n\n## Parallel summation with OhMyThreads\n\nFinally, all of the above is abstracted away for you if you simply use [`treduce`](@ref)\nto implement the parallel summation. It also only takes a single line and function call.\n\n````julia\nusing OhMyThreads: treduce\n\n@test sum(data) ≈ treduce(+, data; ntasks = nthreads())\n@btime treduce($+, $data; ntasks = $nthreads());\n````\n\n````\n  899.097 μs (68 allocations: 5.92 KiB)\n\n````\n\n---\n\n*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*\n\n"
  },
  {
    "path": "docs/src/literate/integration/Project.toml",
    "content": "[deps]\nBenchmarkTools = \"6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf\"\nOhMyThreads = \"67456a42-1dca-4109-a031-0a68de7e3ad5\"\n"
  },
  {
    "path": "docs/src/literate/integration/integration.jl",
    "content": "# # Trapezoidal Integration\n#\n# In this example, we want to parallelize the computation of a simple numerical integral\n# via the trapezoidal rule. The latter is given by\n#\n# $\\int_{a}^{b}f(x)\\,dx \\approx h \\sum_{i=1}^{N}\\frac{f(x_{i-1})+f(x_{i})}{2}.$\n#\n# The function to be integrated is the following.\n\nf(x) = 4 * √(1 - x^2)\n\n# The analytic result of the definite integral (from 0 to 1) is known to be $\\pi$.\n#\n# ## Sequential\n#\n# Naturally, we implement the trapezoidal rule as a straightforward, sequential `for` loop.\n\nfunction trapezoidal(a, b, n; h = (b - a) / n)\n    y = (f(a) + f(b)) / 2.0\n    for i in 1:(n - 1)\n        x = a + i * h\n        y = y + f(x)\n    end\n    return y * h\nend\n\n# Let's compute the integral of `f` above and see if we get the expected result.\n# For simplicity, we choose `N`, the number of panels used to discretize the integration\n# interval, as a multiple of the number of available Julia threads.\n\nusing Base.Threads: nthreads\n\nN = nthreads() * 1_000_000\n\n# Calling `trapezoidal` we do indeed find the (approximate) value of $\\pi$.\n\ntrapezoidal(0, 1, N) ≈ π\n\n# ## Parallel\n#\n# Our strategy is the following: Divide the integration interval among the available\n# Julia threads. On each thread, use the sequential trapezoidal rule to compute the partial\n# integral.\n# It is straightforward to implement this strategy with `tmapreduce`. 
The `map` part\n# is, essentially, the application of `trapezoidal` and the reduction operator is chosen to\n# be `+` to sum up the local integrals.\n\nusing OhMyThreads\n\nfunction trapezoidal_parallel(a, b, N)\n    n = N ÷ nthreads()\n    h = (b - a) / N\n    return tmapreduce(+, 1:nthreads()) do i\n        local α = a + (i - 1) * n * h\n        local β = α + n * h\n        trapezoidal(α, β, n; h)\n    end\nend\n\n## or equivalently\n##\n## function trapezoidal_parallel(a, b, N)\n##     n = N ÷ nthreads()\n##     h = (b - a) / N\n##     @tasks for i in 1:nthreads()\n##         @set reducer=+\n##         local α = a + (i - 1) * n * h\n##         local β = α + n * h\n##         trapezoidal(α, β, n; h)\n##     end\n## end\n\n# First, we check the correctness of our parallel implementation.\ntrapezoidal_parallel(0, 1, N) ≈ π\n\n# Then, we benchmark and compare the performance of the sequential and parallel versions.\n\nusing BenchmarkTools\n@btime trapezoidal(0, 1, $N);\n@btime trapezoidal_parallel(0, 1, $N);\n\n# Because the problem is trivially parallel - all threads do the same thing and don't need\n# to communicate - we expect an ideal speedup of (close to) the number of available threads.\n\nnthreads()\n"
  },
  {
    "path": "docs/src/literate/integration/integration.md",
    "content": "```@meta\nEditURL = \"integration.jl\"\n```\n\n# Trapezoidal Integration\n\nIn this example, we want to parallelize the computation of a simple numerical integral\nvia the trapezoidal rule. The latter is given by\n\n$\\int_{a}^{b}f(x)\\,dx \\approx h \\sum_{i=1}^{N}\\frac{f(x_{i-1})+f(x_{i})}{2}.$\n\nThe function to be integrated is the following.\n\n````julia\nf(x) = 4 * √(1 - x^2)\n````\n\n````\nf (generic function with 1 method)\n````\n\nThe analytic result of the definite integral (from 0 to 1) is known to be $\\pi$.\n\n## Sequential\n\nNaturally, we implement the trapezoidal rule as a straightforward, sequential `for` loop.\n\n````julia\nfunction trapezoidal(a, b, n; h = (b - a) / n)\n    y = (f(a) + f(b)) / 2.0\n    for i in 1:(n - 1)\n        x = a + i * h\n        y = y + f(x)\n    end\n    return y * h\nend\n````\n\n````\ntrapezoidal (generic function with 1 method)\n````\n\nLet's compute the integral of `f` above and see if we get the expected result.\nFor simplicity, we choose `N`, the number of panels used to discretize the integration\ninterval, as a multiple of the number of available Julia threads.\n\n````julia\nusing Base.Threads: nthreads\n\nN = nthreads() * 1_000_000\n````\n\n````\n10000000\n````\n\nCalling `trapezoidal` we do indeed find the (approximate) value of $\\pi$.\n\n````julia\ntrapezoidal(0, 1, N) ≈ π\n````\n\n````\ntrue\n````\n\n## Parallel\n\nOur strategy is the following: Divide the integration interval among the available\nJulia threads. On each thread, use the sequential trapezoidal rule to compute the partial\nintegral.\nIt is straightforward to implement this strategy with `tmapreduce`. 
The `map` part\nis, essentially, the application of `trapezoidal` and the reduction operator is chosen to\nbe `+` to sum up the local integrals.\n\n````julia\nusing OhMyThreads\n\nfunction trapezoidal_parallel(a, b, N)\n    n = N ÷ nthreads()\n    h = (b - a) / N\n    return tmapreduce(+, 1:nthreads()) do i\n        local α = a + (i - 1) * n * h # the local keywords aren't necessary but good practice\n        local β = α + n * h\n        trapezoidal(α, β, n; h)\n    end\nend\n\n# or equivalently\n#\n# function trapezoidal_parallel(a, b, N)\n#     n = N ÷ nthreads()\n#     h = (b - a) / N\n#     @tasks for i in 1:nthreads()\n#         @set reducer=+\n#         local α = a + (i - 1) * n * h\n#         local β = α + n * h\n#         trapezoidal(α, β, n; h)\n#     end\n# end\n````\n\n````\ntrapezoidal_parallel (generic function with 1 method)\n````\n\nFirst, we check the correctness of our parallel implementation.\n\n````julia\ntrapezoidal_parallel(0, 1, N) ≈ π\n````\n\n````\ntrue\n````\n\nThen, we benchmark and compare the performance of the sequential and parallel versions.\n\n````julia\nusing BenchmarkTools\n@btime trapezoidal(0, 1, $N);\n@btime trapezoidal_parallel(0, 1, $N);\n````\n\n````\n  24.348 ms (0 allocations: 0 bytes)\n  2.457 ms (69 allocations: 6.05 KiB)\n\n````\n\nBecause the problem is trivially parallel - all threads do the same thing and don't need\nto communicate - we expect an ideal speedup of (close to) the number of available threads.\n\n````julia\nnthreads()\n````\n\n````\n10\n````\n\n---\n\n*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*\n\n"
  },
  {
    "path": "docs/src/literate/juliaset/Project.toml",
    "content": "[deps]\nBenchmarkTools = \"6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf\"\nDisplayAs = \"0b91fe84-8a4c-11e9-3e1d-67c38462b6d6\"\nOhMyThreads = \"67456a42-1dca-4109-a031-0a68de7e3ad5\"\nPlots = \"91a5bcdd-55d7-5caf-9e0b-520d859cae80\"\n"
  },
  {
    "path": "docs/src/literate/juliaset/juliaset.jl",
    "content": "# # Julia Set\n#\n# In this example, we will compute an image of the\n# [Julia set](https://en.wikipedia.org/wiki/Julia_set) in parallel. We will explore\n# the `schedule` and `nchunks` options that can be used to get load balancing.\n#\n# The value of a single pixel of the Julia set, which corresponds to a point in the\n# complex number plane, can be computed by the following iteration procedure.\n\nfunction _compute_pixel(i, j, n; max_iter = 255, c = -0.79 + 0.15 * im)\n    x = -2.0 + (j - 1) * 4.0 / (n - 1)\n    y = -2.0 + (i - 1) * 4.0 / (n - 1)\n\n    z = x + y * im\n    iter = max_iter\n    for k in 1:max_iter\n        if abs2(z) > 4.0\n            iter = k - 1\n            break\n        end\n        z = z^2 + c\n    end\n    return iter\nend\n\n# Note that the value of the pixel is the number of performed iterations for the\n# corresponding complex input number. Hence, the computational **workload is non-uniform**.\n\n# ## Sequential computation\n#\n# In our naive implementation, we just loop over the dimensions of the image matrix and call\n# the pixel kernel above.\n\nfunction compute_juliaset_sequential!(img)\n    N = size(img, 1)\n    for j in 1:N\n        for i in 1:N\n            img[i, j] = _compute_pixel(i, j, N)\n        end\n    end\n    return img\nend\n\nN = 2000\nimg = zeros(Int, N, N)\ncompute_juliaset_sequential!(img);\n\n# Let's look at the result\n\nusing Plots\nusing DisplayAs #hide\np = heatmap(img)\nDisplayAs.PNG(p) #hide\n\n# ## Parallelization\n#\n# The Julia set computation above is a `map!` operation: We apply some function to each\n# element of the array. Hence, we can use `tmap!` for parallelization. We use\n# `CartesianIndices` to map between linear and two-dimensional cartesian indices.\n\nusing OhMyThreads: tmap!\n\nfunction compute_juliaset_parallel!(img; kwargs...)\n    N = size(img, 1)\n    cart = CartesianIndices(img)\n    tmap!(img, eachindex(img); kwargs...) 
do idx\n        c = cart[idx]\n        _compute_pixel(c[1], c[2], N)\n    end\n    return img\nend\n\n## or alternatively\n##\n## function compute_juliaset_parallel!(img; kwargs...)\n##     N = size(img, 1)\n##     cart = CartesianIndices(img)\n##     @tasks for idx in eachindex(img)\n##         c = cart[idx]\n##         img[idx] = _compute_pixel(c[1], c[2], N)\n##     end\n##     return img\n## end\n\nN = 2000\nimg = zeros(Int, N, N)\ncompute_juliaset_parallel!(img);\np = heatmap(img)\nDisplayAs.PNG(p) #hide\n\n# ## Benchmark\n#\n# Let's benchmark the variants above.\n\nusing BenchmarkTools\nusing Base.Threads: nthreads\n\nN = 2000\nimg = zeros(Int, N, N)\n\n@show nthreads()\n\n@btime compute_juliaset_sequential!($img) samples=10 evals=3;\n@btime compute_juliaset_parallel!($img) samples=10 evals=3;\n\n# As hoped, the parallel implementation is much faster!\n\n# ### Dynamic vs static scheduling\n#\n# As stated above, the per-pixel computation is non-uniform. Hence, we do benefit from\n# the load balancing of the default dynamic scheduler. The latter divides the overall\n# workload into tasks that can then be dynamically distributed among threads to adjust the\n# per-thread load. We can try to fine tune and improve the load balancing further by\n# increasing the `ntasks` parameter of the scheduler, that is, creating more tasks with\n# smaller per-task workload.\n\nusing OhMyThreads: DynamicScheduler\n\n@btime compute_juliaset_parallel!($img; ntasks=N, scheduler=:dynamic) samples=10 evals=3;\n\n# Note that while this turns out to be a bit faster, it comes at the expense of many more\n# allocations.\n#\n# To quantify the impact of load balancing we can opt out of dynamic scheduling and use the\n# `StaticScheduler` instead. The latter doesn't provide any form of load balancing.\n\nusing OhMyThreads: StaticScheduler\n\n@btime compute_juliaset_parallel!($img; scheduler=:static) samples=10 evals=3;\n"
  },
  {
    "path": "docs/src/literate/juliaset/juliaset.md",
    "content": "```@meta\nEditURL = \"juliaset.jl\"\n```\n\n# Julia Set\n\nIn this example, we will compute an image of the\n[Julia set](https://en.wikipedia.org/wiki/Julia_set) in parallel. We will explore\nthe `schedule` and `nchunks` options that can be used to get load balancing.\n\nThe value of a single pixel of the Julia set, which corresponds to a point in the\ncomplex number plane, can be computed by the following iteration procedure.\n\n````julia\nfunction _compute_pixel(i, j, n; max_iter = 255, c = -0.79 + 0.15 * im)\n    x = -2.0 + (j - 1) * 4.0 / (n - 1)\n    y = -2.0 + (i - 1) * 4.0 / (n - 1)\n\n    z = x + y * im\n    iter = max_iter\n    for k in 1:max_iter\n        if abs2(z) > 4.0\n            iter = k - 1\n            break\n        end\n        z = z^2 + c\n    end\n    return iter\nend\n````\n\n````\n_compute_pixel (generic function with 1 method)\n````\n\nNote that the value of the pixel is the number of performed iterations for the\ncorresponding complex input number. Hence, the computational **workload is non-uniform**.\n\n## Sequential computation\n\nIn our naive implementation, we just loop over the dimensions of the image matrix and call\nthe pixel kernel above.\n\n````julia\nfunction compute_juliaset_sequential!(img)\n    N = size(img, 1)\n    for j in 1:N\n        for i in 1:N\n            img[i, j] = _compute_pixel(i, j, N)\n        end\n    end\n    return img\nend\n\nN = 2000\nimg = zeros(Int, N, N)\ncompute_juliaset_sequential!(img);\n````\n\nLet's look at the result\n\n````julia\nusing Plots\np = heatmap(img)\n````\n![](juliaset-8.png)\n\n## Parallelization\n\nThe Julia set computation above is a `map!` operation: We apply some function to each\nelement of the array. Hence, we can use `tmap!` for parallelization. 
We use\n`CartesianIndices` to map between linear and two-dimensional cartesian indices.\n\n````julia\nusing OhMyThreads: tmap!\n\nfunction compute_juliaset_parallel!(img; kwargs...)\n    N = size(img, 1)\n    cart = CartesianIndices(img)\n    tmap!(img, eachindex(img); kwargs...) do idx\n        c = cart[idx]\n        _compute_pixel(c[1], c[2], N)\n    end\n    return img\nend\n\n# or alternatively\n#\n# function compute_juliaset_parallel!(img; kwargs...)\n#     N = size(img, 1)\n#     cart = CartesianIndices(img)\n#     @tasks for idx in eachindex(img)\n#         c = cart[idx]\n#         img[idx] = _compute_pixel(c[1], c[2], N)\n#     end\n#     return img\n# end\n\nN = 2000\nimg = zeros(Int, N, N)\ncompute_juliaset_parallel!(img);\np = heatmap(img)\n````\n![](juliaset-10.png)\n\n## Benchmark\n\nLet's benchmark the variants above.\n\n````julia\nusing BenchmarkTools\nusing Base.Threads: nthreads\n\nN = 2000\nimg = zeros(Int, N, N)\n\n@show nthreads()\n\n@btime compute_juliaset_sequential!($img) samples=10 evals=3;\n@btime compute_juliaset_parallel!($img) samples=10 evals=3;\n````\n\n````\nnthreads() = 10\n  131.295 ms (0 allocations: 0 bytes)\n  31.422 ms (68 allocations: 6.09 KiB)\n\n````\n\nAs hoped, the parallel implementation is much faster!\n\n### Dynamic vs static scheduling\n\nAs stated above, the per-pixel computation is non-uniform. Hence, we do benefit from\nthe load balancing of the default dynamic scheduler. The latter divides the overall\nworkload into tasks that can then be dynamically distributed among threads to adjust the\nper-thread load. 
We can try to fine tune and improve the load balancing further by\nincreasing the `ntasks` parameter of the scheduler, that is, creating more tasks with\nsmaller per-task workload.\n\n````julia\nusing OhMyThreads: DynamicScheduler\n\n@btime compute_juliaset_parallel!($img; ntasks=N, scheduler=:dynamic) samples=10 evals=3;\n````\n\n````\n  17.438 ms (12018 allocations: 1.11 MiB)\n\n````\n\nNote that while this turns out to be a bit faster, it comes at the expense of much more\nallocations.\n\nTo quantify the impact of load balancing we can opt out of dynamic scheduling and use the\n`StaticScheduler` instead. The latter doesn't provide any form of load balancing.\n\n````julia\nusing OhMyThreads: StaticScheduler\n\n@btime compute_juliaset_parallel!($img; scheduler=:static) samples=10 evals=3;\n````\n\n````\n  30.097 ms (73 allocations: 6.23 KiB)\n\n````\n\n---\n\n*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*\n\n"
  },
  {
    "path": "docs/src/literate/mc/Project.toml",
    "content": "[deps]\nBenchmarkTools = \"6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf\"\nOhMyThreads = \"67456a42-1dca-4109-a031-0a68de7e3ad5\"\n"
  },
  {
    "path": "docs/src/literate/mc/mc.jl",
    "content": "# # Parallel Monte Carlo\n#\n# Calculate the value of $\\pi$ through parallel direct Monte Carlo.\n#\n# A unit circle is inscribed inside a unit square with side length 2 (from -1 to 1).\n# The area of the circle is $\\pi$, the area of the square is 4, and the ratio is $\\pi/4$.\n# This means that, if you throw $N$ darts randomly at the square, approximately $M=N\\pi/4$\n# of those darts will land inside the unit circle.\n#\n# Throw darts randomly at a unit square and count how many of them ($M$) landed inside of\n# a unit circle. Approximate $\\pi \\approx 4M/N$.\n#\n# ## Sequential implementation:\n\nfunction mc(N)\n    M = 0 # number of darts that landed in the circle\n    for i in 1:N\n        if rand()^2 + rand()^2 < 1.0\n            M += 1\n        end\n    end\n    pi = 4 * M / N\n    return pi\nend\n\nN = 100_000_000\n\nmc(N)\n\n# ## Parallelization with `tmapreduce`\n#\n# To parallelize the Monte Carlo simulation, we use [`tmapreduce`](@ref) with `+` as the reduction\n# operator. For the map part, we take `1:N` as our input collection and \"throw one dart\" per\n# element.\n\nusing OhMyThreads\n\nfunction mc_parallel(N; kwargs...)\n    M = tmapreduce(+, 1:N; kwargs...) 
do i\n        rand()^2 + rand()^2 < 1.0\n    end\n    pi = 4 * M / N\n    return pi\nend\n\n## or alternatively\n##\n## function mc_parallel(N)\n##     M = @tasks for _ in 1:N\n##         @set reducer = +\n##         rand()^2 + rand()^2 < 1.0\n##     end\n##     pi = 4 * M / N\n##     return pi\n## end\n\nmc_parallel(N)\n\n# Let's run a quick benchmark.\n\nusing BenchmarkTools\nusing Base.Threads: nthreads\n\n@assert nthreads() > 1 # make sure we have multiple Julia threads\n@show nthreads()       # print out the number of threads\n\n@btime mc($N) samples=10 evals=3;\n@btime mc_parallel($N) samples=10 evals=3;\n\n# ### Static scheduling\n#\n# Because the workload is highly uniform, it makes sense to also try the `StaticScheduler`\n# and compare the performance of static and dynamic scheduling (with default parameters).\n\nusing OhMyThreads: StaticScheduler\n\n@btime mc_parallel($N; scheduler=:dynamic) samples=10 evals=3; # default\n@btime mc_parallel($N; scheduler=:static) samples=10 evals=3;\n\n# ## Manual parallelization\n#\n# First, using the `index_chunks` function, we divide the iteration interval `1:N` into\n# `nthreads()` parts. Then, we apply a regular (sequential) `map` to spawn a Julia task\n# per chunk. Each task will locally and independently perform a sequential Monte Carlo\n# simulation. 
Finally, we fetch the results and compute the average estimate for $\\pi$.\n\nusing OhMyThreads: @spawn, index_chunks\n\nfunction mc_parallel_manual(N; nchunks = nthreads())\n    tasks = map(index_chunks(1:N; n = nchunks)) do idcs\n        @spawn mc(length(idcs))\n    end\n    pi = sum(fetch, tasks) / nchunks\n    return pi\nend\n\nmc_parallel_manual(N)\n\n# And this is the performance:\n\n@btime mc_parallel_manual($N) samples=10 evals=3;\n\n# It is faster than `mc_parallel` above because the task-local computation\n# `mc(length(idcs))` is faster than the implicit task-local computation within\n# `tmapreduce` (which itself is a `mapreduce`).\n\nidcs = first(index_chunks(1:N; n = nthreads()))\n\n@btime mapreduce($+, $idcs) do i\n    rand()^2 + rand()^2 < 1.0\nend samples=10 evals=3;\n\n@btime mc($(length(idcs))) samples=10 evals=3;\n"
  },
  {
    "path": "docs/src/literate/mc/mc.md",
    "content": "```@meta\nEditURL = \"mc.jl\"\n```\n\n# Parallel Monte Carlo\n\nCalculate the value of $\\pi$ through parallel direct Monte Carlo.\n\nA unit circle is inscribed inside a unit square with side length 2 (from -1 to 1).\nThe area of the circle is $\\pi$, the area of the square is 4, and the ratio is $\\pi/4$.\nThis means that, if you throw $N$ darts randomly at the square, approximately $M=N\\pi/4$\nof those darts will land inside the unit circle.\n\nThrow darts randomly at a unit square and count how many of them ($M$) landed inside of\na unit circle. Approximate $\\pi \\approx 4M/N$.\n\n## Sequential implementation:\n\n````julia\nfunction mc(N)\n    M = 0 # number of darts that landed in the circle\n    for i in 1:N\n        if rand()^2 + rand()^2 < 1.0\n            M += 1\n        end\n    end\n    pi = 4 * M / N\n    return pi\nend\n\nN = 100_000_000\n\nmc(N)\n````\n\n````\n3.14171236\n````\n\n## Parallelization with `tmapreduce`\n\nTo parallelize the Monte Carlo simulation, we use [`tmapreduce`](@ref) with `+` as the reduction\noperator. For the map part, we take `1:N` as our input collection and \"throw one dart\" per\nelement.\n\n````julia\nusing OhMyThreads\n\nfunction mc_parallel(N; kwargs...)\n    M = tmapreduce(+, 1:N; kwargs...) 
do i\n        rand()^2 + rand()^2 < 1.0\n    end\n    pi = 4 * M / N\n    return pi\nend\n\n# or alternatively\n#\n# function mc_parallel(N)\n#     M = @tasks for _ in 1:N\n#         @set reducer = +\n#         rand()^2 + rand()^2 < 1.0\n#     end\n#     pi = 4 * M / N\n#     return pi\n# end\n\nmc_parallel(N)\n````\n\n````\n3.14156496\n````\n\nLet's run a quick benchmark.\n\n````julia\nusing BenchmarkTools\nusing Base.Threads: nthreads\n\n@assert nthreads() > 1 # make sure we have multiple Julia threads\n@show nthreads()       # print out the number of threads\n\n@btime mc($N) samples=10 evals=3;\n@btime mc_parallel($N) samples=10 evals=3;\n````\n\n````\nnthreads() = 10\n  301.636 ms (0 allocations: 0 bytes)\n  41.864 ms (68 allocations: 5.81 KiB)\n\n````\n\n### Static scheduling\n\nBecause the workload is highly uniform, it makes sense to also try the `StaticScheduler`\nand compare the performance of static and dynamic scheduling (with default parameters).\n\n````julia\nusing OhMyThreads: StaticScheduler\n\n@btime mc_parallel($N; scheduler=:dynamic) samples=10 evals=3; # default\n@btime mc_parallel($N; scheduler=:static) samples=10 evals=3;\n````\n\n````\n  41.839 ms (68 allocations: 5.81 KiB)\n  41.838 ms (68 allocations: 5.81 KiB)\n\n````\n\n## Manual parallelization\n\nFirst, using the `index_chunks` function, we divide the iteration interval `1:N` into\n`nthreads()` parts. Then, we apply a regular (sequential) `map` to spawn a Julia task\nper chunk. Each task will locally and independently perform a sequential Monte Carlo\nsimulation. 
Finally, we fetch the results and compute the average estimate for $\\pi$.\n\n````julia\nusing OhMyThreads: @spawn, index_chunks\n\nfunction mc_parallel_manual(N; nchunks = nthreads())\n    tasks = map(index_chunks(1:N; n = nchunks)) do idcs\n        @spawn mc(length(idcs))\n    end\n    pi = sum(fetch, tasks) / nchunks\n    return pi\nend\n\nmc_parallel_manual(N)\n````\n\n````\n3.14180504\n````\n\nAnd this is the performance:\n\n````julia\n@btime mc_parallel_manual($N) samples=10 evals=3;\n````\n\n````\n  30.224 ms (65 allocations: 5.70 KiB)\n\n````\n\nIt is faster than `mc_parallel` above because the task-local computation\n`mc(length(idcs))` is faster than the implicit task-local computation within\n`tmapreduce` (which itself is a `mapreduce`).\n\n````julia\nidcs = first(index_chunks(1:N; n = nthreads()))\n\n@btime mapreduce($+, $idcs) do i\n    rand()^2 + rand()^2 < 1.0\nend samples=10 evals=3;\n\n@btime mc($(length(idcs))) samples=10 evals=3;\n````\n\n````\n  41.750 ms (0 allocations: 0 bytes)\n  30.148 ms (0 allocations: 0 bytes)\n\n````\n\n---\n\n*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*\n\n"
  },
  {
    "path": "docs/src/literate/tls/Project.toml",
    "content": "[deps]\nBenchmarkTools = \"6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf\"\nBumper = \"8ce10254-0962-460f-a3d8-1f77fea1446e\"\nOhMyThreads = \"67456a42-1dca-4109-a031-0a68de7e3ad5\"\nThreadPinning = \"811555cd-349b-4f26-b7bc-1f208b848042\"\n"
  },
  {
    "path": "docs/src/literate/tls/tls.jl",
    "content": "# # [Thread-Safe Storage](@id TSS)\n#\n# For some programs, it can be useful or even necessary to allocate and (re-)use memory in\n# your parallel code (e.g. your computation might require temporary buffers).\n# The following section demonstrates common issues that can arise in such a scenario and,\n# by means of a simple example, explains techniques to handle such cases safely.\n# Specifically, we'll dicuss (1) how task-local storage (TLS) can be used efficiently and\n# (2) how channels can be used to organize per-task buffer allocation in a thread-safe\n# manner.\n#\n#\n# ## Test case (sequential)\n#\n# Let's say that we are given two arrays of matrices, `As` and `Bs`, and let's\n# further assume that our goal is to compute the total sum of all pairwise matrix products.\n# We can readily implement a (sequential) function that performs the necessary computations.\nusing LinearAlgebra: mul!, BLAS\nBLAS.set_num_threads(1) #  for simplicity, we turn off OpenBLAS multithreading\nusing ThreadPinning #hide\npinthreads(:cores) #hide\n\nfunction matmulsums(As, Bs)\n    N = size(first(As), 1)\n    C = Matrix{Float64}(undef, N, N)\n    map(As, Bs) do A, B\n        mul!(C, A, B)\n        sum(C)\n    end\nend\n\n# Here, we use `map` to perform the desired operation for each pair of matrices,\n# `A` and `B`. However, the crucial point for our discussion is that we want to use the\n# in-place matrix multiplication `LinearAlgebra.mul!` in conjunction with a pre-allocated\n# temporary buffer, the output matrix `C`. This is to avoid the temporary allocation per\n# \"iteration\" (i.e. 
per matrix pair) that we would get with `C = A*B`.\n#\n# For later comparison, we generate some random input data and store the result.\n\nAs = [rand(256, 16) for _ in 1:768]\nBs = [rand(16, 256) for _ in 1:768]\n\nres = matmulsums(As, Bs);\n\n# ## How to not parallelize\n#\n# The key idea for creating a parallel version of `matmulsums` is to replace the `map` by\n# OhMyThreads' parallel [`tmap`](@ref) function. However, because we re-use `C`, this isn't\n# entirely trivial. Someone new to parallel computing might be tempted to parallelize\n# `matmulsums` like this:\nusing OhMyThreads: tmap\n\nfunction matmulsums_race(As, Bs)\n    N = size(first(As), 1)\n    C = Matrix{Float64}(undef, N, N)\n    tmap(As, Bs) do A, B\n        mul!(C, A, B)\n        sum(C)\n    end\nend\n\n# Unfortunately, this doesn't produce the correct result.\n\nres_race = matmulsums_race(As, Bs)\nres ≈ res_race\n\n# In fact, it doesn't even always produce the same result (check for yourself)!\n# The reason is that there is a race condition: different parallel\n# tasks are trying to use the shared variable `C` simultaneously leading to\n# non-deterministic behavior. Let's see how we can fix this.\n#\n# ### The naive (and inefficient) fix\n#\n# A simple solution for the race condition issue above is to move the allocation of `C`\n# into the body of the parallel `tmap`:\n\nfunction matmulsums_naive(As, Bs)\n    N = size(first(As), 1)\n    tmap(As, Bs) do A, B\n        C = Matrix{Float64}(undef, N, N)\n        mul!(C, A, B)\n        sum(C)\n    end\nend\n\n# In this case, a separate `C` will be allocated for each iteration such that parallel tasks\n# no longer mutate shared state. Hence, we'll get the desired result.\n\nres_naive = matmulsums_naive(As, Bs)\nres ≈ res_naive\n\n# However, this variant is obviously inefficient because it is no better than just writing\n# `C = A*B` and thus leads to one allocation per matrix pair. 
We need a different way of\n# allocating and re-using `C` for an efficient parallel version.\n\n# ## [Task-local storage](@id TLS)\n#\n# ### The manual (and cumbersome) way\n#\n# We've seen that we can't allocate `C` once up-front (→ race condition) and also shouldn't\n# allocate it within the `tmap` (→ one allocation per iteration). Instead, we can assign a\n# separate \"C\" on each parallel task once and then use this task-local \"C\" for all\n# iterations (i.e. matrix pairs) for which this task is responsible.\n# Before we learn how to do this more conveniently, let's implement this idea of a\n# task-local temporary buffer (for each parallel task) manually.\nusing OhMyThreads: index_chunks, @spawn\nusing Base.Threads: nthreads\n\nfunction matmulsums_manual(As, Bs)\n    N = size(first(As), 1)\n    tasks = map(index_chunks(As; n = 2 * nthreads())) do idcs\n        @spawn begin\n            local C = Matrix{Float64}(undef, N, N)\n            map(idcs) do i\n                A = As[i]\n                B = Bs[i]\n\n                mul!(C, A, B)\n                sum(C)\n            end\n        end\n    end\n    mapreduce(fetch, vcat, tasks)\nend\n\nres_manual = matmulsums_manual(As, Bs)\nres ≈ res_manual\n\n# We note that this is rather cumbersome and you might not\n# want to write it (repeatedly). But let's take a closer look and see what's happening here.\n# First, we divide the number of matrix pairs into `2 * nthreads()` chunks. Then, for each of\n# those chunks, we spawn a parallel task that (1) allocates a task-local `C` matrix (and a\n# `results` vector) and (2) performs the actual computations using these pre-allocated\n# buffers. Finally, we `fetch` the results of the tasks and combine them. This variant works\n# just fine and the good news is that we can get the same behavior with less manual work.\n#\n# ### [The shortcut: `TaskLocalValue`](@id TLV)\n#\n# The desire for task-local storage is quite natural with task-based multithreading. 
For\n# this reason, Julia supports this out of the box with\n# [`Base.task_local_storage`](https://docs.julialang.org/en/v1/base/parallel/#Base.task_local_storage-Tuple{Any}).\n# But instead of using this directly (which you could), we will use a convenience wrapper\n# around it called [`TaskLocalValue`](https://github.com/vchuravy/TaskLocalValues.jl).\n# This allows us to express the idea from above in few lines of code:\nusing OhMyThreads: TaskLocalValue\n\nfunction matmulsums_tlv(As, Bs; kwargs...)\n    N = size(first(As), 1)\n    tlv = TaskLocalValue{Matrix{Float64}}(() -> Matrix{Float64}(undef, N, N))\n    tmap(As, Bs; kwargs...) do A, B\n        C = tlv[]\n        mul!(C, A, B)\n        sum(C)\n    end\nend\n\nres_tlv = matmulsums_tlv(As, Bs)\nres ≈ res_tlv\n\n# Here, `TaskLocalValue{Matrix{Float64}}(() -> Matrix{Float64}(undef, N, N))` creates a\n# task-local value - essentially a reference to a value in the task-local storage - that\n# behaves like this: The first time the task-local value is accessed from a task (`tlv[]`)\n# it is initialized according to the provided anonymous function. Afterwards, every\n# following query (from the same task!) 
will simply look up and return the task-local value.\n# This solves our issues above and leads to $O(\\textrm{parallel tasks})$\n# (instead of $O(\\textrm{iterations})$) allocations.\n#\n# Note that if you use our `@tasks` macro API, there is built-in support for task-local\n# values via `@local`.\n#\n\nusing OhMyThreads: @tasks\n\nfunction matmulsums_tlv_macro(As, Bs; kwargs...)\n    N = size(first(As), 1)\n    @tasks for i in eachindex(As, Bs)\n        @set collect = true\n        @local C = Matrix{Float64}(undef, N, N)\n        mul!(C, As[i], Bs[i])\n        sum(C)\n    end\nend\n\nres_tlv_macro = matmulsums_tlv_macro(As, Bs)\nres ≈ res_tlv_macro\n\n# Here, `@local` expands to a pattern similar to the `TaskLocalValue` one above, although it automatically\n# infers that the object's type is `Matrix{Float64}`, and it carries some optimizations (see\n# [`OhMyThreads.WithTaskLocals`](@ref)) which can make accessing task local values more efficient in\n# loops which take on the order of 100ns to complete.\n#\n#\n# ### Benchmark\n#\n# The whole point of parallelization is increasing performance, so let's benchmark and\n# compare the performance of the variants that we've discussed so far.\n\nusing BenchmarkTools\n\n@show nthreads()\n\n@btime matmulsums($As, $Bs);\nsleep(2) #hide\n@btime matmulsums_naive($As, $Bs);\nsleep(2) #hide\n@btime matmulsums_manual($As, $Bs);\nsleep(2) #hide\n@btime matmulsums_tlv($As, $Bs);\nsleep(2) #hide\n@btime matmulsums_tlv_macro($As, $Bs);\n\n# As we can see, `matmulsums_tlv` (and `matmulsums_tlv_macro`) isn't only convenient\n# but also efficient: It allocates much less memory than `matmulsums_naive` and is about on\n# par with the manual implementation.\n#\n#\n# ## Per-thread allocation\n#\n# The task-local solution above has one potential caveat: If we spawn many parallel tasks\n# (e.g. for load-balancing reasons) we need just as many task-local buffers. 
This can\n# clearly be suboptimal because only `nthreads()` tasks can run simultaneously. Hence, one\n# buffer per thread should actually suffice.\n# Of course, this raises the question of how to organize a pool of \"per-thread\" buffers\n# such that each running task always has exclusive (temporary) access to a buffer (we need\n# to make sure to avoid races).\n#\n# ### The naive (and incorrect) approach\n# A naive approach to implementing this idea is to pre-allocate an array of buffers\n# and then to use the `threadid()` to select a buffer for a running task.\n#\nusing Base.Threads: threadid\n\nfunction matmulsums_perthread_incorrect(As, Bs)\n    N = size(first(As), 1)\n    Cs = [Matrix{Float64}(undef, N, N) for _ in 1:nthreads()]\n    tmap(As, Bs) do A, B\n        C = Cs[threadid()]\n        mul!(C, A, B)\n        sum(C)\n    end\nend;\n\n# This approach is [**wrong**](https://julialang.org/blog/2023/07/PSA-dont-use-threadid/). The first issue is that `threadid()`\n# doesn't necessarily start at 1 (and thus might return a value `> nthreads()`), in which\n# case `Cs[threadid()]` would be an out-of-bounds access attempt. This might be surprising\n# but is a simple consequence of the ordering of different kinds of Julia threads: If Julia\n# is started with a non-zero number of interactive threads, e.g. `--threads 5,2`, the\n# interactive threads come first (look at `Threads.threadpool.(1:Threads.maxthreadid())`).\n# [Starting in julia v1.12, julia will launch with one interactive thread](https://github.com/JuliaLang/julia/pull/57087),\n# and so the above code will error by default.\n#\n# But even if we account for this offset there is another, more fundamental problem, namely\n# **task-migration**. 
By default, all spawned parallel tasks are \"non-sticky\" and can\n# dynamically migrate between different Julia threads (loosely speaking, at any point in time).\n# This means nothing other than that **`threadid()` is not necessarily constant for a task**!\n# For example, imagine that task A starts on thread 4, loads the\n# buffer `Cs[4]`, but then gets paused, migrated, and continues execution on, say, thread 5.\n# Afterwards, while task A is performing `mul!(Cs[4], ...)`, a different task B might start on\n# (the now available) thread 4 and also read and use `Cs[4]`. This would lead to a race\n# condition because both tasks are mutating the same buffer.\n# (Note that, in practice, this - most likely 😉 - doesn't happen for the very simple example\n# above, but you can't rely on it!)\n#\n# ### The quick (and non-recommended) fix\n#\n# A simple solution for the task-migration issue is to opt-out of dynamic scheduling with\n# `scheduler=:static` (or `scheduler=StaticScheduler()`). This scheduler statically\n# assigns tasks to threads upfront without any dynamic rescheduling\n# (the tasks are sticky and won't migrate).\n#\n# We'll also need to switch from `nthreads` to `maxthreadid`, since that can be greater than\n# `nthreads`, as described above.\n#\nnum_to_store() = isdefined(Threads, :maxthreadid) ? Threads.maxthreadid() : Threads.nthreads()\n\nfunction matmulsums_perthread_static(As, Bs)\n    N = size(first(As), 1)\n    Cs = [Matrix{Float64}(undef, N, N) for _ in 1:num_to_store()]\n    ## Note!!!\n    ## This code is *incorrect* if used with a non-static scheduler. 
this\n    ## isn't just true in OhMyThreads but also applies to `Threads.@threads`\n    ## You *must* use `Threads.@threads :static` or `scheduler = :static` to\n    ## avoid race-conditions caused by task migration.\n    tmap(As, Bs; scheduler = :static) do A, B\n        C = Cs[threadid()]\n        mul!(C, A, B)\n        sum(C)\n    end\nend\n\n## non uniform workload\nAs_nu = [rand(256, isqrt(i)^2) for i in 1:768];\nBs_nu = [rand(isqrt(i)^2, 256) for i in 1:768];\nres_nu = matmulsums(As_nu, Bs_nu);\n\nres_pt_static = matmulsums_perthread_static(As_nu, Bs_nu)\nres_nu ≈ res_pt_static\n\n# However, this approach has serious shortcomings.\n#\n# 1. It can easily be broken if someone doesn't know that the `scheduler = :static`\n# option is required for correctness, and removes it in a refactor.\n# 2. It makes the parallel code  non-composable: If we call other multithreaded functions\n# within the `tmap` or if our parallel `matmulsums_perthread_static` itself gets called\n# from another parallel region we will likely oversubscribe the Julia threads and get subpar\n# performance.\n# 3. It can waste memory by creating too many temporary storage slots since `maxthreadid()`\n# can give an over-estimate of the number of slots needed for the computation.\n#\n# While the above pattern might be the easiest to migrate to from the incorrect pattern,\n# we do not recommend it. We instead urge you to use task-local-storages, or the `Channel`\n# based techniques described below:\n#\n# ### The safe way: `Channel`\n#\n# Instead of storing the pre-allocated buffers in an array, we can put them into a `Channel`\n# which internally ensures that parallel access is safe. 
In this scenario, we simply `take!`\n# a buffer from the channel whenever we need it and `put!` it back after our computation is\n# done.\n#\nfunction matmulsums_perthread_channel(As, Bs; nbuffers = nthreads(), kwargs...)\n    N = size(first(As), 1)\n    chnl = Channel{Matrix{Float64}}(nbuffers)\n    foreach(1:nbuffers) do _\n        put!(chnl, Matrix{Float64}(undef, N, N))\n    end\n    tmap(As, Bs; kwargs...) do A, B\n        C = take!(chnl)\n        mul!(C, A, B)\n        result = sum(C)\n        put!(chnl, C)\n        result\n    end\nend\n\nres_pt_channel = matmulsums_perthread_channel(As_nu, Bs_nu)\nres_nu ≈ res_pt_channel\n\n#\n# ### Benchmark\n#\n# Let's benchmark the variants above and compare them to the task-local implementation.\n# We want to look at both `ntasks = nthreads()` and `ntasks > nthreads()`, the latter\n# of which gives us dynamic load balancing.\n#\n\n## no load balancing because ntasks == nthreads()\n@btime matmulsums_tlv($As_nu, $Bs_nu);\n@btime matmulsums_perthread_static($As_nu, $Bs_nu);\n@btime matmulsums_perthread_channel($As_nu, $Bs_nu);\n\n## load balancing because ntasks > nthreads()\n@btime matmulsums_tlv($As_nu, $Bs_nu; ntasks = 2 * nthreads());\n@btime matmulsums_perthread_channel($As_nu, $Bs_nu; ntasks = 2 * nthreads());\n\n@btime matmulsums_tlv($As_nu, $Bs_nu; ntasks = 10 * nthreads());\n@btime matmulsums_perthread_channel($As_nu, $Bs_nu; ntasks = 10 * nthreads());\n\n#\n# Note that the runtime of `matmulsums_perthread_channel` improves with increasing number\n# of chunks/tasks (due to load balancing) while the amount of allocated memory doesn't\n# increase much. Contrast this with the drastic memory increase with `matmulsums_tlv`.\n#\n# ### Another safe way based on `Channel`\n#\n# Above, we chose to put a limited number of buffers (e.g. `nthreads()`) into the channel\n# and then spawn many tasks (one per input element). 
Sometimes it can make sense to flip\n# things around and put the (many) input elements into a channel and only spawn\n# a limited number of tasks (e.g. `nthreads()`) with task-local buffers.\n#\nusing OhMyThreads: tmapreduce\n\nfunction matmulsums_perthread_channel_flipped(As, Bs; ntasks = nthreads())\n    N = size(first(As), 1)\n    chnl = Channel{Int}(length(As); spawn = true) do chnl\n        for i in 1:length(As)\n            put!(chnl, i)\n        end\n    end\n    tmapreduce(vcat, 1:ntasks; chunking=false) do _ # we turn chunking off\n        local C = Matrix{Float64}(undef, N, N)\n        map(chnl) do i # implicitly takes the values from the channel (parallel safe)\n            A = As[i]\n            B = Bs[i]\n            mul!(C, A, B)\n            sum(C)\n        end\n    end\nend;\n\n# Note that one caveat of this approach is that the input → task assignment, and thus the\n# order of the output, is **non-deterministic**. For this reason, we sort the output to check\n# for correctness.\n\nres_channel_flipped = matmulsums_perthread_channel_flipped(As_nu, Bs_nu)\nsort(res_nu) ≈ sort(res_channel_flipped)\n\n# Quick benchmark:\n\n@btime matmulsums_perthread_channel_flipped($As_nu, $Bs_nu);\n@btime matmulsums_perthread_channel_flipped($As_nu, $Bs_nu; ntasks = 2 * nthreads());\n@btime matmulsums_perthread_channel_flipped($As_nu, $Bs_nu; ntasks = 10 * nthreads());\n\n# In addition, OhMyThreads provides an iterator-wrapper type\n# [`OhMyThreads.ChannelLike`](@ref) which can be used in place of a `Channel`. If\n# the number of elements is large this can be more efficient since there is no\n# need to copy the elements into the `Channel`. Concretely, in the example above,\n# we could replace `Channel() do .. end` with\n# `OhMyThreads.ChannelLike(1:length(As))`.\n\n# ### Bumper.jl (only for the brave)\n#\n# If you are bold and want to cut down temporary allocations even more you can\n# give [Bumper.jl](https://github.com/MasonProtter/Bumper.jl) a try. 
Essentially, it\n# allows you to *bring your own stacks*, that is, task-local bump allocators which you can\n# dynamically allocate memory to, and reset them at the end of a code block, just like\n# Julia's stack.\n# Be warned though that Bumper.jl is (1) a rather young package with (likely) some bugs\n# and (2) can easily lead to segfaults when used incorrectly. If you can live with the\n# risk, Bumper.jl is especially useful for cases where we don't know ahead of time how large\n# a matrix to pre-allocate, and even more useful if we want to do many intermediate\n# allocations on the task, not just one. For our example, this isn't the case but let's\n# nonetheless see how one would use Bumper.jl here.\n\nusing Bumper\n\nfunction matmulsums_bumper(As, Bs)\n    tmap(As, Bs) do A, B\n        @no_escape begin # promising that no memory will escape\n            N = size(A, 1)\n            C = @alloc(Float64, N, N) # from bump allocater (fake \"stack\")\n            mul!(C, A, B)\n            sum(C)\n        end\n    end\nend\n\nres_bumper = matmulsums_bumper(As, Bs);\nsort(res) ≈ sort(res_bumper)\n\n@btime matmulsums_bumper($As, $Bs);\n\n# Note that the benchmark is lying here about the total memory allocation,\n# because it doesn't show the allocation of the task-local bump allocators themselves\n# (the reason is that `SlabBuffer` uses `malloc` directly).\n"
  },
  {
    "path": "docs/src/literate/tls/tls.md",
    "content": "```@meta\nEditURL = \"tls.jl\"\n```\n\n# [Thread-Safe Storage](@id TSS)\n\nFor some programs, it can be useful or even necessary to allocate and (re-)use memory in\nyour parallel code (e.g. your computation might require temporary buffers).\nThe following section demonstrates common issues that can arise in such a scenario and,\nby means of a simple example, explains techniques to handle such cases safely.\nSpecifically, we'll dicuss (1) how task-local storage (TLS) can be used efficiently and\n(2) how channels can be used to organize per-task buffer allocation in a thread-safe\nmanner.\n\n\n## Test case (sequential)\n\nLet's say that we are given two arrays of matrices, `As` and `Bs`, and let's\nfurther assume that our goal is to compute the total sum of all pairwise matrix products.\nWe can readily implement a (sequential) function that performs the necessary computations.\n\n````julia\nusing LinearAlgebra: mul!, BLAS\nBLAS.set_num_threads(1) #  for simplicity, we turn off OpenBLAS multithreading\n\nfunction matmulsums(As, Bs)\n    N = size(first(As), 1)\n    C = Matrix{Float64}(undef, N, N)\n    map(As, Bs) do A, B\n        mul!(C, A, B)\n        sum(C)\n    end\nend\n````\n\n````\nmatmulsums (generic function with 1 method)\n````\n\nHere, we use `map` to perform the desired operation for each pair of matrices,\n`A` and `B`. However, the crucial point for our discussion is that we want to use the\nin-place matrix multiplication `LinearAlgebra.mul!` in conjunction with a pre-allocated\ntemporary buffer, the output matrix `C`. This is to avoid the temporary allocation per\n\"iteration\" (i.e. 
per matrix pair) that we would get with `C = A*B`.\n\nFor later comparison, we generate some random input data and store the result.\n\n````julia\nAs = [rand(256, 16) for _ in 1:768]\nBs = [rand(16, 256) for _ in 1:768]\n\nres = matmulsums(As, Bs);\n````\n\n## How to not parallelize\n\nThe key idea for creating a parallel version of `matmulsums` is to replace the `map` by\nOhMyThreads' parallel [`tmap`](@ref) function. However, because we re-use `C`, this isn't\nentirely trivial. Someone new to parallel computing might be tempted to parallelize\n`matmulsums` like this:\n\n````julia\nusing OhMyThreads: tmap\n\nfunction matmulsums_race(As, Bs)\n    N = size(first(As), 1)\n    C = Matrix{Float64}(undef, N, N)\n    tmap(As, Bs) do A, B\n        mul!(C, A, B)\n        sum(C)\n    end\nend\n````\n\n````\nmatmulsums_race (generic function with 1 method)\n````\n\nUnfortunately, this doesn't produce the correct result.\n\n````julia\nres_race = matmulsums_race(As, Bs)\nres ≈ res_race\n````\n\n````\nfalse\n````\n\nIn fact, it doesn't even always produce the same result (check for yourself)!\nThe reason is that there is a race condition: different parallel\ntasks are trying to use the shared variable `C` simultaneously leading to\nnon-deterministic behavior. Let's see how we can fix this.\n\n### The naive (and inefficient) fix\n\nA simple solution for the race condition issue above is to move the allocation of `C`\ninto the body of the parallel `tmap`:\n\n````julia\nfunction matmulsums_naive(As, Bs)\n    N = size(first(As), 1)\n    tmap(As, Bs) do A, B\n        C = Matrix{Float64}(undef, N, N)\n        mul!(C, A, B)\n        sum(C)\n    end\nend\n````\n\n````\nmatmulsums_naive (generic function with 1 method)\n````\n\nIn this case, a separate `C` will be allocated for each iteration such that parallel tasks\nno longer mutate shared state. 
Hence, we'll get the desired result.\n\n````julia\nres_naive = matmulsums_naive(As, Bs)\nres ≈ res_naive\n````\n\n````\ntrue\n````\n\nHowever, this variant is obviously inefficient because it is no better than just writing\n`C = A*B` and thus leads to one allocation per matrix pair. We need a different way of\nallocating and re-using `C` for an efficient parallel version.\n\n## [Task-local storage](@id TLS)\n\n### The manual (and cumbersome) way\n\nWe've seen that we can't allocate `C` once up-front (→ race condition) and also shouldn't\nallocate it within the `tmap` (→ one allocation per iteration). Instead, we can assign a\nseparate \"C\" on each parallel task once and then use this task-local \"C\" for all\niterations (i.e. matrix pairs) for which this task is responsible.\nBefore we learn how to do this more conveniently, let's implement this idea of a\ntask-local temporary buffer (for each parallel task) manually.\n\n````julia\nusing OhMyThreads: index_chunks, @spawn\nusing Base.Threads: nthreads\n\nfunction matmulsums_manual(As, Bs)\n    N = size(first(As), 1)\n    tasks = map(index_chunks(As; n = 2 * nthreads())) do idcs\n        @spawn begin\n            local C = Matrix{Float64}(undef, N, N)\n            map(idcs) do i\n                A = As[i]\n                B = Bs[i]\n\n                mul!(C, A, B)\n                sum(C)\n            end\n        end\n    end\n    mapreduce(fetch, vcat, tasks)\nend\n\nres_manual = matmulsums_manual(As, Bs)\nres ≈ res_manual\n````\n\n````\ntrue\n````\n\nWe note that this is rather cumbersome and you might not\nwant to write it (repeatedly). But let's take a closer look and see what's happening here.\nFirst, we divide the number of matrix pairs into `2 * nthreads()` chunks. Then, for each of\nthose chunks, we spawn a parallel task that (1) allocates a task-local `C` matrix (and a\n`results` vector) and (2) performs the actual computations using these pre-allocated\nbuffers. 
Finally, we `fetch` the results of the tasks and combine them. This variant works\njust fine and the good news is that we can get the same behavior with less manual work.\n\n### [The shortcut: `TaskLocalValue`](@id TLV)\n\nThe desire for task-local storage is quite natural with task-based multithreading. For\nthis reason, Julia supports this out of the box with\n[`Base.task_local_storage`](https://docs.julialang.org/en/v1/base/parallel/#Base.task_local_storage-Tuple{Any}).\nBut instead of using this directly (which you could), we will use a convenience wrapper\naround it called [`TaskLocalValue`](https://github.com/vchuravy/TaskLocalValues.jl).\nThis allows us to express the idea from above in a few lines of code:\n\n````julia\nusing OhMyThreads: TaskLocalValue\n\nfunction matmulsums_tlv(As, Bs; kwargs...)\n    N = size(first(As), 1)\n    tlv = TaskLocalValue{Matrix{Float64}}(() -> Matrix{Float64}(undef, N, N))\n    tmap(As, Bs; kwargs...) do A, B\n        C = tlv[]\n        mul!(C, A, B)\n        sum(C)\n    end\nend\n\nres_tlv = matmulsums_tlv(As, Bs)\nres ≈ res_tlv\n````\n\n````\ntrue\n````\n\nHere, `TaskLocalValue{Matrix{Float64}}(() -> Matrix{Float64}(undef, N, N))` creates a\ntask-local value - essentially a reference to a value in the task-local storage - that\nbehaves like this: The first time the task-local value is accessed from a task (`tlv[]`)\nit is initialized according to the provided anonymous function. Afterwards, every\nfollowing query (from the same task!) 
will simply lookup and return the task-local value.\nThis solves our issues above and leads to $O(\\textrm{parallel tasks})$\n(instead of $O(\\textrm{iterations})$) allocations.\n\nNote that if you use our `@tasks` macro API, there is built-in support for task-local\nvalues via `@local`.\n\n````julia\nusing OhMyThreads: @tasks\n\nfunction matmulsums_tlv_macro(As, Bs; kwargs...)\n    N = size(first(As), 1)\n    @tasks for i in eachindex(As, Bs)\n        @set collect = true\n        @local C = Matrix{Float64}(undef, N, N)\n        mul!(C, As[i], Bs[i])\n        sum(C)\n    end\nend\n\nres_tlv_macro = matmulsums_tlv_macro(As, Bs)\nres ≈ res_tlv_macro\n````\n\n````\ntrue\n````\n\nHere, `@local` expands to a pattern similar to the `TaskLocalValue` one above, although automatically\ninfers that the object's type is `Matrix{Float64}`, and it carries some optimizations (see\n[`OhMyThreads.WithTaskLocals`](@ref)) which can make accessing task local values more efficient in\nloops which take on the order of 100ns to complete.\n\n\n### Benchmark\n\nThe whole point of parallelization is increasing performance, so let's benchmark and\ncompare the performance of the variants that we've discussed so far.\n\n````julia\nusing BenchmarkTools\n\n@show nthreads()\n\n@btime matmulsums($As, $Bs);\n@btime matmulsums_naive($As, $Bs);\n@btime matmulsums_manual($As, $Bs);\n@btime matmulsums_tlv($As, $Bs);\n@btime matmulsums_tlv_macro($As, $Bs);\n````\n\n````\nnthreads() = 6\n  50.439 ms (6 allocations: 518.14 KiB)\n  39.387 ms (2467 allocations: 384.09 MiB)\n  9.743 ms (165 allocations: 6.05 MiB)\n  9.749 ms (962 allocations: 3.05 MiB)\n  9.859 ms (199 allocations: 3.04 MiB)\n\n````\n\nAs we can see, `matmulsums_tlv` (and `matmulsums_tlv_macro`) isn't only convenient\nbut also efficient: It allocates much less memory than `matmulsums_naive` and is about on\npar with the manual implementation.\n\n\n## Per-thread allocation\n\nThe task-local solution above has one potential caveat: If we 
spawn many parallel tasks\n(e.g. for load-balancing reasons) we need just as many task-local buffers. This can\nclearly be suboptimal because only `nthreads()` tasks can run simultaneously. Hence, one\nbuffer per thread should actually suffice.\nOf course, this raises the question of how to organize a pool of \"per-thread\" buffers\nsuch that each running task always has exclusive (temporary) access to a buffer (we need\nto make sure to avoid races).\n\n### The naive (and incorrect) approach\nA naive approach to implementing this idea is to pre-allocate an array of buffers\nand then to use the `threadid()` to select a buffer for a running task.\n\n````julia\nusing Base.Threads: threadid\n\nfunction matmulsums_perthread_incorrect(As, Bs)\n    N = size(first(As), 1)\n    Cs = [Matrix{Float64}(undef, N, N) for _ in 1:nthreads()]\n    tmap(As, Bs) do A, B\n        C = Cs[threadid()]\n        mul!(C, A, B)\n        sum(C)\n    end\nend;\n````\n\nThis approach is [**wrong**](https://julialang.org/blog/2023/07/PSA-dont-use-threadid/). The first issue is that `threadid()`\ndoesn't necessarily start at 1 (and thus might return a value `> nthreads()`), in which\ncase `Cs[threadid()]` would be an out-of-bounds access attempt. This might be surprising\nbut is a simple consequence of the ordering of different kinds of Julia threads: If Julia\nis started with a non-zero number of interactive threads, e.g. `--threads 5,2`, the\ninteractive threads come first (look at `Threads.threadpool.(1:Threads.maxthreadid())`).\n[Starting in Julia v1.12, Julia will launch with one interactive thread](https://github.com/JuliaLang/julia/pull/57087),\nand so the above code will error by default.\n\nBut even if we account for this offset there is another, more fundamental problem, namely\n**task-migration**. 
By default, all spawned parallel tasks are \"non-sticky\" and can\ndynamically migrate between different Julia threads (loosely speaking, at any point in time).\nThis means nothing other than that **`threadid()` is not necessarily constant for a task**!\nFor example, imagine that task A starts on thread 4, loads the\nbuffer `Cs[4]`, but then gets paused, migrated, and continues execution on, say, thread 5.\nAfterwards, while task A is performing `mul!(Cs[4], ...)`, a different task B might start on\n(the now available) thread 4 and also read and use `Cs[4]`. This would lead to a race\ncondition because both tasks are mutating the same buffer.\n(Note that, in practice, this - most likely 😉 - doesn't happen for the very simple example\nabove, but you can't rely on it!)\n\n### The quick (and non-recommended) fix\n\nA simple solution for the task-migration issue is to opt-out of dynamic scheduling with\n`scheduler=:static` (or `scheduler=StaticScheduler()`). This scheduler statically\nassigns tasks to threads upfront without any dynamic rescheduling\n(the tasks are sticky and won't migrate).\n\nWe'll also need to switch from `nthreads` to `maxthreadid`, since that can be greater than\n`nthreads`, as described above.\n\n````julia\nnum_to_store() = isdefined(Threads, :maxthreadid) ? Threads.maxthreadid() : Threads.nthreads()\n\nfunction matmulsums_perthread_static(As, Bs)\n    N = size(first(As), 1)\n    Cs = [Matrix{Float64}(undef, N, N) for _ in 1:num_to_store()]\n    # Note!!!\n    # This code is *incorrect* if used with a non-static scheduler. 
this\n    # isn't just true in OhMyThreads but also applies to `Threads.@threads`\n    # You *must* use `Threads.@threads :static` or `scheduler = :static` to\n    # avoid race-conditions caused by task migration.\n    tmap(As, Bs; scheduler = :static) do A, B\n        C = Cs[threadid()]\n        mul!(C, A, B)\n        sum(C)\n    end\nend\n\n# non uniform workload\nAs_nu = [rand(256, isqrt(i)^2) for i in 1:768];\nBs_nu = [rand(isqrt(i)^2, 256) for i in 1:768];\nres_nu = matmulsums(As_nu, Bs_nu);\n\nres_pt_static = matmulsums_perthread_static(As_nu, Bs_nu)\nres_nu ≈ res_pt_static\n````\n\n````\ntrue\n````\n\nHowever, this approach has serious shortcomings.\n\n1. It can easily be broken if someone doesn't know that the `scheduler = :static`\noption is required for correctness, and removes it in a refactor.\n2. It makes the parallel code  non-composable: If we call other multithreaded functions\nwithin the `tmap` or if our parallel `matmulsums_perthread_static` itself gets called\nfrom another parallel region we will likely oversubscribe the Julia threads and get subpar\nperformance.\n3. It can waste memory by creating too many temporary storage slots since `maxthreadid()`\ncan give an over-estimate of the number of slots needed for the computation.\n\nWhile the above pattern might be the easiest to migrate to from the incorrect pattern,\nwe do not recommend it. We instead urge you to use task-local-storages, or the `Channel`\nbased techniques described below:\n\n### The safe way: `Channel`\n\nInstead of storing the pre-allocated buffers in an array, we can put them into a `Channel`\nwhich internally ensures that parallel access is safe. 
In this scenario, we simply `take!`\na buffer from the channel whenever we need it and `put!` it back after our computation is\ndone.\n\n````julia\nfunction matmulsums_perthread_channel(As, Bs; nbuffers = nthreads(), kwargs...)\n    N = size(first(As), 1)\n    chnl = Channel{Matrix{Float64}}(nbuffers)\n    foreach(1:nbuffers) do _\n        put!(chnl, Matrix{Float64}(undef, N, N))\n    end\n    tmap(As, Bs; kwargs...) do A, B\n        C = take!(chnl)\n        mul!(C, A, B)\n        result = sum(C)\n        put!(chnl, C)\n        result\n    end\nend\n\nres_pt_channel = matmulsums_perthread_channel(As_nu, Bs_nu)\nres_nu ≈ res_pt_channel\n````\n\n````\ntrue\n````\n\n### Benchmark\n\nLet's benchmark the variants above and compare them to the task-local implementation.\nWe want to look at both `ntasks = nthreads()` and `ntasks > nthreads()`, the latter\nof which gives us dynamic load balancing.\n\n````julia\n# no load balancing because ntasks == nthreads()\n@btime matmulsums_tlv($As_nu, $Bs_nu);\n@btime matmulsums_perthread_static($As_nu, $Bs_nu);\n@btime matmulsums_perthread_channel($As_nu, $Bs_nu);\n\n# load balancing because ntasks > nthreads()\n@btime matmulsums_tlv($As_nu, $Bs_nu; ntasks = 2 * nthreads());\n@btime matmulsums_perthread_channel($As_nu, $Bs_nu; ntasks = 2 * nthreads());\n\n@btime matmulsums_tlv($As_nu, $Bs_nu; ntasks = 10 * nthreads());\n@btime matmulsums_perthread_channel($As_nu, $Bs_nu; ntasks = 10 * nthreads());\n````\n\n````\n  212.200 ms (962 allocations: 3.05 MiB)\n  212.014 ms (191 allocations: 4.04 MiB)\n  211.336 ms (190 allocations: 3.04 MiB)\n  168.835 ms (1136 allocations: 6.05 MiB)\n  169.097 ms (334 allocations: 3.04 MiB)\n  130.469 ms (2530 allocations: 30.17 MiB)\n  131.037 ms (1487 allocations: 3.14 MiB)\n\n````\n\nNote that the runtime of `matmulsums_perthread_channel` improves with increasing number\nof chunks/tasks (due to load balancing) while the amount of allocated memory doesn't\nincrease much. 
Contrast this with the drastic memory increase with `matmulsums_tlv`.\n\n### Another safe way based on `Channel`\n\nAbove, we chose to put a limited number of buffers (e.g. `nthreads()`) into the channel\nand then spawn many tasks (one per input element). Sometimes it can make sense to flip\nthings around and put the (many) input elements into a channel and only spawn\na limited number of tasks (e.g. `nthreads()`) with task-local buffers.\n\n````julia\nusing OhMyThreads: tmapreduce\n\nfunction matmulsums_perthread_channel_flipped(As, Bs; ntasks = nthreads())\n    N = size(first(As), 1)\n    chnl = Channel{Int}(length(As); spawn = true) do chnl\n        for i in 1:length(As)\n            put!(chnl, i)\n        end\n    end\n    tmapreduce(vcat, 1:ntasks; chunking=false) do _ # we turn chunking off\n        local C = Matrix{Float64}(undef, N, N)\n        map(chnl) do i # implicitly takes the values from the channel (parallel safe)\n            A = As[i]\n            B = Bs[i]\n            mul!(C, A, B)\n            sum(C)\n        end\n    end\nend;\n````\n\nNote that one caveat of this approach is that the input → task assignment, and thus the\norder of the output, is **non-deterministic**. For this reason, we sort the output to check\nfor correctness.\n\n````julia\nres_channel_flipped = matmulsums_perthread_channel_flipped(As_nu, Bs_nu)\nsort(res_nu) ≈ sort(res_channel_flipped)\n````\n\n````\ntrue\n````\n\nQuick benchmark:\n\n````julia\n@btime matmulsums_perthread_channel_flipped($As_nu, $Bs_nu);\n@btime matmulsums_perthread_channel_flipped($As_nu, $Bs_nu; ntasks = 2 * nthreads());\n@btime matmulsums_perthread_channel_flipped($As_nu, $Bs_nu; ntasks = 10 * nthreads());\n````\n\n````\n  137.431 ms (133 allocations: 3.04 MiB)\n  126.854 ms (211 allocations: 6.06 MiB)\n  127.647 ms (836 allocations: 30.29 MiB)\n\n````\n\nIn addition, OhMyThreads provides an iterator-wrapper type\n[`OhMyThreads.ChannelLike`](@ref) which can be used in place of a `Channel`. 
If\nthe number of elements is large this can be more efficient since there is no\nneed to copy the elements into the `Channel`. Concretely, in the example above,\nwe could replace `Channel() do .. end` with\n`OhMyThreads.ChannelLike(1:length(As))`.\n\n### Bumper.jl (only for the brave)\n\nIf you are bold and want to cut down temporary allocations even more you can\ngive [Bumper.jl](https://github.com/MasonProtter/Bumper.jl) a try. Essentially, it\nallows you to *bring your own stacks*, that is, task-local bump allocators which you can\ndynamically allocate memory to, and reset them at the end of a code block, just like\nJulia's stack.\nBe warned though that Bumper.jl is (1) a rather young package with (likely) some bugs\nand (2) can easily lead to segfaults when used incorrectly. If you can live with the\nrisk, Bumper.jl is especially useful for cases where we don't know ahead of time how large\na matrix to pre-allocate, and even more useful if we want to do many intermediate\nallocations on the task, not just one. For our example, this isn't the case but let's\nnonetheless see how one would use Bumper.jl here.\n\n````julia\nusing Bumper\n\nfunction matmulsums_bumper(As, Bs)\n    tmap(As, Bs) do A, B\n        @no_escape begin # promising that no memory will escape\n            N = size(A, 1)\n            C = @alloc(Float64, N, N) # from bump allocator (fake \"stack\")\n            mul!(C, A, B)\n            sum(C)\n        end\n    end\nend\n\nres_bumper = matmulsums_bumper(As, Bs);\nsort(res) ≈ sort(res_bumper)\n\n@btime matmulsums_bumper($As, $Bs);\n````\n\n````\n  9.439 ms (198 allocations: 39.25 KiB)\n\n````\n\nNote that the benchmark is lying here about the total memory allocation,\nbecause it doesn't show the allocation of the task-local bump allocators themselves\n(the reason is that `SlabBuffer` uses `malloc` directly).\n\n---\n\n*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*\n\n"
  },
  {
    "path": "docs/src/literate/tomarkdown.sh",
    "content": "#!/usr/bin/env sh\n#=\njulia --project -t 10 $0 $@\nexit\n# =#\n\nconst reporoot = joinpath(@__DIR__, \"../../..\")\nconst repourl = \"https://github.com/JuliaFolds2/OhMyThreads.jl/blob/main/docs\"\n\nusing Literate\nusing Pkg\n\nif length(ARGS) == 0\n    println(\"Error: Please provide the names of the folders that you want to compile to markdown. \" *\n    \"Alternatively, you can pass \\\"all\\\" as the first argument to compile them all.\")\n    exit()\nelse\n    if first(ARGS) == \"all\"\n        dirs = filter(isdir, readdir())\n    else\n        dirs = ARGS\n    end\nend\n@show dirs\n\nfor d in dirs\n    println(\"directory: \", d)\n    cd(d) do\n        Pkg.activate(\".\")\n        Pkg.resolve()\n        Pkg.instantiate()\n        jlfiles = filter(endswith(\".jl\"), readdir())\n        for f in jlfiles\n            Literate.markdown(\n                f,\n                repo_root_url = repourl,\n                execute=true;\n                # config=Dict(\"image_formats\" => [(MIME\"image/png\", \".png\")])\n            )\n        end\n    end\nend\n"
  },
  {
    "path": "docs/src/refs/api.md",
    "content": "```@meta\nCollapsedDocStrings = true\n```\n\n# [Public API](@id API)\n\n## Exported\n\n### Macros\n```@docs\n@tasks\n@set\n@local\n@only_one\n@one_by_one\n@allow_boxed_captures\n@disallow_boxed_captures\n@localize\n```\n\n### Functions\n\n```@docs\ntmapreduce\ntreduce\ntmap\ntmap!\ntforeach\ntcollect\ntreducemap\n```\n\n### Schedulers\n\n```@docs\nScheduler\nDynamicScheduler\nStaticScheduler\nGreedyScheduler\nSerialScheduler\n```\n\n## Re-exported\n\n|                        |                                                                     |\n|------------------------|---------------------------------------------------------------------|\n| `OhMyThreads.chunks`   | see [`ChunkSplitters.chunks`](@extref) |\n| `OhMyThreads.index_chunks`   | see [`ChunkSplitters.index_chunks`](@extref) |\n\n## Public but not exported\n\n|                        |                                                                     |\n|------------------------|---------------------------------------------------------------------|\n| `OhMyThreads.@spawn`   | see [`StableTasks.@spawn`](https://github.com/JuliaFolds2/StableTasks.jl) |\n| `OhMyThreads.@spawnat` | see [`StableTasks.@spawnat`](https://github.com/JuliaFolds2/StableTasks.jl) |\n| `OhMyThreads.@fetch`   | see [`StableTasks.@fetch`](https://github.com/JuliaFolds2/StableTasks.jl) |\n| `OhMyThreads.@fetchfrom` | see [`StableTasks.@fetchfrom`](https://github.com/JuliaFolds2/StableTasks.jl) |\n| `OhMyThreads.TaskLocalValue`   | see [TaskLocalValues.TaskLocalValue](https://github.com/vchuravy/TaskLocalValues.jl) |\n| `OhMyThreads.Split`   | see [`ChunkSplitters.Split`](@extref) |\n| `OhMyThreads.Consecutive`   | see [`ChunkSplitters.Consecutive`](@extref) |\n| `OhMyThreads.RoundRobin`   | see [`ChunkSplitters.RoundRobin`](@extref) |\n\n\n```@docs\nOhMyThreads.WithTaskLocals\nOhMyThreads.promise_task_local\nOhMyThreads.ChannelLike\n```\n"
  },
  {
    "path": "docs/src/refs/experimental.md",
    "content": "```@meta\nCollapsedDocStrings = true\n```\n\n# Experimental\n\n!!! warning\n    **Everything on this page is experimental and might be changed or dropped at any point!**\n\n## References\n\n```@autodocs\nModules = [OhMyThreads, OhMyThreads.Experimental]\nPublic = false\nPages   = [\"OhMyThreads.jl\", \"experimental.jl\"]\n```\n"
  },
  {
    "path": "docs/src/refs/internal.md",
    "content": "```@meta\nCollapsedDocStrings = true\n```\n\n# Internal\n\n!!! warning\n    **Everything on this page is internal and might be changed or dropped at any point!**\n\n## References\n\n```@autodocs\nModules = [OhMyThreads, OhMyThreads.Tools]\nPublic = false\nPages   = [\"OhMyThreads.jl\", \"tools.jl\"]\n```\n"
  },
  {
    "path": "docs/src/translation.md",
    "content": "# [Translation Guide](@id TG)\n\nThis page tries to give a general overview of how to translate patterns written with the built-in tools of [Base.Threads](https://docs.julialang.org/en/v1/base/multi-threading/) using the [OhMyThreads.jl API](@ref API). Note that this should be seen as a rough guide and (intentionally) isn't supposed to replace a systematic introduction into OhMyThreads.jl.\n\n\n## Basics\n\n### `@threads`\n\n```julia\n# Base.Threads\nusing Base.Threads: @threads\n\n@threads for i in 1:10\n    println(i)\nend\n```\n\n```julia\n# OhMyThreads\nusing OhMyThreads: @tasks\n\n@tasks for i in 1:10\n    println(i)\nend\n\n# or\nusing OhMyThreads: tforeach\n\ntforeach(1:10) do i\n    println(i)\nend\n```\n\n#### `:static` scheduling\n\n```julia\n# Base.Threads\nusing Base.Threads: @threads\n\n@threads :static for i in 1:10\n    println(i)\nend\n```\n\n```julia\n# OhMyThreads\nusing OhMyThreads: @tasks\n\n@tasks for i in 1:10\n    @set scheduler=:static\n    println(i)\nend\n\n# or\nusing OhMyThreads: tforeach\n\ntforeach(1:10; scheduler=:static) do i\n    println(i)\nend\n```\n\n### `@spawn`\n\n```julia\n# Base.Threads\nusing Base.Threads: @spawn\n\n@sync for i in 1:10\n    @spawn println(i)\nend\n```\n\n```julia\n# OhMyThreads\nusing OhMyThreads: @tasks\n\n@tasks for i in 1:10\n    @set chunking=false\n    println(i)\nend\n\n# or\nusing OhMyThreads: tforeach\n\ntforeach(1:10; chunking=false) do i\n    println(i)\nend\n\n# or\nusing OhMyThreads: @spawn\n\n@sync for i in 1:10\n    @spawn println(i)\nend\n```\n\n## Reduction\n\nNo built-in feature in Base.Threads.\n\n```julia\n# Base.Threads: basic manual implementation\nusing Base.Threads: @spawn\n\ndata = rand(10)\nchunks_itr = Iterators.partition(data, length(data) ÷ nthreads())\ntasks = map(chunks_itr) do chunk\n    @spawn reduce(+, chunk)\nend\nreduce(+, fetch.(tasks))\n```\n\n```julia\n# OhMyThreads\nusing OhMyThreads: @tasks\ndata = rand(10)\n\n@tasks for x in data\n    @set 
reducer=+\nend\n\n# or\nusing OhMyThreads: treduce\n\ntreduce(+, data)\n```\n\n## Mutation\n\n!!! warning\n    Parallel mutation of non-local state, like writing to a shared array, can be the source of correctness errors (e.g. race conditions) and big performance issues (e.g. [false sharing](https://en.wikipedia.org/wiki/False_sharing#:~:text=False%20sharing%20is%20an%20inherent,is%20limited%20to%20RAM%20caches.)). You should carefully consider whether this is necessary or whether the use of [thread-safe storage](@ref TSS) is the better option. **We don't recommend using the examples in this section for anything serious!**\n\n```julia\n# Base.Threads\nusing Base.Threads: @threads\ndata = rand(10)\n\n@threads for i in eachindex(data)\n    data[i] = calc(i)\nend\n```\n\n```julia\n# OhMyThreads\nusing OhMyThreads: @tasks\ndata = rand(10)\n\n@tasks for i in eachindex(data)\n    data[i] = calc(i)\nend\n\n# or\nusing OhMyThreads: tforeach\n\ntforeach(eachindex(data)) do i\n    data[i] = calc(i)\nend\n\n# or\nusing OhMyThreads: tmap!\n\ntmap!(data, eachindex(data)) do i\n    calc(i)\nend\n```\n\n## Parallel initialization\n\n!!! warning\n    Parallel mutation of non-local state, like writing to a shared array, can be the source of correctness errors (e.g. race conditions) and big performance issues (e.g. [false sharing](https://en.wikipedia.org/wiki/False_sharing#:~:text=False%20sharing%20is%20an%20inherent,is%20limited%20to%20RAM%20caches.)). You should carefully consider whether this is necessary or whether the use of [thread-safe storage](@ref TSS) is the better option. 
**We don't recommend using the examples in this section for anything serious!**\n\n```julia\n# Base.Threads\nusing Base.Threads: @threads\n\ndata = Vector{Float64}(undef, 10)\n@threads for i in eachindex(data)\n    data[i] = calc(i)\nend\n```\n\n```julia\n# OhMyThreads\nusing OhMyThreads: @tasks\n\ndata = @tasks for i in 1:10\n    @set collect=true\n    calc(i)\nend\n\n# or\nusing OhMyThreads: tmap\n\ndata = tmap(i->calc(i), 1:10)\n\n# or\nusing OhMyThreads: tcollect\n\ndata = tcollect(calc(i) for i in 1:10)\n```\n"
  },
  {
    "path": "ext/MarkdownExt.jl",
    "content": "module MarkdownExt\n\nusing Markdown: Markdown, @md_str, term\nusing OhMyThreads.Implementation: BoxedVariableError\n\nfunction __init__()\n    if isdefined(Base.Experimental, :register_error_hint)\n        Base.Experimental.register_error_hint(BoxedVariableError) do io, bve\n            println(io)\n            println(io)\n            term(io, md\"\"\"\n#### Hint\nCapturing boxed variables can be not only slow, but also cause surprising and incorrect results. \n\n* If you meant for these variables to be local to each loop iteration and not depend on a variable from an outer scope, you should mark them as `local` inside the closure.\n* If you meant to reference a variable from the outer scope, but do not want access to it to be boxed, you can wrap uses of it in a let block, like e.g.\n```julia\nfunction foo(x, N)\n    rand(Bool) && x = 1 # This rebinding of x causes it to be boxed ...\n    let x = x # ... Unless we localize it here with the let block \n        @tasks for i in 1:N\n            f(x)    \n        end\n    end\nend\n```\n* OhMyThreads.jl provides a `@localize` macro that automates the above `let` block, i.e. `@localize x f(x)` is the same as `let x=x; f(x) end`\n* If these variables are being re-bound inside a `@one_by_one` or `@only_one` block, consider using a mutable `Ref` instead of re-binding the variable.\n\nThis error can be bypassed with the `@allow_boxed_captures` macro.\n    \"\"\")\n        end\n    end\nend \n\n\n\nend\n"
  },
  {
    "path": "src/OhMyThreads.jl",
    "content": "module OhMyThreads\n\nusing StableTasks: StableTasks\nfor mac in Symbol.([\"@spawn\", \"@spawnat\", \"@fetch\", \"@fetchfrom\"])\n    @eval const $mac = getproperty(StableTasks, $(QuoteNode(mac)))\nend\n\nusing ChunkSplitters: ChunkSplitters\nconst index_chunks = ChunkSplitters.index_chunks\nconst chunks = ChunkSplitters.chunks\nconst Split = ChunkSplitters.Split\nconst Consecutive = ChunkSplitters.Consecutive\nconst RoundRobin = ChunkSplitters.RoundRobin\nexport chunks, index_chunks\n\nusing TaskLocalValues: TaskLocalValues\nconst TaskLocalValue = TaskLocalValues.TaskLocalValue\n\nusing ScopedValues: ScopedValues, ScopedValue, @with\n\ninclude(\"types.jl\")\ninclude(\"functions.jl\")\ninclude(\"macros.jl\")\n\ninclude(\"tools.jl\")\ninclude(\"schedulers.jl\")\nusing .Schedulers: Scheduler, DynamicScheduler, StaticScheduler, GreedyScheduler,\n                   SerialScheduler\ninclude(\"implementation.jl\")\ninclude(\"experimental.jl\")\n\nexport @tasks, @set, @local, @one_by_one, @only_one, @allow_boxed_captures, @disallow_boxed_captures, @localize\nexport treduce, tmapreduce, treducemap, tmap, tmap!, tforeach, tcollect\nexport Scheduler, DynamicScheduler, StaticScheduler, GreedyScheduler, SerialScheduler\n\nend # module OhMyThreads\n"
  },
  {
    "path": "src/experimental.jl",
    "content": "module Experimental\n\n\"\"\"\n    @barrier\n\nThis can be used inside a `@tasks for ... end` to synchronize `n` parallel tasks.\nSpecifically, a task can only pass the `@barrier` if `n-1` other tasks have reached it\nas well. The value of `n` is determined from `@set ntasks=...`, which\nis required if one wants to use `@barrier`.\n\nBecause this feature is experimental, it is required to load `@barrier` explicitly, e.g. via\n`using OhMyThreads.Experimental: @barrier`.\n\n**WARNING:** It is the responsibility of the user to ensure that the right number of tasks\nactually reach the barrier. Otherwise, a **deadlock** can occur. In particular, if the\nnumber of iterations is not a multiple of `n`, the last few iterations (remainder) will be\nrun by fewer than `n` tasks which will never be able to pass a `@barrier`.\n\n## Example\n\n```julia\nusing OhMyThreads: @tasks\n\n# works\n@tasks for i in 1:20\n    @set ntasks = 20\n\n    sleep(i * 0.2)\n    println(i, \": before\")\n    @barrier\n    println(i, \": after\")\nend\n\n# wrong - deadlock!\n@tasks for i in 1:22 # niterations % ntasks != 0\n    @set ntasks = 20\n\n    println(i, \": before\")\n    @barrier\n    println(i, \": after\")\nend\n```\n\"\"\"\nmacro barrier(args...)\n    error(\"The @barrier macro may only be used inside of a @tasks block.\")\nend\n\nend # Experimental\n"
  },
  {
    "path": "src/functions.jl",
    "content": "\"\"\"\n    tmapreduce(f, op, A::AbstractArray...;\n               [scheduler::Union{Scheduler, Symbol} = :dynamic],\n               [outputtype::Type = Any],\n               [init])\n\nA multithreaded function like `Base.mapreduce`. Perform a reduction over `A`, applying a\nsingle-argument function `f` to each element, and then combining them with the two-argument\nfunction `op`.\n\nNote that `op` **must** be an\n[associative](https://en.wikipedia.org/wiki/Associative_property) function, in the sense\nthat `op(a, op(b, c)) ≈ op(op(a, b), c)`. If `op` is not (approximately) associative, you\nwill get undefined results.\n\n## Example:\n\n```\nusing OhMyThreads: tmapreduce\n\ntmapreduce(√, +, [1, 2, 3, 4, 5])\n```\n\nis the parallelized version of `sum(√, [1, 2, 3, 4, 5])` in the form\n\n```\n(√1 + √2) + (√3 + √4) + √5\n```\n\n## Keyword arguments:\n\n- `scheduler::Union{Scheduler, Symbol}` (default `:dynamic`): determines how the computation is divided into parallel tasks and how these are scheduled. See [`Scheduler`](@ref) for more information on the available schedulers.\n- `outputtype::Type` (default `Any`): will work as the asserted output type of parallel calculations. We use [StableTasks.jl](https://github.com/JuliaFolds2/StableTasks.jl) to make setting this option unnecessary, but if you experience problems with type stability, you may be able to recover it with this keyword argument.\n- `init`: initial value of the reduction. Will be forwarded to `mapreduce` for the task-local sequential parts of the calculation.\n\nIn addition, `tmapreduce` accepts **all keyword arguments that are supported by the selected\nscheduler**. They will simply be passed on to the corresponding `Scheduler` constructor. 
Example:\n```\ntmapreduce(√, +, [1, 2, 3, 4, 5]; chunksize=2, scheduler=:static)\n```\nHowever, to avoid ambiguity, this is currently **only supported for `scheduler::Symbol`**\n(but not for `scheduler::Scheduler`).\n\"\"\"\nfunction tmapreduce end\n\n\"\"\"\n    treducemap(op, f, A::AbstractArray...;\n               [scheduler::Union{Scheduler, Symbol} = :dynamic],\n               [outputtype::Type = Any],\n               [init])\n\nLike `tmapreduce` except the order of the `f` and `op` arguments are switched. This is\nsometimes convenient with `do`-block notation. Perform a reduction over `A`, applying a\nsingle-argument function `f` to each element, and then combining them with the two-argument\nfunction `op`.\n\nNote that `op` **must** be an\n[associative](https://en.wikipedia.org/wiki/Associative_property) function, in the sense\nthat `op(a, op(b, c)) ≈ op(op(a, b), c)`. If `op` is not (approximately) associative, you\nwill get undefined results.\n\n## Example:\n\n```\nusing OhMyThreads: treducemap\n\ntreducemap(+, √, [1, 2, 3, 4, 5])\n```\n\nis the parallelized version of `sum(√, [1, 2, 3, 4, 5])` in the form\n\n```\n(√1 + √2) + (√3 + √4) + √5\n```\n\n## Keyword arguments:\n\n- `scheduler::Union{Scheduler, Symbol}` (default `:dynamic`): determines how the computation is divided into parallel tasks and how these are scheduled. See [`Scheduler`](@ref) for more information on the available schedulers.\n- `outputtype::Type` (default `Any`): will work as the asserted output type of parallel calculations. We use [StableTasks.jl](https://github.com/JuliaFolds2/StableTasks.jl) to make setting this option unnecessary, but if you experience problems with type stability, you may be able to recover it with this keyword argument.\n- `init`: initial value of the reduction. Will be forwarded to `mapreduce` for the task-local sequential parts of the calculation.\n\nIn addition, `treducemap` accepts **all keyword arguments that are supported by the selected\nscheduler**. 
They will simply be passed on to the corresponding `Scheduler` constructor. Example:\n```\ntreducemap(+, √, [1, 2, 3, 4, 5]; chunksize=2, scheduler=:static)\n```\nHowever, to avoid ambiguity, this is currently **only supported for `scheduler::Symbol`**\n(but not for `scheduler::Scheduler`).\n\"\"\"\nfunction treducemap end\n\n\"\"\"\n    treduce(op, A::AbstractArray...;\n            [scheduler::Union{Scheduler, Symbol} = :dynamic],\n            [outputtype::Type = Any],\n            [init])\n\nA multithreaded function like `Base.reduce`. Perform a reduction over `A` using the\ntwo-argument function `op`.\n\nNote that `op` **must** be an\n[associative](https://en.wikipedia.org/wiki/Associative_property) function, in the sense\nthat `op(a, op(b, c)) ≈ op(op(a, b), c)`. If `op` is not (approximately) associative, you\nwill get undefined results.\n\n## Example:\n\n```\nusing OhMyThreads: treduce\n\ntreduce(+, [1, 2, 3, 4, 5])\n```\n\nis the parallelized version of `sum([1, 2, 3, 4, 5])` in the form\n\n```\n(1 + 2) + (3 + 4) + 5\n```\n\n## Keyword arguments:\n\n- `scheduler::Union{Scheduler, Symbol}` (default `:dynamic`): determines how the computation is divided into parallel tasks and how these are scheduled. See [`Scheduler`](@ref) for more information on the available schedulers.\n- `outputtype::Type` (default `Any`): will work as the asserted output type of parallel calculations. We use [StableTasks.jl](https://github.com/JuliaFolds2/StableTasks.jl) to make setting this option unnecessary, but if you experience problems with type stability, you may be able to recover it with this keyword argument.\n- `init`: initial value of the reduction. Will be forwarded to `mapreduce` for the task-local sequential parts of the calculation.\n\nIn addition, `treduce` accepts **all keyword arguments that are supported by the selected\nscheduler**. They will simply be passed on to the corresponding `Scheduler` constructor. 
Example:\n```\ntreduce(+, [1, 2, 3, 4, 5]; chunksize=2, scheduler=:static)\n```\nHowever, to avoid ambiguity, this is currently **only supported for `scheduler::Symbol`**\n(but not for `scheduler::Scheduler`).\n\"\"\"\nfunction treduce end\n\n\"\"\"\n    tforeach(f, A::AbstractArray...;\n             [scheduler::Union{Scheduler, Symbol} = :dynamic]) :: Nothing\n\nA multithreaded function like `Base.foreach`. Apply `f` to each element of `A` on\nmultiple parallel tasks, and return `nothing`. I.e. it is the parallel equivalent of\n\n```\nfor x in A\n    f(x)\nend\n```\n\n## Example:\n\n```\nusing OhMyThreads: tforeach\n\ntforeach(1:10) do i\n    println(i^2)\nend\n```\n\n## Keyword arguments:\n\n- `scheduler::Union{Scheduler, Symbol}` (default `:dynamic`): determines how the computation is divided into parallel tasks and how these are scheduled. See [`Scheduler`](@ref) for more information on the available schedulers.\n\nIn addition, `tforeach` accepts **all keyword arguments that are supported by the selected\nscheduler**. They will simply be passed on to the corresponding `Scheduler` constructor. Example:\n```\ntforeach(1:10; chunksize=2, scheduler=:static) do i\n    println(i^2)\nend\n```\nHowever, to avoid ambiguity, this is currently **only supported for `scheduler::Symbol`**\n(but not for `scheduler::Scheduler`).\n\"\"\"\nfunction tforeach end\n\n\"\"\"\n    tmap(f, [OutputElementType], A::AbstractArray...;\n         [scheduler::Union{Scheduler, Symbol} = :dynamic])\n\nA multithreaded function like `Base.map`. 
Create a new container `similar` to `A` and fills\nit in parallel such that the `i`th element is equal to `f(A[i])`.\n\nThe optional argument `OutputElementType` will select a specific element type for the\nreturned container, and will generally incur fewer allocations than the version where\n`OutputElementType` is not specified.\n\n## Example:\n\n```\nusing OhMyThreads: tmap\n\ntmap(sin, 1:10)\n```\n\n## Keyword arguments:\n\n- `scheduler::Union{Scheduler, Symbol}` (default `:dynamic`): determines how the computation is divided into parallel tasks and how these are scheduled. See [`Scheduler`](@ref) for more information on the available schedulers.\n\nIn addition, `tmap` accepts **all keyword arguments that are supported by the selected\nscheduler**. They will simply be passed on to the corresponding `Scheduler` constructor. Example:\n```\ntmap(sin, 1:10; chunksize=2, scheduler=:static)\n```\nHowever, to avoid ambiguity, this is currently **only supported for `scheduler::Symbol`**\n(but not for `scheduler::Scheduler`).\n\"\"\"\nfunction tmap end\n\n\"\"\"\n    tmap!(f, out, A::AbstractArray...;\n          [scheduler::Union{Scheduler, Symbol} = :dynamic])\n\nA multithreaded function like `Base.map!`. In parallel on multiple tasks, this function\nassigns each element of `out[i] = f(A[i])` for each index `i` of `A` and `out`.\n\n## Keyword arguments:\n\n- `scheduler::Union{Scheduler, Symbol}` (default `:dynamic`): determines how the computation is divided into parallel tasks and how these are scheduled. See [`Scheduler`](@ref) for more information on the available schedulers.\n\nIn addition, `tmap!` accepts **all keyword arguments that are supported by the selected\nscheduler**. They will simply be passed on to the corresponding `Scheduler` constructor.\nHowever, to avoid ambiguity, this is currently **only supported for `scheduler::Symbol`**\n(but not for `scheduler::Scheduler`).\n\"\"\"\nfunction tmap! 
end\n\n\"\"\"\n    tcollect([OutputElementType], gen::Union{AbstractArray, Generator{<:AbstractArray}};\n             [scheduler::Union{Scheduler, Symbol} = :dynamic])\n\nA multithreaded function like `Base.collect`. Essentially just calls `tmap` on the\ngenerator function and inputs.\n\nThe optional argument `OutputElementType` will select a specific element type for the\nreturned container, and will generally incur fewer allocations than the version where\n`OutputElementType` is not specified.\n\n## Example:\n\n```\nusing OhMyThreads: tcollect\n\ntcollect(sin(i) for i in 1:10)\n```\n\n## Keyword arguments:\n\n- `scheduler::Union{Scheduler, Symbol}` (default `:dynamic`): determines how the computation is divided into parallel tasks and how these are scheduled. See [`Scheduler`](@ref) for more information on the available schedulers.\n\nIn addition, `tcollect` accepts **all keyword arguments that are supported by the selected\nscheduler**. They will simply be passed on to the corresponding `Scheduler` constructor. Example:\n```\ntcollect(sin(i) for i in 1:10; chunksize=2, scheduler=:static)\n```\nHowever, to avoid ambiguity, this is currently **only supported for `scheduler::Symbol`**\n(but not for `scheduler::Scheduler`).\n\"\"\"\nfunction tcollect end\n"
  },
  {
    "path": "src/implementation.jl",
    "content": "module Implementation\n\nimport OhMyThreads: treduce, tmapreduce, treducemap, tforeach, tmap, tmap!, tcollect\nusing OhMyThreads: @spawn, @spawnat, WithTaskLocals, promise_task_local, ChannelLike,\n                   allowing_boxed_captures\nusing OhMyThreads.Tools: nthtid\nusing OhMyThreads: Scheduler,\n                   DynamicScheduler, StaticScheduler, GreedyScheduler,\n                   SerialScheduler\nusing OhMyThreads.Schedulers: chunksplitter_mode, chunking_enabled,\n                              nchunks, chunksize, chunksplit, minchunksize, has_chunksplit,\n                              has_minchunksize, chunkingargs_to_kwargs,\n                              chunking_mode, ChunkingMode, NoChunking,\n                              FixedSize, FixedCount, scheduler_from_symbol, NotGiven,\n                              isgiven, threadpool as get_threadpool\nusing Base: @propagate_inbounds\nusing Base.Threads: nthreads, @threads\nusing BangBang: append!!\nusing ChunkSplitters: ChunkSplitters, index_chunks, Consecutive\nusing ChunkSplitters.Internals: AbstractChunks, IndexChunks\n\nconst MaybeScheduler = Union{NotGiven, Scheduler, Symbol, Val}\n\ninclude(\"macro_impl.jl\")\n\n@inline function _index_chunks(sched, arg)\n    C = chunking_mode(sched)\n    @assert chunking_enabled(sched)\n    kwargs = chunkingargs_to_kwargs(sched, arg)\n    return index_chunks(arg; kwargs...)::IndexChunks{typeof(arg), chunksplitter_mode(C)}\nend\n\nfunction _scheduler_from_userinput(scheduler::MaybeScheduler; kwargs...)\n    if scheduler isa Scheduler\n        isempty(kwargs) || scheduler_and_kwargs_err(; kwargs...)\n        _scheduler = scheduler\n    elseif scheduler isa Symbol\n        _scheduler = scheduler_from_symbol(scheduler; kwargs...)\n    else # default fallback\n        _scheduler = DynamicScheduler(; kwargs...)\n    end\nend\n\nfunction _check_chunks_incompatible_kwargs(; kwargs...)\n    ks = keys(kwargs)\n    if :ntasks in ks || :nchunks in ks || 
:chunksize in ks || :split in ks\n        error(\"You've provided `chunks` or `index_chunks` as input and, at the same time, \" *\n              \"chunking related keyword arguments (e.g. `ntasks`, `chunksize`, or `split`). \" *\n              \"This isn't supported. \" *\n              \"Set the chunking options directly in the `chunks` or `index_chunks` call or drop the latter.\")\n    end\n    if :chunking in ks\n        for (k, v) in kwargs\n            if k == :chunking && v == true\n                error(\"You've provided `chunks` or `index_chunks` as input and, at the same time, \" *\n                      \"have set chunking=true. This isn't supported.\")\n            end\n        end\n    end\n    return nothing\nend\n\nfunction has_multiple_chunks(scheduler, coll)\n    C = chunking_mode(scheduler)\n    if C == NoChunking || coll isa Union{AbstractChunks, ChunkSplitters.Internals.Enumerate}\n        length(coll) > 1\n    elseif C == FixedCount\n        if !has_minchunksize(scheduler)\n            mcs = 1\n        else\n            mcs = max(min(minchunksize(scheduler), length(coll)), 1)\n        end\n        min(length(coll) ÷ mcs, nchunks(scheduler)) > 1\n    elseif C == FixedSize\n        length(coll) ÷ chunksize(scheduler) > 1\n    else\n        throw(ArgumentError(\"Unknown chunking mode: $C.\"))\n    end\nend\n\n# we can inline this function because we use @noinline on the main function\n# it can save some time in cases where we do not hit the main function (e.g. when\n# fallback to mapreduce without any threading)\n@inline function tmapreduce(f, op, Arrs...;\n        scheduler::MaybeScheduler = NotGiven(),\n        outputtype::Type = Any,\n        init = NotGiven(),\n        kwargs...)\n    mapreduce_kwargs = isgiven(init) ? 
(; init) : (;)\n    _scheduler = _scheduler_from_userinput(scheduler; kwargs...)\n\n    A = first(Arrs)\n    if A isa AbstractChunks || A isa ChunkSplitters.Internals.Enumerate\n        _check_chunks_incompatible_kwargs(; kwargs...)\n    end\n    if _scheduler isa SerialScheduler || !has_multiple_chunks(_scheduler, first(Arrs))\n        # empty input collection → align with Base.mapreduce behavior\n        mapreduce(f, op, Arrs...; mapreduce_kwargs...)\n    else\n        @noinline _tmapreduce(f, op, Arrs, outputtype, _scheduler, mapreduce_kwargs)\n    end\nend\n\n@noinline function scheduler_and_kwargs_err(; kwargs...)\n    kwargstr = join(string.(keys(kwargs)), \", \")\n    throw(ArgumentError(\"Providing an explicit scheduler as well as direct keyword arguments (e.g. $(kwargstr)) is currently not supported.\"))\nend\n\ntreducemap(op, f, A...; kwargs...) = tmapreduce(f, op, A...; kwargs...)\n\n\n# DynamicScheduler: AbstractArray/Generic\nfunction _tmapreduce(f,\n        op,\n        Arrs,\n        ::Type{OutputType},\n        scheduler::DynamicScheduler,\n        mapreduce_kwargs)::OutputType where {OutputType}\n    threadpool = get_threadpool(scheduler)\n    check_all_have_same_indices(Arrs)\n    throw_if_boxed_captures(f, op)\n    if chunking_enabled(scheduler)\n        tasks = map(_index_chunks(scheduler, first(Arrs))) do inds\n\n            args = map(A -> view(A, inds), Arrs)\n            # Note, calling `promise_task_local` here is only safe because we're assuming that\n            # Base.mapreduce isn't going to magically try to do multithreading on us...\n            @spawn threadpool mapreduce(promise_task_local(f), promise_task_local(op),\n                                        args...; $mapreduce_kwargs...)\n        end\n        mapreduce(fetch, promise_task_local(op), tasks)\n    else\n        tasks = map(eachindex(first(Arrs))) do i\n            args = map(A -> @inbounds(A[i]), Arrs)\n            @spawn threadpool promise_task_local(f)(args...)\n     
   end\n        mapreduce(fetch, promise_task_local(op), tasks; mapreduce_kwargs...)\n    end\nend\n\n# DynamicScheduler: AbstractChunks\nfunction _tmapreduce(f,\n        op,\n        Arrs::Union{Tuple{AbstractChunks{T}}, Tuple{ChunkSplitters.Internals.Enumerate{T}}},\n        ::Type{OutputType},\n        scheduler::DynamicScheduler,\n        mapreduce_kwargs)::OutputType where {OutputType, T}\n    threadpool = get_threadpool(scheduler)\n    throw_if_boxed_captures(f, op)\n    tasks = map(only(Arrs)) do idcs\n        @spawn threadpool promise_task_local(f)(idcs)\n    end\n    mapreduce(fetch, promise_task_local(op), tasks; mapreduce_kwargs...)\nend\n\n# StaticScheduler: AbstractArray/Generic\nfunction _tmapreduce(f,\n        op,\n        Arrs,\n        ::Type{OutputType},\n        scheduler::StaticScheduler,\n        mapreduce_kwargs)::OutputType where {OutputType}\n    nt = nthreads()\n    check_all_have_same_indices(Arrs)\n    throw_if_boxed_captures(f, op)\n    if chunking_enabled(scheduler)\n        tasks = map(enumerate(_index_chunks(scheduler, first(Arrs)))) do (c, inds)\n            tid = @inbounds nthtid(mod1(c, nt))\n            args = map(A -> view(A, inds), Arrs)\n            # Note, calling `promise_task_local` here is only safe because we're assuming that\n            # Base.mapreduce isn't going to magically try to do multithreading on us...\n            @spawnat tid mapreduce(promise_task_local(f), promise_task_local(op), args...;\n                mapreduce_kwargs...)\n        end\n        # Note, calling `promise_task_local` here is only safe because we're assuming that\n        # Base.mapreduce isn't going to magically try to do multithreading on us...\n        mapreduce(fetch, promise_task_local(op), tasks)\n    else\n        tasks = map(enumerate(eachindex(first(Arrs)))) do (c, i)\n            tid = @inbounds nthtid(mod1(c, nt))\n            args = map(A -> @inbounds(A[i]), Arrs)\n            @spawnat tid promise_task_local(f)(args...)\n        
end\n        # Note, calling `promise_task_local` here is only safe because we're assuming that\n        # Base.mapreduce isn't going to magically try to do multithreading on us...\n        mapreduce(fetch, promise_task_local(op), tasks; mapreduce_kwargs...)\n    end\nend\n\n# StaticScheduler: AbstractChunks\nfunction _tmapreduce(f,\n        op,\n        Arrs::Tuple{AbstractChunks{T}}, # we don't support multiple chunks for now\n        ::Type{OutputType},\n        scheduler::StaticScheduler,\n        mapreduce_kwargs)::OutputType where {OutputType, T}\n    check_all_have_same_indices(Arrs)\n    throw_if_boxed_captures(f, op)\n    chnks = only(Arrs)\n    nt = nthreads()\n    tasks = map(enumerate(chnks)) do (c, idcs)\n        tid = @inbounds nthtid(mod1(c, nt))\n        # Note, calling `promise_task_local` here is only safe because we're assuming that\n        # Base.mapreduce isn't going to magically try to do multithreading on us...\n        @spawnat tid promise_task_local(f)(idcs)\n    end\n    # Note, calling `promise_task_local` here is only safe because we're assuming that\n    # Base.mapreduce isn't going to magically try to do multithreading on us...\n    mapreduce(fetch, promise_task_local(op), tasks; mapreduce_kwargs...)\nend\n\n# NOTE: once v1.12 releases we should switch this to wait(t; throw=false)\nwait_nothrow(t) = Base._wait(t)\n\n\n\"\"\"\n    empty_collection_error(task)\n\nCheck if a task failed due to an empty collection error.\n\"\"\"\nfunction empty_collection_error end\n\n@static if VERSION < v\"1.11.0-\"\n    function empty_collection_error(task)\n        task.result isa MethodError && task.result.f == Base.mapreduce_empty\n    end\nelse\n    function empty_collection_error(task)\n        task.result isa ArgumentError &&\n            task.result.msg ==\n            \"reducing over an empty collection is not allowed; consider supplying `init` to the reducer\"\n    end\nend\n\n# GreedyScheduler w/o chunking\nfunction _tmapreduce(f,\n        
op,\n        Arrs,\n        ::Type{OutputType},\n        scheduler::GreedyScheduler{NoChunking},\n        mapreduce_kwargs)::OutputType where {OutputType}\n    ntasks_desired = scheduler.ntasks\n    if Base.IteratorSize(first(Arrs)) isa Base.SizeUnknown\n        ntasks = ntasks_desired\n        ch_len = 0\n    else\n        check_all_have_same_indices(Arrs)\n        ntasks = min(length(first(Arrs)), ntasks_desired)\n        ch_len = length(first(Arrs))\n    end\n    throw_if_boxed_captures(f, op)\n    # TODO: Use ChannelLike for iterators that support it. Dispatch on IndexLinear?\n    ch = Channel{Tuple{eltype.(Arrs)...}}(ch_len; spawn = true) do ch\n        for args in zip(Arrs...)\n            put!(ch, args)\n        end\n    end\n    tasks = map(1:ntasks) do _\n        # Note, calling `promise_task_local` here is only safe because we're assuming that\n        # Base.mapreduce isn't going to magically try to do multithreading on us...\n        @spawn mapreduce(promise_task_local(op), ch; mapreduce_kwargs...) do args\n            promise_task_local(f)(args...)\n        end\n    end\n    # Doing this because of https://github.com/JuliaFolds2/OhMyThreads.jl/issues/82\n    # The idea is that if the channel gets fully consumed before a task gets started up,\n    # then if the user does not supply an `init` kwarg, we'll get an error.\n    # Current way of dealing with this is just filtering out `mapreduce_empty` method\n    # errors. This may not be the most stable way of dealing with things, e.g. 
if the\n    # name of the function throwing the error changes this could break, so long term\n    # we may want to try a different design.\n    filtered_tasks = filter(tasks) do stabletask\n        task = stabletask.t\n        istaskdone(task) || wait_nothrow(task)\n        if empty_collection_error(task)\n            false\n        else\n            true\n        end\n    end\n    # Note, calling `promise_task_local` here is only safe because we're assuming that\n    # Base.mapreduce isn't going to magically try to do multithreading on us...\n    mapreduce(fetch, promise_task_local(op), filtered_tasks; mapreduce_kwargs...)\nend\n\n# GreedyScheduler w/ chunking\nfunction _tmapreduce(f,\n        op,\n        Arrs,\n        ::Type{OutputType},\n        scheduler::GreedyScheduler,\n        mapreduce_kwargs)::OutputType where {OutputType}\n    if Base.IteratorSize(first(Arrs)) isa Base.SizeUnknown\n        throw(ArgumentError(\"SizeUnkown iterators in combination with a greedy scheduler and chunking are currently not supported.\"))\n    end\n    check_all_have_same_indices(Arrs)\n    throw_if_boxed_captures(f, op)\n    chnks = _index_chunks(scheduler, first(Arrs))\n    ntasks_desired = scheduler.ntasks\n    ntasks = min(length(chnks), ntasks_desired)\n\n    # ChunkSplitters.IndexChunks support everything needed for ChannelLike\n    ch = ChannelLike(chnks)\n\n    tasks = map(1:ntasks) do _\n        # Note, calling `promise_task_local` here is only safe because we're assuming that\n        # Base.mapreduce isn't going to magically try to do multithreading on us...\n        @spawn mapreduce(promise_task_local(op), ch; mapreduce_kwargs...) 
do inds\n            args = map(A -> view(A, inds), Arrs)\n            mapreduce(promise_task_local(f), promise_task_local(op), args...)\n        end\n    end\n    # Doing this because of https://github.com/JuliaFolds2/OhMyThreads.jl/issues/82\n    # The idea is that if the channel gets fully consumed before a task gets started up,\n    # then if the user does not supply an `init` kwarg, we'll get an error.\n    # Current way of dealing with this is just filtering out `mapreduce_empty` method\n    # errors. This may not be the most stable way of dealing with things, e.g. if the\n    # name of the function throwing the error changes this could break, so long term\n    # we may want to try a different design.\n    filtered_tasks = filter(tasks) do stabletask\n        task = stabletask.t\n        istaskdone(task) || wait_nothrow(task)\n        if empty_collection_error(task)\n            false\n        else\n            true\n        end\n    end\n    # Note, calling `promise_task_local` here is only safe because we're assuming that\n    # Base.mapreduce isn't going to magically try to do multithreading on us...\n    mapreduce(fetch, promise_task_local(op), filtered_tasks; mapreduce_kwargs...)\nend\n\nfunction check_all_have_same_indices(Arrs)\n    let A = first(Arrs), Arrs = Arrs[2:end]\n        if !all(B -> eachindex(A) == eachindex(B), Arrs)\n            error(\"The indices of the input arrays must match the indices of the output array.\")\n        end\n    end\nend\n\nstruct BoxedVariableError <: Exception\n    vars::Vector{Symbol}\nend\nfunction Base.showerror(io::IO, bve::BoxedVariableError)\n    boxed_fields = join(bve.vars, \", \")\n    suffix = length(bve.vars) > 1 ? 
\"s\" : \"\"\n    print(io, \"Attempted to capture and modify outer local variable$(suffix): \")\n    printstyled(io, boxed_fields; color=:red)\n    print(io, \"\\n\\nSee https://juliafolds2.github.io/OhMyThreads.jl/stable/literate/boxing/boxing/ for a fuller explanation.\")\n    if isdefined(Base.Experimental, :show_error_hints)\n        Base.Experimental.show_error_hints(io, bve)\n    end\nend\n\nfunction throw_if_boxed_captures(f)\n    if allowing_boxed_captures[]\n        return nothing\n    end\n    T = typeof(f)\n    if any(FT -> FT <: Core.Box, fieldtypes(T))\n        boxed_fields = [fieldname(T, i) for i in 1:fieldcount(T) if fieldtype(T,i) <: Core.Box]\n        throw(BoxedVariableError(boxed_fields))\n    end\n    for i ∈ 1:fieldcount(T)\n        # recurse into nested captured functions.\n        if fieldtype(T, i) <: Function\n            f_inner = getfield(f, i)\n            if f !== f_inner\n                # don't recurse into self!\n                throw_if_boxed_captures(getfield(f, i))\n            end\n        end\n    end\nend\n\nfunction throw_if_boxed_captures(f, fs...)\n    throw_if_boxed_captures(f)\n    throw_if_boxed_captures(fs...)\nend\n\n#-------------------------------------------------------------\n\nfunction treduce(op, A...; kwargs...)\n    tmapreduce(identity, op, A...; kwargs...)\nend\n\n#-------------------------------------------------------------\n\nfunction tforeach(f, A...; kwargs...)::Nothing\n    tmapreduce(f, (l, r) -> l, A...; kwargs..., init = nothing, outputtype = Nothing)\nend\n\n#-------------------------------------------------------------\n\nfunction maybe_rewrap(g::G, f::F) where {G, F}\n    g(f)\nend\n\n\"\"\"\n   maybe_rewrap(g, f)\n\ntakes a closure `g(f)` and if `f` is a `WithTaskLocals`, we're going\nto unwrap `f` and delegate its `TaskLocalValues` to `g`.\n\nThis should always be equivalent to just calling `g(f)`.\n\"\"\"\nfunction maybe_rewrap(g::G, f::WithTaskLocals{F}) where {G, F}\n    (; inner_func, 
tasklocals) = f\n    WithTaskLocals(vals -> g(inner_func(vals)), tasklocals)\nend\n\n#------------------------------------------------------------\n\nfunction tmap(f, ::Type{T}, A::AbstractArray, _Arrs::AbstractArray...; kwargs...) where {T}\n    Arrs = (A, _Arrs...)\n    tmap!(f, similar(A, T), Arrs...; kwargs...)\nend\n\nfunction tmap(f,\n        A::Union{AbstractArray, AbstractChunks, ChunkSplitters.Internals.Enumerate},\n        _Arrs::AbstractArray...;\n        scheduler::MaybeScheduler = NotGiven(),\n        kwargs...)\n    _scheduler = _scheduler_from_userinput(scheduler; kwargs...)\n\n    if _scheduler isa GreedyScheduler\n        error(\"Greedy scheduler isn't supported with `tmap` unless you provide an `OutputElementType` argument, since the greedy schedule requires a commutative reducing operator.\")\n    end\n    if chunking_enabled(_scheduler) && has_chunksplit(_scheduler) &&\n       chunksplit(_scheduler) != Consecutive()\n        error(\"Only `split == Consecutive()` is supported because the parallel operation isn't commutative. (Scheduler: $_scheduler)\")\n    end\n    if (A isa AbstractChunks || A isa ChunkSplitters.Internals.Enumerate)\n        _check_chunks_incompatible_kwargs(; kwargs...)\n        if chunking_enabled(_scheduler)\n            if _scheduler isa DynamicScheduler\n                _scheduler = DynamicScheduler(;\n                    threadpool = get_threadpool(_scheduler),\n                    chunking = false)\n            elseif _scheduler isa StaticScheduler\n                _scheduler = StaticScheduler(; chunking = false)\n            else\n                error(\"Can't disable chunking for this scheduler?! 
Shouldn't be reached.\",\n                    _scheduler)\n            end\n        end\n    end\n\n    Arrs = (A, _Arrs...)\n    if _scheduler isa SerialScheduler || isempty(A)\n        # empty input collection → align with Base.map behavior\n        map(f, Arrs...; kwargs...)\n    else\n        check_all_have_same_indices(Arrs)\n        @noinline _tmap(_scheduler, f, A, _Arrs...)\n    end\nend\n\n# w/o chunking (DynamicScheduler{NoChunking}): AbstractArray\nfunction _tmap(scheduler::DynamicScheduler{NoChunking},\n        f,\n        A::AbstractArray,\n        _Arrs::AbstractArray...;)\n    threadpool = get_threadpool(scheduler)\n    Arrs = (A, _Arrs...)\n    throw_if_boxed_captures(f)\n    tasks = map(eachindex(A)) do i\n        @spawn threadpool begin\n            args = map(A -> A[i], Arrs)\n            promise_task_local(f)(args...)\n        end\n    end\n    v = map(fetch, tasks)\n    reshape(v, size(A)...)\nend\n\n# w/o chunking (DynamicScheduler{NoChunking}): AbstractChunks\nfunction _tmap(scheduler::DynamicScheduler{NoChunking},\n        f,\n        A::Union{AbstractChunks, ChunkSplitters.Internals.Enumerate},\n        _Arrs::AbstractArray...)\n    threadpool = get_threadpool(scheduler)\n    throw_if_boxed_captures(f)\n    tasks = map(A) do idcs\n        @spawn threadpool promise_task_local(f)(idcs)\n    end\n    map(fetch, tasks)\nend\n\n# w/o chunking (StaticScheduler{NoChunking}): AbstractChunks\nfunction _tmap(scheduler::StaticScheduler{NoChunking},\n        f,\n        A::AbstractChunks,\n        _Arrs::AbstractArray...)\n    nt = nthreads()\n    throw_if_boxed_captures(f)\n    tasks = map(enumerate(A)) do (c, idcs)\n        tid = @inbounds nthtid(mod1(c, nt))\n        @spawnat tid promise_task_local(f)(idcs)\n    end\n    map(fetch, tasks)\nend\n\n# w/o chunking (StaticScheduler{NoChunking}): AbstractArray\nfunction _tmap(scheduler::StaticScheduler{NoChunking},\n        f,\n        A::AbstractArray,\n        _Arrs::AbstractArray...;)\n    Arrs = (A, 
_Arrs...)\n    nt = nthreads()\n    throw_if_boxed_captures(f)\n    tasks = map(enumerate(A)) do (c, i)\n        tid = @inbounds nthtid(mod1(c, nt))\n        @spawnat tid begin\n            args = map(A -> A[i], Arrs)\n            promise_task_local(f)(args...)\n        end\n    end\n    v = map(fetch, tasks)\n    reshape(v, size(A)...)\nend\n\n# w/ chunking\nfunction _tmap(scheduler::Scheduler,\n        f,\n        A::AbstractArray,\n        _Arrs::AbstractArray...)\n    Arrs = (A, _Arrs...)\n    idcs = collect(_index_chunks(scheduler, A))\n    reduction_f = append!!\n    mapping_f = maybe_rewrap(f) do f\n        (inds) -> begin\n            args = map(A -> @view(A[inds]), Arrs)\n            map(f, args...)\n        end\n    end\n    v = tmapreduce(mapping_f, reduction_f, idcs; scheduler)\n    reshape(v, size(A)...)\nend\n\n@propagate_inbounds function tmap!(f,\n        out,\n        A::AbstractArray,\n        _Arrs::AbstractArray...;\n        scheduler::MaybeScheduler = NotGiven(),\n        kwargs...)\n    _scheduler = _scheduler_from_userinput(scheduler; kwargs...)\n\n    Arrs = (A, _Arrs...)\n    if _scheduler isa SerialScheduler\n        map!(f, out, Arrs...)\n    else\n        @boundscheck check_all_have_same_indices((out, Arrs...))\n        throw_if_boxed_captures(f)\n        mapping_f = maybe_rewrap(f) do f\n            function mapping_function(i)\n                args = map(A -> @inbounds(A[i]), Arrs)\n                res = f(args...)\n                out[i] = res\n            end\n        end\n        @noinline tforeach(mapping_f, eachindex(out); scheduler = _scheduler)\n        out\n    end\nend\n\n#-------------------------------------------------------------\n\nfunction tcollect(::Type{T}, gen::Base.Generator{<:AbstractArray}; kwargs...) where {T}\n    tmap(gen.f, T, gen.iter; kwargs...)\nend\ntcollect(gen::Base.Generator{<:AbstractArray}; kwargs...) = tmap(gen.f, gen.iter; kwargs...)\n\ntcollect(::Type{T}, A; kwargs...) 
where {T} = tmap(identity, T, A; kwargs...)\ntcollect(A; kwargs...) = tmap(identity, A; kwargs...)\n\nend # module Implementation\n"
  },
  {
    "path": "src/macro_impl.jl",
    "content": "using OhMyThreads.Tools: OnlyOneRegion, try_enter!\nusing OhMyThreads.Tools: SimpleBarrier\nusing OhMyThreads: OhMyThreads\n\nfunction _is_special_macro_expr(arg;\n        lookfor = (\"@set\", \"@local\", \"@only_one\", \"@one_by_one\", \"@barrier\"))\n    if !(arg isa Expr)\n        return false\n    end\n    lookfor_symbols = Symbol.(lookfor)\n    if arg.head == :macrocall\n        if arg.args[1] isa Symbol && arg.args[1] in lookfor_symbols\n            # support, e.g., @set\n            return true\n        elseif arg.args[1] isa Expr && arg.args[1].head == Symbol(\".\")\n            # support, e.g., OhMyThreads.@set\n            x = arg.args[1]\n            if x.args[1] == Symbol(\"OhMyThreads\") && x.args[2] isa QuoteNode &&\n               x.args[2].value in lookfor_symbols\n                return true\n            end\n        end\n    end\n    return false\nend\n\nfunction tasks_macro(forex; __module__)\n    if forex.head != :for\n        throw(ErrorException(\"Expected a for loop after `@tasks`.\"))\n    else\n        if forex.args[1].head != :(=)\n            # this'll catch cases like\n            # @tasks for _ ∈ 1:10, _ ∈ 1:10\n            #     body\n            # end\n            throw(ErrorException(\"`@tasks` currently only supports a single threaded loop, got $(forex.args[1])\"))\n        end\n        it = forex.args[1]\n        itvar = it.args[1]\n        itrng = it.args[2]\n        forbody = forex.args[2]\n    end\n\n    settings = Settings()\n\n    # Escape everything in the loop body that is not used in conjuction with one of our\n    # \"macros\", e.g. @set or @local. Code inside of these macro blocks will be escaped by\n    # the respective \"macro\" handling functions below.\n    for i in findall(!_is_special_macro_expr, forbody.args)\n        forbody.args[i] = esc(forbody.args[i])\n    end\n\n    locals_before, locals_names = _maybe_handle_atlocal_block!(forbody.args)\n    tls_names = isnothing(locals_before) ? 
[] : map(x -> x.args[1], locals_before)\n    _maybe_handle_atset_block!(settings, forbody.args)\n    setup_onlyone_blocks = _maybe_handle_atonlyone_blocks!(forbody.args)\n    setup_onebyone_blocks = _maybe_handle_atonebyone_blocks!(forbody.args)\n    if isdefined(__module__, Symbol(\"@barrier\"))\n        if __module__.var\"@barrier\" != OhMyThreads.Experimental.var\"@barrier\"\n            error(\"There seems to be a macro `@barrier` around which isn't `OhMyThreads.Experimental.@barrier`. This isn't supported.\")\n        end\n        setup_barriers = _maybe_handle_atbarriers!(forbody.args, settings)\n    else\n        setup_barriers = nothing\n    end\n\n    itrng = esc(itrng)\n    itvar = esc(itvar)\n\n    make_mapping_function = if isempty(tls_names)\n        :(local function mapping_function($itvar,)\n            $(forbody)\n        end)\n\n    else\n        :(local mapping_function = WithTaskLocals(($(tls_names...),)) do ($(locals_names...),)\n            function mapping_function_local($itvar,)\n                $(forbody)\n            end\n        end)\n    end\n    q = if isgiven(settings.reducer)\n        quote\n            $setup_onlyone_blocks\n            $setup_onebyone_blocks\n            $setup_barriers\n            $make_mapping_function\n            tmapreduce(mapping_function, $(settings.reducer),\n                $(itrng))\n        end\n    elseif isgiven(settings.collect)\n        maybe_warn_useless_init(settings)\n        quote\n            $setup_onlyone_blocks\n            $setup_onebyone_blocks\n            $setup_barriers\n            $make_mapping_function\n            tmap(mapping_function, $(itrng))\n        end\n    else\n        maybe_warn_useless_init(settings)\n        quote\n            $setup_onlyone_blocks\n            $setup_onebyone_blocks\n            $setup_barriers\n            $make_mapping_function\n            tforeach(mapping_function, $(itrng))\n        end\n    end\n\n    # insert keyword arguments into the function 
call\n    kwexpr = :($(Expr(:parameters)))\n    if isgiven(settings.scheduler)\n        push!(kwexpr.args, Expr(:kw, :scheduler, settings.scheduler))\n    end\n    if isgiven(settings.init)\n        push!(kwexpr.args, Expr(:kw, :init, settings.init))\n    end\n    for (k, v) in settings.kwargs\n        push!(kwexpr.args, Expr(:kw, k, v))\n    end\n    insert!(q.args[10].args, 2, kwexpr)\n\n    # wrap everything in a let ... end block\n    # and, potentially, define the `TaskLocalValue`s.\n    result = :(let\n    end)\n    push!(result.args[2].args, q)\n    if !isnothing(locals_before)\n        for x in locals_before\n            push!(result.args[1].args, x)\n        end\n    end\n\n    result\nend\n\nfunction maybe_warn_useless_init(settings)\n    isgiven(settings.init) &&\n        @warn(\"The @set init = ... settings won't have any effect because no reduction is performed.\")\nend\n\nBase.@kwdef mutable struct Settings\n    scheduler::Union{Expr, QuoteNode, NotGiven} = NotGiven()\n    reducer::Union{Expr, Symbol, NotGiven} = NotGiven()\n    collect::Union{Bool, NotGiven} = NotGiven()\n    init::Union{Expr, Symbol, NotGiven} = NotGiven()\n    kwargs::Dict{Symbol, Any} = Dict{Symbol, Any}()\nend\n\nfunction _maybe_handle_atlocal_block!(args)\n    locals_before = nothing\n    local_inner = nothing\n    tlsidx = findfirst(args) do arg\n        _is_special_macro_expr(arg; lookfor = (Symbol(\"@local\"),))\n    end\n    if !isnothing(tlsidx)\n        locals_before, local_inner = _unfold_atlocal_block(args[tlsidx].args[3])\n        deleteat!(args, tlsidx)\n    end\n    return locals_before, local_inner\nend\n\nfunction _unfold_atlocal_block(ex)\n    locals_before = Expr[]\n    locals_names = Expr[]\n    if ex.head == :(=)\n        localb, localn = _atlocal_assign_to_exprs(ex)\n        push!(locals_before, localb)\n        push!(locals_names, localn)\n    elseif ex.head == :block\n        tlsexprs = filter(x -> x isa Expr, ex.args) # skip LineNumberNode\n        for x in 
tlsexprs\n            localb, localn = _atlocal_assign_to_exprs(x)\n            push!(locals_before, localb)\n            push!(locals_names, localn)\n        end\n    else\n        throw(ErrorException(\"Wrong usage of @local. You must either provide a typed assignment or multiple typed assignments in a `begin ... end` block.\"))\n    end\n    return locals_before, locals_names\nend\n\n#=\nIf the TLS doesn't have a declared return type, we're going to use `CC.return_type` to get it\nautomatically. This would normally be non-kosher, but it's okay here for three reasons:\n1) The task local value *only* exists within the function being called, meaning that the worldage\nis frozen for the full lifetime of the TLV, so an `eval` can't change the outcome or cause incorrect inference.\n2) We do not allow users to *write* to the task local value, they can only retrieve its value, so there's no\npotential problems from the type being maximally narrow and then them trying to write a value of another type to it\n3) the task local value is not user-observable. 
we never let the user inspect its type, unless they themselves are\nusing `code____` tools to inspect the generated code, hence if inference changes and gives a more or less precise\ntype, there's no observable semantic changes, just performance increases or decreases.\n=#\nfunction _atlocal_assign_to_exprs(ex)\n    left_ex = ex.args[1]\n    tls_def = esc(ex.args[2])\n    @gensym tl_storage\n    if Base.isexpr(left_ex, :(::))\n        tls_sym = esc(left_ex.args[1])\n        tls_type = esc(left_ex.args[2])\n        local_before = :($(tl_storage) = TaskLocalValue{$tls_type}(() -> $(tls_def)))\n    else\n        tls_sym = esc(left_ex)\n        local_before = :($(tl_storage) = let f = () -> $(tls_def)\n            TaskLocalValue{Core.Compiler.return_type(f, Tuple{})}(f)\n        end)\n    end\n    local_name = :($(tls_sym))\n    return local_before, local_name\nend\n\nfunction _maybe_handle_atset_block!(settings, args)\n    idcs = findall(args) do arg\n        _is_special_macro_expr(arg; lookfor = (Symbol(\"@set\"),))\n    end\n    isnothing(idcs) && return # no @set block found\n    for i in idcs\n        ex = args[i].args[3]\n        if ex.head == :(=)\n            _handle_atset_single_assign!(settings, ex)\n        elseif ex.head == :block\n            exprs = filter(x -> x isa Expr, ex.args) # skip LineNumberNode\n            _handle_atset_single_assign!.(Ref(settings), exprs)\n        else\n            throw(ErrorException(\"Wrong usage of @set. You must either provide an assignment or multiple assignments in a `begin ... end` block.\"))\n        end\n    end\n    deleteat!(args, idcs)\n    # check incompatible settings\n    if isgiven(settings.collect) && settings.collect && isgiven(settings.reducer)\n        throw(ArgumentError(\"Specifying both collect and reducer isn't supported.\"))\n    end\nend\n\nfunction _handle_atset_single_assign!(settings, ex)\n    if ex.head != :(=)\n        throw(ErrorException(\"Wrong usage of @set. Expected assignment, e.g. 
`scheduler = StaticScheduler()`.\"))\n    end\n    sym = ex.args[1]\n    def = ex.args[2]\n    if hasfield(Settings, sym)\n        if sym == :collect && !(def isa Bool)\n            throw(ArgumentError(\"Setting collect can only be true or false.\"))\n            #TODO support specifying the OutputElementType\n        end\n        def = def isa Bool ? def : esc(def)\n        setfield!(settings, sym, def)\n    else\n        # push!(settings.kwargs, sym => esc(def))\n        settings.kwargs[sym] = esc(def)\n    end\nend\n\nfunction _maybe_handle_atonlyone_blocks!(args)\n    idcs = findall(args) do arg\n        _is_special_macro_expr(arg; lookfor = (Symbol(\"@only_one\"),))\n    end\n    isnothing(idcs) && return # no @only_one blocks\n    setup_onlyone_blocks = quote end\n    for i in idcs\n        body = args[i].args[3]\n        @gensym onlyone\n        init_onlyone_ex = :($(onlyone) = Tools.OnlyOneRegion())\n        push!(setup_onlyone_blocks.args, init_onlyone_ex)\n        args[i] = quote\n            Tools.try_enter!($(onlyone)) do\n                $(esc(body))\n            end\n        end\n    end\n    return setup_onlyone_blocks\nend\n\nfunction _maybe_handle_atonebyone_blocks!(args)\n    idcs = findall(args) do arg\n        _is_special_macro_expr(arg; lookfor = (Symbol(\"@one_by_one\"),))\n    end\n    isnothing(idcs) && return # no @one_by_one blocks\n    setup_onebyone_blocks = quote end\n    for i in idcs\n        body = args[i].args[3]\n        @gensym onebyone\n        init_lock_ex = :($(onebyone) = Base.ReentrantLock())\n        push!(setup_onebyone_blocks.args, init_lock_ex)\n        args[i] = quote\n            lock($(onebyone)) do\n                $(esc(body))\n            end\n        end\n    end\n    return setup_onebyone_blocks\nend\n\nfunction _maybe_handle_atbarriers!(args, settings)\n    idcs = findall(args) do arg\n        _is_special_macro_expr(arg; lookfor = (Symbol(\"@barrier\"),))\n    end\n    isnothing(idcs) && return # no @barrier 
found\n    setup_barriers = quote end\n    for i in idcs\n        !haskey(settings.kwargs, :ntasks) &&\n            throw(ErrorException(\"When using `@barrier`, the number of tasks must be \" *\n                                 \"specified explicitly, e.g. via `@set ntasks=...`. \"))\n        ntasks = settings.kwargs[:ntasks]\n        @gensym barrier\n        push!(setup_barriers.args, :($(barrier) = $(SimpleBarrier)($ntasks)))\n        args[i] = :($(esc(:wait))($(barrier)))\n    end\n    return setup_barriers\nend\n"
  },
  {
    "path": "src/macros.jl",
    "content": "\"\"\"\n    @tasks for ... end\n\nA macro to parallelize a `for` loop by spawning a set of tasks that can be run in parallel.\nThe policy of how many tasks to spawn and how to distribute the iteration space among the\ntasks (and more) can be configured via `@set` statements in the loop body.\n\nSupports reductions (`@set reducer=<reducer function>`) and collecting the results\n(`@set collect=true`).\n\nUnder the hood, the `for` loop is translated into corresponding parallel\n[`tforeach`](@ref), [`tmapreduce`](@ref), or [`tmap`](@ref) calls.\n\nSee also: [`@set`](@ref), [`@local`](@ref)\n\n## Examples\n\n```julia\nusing OhMyThreads: @tasks\n```\n\n```julia\n@tasks for i in 1:3\n    println(i)\nend\n```\n\n```julia\n@tasks for x in rand(10)\n    @set reducer=+\n    sin(x)\nend\n```\n\n```julia\n@tasks for i in 1:5\n    @set collect=true\n    i^2\nend\n```\n\n```julia\n@tasks for i in 1:100\n    @set ntasks=4*nthreads()\n    # non-uniform work...\nend\n```\n\n```julia\n@tasks for i in 1:5\n    @set scheduler=:static\n    println(\"i=\", i, \" → \", threadid())\nend\n```\n\n```julia\n@tasks for i in 1:100\n    @set begin\n        scheduler=:static\n        chunksize=10\n    end\n    println(\"i=\", i, \" → \", threadid())\nend\n```\n\"\"\"\nmacro tasks(args...)\n    Implementation.tasks_macro(args...; __module__)\nend\n\n\"\"\"\n    @set name = value\n\nThis can be used inside a `@tasks for ... end` block to specify settings for the parallel\nexecution of the loop.\n\nMultiple settings are supported, either as separate `@set` statements or via\n`@set begin ... end`.\n\n## Settings\n\n* `reducer` (e.g. `reducer=+`): Indicates that a reduction should be performed with the provided binary function. See [`tmapreduce`](@ref) for more information.\n* `collect` (e.g. `collect=true`): Indicates that results should be collected (similar to `map`).\n\nAll other settings will be passed on to the underlying parallel functions (e.g. 
[tmapreduce](@ref))\nas keyword arguments. Hence, you may provide whatever these functions accept as\nkeyword arguments. Among others, this includes\n\n* `scheduler` (e.g. `scheduler=:static`): Can be either a [`Scheduler`](@ref) or a `Symbol` (e.g. `:dynamic`, `:static`, `:serial`, or `:greedy`).\n* `init` (e.g. `init=0.0`): Initial value to be used in a reduction (requires `reducer=...`).\n\nSettings like `ntasks`, `chunksize`, and `split` etc. can be used to tune the scheduling policy (if the selected scheduler supports it).\n\nNote that the assignment is hoisted above the loop body which means that the scope is *not*\nthe scope of the loop (even though it looks like it) but rather the scope *surrounding* the\nloop body. (`@macroexpand` is a useful tool to inspect the generated code of the `@tasks`\nblock.)\n\"\"\"\nmacro set(args...)\n    error(\"The @set macro may only be used inside of a @tasks block.\")\nend\n\n@eval begin\n    \"\"\"\n        @local name = value\n\n        @local name::T = value\n\n    Can be used inside a `@tasks for ... end` block to specify\n    [task-local values](@ref TLS) (TLV) via explicitly typed assignments.\n    These values will be allocated once per task\n    (rather than once per iteration) and can be re-used between different task-local iterations.\n\n    There can only be a single `@local` block in a `@tasks for ... end` block. To specify\n    multiple TLVs, use `@local begin ... end`. Compared to regular assignments, there are some\n    limitations though, e.g. 
TLVs can't reference each other.\n\n    ## Examples\n\n    ```julia\n    using OhMyThreads: @tasks\n    using OhMyThreads.Tools: taskid\n\n    @tasks for i in 1:10\n        @set begin\n            scheduler=:dynamic\n            ntasks=2\n        end\n        @local x = zeros(3) # TLV\n\n        x .+= 1\n        println(taskid(), \" -> \", x)\n    end\n    ```\n\n    ```julia\n    @tasks for i in 1:10\n        @local begin\n            x = rand(Int, 3)\n            M = rand(3, 3)\n        end\n        # ...\n    end\n    ```\n\n    Task local variables created by `@local` are by default constrained to their inferred type,\n    but if you need to, you can specify a different type during declaration:\n    ```julia\n    @tasks for i in 1:10\n        @local x::Vector{Float64} = some_hard_to_infer_setup_function()\n        # ...\n    end\n    ```\n\n    The right hand side of the assignment is hoisted outside of the loop body and captured\n    as a closure used to initialize the task local value. This means that the scope of the\n    closure is *not* the scope of the loop (even though it looks like it) but rather the\n    scope *surrounding* the loop body. (`@macroexpand` is a useful tool to inspect the\n    generated code of the `@tasks` block.)\n    \"\"\"\n    macro $(Symbol(\"local\"))(args...)\n        error(\"The @local macro may only be used inside of a @tasks block.\")\n    end\nend\n\n\"\"\"\n    @only_one begin ... end\n\nThis can be used inside a `@tasks for ... 
end` block to mark a region of code to be\nexecuted by only one of the parallel tasks (all other tasks skip over this region).\n\n## Example\n\n```julia\nusing OhMyThreads: @tasks\n\n@tasks for i in 1:10\n    @set ntasks = 10\n\n    println(i, \": before\")\n    @only_one begin\n        println(i, \": only printed by a single task\")\n        sleep(1)\n    end\n    println(i, \": after\")\nend\n```\n\"\"\"\nmacro only_one(args...)\n    error(\"The @only_one macro may only be used inside of a @tasks block.\")\nend\n\n\"\"\"\n    @one_by_one begin ... end\n\nThis can be used inside a `@tasks for ... end` block to mark a region of code to be\nexecuted by one parallel task at a time (i.e. exclusive access). The order may be arbitrary\nand non-deterministic.\n\n## Example\n\n```julia\nusing OhMyThreads: @tasks\n\n@tasks for i in 1:10\n    @set ntasks = 10\n\n    println(i, \": before\")\n    @one_by_one begin\n        println(i, \": one task at a time\")\n        sleep(0.5)\n    end\n    println(i, \": after\")\nend\n```\n\"\"\"\nmacro one_by_one(args...)\n    error(\"The @one_by_one macro may only be used inside of a @tasks block.\")\nend\n\n\nconst allowing_boxed_captures = ScopedValue(false)\n\n\"\"\"\n    @allow_boxed_captures expr\n\nBy default, OhMyThreads.jl will detect and error on multithreaded code which references local variables\nwhich are 'boxed' -- something that happens if the variable could be re-bound in multiple scopes. This\nprocess can cause very subtle bugs in multithreaded code by creating silent race conditions, e.g.\n\n```julia\nlet\n    function wrong()\n        tmap(1:10) do i\n            A = i # define A for the first time (lexically)\n            sleep(rand()/10)\n            A # user is trying to reference local A only\n        end\n    end\n    @show wrong()\n    A = 1 # boxed! 
this hoists \"A\" to the same variable as in `wrong` but presumably the user wanted a new one\nend\n```\nIn this example, you might expect to get `[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]`, but you would actually observe\nincorrect results because `A` is 'boxed'. The fix for this would be to write something like\n```julia\nlet\n    function right()\n        tmap(1:10) do i\n            local A = i\n            sleep(rand()/10)\n            A \n        end\n    end\n    @show right()\n    A = 1\nend\n```\n\nHowever, if you are really sure you want to bypass OhMyThreads's error mechanism, you can use\n`@allow_boxed_captures` to wrap code you believe is okay, e.g.\n\n```julia-repl\njulia> let A = 1 \n           @allow_boxed_captures tmap(1:10) do i\n               A = i\n               sleep(rand()/10)\n               A # race condition!\n           end\n       end\n10-element Vector{Int64}:\n 4\n 2\n 7\n 2\n 2\n 8\n 6\n 8\n 7\n 2\n```\n\nThis is a dynamically scoped construct, so this effect will apply to *all* nested code inside of `expr`.\n\nSee also `@disallow_boxed_captures`\n\"\"\"\nmacro allow_boxed_captures(ex)\n    quote\n        @with allowing_boxed_captures => true $(esc(ex))\n    end\nend\n\n\"\"\"\n    @disallow_boxed_captures expr\n\nDisable the effect of `@allow_boxed_captures` for any code in `expr`.\n\nThis is a dynamically scoped construct, so this effect will apply to *all* nested code inside of `expr`.\n\nSee also `@allow_boxed_captures`\n\"\"\"\nmacro disallow_boxed_captures(ex)\n    quote\n        @with allowing_boxed_captures => false $(esc(ex))\n    end\nend\n\n\"\"\"\n   @localize args... 
expr\n\nWriting\n```\n@localize x y z expr\n```\nis equivalent to writing\n```\nlet x=x, y=y, z=z\n    expr\nend\n```\nThis is useful for avoiding the boxing of captured variables when working with closures.\n\nSee https://juliafolds2.github.io/OhMyThreads.jl/stable/literate/boxing/boxing/ for more information about boxed variables.\n\"\"\"\nmacro localize(args...)\n    syms = args[1:end-1]\n    ex = args[end]\n    letargs = map(syms) do sym\n        if !(sym isa Symbol)\n            throw(ArgumentError(\"All but the final argument to `@localize` must be symbols! Got $sym\"))\n        end\n        :($sym = $sym)\n    end\n    esc(:(let $(letargs...)\n              $ex\n          end))\nend\n"
  },
  {
    "path": "src/schedulers.jl",
    "content": "module Schedulers\n\nusing Base.Threads: nthreads\nusing ChunkSplitters: Split, Consecutive, RoundRobin, ChunkSplitters\n\n# Used to indicate that a keyword argument has not been set by the user.\n# We don't use Nothing because nothing maybe sometimes be a valid user input (e.g. for init)\nstruct NotGiven end\nisgiven(::NotGiven) = false\nisgiven(::T) where {T} = true\n\nconst MaybeInteger = Union{Integer, NotGiven}\n\nstruct NoSplit <: Split end\n_parse_split(split::Split) = split\nfunction _parse_split(split::Symbol)\n    split in (:consecutive, :batch) && return Consecutive()\n    split in (:roundrobin, :scatter) && return RoundRobin()\n    throw(ArgumentError(\"You've provided an unsupported value for `split`\"))\nend\n_splitid(x::Type{<:Split}) = nameof(x) |> string |> lowercase |> Symbol\n_splitid(x::Split) = _splitid(typeof(x))\n\n\"\"\"\nSupertype for all available schedulers:\n\n* [`DynamicScheduler`](@ref): default dynamic scheduler\n* [`StaticScheduler`](@ref): low-overhead static scheduler\n* [`GreedyScheduler`](@ref): greedy load-balancing scheduler\n* [`SerialScheduler`](@ref): serial (non-parallel) execution\n\"\"\"\nabstract type Scheduler end\n#! A subtype of Scheduler (let's call it `S`) **must** implement:\n#   - `from_symbol(::Val{:symbol})` returning exactly `S` for the given symbol.\n#     (e.g. `from_symbol(::Val{:dynamic}) = DynamicScheduler`)\n\n# To enable chunking, S **must** implement:\n#   - `chunking_args(::S)::ChunkingArgs` returning the chunking arguments of the scheduler.\n#     It usually is a field of the scheduler, and use the constructor\n#     `ChunkingArgs` to create it (see below).\n\n# And can optionally implement:\n#   - `default_nchunks(::Type{S})` returning the default number of chunks for the scheduler.\n#     if chunking is enabled. Default is `Threads.nthreads(:default)`.\n\nfrom_symbol(::Val) = throw(ArgumentError(\"unkown scheduler symbol\"))\n\nscheduler_from_symbol(s::Symbol; kwargs...) 
= scheduler_from_symbol(Val(s); kwargs...)\nfunction scheduler_from_symbol(v::Val; kwargs...)\n    sched = from_symbol(v)\n    return sched(; kwargs...)\nend\n\n\"\"\"\n    ChunkingMode\n\nA trait type to indicate the chunking mode of a scheduler. The following subtypes are available:\n\n* `NoChunking`: no chunking is used\n* `FixedCount`: the number of chunks is fixed\n* `FixedSize`: the size of each chunk is fixed\n\"\"\"\nabstract type ChunkingMode end\nstruct NoChunking <: ChunkingMode end\nstruct FixedCount <: ChunkingMode end\nstruct FixedSize <: ChunkingMode end\n\nchunksplitter_mode(::Type{FixedCount}) = ChunkSplitters.Internals.FixedCount\nchunksplitter_mode(::Type{FixedSize}) = ChunkSplitters.Internals.FixedSize\n\n\"\"\"\n    ChunkingArgs{C, S <: Split}(n::Union{Int, Nothing}, size::Union{Int, Nothing}, minsize::Union{Int, Nothing}, split::S)\n    ChunkingArgs(Sched::Type{<:Scheduler}; n = nothing, size = nothing, minsize = nothing, split::Union{Symbol, Split}, chunking)\n\nStores all the information needed for chunking. The type parameter `C` is the chunking mode\n(`NoChunking`, `FixedSize`, or `FixedCount`). The `chunking` keyword argument is a boolean\nand if false, everything is skipped and `C = NoChunking`.\n\nOnce the object is created, use the `has_fieldname(object)` function (e.g. 
`has_size(object)`)\nto know if the field is effectively used.\n\"\"\"\nstruct ChunkingArgs{C, S <: Split}\n    n::Union{Int, Nothing}\n    size::Union{Int, Nothing}\n    minsize::Union{Int, Nothing}\n    split::S\nend\nfunction ChunkingArgs(::Type{NoChunking})\n    ChunkingArgs{NoChunking, NoSplit}(nothing, nothing, nothing, NoSplit())\nend\nfunction ChunkingArgs(\n        Sched::Type{<:Scheduler};\n        n = nothing,\n        size = nothing,\n        minsize = nothing,\n        split::Union{Symbol, Split},\n        chunking\n)\n    chunking || return ChunkingArgs(NoChunking)\n\n    if isnothing(n) && isnothing(size)\n        n = default_nchunks(Sched)\n    elseif !isnothing(n) && !isnothing(size)\n        throw(ArgumentError(\"nchunks and chunksize are mutually exclusive\"))\n    end\n    chunking_mode = isnothing(n) ? FixedSize : FixedCount\n    split = _parse_split(split)\n    return ChunkingArgs{chunking_mode, typeof(split)}(n, size, minsize, split)\nend\n\nchunking_mode(::ChunkingArgs{C}) where {C} = C\nhas_n(ca::ChunkingArgs) = !isnothing(ca.n)\nhas_size(ca::ChunkingArgs) = !isnothing(ca.size)\nhas_split(::ChunkingArgs{C, S}) where {C, S} = S !== NoSplit\nhas_minsize(ca::ChunkingArgs) = !isnothing(ca.minsize)\nchunking_enabled(ca::ChunkingArgs) = chunking_mode(ca) != NoChunking\n\nfunction chunkingargs_to_kwargs(ca::ChunkingArgs, arg)\n    minsize = !has_minsize(ca) ? 
nothing : min(ca.minsize, length(arg))\n    return (; ca.n, ca.size, minsize, ca.split)\nend\n\n_chunkingstr(ca::ChunkingArgs{NoChunking}) = \"none\"\nfunction _chunkingstr(ca::ChunkingArgs{FixedCount})\n    str = \"fixed count ($(ca.n)), split :$(_splitid(ca.split))\"\n    if has_minsize(ca)\n        str = str * \", minimum chunk size  $(ca.minsize)\"\n    end\n    str\nend\nfunction _chunkingstr(ca::ChunkingArgs{FixedSize})\n    str = \"fixed size ($(ca.size)), split :$(_splitid(ca.split))\"\n    str\nend\n\n# Link between a scheduler and its chunking arguments\n# The first and only the first method must be overloaded for each scheduler\n# that supports chunking.\nchunking_args(::Scheduler) = ChunkingArgs(NoChunking)\n\nnchunks(sched::Scheduler) = chunking_args(sched).n\nchunksize(sched::Scheduler) = chunking_args(sched).size\nchunksplit(sched::Scheduler) = chunking_args(sched).split\nminchunksize(sched::Scheduler) = chunking_args(sched).minsize\n\nhas_nchunks(sched::Scheduler) = has_n(chunking_args(sched))\nhas_chunksize(sched::Scheduler) = has_size(chunking_args(sched))\nhas_chunksplit(sched::Scheduler) = has_split(chunking_args(sched))\nhas_minchunksize(sched::Scheduler) = has_minsize(chunking_args(sched))\n\nfunction chunkingargs_to_kwargs(sched::Scheduler, arg)\n    chunkingargs_to_kwargs(chunking_args(sched), arg)\nend\n\nchunking_mode(sched::Scheduler) = chunking_mode(chunking_args(sched))\nchunking_enabled(sched::Scheduler) = chunking_enabled(chunking_args(sched))\n_chunkingstr(sched::Scheduler) = _chunkingstr(chunking_args(sched))\n\n\"\"\"\n    default_nchunks(::Type{<:Scheduler})\n\nHardcoded default number of chunks, if not provided by the user. Can depend on the\nkind of scheduler.\n\"\"\"\nfunction default_nchunks end\ndefault_nchunks(::Type{<:Scheduler}) = nthreads(:default)\n\n\"\"\"\n    DynamicScheduler (aka :dynamic)\n\nThe default dynamic scheduler. 
Divides the given collection into chunks and\nthen spawns a task per chunk to perform the requested operation in parallel.\nThe tasks are assigned to threads by Julia's dynamic scheduler and are non-sticky, that is,\nthey can migrate between threads.\n\nGenerally preferred since it is flexible, can provide load balancing, and is composable\nwith other multithreaded code.\n\n## Keyword arguments:\n\n- `nchunks::Integer` or `ntasks::Integer` (default `nthreads(threadpool)`):\n    * Determines the number of chunks (and thus also the number of parallel tasks).\n    * Increasing `nchunks` can help with [load balancing](https://en.wikipedia.org/wiki/Load_balancing_(computing)), but at the expense of creating more overhead. For `nchunks <= nthreads()` there are not enough chunks for any load balancing.\n    * Setting `nchunks < nthreads()` is an effective way to use only a subset of the available threads.\n- `chunksize::Integer` (default not set)\n    * Specifies the desired chunk size (instead of the number of chunks).\n    * The options `chunksize` and `nchunks`/`ntasks` are **mutually exclusive** (only one may be a positive integer).\n- `minchunksize::Union{Integer, Nothing}` (default `nothing`)\n    * Sets a lower bound on the size of chunks. This argument takes priority over `nchunks`, so `treduce(+, 1:10; nchunks=10, minchunksize=5)` will only operate on `2` chunks for example.\n- `split::Union{Symbol, OhMyThreads.Split}` (default `OhMyThreads.Consecutive()`):\n    * Determines how the collection is divided into chunks (if chunking=true). By default, each chunk consists of contiguous elements and order is maintained.\n    * See [ChunkSplitters.jl](https://github.com/JuliaFolds2/ChunkSplitters.jl) for more details and available options. 
We also allow users to pass `:consecutive` in place of `Consecutive()`, and `:roundrobin` in place of `RoundRobin()`\n    * Beware that for `split=OhMyThreads.RoundRobin()` the order of elements isn't maintained and a reducer function must not only be associative but also **commutative**!\n- `chunking::Bool` (default `true`):\n    * Controls whether input elements are grouped into chunks (`true`) or not (`false`).\n    * For `chunking=false`, the arguments `nchunks`/`ntasks`, `chunksize`, and `split` are ignored and input elements are regarded as \"chunks\" as is. Hence, there will be one parallel task spawned per input element. Note that, depending on the input, this **might spawn many(!) tasks** and can be costly!\n- `threadpool::Symbol` (default `:default`):\n    * Possible options are `:default` and `:interactive`.\n    * The high-priority pool `:interactive` should be used very carefully since tasks on this threadpool should not be allowed to run for a long time without `yield`ing as it can interfere with [heartbeat](https://en.wikipedia.org/wiki/Heartbeat_(computing)) processes.\n\"\"\"\nstruct DynamicScheduler{C <: ChunkingMode, S <: Split, threadpool} <: Scheduler\n    chunking_args::ChunkingArgs{C, S}\n\n    function DynamicScheduler(threadpool::Symbol, ca::ChunkingArgs)\n        if !(threadpool in (:default, :interactive))\n            throw(ArgumentError(\"threadpool must be either :default or :interactive\"))\n        end\n        new{chunking_mode(ca), typeof(ca.split), threadpool}(ca)\n    end\nend\n\nfunction DynamicScheduler(;\n        threadpool::Symbol = :default,\n        nchunks = nothing,\n        ntasks = nothing, # \"alias\" for nchunks\n        chunksize = nothing,\n        split::Union{Split, Symbol} = Consecutive(),\n        minchunksize = nothing,\n        chunking::Bool = true\n)\n    if !isnothing(ntasks)\n        if !isnothing(nchunks)\n            throw(ArgumentError(\"For the dynamic scheduler, nchunks and ntasks are aliases and only 
one may be provided\"))\n        end\n        nchunks = ntasks\n    end\n    ca = ChunkingArgs(DynamicScheduler;\n        n = nchunks, size = chunksize, minsize = minchunksize, split, chunking)\n    return DynamicScheduler(threadpool, ca)\nend\nfrom_symbol(::Val{:dynamic}) = DynamicScheduler\nchunking_args(sched::DynamicScheduler) = sched.chunking_args\nthreadpool(::DynamicScheduler{C, S, T}) where {C, S, T} = T\n\nfunction Base.show(io::IO, mime::MIME{Symbol(\"text/plain\")}, s::DynamicScheduler)\n    print(io, \"DynamicScheduler\", \"\\n\")\n    cstr = _chunkingstr(s.chunking_args)\n    println(io, \"├ Chunking: \", cstr)\n    print(io, \"└ Threadpool: \", threadpool(s))\nend\n\n\"\"\"\n    StaticScheduler (aka :static)\n\nA static low-overhead scheduler. Divides the given collection into chunks and\nthen spawns a task per chunk to perform the requested operation in parallel.\nThe tasks are statically assigned to threads up front and are made *sticky*, that is,\nthey are guaranteed to stay on the assigned threads (**no task migration**).\n\nCan sometimes be more performant than `DynamicScheduler` when the workload is (close to)\nuniform and, because of the lower overhead, for small workloads.\nIsn't well composable with other multithreaded code though.\n\n## Keyword arguments:\n\n- `nchunks::Integer` or `ntasks::Integer` (default `nthreads()`):\n    * Determines the number of chunks (and thus also the number of parallel tasks).\n    * Setting `nchunks < nthreads()` is an effective way to use only a subset of the available threads.\n    * For `nchunks > nthreads()` the chunks will be distributed to the available threads in a round-robin fashion.\n- `chunksize::Integer` (default not set)\n    * Specifies the desired chunk size (instead of the number of chunks).\n    * The options `chunksize` and `nchunks`/`ntasks` are **mutually exclusive** (only one may be non-zero).\n- `minchunksize::Union{Integer, Nothing}` (default `nothing`)\n    * Sets a lower bound on the 
size of chunks. This argument takes priority over `nchunks`, so `treduce(+, 1:10; nchunks=10, minchunksize=5)` will only operate on `2` chunks for example.\n- `chunking::Bool` (default `true`):\n    * Controls whether input elements are grouped into chunks (`true`) or not (`false`).\n    * For `chunking=false`, the arguments `nchunks`/`ntasks`, `chunksize`, and `split` are ignored and input elements are regarded as \"chunks\" as is. Hence, there will be one parallel task spawned per input element. Note that, depending on the input, this **might spawn many(!) tasks** and can be costly!\n- `split::Union{Symbol, OhMyThreads.Split}` (default `OhMyThreads.Consecutive()`):\n    * Determines how the collection is divided into chunks. By default, each chunk consists of contiguous elements and order is maintained.\n    * See [ChunkSplitters.jl](https://github.com/JuliaFolds2/ChunkSplitters.jl) for more details and available options. We also allow users to pass `:consecutive` in place of `Consecutive()`, and `:roundrobin` in place of `RoundRobin()`\n    * Beware that for `split=OhMyThreads.RoundRobin()` the order of elements isn't maintained and a reducer function must not only be associative but also **commutative**!\n\"\"\"\nstruct StaticScheduler{C <: ChunkingMode, S <: Split} <: Scheduler\n    chunking_args::ChunkingArgs{C, S}\nend\n\nfunction StaticScheduler(;\n        nchunks = nothing,\n        ntasks = nothing, # \"alias\" for nchunks\n        chunksize = nothing,\n        minchunksize = nothing,\n        split::Union{Split, Symbol} = Consecutive(),\n        chunking::Bool = true\n)\n    if !isnothing(ntasks)\n        if !isnothing(nchunks)\n            throw(ArgumentError(\"For the static scheduler, nchunks and ntasks are aliases and only one may be provided\"))\n        end\n        nchunks = ntasks\n    end\n    ca = ChunkingArgs(StaticScheduler;\n        n = nchunks, size = chunksize, minsize = minchunksize, split, chunking)\n    return 
StaticScheduler(ca)\nend\nfrom_symbol(::Val{:static}) = StaticScheduler\nchunking_args(sched::StaticScheduler) = sched.chunking_args\n\nfunction Base.show(io::IO, mime::MIME{Symbol(\"text/plain\")}, s::StaticScheduler)\n    print(io, \"StaticScheduler\", \"\\n\")\n    cstr = _chunkingstr(s.chunking_args)\n    println(io, \"├ Chunking: \", cstr)\n    print(io, \"└ Threadpool: default\")\nend\n\n\"\"\"\n    GreedyScheduler (aka :greedy)\n\nA greedy dynamic scheduler. The elements are put into a shared workqueue and dynamic,\nnon-sticky, tasks are spawned to process the elements of the queue with each task taking a new\nelement from the queue as soon as the previous one is done.\n\nNote that elements are processed in a non-deterministic order, and thus a potential reducing\nfunction **must** be [commutative](https://en.wikipedia.org/wiki/Commutative_property) in\naddition to being associative, or you could get incorrect results!\n\nCan be a good choice for load-balancing slower, uneven computations, but does carry\nsome additional overhead.\n\n## Keyword arguments:\n\n- `ntasks::Int` (default `nthreads()`):\n    * Determines the number of parallel tasks to be spawned.\n    * Setting `ntasks < nthreads()` is an effective way to use only a subset of the available threads.\n- `chunking::Bool` (default `false`):\n    * Controls whether input elements are grouped into chunks (`true`) or not (`false`) before being put into the shared workqueue. This can improve the performance especially if there are many iterations each of which is computationally cheap.\n    * If `nchunks` or `chunksize` are explicitly specified, `chunking` will be automatically set to `true`.\n- `nchunks::Integer` (default `10 * nthreads()`):\n    * Determines the number of chunks (that will eventually be put into the shared workqueue).\n    * Increasing `nchunks` can help with [load balancing](https://en.wikipedia.org/wiki/Load_balancing_(computing)). 
For `nchunks <= nthreads()` there are not enough chunks for any load balancing.\n- `chunksize::Integer` (default not set)\n    * Specifies the desired chunk size (instead of the number of chunks).\n    * The options `chunksize` and `nchunks` are **mutually exclusive** (only one may be a positive integer).\n- `minchunksize::Union{Integer, Nothing}` (default `nothing`)\n    * Sets a lower bound on the size of chunks. This argument takes priority over `nchunks`, so `treduce(+, 1:10; nchunks=10, minchunksize=5)` will only operate on `2` chunks for example.\n- `split::Union{Symbol, OhMyThreads.Split}` (default `OhMyThreads.RoundRobin()`):\n    * Determines how the collection is divided into chunks (if chunking=true).\n    * See [ChunkSplitters.jl](https://github.com/JuliaFolds2/ChunkSplitters.jl) for more details and available options. We also allow users to pass `:consecutive` in place of `Consecutive()`, and `:roundrobin` in place of `RoundRobin()`\n\"\"\"\nstruct GreedyScheduler{C <: ChunkingMode, S <: Split} <: Scheduler\n    ntasks::Int\n    chunking_args::ChunkingArgs{C, S}\n\n    function GreedyScheduler(ntasks::Integer, ca::ChunkingArgs)\n        ntasks > 0 || throw(ArgumentError(\"ntasks must be a positive integer\"))\n        return new{chunking_mode(ca), typeof(ca.split)}(ntasks, ca)\n    end\nend\n\nfunction GreedyScheduler(;\n        ntasks::Integer = nthreads(),\n        nchunks = nothing,\n        chunksize = nothing,\n        minchunksize = nothing,\n        split::Union{Split, Symbol} = RoundRobin(),\n        chunking::Bool = false\n)\n    if !(isnothing(nchunks) && isnothing(chunksize))\n        chunking = true\n    end\n    ca = ChunkingArgs(GreedyScheduler;\n        n = nchunks, size = chunksize, minsize = minchunksize, split, chunking)\n    return GreedyScheduler(ntasks, ca)\nend\nfrom_symbol(::Val{:greedy}) = GreedyScheduler\nchunking_args(sched::GreedyScheduler) = sched.chunking_args\ndefault_nchunks(::Type{GreedyScheduler}) = 10 * 
nthreads(:default)\n\nfunction Base.show(io::IO, mime::MIME{Symbol(\"text/plain\")}, s::GreedyScheduler)\n    print(io, \"GreedyScheduler\", \"\\n\")\n    println(io, \"├ Num. tasks: \", s.ntasks)\n    cstr = _chunkingstr(s)\n    println(io, \"├ Chunking: \", cstr)\n    print(io, \"└ Threadpool: default\")\nend\n\n\"\"\"\n    SerialScheduler (aka :serial)\n\nA scheduler for turning off any multithreading and running the code in serial. It aims to\nmake parallel functions like, e.g., `tmapreduce(sin, +, 1:100)` behave like their serial\ncounterparts, e.g., `mapreduce(sin, +, 1:100)`.\n\nNote that `SerialScheduler` has no arguments and will ignore any that are passed\nto it. This is to make it easier to switch to the serial scheduler without\nhaving to change the rest of the code.\n\"\"\"\nstruct SerialScheduler <: Scheduler\n    # Dummy constructor to allow ignoring settings for other schedulers\n    SerialScheduler(; _...) = new()\nend\nfrom_symbol(::Val{:serial}) = SerialScheduler\n\nend # module\n"
  },
  {
    "path": "src/tools.jl",
    "content": "module Tools\n\nusing Base.Threads: nthreads\n\n\"\"\"\n    nthtid(n)\n\nReturns the thread id of the `n`th Julia thread in the `:default` threadpool.\n\"\"\"\n@inline function nthtid(n)\n    @static if VERSION < v\"1.9\"\n        @boundscheck 1 <= n <= nthreads()\n        return n\n    else\n        @boundscheck 1 <= n <= nthreads(:default)\n        return n + Threads.threadpoolsize(:interactive) # default threads after interactive threads\n    end\nend\n\n\"\"\"\n    taskid() :: UInt\n\nReturn a `UInt` identifier for the current running [Task](https://docs.julialang.org/en/v1/base/parallel/#Core.Task). This identifier will be unique so long as references to the task it came from still exist.\n\"\"\"\ntaskid() = objectid(current_task())\n\n\"\"\"\nMay be used to mark a region in parallel code to be executed by a single task only\n(all other tasks shall skip over it).\n\nSee [`try_enter!`](@ref) and [`reset!`](@ref).\n\"\"\"\nmutable struct OnlyOneRegion\n    @atomic task::Union{Task, Nothing}\n    OnlyOneRegion() = new(nothing)\nend\n\n\"\"\"\n    try_enter!(f, s::OnlyOneRegion)\n\nWhen called from multiple parallel tasks (on a shared `s::OnlyOneRegion`) only a single\ntask will execute `f`.\n\n## Example\n\n```julia\nusing OhMyThreads: @tasks\nusing OhMyThreads.Tools: OnlyOneRegion, try_enter!\n\nonly_one = OnlyOneRegion()\n\n@tasks for i in 1:10\n    @set ntasks = 10\n\n    println(i, \": before\")\n    try_enter!(only_one) do\n        println(i, \": only printed by a single task\")\n        sleep(1)\n    end\n    println(i, \": after\")\nend\n```\n\"\"\"\nfunction try_enter!(f, s::OnlyOneRegion)\n    ct = current_task()\n    t = @atomic :monotonic s.task\n    if !isnothing(t) && ct != t\n        return\n    end\n    if ct == t || (@atomicreplace s.task nothing=>ct).success\n        f()\n    end\n    return\nend\n\n\"\"\"\nReset the `OnlyOneRegion` (so that it can be used again).\n\"\"\"\nfunction reset!(s::OnlyOneRegion)\n    @atomic s.task = 
nothing\n    return\nend\n\n\"\"\"\nSimpleBarrier(n::Integer)\n\nSimple reusable barrier for `n` parallel tasks.\n\nGiven `b = SimpleBarrier(n)` and `n` parallel tasks, each task that calls\n`wait(b)` will block until the other `n-1` tasks have called `wait(b)` as well.\n\n## Example\n```\nn = nthreads()\nbarrier = SimpleBarrier(n)\n@sync for i in 1:n\n    @spawn begin\n        println(\"A\")\n        wait(barrier) # synchronize all tasks\n        println(\"B\")\n        wait(barrier) # synchronize all tasks (reusable)\n        println(\"C\")\n    end\nend\n```\n\"\"\"\nmutable struct SimpleBarrier\n    const n::Int64\n    const c::Threads.Condition\n    cnt::Int64\n\n    function SimpleBarrier(n::Integer)\n        new(n, Threads.Condition(), 0)\n    end\nend\n\nfunction Base.wait(b::SimpleBarrier)\n    lock(b.c)\n    try\n        b.cnt += 1\n        if b.cnt == b.n\n            b.cnt = 0\n            notify(b.c)\n        else\n            wait(b.c)\n        end\n    finally\n        unlock(b.c)\n    end\nend\n\nend # Tools\n"
  },
  {
    "path": "src/types.jl",
    "content": "\"\"\"\n    struct WithTaskLocals{F, TLVs <: Tuple{Vararg{TaskLocalValue}}} <: Function\n\nThis callable function-like object is meant to represent a function which closes over some\n[`TaskLocalValues`](https://github.com/vchuravy/TaskLocalValues.jl). That is, if you do\n\n```\nTLV{T} = TaskLocalValue{T}\nf = WithTaskLocals((TLV{Int}(() -> 1), TLV{Int}(() -> 2))) do (x, y)\n    z -> (x + y)/z\nend\n```\nthen that is equivalent to\n```\ng = let x = TLV{Int}(() -> 1), y = TLV{Int}(() -> 2)\n    z -> let x = x[], y=y[]\n        (x + y)/z\n    end\nend\n```\nhowever, the main difference is that you can call [`promise_task_local`](@ref) on a\n`WithTaskLocals` closure in order to turn it into something equivalent to\n```\nlet x=x[], y=y[]\n    z -> (x + y)/z\nend\n```\nwhich doesn't have the overhead of accessing the `task_local_storage` each time the closure is called.\nThis of course will lose the safety advantages of `TaskLocalValue`, so you should never do\n`f_local = promise_task_local(f)` and then pass `f_local` to some unknown function, because if that\nunknown function calls `f_local` on a new task, you'll hit a race condition.\n\"\"\"\nstruct WithTaskLocals{F, TLVs <: Tuple{Vararg{TaskLocalValue}}} <: Function\n    inner_func::F\n    tasklocals::TLVs\nend\n\n\"\"\"\n    promise_task_local(f) = f\n    promise_task_local(f::WithTaskLocals) = f.inner_func(map(x -> x[], f.tasklocals))\n\nTake a `WithTaskLocals` closure, grab the `TaskLocalValue`s, and passes them to the closure. 
That is,\nit turns a `WithTaskLocals` closure from the equivalent of\n```\nTLV{T} = TaskLocalValue{T}\nlet x = TLV{Int}(() -> 1), y = TLV{Int}(() -> 2)\n    z -> let x = x[], y=y[]\n        (x + y)/z\n    end\nend\n```\ninto the equivalent of\n```\nlet x = TLV{Int}(() -> 1), y = TLV{Int}(() -> 2)\n    let x = x[], y = y[]\n        z -> (x + y)/z\n    end\nend\n```\nwhich doesn't have the overhead of accessing the `task_local_storage` each time the closure is called.\nThis of course will lose the safety advantages of `TaskLocalValue`, so you should never do\n`f_local = promise_task_local(f)` and then pass `f_local` to some unknown function, because if that\nunknown function calls `f_local` on a new task, you'll hit a race condition.\n\"\"\"\nfunction promise_task_local(f::WithTaskLocals{F}) where {F}\n    f.inner_func(map(x -> x[], f.tasklocals))\nend\npromise_task_local(f::Any) = f\n\nfunction (f::WithTaskLocals{F})(args...; kwargs...) where {F}\n    promise_task_local(f)(args...; kwargs...)\nend\n\n\"\"\"\n    ChannelLike(itr)\n\nThis struct wraps an indexable object such that it can be iterated by concurrent tasks in a\nsafe manner similar to a `Channel`.\n\n`ChannelLike(itr)` is conceptually similar to:\n```julia\nChannel{eltype(itr)}(length(itr)) do ch\n    foreach(i -> put!(ch, i), itr)\nend\n```\ni.e. creating a channel, `put!`ing all elements of `itr` into it and closing it. 
The\nadvantage is that `ChannelLike` doesn't copy the data.\n\n# Examples\n```julia\nch = OhMyThreads.ChannelLike(1:5)\n\n@sync for taskid in 1:2\n    Threads.@spawn begin\n        for i in ch\n            println(\"Task #\\$taskid processing item \\$i\")\n            sleep(1 / i)\n        end\n    end\nend\n\n# output\n\nTask #1 processing item 1\nTask #2 processing item 2\nTask #2 processing item 3\nTask #2 processing item 4\nTask #1 processing item 5\n```\n\nNote that `ChannelLike` is stateful (just like a `Channel`), so you can't iterate over it\ntwice.\n\nThe wrapped iterator must support `firstindex(itr)::Int`, `lastindex(itr)::Int` and\n`getindex(itr, ::Int)`.\n\"\"\"\nmutable struct ChannelLike{T}\n    const itr::T\n    @atomic idx::Int\n    function ChannelLike(itr::T) where {T}\n        return new{T}(itr, firstindex(itr) - 1)\n    end\nend\n\nBase.length(ch::ChannelLike) = length(ch.itr)\nBase.eltype(ch::ChannelLike) = eltype(ch.itr)\n\nfunction Base.iterate(ch::ChannelLike, ::Nothing = nothing)\n    this = @atomic ch.idx += 1\n    if this <= lastindex(ch.itr)\n        return (@inbounds(ch.itr[this]), nothing)\n    else\n        return nothing\n    end\nend\n"
  },
  {
    "path": "test/Aqua.jl",
    "content": "using Aqua\n\n@testset \"Aqua.jl\" begin\n  Aqua.test_all(\n    OhMyThreads;\n    # ambiguities=(exclude=[SomePackage.some_function], broken=true),\n    # stale_deps=(ignore=[:SomePackage],),\n    deps_compat=(ignore=[:Test],),\n    # piracies=false,\n    persistent_tasks=false,\n  )\nend\n"
  },
  {
    "path": "test/runtests.jl",
    "content": "using Test, OhMyThreads\nusing OhMyThreads: TaskLocalValue, WithTaskLocals, @fetch, promise_task_local\nusing OhMyThreads: Consecutive, RoundRobin\nusing OhMyThreads.Experimental: @barrier\nusing OhMyThreads.Implementation: BoxedVariableError\n\n@info \"Testing with $(Threads.nthreads(:default)),$(Threads.nthreads(:interactive)) threads.\"\n\ninclude(\"Aqua.jl\")\n\nsets_to_test = [(~ = isapprox, f = sin ∘ *, op = +,\n                    itrs = (rand(ComplexF64, 10, 10), rand(-10:10, 10, 10)),\n                    init = complex(0.0))\n                (~ = isapprox, f = cos, op = max, itrs = (1:100000,), init = 0.0)\n                (~ = (==), f = round, op = vcat, itrs = (randn(1000),), init = Float64[])\n                (~ = (==), f = last, op = *,\n                    itrs = ([1 => \"a\", 2 => \"b\", 3 => \"c\", 4 => \"d\", 5 => \"e\"],),\n                    init = \"\")]\n\nChunkedGreedy(; kwargs...) = GreedyScheduler(; kwargs...)\n\n@testset \"Basics\" begin\n    for (; ~, f, op, itrs, init) in sets_to_test\n        @testset \"f=$f, op=$op, itrs::$(typeof(itrs))\" begin\n            @testset for sched in (\n                StaticScheduler, DynamicScheduler, GreedyScheduler,\n                DynamicScheduler{OhMyThreads.Schedulers.NoChunking},\n                SerialScheduler, ChunkedGreedy)\n                @testset for split in (Consecutive(), RoundRobin(), :consecutive, :roundrobin)\n                    for nchunks in (1, 2, 6)\n                        for minchunksize ∈ (nothing, 1, 3)\n                            if sched == GreedyScheduler\n                                scheduler = sched(; ntasks = nchunks, minchunksize)\n                            elseif sched == DynamicScheduler{OhMyThreads.Schedulers.NoChunking}\n                                scheduler = DynamicScheduler(; chunking = false)\n                            elseif sched == SerialScheduler\n                                scheduler = SerialScheduler(; nchunks)\n         
                   else\n                                scheduler = sched(; nchunks, split, minchunksize)\n                            end\n                            kwargs = (; scheduler)\n                            if (split in (RoundRobin(), :roundrobin) ||\n                                sched ∈ (GreedyScheduler, ChunkedGreedy)) || op ∉ (vcat, *)\n                                # scatter and greedy only works for commutative operators!\n                            else\n                                mapreduce_f_op_itr = mapreduce(f, op, itrs...)\n                                @test tmapreduce(f, op, itrs...; init, kwargs...) ~ mapreduce_f_op_itr\n                                @test treducemap(op, f, itrs...; init, kwargs...) ~ mapreduce_f_op_itr\n                                @test treduce(op, f.(itrs...); init, kwargs...) ~ mapreduce_f_op_itr\n                            end\n\n                            split in (RoundRobin(), :roundrobin) && continue\n                            map_f_itr = map(f, itrs...)\n                            @test all(tmap(f, Any, itrs...; kwargs...) .~ map_f_itr)\n                            @test all(tcollect(Any, (f(x...) for x in collect(zip(itrs...))); kwargs...) .~ map_f_itr)\n                            @test all(tcollect(Any, f.(itrs...); kwargs...) .~ map_f_itr)\n\n                            RT = Core.Compiler.return_type(f, Tuple{eltype.(itrs)...})\n\n                            @test tmap(f, RT, itrs...; kwargs...) ~ map_f_itr\n                            @test tcollect(RT, (f(x...) for x in collect(zip(itrs...))); kwargs...) ~ map_f_itr\n                            @test tcollect(RT, f.(itrs...); kwargs...) ~ map_f_itr\n\n                            if sched ∉ (GreedyScheduler, ChunkedGreedy)\n                                @test tmap(f, itrs...; kwargs...) ~ map_f_itr\n                                @test tcollect((f(x...) for x in collect(zip(itrs...))); kwargs...) 
~ map_f_itr\n                                @test tcollect(f.(itrs...); kwargs...) ~ map_f_itr\n                            end\n                        end\n                    end\n                end\n            end\n        end\n    end\nend;\n\n@testset \"ChunkSplitters.Chunk\" begin\n    x = rand(100)\n    chnks = OhMyThreads.index_chunks(x; n = Threads.nthreads())\n    for scheduler in (\n        DynamicScheduler(),\n        DynamicScheduler(; chunking = false),\n        StaticScheduler(; chunking = false))\n        @testset \"$scheduler\" begin\n            @test tmap(x -> sin.(x), chnks; scheduler) ≈ map(x -> sin.(x), chnks)\n            @test tmapreduce(x -> sin.(x), vcat, chnks; scheduler) ≈\n                  mapreduce(x -> sin.(x), vcat, chnks)\n            @test tcollect(chnks; scheduler) == collect(chnks)\n            @test treduce(vcat, chnks; scheduler) == reduce(vcat, chnks)\n            @test isnothing(tforeach(x -> sin.(x), chnks; scheduler))\n        end\n    end\n\n    # enumerate(chunks)\n    data = 1:100\n    @test tmapreduce(+, enumerate(OhMyThreads.index_chunks(data; n=5)); chunking=false) do (i, idcs)\n        [i, sum(@view(data[idcs]))]\n    end == [sum(1:5), sum(data)]\n    @test tmapreduce(+, enumerate(OhMyThreads.index_chunks(data; size=5)); chunking=false) do (i, idcs)\n        [i, sum(@view(data[idcs]))]\n    end == [sum(1:20), sum(data)]\n    @test tmap(enumerate(OhMyThreads.index_chunks(data; n=5)); chunking=false) do (i, idcs)\n        [i, idcs]\n    end == [[1, 1:20], [2, 21:40], [3, 41:60], [4, 61:80], [5, 81:100]]\nend;\n\n@testset \"macro API\" begin\n    # basic\n    @test @tasks(for i in 1:3\n        i\n    end) |> isnothing\n\n    # reduction\n    @test @tasks(for i in 1:3\n        @set reducer = (+)\n        i\n    end) == 6\n\n    # scheduler settings\n    for sched in (StaticScheduler(), DynamicScheduler(), GreedyScheduler())\n        @test @tasks(for i in 1:3\n            @set scheduler = sched\n            i\n       
 end) |> isnothing\n    end\n    # scheduler settings as symbols\n    @test @tasks(for i in 1:3\n        @set scheduler = :static\n        i\n    end) |> isnothing\n    @test @tasks(for i in 1:3\n        @set scheduler = :dynamic\n        i\n    end) |> isnothing\n    @test @tasks(for i in 1:3\n        @set scheduler = :greedy\n        i\n    end) |> isnothing\n\n    # @set begin ... end\n    @test @tasks(for i in 1:10\n        @set begin\n            scheduler = StaticScheduler()\n            reducer = (+)\n        end\n        i\n    end) == 55\n    # multiple @set\n    @test @tasks(for i in 1:10\n        @set scheduler = StaticScheduler()\n        i\n        @set reducer = (+)\n    end) == 55\n    # @set init\n    @test @tasks(for i in 1:10\n        @set begin\n            reducer = (+)\n            init = 0.0\n        end\n        i\n    end) === 55.0\n    @test @tasks(for i in 1:10\n        @set begin\n            reducer = (+)\n            init = 0.0 * im\n        end\n        i\n    end) === (55.0 + 0.0im)\n\n    # top-level \"kwargs\"\n    @test @tasks(for i in 1:3\n        @set scheduler = :static\n        @set ntasks = 1\n        i\n    end) |> isnothing\n    @test @tasks(for i in 1:3\n        @set scheduler = :static\n        @set nchunks = 2\n        i\n    end) |> isnothing\n    @test @tasks(for i in 1:3\n        @set scheduler = :dynamic\n        @set chunksize = 2\n        i\n    end) |> isnothing\n    @test @tasks(for i in 1:3\n        @set scheduler = :dynamic\n        @set chunking = false\n        i\n    end) |> isnothing\n    @test @tasks(for i in 1:4\n        @set minchunksize=2\n        i\n    end) |> isnothing\n    @test_throws ArgumentError @tasks(for i in 1:3\n        @set scheduler = DynamicScheduler()\n        @set chunking = false\n        i\n    end)\n    @test_throws MethodError @tasks(for i in 1:3\n        @set scheduler = :dynamic\n        @set asd = 123\n        i\n    end)\n\n    # TaskLocalValue\n    ntd = 2 * Threads.nthreads()\n 
   ptrs = Vector{Ptr{Nothing}}(undef, ntd)\n    tids = Vector{UInt64}(undef, ntd)\n    tid() = OhMyThreads.Tools.taskid()\n    @test @tasks(for i in 1:ntd\n        @local C::Vector{Float64} = rand(3)\n        @set scheduler = :static\n        ptrs[i] = pointer_from_objref(C)\n        tids[i] = tid()\n    end) |> isnothing\n    # check that different iterations of a task\n    # have access to the same C (same pointer)\n    for t in unique(tids)\n        @test allequal(ptrs[findall(==(t), tids)])\n    end\n    # TaskLocalValue (another fundamental check)\n    @test @tasks(for i in 1:ntd\n        @local x::Ref{Int64} = Ref(0)\n        @set reducer = (+)\n        @set scheduler = :static\n        x[] += 1\n        x[]\n    end) == 1.5 * ntd # if a new x would be allocated per iteration, we'd get ntd here.\n    # TaskLocalValue (begin ... end block), inferred TLV type\n    @test @inferred (() -> @tasks for i in 1:10\n        @local begin\n            C = fill(4, 3, 3)\n            x = fill(5.0, 3)\n        end\n        @set reducer = (+)\n        sum(C * x)\n    end)() == 1800\n\n    # hygiene / escaping\n    var = 3\n    sched = StaticScheduler()\n    sched_sym = :static\n    data = rand(10)\n    red = (a, b) -> a + b\n    n = 2\n    @test @tasks(for d in data\n        @set scheduler = sched\n        @set reducer = red\n        var * d\n    end) ≈ var * sum(data)\n    @test @tasks(for d in data\n        @set scheduler = sched_sym\n        @set ntasks = n\n        @set reducer = red\n        var * d\n    end) ≈ var * sum(data)\n\n    struct SingleInt\n        x::Int\n    end\n    @test @tasks(for _ in 1:10\n        @local C = SingleInt(var)\n        @set reducer = +\n        C.x\n    end) == 10 * var\n\n    # enumerate(chunks)\n    let data = collect(1:100)\n        @test @tasks(for (i, idcs) in enumerate(OhMyThreads.index_chunks(data; n=5))\n                         @set reducer = +\n                             @set chunking = false\n                         [i, 
sum(@view(data[idcs]))]\n                     end) == [sum(1:5), sum(data)]\n        @test @tasks(for (i, idcs) in enumerate(OhMyThreads.index_chunks(data; size=5))\n                         @set reducer = +\n                             [i, sum(@view(data[idcs]))]\n                     end) == [sum(1:20), sum(data)]\n        @test @tasks(for (i, idcs) in enumerate(OhMyThreads.index_chunks(1:100; n=5))\n                         @set chunking=false\n                         @set collect=true\n                         [i, idcs]\n                     end) == [[1, 1:20], [2, 21:40], [3, 41:60], [4, 61:80], [5, 81:100]]\n    end\nend;\n\n@testset \"WithTaskLocals\" begin\n    let x = TaskLocalValue{Base.RefValue{Int}}(() -> Ref{Int}(0)),\n        y = TaskLocalValue{Base.RefValue{Int}}(() -> Ref{Int}(0))\n        # Equivalent to\n        # function f()\n        #    x[][] += 1\n        #    y[][] += 1\n        #    x[][], y[][]\n        # end\n        f = WithTaskLocals((x, y)) do (x, y)\n            function ()\n                x[] += 1\n                y[] += 1\n                x[], y[]\n            end\n        end\n        # Make sure we can call `f` like a regular function\n        @test f() == (1, 1)\n        @test f() == (2, 2)\n        @test @fetch(f()) == (1, 1)\n        # Acceptable use of promise_task_local\n        @test @fetch(promise_task_local(f)()) == (1, 1)\n        # Acceptable use of promise_task_local\n        @test promise_task_local(f)() == (3, 3)\n        # Acceptable use of promise_task_local\n        @test @fetch(promise_task_local(f)()) == (1, 1)\n        # Acceptable use of promise_task_local\n        g() = @fetch((promise_task_local(f)(); promise_task_local(f)(); f()))\n        @test g() == (3, 3)\n        @test g() == (3, 3)\n\n        h = promise_task_local(f)\n        # Unacceptable use of `promise_task_local`\n        # This is essentially testing that if you use `promise_task_local`, then pass that to another task,\n        # you could get 
data races, since we here have a different thread writing to another thread's value.\n        @test @fetch(h()) == (4, 4)\n        @test @fetch(h()) == (5, 5)\n    end\nend;\n\n@testset \"chunking mode + chunksize option\" begin\n    @test OhMyThreads.Schedulers.chunking_mode(SerialScheduler()) ==\n          OhMyThreads.Schedulers.NoChunking\n    for sched in (DynamicScheduler, StaticScheduler, GreedyScheduler)\n        @test sched() isa sched\n        @test sched(; chunksize = 2) isa sched\n\n        @test OhMyThreads.Schedulers.chunking_mode(sched(; chunksize = 2)) ==\n              OhMyThreads.Schedulers.FixedSize\n        @test OhMyThreads.Schedulers.chunking_mode(sched(; nchunks = 2)) ==\n              OhMyThreads.Schedulers.FixedCount\n        @test OhMyThreads.Schedulers.chunking_mode(sched(; chunking = false)) ==\n              OhMyThreads.Schedulers.NoChunking\n        if sched != GreedyScheduler\n            # For (Dynamic|Static)Scheduler `chunking = false` disables all chunking\n            # arguments\n            @test OhMyThreads.Schedulers.chunking_mode(sched(;\n                nchunks = 2, chunksize = 4, chunking = false)) ==\n                  OhMyThreads.Schedulers.NoChunking\n            @test OhMyThreads.Schedulers.chunking_mode(sched(;\n                nchunks = nothing, chunksize = nothing, split = :whatever, chunking = false)) ==\n                  OhMyThreads.Schedulers.NoChunking\n            @test OhMyThreads.Schedulers.chunking_enabled(sched(;\n                nchunks = nothing, chunksize = nothing, chunking = false)) == false\n            @test OhMyThreads.Schedulers.chunking_enabled(sched(;\n                nchunks = 2, chunksize = 4, chunking = false)) == false\n        else\n            # For GreedyScheduler `nchunks` or `chunksize` overrides `chunking = false`\n            @test OhMyThreads.Schedulers.chunking_mode(sched(;\n                nchunks = 2, chunking = false)) ==\n                  OhMyThreads.Schedulers.FixedCount\n      
      @test OhMyThreads.Schedulers.chunking_mode(sched(;\n                chunksize = 2, chunking = false)) ==\n                  OhMyThreads.Schedulers.FixedSize\n            @test OhMyThreads.Schedulers.chunking_enabled(sched(;\n                nchunks = 2, chunking = false)) == true\n            @test OhMyThreads.Schedulers.chunking_enabled(sched(;\n                chunksize = 4, chunking = false)) == true\n        end\n        @test OhMyThreads.Schedulers.chunking_enabled(sched(; chunksize = 2)) == true\n        @test OhMyThreads.Schedulers.chunking_enabled(sched(; nchunks = 2)) == true\n        @test_throws ArgumentError sched(; nchunks = 2, chunksize = 3)\n        @test_throws ArgumentError sched(; nchunks = 2, split = :whatever)\n\n        let scheduler = sched(; chunksize = 2, split = :batch)\n            @test tmapreduce(sin, +, 1:10; scheduler, init=0.0) ≈ mapreduce(sin, +, 1:10)\n            @test treduce(+, 1:10; scheduler, init=0.0) ≈ reduce(+, 1:10)\n            @test tmap(sin, Float64, 1:10; scheduler) ≈ map(sin, 1:10)\n            @test isnothing(tforeach(sin, 1:10; scheduler))\n        end\n    end\nend;\n\n@testset \"top-level kwargs\" begin\n    res_tmr = mapreduce(sin, +, 1:10000)\n\n    # scheduler not given\n    @test tmapreduce(sin, +, 1:10000; ntasks = 2) ≈ res_tmr\n    @test tmapreduce(sin, +, 1:10000; nchunks = 2) ≈ res_tmr\n    @test tmapreduce(sin, +, 1:10000; split = RoundRobin()) ≈ res_tmr\n    @test tmapreduce(sin, +, 1:10000; chunksize = 2) ≈ res_tmr\n    @test tmapreduce(sin, +, 1:10000; chunking = false) ≈ res_tmr\n    @test tmapreduce(sin, +, 1:10000; minchunksize=10) ≈ res_tmr\n    @test tmapreduce(sin, +, 1:10; minchunksize=10) == mapreduce(sin, +, 1:10)\n\n    # scheduler isa Scheduler\n    @test tmapreduce(sin, +, 1:10000; scheduler = StaticScheduler()) ≈ res_tmr\n    @test_throws ArgumentError tmapreduce(\n        sin, +, 1:10000; ntasks = 2, scheduler = DynamicScheduler())\n    @test_throws ArgumentError tmapreduce(\n        
sin, +, 1:10000; chunksize = 2, scheduler = DynamicScheduler())\n    @test_throws ArgumentError tmapreduce(\n        sin, +, 1:10000; split = RoundRobin(), scheduler = StaticScheduler())\n    @test_throws ArgumentError tmapreduce(\n        sin, +, 1:10000; ntasks = 3, scheduler = SerialScheduler())\n\n    # scheduler isa Symbol\n    for s in (:dynamic, :static, :serial, :greedy)\n        @test tmapreduce(sin, +, 1:10000; scheduler = s, init = 0.0) ≈ res_tmr\n    end\n    for s in (:dynamic, :static, :greedy)\n        @test tmapreduce(sin, +, 1:10000; ntasks = 2, scheduler = s, init = 0.0) ≈ res_tmr\n    end\n    for s in (:dynamic, :static)\n        @test tmapreduce(sin, +, 1:10000; chunksize = 2, scheduler = s) ≈ res_tmr\n        @test tmapreduce(sin, +, 1:10000; chunking = false, scheduler = s) ≈ res_tmr\n        @test tmapreduce(sin, +, 1:10000; nchunks = 3, scheduler = s) ≈ res_tmr\n        @test tmapreduce(sin, +, 1:10000; ntasks = 3, scheduler = s) ≈ res_tmr\n        @test_throws ArgumentError tmapreduce(\n            sin, +, 1:10000; ntasks = 3, nchunks = 2, scheduler = s)≈res_tmr\n    end\n    @test_throws ArgumentError tmapreduce(sin, +, 1:10000; scheduler = :whatever)\n    @test_throws ArgumentError tmapreduce(\n        sin, +, 1:10000; threadpool = :whatever, chunking = false)\nend;\n\n@testset \"empty collections\" begin\n    @static if VERSION < v\"1.11.0-\"\n        err = MethodError\n    else\n        err = ArgumentError\n    end\n    for empty_coll in (11:9, Float64[])\n        for f in (sin, x -> im * x, identity)\n            for op in (+, *, min)\n                # mapreduce\n                for init in (0.0, 0, 0.0 * im, 0.0f0)\n                    @test tmapreduce(f, op, empty_coll; init) == init\n                end\n                # foreach\n                @test tforeach(f, empty_coll) |> isnothing\n                # reduce\n                if op != min\n                    @test treduce(op, empty_coll) == reduce(op, empty_coll)\n           
     else\n                    @test_throws err treduce(op, empty_coll)\n                end\n                # map\n                @test tmap(f, empty_coll) == map(f, empty_coll)\n                # collect\n                @test tcollect(empty_coll) == collect(empty_coll)\n            end\n        end\n    end\nend;\n\n# for testing @one_by_one region\nmutable struct SingleAccessOnly\n    in_use::Bool\n    const lck::ReentrantLock\n    SingleAccessOnly() = new(false, ReentrantLock())\nend\nfunction acquire(f, o::SingleAccessOnly)\n    lock(o.lck) do\n        o.in_use && throw(ErrorException(\"Already in use!\"))\n        o.in_use = true\n    end\n    try\n        f()\n    finally\n        lock(o.lck) do\n            !o.in_use && throw(ErrorException(\"Conflict!\"))\n            o.in_use = false\n        end\n    end\nend\n\n@testset \"regions\" begin\n    @testset \"@one_by_one\" begin\n        sao = SingleAccessOnly()\n\n        try\n            @tasks for i in 1:10\n                @set ntasks = 10\n                @one_by_one begin\n                    acquire(sao) do\n                        sleep(0.01)\n                    end\n                end\n            end\n        catch ErrorException\n            @test false\n        else\n            @test true\n        end\n\n\n        # test escaping\n        let\n            x = Ref(0)\n            y = Ref(0)\n            @tasks for i in 1:10\n                @set ntasks = 10\n\n                y[] += 1 # not safe (race condition)\n                @one_by_one begin\n                    x[] += 1 # parallel-safe because inside of one_by_one region\n                    acquire(sao) do\n                        sleep(0.01)\n                    end\n                end\n            end\n            @test x[] == 10\n\n        end\n\n        test_f = () -> begin\n            x = Ref(0)\n            y = Ref(0)\n            @tasks for i in 1:10\n                @set ntasks = 10\n\n                y[] += 1 # not safe 
(race condition)\n                @one_by_one begin\n                    x[] += 1 # parallel-safe because inside of one_by_one region\n                    acquire(sao) do\n                        sleep(0.01)\n                    end\n                end\n            end\n            return x[]\n        end\n        @test test_f() == 10\n    end\n\n    @testset \"@only_one\" begin\n        let\n            x = Ref(0)\n            y = Ref(0)\n            try\n                @tasks for i in 1:10\n                    @set ntasks = 10\n\n                    y[] += 1 # not safe (race condition)\n                    @only_one begin\n                        x[] += 1 # parallel-safe because only a single task will execute this\n                    end\n                end\n                @test x[] == 1 # only a single task should have incremented x\n            catch ErrorException\n                @test false\n            end\n        end\n\n        let\n            x = Ref(0)\n            y = Ref(0)\n            try\n                @tasks for i in 1:10\n                    @set ntasks = 2\n\n                    y[] += 1 # not safe (race condition)\n                    @only_one begin\n                        x[] += 1 # parallel-safe because only a single task will execute this\n                    end\n                end\n                @test x[] == 5 # a single task should have incremented x 5 times\n            catch ErrorException\n                @test false\n            end\n        end\n\n        test_f = () -> begin\n            x = Ref(0)\n            y = Ref(0)\n            @tasks for i in 1:10\n                @set ntasks = 2\n\n                y[] += 1 # not safe (race condition)\n                @only_one begin\n                    x[] += 1 # parallel-safe because only a single task will execute this\n                end\n            end\n            return x[]\n        end\n        @test test_f() == 5\n    end\n\n    @testset \"@only_one + @one_by_one\" 
begin\n        x = Ref(0)\n        y = Ref(0)\n        try\n            @tasks for i in 1:10\n                @set ntasks = 10\n\n                @only_one begin\n                    x[] += 1 # parallel-safe\n                end\n\n                @one_by_one begin\n                    y[] += 1 # parallel-safe\n                end\n            end\n            @test x[] == 1 && y[] == 10\n        catch ErrorException\n            @test false\n        end\n    end\nend;\n\n@testset \"@barrier\" begin\n    @test (@tasks for i in 1:20\n        @set ntasks = 20\n        @barrier\n    end) |> isnothing\n\n    @test try\n        @macroexpand @tasks for i in 1:20\n            @barrier\n        end\n        false\n    catch\n        true\n    end\n\n    @test try\n        x = Threads.Atomic{Int64}(0)\n        y = Threads.Atomic{Int64}(0)\n        @tasks for i in 1:20\n            @set ntasks = 20\n\n            Threads.atomic_add!(x, 1)\n            @barrier\n            if x[] < 20 && y[] > 0 # x hasn't reached 20 yet and y is already > 0\n                error(\"shouldn't happen\")\n            end\n            Threads.atomic_add!(y, 1)\n        end\n        true\n    catch ErrorException\n        false\n    end\n\n    @test try\n        x = Threads.Atomic{Int64}(0)\n        y = Threads.Atomic{Int64}(0)\n        @tasks for i in 1:20\n            @set ntasks = 20\n\n            Threads.atomic_add!(x, 1)\n            @barrier\n            Threads.atomic_add!(x, 1)\n            @barrier\n            if x[] < 40 && y[] > 0 # x hasn't reached 40 yet and y is already > 0\n                error(\"shouldn't happen\")\n            end\n            Threads.atomic_add!(y, 1)\n        end\n        true\n    catch ErrorException\n        false\n    end\nend\n\n@testset \"verbose special macro usage\" begin\n    # OhMyThreads.@set\n    @test @tasks(for i in 1:3\n        OhMyThreads.@set reducer = (+)\n        i\n    end) == 6\n    @test @tasks(for i in 1:3\n        OhMyThreads.@set 
begin\n            reducer = (+)\n        end\n        i\n    end) == 6\n    # OhMyThreads.@local\n    ntd = 2 * Threads.nthreads()\n    @test @tasks(for i in 1:ntd\n        OhMyThreads.@local x::Ref{Int64} = Ref(0)\n        OhMyThreads.@set begin\n            reducer = (+)\n            scheduler = :static\n        end\n        x[] += 1\n        x[]\n    end) == @tasks(for i in 1:ntd\n        @local x::Ref{Int64} = Ref(0)\n        @set begin\n            reducer = (+)\n            scheduler = :static\n        end\n        x[] += 1\n        x[]\n    end)\n    # OhMyThreads.@only_one\n    let\n        x = Ref(0)\n        y = Ref(0)\n        try\n            @tasks for i in 1:10\n                OhMyThreads.@set ntasks = 10\n\n                y[] += 1 # not safe (race condition)\n                OhMyThreads.@only_one begin\n                    x[] += 1 # parallel-safe because only a single task will execute this\n                end\n            end\n            @test x[] == 1 # only a single task should have incremented x\n        catch ErrorException\n            @test false\n        end\n    end\n    # OhMyThreads.@one_by_one\n    test_f = () -> begin\n        sao = SingleAccessOnly()\n        x = Ref(0)\n        y = Ref(0)\n        @tasks for i in 1:10\n            OhMyThreads.@set ntasks = 10\n\n            y[] += 1 # not safe (race condition)\n            OhMyThreads.@one_by_one begin\n                x[] += 1 # parallel-safe because inside of one_by_one region\n                acquire(sao) do\n                    sleep(0.01)\n                end\n            end\n        end\n        return x[]\n    end\n    @test test_f() == 10\nend\n\n@testset \"show schedulers\" begin\n    nt = Threads.nthreads(:default)\n\n    @test repr(\"text/plain\", DynamicScheduler()) ==\n          \"\"\"\n          DynamicScheduler\n          ├ Chunking: fixed count ($nt), split :consecutive\n          └ Threadpool: default\"\"\"\n\n    @test repr(\n        \"text/plain\", 
DynamicScheduler(; chunking = false, threadpool = :interactive)) ==\n          \"\"\"\n          DynamicScheduler\n          ├ Chunking: none\n          └ Threadpool: interactive\"\"\"\n\n    @test repr(\"text/plain\", StaticScheduler()) ==\n          \"\"\"StaticScheduler\n          ├ Chunking: fixed count ($nt), split :consecutive\n          └ Threadpool: default\"\"\"\n\n    @test repr(\"text/plain\", StaticScheduler(; chunksize = 2, split = :scatter)) ==\n          \"\"\"\n          StaticScheduler\n          ├ Chunking: fixed size (2), split :roundrobin\n          └ Threadpool: default\"\"\"\n\n    @test repr(\"text/plain\", GreedyScheduler(; chunking = true)) ==\n          \"\"\"\n         GreedyScheduler\n         ├ Num. tasks: $nt\n         ├ Chunking: fixed count ($(10 * nt)), split :roundrobin\n         └ Threadpool: default\"\"\"\nend\n\nif Threads.nthreads() > 1\n    @testset \"Boxing detection and error\" begin\n        let\n            f1() = tmap(1:10) do i\n                A = i\n                sleep(rand()/10)\n                A\n            end\n            f2() = tmap(1:10) do i\n                local A = i\n                sleep(rand()/10)\n                A\n            end\n\n            @test f1() == 1:10\n            @test f2() == 1:10\n        end\n\n        let\n            f1() = tmap(1:10) do i\n                A = i\n                sleep(rand()/10)\n                A\n            end\n            f2() = tmap(1:10) do i\n                local A = i\n                sleep(rand()/10)\n                A\n            end\n\n            @test_throws BoxedVariableError f1()\n            @test f2() == 1:10\n\n            A = 1 # Cause spooky action-at-a-distance by making A outer-local to the whole let block!\n        end\n\n        let\n            A = 1\n            f1() = tmap(1:10) do i\n                A = 1\n            end\n            @test_throws BoxedVariableError f1() == ones(10) # Throws even though the redefinition is 
'harmless'\n\n            @allow_boxed_captures begin\n                f2() = tmap(1:10) do i\n                    A = 1\n                end\n                @test f2() == ones(10)\n            end\n\n            # Can nest allow and disallow because they're scoped values!\n            function f3()\n                @disallow_boxed_captures begin\n                    tmap(1:10) do i\n                    A = 1\n                    end\n                end\n            end\n            @allow_boxed_captures begin\n                @test_throws BoxedVariableError f3() == ones(10)\n            end\n        end\n        @testset \"@localize\" begin\n            A = 1\n            if false\n                A = 2\n            end\n            ## This stops A from being boxed!\n            v = @localize A tmap(1:2) do _\n                A\n            end\n            @test v == [1, 1]\n        end\n    end\nend\n\n# TODO: add far more tests, and make the tests easier to work with\n"
  }
]