[
  {
    "path": ".github/dependabot.yml",
    "content": "# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates\nversion: 2\nupdates:\n  - package-ecosystem: \"github-actions\"\n    directory: \"/\" # Location of package manifests\n    schedule:\n      interval: \"weekly\"\n"
  },
  {
    "path": ".github/workflows/CI.yml",
    "content": "name: CI\non:\n  pull_request:\n    branches:\n      - master\n  push:\n    branches:\n      - master\n    tags: '*'\n  workflow_dispatch:\n\nconcurrency:\n  # Skip intermediate builds: all builds except for builds on the `master` branch\n  # Cancel intermediate builds: only pull request builds\n  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref != 'refs/heads/master' || github.run_number }}\n  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}\n\njobs:\n  test:\n    name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ github.event_name }}\n    runs-on: ${{ matrix.os }}\n    strategy:\n      fail-fast: false\n      matrix:\n        version:\n          - 'min'\n          - 'lts'\n          - '1'\n          - 'pre'\n        os:\n          - ubuntu-latest\n          - windows-latest\n          - macOS-latest\n    steps:\n      - uses: actions/checkout@v6\n      - uses: julia-actions/setup-julia@v3\n        with:\n          version: ${{ matrix.version }}\n      - uses: julia-actions/cache@v3\n      - uses: julia-actions/julia-buildpkg@v1\n      - uses: julia-actions/julia-runtest@v1\n      - uses: julia-actions/julia-processcoverage@v1\n      - uses: codecov/codecov-action@v5\n        with:\n          files: lcov.info\n          token: ${{ secrets.CODECOV_TOKEN }}\n          fail_ci_if_error: true\n  docs:\n    name: Documentation\n    runs-on: ubuntu-latest\n    steps:\n      - uses: actions/checkout@v6\n      - uses: julia-actions/setup-julia@v3\n        with:\n          version: '1'\n      - uses: julia-actions/cache@v3\n      - run: julia --project=docs -e 'import Pkg; Pkg.instantiate()'\n      - run: |\n          julia --project=docs -e '\n            using Documenter: doctest\n            using DistributedArrays\n            doctest(DistributedArrays)'\n      - run: julia --project=docs docs/make.jl\n        env:\n          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}\n          DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}\n"
  },
  {
    "path": ".github/workflows/CompatHelper.yml",
    "content": "name: CompatHelper\non:\n  schedule:\n    - cron: 0 0 * * *\n  workflow_dispatch:\njobs:\n  CompatHelper:\n    runs-on: ubuntu-latest\n    steps:\n      - name: \"Add the General registry via Git\"\n        run: |\n          import Pkg\n          ENV[\"JULIA_PKG_SERVER\"] = \"\"\n          Pkg.Registry.add(\"General\")\n        shell: julia --color=yes {0}\n      - name: \"Install CompatHelper\"\n        run: |\n          import Pkg\n          name = \"CompatHelper\"\n          uuid = \"aa819f21-2bde-4658-8897-bab36330d9b7\"\n          version = \"3\"\n          Pkg.add(; name, uuid, version)\n        shell: julia --color=yes {0}\n      - name: \"Run CompatHelper\"\n        run: |\n          import CompatHelper\n          CompatHelper.main(; subdirs = [\"\", \"docs\"])\n        shell: julia --color=yes {0}\n        env:\n          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}\n          COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }}\n"
  },
  {
    "path": ".github/workflows/TagBot.yml",
    "content": "name: TagBot\non:\n  issue_comment:\n    types:\n      - created\n  workflow_dispatch:\njobs:\n  TagBot:\n    if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot'\n    runs-on: ubuntu-latest\n    steps:\n      - uses: JuliaRegistries/TagBot@v1\n        with:\n          token: ${{ secrets.GITHUB_TOKEN }}\n          ssh: ${{ secrets.DOCUMENTER_KEY }}\n"
  },
  {
    "path": ".gitignore",
    "content": "Manifest.toml\n*.jl.cov\n*.jl.mem\n.DS_Store\n.vscode/"
  },
  {
    "path": "LICENSE.md",
    "content": "The DistributedArrays.jl package is licensed under the MIT \"Expat\" License:\n\n> Copyright (c) 2015: Julia Parallel Contributors\n>\n> Permission is hereby granted, free of charge, to any person obtaining\n> a copy of this software and associated documentation files (the\n> \"Software\"), to deal in the Software without restriction, including\n> without limitation the rights to use, copy, modify, merge, publish,\n> distribute, sublicense, and/or sell copies of the Software, and to\n> permit persons to whom the Software is furnished to do so, subject to\n> the following conditions:\n>\n> The above copyright notice and this permission notice shall be\n> included in all copies or substantial portions of the Software.\n>\n> THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND,\n> EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n> MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n> IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n> CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n> TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n> SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n"
  },
  {
    "path": "Project.toml",
    "content": "name = \"DistributedArrays\"\nuuid = \"aaf54ef3-cdf8-58ed-94cc-d582ad619b94\"\nversion = \"0.6.9\"\n\n[deps]\nDistributed = \"8ba89e20-285c-5b6f-9357-94700520ee1b\"\nLinearAlgebra = \"37e2e46d-f89d-539d-b4ee-838fcccc9c8e\"\nPrimes = \"27ebfcd6-29c5-5fa9-bf4b-fb8fc14df3ae\"\nRandom = \"9a3f8284-a2c9-5f02-9a11-845980a1fd5c\"\nSerialization = \"9e88b42a-f829-5b0c-bbe9-9e923198166b\"\n\n[weakdeps]\nSparseArrays = \"2f01184e-e22b-5df5-ae63-d93ebab69eaf\"\nStatistics = \"10745b16-79ce-11e8-11f9-7d13ad32a3b2\"\n\n[extensions]\nSparseArraysExt = \"SparseArrays\"\nStatisticsExt = \"Statistics\"\n\n[compat]\nAqua = \"0.8.12\"\nDistributed = \"<0.0.1, 1\"\nExplicitImports = \"1.13.2\"\nLinearAlgebra = \"<0.0.1, 1\"\nPrimes = \"0.4, 0.5\"\nRandom = \"<0.0.1, 1\"\nSerialization = \"<0.0.1, 1\"\nSparseArrays = \"<0.0.1, 1\"\nSpecialFunctions = \"0.8, 1, 2\"\nStatistics = \"<0.0.1, 1\"\nTest = \"<0.0.1, 1\"\njulia = \"1.10\"\n\n[extras]\nAqua = \"4c88cf16-eb10-579e-8560-4a9242c79595\"\nExplicitImports = \"7d51a73a-1435-4ff3-83d9-f097790105c7\"\nSparseArrays = \"2f01184e-e22b-5df5-ae63-d93ebab69eaf\"\nSpecialFunctions = \"276daf66-3868-5448-9aa4-cd146d93841b\"\nStatistics = \"10745b16-79ce-11e8-11f9-7d13ad32a3b2\"\nTest = \"8dfed614-e22c-5e08-85e1-65c5234f0b40\"\n\n[targets]\ntest = [\"Aqua\", \"ExplicitImports\", \"SparseArrays\", \"SpecialFunctions\", \"Statistics\", \"Test\"]\n"
  },
  {
    "path": "README.md",
    "content": "# DistributedArrays\n\n*Distributed arrays for Julia.*\n\n| **Documentation**                                                         | **Build Status**                                              |\n|:-------------------------------------------------------------------------:|:-------------------------------------------------------------:|\n| [![][docs-stable-img]][docs-stable-url] [![][docs-dev-img]][docs-dev-url] | [![][travis-img]][travis-url] [![][codecov-img]][codecov-url] |\n\n## Introduction\n\n`DistributedArrays.jl` uses the stdlib [`Distributed`][distributed-docs] to implement a *Global Array* interface.\nA `DArray` is distributed across a set of workers. Each worker can read and write from its local portion of the array and each worker has read-only access to the portions of the array held by other workers.\n\n## Installation\n\nThe package can be installed with the Julia package manager.\nFrom the Julia REPL, type `]` to enter the Pkg REPL mode and run:\n\n```\npkg> add DistributedArrays\n```\n\nOr, equivalently, via the `Pkg` API:\n\n```julia\njulia> import Pkg; Pkg.add(\"DistributedArrays\")\n```\n\n## Documentation\n\n- [**STABLE**][docs-stable-url] &mdash; **documentation of the most recently tagged version.**\n- [**DEVEL**][docs-dev-url] &mdash; *documentation of the in-development version.*\n\n## Project Status\n\nThe package is tested against\nJulia 1.10.0 (oldest supported Julia version),\nthe Julia LTS version,\nthe latest stable release of Julia,\nand the pre-release version of Julia.\n\n## Questions and Contributions\n\nUsage questions can be posted on the [Julia Discourse forum][discourse-tag-url] under the `Parallel/Distributed` category, in the #parallel channel of the [Julia Slack](https://julialang.org/community/).\n\nContributions are very welcome, as are feature requests and suggestions. Please open an [issue][issues-url] if you encounter any problems. In particular additions to documentation are encouraged!\n\n[contrib-url]: https://juliadocs.github.io/Documenter.jl/latest/man/contributing/\n[discourse-tag-url]: https://discourse.julialang.org/c/domain/parallel\n\n[docs-dev-img]: https://img.shields.io/badge/docs-dev-blue.svg\n[docs-dev-url]: https://juliaparallel.github.io/DistributedArrays.jl/dev\n\n[docs-stable-img]: https://img.shields.io/badge/docs-stable-blue.svg\n[docs-stable-url]: https://juliaparallel.github.io/DistributedArrays.jl/stable\n\n[travis-img]: https://travis-ci.org/JuliaParallel/DistributedArrays.jl.svg?branch=master\n[travis-url]: https://travis-ci.org/JuliaParallel/DistributedArrays.jl\n\n[codecov-img]: https://codecov.io/gh/JuliaParallel/DistributedArrays.jl/branch/master/graph/badge.svg\n[codecov-url]: https://codecov.io/gh/JuliaParallel/DistributedArrays.jl\n\n[issues-url]: https://github.com/JuliaParallel/DistributedArrays.jl/issues\n[distributed-docs]: https://docs.julialang.org/en/v1/manual/parallel-computing/#Multi-Core-or-Distributed-Processing-1\n"
  },
  {
    "path": "codecov.yml",
    "content": " comment: off\n"
  },
  {
    "path": "docs/.gitignore",
    "content": "build/\n"
  },
  {
    "path": "docs/Project.toml",
    "content": "[deps]\nDistributedArrays = \"aaf54ef3-cdf8-58ed-94cc-d582ad619b94\"\nDocumenter = \"e30172f5-a6a5-5a46-863b-614d45cd2de4\"\n\n[compat]\nDistributedArrays = \"0.6\"\nDocumenter = \"1\"\n\n[sources.DistributedArrays]\npath = \"..\"\n"
  },
  {
    "path": "docs/make.jl",
    "content": "using Documenter, DistributedArrays\n\nmakedocs(\n    modules = [DistributedArrays],\n    format = Documenter.HTML(),\n    sitename = \"DistributedArrays.jl\",\n    pages = [\n        \"Introduction\" => \"index.md\"\n        \"API\" => \"api.md\"\n    ],\n    doctest = true\n)\n\ndeploydocs(\n    repo = \"github.com/JuliaParallel/DistributedArrays.jl.git\",\n)\n"
  },
  {
    "path": "docs/src/api.md",
    "content": "# API\n\n```@autodocs\nModules = [DistributedArrays]\n```\n"
  },
  {
    "path": "docs/src/index.md",
    "content": "# DistributedArrays.jl\n\n```@contents\n```\n\nDistributed Arrays\n------------------\n\nLarge computations are often organized around large arrays of data. In these\ncases, a particularly natural way to obtain parallelism is to distribute arrays\namong several processes. This combines the memory resources of multiple\nmachines, allowing use of arrays too large to fit on one machine. Each process\ncan read and write to the part of the array it owns and has read-only access to\nthe parts it doesn't own. This provides a ready answer to the question of how a\nprogram should be divided among machines.\n\nJulia distributed arrays are implemented by the `DArray` type. A\n`DArray` has an element type and dimensions just like an `Array`.\nA `DArray` can also use arbitrary array-like types to represent the local\nchunks that store actual data. The data in a `DArray` is distributed by\ndividing the index space into some number of blocks in each dimension.\n\nCommon kinds of arrays can be constructed with functions beginning with\n`d`:\n\n```julia\ndzeros(100,100,10)\ndones(100,100,10)\ndrand(100,100,10)\ndrandn(100,100,10)\ndfill(x,100,100,10)\n```\n\nIn the last case, each element will be initialized to the specified\nvalue `x`. These functions automatically pick a distribution for you.\nFor more control, you can specify which processes to use, and how the\ndata should be distributed:\n\n```julia\ndzeros((100,100), workers()[1:4], [1,4])\n```\n\nThe second argument specifies that the array should be created on the first\nfour workers. When dividing data among a large number of processes,\none often sees diminishing returns in performance. Placing `DArray`s\non a subset of processes allows multiple `DArray` computations to\nhappen at once, with a higher ratio of work to communication on each\nprocess.\n\nThe third argument specifies a distribution; the nth element of\nthis array specifies how many pieces dimension n should be divided into.\nIn this example the first dimension will not be divided, and the second\ndimension will be divided into 4 pieces. Therefore each local chunk will be\nof size `(100,25)`. Note that the product of the distribution array must\nequal the number of processes.\n\n* `distribute(a::Array)` converts a local array to a distributed array.\n\n* `localpart(d::DArray)` obtains the locally-stored portion\n  of a  `DArray`.\n\n* Localparts can be retrieved and set via the indexing syntax too.\n  Indexing via symbols is used for this, specifically symbols `:L`,`:LP`,`:l`,`:lp` which\n  are all equivalent. For example, `d[:L]` returns the localpart of `d`\n  while `d[:L]=v` sets `v` as the localpart of `d`.\n\n* `localindices(a::DArray)` gives a tuple of the index ranges owned by the\n  local process.\n\n* `convert(Array, a::DArray)` brings all the data to the local process.\n\nIndexing a `DArray` (square brackets) with ranges of indices always\ncreates a `SubArray`, not copying any data.\n\n\nConstructing Distributed Arrays\n-------------------------------\n\nThe primitive `DArray` constructor has the following somewhat elaborate signature:\n\n```julia\nDArray(init, dims[, procs, dist])\n```\n\n`init` is a function that accepts a tuple of index ranges. This function should\nallocate a local chunk of the distributed array and initialize it for the specified\nindices. 
\nConstructing Distributed Arrays\n-------------------------------\n\nThe primitive `DArray` constructor has the following somewhat elaborate signature:\n\n```julia\nDArray(init, dims[, procs, dist])\n```\n\n`init` is a function that accepts a tuple of index ranges. This function should\nallocate a local chunk of the distributed array and initialize it for the specified\nindices. `dims` is the overall size of the distributed array.\n`procs` optionally specifies a vector of process IDs to use.\n`dist` is an integer vector specifying how many chunks the\ndistributed array should be divided into in each dimension.\n\nThe last two arguments are optional, and defaults will be used if they\nare omitted.\n\nAs an example, here is how to turn the local array constructor `fill`\ninto a distributed array constructor:\n\n```julia\ndfill(v, args...) = DArray(I->fill(v, map(length,I)), args...)\n```\n\nIn this case the `init` function only needs to call `fill` with the\ndimensions of the local piece it is creating.\n\n`DArray`s can also be constructed from multidimensional `Array` comprehensions with\nthe `@DArray` macro syntax. This syntax is just sugar for the primitive `DArray` constructor:\n\n```julia\njulia> [i+j for i = 1:5, j = 1:5]\n5×5 Matrix{Int64}:\n 2  3  4  5   6\n 3  4  5  6   7\n 4  5  6  7   8\n 5  6  7  8   9\n 6  7  8  9  10\n\njulia> @DArray [i+j for i = 1:5, j = 1:5]\n5×5 DArray{Int64, 2, Matrix{Int64}}:\n 2  3  4  5   6\n 3  4  5  6   7\n 4  5  6  7   8\n 5  6  7  8   9\n 6  7  8  9  10\n```\n\n### Construction from arrays generated on separate processes\n`DArray`s can also be constructed from arrays that have been constructed on separate processes, as demonstrated below:\n```julia\nras = [@spawnat p rand(30,30) for p in workers()[1:4]]\nras = reshape(ras,(2,2))\nD   = DArray(ras)\n```\nAn alternative syntax is:\n```julia\nr1 = DistributedArrays.remotecall(() -> rand(10,10), workers()[1])\nr2 = DistributedArrays.remotecall(() -> rand(10,10), workers()[2])\nr3 = DistributedArrays.remotecall(() -> rand(10,10), workers()[3])\nr4 = DistributedArrays.remotecall(() -> rand(10,10), workers()[4])\nD  = DArray(reshape([r1 r2 r3 r4], (2,2)))\n```\nThe distribution of indices across workers can be checked with\n```julia\n[@fetchfrom p localindices(D) for p in workers()]\n```\n\n\n\nDistributed Array Operations\n----------------------------\n\nAt this time, distributed arrays do not have much functionality. Their\nmajor utility is allowing communication to be done via array indexing, which\nis convenient for many problems. As an example, consider implementing the\n\"life\" cellular automaton, where each cell in a grid is updated according\nto its neighboring cells. To compute a chunk of the result of one iteration,\neach process needs the immediate neighbor cells of its local chunk. The\nfollowing code accomplishes this:\n\n```julia\nfunction life_step(d::DArray)\n    DArray(size(d),procs(d)) do I\n        top   = mod(first(I[1])-2,size(d,1))+1\n        bot   = mod( last(I[1])  ,size(d,1))+1\n        left  = mod(first(I[2])-2,size(d,2))+1\n        right = mod( last(I[2])  ,size(d,2))+1\n\n        old = Array{Bool}(undef, length(I[1])+2, length(I[2])+2)\n        old[1      , 1      ] = d[top , left]   # left side\n        old[2:end-1, 1      ] = d[I[1], left]\n        old[end    , 1      ] = d[bot , left]\n        old[1      , 2:end-1] = d[top , I[2]]\n        old[2:end-1, 2:end-1] = d[I[1], I[2]]   # middle\n        old[end    , 2:end-1] = d[bot , I[2]]\n        old[1      , end    ] = d[top , right]  # right side\n        old[2:end-1, end    ] = d[I[1], right]\n        old[end    , end    ] = d[bot , right]\n\n        life_rule(old)\n    end\nend\n```\n\nAs you can see, we use a series of indexing expressions to fetch\ndata into a local array `old`. 
Note that the `do` block syntax is\nconvenient for passing `init` functions to the `DArray` constructor.\nNext, the serial function `life_rule` is called to apply the update rules\nto the data, yielding the needed `DArray` chunk. Nothing about `life_rule`\nis `DArray`-specific, but we list it here for completeness:\n\n```julia\nfunction life_rule(old)\n    m, n = size(old)\n    new = similar(old, m-2, n-2)\n    for j = 2:n-1\n        for i = 2:m-1\n            nc = +(old[i-1,j-1], old[i-1,j], old[i-1,j+1],\n                   old[i  ,j-1],             old[i  ,j+1],\n                   old[i+1,j-1], old[i+1,j], old[i+1,j+1])\n            new[i-1,j-1] = (nc == 3 || nc == 2 && old[i,j])\n        end\n    end\n    new\nend\n```\n\n\n\nNumerical Results of Distributed Computations\n---------------------------------------------\n\nFloating point arithmetic is not associative, and this comes up\nwhen performing distributed computations over `DArray`s. All `DArray`\noperations are performed over the `localpart` chunks and then aggregated.\nThe change in ordering of the operations will change the numeric result as\nseen in this simple example:\n\n```julia\njulia> addprocs(8);\n\njulia> using DistributedArrays\n\njulia> A = fill(1.1, (100,100));\n\njulia> sum(A)\n11000.000000000013\n\njulia> DA = distribute(A);\n\njulia> sum(DA)\n11000.000000000127\n\njulia> sum(A) == sum(DA)\nfalse\n```\n\nThe ultimate ordering of operations will be dependent on how the `Array` is distributed.\n\n\n\nGarbage Collection and `DArray`s\n------------------------------\n\nWhen a `DArray` is constructed (typically on the master process), the returned `DArray` object stores information on how the\narray is distributed, which processor holds which indices, and so on. When the `DArray` object\non the master process is garbage collected, all participating workers are notified and\nthe localparts of the `DArray` are freed on each worker.\n\nSince the size of the `DArray` object itself is small, a problem arises as `gc` on the master faces no memory pressure to\ncollect the `DArray` immediately. This results in a delay of the memory being released on the participating workers.\n\nTherefore, it is highly recommended to explicitly call `close(d::DArray)` as soon as user code\nhas finished working with the distributed array.\n\nIt is also important to note that the localparts of the `DArray` are freed on all participating workers\nwhen the `DArray` object on the process that created it is collected. It is therefore important to maintain\na reference to a `DArray` object on the creating process for as long as it is being computed upon.\n\n`d_closeall()` is another useful function to manage distributed memory. It releases all `DArrays` created from\nthe calling process, including any temporaries created during computation.\n\n\n\nWorking with distributed non-array data\n---------------------------------------\n\nThe function `ddata(;T::Type=Any, init::Function=I->nothing, pids=workers(), data::Vector=[])` can be used\nto create a distributed vector whose localparts need not be Arrays.\n\nIt returns a `DArray{T,1,T}`, i.e., the element type and localtype of the array are the same.\n\n`ddata()` constructs a distributed vector of length `nworkers()` where each localpart can hold any value,\ninitially initialized to `nothing`.\n\nThe argument `data`, if supplied, is distributed over the `pids`. `length(data)` must be a multiple of `length(pids)`.\nIf the multiple is 1, this returns a `DArray{T,1,T}` where `T` is `eltype(data)`. If the multiple is greater than 1,\nthis returns a `DArray{T,1,Array{T,1}}`, i.e., it is equivalent to calling `distribute(data)`.\n\n`gather(d::DArray{T,1,T}) where T` returns an `Array{T,1}` consisting of all distributed elements of `d`.\n\nGiven a `DArray{T,1,T}` object `d`, `d[:L]` returns the localpart on a worker. `d[i]` returns the `localpart`\non the ith worker that `d` is distributed over.\n\n
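As a minimal sketch (a hypothetical session, assuming exactly two workers have been added with `addprocs(2)`):\n\n```julia\nusing Distributed\naddprocs(2)\n@everywhere using DistributedArrays\n\nd = ddata(data=[10, 20])  # one element per worker, so eltype and localtype are both Int\ngather(d)                 # [10, 20]\n```\n\n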
\nSPMD Mode (an MPI-style SPMD mode with MPI-like primitives)\n------------------------------------------------------------\nSPMD, i.e., a Single Program Multiple Data mode, is implemented by the submodule `DistributedArrays.SPMD`. In this mode the same function is executed in parallel on all participating nodes. This is a typical style of MPI programs where the same program is executed on all processors. A basic subset of MPI-like primitives is currently supported. As a programming model it should be familiar to folks with an MPI background.\n\nThe same block of code is executed concurrently on all workers using the `spmd` function.\n\n```julia\n# define foo() on all workers\n@everywhere function foo(arg1, arg2)\n    ....\nend\n\n# call foo() everywhere using the `spmd` function\nd_in=DArray(.....)\nd_out=ddata()\nspmd(foo,d_in,d_out; pids=workers()) # executes on all workers\n```\n\n`spmd` is defined as `spmd(f, args...; pids=procs(), context=nothing)`.\n\n`args` is one or more arguments to be passed to `f`. `pids` identifies the workers\nthat `f` needs to be run on. `context` identifies a run context, which is explained\nlater.\n\nThe following primitives can be used in SPMD mode.\n\n- `sendto(pid, data; tag=nothing)` - sends `data` to `pid`\n\n- `recvfrom(pid; tag=nothing)` - receives data from `pid`\n\n- `recvfrom_any(; tag=nothing)` - receives data from any `pid`\n\n- `barrier(;pids=procs(), tag=nothing)` - all tasks wait and then proceed\n\n- `bcast(data, pid; tag=nothing, pids=procs())` - broadcasts the same data over `pids` from `pid`\n\n- `scatter(x, pid; tag=nothing, pids=procs())` - distributes `x` over `pids` from `pid`\n\n- `gather(x, pid; tag=nothing, pids=procs())` - collects data from `pids` onto worker `pid`\n\nTag `tag` should be used to differentiate between consecutive calls of the same type, for example,\nconsecutive `bcast` calls.\n\n`spmd` and SPMD-related functions are defined in the submodule `DistributedArrays.SPMD`. You will need to\nimport it explicitly, or prefix functions that can only be used in SPMD mode with `SPMD.`, for example,\n`SPMD.sendto`.\n\n\n\nExample\n-------\n\nIn this toy example each worker exchanges data with its neighbors `n` times.\n\n```julia\nusing Distributed\nusing DistributedArrays\naddprocs(8)\n@everywhere using DistributedArrays\n@everywhere using DistributedArrays.SPMD\n\nd_in=d=DArray(I->fill(myid(), (map(length,I)...,)), (nworkers(), 2), workers(), [nworkers(),1])\nd_out=ddata();\n\n# define the function everywhere\n@everywhere function foo_spmd(d_in, d_out, n)\n    pids = sort(vec(procs(d_in)))\n    pididx = findfirst(isequal(myid()), pids)\n    mylp = d_in[:L]\n    localsum = 0\n\n    # Have each worker exchange data with its neighbors\n    n_pididx = pididx+1 > length(pids) ? 1 : pididx+1\n    p_pididx = pididx-1 < 1 ? 
length(pids) : pididx-1\n\n    for i in 1:n\n        sendto(pids[n_pididx], mylp[2])\n        sendto(pids[p_pididx], mylp[1])\n\n        mylp[2] = recvfrom(pids[p_pididx])\n        mylp[1] = recvfrom(pids[n_pididx])\n\n        barrier(;pids=pids)\n        localsum = localsum + mylp[1] + mylp[2]\n    end\n\n    # finally store the sum in d_out\n    d_out[:L] = localsum\nend\n\n# run foo_spmd on all workers\nspmd(foo_spmd, d_in, d_out, 10, pids=workers())\n\n# print values of d_in and d_out after the run\nprintln(d_in)\nprintln(d_out)\n```\n\n\n\nSPMD Context\n------------\n\nEach SPMD run is implicitly executed in a different context. This allows for multiple `spmd` calls to\nbe active at the same time. An SPMD context can be explicitly specified via the keyword argument `context` to `spmd`.\n\n`context(pids=procs())` returns a new SPMD context.\n\nAn SPMD context also provides context-local storage, a dict, which can be used to store\nkey-value pairs between spmd runs under the same context.\n\n`context_local_storage()` returns the dictionary associated with the context.\n\nNOTE: Implicitly defined contexts, i.e., `spmd` calls without a `context` specified, create a context\nwhich lives only for the duration of the call. Explicitly created context objects can be released\nearly by calling `close(ctxt::SPMDContext)`. This will release the local storage dictionaries\non all participating `pids`. Otherwise they will be released when the context object is gc'ed\non the node that created it.\n\n\n\nNested `spmd` calls\n-------------------\nAs `spmd` executes the specified function on all participating nodes, we need to be careful with nesting `spmd` calls.\n\nAn example of an unsafe (wrong) way:\n```julia\nfunction foo(.....)\n    ......\n    spmd(bar, ......)\n    ......\nend\n\nfunction bar(....)\n    ......\n    spmd(baz, ......)\n    ......\nend\n\nspmd(foo,....)\n```\nIn the above example, `foo`, `bar` and `baz` are all functions wishing to leverage distributed computation. However, they themselves may currently be part of a `spmd` call. A safe way to handle such a scenario is to only drive parallel computation from the master process.\n\nThe correct way (only have the driver process initiate `spmd` calls):\n```julia\nfunction foo()\n    ......\n    myid()==1 && spmd(bar, ......)\n    ......\nend\n\nfunction bar()\n    ......\n    myid()==1 && spmd(baz, ......)\n    ......\nend\n\nspmd(foo,....)\n```\n\nThis is also true of functions which automatically distribute computation on DArrays.\n```julia\nfunction foo(d::DArray)\n    ......\n    myid()==1 && map!(bar, d)\n    ......\nend\nspmd(foo,....)\n```\nWithout the `myid()` check, the `spmd` call to `foo` would execute `map!` from all nodes, which is probably not what we want.\n\nSimilarly, `@everywhere` from within an SPMD run should also be driven from the master node only.\n"
  },
  {
    "path": "ext/SparseArraysExt.jl",
    "content": "module SparseArraysExt\n\nusing DistributedArrays: DArray, SubDArray, SubOrDArray, localpart\nusing DistributedArrays.Distributed: remotecall_fetch\nusing SparseArrays: SparseArrays, nnz\n\nfunction SparseArrays.nnz(A::DArray)\n    B = asyncmap(A.pids) do p\n        remotecall_fetch(nnz∘localpart, p, A)\n    end\n    return reduce(+, B)\nend\n\n# Fix method ambiguities\n# TODO: Improve efficiency?\nBase.copyto!(dest::SubOrDArray{<:Any,2}, src::SparseArrays.AbstractSparseMatrixCSC) = copyto!(dest, Matrix(src))\n@static if isdefined(SparseArrays, :CHOLMOD)\n    Base.copyto!(dest::SubOrDArray, src::SparseArrays.CHOLMOD.Dense) = copyto!(dest, Array(src))\n    Base.copyto!(dest::SubOrDArray{T}, src::SparseArrays.CHOLMOD.Dense{T}) where {T<:Union{Float32,Float64,ComplexF32,ComplexF64}} = copyto!(dest, Array(src))\n    Base.copyto!(dest::SubOrDArray{T,2}, src::SparseArrays.CHOLMOD.Dense{T}) where {T<:Union{Float32,Float64,ComplexF32,ComplexF64}} = copyto!(dest, Array(src))\nend\n\n# Fix method ambiguities\nfor T in (:DArray, :SubDArray)\n    @eval begin\n        Base.:(==)(d1::$T{<:Any,1}, d2::SparseArrays.ReadOnly) = d1 == parent(d2)\n        Base.:(==)(d1::SparseArrays.ReadOnly, d2::$T{<:Any,1}) = parent(d1) == d2\n    end\nend\n\nend\n"
  },
  {
    "path": "ext/StatisticsExt.jl",
    "content": "module StatisticsExt\n\nusing DistributedArrays: DArray\nusing Statistics: Statistics\n\nStatistics._mean(f, A::DArray, region) = sum(f, A, dims = region) ./ prod((size(A, i) for i in region))\n\nend\n"
  },
  {
    "path": "src/DistributedArrays.jl",
    "content": "module DistributedArrays\n\nusing Base: Callable\nusing Base.Broadcast: BroadcastStyle, Broadcasted\n\nusing Distributed: Distributed, RemoteChannel, Future, myid, nworkers, procs, remotecall, remotecall_fetch, remotecall_wait, worker_id_from_socket, workers\nusing LinearAlgebra: LinearAlgebra, Adjoint, Diagonal, I, Transpose, adjoint, adjoint!, axpy!, dot, lmul!, mul!, norm, rmul!, transpose, transpose!\nusing Random: Random, rand!\nusing Serialization: Serialization, AbstractSerializer, deserialize, serialize\n\nusing Primes: factor\n\n# DArray exports\nexport DArray, SubDArray, SubOrDArray, @DArray\nexport dzeros, dones, dfill, drand, drandn, distribute, localpart, localindices, ppeval\n\n# non-array distributed data\nexport ddata, gather\n\n# immediate release of localparts\nexport d_closeall\n\ninclude(\"darray.jl\")\ninclude(\"core.jl\")\ninclude(\"serialize.jl\")\ninclude(\"broadcast.jl\")\ninclude(\"mapreduce.jl\")\ninclude(\"linalg.jl\")\ninclude(\"sort.jl\")\n\ninclude(\"spmd.jl\")\nexport SPMD\n\nend # module\n"
  },
  {
    "path": "src/broadcast.jl",
    "content": "###\n# Distributed broadcast implementation\n##\n\n# We define a custom ArrayStyle here since we need to keep track of\n# the fact that it is Distributed and what kind of underlying broadcast behaviour\n# we will encounter.\nstruct DArrayStyle{Style <: Union{Nothing,BroadcastStyle}} <: Broadcast.AbstractArrayStyle{Any} end\nDArrayStyle(::S) where {S} = DArrayStyle{S}()\nDArrayStyle(::S, ::Val{N}) where {S,N} = DArrayStyle(S(Val(N)))\nDArrayStyle(::Val{N}) where N = DArrayStyle{Broadcast.DefaultArrayStyle{N}}()\n\nBroadcast.BroadcastStyle(::Type{<:DArray{<:Any, N, A}}) where {N, A} = DArrayStyle(BroadcastStyle(A), Val(N))\n\n# promotion rules\n# TODO: test this\nfunction Broadcast.BroadcastStyle(::DArrayStyle{AStyle}, ::DArrayStyle{BStyle}) where {AStyle, BStyle}\n    DArrayStyle(BroadcastStyle(AStyle, BStyle))\nend\n\nfunction Broadcast.broadcasted(::DArrayStyle{Style}, f, args...) where Style\n    inner = Broadcast.broadcasted(Style(), f, args...)\n    if inner isa Broadcasted\n        return Broadcasted{DArrayStyle{Style}}(inner.f, inner.args, inner.axes)\n    else # eagerly evaluated\n        return inner\n    end\nend\n\n# # deal with one layer deep lazy arrays\n# BroadcastStyle(::Type{<:LinearAlgebra.Transpose{<:Any,T}}) where T <: DArray = BroadcastStyle(T)\n# BroadcastStyle(::Type{<:LinearAlgebra.Adjoint{<:Any,T}}) where T <: DArray = BroadcastStyle(T)\n# BroadcastStyle(::Type{<:SubArray{<:Any,<:Any,<:T}}) where T <: DArray = BroadcastStyle(T)\n\n# # This Union is a hack. Ideally Base would have a Transpose <: WrappedArray <: AbstractArray\n# # and we could define our methods in terms of Union{DArray, WrappedArray{<:Any, <:DArray}}\n# const DDestArray = Union{DArray,\n#                          LinearAlgebra.Transpose{<:Any,<:DArray},\n#                          LinearAlgebra.Adjoint{<:Any,<:DArray},\n#                          SubArray{<:Any, <:Any, <:DArray}}\nconst DDestArray = DArray\n\n# This method is responsible for selection the output type of broadcast\nfunction Base.similar(bc::Broadcasted{<:DArrayStyle{Style}}, ::Type{ElType}) where {Style, ElType}\n    DArray(map(length, axes(bc))) do I \n        # create fake Broadcasted for underlying ArrayStyle\n        bc′ = Broadcasted{Style}(identity, (), map(length, I))\n        similar(bc′, ElType)\n    end\nend\n\n##\n# Ref https://docs.julialang.org/en/v1/manual/interfaces/#extending-in-place-broadcast-2\n#\n# We purposefully only specialise `copyto!`,\n# Broadcast implementation that defers to the underlying BroadcastStyle. 
We can't\n# assume that `getindex` is fast, and furthermore we can't assume that the distribution of\n# DArray across workers is equal or that the underlying array type is consistent.\n#\n# Implementation:\n#   - first distribute all arguments\n#     - Q: How to decide on the cuts\n#   - then localise arguments on each node\n##\n@inline function Base.copyto!(dest::DDestArray, bc::Broadcasted{Nothing})\n    axes(dest) == axes(bc) || Broadcast.throwdm(axes(dest), axes(bc))\n\n    # Distribute Broadcasted\n    # This will turn local AbstractArrays into DArrays\n    dbc = bcdistribute(bc)\n\n    @sync for p in procs(dest)\n        @async remotecall_wait(p) do\n            # get the indices for the localpart\n            lpidx = localpartindex(dest)\n            @assert lpidx != 0\n            # create a local version of the broadcast, by constructing views\n            # Note: creates copies of the argument\n            lbc = bclocal(dbc, dest.indices[lpidx])\n            copyto!(localpart(dest), lbc)\n        end\n    end\n\n    return dest\nend\n\n# Test\n# a = Array\n# a .= DArray(x,y)\n\n@inline function Base.copy(bc::Broadcasted{<:DArrayStyle})\n    dbc = bcdistribute(bc)\n    # TODO: teach DArray about axes since this is wrong for OffsetArrays\n    DArray(map(length, axes(bc))) do I\n        lbc = bclocal(dbc, I)\n        copy(lbc)\n    end\nend\n\n# _bcview takes the shape of a view and the shape of a broadcasted argument,\n# and produces the view over that argument that constitutes part of the broadcast;\n# it is in a sense the inverse of _bcs in Base.Broadcast\n_bcview(::Tuple{}, ::Tuple{}) = ()\n_bcview(::Tuple{}, view::Tuple) = ()\n_bcview(shape::Tuple, ::Tuple{}) = (shape[1], _bcview(tail(shape), ())...)\nfunction _bcview(shape::Tuple, view::Tuple)\n    return (_bcview1(shape[1], view[1]), _bcview(tail(shape), tail(view))...)\nend\n\n# _bcview1 handles the logic for a single dimension\nfunction _bcview1(a, b)\n    if a == 1 || a == 1:1\n        return 1:1\n    elseif first(a) <= first(b) <= last(a) &&\n           first(a) <= last(b)  <= last(a)\n        return b\n    else\n        throw(DimensionMismatch(\"broadcast view could not be constructed\"))\n    end\nend\n\n# Distribute broadcast\n# TODO: How to decide on cuts\n@inline bcdistribute(bc::Broadcasted{Style}) where Style<:Union{Nothing,BroadcastStyle} = Broadcasted{DArrayStyle{Style}}(bc.f, bcdistribute_args(bc.args), bc.axes)\n@inline bcdistribute(bc::Broadcasted{Style}) where Style<:DArrayStyle = Broadcasted{Style}(bc.f, bcdistribute_args(bc.args), bc.axes)\n\n# ask BroadcastStyle to decide if argument is in need of being distributed\nbcdistribute(x::T) where T = _bcdistribute(BroadcastStyle(T), x)\n_bcdistribute(::DArrayStyle, x) = x\n# Don't bother distributing singletons\n_bcdistribute(::Broadcast.AbstractArrayStyle{0}, x) = x\n_bcdistribute(::Broadcast.AbstractArrayStyle, x) = distribute(x)\n_bcdistribute(::Any, x) = x\n\n@inline bcdistribute_args(args::Tuple) = (bcdistribute(args[1]), bcdistribute_args(tail(args))...)\nbcdistribute_args(args::Tuple{Any}) = (bcdistribute(args[1]),)\nbcdistribute_args(args::Tuple{}) = ()\n\n# dropping axes here since recomputing is easier\n@inline bclocal(bc::Broadcasted{DArrayStyle{Style}}, idxs) where Style<:Union{Nothing,BroadcastStyle} = Broadcasted{Style}(bc.f, bclocal_args(_bcview(axes(bc), idxs), bc.args))\n\n# bclocal will take a view of the data and then copy it over,\n# except when the data already is local\nfunction bclocal(x::DArray{T, N, AT}, idxs) where {T, N, AT}\n    
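# Restrict the broadcast axes to the indices `idxs` owned by this chunk, then\n    # materialise the needed part of `x` locally (`makelocal` may return a view when\n    # the data is already local).\n    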
bcidxs = _bcview(axes(x), idxs)\n    makelocal(x, bcidxs...)\nend\nbclocal(x, idxs) = x\n\n@inline bclocal_args(idxs, args::Tuple) = (bclocal(args[1], idxs), bclocal_args(idxs, tail(args))...)\nbclocal_args(idxs, args::Tuple{Any}) = (bclocal(args[1], idxs),)\nbclocal_args(idxs, args::Tuple{}) = ()\n"
  },
  {
    "path": "src/core.jl",
    "content": "# Thread-safe registry of DArray references\nstruct DArrayRegistry\n    data::Dict{Tuple{Int,Int}, Any}\n    lock::ReentrantLock\n    DArrayRegistry() = new(Dict{Tuple{Int,Int}, Any}(), ReentrantLock())\nend\nconst REGISTRY = DArrayRegistry()\n\nfunction Base.get(r::DArrayRegistry, id::Tuple{Int,Int}, default)\n    @lock r.lock begin\n        return get(r.data, id, default)\n    end\nend\nfunction Base.getindex(r::DArrayRegistry, id::Tuple{Int,Int})\n    @lock r.lock begin\n        return r.data[id]\n    end\nend\nfunction Base.setindex!(r::DArrayRegistry, val, id::Tuple{Int,Int})\n    @lock r.lock begin\n        r.data[id] = val\n    end\n    return r\nend\nfunction Base.delete!(r::DArrayRegistry, id::Tuple{Int,Int})\n    @lock r.lock delete!(r.data, id)\n    return r\nend\n\n# Thread-safe set of IDs of DArrays created on this node\nstruct DArrayRefs\n    data::Set{Tuple{Int,Int}}\n    lock::ReentrantLock\n    DArrayRefs() = new(Set{Tuple{Int,Int}}(), ReentrantLock())\nend\nconst REFS = DArrayRefs()\n\nfunction Base.push!(r::DArrayRefs, id::Tuple{Int,Int})\n    # Ensure id refers to a DArray created on this node\n    if first(id) != myid()\n        throw(\n            ArgumentError(\n                lazy\"`DArray` is not created on the current worker: Only `DArray`s created on worker $(myid()) can be stored in this set but the `DArray` was created on worker $(first(id)).\"))\n    end\n    @lock r.lock begin\n        return push!(r.data, id)\n    end\nend\nfunction Base.delete!(r::DArrayRefs, id::Tuple{Int,Int})\n    @lock r.lock delete!(r.data, id)\n    return r\nend\n\n# Global counter to generate a unique ID for each DArray\nconst DID = Threads.Atomic{Int}(1)\n\n\"\"\"\n    next_did()\n\nIncrement a global counter and return a tuple of the current worker ID and the incremented\nvalue of the counter.\n\nThis tuple is used as a unique ID for a new `DArray`.\n\"\"\"\nnext_did() = (myid(), Threads.atomic_add!(DID, 1))\n\nrelease_localpart(id::Tuple{Int,Int}) = (delete!(REGISTRY, id); nothing)\nfunction release_allparts(id::Tuple{Int,Int}, pids::Array{Int})\n    @sync begin\n        released_myid = false\n        for p in pids\n            if p == myid()\n                @async release_localpart(id)\n                released_myid = true\n            else\n                @async remotecall_fetch(release_localpart, p, id)\n            end\n        end\n        if !released_myid\n            @async release_localpart(id)\n        end\n    end\n    return nothing\nend\n\nfunction close_by_id(id::Tuple{Int,Int}, pids::Array{Int})\n    release_allparts(id, pids)\n    delete!(REFS, id)\n    nothing\nend\n\nfunction d_closeall()\n    @lock REFS.lock begin\n        while !isempty(REFS.data)\n            id = pop!(REFS.data)\n            d = d_from_weakref_or_d(id)\n            if d isa DArray\n                finalize(d)\n            end\n        end\n    end\n    return nothing\nend\n\nBase.close(d::DArray) = finalize(d)\n\n\"\"\"\n    procs(d::DArray)\n\nGet the vector of processes storing pieces of DArray `d`.\n\"\"\"\nDistributed.procs(d::DArray)    = d.pids\nDistributed.procs(d::SubDArray) = procs(parent(d))\n\n\"\"\"\n    localpart(A)\n\nThe identity when input is not distributed\n\"\"\"\nlocalpart(A) = A\n"
  },
  {
    "path": "src/darray.jl",
    "content": "\"\"\"\n    DArray(init, dims, [procs, dist])\n\nConstruct a distributed array.\n\nThe parameter `init` is a function that accepts a tuple of index ranges.\nThis function should allocate a local chunk of the distributed array and initialize it for the specified indices.\n\n`dims` is the overall size of the distributed array.\n\n`procs` optionally specifies a vector of process IDs to use.\nIf unspecified, the array is distributed over all worker processes only. Typically, when running in distributed mode,\ni.e., nprocs() > 1, this would mean that no chunk of the distributed array exists on the process hosting the\ninteractive julia prompt.\n\n`dist` is an integer vector specifying how many chunks the distributed array should be divided into in each dimension.\n\nFor example, the `dfill` function that creates a distributed array and fills it with a value `v` is implemented as:\n\n### Example\n```jl\ndfill(v, args...) = DArray(I->fill(v, map(length,I)), args...)\n```\n\"\"\"\nmutable struct DArray{T,N,A} <: AbstractArray{T,N}\n    id::Tuple{Int,Int}\n    dims::NTuple{N,Int}\n    pids::Array{Int,N}                          # pids[i]==p ⇒ processor p has piece i\n    indices::Array{NTuple{N,UnitRange{Int}},N}  # indices held by piece i\n    cuts::Vector{Vector{Int}}                   # cuts[d][i] = first index of chunk i in dimension d\n    localpart::Union{A,Nothing}\n\n    function DArray{T,N,A}(id::Tuple{Int,Int}, dims::NTuple{N,Int}, pids, indices, cuts, lp) where {T,N,A}\n        # check invariants\n        if dims != map(last, last(indices))\n            throw(ArgumentError(\"dimension of DArray (dim) and indices do not match\"))\n        end\n\n        d = d_from_weakref_or_d(id)\n        if d === nothing\n            d = new(id, dims, pids, indices, cuts, lp)\n        end\n\n        if first(id) == myid()\n            push!(REFS, id)\n            REGISTRY[id] = WeakRef(d)\n            finalizer(d) do d\n                @async close_by_id(d.id, d.pids)\n            end\n        end\n        d\n    end\n\n    DArray{T,N,A}() where {T,N,A} = new()\nend\n\nunpack_weakref(x) = x\nunpack_weakref(x::WeakRef) = x.value\nd_from_weakref_or_d(id::Tuple{Int,Int}) = unpack_weakref(get(REGISTRY, id, nothing))\n\nBase.eltype(::Type{DArray{T}}) where {T} = T\nempty_localpart(T,N,A) = A(Array{T}(undef, ntuple(zero, N)))\n\nconst SubDArray{T,N,D<:DArray} = SubArray{T,N,D}\nconst SubOrDArray{T,N} = Union{DArray{T,N}, SubDArray{T,N}}\n\nlocaltype(::Type{DArray{T,N,S}}) where {T,N,S} = S\nlocaltype(::Type{SubDArray{T,N,D}}) where {T,N,D} = localtype(D)\nlocaltype(A::SubOrDArray) = localtype(typeof(A))\nlocaltype(A::AbstractArray) = typeof(A)\n\nBase.hash(d::DArray, h::UInt) = Base.hash(d.id, h)\n\n## core constructors ##\n\nfunction DArray(id::Tuple{Int,Int}, init::I, dims, pids, idxs, cuts) where {I}\n    localtypes = Vector{DataType}(undef,length(pids))\n    if init isa Function\n        asyncmap!(localtypes, pids) do pid\n            return remotecall_fetch(construct_localparts, pid, init, id, dims, pids, idxs, cuts)\n        end\n    else\n        asyncmap!(localtypes, pids, init) do pid, pid_init\n            # constructing from an array of remote refs.\n            return remotecall_fetch(construct_localparts, pid, pid_init, id, dims, pids, idxs, cuts)\n        end\n    end\n\n    if !allequal(localtypes)\n        @sync for p in pids\n            @async remotecall_wait(release_localpart, p, id)\n        end\n        throw(ErrorException(lazy\"Constructed localparts have different 
`eltype`: $(localtypes)\"))\n    end\n    A = first(localtypes)\n\n    if myid() in pids\n        return unpack_weakref(REGISTRY[id])\n    else\n        T = eltype(A)\n        N = length(dims)\n        return DArray{T,N,A}(id, dims, pids, idxs, cuts, empty_localpart(T,N,A))\n    end\nend\n\nfunction construct_localparts(init, id, dims, pids, idxs, cuts; T=nothing, A=nothing)\n    localpart = isa(init, Function) ? init(idxs[localpartindex(pids)]) : fetch(init)\n    if A === nothing\n        A = typeof(localpart)\n    end\n    if T === nothing\n        T = eltype(A)\n    end\n    N = length(dims)\n    d = DArray{T,N,A}(id, dims, pids, idxs, cuts, localpart)\n    REGISTRY[id] = d\n    A\nend\n\nfunction ddata(;T::Type=Any, init::Function=I->nothing, pids=workers(), data::Vector=[])\n    pids=sort(vec(pids))\n    id = next_did()\n    npids = length(pids)\n    ldata = length(data)\n    idxs, cuts = chunk_idxs([npids], [npids])\n\n    if ldata > 0\n        @assert rem(ldata,npids) == 0\n        if ldata == npids\n            T = eltype(data)\n            s = DestinationSerializer(pididx->data[pididx], pids)\n            init = I->localpart(s)\n        else\n            # call the standard distribute function\n            return distribute(data)\n        end\n    end\n\n    @sync for p in pids\n        @async remotecall_wait(construct_localparts, p, init, id, (npids,), pids, idxs, cuts; T=T, A=T)\n    end\n\n    if myid() in pids\n        return unpack_weakref(REGISTRY[id])\n    else\n        return DArray{T,1,T}(id, (npids,), pids, idxs, cuts, nothing)\n    end\nend\n\nfunction gather(d::DArray{T,1,T}) where T\n    pids = procs(d)\n    a = Vector{T}(undef, length(pids))\n    asyncmap!(a, pids) do p\n        remotecall_fetch(localpart, p, d)\n    end\n    a\nend\n\nfunction DArray(init, dims, procs, dist)\n    np = prod(dist)\n    procs = reshape(procs[1:np], ntuple(i->dist[i], length(dist)))\n    idxs, cuts = chunk_idxs([dims...], dist)\n    id = next_did()\n\n    return DArray(id, init, dims, procs, idxs, cuts)\nend\n\nfunction DArray(init, dims, procs)\n    if isempty(procs)\n        throw(ArgumentError(\"no processors given\"))\n    end\n    return DArray(init, dims, procs, defaultdist(dims, procs))\nend\nDArray(init, dims) = DArray(init, dims, workers()[1:min(nworkers(), maximum(dims))])\n\n# Create a DArray from a collection of references\n# The refs must have the same layout as the parts distributed.\n# i.e.\n#    size(refs) must specify the distribution of dimensions across processors\n#    prod(size(refs)) must equal number of parts\n# FIXME : Empty parts are currently not supported.\nfunction DArray(refs)\n    dimdist = size(refs)\n    id = next_did()\n\n    nsizes = Array{Tuple}(undef, dimdist)\n    asyncmap!(nsizes, refs) do r\n        remotecall_fetch(sz_localpart_ref, r.where, r, id)\n    end\n\n    nindices = Array{NTuple{length(dimdist),UnitRange{Int}}}(undef, dimdist...)\n\n    for i in 1:length(nindices)\n        subidx = CartesianIndices(dimdist)[i]\n        nindices[i] = ntuple(length(subidx)) do x\n            idx_in_dim = subidx[x]\n            startidx = 1\n            for j in 1:(idx_in_dim-1)\n                prevsubidx = ntuple(y -> y == x ? 
j : subidx[y], length(subidx))\n                prevsize = nsizes[prevsubidx...]\n                startidx += prevsize[x]\n            end\n            startidx:startidx+(nsizes[i][x])-1\n        end\n    end\n\n    lastidxs = hcat([Int[last(idx_in_d)+1 for idx_in_d in idx] for idx in nindices]...)\n    ncuts = Array{Int,1}[pushfirst!(sort(unique(lastidxs[x,:])), 1) for x in 1:length(dimdist)]\n    ndims = tuple([sort(unique(lastidxs[x,:]))[end]-1 for x in 1:length(dimdist)]...)\n\n    DArray(id, refs, ndims, map(r -> r.where, refs), nindices, ncuts)\nend\n\nmacro DArray(ex0::Expr)\n    if ex0.head !== :comprehension\n        throw(ArgumentError(\"invalid @DArray syntax\"))\n    end\n    ex = ex0.args[1]\n    if ex.head !== :generator\n        throw(ArgumentError(\"invalid @DArray syntax\"))\n    end\n    ex.args[1] = esc(ex.args[1])\n    ndim = length(ex.args) - 1\n    ranges = map(r->esc(r.args[2]), ex.args[2:end])\n    for d = 1:ndim\n        var = ex.args[d+1].args[1]\n        ex.args[d+1] = :( $(esc(var)) = ($(ranges[d]))[I[$d]] )\n    end\n    return :( DArray((I::Tuple{Vararg{UnitRange{Int}}})->($ex0),\n                tuple($(map(r->:(length($r)), ranges)...))) )\nend\n\n# new DArray similar to an existing one\nDArray(init, d::DArray) = DArray(next_did(), init, size(d), procs(d), d.indices, d.cuts)\n\nsz_localpart_ref(ref, id) = size(fetch(ref))\n\nBase.similar(d::DArray, T::Type, dims::Dims) = DArray(I->Array{T}(undef, map(length,I)), dims, procs(d))\nBase.similar(d::DArray, T::Type) = similar(d, T, size(d))\nBase.similar(d::DArray{T}, dims::Dims) where {T} = similar(d, T, dims)\nBase.similar(d::DArray{T}) where {T} = similar(d, T, size(d))\n\nBase.size(d::DArray) = d.dims\n\nchunktype(d::DArray{T,N,A}) where {T,N,A} = A\n\n## chunk index utilities ##\n\n# decide how to divide each dimension\n# returns size of chunks array\nfunction defaultdist(dims, pids)\n    dims = [dims...]\n    chunks = ones(Int, length(dims))\n    np = length(pids)\n    f = sort!(collect(keys(factor(np))), rev=true)\n    k = 1\n    while np > 1\n        # repeatedly allocate largest factor to largest dim\n        if np % f[k] != 0\n            k += 1\n            if k > length(f)\n                break\n            end\n        end\n        fac = f[k]\n        (d, dno) = findmax(dims)\n        # resolve ties to highest dim\n        dno = findlast(isequal(d), dims)\n        if dims[dno] >= fac\n            dims[dno] = div(dims[dno], fac)\n            chunks[dno] *= fac\n        end\n        np = div(np, fac)\n    end\n    return chunks\nend\n\n# get array of start indices for dividing sz into nc chunks\nfunction defaultdist(sz::Int, nc::Int)\n    if sz >= nc\n        chunk_size = div(sz,nc)\n        remainder = rem(sz,nc)\n        grid = zeros(Int64, nc+1)\n        for i = 1:(nc+1)\n            grid[i] += (i-1)*chunk_size + 1\n            if i<= remainder\n                grid[i] += i-1\n            else\n                grid[i] += remainder\n            end\n        end\n        return grid\n    else\n        return [[1:(sz+1);]; zeros(Int, nc-sz)]\n    end\nend\n\n# compute indices array for dividing dims into chunks\nfunction chunk_idxs(dims, chunks)\n    cuts = map(defaultdist, dims, chunks)\n    n = length(dims)\n    idxs = Array{NTuple{n,UnitRange{Int}}}(undef, chunks...)\n    for cidx in CartesianIndices(tuple(chunks...))\n        idxs[cidx.I...] 
= ntuple(i -> (cuts[i][cidx[i]]:cuts[i][cidx[i] + 1] - 1), n)\n    end\n    return (idxs, cuts)\nend\n\nfunction localpartindex(pids::Array{Int})\n    mi = myid()\n    for i = 1:length(pids)\n        if pids[i] == mi\n            return i\n        end\n    end\n    return 0\nend\nlocalpartindex(d::DArray) = localpartindex(procs(d))\n\n\"\"\"\n    localpart(d::DArray)\n\nGet the local piece of a distributed array.\nReturns an empty array if no local part exists on the calling process.\n\nd[:L], d[:l], d[:LP], d[:lp] are an alternative means to get localparts.\nThis syntax can also be used for assignment. For example,\n`d[:L]=v` will assign `v` to the localpart of `d`.\n\"\"\"\nfunction localpart(d::DArray{T,N,A}) where {T,N,A}\n    lpidx = localpartindex(d)\n    if lpidx == 0\n        return empty_localpart(T,N,A)::A\n    end\n\n    return d.localpart::A\nend\n\nlocalpart(d::DArray, localidx...) = localpart(d)[localidx...]\n\n_localindex(i::Integer, offset) = i - offset\n_localindex(i::AbstractRange, offset) = (first(i)-offset):step(i):(last(i)-offset)\n_localindex(i::AbstractUnitRange, offset) = (first(i)-offset):(last(i)-offset)\n\n\"\"\"\n    makelocal(A::DArray, I...)\n\nEquivalent to `Array(view(A, I...))` but optimised for the case that the data is local.\nCan return a view into `localpart(A)`.\n\"\"\"\n@inline function makelocal(A::DArray{<:Any, <:Any, AT}, I::Vararg{Any, N}) where {N, AT}\n    J = map(i->Base.unalias(A, i), to_indices(A, I))\n    J = map(j-> isa(j, Base.Slice) ? j.indices : j, J)\n    @boundscheck checkbounds(A, J...)\n\n    lidcs = localindices(A)\n    if Base.checkbounds_indices(Bool, lidcs, J)\n        # data we want is local\n        viewidcs = ntuple(i -> _localindex(J[i], first(lidcs[i]) - 1), ndims(A))\n        view(localpart(A), viewidcs...)\n    else\n        # Make more efficient (?maybe) by allocating new memory\n        # only for the remote part\n        viewidcs = ntuple(i -> _localindex(J[i], 0), ndims(A))\n        arr = similar(AT, map(length, viewidcs)...)\n        copyto!(arr, view(A, viewidcs...))\n    end\nend\n\n# shortcut to set/get localparts of a distributed object\nBase.getindex(d::DArray, s::Symbol) = _getindex(d, s)\nBase.getindex(d::DArray{<:Any, 1}, s::Symbol) = _getindex(d, s)\nfunction _getindex(d::DArray, s::Symbol)\n    @assert s in [:L, :l, :LP, :lp]\n    return localpart(d)\nend\n\nfunction Base.setindex!(d::DArray{T,N,A}, new_lp::A, s::Symbol) where {T,N,A}\n    @assert s in [:L, :l, :LP, :lp]\n    d.localpart = new_lp\n    new_lp\nend\n\n\n# fetch localpart of d at pids[i]\nBase.fetch(d::DArray{T,N,A}, i) where {T,N,A} = remotecall_fetch(localpart, d.pids[i], d)\n\n\"\"\"\n    localindices(d)\n\nA tuple describing the indices owned by the local process.\nReturns a tuple with empty ranges if no local part exists on the calling process.\n\"\"\"\nfunction localindices(d::DArray)\n    lpidx = localpartindex(d)\n    if lpidx == 0\n        return ntuple(i -> 1:0, ndims(d))\n    end\n    return d.indices[lpidx]\nend\n\n# Equality\nfunction Base.:(==)(d::DArray{<:Any,<:Any,A}, a::AbstractArray) where A\n    if size(d) != size(a)\n        return false\n    else\n        b = asyncmap(procs(d)) do p\n            remotecall_fetch(p) do\n                localpart(d) == A(a[localindices(d)...])\n            end\n        end\n        return all(b)\n    end\nend\nfunction Base.:(==)(d::SubDArray, a::AbstractArray)\n    cd = copy(d)\n    t = cd == a\n    finalize(cd)\n    return t\nend\nBase.:(==)(a::AbstractArray, d::DArray) = d == 
a\nBase.:(==)(a::AbstractArray, d::SubDArray) = d == a\nBase.:(==)(d1::DArray, d2::DArray) = invoke(==, Tuple{DArray, AbstractArray}, d1, d2)\nfunction Base.:(==)(d1::SubDArray, d2::DArray)\n    cd1 = copy(d1)\n    t = cd1 == d2\n    finalize(cd1)\n    return t\nend\nfunction Base.:(==)(d1::DArray, d2::SubDArray)\n    cd2 = copy(d2)\n    t = d1 == cd2\n    finalize(cd2)\n    return t\nend\nfunction Base.:(==)(d1::SubDArray, d2::SubDArray)\n    cd1 = copy(d1)\n    t = cd1 == d2\n    finalize(cd1)\n    return t\nend\n\n\"\"\"\n    locate(d::DArray, I::Int...)\n\nDetermine the index of `procs(d)` that holds element `I`.\n\"\"\"\nfunction locate(d::DArray, I::Int...)\n    ntuple(ndims(d)) do i\n        fi = searchsortedlast(d.cuts[i], I[i])\n        if fi >= length(d.cuts[i])\n            throw(ArgumentError(\"element not contained in array\"))\n        end\n        return fi\n    end\nend\n\nchunk(d::DArray{T,N,A}, pid::Int) where {T,N,A} = remotecall_fetch(localpart, pid, d)::A\n\n## convenience constructors ##\n\n\"\"\"\n     dzeros(dims, ...)\n\nConstruct a distributed array of zeros.\nTrailing arguments are the same as those accepted by `DArray`.\n\"\"\"\ndzeros(dims::Dims, args...) = DArray(I->zeros(map(length,I)), dims, args...)\ndzeros(::Type{T}, dims::Dims, args...) where {T} = DArray(I->zeros(T,map(length,I)), dims, args...)\ndzeros(::Type{T}, d1::Integer, drest::Integer...) where {T} = dzeros(T, convert(Dims, tuple(d1, drest...)))\ndzeros(d1::Integer, drest::Integer...) = dzeros(Float64, convert(Dims, tuple(d1, drest...)))\ndzeros(d::Dims) = dzeros(Float64, d)\n\n\n\"\"\"\n    dones(dims, ...)\n\nConstruct a distributed array of ones.\nTrailing arguments are the same as those accepted by `DArray`.\n\"\"\"\ndones(dims::Dims, args...) = DArray(I->ones(map(length,I)), dims, args...)\ndones(::Type{T}, dims::Dims, args...) where {T} = DArray(I->ones(T,map(length,I)), dims, args...)\ndones(::Type{T}, d1::Integer, drest::Integer...) where {T} = dones(T, convert(Dims, tuple(d1, drest...)))\ndones(d1::Integer, drest::Integer...) = dones(Float64, convert(Dims, tuple(d1, drest...)))\ndones(d::Dims) = dones(Float64, d)\n\n\"\"\"\n     dfill(x, dims, ...)\n\nConstruct a distributed array filled with value `x`.\nTrailing arguments are the same as those accepted by `DArray`.\n\"\"\"\ndfill(v, dims::Dims, args...) = DArray(I->fill(v, map(length,I)), dims, args...)\ndfill(v, d1::Integer, drest::Integer...) = dfill(v, convert(Dims, tuple(d1, drest...)))\n\n\"\"\"\n     drand(dims, ...)\n\nConstruct a distributed uniform random array.\nTrailing arguments are the same as those accepted by `DArray`.\n\"\"\"\ndrand(::Type{T}, dims::Dims) where {T} = DArray(I -> rand(T, map(length, I)), dims)\ndrand(X, dims::Dims) = DArray(I -> rand(X, map(length, I)), dims)\ndrand(dims::Dims) = drand(Float64, dims)\n\ndrand(::Type{T}, d1::Integer, drest::Integer...) where {T} = drand(T, Dims((d1, drest...)))\ndrand(X, d1::Integer, drest::Integer...) = drand(X, Dims((d1, drest...)))\ndrand(d1::Integer, drest::Integer...) 
= drand(Float64, Dims((d1, drest...)))\n\n# With optional process IDs and number of chunks\nfor N in (1, 2)\n    @eval begin\n        drand(::Type{T}, dims::Dims, args::Vararg{Any,$N}) where {T} = DArray(I -> rand(T, map(length, I)), dims, args...)\n        drand(X, dims::Dims, args::Vararg{Any,$N}) = DArray(I -> rand(X, map(length, I)), dims, args...)\n        drand(dims::Dims, args::Vararg{Any,$N}) = drand(Float64, dims, args...)\n    end\nend\n\n# Fix method ambiguities\ndrand(dims::Dims, procs::Tuple{Vararg{Int}}) = drand(Float64, dims, procs)\ndrand(dims::Dims, procs::Tuple{Vararg{Int}}, dist) = drand(Float64, dims, procs, dist)\ndrand(X::Tuple{Vararg{Int}}, dim::Integer) = drand(X, Dims((dim,)))\ndrand(X::Tuple{Vararg{Int}}, d1::Integer, d2::Integer) = drand(X, Dims((d1, d2)))\n\n\"\"\"\n     drandn(dims, ...)\n\nConstruct a distributed normal random array.\nTrailing arguments are the same as those accepted by `DArray`.\n\"\"\"\ndrandn(dims::Dims, args...) = DArray(I->randn(map(length,I)), dims, args...)\ndrandn(d1::Integer, drest::Integer...) = drandn(convert(Dims, tuple(d1, drest...)))\n\n## conversions ##\n\n\"\"\"\n     distribute(A[; procs, dist])\n\nConvert a local array to distributed.\n\n`procs` optionally specifies an array of process IDs to use. (defaults to all workers)\n`dist` optionally specifies a vector or tuple of the number of partitions in each dimension\n\"\"\"\nfunction distribute(A::AbstractArray;\n    procs = workers()[1:min(nworkers(), maximum(size(A)))],\n    dist = defaultdist(size(A), procs))\n    np = prod(dist)\n    procs_used = procs[1:np]\n    idxs, _ = chunk_idxs([size(A)...], dist)\n\n    s = verified_destination_serializer(reshape(procs_used, size(idxs)), size(idxs)) do pididx\n        A[idxs[pididx]...]\n    end\n    return DArray(I->localpart(s), size(A), procs_used, dist)\nend\n\n\"\"\"\n    distribute(A, DA)\n\nDistribute a local array `A` like the distributed array `DA`.\n\n\"\"\"\nfunction distribute(A::AbstractArray, DA::DArray)\n    size(DA) == size(A) || throw(DimensionMismatch(\"Distributed array has size $(size(DA)) but array has $(size(A))\"))\n\n    s = verified_destination_serializer(procs(DA), size(DA.indices)) do pididx\n        A[DA.indices[pididx]...]\n    end\n    return DArray(I->localpart(s), DA)\nend\n\nDArray{T,N,S}(A::S) where {T,N,S<:AbstractArray} = distribute(convert(AbstractArray{T,N}, A))\n\nfunction Array{S,N}(d::DArray{T,N}) where {S,T,N}\n    a = Array{S}(undef, size(d))\n    @sync for (pid, indices) in zip(d.pids, d.indices)\n        if !any(isempty, indices)\n            @async a[indices...] = chunk(d, pid)\n        end\n    end\n    return a\nend\n\nfunction Array{S,N}(s::SubDArray{T,N}) where {S,T,N}\n    I = s.indices\n    d = parent(s)\n    if isa(I,Tuple{Vararg{UnitRange{Int}}}) && S<:T && T<:S && !isempty(s)\n        l = locate(d, map(first, I)...)\n        if isequal(d.indices[l...], I)\n            # SubDArray corresponds to a chunk\n            return chunk(d, d.pids[l...])\n        end\n    end\n    a = Array{S}(undef, size(s))\n    copyto!(a, s)\nend\n\nfunction Base.copyto!(a::Array, s::SubDArray)\n    N = ndims(a)\n    a[[1:size(a,i) for i=1:N]...] 
= s\n    return a\nend\n\nfunction DArray(SD::SubArray{T,N}) where {T,N}\n    D = SD.parent\n    DArray(size(SD), procs(D)) do I\n        lindices = Base.reindex(SD.indices, I)\n        convert(Array, D[lindices...])\n    end\nend\n\nfunction Base.reshape(A::DArray{T,1,S}, d::Dims) where {T,S<:Array}\n    if prod(d) != length(A)\n        throw(DimensionMismatch(\"dimensions must be consistent with array size\"))\n    end\n    return DArray(d) do I\n        sz = map(length,I)\n        d1offs = first(I[1])\n        nd = length(I)\n\n        B = Array{T}(undef, sz)\n        nr = size(B,1)\n        sztail = size(B)[2:end]\n\n        for i=1:div(length(B),nr)\n            i2 = CartesianIndices(sztail)[i]\n            globalidx = [ I[j][i2[j-1]] for j=2:nd ]\n\n            a = LinearIndices(d)[d1offs, globalidx...]\n\n            B[:,i] = Array(A[a:(a+nr-1)])\n        end\n        B\n    end\nend\n\n## indexing ##\nconst _allowscalar = Ref(true)\nallowscalar(flag = true) = (_allowscalar[] = flag)\n_scalarindexingallowed() = _allowscalar[] || throw(ErrorException(\"scalar indexing disabled\"))\n\ngetlocalindex(d::DArray, idx...) = localpart(d)[idx...]\nfunction getindex_tuple(d::DArray{T,N}, I::NTuple{N,Int}) where {T,N}\n    chidx = locate(d, I...)\n    idxs = d.indices[chidx...]\n    localidx = ntuple(i -> (I[i] - first(idxs[i]) + 1), ndims(d))\n    pid = d.pids[chidx...]\n    return remotecall_fetch(getlocalindex, pid, d, localidx...)::T\nend\n\nfunction Base.getindex(d::DArray, i::Int)\n    _scalarindexingallowed()\n    return getindex_tuple(d, Tuple(CartesianIndices(d)[i]))\nend\nfunction Base.getindex(d::DArray{<:Any,N}, i::Vararg{Int,N}) where {N}\n    _scalarindexingallowed()\n    return getindex_tuple(d, i)\nend\nBase.getindex(d::DArray) = d[1]\nBase.getindex(d::SubDArray, I::Int...) = invoke(getindex, Tuple{SubArray{<:Any,N},Vararg{Int,N}} where N, d, I...)\nBase.getindex(d::SubOrDArray, I::Union{Int,UnitRange{Int},Colon,Vector{Int},StepRange{Int,Int}}...) = view(d, I...)\n\nfunction Base.isassigned(D::DArray, i::Integer...)\n    try\n        getindex_tuple(D, i)\n        true\n    catch e\n        if isa(e, BoundsError) || isa(e, UndefRefError)\n            return false\n        else\n            rethrow(e)\n        end\n    end\nend\n\nBase.copy(d::SubDArray) = copyto!(similar(d), d)\nBase.copy(d::SubDArray{<:Any,2}) = copyto!(similar(d), d)\n\nfunction Base.copyto!(dest::SubOrDArray, src::AbstractArray)\n    @sync for p in procs(dest)\n        @async remotecall_wait(p) do\n            ldest = localpart(dest)\n            copyto!(ldest, view(src, localindices(dest)...))\n        end\n    end\n    return dest\nend\n\nfunction Base.deepcopy(src::DArray)\n    dest = similar(src)\n    @sync for p in procs(src)\n        @async remotecall_wait(p) do\n            dest[:L] = deepcopy(src[:L])\n        end\n    end\n    return dest\nend\n# We also want to optimize setindex! with a SubDArray source, but this is hard\n# and only works on 0.5.\n\n# Similar to Base.indexin, but just create a logical mask. Note that this\n# must return a logical mask in order to support merging multiple masks\n# together into one linear index since we need to know how many elements to\n# skip at the end. 
In many cases range intersection would be much faster\n# than generating a logical mask, but that loses the endpoint information.\nindexin_mask(a, b::Number) = a .== b\nindexin_mask(a, r::AbstractRange{Int}) = [i in r for i in a]\nindexin_mask(a, b::AbstractArray{Int}) = indexin_mask(a, BitSet(b))\nindexin_mask(a, b::AbstractArray) = indexin_mask(a, Set(b))\nindexin_mask(a, b) = [i in b for i in a]\n\nimport Base: tail\n# Given a tuple of indices and a tuple of masks, restrict the indices to the\n# valid regions. This is, effectively, reversing Base.setindex_shape_check.\n# We can't just use indexing into MergedIndices here because getindex is much\n# pickier about singleton dimensions than setindex! is.\nrestrict_indices(::Tuple{}, ::Tuple{}) = ()\nfunction restrict_indices(a::Tuple{Any, Vararg{Any}}, b::Tuple{Any, Vararg{Any}})\n    if (length(a[1]) == length(b[1]) == 1) || (length(a[1]) > 1 && length(b[1]) > 1)\n        (vec(a[1])[vec(b[1])], restrict_indices(tail(a), tail(b))...)\n    elseif length(a[1]) == 1\n        (a[1], restrict_indices(tail(a), b)...)\n    elseif length(b[1]) == 1 && b[1][1]\n        restrict_indices(a, tail(b))\n    else\n        throw(DimensionMismatch(\"this should be caught by setindex_shape_check; please submit an issue\"))\n    end\nend\n# The final indices are funky - they're allowed to accumulate together.\n# An easy (albeit very inefficient) fix for too many masks is to use the\n# outer product to merge them. But we can do that lazily with a custom type:\nfunction restrict_indices(a::Tuple{Any}, b::Tuple{Any, Any, Vararg{Any}})\n    (vec(a[1])[vec(ProductIndices(b, map(length, b)))],)\nend\n# But too many indices is much harder; this requires merging the indices\n# in `a` before applying the final mask in `b`.\nfunction restrict_indices(a::Tuple{Any, Any, Vararg{Any}}, b::Tuple{Any})\n    if length(a[1]) == 1\n        (a[1], restrict_indices(tail(a), b)...)\n    else\n        # When one mask spans multiple indices, we need to merge the indices\n        # together. At this point, we can just use indexing to merge them since\n        # there's no longer special handling of singleton dimensions\n        (view(MergedIndices(a, map(length, a)), b[1]),)\n    end\nend\n\nstruct ProductIndices{I,N} <: AbstractArray{Bool, N}\n    indices::I\n    sz::NTuple{N,Int}\nend\nBase.size(P::ProductIndices) = P.sz\n# This gets passed to map to avoid breaking propagation of inbounds\nBase.@propagate_inbounds propagate_getindex(A, I...) = A[I...]\nBase.@propagate_inbounds Base.getindex(P::ProductIndices{J,N}, I::Vararg{Int, N}) where {J,N} =\n    Bool((&)(map(propagate_getindex, P.indices, I)...))\n\nstruct MergedIndices{I,N} <: AbstractArray{CartesianIndex{N}, N}\n    indices::I\n    sz::NTuple{N,Int}\nend\nBase.size(M::MergedIndices) = M.sz\nBase.@propagate_inbounds Base.getindex(M::MergedIndices{J,N}, I::Vararg{Int, N}) where {J,N} =\n    CartesianIndex(map(propagate_getindex, M.indices, I))\n# Additionally, we optimize bounds checking when using MergedIndices as an\n# array index since checking, e.g., A[1:500, 1:500] is *way* faster than\n# checking an array of 500^2 elements of CartesianIndex{2}. This optimization\n# also applies to reshapes of MergedIndices since the outer shape of the\n# container doesn't affect the index elements themselves. We can go even\n# farther and say that even restricted views of MergedIndices must be valid\n# over the entire array. 
This is overly strict in general, but in this\n# use-case all the merged indices must be valid at some point, so it's ok.\nconst ReshapedMergedIndices{T,N,M<:MergedIndices} = Base.ReshapedArray{T,N,M}\nconst SubMergedIndices{T,N,M<:Union{MergedIndices, ReshapedMergedIndices}} = SubArray{T,N,M}\nconst MergedIndicesOrSub = Union{MergedIndices, ReshapedMergedIndices, SubMergedIndices}\n@inline Base.checkbounds_indices(::Type{Bool}, inds::Tuple{}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) =\n    Base.checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...))\n@inline Base.checkbounds_indices(::Type{Bool}, inds::Tuple{Any}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) =\n    Base.checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...))\n@inline Base.checkbounds_indices(::Type{Bool}, inds::Tuple, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) =\n    Base.checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...))\n\n# The tricky thing here is that we want to optimize the accesses into the\n# distributed array, but in doing so, we lose track of which indices in I we\n# should be using.\n#\n# I’ve come to the conclusion that the function is utterly insane.\n# There are *6* flavors of indices with four different reference points:\n# 1. Find the indices of each portion of the DArray.\n# 2. Find the valid subset of indices for the SubArray into that portion.\n# 3. Find the portion of the `I` indices that should be used when you access the\n#    `K` indices in the subarray.  This guy is nasty.  It’s totally backwards\n#    from all other arrays, wherein we simply iterate over the source array’s\n#    elements.  You need to *both* know which elements in `J` were skipped\n#    (`indexin_mask`) and which dimensions should match up (`restrict_indices`)\n# 4. If `K` doesn't correspond to an entire chunk, reinterpret `K` in terms of\n#    the local portion of the source array\nfunction Base.setindex!(a::Array, s::SubDArray,\n        I::Union{UnitRange{Int},Colon,Vector{Int},StepRange{Int,Int}}...)\n    Inew = Base.to_indices(a, I)\n    Base.setindex_shape_check(s, Base.index_lengths(Inew...)...)\n    d = parent(s)\n    J = Base.to_indices(d, s.indices)\n    @sync for (pid, K_c) in zip(d.pids, d.indices)\n        K = map(intersect, J, K_c)\n        if !any(isempty, K)\n            K_mask = map(indexin_mask, J, K_c)\n            idxs = restrict_indices(Inew, K_mask)\n            if isequal(K, K_c)\n                # whole chunk\n                @async a[idxs...] = chunk(d, pid)\n            else\n                # partial chunk\n                localidxs = map((Kj, K_cj) -> Kj .- (first(K_cj) - 1), K, K_c)\n                @async a[idxs...] = remotecall_fetch((d, idxs) -> localpart(d)[idxs...], pid, d, localidxs)\n            end\n        end\n    end\n    return a\nend\n\nfunction Base.fill!(A::DArray, x)\n    @sync for p in procs(A)\n        @async remotecall_wait((A,x)->fill!(localpart(A), x), p, A, x)\n    end\n    return A\nend\n\nfunction Random.rand!(A::DArray, ::Type{T}) where T\n    @sync for p in procs(A)\n        @async remotecall_wait((A, T)->rand!(localpart(A), T), p, A, T)\n    end\n    return A\nend\n
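\n# Illustrative usage sketch for the conversion and indexing API in this file\n# (not part of the library; assumes workers were added with `addprocs` and\n# DistributedArrays is loaded on all of them):\n#\n#     A  = rand(100, 100)\n#     DA = distribute(A)                      # partition A over the workers\n#     remotecall_fetch(() -> size(localpart(DA)), first(procs(DA)))\n#     DistributedArrays.allowscalar(false)    # make scalar reads like DA[1,1] throw\n#     V  = DA[1:10, 1:10]                     # range indexing returns a view (SubDArray)\n#     Array(V)                                # materialize the view locally\n"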
  },
  {
    "path": "src/linalg.jl",
    "content": "function Base.copy(Dadj::Adjoint{T,<:DArray{T,2}}) where T\n    D = parent(Dadj)\n    DArray(reverse(size(D)), procs(D)) do I\n        lp = Array{T}(undef, map(length, I))\n        rp = convert(Array, D[reverse(I)...])\n        adjoint!(lp, rp)\n    end\nend\n\nfunction Base.copy(Dtr::Transpose{T,<:DArray{T,2}}) where T\n    D = parent(Dtr)\n    DArray(reverse(size(D)), procs(D)) do I\n        lp = Array{T}(undef, map(length, I))\n        rp = convert(Array, D[reverse(I)...])\n        transpose!(lp, rp)\n    end\nend\n\nconst DVector{T,A} = DArray{T,1,A}\nconst DMatrix{T,A} = DArray{T,2,A}\n\n# Level 1\n\nfunction LinearAlgebra.axpy!(α, x::DArray, y::DArray)\n    if length(x) != length(y)\n        throw(DimensionMismatch(\"vectors must have same length\"))\n    end\n    @sync for p in procs(y)\n        @async remotecall_wait(p) do\n            axpy!(α, localpart(x), localpart(y))\n        end\n    end\n    return y\nend\n\nfunction LinearAlgebra.dot(x::DVector, y::DVector)\n    if length(x) != length(y)\n        throw(DimensionMismatch(\"\"))\n    end\n\n    results = asyncmap(procs(x)) do p\n        remotecall_fetch((x, y) -> dot(localpart(x), makelocal(y, localindices(x)...)), p, x, y)\n    end\n    return reduce(+, results)\nend\n\nfunction LinearAlgebra.norm(x::DArray, p::Real = 2)\n    results = asyncmap(procs(x)) do pp\n        remotecall_fetch(() -> norm(localpart(x), p), pp)\n    end\n    return norm(results, p)\nend\n\nfunction LinearAlgebra.rmul!(A::DArray, x::Number)\n    @sync for p in procs(A)\n        @async remotecall_wait((A,x)->rmul!(localpart(A), x), p, A, x)\n    end\n    return A\nend\n\n# Level 2\nfunction add!(dest, src, scale = one(dest[1]))\n    if length(dest) != length(src)\n        throw(DimensionMismatch(\"source and destination arrays must have same number of elements\"))\n    end\n    if scale == one(scale)\n        @simd for i = eachindex(dest)\n            @inbounds dest[i] += src[i]\n        end\n    else\n        @simd for i = eachindex(dest)\n            @inbounds dest[i] += scale*src[i]\n        end\n    end\n    return dest\nend\n\nfunction LinearAlgebra.mul!(y::DVector, A::DMatrix, x::AbstractVector, α::Number = 1, β::Number = 0)\n\n    # error checks\n    if size(A, 2) != length(x)\n        throw(DimensionMismatch(\"\"))\n    end\n    if y.cuts[1] != A.cuts[1]\n        throw(ArgumentError(\"cuts of output vector must match cuts of first dimension of matrix\"))\n    end\n\n    # Multiply on each tile of A\n    R = Array{Future}(undef, size(A.pids))\n    for j = 1:size(A.pids, 2)\n        xj = x[A.cuts[2][j]:A.cuts[2][j + 1] - 1]\n        for i = 1:size(A.pids, 1)\n            R[i,j] = remotecall(procs(A)[i,j]) do\n                localpart(A)*convert(localtype(x), xj)\n            end\n        end\n    end\n\n    # Scale y if necessary\n    if β != one(β)\n        asyncmap(procs(y)) do p\n            remotecall_wait(p) do\n                if !iszero(β)\n                    rmul!(localpart(y), β)\n                else\n                    fill!(localpart(y), 0)\n                end\n            end\n        end\n    end\n\n    # Update y\n    @sync for i = 1:size(R, 1)\n        p = y.pids[i]\n        for j = 1:size(R, 2)\n            rij = R[i,j]\n            @async remotecall_wait(() -> add!(localpart(y), fetch(rij), α), p)\n        end\n    end\n\n    return y\nend\n\nfunction LinearAlgebra.mul!(y::DVector, adjA::Adjoint{<:Number,<:DMatrix}, x::AbstractVector, α::Number = 1, β::Number = 0)\n\n    A = parent(adjA)\n\n    # error 
checks\n    if size(A, 1) != length(x)\n        throw(DimensionMismatch(\"first dimension of A, $(size(A, 1)), does not match length of x, $(length(x))\"))\n    end\n    if y.cuts[1] != A.cuts[2]\n        throw(ArgumentError(\"cuts of output vector must match cuts of second dimension of matrix\"))\n    end\n\n    # Multiply on each tile of A\n    R = Array{Future}(undef, reverse(size(A.pids)))\n    for j = 1:size(A.pids, 1)\n        xj = x[A.cuts[1][j]:A.cuts[1][j + 1] - 1]\n        for i = 1:size(A.pids, 2)\n            R[i,j] = remotecall(() -> localpart(A)'*convert(localtype(x), xj), procs(A)[j,i])\n        end\n    end\n\n    # Scale y if necessary\n    if β != one(β)\n        @sync for p in procs(y)\n            @async remotecall_wait(p) do\n                if !iszero(β)\n                    rmul!(localpart(y), β)\n                else\n                    fill!(localpart(y), 0)\n                end\n            end\n        end\n    end\n\n    # Update y\n    @sync for i = 1:size(R, 1)\n        p = y.pids[i]\n        for j = 1:size(R, 2)\n            rij = R[i,j]\n            @async remotecall_wait(() -> add!(localpart(y), fetch(rij), α), p)\n        end\n    end\n    return y\nend\n\nfunction LinearAlgebra.lmul!(D::Diagonal, DA::DMatrix)\n    d = D.diag\n    s = verified_destination_serializer(procs(DA), size(DA.indices)) do pididx\n        d[DA.indices[pididx][1]]\n    end\n    map_localparts!(DA) do lDA\n        lmul!(Diagonal(localpart(s)), lDA)\n    end\nend\n\nfunction LinearAlgebra.rmul!(DA::DMatrix, D::Diagonal)\n    d = D.diag\n    s = verified_destination_serializer(procs(DA), size(DA.indices)) do pididx\n        d[DA.indices[pididx][2]]\n    end\n    map_localparts!(DA) do lDA\n        rmul!(lDA, Diagonal(localpart(s)))\n    end\nend\n\n# Level 3\nfunction _matmatmul!(C::DMatrix, A::DMatrix, B::AbstractMatrix, α::Number, β::Number, tA)\n    # error checks\n    Ad1, Ad2 = (tA == 'N') ? (1,2) : (2,1)\n    mA, nA = (size(A, Ad1), size(A, Ad2))\n    mB, nB = size(B)\n    if mB != nA\n        throw(DimensionMismatch(\"matrix A has dimensions ($mA, $nA), matrix B has dimensions ($mB, $nB)\"))\n    end\n    if size(C,1) != mA || size(C,2) != nB\n        throw(DimensionMismatch(\"result C has dimensions $(size(C)), needs ($mA, $nB)\"))\n    end\n    if C.cuts[1] != A.cuts[Ad1]\n        throw(ArgumentError(\"cuts of the first dimension of the output matrix must match cuts of dimension $Ad1 of the first input matrix\"))\n    end\n\n    # Multiply on each tile of A\n    if tA == 'N'\n        R = Array{Future}(undef, size(procs(A))..., size(procs(C), 2))\n    else\n        R = Array{Future}(undef, reverse(size(procs(A)))..., size(procs(C), 2))\n    end\n    for j = 1:size(A.pids, Ad2)\n        for k = 1:size(C.pids, 2)\n            Acuts = A.cuts[Ad2]\n            Ccuts = C.cuts[2]\n            Bjk = B[Acuts[j]:Acuts[j + 1] - 1, Ccuts[k]:Ccuts[k + 1] - 1]\n            for i = 1:size(A.pids, Ad1)\n                p = (tA == 'N') ? 
procs(A)[i,j] : procs(A)[j,i]\n                R[i,j,k] = remotecall(p) do\n                    if tA == 'T'\n                        return transpose(localpart(A))*convert(localtype(B), Bjk)\n                    elseif tA == 'C'\n                        return adjoint(localpart(A))*convert(localtype(B), Bjk)\n                    else\n                        return localpart(A)*convert(localtype(B), Bjk)\n                    end\n                end\n            end\n        end\n    end\n\n    # Scale C if necessary\n    if β != one(β)\n        @sync for p in C.pids\n            if iszero(β)\n                @async remotecall_wait(() -> fill!(localpart(C), 0), p)\n            else\n                @async remotecall_wait(() -> rmul!(localpart(C), β), p)\n            end\n        end\n    end\n\n    # Update C\n    @sync for i = 1:size(R, 1)\n        for k = 1:size(C.pids, 2)\n            p = C.pids[i,k]\n            for j = 1:size(R, 2)\n                rijk = R[i,j,k]\n                @async remotecall_wait(d -> add!(localpart(d), fetch(rijk), α), p, C)\n            end\n        end\n    end\n    return C\nend\n\nLinearAlgebra.mul!(C::DMatrix, A::DMatrix, B::AbstractMatrix, α::Number = 1, β::Number = 0) = _matmatmul!(C, A, B, α, β, 'N')\nLinearAlgebra.mul!(C::DMatrix, A::Adjoint{<:Number,<:DMatrix}, B::AbstractMatrix, α::Number = 1, β::Number = 0) = _matmatmul!(C, parent(A), B, α, β, 'C')\nLinearAlgebra.mul!(C::DMatrix, A::Transpose{<:Number,<:DMatrix}, B::AbstractMatrix, α::Number = 1, β::Number = 0) = _matmatmul!(C, parent(A), B, α, β, 'T')\n\n_matmul_op = (t,s) -> t*s + t*s\n\nfunction Base.:*(A::DMatrix, x::AbstractVector)\n    T = Base.promote_op(_matmul_op, eltype(A), eltype(x))\n    y = DArray(I -> Array{T}(undef, map(length, I)), (size(A, 1),), procs(A)[:,1], (size(procs(A), 1),))\n    return mul!(y, A, x)\nend\nfunction Base.:*(A::DMatrix, B::AbstractMatrix)\n    T = Base.promote_op(_matmul_op, eltype(A), eltype(B))\n    C = DArray(I -> Array{T}(undef, map(length, I)),\n            (size(A, 1), size(B, 2)),\n            procs(A)[:,1:min(size(procs(A), 2), size(procs(B), 2))],\n            (size(procs(A), 1), min(size(procs(A), 2), size(procs(B), 2))))\n    return mul!(C, A, B)\nend\n\nfunction Base.:*(adjA::Adjoint{<:Any,<:DMatrix}, x::AbstractVector)\n    A = parent(adjA)\n    T = Base.promote_op(_matmul_op, eltype(A), eltype(x))\n    y = DArray(I -> Array{T}(undef, map(length, I)),\n            (size(A, 2),),\n            procs(A)[1,:],\n            (size(procs(A), 2),))\n    return mul!(y, adjA, x)\nend\nfunction Base.:*(adjA::Adjoint{<:Any,<:DMatrix}, B::AbstractMatrix)\n    A = parent(adjA)\n    T = Base.promote_op(_matmul_op, eltype(A), eltype(B))\n    C = DArray(I -> Array{T}(undef, map(length, I)), (size(A, 2),\n        size(B, 2)),\n        procs(A)[1:min(size(procs(A), 1), size(procs(B), 2)),:],\n        (size(procs(A), 2), min(size(procs(A), 1), size(procs(B), 2))))\n    return mul!(C, adjA, B)\nend\n\nfunction Base.:*(trA::Transpose{<:Any,<:DMatrix}, x::AbstractVector)\n    A = parent(trA)\n    T = Base.promote_op(_matmul_op, eltype(A), eltype(x))\n    y = DArray(I -> Array{T}(undef, map(length, I)),\n            (size(A, 2),),\n            procs(A)[1,:],\n            (size(procs(A), 2),))\n    return mul!(y, trA, x)\nend\nfunction Base.:*(trA::Transpose{<:Any,<:DMatrix}, B::AbstractMatrix)\n    A = parent(trA)\n    T = Base.promote_op(_matmul_op, eltype(A), eltype(B))\n    C = DArray(I -> Array{T}(undef, map(length, I)), (size(A, 2),\n        size(B, 2)),\n        
procs(A)[1:min(size(procs(A), 1), size(procs(B), 2)),:],\n        (size(procs(A), 2), min(size(procs(A), 1), size(procs(B), 2))))\n    return mul!(C, trA, B)\nend\n
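\n# Usage sketch for the distributed products defined above (illustrative, not\n# part of the library; assumes a worker pool with DistributedArrays loaded\n# everywhere):\n#\n#     A = drandn((400, 400))\n#     B = drandn((400, 400))\n#     x = randn(400)\n#     y = A * x     # distributed matvec; y is a DVector cut like the rows of A\n#     z = A' * x    # adjoint matvec without materializing A'\n#     C = A * B     # distributed matmat assembled from per-tile remotecalls\n"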
  },
  {
    "path": "src/mapreduce.jl",
    "content": "## higher-order functions ##\n\nBase.map(f, d0::DArray, ds::AbstractArray...) = broadcast(f, d0, ds...)\n\nfunction Base.map!(f::F, dest::DArray, src::DArray{<:Any,<:Any,A}) where {F,A}\n    @sync for p in procs(dest)\n        @async remotecall_wait(p) do\n            map!(f, localpart(dest), makelocal(src, localindices(dest)...))\n        end\n    end\n    return dest\nend\n\n# Only defining `reduce(f, ::DArray)` causes method ambiguity issues with\n# - `reduce(hcat, ::AbstractVector{<:AbstractVecOrMat})`\n# - `reduce(vcat, ::AbstractVector{<:AbstractVecOrMat})`\nBase.reduce(f, d::DArray) = _reduce(f, d)\nBase.reduce(::typeof(hcat), d::DArray{<:AbstractVecOrMat, 1}) = _reduce(hcat, d)\nBase.reduce(::typeof(vcat), d::DArray{<:AbstractVecOrMat, 1}) = _reduce(vcat, d)\nfunction _reduce(f, d::DArray)\n    results = asyncmap(procs(d)) do p\n        remotecall_fetch(p) do\n            return reduce(f, localpart(d))\n        end\n    end\n    reduce(f, results)\nend\n\nfunction Base._mapreduce(f, op, ::IndexCartesian, d::DArray)\n    results = asyncmap(procs(d)) do p\n        remotecall_fetch((_f,_op,_d)->mapreduce(_f, _op, localpart(_d)), p, f, op, d)\n    end\n\n    reduce(op, results)\nend\nBase._mapreduce(f, op, ::IndexCartesian, d::SubDArray) = Base._mapreduce(f, op, IndexCartesian(), DArray(d))\n# Base.mapreduce(f, opt::Union{typeof(|), typeof(&)}, d::DArray) = _mapreduce(f, opt, d)\n# Base.mapreduce(f, opt::Function, d::DArray) = _mapreduce(f, opt, d)\n# Base.mapreduce(f, opt, d::DArray) = _mapreduce(f, opt, d)\n\n# mapreducedim\nfunction Base.reducedim_initarray(A::DArray, region, v0, ::Type{R}) where {R}\n    # Store reduction on lowest pids\n    pids = A.pids[ntuple(i -> i in region ? (1:1) : (:), ndims(A))...]\n    chunks = similar(pids, Future)\n    asyncmap!(chunks, pids) do p\n        remotecall_wait(() -> Base.reducedim_initarray(localpart(A), region, v0, R), p)\n    end\n    return DArray(chunks)\nend\nBase.reducedim_initarray(A::DArray, region, v0::T) where {T} = Base.reducedim_initarray(A, region, v0, T)\n\n# Compute mapreducedim of each localpart and store the result in a new DArray\nfunction mapreducedim_within(f, op, A::DArray, region)\n    arraysize = [size(A)...]\n    gridsize = [size(A.indices)...]\n    arraysize[[region...]] = gridsize[[region...]]\n    indx = similar(A.indices)\n\n    for i in CartesianIndices(indx)\n        indx[i] = ntuple(j -> j in region ? (i.I[j]:i.I[j]) : A.indices[i][j], ndims(A))\n    end\n    cuts = [i in region ? collect(1:arraysize[i] + 1) : A.cuts[i] for i in 1:ndims(A)]\n    return DArray(next_did(), I -> mapreduce(f, op, localpart(A), dims=region),\n        tuple(arraysize...), procs(A), indx, cuts)\nend\n\n# Compute mapreducedim across the processes. This should be done after mapreducedim\n# has been run on each localpart with mapreducedim_within. Eventually, we might\n# want to write mapreducedim_between! 
as a binary reduction.\nfunction mapreducedim_between!(f, op, R::DArray, A::DArray, region)\n    @sync for p in procs(R)\n        @async remotecall_wait(p, f, op, R, A, region) do f, op, R, A, region\n            localind = [r for r = localindices(A)]\n            localind[[region...]] = [1:n for n = size(A)[[region...]]]\n            B = convert(Array, A[localind...])\n            Base.mapreducedim!(f, op, localpart(R), B)\n        end\n    end\n    return R\nend\n\nfunction Base.mapreducedim!(f, op, R::DArray, A::DArray)\n    lsize = Base.check_reducedims(R,A)\n    if isempty(A)\n        return copy(R)\n    end\n    region = tuple(collect(1:ndims(A))[[size(R)...] .!= [size(A)...]]...)\n    if isempty(region)\n        return copyto!(R, A)\n    end\n    B = mapreducedim_within(f, op, A, region)\n    return mapreducedim_between!(identity, op, R, B, region)\nend\n\n## Some special cases\nfunction Base._all(f, A::DArray, ::Colon)\n    B = asyncmap(procs(A)) do p\n        remotecall_fetch(p) do\n            all(f, localpart(A))\n        end\n    end\n    return all(B)\nend\n\nfunction Base._any(f, A::DArray, ::Colon)\n    B = asyncmap(procs(A)) do p\n        remotecall_fetch(p) do\n            any(f, localpart(A))\n        end\n    end\n    return any(B)\nend\n\nfunction Base.count(f, A::DArray)\n    B = asyncmap(procs(A)) do p\n        remotecall_fetch(p) do\n            count(f, localpart(A))\n        end\n    end\n    return sum(B)\nend\n\nfunction Base.extrema(d::DArray)\n    r = asyncmap(procs(d)) do p\n        remotecall_fetch(p) do\n            extrema(localpart(d))\n        end\n    end\n    return reduce((t,s) -> (min(t[1], s[1]), max(t[2], s[2])), r)\nend\n\n# Unary vector functions\nBase.:(-)(D::DArray) = map(-, D)\n\n\nmap_localparts(f::Callable, d::DArray) = DArray(i->f(localpart(d)), d)\nmap_localparts(f::Callable, d1::DArray, d2::DArray) = DArray(d1) do I\n    f(localpart(d1), localpart(d2))\nend\n\nfunction map_localparts(f::Callable, DA::DArray, A::Array)\n    s = verified_destination_serializer(procs(DA), size(DA.indices)) do pididx\n        A[DA.indices[pididx]...]\n    end\n    DArray(DA) do I\n        f(localpart(DA), localpart(s))\n    end\nend\n\nfunction map_localparts(f::Callable, A::Array, DA::DArray)\n    s = verified_destination_serializer(procs(DA), size(DA.indices)) do pididx\n        A[DA.indices[pididx]...]\n    end\n    DArray(DA) do I\n        f(localpart(s), localpart(DA))\n    end\nend\n\nfunction map_localparts!(f::Callable, d::DArray)\n    @sync for p in procs(d)\n        @async remotecall_wait((f,d)->f(localpart(d)), p, f, d)\n    end\n    return d\nend\n\n# Here we assume all the DArrays have\n# the same size and distribution\nmap_localparts(f::Callable, As::DArray...) 
= DArray(I->f(map(localpart, As)...), As[1])\n\n\nfunction samedist(A::DArray, B::DArray)\n    (size(A) == size(B)) || throw(DimensionMismatch())\n    if (procs(A) != procs(B)) || (A.cuts != B.cuts)\n        B = DArray(x->B[x...], A)\n    end\n    B\nend\n\nfor f in (:+, :-, :div, :mod, :rem, :&, :|, :xor)\n    @eval begin\n        function Base.$f(A::DArray{T}, B::DArray{T}) where T\n            B = samedist(A, B)\n            map_localparts($f, A, B)\n        end\n        Base.$f(A::DArray{T}, B::Array{T}) where {T} = map_localparts($f, A, B)\n        Base.$f(A::Array{T}, B::DArray{T}) where {T} = map_localparts($f, A, B)\n    end\nend\n\nfunction Base.mapslices(f, D::DArray{T,N,A}; dims) where {T,N,A}\n    if !(dims isa AbstractVector)\n        dims = [dims...]\n    end\n    if !all(t -> t == 1, size(D.indices)[dims])\n        p = ones(Int, ndims(D))\n        nondims = filter(t -> !(t in dims), 1:ndims(D))\n        p[nondims] = defaultdist([size(D)...][[nondims...]], procs(D))\n        DD = DArray(size(D), procs(D), p) do I\n            return convert(A, D[I...])\n        end\n        return mapslices(f, DD, dims=dims)\n    end\n\n    refs = Future[remotecall((x,y,z)->mapslices(x,localpart(y),dims=z), p, f, D, dims) for p in procs(D)]\n\n    DArray(reshape(refs, size(procs(D))))\nend\n\nfunction _ppeval(f, A...; dim = map(ndims, A))\n    if length(dim) != length(A)\n        throw(ArgumentError(\"dim argument has wrong length. length(dim) = $(length(dim)) but should be $(length(A))\"))\n    end\n    narg = length(A)\n    dimlength = size(A[1], dim[1])\n    for i = 2:narg\n        if dim[i] > 0 && dimlength != size(A[i], dim[i])\n            throw(ArgumentError(\"lengths of broadcast dimensions must be the same. size(A[1], $(dim[1])) = $dimlength but size(A[$i], $(dim[i])) = $(size(A[i], dim[i]))\"))\n        end\n    end\n    dims = []\n    idx  = []\n    args = []\n    for i = 1:narg\n        push!(dims, ndims(A[i]))\n        push!(idx, Any[Colon() for d in 1:dims[i]])\n        if dim[i] > 0\n            idx[i][dim[i]] = 1\n            push!(args, view(A[i], idx[i]...))\n        else\n            push!(args, A[i])\n        end\n    end\n    R1 = f(args...)\n    ridx = Any[1:size(R1, d) for d in 1:ndims(R1)]\n    push!(ridx, 1)\n    Rsize = map(last, ridx)\n    Rsize[end] = dimlength\n    R = Array{eltype(R1)}(undef, Rsize...)\n\n    for i = 1:dimlength\n        for j = 1:narg\n            if dim[j] > 0\n                idx[j][dim[j]] = i\n                args[j] = view(A[j], idx[j]...)\n            else\n                args[j] = A[j]\n            end\n        end\n        ridx[end] = i\n        R[ridx...] = f(args...)\n    end\n\n    return R\nend\n\n\"\"\"\n    ppeval(f, D...; dim::NTuple)\n\nEvaluates the callable argument `f` on slices of the elements of the `D` tuple.\n\n#### Arguments\n`f` can be any callable object that accepts sliced or broadcasted elements of `D`.\nThe result returned from `f` must be either an array or a scalar.\n\n`D` has any number of elements and the elements can have any type. If an element\nof `D` is a distributed array, it is sliced along the dimension specified by\n`dim`. If an element of `D` is not distributed, it is by default broadcasted and\napplied on all evaluations of `f`.\n\n`dim` is a tuple of integers specifying the dimension over which the elements\nof `D` are sliced. The length of the tuple must therefore be the same as the\nnumber of arguments `D`. By default distributed arrays are sliced along the\nlast dimension. 
If the value is less than or equal to zero, the\nelement is broadcasted to all evaluations of `f`.\n\n#### Result\n`ppeval` returns a distributed array of dimension `p+1` where the first `p`\nsizes correspond to the sizes of return values of `f`. The last dimension of\nthe return array from `ppeval` has the same length as the dimension over which\nthe input arrays are sliced.\n\n#### Examples\n```jl\naddprocs(Sys.CPU_THREADS)\n\nusing DistributedArrays\n\nA = drandn((10, 10, Sys.CPU_THREADS), workers(), [1, 1, Sys.CPU_THREADS])\n\nppeval(eigvals, A)\n\nppeval(eigvals, A, randn(10,10)) # broadcasting second argument\n\nB = drandn((10, Sys.CPU_THREADS), workers(), [1, Sys.CPU_THREADS])\n\nppeval(*, A, B)\n```\n\"\"\"\nfunction ppeval(f, D...; dim::NTuple = map(t -> isa(t, DArray) ? ndims(t) : 0, D))\n    # Ensure that the complete DArray is available on the specified dims on all processors\n    for i = 1:length(D)\n        if isa(D[i], DArray)\n            for idxs in D[i].indices\n                for d in setdiff(1:ndims(D[i]), dim[i])\n                    if length(idxs[d]) != size(D[i], d)\n                        throw(DimensionMismatch(string(\"dimension $d is distributed. \",\n                            \"ppeval requires dimension $d to be completely available on all processors.\")))\n                    end\n                end\n            end\n        end\n    end\n\n    refs = Future[remotecall((x, y, z) -> _ppeval(x, map(localpart, y)...; dim = z), p, f, D, dim) for p in procs(D[1])]\n\n    # The array of Futures has to be reshaped for the DArray constructor to work correctly.\n    # This requires a fetch and the DArray is also fetching so it might be better to modify\n    # the DArray constructor.\n    sd = [size(D[1].pids)...]\n    nd = remotecall_fetch((r)->ndims(fetch(r)), refs[1].where, refs[1])\n    DArray(reshape(refs, tuple([sd[1:nd - 1]; sd[end]]...)))\nend\n
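\n# Usage sketch for the distributed reductions in this file (illustrative, not\n# part of the library; assumes an existing worker pool):\n#\n#     DA = drandn((100, 100))\n#     sum(DA)                    # reduce each localpart remotely, then combine\n#     mapreduce(abs2, +, DA)     # map and reduce fused on each worker\n#     sum(DA, dims=1)            # dimensional reduction; the result is a DArray\n"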
  },
  {
    "path": "src/serialize.jl",
    "content": "function Serialization.serialize(S::AbstractSerializer, d::DArray{T,N,A}) where {T,N,A}\n    # Only send the ident for participating workers - we expect the DArray to exist in the\n    # remote registry. DO NOT send the localpart.\n    destpid = worker_id_from_socket(S.io)\n    Serialization.serialize_type(S, typeof(d))\n    if (destpid in d.pids) || (destpid == d.id[1])\n        serialize(S, (true, d.id))    # (id_only, id)\n    else\n        serialize(S, (false, d.id))\n        for n in [:dims, :pids, :indices, :cuts]\n            serialize(S, getfield(d, n))\n        end\n        serialize(S, A)\n    end\nend\n\nfunction Serialization.deserialize(S::AbstractSerializer, t::Type{DT}) where DT<:DArray\n    what = deserialize(S)\n    id_only = what[1]\n    id = what[2]\n\n    if id_only\n        d = d_from_weakref_or_d(id)\n        if d === nothing\n            # access to fields will throw an error, at least the deserialization process will not\n            # result in worker death\n            d = DT()\n            d.id = id\n        end\n        return d\n    else\n        # We are not a participating worker, deser fields and instantiate locally.\n        dims = deserialize(S)\n        pids = deserialize(S)\n        indices = deserialize(S)\n        cuts = deserialize(S)\n        A = deserialize(S)\n        T=eltype(DT)\n        N=length(dims)\n        return DT(id, dims, pids, indices, cuts, empty_localpart(T,N,A))\n    end\nend\n\n# Serialize only those parts of the object as required by the destination worker.\nmutable struct DestinationSerializer\n    generate::Union{Function,Nothing}  # Function to generate the part to be serialized\n    pids::Union{Array,Nothing}         # MUST have the same shape as the distribution\n    deser_obj::Any                     # Deserialized part\n\n    DestinationSerializer(f,p,d) = new(f,p,d)\nend\n\nDestinationSerializer(f::Function, pids::Array) = DestinationSerializer(f, pids, nothing)\n\n# constructs a DestinationSerializer after verifying that the shape of pids.\nfunction verified_destination_serializer(f::Function, pids::Array, verify_size)\n    @assert size(pids) == verify_size\n    return DestinationSerializer(f, pids)\nend\n\nDestinationSerializer(deser_obj::Any) = DestinationSerializer(nothing, nothing, deser_obj)\n\nfunction Serialization.serialize(S::AbstractSerializer, s::DestinationSerializer)\n    pid = worker_id_from_socket(S.io)\n    pididx = findfirst(isequal(pid), s.pids)\n    @assert pididx !== nothing\n    Serialization.serialize_type(S, typeof(s))\n    serialize(S, s.generate(pididx))\nend\n\nfunction Serialization.deserialize(S::AbstractSerializer, t::Type{T}) where T<:DestinationSerializer\n    lpart = deserialize(S)\n    return DestinationSerializer(lpart)\nend\n\n\nfunction localpart(s::DestinationSerializer)\n    if s.deser_obj !== nothing\n        return s.deser_obj\n    elseif s.generate !== nothing && (myid() in s.pids)\n        # Handle the special case where myid() is part of s.pids.\n        # In this case serialize/deserialize is not called as the remotecall is executed locally\n        return s.generate(findfirst(isequal(myid()), s.pids))\n    else\n        throw(ErrorException(string(\"Invalid state in DestinationSerializer.\")))\n    end\nend\n"
  },
  {
    "path": "src/sort.jl",
    "content": "# Sorting a DVector using samplesort\n\nfunction sample_n_setup_ref(d::DVector, sample_size; kwargs...)\n    lp = localpart(d)\n    llp = length(lp)\n    np = length(procs(d))\n    sample_size = llp > sample_size ? sample_size : llp\n    sorted = sort(lp; kwargs...)\n    sample = sorted[collect(1:div(llp,sample_size):llp)]\n    ref = RemoteChannel(()->Channel(np+1))             # To collect parts to be sorted locally later.\n                                                       # First element is the locally sorted vector\n    put!(ref, sorted)\n    return (sample, ref)\nend\n\n\nfunction scatter_n_sort_localparts(d, myidx, refs, boundaries::Array{T}; by = identity, kwargs...) where T\n    if d==nothing\n        sorted = take!(refs[myidx])  # First entry in the remote channel is sorted localpart\n    else\n        sorted = sort(localpart(d); by = by, kwargs...)\n    end\n\n    # send respective parts to correct workers, iterate over sorted array\n    p_sorted = 1\n    for (i,r) in enumerate(refs)\n        p_till = length(sorted)+1\n\n        # calculate range to send to refs[i]\n        ctr=1\n        for x in sorted[p_sorted:end]\n            if by(x) > by(boundaries[i+1])\n                p_till = p_sorted+ctr-1\n                break\n            else\n                ctr += 1\n            end\n        end\n\n        if p_till == p_sorted\n            @async put!(r, Array{T}(undef,0))\n        else\n            v = sorted[p_sorted:p_till-1]\n            @async put!(r, v)\n        end\n\n        p_sorted = p_till\n    end\n\n    # wait to receive all of my parts from all other workers\n    lp_sorting=T[]\n    for _ in refs\n        v = take!(refs[myidx])\n        append!(lp_sorting, v)\n    end\n\n    sorted_ref=RemoteChannel()\n    put!(sorted_ref, sort!(lp_sorting; by = by, kwargs...))\n    return (sorted_ref, length(lp_sorting))\nend\n\nfunction compute_boundaries(d::DVector{T}; kwargs...) where T\n    pids = procs(d)\n    np = length(pids)\n    sample_sz_on_wrkr = 512\n\n    results = asyncmap(p -> remotecall_fetch(sample_n_setup_ref, p, d, sample_sz_on_wrkr; kwargs...), pids)\n\n    samples = Array{T}(undef,0)\n    for x in results\n        append!(samples, x[1])\n    end\n    sort!(samples; kwargs...)\n    samples[1] = typemin(T)\n\n    refs=[x[2] for x in results]\n\n    boundaries = samples[[1+(x-1)*div(length(samples), np) for x in 1:np]]\n    push!(boundaries, typemax(T))\n\n    return (boundaries, refs)\nend\n\n\"\"\"\n    sort(d::DVector; sample=true, kwargs...) -> DVector\n\nSorts and returns a new distributed vector.\n\nThe sorted vector may not have the same distribution as the original.\n\nKeyword argument `sample` can take values:\n\n- `true`: A sample of max size 512 is first taken from all nodes. This is used to balance the distribution of the sorted array on participating workers. Default is `true`.\n\n- `false`: No sampling is done. Assumes a uniform distribution between min(d) and max(d)\n\n- 2-element tuple of the form `(min, max)`: No sampling is done. Assumes a uniform distribution between specified min and max values\n\n- Array{T}: The passed array is assumed to be a sample of the distribution and is used to balance the sorted distribution.\n\nKeyword argument `alg` takes the same options `Base.sort`\n\"\"\"\nfunction Base.sort(d::DVector{T}; sample=true, kwargs...) 
where T\n    pids = procs(d)\n    np = length(pids)\n\n    # Only `alg`, `by` and `sample` are supported as keyword arguments\n    if length(filter(x->!(x in (:alg, :by)), [x[1] for x in kwargs])) > 0\n        throw(ArgumentError(\"Only `alg`, `by` and `sample` are supported as keyword arguments\"))\n    end\n\n    if sample==true\n        boundaries, refs = compute_boundaries(d; kwargs...)\n        presorted=true\n\n    elseif sample==false\n        # Assume a uniform distribution between min and max values\n        minmax=asyncmap(p->remotecall_fetch(d->(minimum(localpart(d)), maximum(localpart(d))), p, d), pids)\n        min_d = minimum(T[x[1] for x in minmax])\n        max_d = maximum(T[x[2] for x in minmax])\n\n        return sort(d; sample=(min_d,max_d), kwargs...)\n\n    elseif isa(sample, Tuple)\n        # Assume a uniform distribution between min and max values in the tuple\n        lb=sample[1]\n        ub=sample[2]\n\n        @assert lb<=ub\n\n        s = Array{T}(undef,np)\n        part = abs(ub - lb)/np\n        (isnan(part) || isinf(part)) && throw(ArgumentError(\"lower and upper bounds must not be infinities\"))\n\n        for n in 1:np\n            v = lb + (n-1)*part\n            if T <: Integer\n                s[n] = round(v)\n            else\n                s[n] = v\n            end\n        end\n        return sort(d; sample=s, kwargs...)\n\n    elseif isa(sample, Array)\n        # Provided array is used as a sample\n        samples = sort(copy(sample))\n        samples[1] = typemin(T)\n        boundaries = samples[[1+(x-1)*div(length(samples), np) for x in 1:np]]\n        push!(boundaries, typemax(T))\n        presorted=false\n\n        refs=[RemoteChannel(p) for p in procs(d)]\n    else\n        throw(ArgumentError(\"keyword arg `sample` must be Boolean, Tuple(Min,Max) or an actual sample of data : \" * string(sample)))\n    end\n\n    local_sort_results = Array{Tuple}(undef,np)\n\n    Base.asyncmap!((i,p) -> remotecall_fetch(\n            scatter_n_sort_localparts, p, presorted ? nothing : d, i, refs, boundaries; kwargs...),\n                                    local_sort_results, 1:np, pids)\n\n    # Construct a new DArray from the sorted refs. Remove parts with 0-length since\n    # the DArray constructor_from_refs does not yet support it. This implies that\n    # the participating workers for the sorted darray may be different from the original\n    # for highly non-uniform distributions.\n    local_sorted_refs = RemoteChannel[x[1] for x in filter(x->x[2]>0, local_sort_results)]\n    return DArray(local_sorted_refs)\nend\n
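\n# Usage sketch for the samplesort above (illustrative, not part of the library;\n# assumes an existing worker pool):\n#\n#     d = drand(1_000_000)\n#     s1 = sort(d)                      # sample-based balancing (the default)\n#     s2 = sort(d; sample=(0.0, 1.0))   # assume values uniform on [0, 1]\n#     s3 = sort(d; by=abs)              # `by` is forwarded to the local sorts\n"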
  },
  {
    "path": "src/spmd.jl",
    "content": "module SPMD\n\nusing Distributed: RemoteChannel, myid, procs, remote_do, remotecall_fetch, remotecall_wait\nusing ..DistributedArrays: DistributedArrays, gather, next_did\n\nexport sendto, recvfrom, recvfrom_any, barrier, bcast, scatter, gather\nexport context_local_storage, context, spmd\n\n\nmutable struct WorkerDataChannel\n    pid::Int\n    rc::Union{RemoteChannel,Nothing}\n    lock::ReentrantLock\n\n    WorkerDataChannel(pid) = new(pid, nothing, ReentrantLock())\nend\n\nmutable struct SPMDContext\n    id::Tuple{Int,Int}\n    chnl::Channel\n    store::Dict{Any,Any}\n    pids::Array{Int}\n\n    function SPMDContext(id::Tuple{Int,Int}, pids::Vector{Int})\n        ctxt = new(id, Channel(typemax(Int)), Dict{Any,Any}(), pids)\n        if first(id) == myid()\n            finalizer(ctxt) do ctxt\n                for p in ctxt.pids\n                    @async remote_do(delete_ctxt_id, p, ctxt.id)\n                end\n            end\n        end\n        return ctxt\n    end\nend\n\n\n# Every worker is associated with its own RemoteChannel\nstruct WorkerChannelDict\n    data::Dict{Int, WorkerDataChannel}\n    lock::ReentrantLock\n    WorkerChannelDict() = new(Dict{Int, WorkerDataChannel}(), ReentrantLock())\nend\nconst WORKERCHANNELS = WorkerChannelDict()\n\nBase.get!(f::Function, x::WorkerChannelDict, id::Int) = @lock x.lock get!(f, x.data, id)\n\n# mapping between a context id and context object\nstruct SPMDContextDict\n    data::Dict{Tuple{Int,Int}, SPMDContext}\n    lock::ReentrantLock\n    SPMDContextDict() = new(Dict{Tuple{Int,Int}, SPMDContext}(), ReentrantLock())\nend\nconst CONTEXTS = SPMDContextDict()\n\nBase.delete!(x::SPMDContextDict, id::Tuple{Int,Int}) = @lock x.lock delete!(x.data, id)\nBase.get!(f::Function, x::SPMDContextDict, id::Tuple{Int,Int}) = @lock x.lock get!(f, x.data, id)\n\nfunction context_local_storage()\n    ctxt = get_ctxt_from_id(task_local_storage(:SPMD_CTXT))\n    ctxt.store\nend\n\ncontext(pids::Vector{Int}=procs()) = SPMDContext(next_did(), pids)\n\n# Multiple SPMD blocks can be executed concurrently,\n# each in its own context. Messages are still sent as part of the\n# same remote channels associated with each worker. 
They are\n# read from the remote channel into local channels each associated\n# with a different run of `spmd`.\n\nfunction get_dc(wc::WorkerDataChannel)\n    lock(wc.lock)\n    try\n        if wc.rc === nothing\n            if wc.pid == myid()\n                myrc = RemoteChannel(()->Channel(typemax(Int)))\n                wc.rc = myrc\n\n                # start a task to transfer incoming messages into local\n                # channels based on the execution context\n                @async begin\n                    while true\n                        msg = take!(myrc)\n                        ctxt_id = msg[1] # First element of the message tuple is the context id.\n                        ctxt = get_ctxt_from_id(ctxt_id)\n                        put!(ctxt.chnl, msg[2:end]) # stripping the context_id\n                    end\n                end\n            else\n                wc.rc = remotecall_fetch(()->get_remote_dc(myid()), wc.pid)\n            end\n        end\n    finally\n        unlock(wc.lock)\n    end\n    return wc.rc\nend\n\nfunction get_ctxt_from_id(ctxt_id::Tuple{Int,Int})\n    ctxt = get!(CONTEXTS, ctxt_id) do\n        return SPMDContext(ctxt_id, Int[])\n    end\n    return ctxt\nend\n\n# Since modules may be loaded in any order on the workers,\n# and workers may be dynamically added, pull in the remote channel\n# handles when accessed for the first time.\nfunction get_remote_dc(pid::Int)\n    wc = get!(WORKERCHANNELS, pid) do\n        return WorkerDataChannel(pid)\n    end\n    return get_dc(wc)\nend\n\nfunction send_msg(to, typ, data, tag)\n    ctxt_id = task_local_storage(:SPMD_CTXT)\n    @async begin\n        dc = get_remote_dc(to)\n        put!(dc, (ctxt_id, typ, myid(), data, tag))\n#        println(\"Sent to \", dc)\n    end\nend\n\nfunction get_msg(typ_check, from_check=false, tag_check=nothing)\n    ctxt_id = task_local_storage(:SPMD_CTXT)\n    chnl = get_ctxt_from_id(ctxt_id).chnl\n\n    unexpected_msgs=[]\n    while true\n        typ, from, data, tag = take!(chnl)\n\n        if (from_check != false && from_check != from) || (typ != typ_check) || (tag != tag_check)\n            push!(unexpected_msgs, (typ, from, data, tag))\n#            println(\"Unexpected in get_msg \", unexpected_msgs, \" looking for \", typ_check, \" \", from_check, \" \", tag_check)\n        else\n            # put all the messages we read (but not expected) back to the local channel\n            foreach(x->put!(chnl, x), unexpected_msgs)\n            return (from, data)\n        end\n    end\nend\n\nfunction sendto(pid::Int, data::Any; tag=nothing)\n    send_msg(pid, :sendto, data, tag)\nend\n\nfunction recvfrom(pid::Int; tag=nothing)\n    _, data = get_msg(:sendto, pid, tag)\n    return data\nend\n\nfunction recvfrom_any(; tag=nothing)\n    from, data = get_msg(:sendto, false, tag)\n    return (from,data)\nend\n\nfunction barrier(;pids=procs(), tag=nothing)\n    # send a message to everyone\n    for p in sort(pids)\n        send_msg(p, :barrier, nothing, tag)\n    end\n    # make sure we recv a message from everyone\n    pending=deepcopy(pids)\n    unexpected_msgs=[]\n\n    while length(pending) > 0\n        from, _ = get_msg(:barrier, false, tag)\n        if from in pending\n            filter!(x->x!=from, pending)\n        else\n            # handle case of 2 (or more) consecutive barrier calls.\n            push!(unexpected_msgs, (:barrier, from, nothing, tag))\n#            println(\"Unexpected \", from)\n        end\n#        length(pending) == 1 && println(\"Waiting for \", 
pending)\n    end\n\n    ctxt_id = task_local_storage(:SPMD_CTXT)\n    chnl = get_ctxt_from_id(ctxt_id).chnl\n    foreach(x->put!(chnl, x), unexpected_msgs)\n    return nothing\nend\n\nfunction bcast(data::Any, pid::Int; tag=nothing, pids=procs())\n    if myid() == pid\n        for p in filter(x->x!=pid, sort(pids))\n            send_msg(p, :bcast, data, tag)\n        end\n        return data\n    else\n        from, data = get_msg(:bcast, pid, tag)\n        return data\n    end\nend\n\nfunction scatter(x, pid::Int; tag=nothing, pids=procs())\n    if myid() == pid\n        @assert rem(length(x), length(pids)) == 0\n        cnt = div(length(x), length(pids))\n        for (i,p) in enumerate(sort(pids))\n            p == pid && continue\n            send_msg(p, :scatter, x[cnt*(i-1)+1:cnt*i], tag)\n        end\n        myidx = findfirst(isequal(pid), sort(pids))\n        return x[cnt*(myidx-1)+1:cnt*myidx]\n    else\n        _, data = get_msg(:scatter, pid, tag)\n        return data\n    end\nend\n\nfunction DistributedArrays.gather(x, pid::Int; tag=nothing, pids=procs())\n    if myid() == pid\n        gathered_data = Array{Any}(undef, length(pids))\n        myidx = findfirst(isequal(pid), sort(pids))\n        gathered_data[myidx] = x\n        n = length(pids) - 1\n        while n > 0\n            from, data_x = get_msg(:gather, false, tag)\n            fromidx = findfirst(isequal(from), sort(pids))\n            gathered_data[fromidx] = data_x\n            n=n-1\n        end\n        return gathered_data\n    else\n        send_msg(pid, :gather, x, tag)\n        return x\n    end\nend\n\nfunction spmd_local(f, ctxt_id, clear_ctxt)\n    task_local_storage(:SPMD_CTXT, ctxt_id)\n    f()\n    clear_ctxt && delete_ctxt_id(ctxt_id)\n    return nothing\nend\n\nfunction spmd(f, args...; pids=procs(), context=nothing)\n    f_noarg = ()->f(args...)\n    clear_ctxt = false\n    if context === nothing\n        ctxt_id = next_did()\n        clear_ctxt = true    # temporary unique context created for this run.\n                             # should be cleared at the end of the run.\n    else\n        ctxt_id = context.id\n    end\n    @sync for p in pids\n        @async remotecall_wait(spmd_local, p, f_noarg, ctxt_id, clear_ctxt)\n    end\n    nothing\nend\n\ndelete_ctxt_id(ctxt_id::Tuple{Int,Int}) = delete!(CONTEXTS, ctxt_id)\n\nBase.close(ctxt::SPMDContext) = finalize(ctxt)\n
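\n# Usage sketch for the SPMD mode implemented above (illustrative, not part of\n# the module; assumes workers were added with addprocs and the package is\n# loaded everywhere):\n#\n#     using Distributed, DistributedArrays, DistributedArrays.SPMD\n#     d_in  = drand(8)\n#     d_out = dzeros(8)\n#     spmd(pids=workers()) do\n#         localpart(d_out)[:] = 2 .* localpart(d_in)   # runs on every worker\n#         barrier(; pids=workers())                    # wait for all participants\n#     end\n\nend\n"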
  },
  {
    "path": "test/aqua.jl",
    "content": "using DistributedArrays, Test\nimport Aqua\n\n@testset \"Aqua\" begin\n    Aqua.test_all(DistributedArrays; ambiguities = (; broken = true))\nend\n"
  },
  {
    "path": "test/darray.jl",
    "content": "using Test, LinearAlgebra, SpecialFunctions\nusing Statistics: mean\nusing SparseArrays: nnz\nusing Random\n@everywhere using SparseArrays: sprandn\n\n@testset \"test distribute and other constructors\" begin\n    A = rand(1:100, (100,100))\n\n    @testset \"test default distribute\" begin\n        DA = distribute(A)\n        @test length(procs(DA)) == nworkers()\n        @test sum(DA) == sum(A)\n        close(DA)\n    end\n\n    @testset \"test distribute with procs arguments\" begin\n        DA = distribute(A, procs = procs())\n        @test length(procs(DA)) == nprocs()\n        @test sum(DA) == sum(A)\n        close(DA)\n    end\n\n    @testset \"test distribute with procs and dist arguments\" begin\n        DA = distribute(A, procs = [1, 2], dist = [1,2])\n        @test size(procs(DA)) == (1,2)\n        @test sum(DA) == sum(A)\n        close(DA)\n    end\n\n    @testset \"Create darray with unconventional distribution and distribute like it\" begin\n        block = 10\n        Y = nworkers() * block\n        X = nworkers() * block\n        remote_parts = map(workers()) do wid\n            remotecall(rand, wid, block, Y)\n        end\n        DA1 = DArray(reshape(remote_parts, (length(remote_parts), 1)))\n        A = rand(X, Y)\n        DA2 = distribute(A, DA1)\n\n        @test size(DA1) == size(DA2)\n\n        close(DA1)\n        close(DA2)\n    end\n\n    @testset \"Global DArray serialization issue #134\" begin\n        global A134 = drandn(1)\n        D2 = DArray(I -> DistributedArrays.localpart(A134), A134)\n        @test D2 == A134\n        close(A134)\n        close(D2)\n    end\n\n    @testset \"empty_localpart should work when only constructor (not conversion is defined)\" begin\n        @test DistributedArrays.empty_localpart(Float64,2,LowerTriangular{Float64,Matrix{Float64}}) isa\n                LowerTriangular\n    end\n    \n    @testset \"Consistent Uneven Distribution issue #166\" begin\n        DA = drand((2+length(OTHERIDS),), [MYID, OTHERIDS])\n        @test fetch(@spawnat MYID length(localpart(DA)) == 2)\n        @test fetch(@spawnat OTHERIDS length(localpart(DA)) == 1)\n        close(DA)\n        @test DistributedArrays.defaultdist(50,4) == [1,14,27,39,51]\n    end\n    \n    @testset \"Inhomogeneous typeof(localpart)\" begin\n        block = 10\n        Y = nworkers() * block\n        X = nworkers() * block\n\n        @assert nworkers() > 1\n        @test_throws ErrorException DArray((X, Y)) do I\n            eltype = first(CartesianIndices(I)) == CartesianIndex(1, 1) ? 
Int64 : Float64\n            zeros(eltype, map(length, I))\n        end\n    end\nend\n\ncheck_leaks()\n\n@testset \"test DArray equality/copy/deepcopy\" begin\n    D = drand((200,200), [MYID, OTHERIDS])\n\n    @testset \"test isequal(::DArray, ::DArray)\" begin\n        DC = copy(D)\n        @test D == DC\n        close(DC)\n    end\n\n    @testset \"test [deep]copy(::DArray) does a copy of each localpart\" begin\n        DC = copy(D)\n        @spawnat OTHERIDS localpart(DC)[1] = 0\n        @test fetch(@spawnat OTHERIDS localpart(D)[1] != 0)\n        DD = deepcopy(D)\n        @spawnat OTHERIDS localpart(DD)[1] = 0\n        @test fetch(@spawnat OTHERIDS localpart(D)[1] != 0)\n        close(DC)\n        close(DD)\n    end\n\n    @testset \"test copy(::DArray) is shallow\" begin\n        DA = @DArray [rand(100) for i=1:10]\n        DC = copy(DA)\n        id = procs(DC)[1]\n        @test DA == DC\n        fetch(@spawnat id localpart(DC)[1] .= -1.0)\n        @test DA == DC\n        @test fetch(@spawnat id all(localpart(DA)[1] .== -1.0))\n        close(DA)\n        close(DC)\n    end\n\n    @testset \"test deepcopy(::DArray) is not shallow\" begin\n        DA = @DArray [rand(100) for i=1:10]\n        DC = deepcopy(DA)\n        id = procs(DC)[1]\n        @test DA == DC\n        fetch(@spawnat id localpart(DC)[1] .= -1.0)\n        @test DA != DC\n        @test fetch(@spawnat id all(localpart(DA)[1] .>= 0.0))\n        close(DA)\n        close(DC)\n    end\n\n    close(D)\nend\n\ncheck_leaks()\n\n@testset \"test DArray similar\" begin\n    D = drand((200,200), [MYID, OTHERIDS])\n    DS = similar(D,Float16)\n\n    @testset \"test eltype of a similar\" begin\n        @test eltype(DS) == Float16\n    end\n\n    @testset \"test dims of a similar\" begin\n        @test size(D) == size(DS)\n    end\n    close(D)\n    close(DS)\nend\n\ncheck_leaks()\n\n@testset \"test DArray reshape\" begin\n    D = drand((200,200), [MYID, OTHERIDS])\n\n    @testset \"Test error-throwing in reshape\" begin\n        @test_throws DimensionMismatch reshape(D,(100,100))\n    end\n\n    DR = reshape(D,(100,400))\n    @testset \"Test reshape\" begin\n        @test size(DR) == (100,400)\n    end\n    close(D)\nend\n\ncheck_leaks()\n\n@testset \"test @DArray comprehension constructor\" begin\n\n    @testset \"test valid use of @DArray\" begin\n        D = @DArray [i+j for i=1:10, j=1:10]\n        @test D == [i+j for i=1:10, j=1:10]\n        close(D)\n    end\n\n    @testset \"test invalid use of @DArray\" begin\n        #@test_throws ArgumentError eval(:((@DArray [1,2,3,4])))\n        @test_throws LoadError eval(:((@DArray [1,2,3,4])))\n    end\nend\n\ncheck_leaks()\n\n@testset \"test DArray / Array conversion\" begin\n    D = drand((200,200), [MYID, OTHERIDS])\n\n    @testset \"test construct Array from (Sub)DArray\" begin\n        S = Matrix{Float64}(D[1:150, 1:150])\n        A = Matrix{Float64}(D)\n\n        @test A[1:150,1:150] == S\n        D2 = DArray{Float64,2,Matrix{Float64}}(A)\n        @test D2 == D\n        DistributedArrays.allowscalar(true)\n        @test fetch(@spawnat MYID localpart(D)[1,1]) == D[1,1]\n        @test fetch(@spawnat OTHERIDS localpart(D)[1,1]) == D[1,101]\n        DistributedArrays.allowscalar(false)\n        close(D2)\n\n        S2 = Vector{Float64}(D[4, 23:176])\n        @test A[4, 23:176] == S2\n\n        S3 = Vector{Float64}(D[23:176, 197])\n        @test A[23:176, 197] == S3\n\n        S4 = zeros(4)\n        setindex!(S4, D[3:4, 99:100], :)\n        # FixMe! 
Hitting the AbstractArray fallback here is extremely unfortunate but vec() becomes a ReshapedArray which makes it difficult to hit DArray methods. Unless this can be fixed in Base, we might have to add special methods for ReshapedArray{DArray}\n        DistributedArrays.allowscalar(true)\n        @test S4 == vec(D[3:4, 99:100])\n        @test S4 == vec(A[3:4, 99:100])\n        DistributedArrays.allowscalar(false)\n\n        S5 = zeros(2,2)\n        setindex!(S5, D[1,1:4], :, 1:2)\n        # FixMe! Hitting the AbstractArray fallback here is extremely unfortunate but vec() becomes a ReshapedArray which makes it difficult to hit DArray methods. Unless this can be fixed in Base, we might have to add special methods for ReshapedArray{DArray}\n        DistributedArrays.allowscalar(true)\n        @test vec(S5) == D[1, 1:4]\n        @test vec(S5) == A[1, 1:4]\n        DistributedArrays.allowscalar(false)\n    end\n    close(D)\nend\n\ncheck_leaks()\n\n@testset \"test copy!\" begin\n    D1 = dzeros((10,10))\n    r1 = remotecall_wait(() -> randn(3,10), workers()[1])\n    r2 = remotecall_wait(() -> randn(7,10), workers()[2])\n    D2 = DArray(reshape([r1; r2], 2, 1))\n    copyto!(D2, D1)\n    @test D1 == D2\n    close(D1)\n    close(D2)\nend\n\ncheck_leaks()\n\n@testset \"test DArray reduce\" begin\n    D = DArray(id->fill(myid(), map(length,id)), (10,10), [MYID, OTHERIDS])\n\n    @testset \"test reduce\" begin\n        @test reduce(+, D) == ((50*MYID) + (50*OTHERIDS))\n    end\n\n    @testset \"test map / reduce\" begin\n        D2 = map(x->1, D)\n        @test D2 isa DArray\n        @test reduce(+, D2) == 100\n        close(D2)\n    end\n\n    @testset \"test map! / reduce\" begin\n        map!(x->1, D, D)\n        @test reduce(+, D) == 100\n    end\n    close(D)\nend\n\ncheck_leaks()\n\n@testset \"test rmul\" begin\n    A = randn(100,100)\n    DA = distribute(A)\n    @test rmul!(DA, 2) == rmul!(A, 2)\n    close(DA)\nend\n\ncheck_leaks()\n\n@testset \"test lmul!/rmul! with Diagonal\" begin\n    A = randn(100, 100)\n    b = randn(100)\n    D = Diagonal(b)\n    DA = distribute(A)\n    @test lmul!(D, A) == lmul!(D, DA)\n    close(DA)\n    A = randn(100, 100)\n    b = randn(100)\n    DA = distribute(A)\n    @test rmul!(A, D) == rmul!(DA, D)\n    close(DA)\nend\n\ncheck_leaks()\n\n@testset \"test mapreduce on DArrays\" begin\n    for _ = 1:25, f = [x -> Int128(2x), x -> Int128(x^2), x -> Int128(x^2 + 2x - 1)], opt = [+, *]\n        A = rand(1:5, rand(2:30))\n        DA = distribute(A)\n        @test DA isa DArray\n        @test mapreduce(f, opt, DA) - mapreduce(f, opt, A) == 0\n        close(DA)\n    end\nend\n\ncheck_leaks()\n\n@testset \"test mapreducedim on DArrays\" begin\n    D = DArray(I->fill(myid(), map(length,I)), (73,73), [MYID, OTHERIDS])\n    D2 = map(x->1, D)\n    @test D2 isa DArray\n    @test mapreduce(t -> t*t, +, D2, dims=1) == mapreduce(t -> t*t, +, convert(Array, D2), dims=1)\n    @test mapreduce(t -> t*t, +, D2, dims=2) == mapreduce(t -> t*t, +, convert(Array, D2), dims=2)\n    @test mapreduce(t -> t*t, +, D2, dims=(1,2)) == mapreduce(t -> t*t, +, convert(Array, D2), dims=(1,2))\n\n    # Test non-regularly chunked DArrays\n    r1 = DistributedArrays.remotecall(() -> sprandn(3, 10, 0.1), workers()[1])\n    r2 = DistributedArrays.remotecall(() -> sprandn(7, 10, 0.1), workers()[2])\n    D = DArray(reshape([r1; r2], (2,1)))\n    @test Array(sum(D, dims=2)) == sum(Array(D), dims=2)\n\n    # close(D)\n    # close(D2)\n    d_closeall()   # temp created by the mapreduce 
@testset \"test mapreducedim, reducedim on DArrays\" begin\n    dims = (20,20,20)\n    DA = drandn(dims)\n    A = convert(Array, DA)\n\n    @testset \"dimension $dms\" for dms in (1, 2, 3, (1,2), (1,3), (2,3), (1,2,3))\n        @test mapreduce(t -> t*t, +, A, dims=dms) ≈ mapreduce(t -> t*t, +, DA, dims=dms)\n        @test mapreduce(t -> t*t, +, A, dims=dms, init=1.0) ≈ mapreduce(t -> t*t, +, DA, dims=dms, init=1.0)\n        @test reduce(*, A, dims=dms) ≈ reduce(*, DA, dims=dms)\n        @test reduce(*, A, dims=dms, init=2.0) ≈ reduce(*, DA, dims=dms, init=2.0)\n    end\n    close(DA)\n    d_closeall()   # temp created by the mapreduce above\nend\n\ncheck_leaks()\n\n@testset \"test statistical functions on DArrays\" begin\n    dims = (20,20,20)\n    DA = drandn(dims)\n    A = Array(DA)\n\n    @testset \"test $f for dimension $dms\" for f in (mean, ), dms in (1, 2, 3, (1,2), (1,3), (2,3), (1,2,3))\n        # std is pending implementation\n        @test f(DA, dims=dms) ≈ f(A, dims=dms)\n    end\n\n    close(DA)\n    d_closeall()   # temporaries created above\nend\n\ncheck_leaks()\n\nunpack(ex::Base.CapturedException) = unpack(ex.ex)\nunpack(ex::Distributed.RemoteException) = unpack(ex.captured)\nunpack(ex::Base.TaskFailedException) = unpack(ex.task.exception)\nunpack(ex) = ex\n\n
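# `unpack` strips the wrappers Distributed can layer around a remote failure\n# (RemoteException -> CapturedException -> original error, or TaskFailedException\n# for failed local tasks), so the tests below can assert on the root cause\n# regardless of how many hops the error travelled. A minimal check of that:\nlet err = try\n        remotecall_fetch(() -> throw(ArgumentError(\"boom\")), workers()[1])\n    catch e\n        e\n    end\n    @test unpack(err) isa ArgumentError\nend\n\n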
@testset \"test sum on DArrays\" begin\n    A = randn(100,100)\n    DA = distribute(A)\n\n    # sum either throws an ArgumentError, a CompositeException of ArgumentErrors,\n    # or a RemoteException wrapping an ArgumentError\n    try\n        sum(DA, dims=-1)\n    catch err\n        if isa(err, CompositeException)\n            @test !isempty(err.exceptions)\n            for excep in err.exceptions\n                # Unpack the remote exception\n                orig_err = unpack(excep)\n                @test isa(orig_err, ArgumentError)\n            end\n        elseif isa(err, RemoteException)\n            @test err.captured isa CapturedException\n            @test err.captured.ex isa ArgumentError\n        else\n            @test isa(err, ArgumentError)\n        end\n    end\n    try\n        sum(DA, dims=0)\n    catch err\n        if isa(err, CompositeException)\n            @test !isempty(err.exceptions)\n            for excep in err.exceptions\n                # Unpack the remote exception\n                orig_err = unpack(excep)\n                @test isa(orig_err, ArgumentError)\n            end\n        elseif isa(err, RemoteException)\n            @test err.captured isa CapturedException\n            @test err.captured.ex isa ArgumentError\n        else\n            @test isa(err, ArgumentError)\n        end\n    end\n\n    @test sum(DA) ≈ sum(A)\n    @test sum(DA, dims=1) ≈ sum(A, dims=1)\n    @test sum(DA, dims=2) ≈ sum(A, dims=2)\n    @test sum(DA, dims=3) ≈ sum(A, dims=3)\n    close(DA)\n    d_closeall()   # temporaries created above\nend\n\ncheck_leaks()\n\n@testset \"test size on DArrays\" begin\n\n    A = randn(100,100)\n    DA = distribute(A)\n\n    @test_throws BoundsError size(DA, 0)\n    @test size(DA,1) == size(A,1)\n    @test size(DA,2) == size(A,2)\n    @test size(DA,3) == size(A,3)\n    close(DA)\nend\n\ncheck_leaks()\n\n# test length / lastindex\n@testset \"test collections API\" begin\n    A = randn(23,23)\n    DA = distribute(A)\n\n    @testset \"test length\" begin\n        @test length(DA) == length(A)\n    end\n\n    @testset \"test lastindex\" begin\n        @test lastindex(DA) == lastindex(A)\n    end\n    close(DA)\nend\n\ncheck_leaks()\n\n@testset \"test max / min / sum\" begin\n    a = map(x -> Int(round(rand() * 100)) - 50, Array{Int}(undef,100,1000))\n    d = distribute(a)\n\n    @test sum(d)          == sum(a)\n    @test maximum(d)      == maximum(a)\n    @test minimum(d)      == minimum(a)\n    @test maximum(abs, d) == maximum(abs, a)\n    @test minimum(abs, d) == minimum(abs, a)\n    @test sum(abs, d)     == sum(abs, a)\n    @test sum(abs2, d)    == sum(abs2, a)\n    @test extrema(d)      == extrema(a)\n    close(d)\nend\n\ncheck_leaks()\n\n@testset \"test all / any\" begin\n    a = [true for i in 1:100]\n    d = distribute(a)\n\n    @test all(d)\n    @test any(d)\n\n    close(d)\n\n    a[50] = false\n    d = distribute(a)\n    @test !all(d)\n    @test any(d)\n\n    close(d)\n\n    a = [false for i in 1:100]\n    d = distribute(a)\n    @test !all(d)\n    @test !any(d)\n\n    close(d)\n\n    d = dones(10,10)\n    @test !all(x-> x>1.0, d)\n    @test all(x-> x>0.0, d)\n\n    close(d)\n\n    a = ones(10,10)\n    a[10] = 2.0\n    d = distribute(a)\n    @test any(x-> x == 1.0, d)\n    @test any(x-> x == 2.0, d)\n    @test !any(x-> x == 3.0, d)\n\n    close(d)\nend\n\ncheck_leaks()\n\n@testset \"test count\" begin\n    a = ones(10,10)\n    a[10] = 2.0\n    d = distribute(a)\n\n    @test count(x-> x == 2.0, d) == 1\n    @test count(x-> x == 1.0, d) == 99\n    @test count(x-> x == 0.0, d) == 0\n\n    close(d)\nend\n\ncheck_leaks()\n\n@testset \"test prod\" begin\n    a = fill(2, 10);\n    d = distribute(a);\n    @test prod(d) == 2^10\n\n    close(d)\nend\n\ncheck_leaks()\n\n@testset \"test zeros\" begin\n    @testset \"1D dzeros default element type\" begin\n        A = dzeros(10)\n        @test A == zeros(10)\n        @test eltype(A) == Float64\n        @test size(A) == (10,)\n        close(A)\n    end\n\n    @testset \"1D dzeros with specified element type\" begin\n        A = dzeros(Int, 10)\n        @test A == zeros(10)\n        @test eltype(A) == Int\n        @test size(A) == (10,)\n        close(A)\n    end\n\n    @testset \"2D dzeros default element type, Dims constructor\" begin\n        A = dzeros((10,10))\n        @test A == zeros((10,10))\n        @test eltype(A) == Float64\n        @test size(A) == (10,10)\n        close(A)\n    end\n\n    @testset \"2D dzeros specified element type, Dims constructor\" begin\n        A = dzeros(Int, (10,10))\n        @test A == zeros(Int, (10,10))\n        @test eltype(A) == Int\n        @test size(A) == (10,10)\n        close(A)\n    end\n\n    @testset \"2D dzeros, default element type\" begin\n        A = dzeros(10,10)\n        @test A == zeros(10,10)\n        @test eltype(A) == Float64\n        @test size(A) == (10,10)\n        close(A)\n    end\n\n    @testset \"2D dzeros, specified element type\" begin\n        A = dzeros(Int, 10, 10)\n        @test A == zeros(Int, 10, 10)\n        @test eltype(A) == Int\n        @test size(A) == (10,10)\n        close(A)\n    end\nend\n\ncheck_leaks()\n\n
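# Note: besides these Base-like signatures, the distributed constructors also\n# accept an explicit process list and a per-dimension chunk layout, as used\n# later in this file, e.g. dones(Float64, (2,3,4), workers(), [1,1,min(nworkers(),4)])\n# distributes only the third dimension across up to four workers.\n\n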
@testset \"test dones\" begin\n    @testset \"1D dones default element type\" begin\n        A = dones(10)\n        @test A == ones(10)\n        @test eltype(A) == Float64\n        @test size(A) == (10,)\n        close(A)\n    end\n\n    @testset \"1D dones with specified element type\" begin\n        A = dones(Int, 10)\n        @test eltype(A) == Int\n        @test size(A) == (10,)\n        close(A)\n    end\n\n    @testset \"2D dones default element type, Dims constructor\" begin\n        A = dones((10,10))\n        @test A == ones((10,10))\n        @test eltype(A) == Float64\n        @test size(A) == (10,10)\n        close(A)\n    end\n\n    @testset \"2D dones specified element type, Dims constructor\" begin\n        A = dones(Int, (10,10))\n        @test A == ones(Int, (10,10))\n        @test eltype(A) == Int\n        @test size(A) == (10,10)\n        close(A)\n    end\n\n    @testset \"2D dones, default element type\" begin\n        A = dones(10,10)\n        @test A == ones(10,10)\n        @test eltype(A) == Float64\n        @test size(A) == (10,10)\n        close(A)\n    end\n\n    @testset \"2D dones, specified element type\" begin\n        A = dones(Int, 10, 10)\n        @test A == ones(Int, 10, 10)\n        @test eltype(A) == Int\n        @test size(A) == (10,10)\n        close(A)\n    end\nend\n\ncheck_leaks()\n\n@testset \"test drand\" begin\n    @testset \"1D drand\" begin\n        A = drand(100)\n        @test eltype(A) == Float64\n        @test size(A) == (100,)\n        @test all(x-> x >= 0.0 && x <= 1.0, A)\n        close(A)\n    end\n\n    @testset \"1D drand, specified element type\" begin\n        A = drand(Int, 100)\n        @test eltype(A) == Int\n        @test size(A) == (100,)\n        close(A)\n    end\n\n    @testset \"1D drand, UnitRange\" begin\n        A = drand(1:10, 100)\n        @test eltype(A) == Int\n        @test size(A) == (100,)\n        close(A)\n    end\n\n    @testset \"1D drand, Array\" begin\n        A = drand([-1,0,1], 100)\n        @test eltype(A) == Int\n        @test size(A) == (100,)\n        close(A)\n    end\n\n    @testset \"2D drand, Dims constructor\" begin\n        A = drand((50,50))\n        @test eltype(A) == Float64\n        @test size(A) == (50,50)\n        @test all(x-> x >= 0.0 && x <= 1.0, A)\n        close(A)\n    end\n\n    @testset \"2D drand\" begin\n        A = drand(100,100)\n        @test eltype(A) == Float64\n        @test size(A) == (100,100)\n        @test all(x-> x >= 0.0 && x <= 1.0, A)\n        close(A)\n    end\n\n    @testset \"2D drand, Dims constructor, specified element type\" begin\n        A = drand(Int, (100,100))\n        @test eltype(A) == Int\n        @test size(A) == (100,100)\n        close(A)\n    end\n\n    @testset \"2D drand, specified element type\" begin\n        A = drand(Int, 100, 100)\n        @test eltype(A) == Int\n        @test size(A) == (100,100)\n        close(A)\n    end\nend\n\ncheck_leaks()\n\n@testset \"test drandn\" begin\n    @testset \"1D drandn\" begin\n        A = drandn(100)\n        @test eltype(A) == Float64\n        @test size(A) == (100,)\n        close(A)\n    end\n\n    @testset \"2D drandn, Dims constructor\" begin\n        A = drandn((50,50))\n        @test eltype(A) == Float64\n        @test size(A) == (50,50)\n        close(A)\n    end\n\n    @testset \"2D drandn\" begin\n        A = drandn(100,100)\n        @test eltype(A) == Float64\n        @test size(A) == (100,100)\n        close(A)\n    end\nend\n\ncheck_leaks()\n\n@testset \"test transpose/adjoint\" begin\n    @testset \"test transpose real\" begin\n        A = drand(Float64, 100, 200)\n        @test copy(transpose(A)) == transpose(Array(A))\n        close(A)\n    end\n    @testset \"test transpose complex\" begin\n        A = drand(ComplexF64, 200, 100)\n        @test copy(transpose(A)) == transpose(Array(A))\n        close(A)\n    end\n    @testset \"test adjoint real\" begin\n        A = drand(Float64, 200, 100)\n        @test copy(adjoint(A)) == adjoint(Array(A))\n        close(A)\n    end\n    @testset \"test adjoint complex\" begin\n
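        # For complex eltypes adjoint must conjugate in addition to permuting,\n        # so this exercises a different path than the real and transpose cases.\n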
       A = drand(ComplexF64, 100, 200)\n        @test copy(adjoint(A)) == adjoint(Array(A))\n        close(A)\n    end\n\n    d_closeall()  # close the temporaries created above\nend\n\ncheck_leaks()\n\n@testset \"makelocal\" begin\n    A = randn(5*nprocs(), 5*nprocs())\n    dA = distribute(A, procs=procs())\n    for i in 1:size(dA, 2)\n        a = DistributedArrays.makelocal(dA, :, i)\n        @test all(Array(view(dA, :, i)) .== a)\n        @test all(      view( A, :, i) .== a)\n    end\n    for i in 1:size(dA, 1)\n        a = DistributedArrays.makelocal(dA, i, :)\n        @test all(Array(view(dA, i:i, :)) .== a)\n        @test all(      view( A, i:i, :) .== a)\n    end\n    a = DistributedArrays.makelocal(dA, 1:5, 1:5)\n    @test all(Array(view(dA, 1:5, 1:5)) .== a)\n    @test all(      view( A, 1:5, 1:5) .== a)\n    close(dA)\nend\n\n@testset \"test convert from subdarray\" begin\n    a = drand(20, 20);\n\n    s = view(a, 1:5, 5:8)\n    @test isa(s, SubDArray)\n    @test s == DArray(s)\n\n    s = view(a, 6:5, 5:8)\n    @test isa(s, SubDArray)\n    @test s == DArray(s)\n    close(a)\n    d_closeall()  # close the temporaries created above\nend\n\ncheck_leaks()\n\n@testset \"test scalar math\" begin\n    a = drand(20, 20);\n    b = convert(Array, a)\n    @testset \"$f\" for f in (-, abs, abs2, acos, acosd, acot,\n              acotd, acsch, angle, asech, asin,\n              asind, asinh, atan, atand, atanh,\n              big, cbrt, ceil, cis, complex, conj,\n              cos, cosc, cosd, cosh, cospi, cot,\n              cotd, coth, csc, cscd, csch, dawson,\n              deg2rad, digamma, erf, erfc, erfcinv,\n              erfcx, erfi, erfinv, exp, exp10, exp2,\n              expm1, exponent, float, floor, gamma, imag,\n              invdigamma, isfinite, isinf, isnan,\n              loggamma, log, log10, log1p, log2, rad2deg, real,\n              sec, secd, sech, sign, sin, sinc, sind,\n              sinh, sinpi, sqrt, tan, tand, tanh, trigamma)\n        @test f.(a) == f.(b)\n    end\n    a = a .+ 1\n    b = b .+ 1\n    @testset \"$f\" for f in (asec, asecd, acosh, acsc, acscd, acoth)\n        @test f.(a) == f.(b)\n    end\n    close(a)\n    d_closeall()  # close the temporaries created above\nend\n\ncheck_leaks()\n\n@testset \"test mapslices\" begin\n    A = randn(5,5,5)\n    D = distribute(A, procs = workers(), dist = [1, 1, min(nworkers(), 5)])\n    @test mapslices(svdvals, D, dims=(1,2)) ≈ mapslices(svdvals, A, dims=(1,2))\n    @test mapslices(svdvals, D, dims=(1,3)) ≈ mapslices(svdvals, A, dims=(1,3))\n    @test mapslices(svdvals, D, dims=(2,3)) ≈ mapslices(svdvals, A, dims=(2,3))\n    @test mapslices(sort, D, dims=(1,)) ≈ mapslices(sort, A, dims=(1,))\n    @test mapslices(sort, D, dims=(2,)) ≈ mapslices(sort, A, dims=(2,))\n    @test mapslices(sort, D, dims=(3,)) ≈ mapslices(sort, A, dims=(3,))\n\n    # issue #3613\n    B = mapslices(sum, dones(Float64, (2,3,4), workers(), [1,1,min(nworkers(),4)]), dims=[1,2])\n    @test size(B) == (1,1,4)\n    @test all(B.==6)\n\n    # issue #5141\n    C1 = mapslices(x-> maximum(-x), D, dims=[])\n    @test C1 == -D\n\n    # issue #5177\n    c = dones(Float64, (2,3,4,5), workers(), [1,1,1,min(nworkers(),5)])\n    m1 = mapslices(x-> ones(2,3), c, dims=[1,2])\n    m2 = mapslices(x-> ones(2,4), c, dims=[1,3])\n    m3 = mapslices(x-> ones(3,4), c, dims=[2,3])\n    @test size(m1) == size(m2) == size(m3) == size(c)\n\n    n1 = mapslices(x-> ones(6), c, dims=[1,2])\n    n2 = mapslices(x-> ones(6), c, dims=[1,3])\n    n3 = mapslices(x-> ones(6), c, 
dims=[2,3])\n    n1a = mapslices(x-> ones(1,6), c, dims=[1,2])\n    n2a = mapslices(x-> ones(1,6), c, dims=[1,3])\n    n3a = mapslices(x-> ones(1,6), c, dims=[2,3])\n    @test (size(n1a) == (1,6,4,5) && size(n2a) == (1,3,6,5) && size(n3a) == (2,1,6,5))\n    @test (size(n1) == (6,1,4,5) && size(n2) == (6,3,1,5) && size(n3) == (2,6,1,5))\n    close(D)\n    close(c)\n    d_closeall()  # close the temporaries created above\nend\n\ncheck_leaks()\n\n@testset \"test scalar ops\" begin\n    a = drand(20,20)\n    b = convert(Array, a)\n    c = drand(20,20)\n    d = convert(Array, c)\n\n    @testset \"$f\" for f in (:+, :-, :*, :/, :%)\n        x = rand()\n        @test @eval ($f).($a, $x) == ($f).($b, $x)\n        @test @eval ($f).($x, $a) == ($f).($x, $b)\n        @test @eval ($f).($a, $c) == ($f).($b, $d)\n    end\n\n    close(a)\n    close(c)\n\n    a = dones(Int, 20, 20)\n    b = convert(Array, a)\n    @testset \"$f\" for f in (:<<, :>>)\n        @test @eval ($f).($a, 2)  == ($f).($b, 2)\n        @test @eval ($f).(2, $a)  == ($f).(2, $b)\n        @test @eval ($f).($a, $a) == ($f).($b, $b)\n    end\n\n    @testset \"$f\" for f in (:rem,)\n        x = rand()\n        @test @eval ($f).($a, $x) == ($f).($b, $x)\n    end\n    close(a)\n    d_closeall()  # close the temporaries created above\nend\n\ncheck_leaks()\n\n@testset \"test broadcast ops\" begin\n    wrkrs = workers()\n    nwrkrs = length(wrkrs)\n    nrows = 20 * nwrkrs\n    ncols = 10 * nwrkrs\n    a = drand((nrows,ncols), wrkrs, (1, nwrkrs))\n    m = mean(a, dims=1)\n    c = a .- m\n    d = convert(Array, a) .- convert(Array, m)\n    @test c == d\n    e = @DArray [ones(10) for i=1:4]\n    f = 2 .* e\n    @test Array(f) == 2 .* Array(e)\n    @test Array(map(x -> sum(x) .+ 2, e)) == map(x -> sum(x) .+ 2, e)\n\n    @testset \"test nested broadcast\" begin\n       g = a .- m .* sin.(c)\n       @test Array(g) == Array(a) .- Array(m) .* sin.(Array(c))\n    end\n\n    @testset \"Broadcasting into DArray\" begin\n        a .= ones(nrows, ncols)\n        @test all(isone, a)\n        a .= 3 .+ abs2.(@view(zeros(nrows, ncols + 5)[:, 6:end]))\n        @test all(x -> x == 3, a)\n    end\n\n    # @testset \"lazy wrapped broadcast\" begin\n    #    l = similar(a)\n    #    l[1:10, :] .= view(a, 1:10, : )\n    # end\n    d_closeall()\nend\n\ncheck_leaks()\n\n
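# Broadcast over a DArray is evaluated chunk-by-chunk on the owning workers and\n# yields another DArray, as the \`2 .* e\` case above relies on. A minimal check:\nlet a = dones(10, 10)\n    b = 2 .* a .+ 1  # computed against each localpart; result stays distributed\n    @test b isa DArray\n    @test all(x -> x == 3.0, b)\n    close(a)\n    close(b)\nend\n\n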
@testset \"test matrix multiplication\" begin\n    A = drandn(20,20)\n    b = drandn(20)\n    B = drandn(20,20)\n\n    @test norm(convert(Array, A*b) - convert(Array, A)*convert(Array, b), Inf) < sqrt(eps())\n    @test norm(convert(Array, A*B) - convert(Array, A)*convert(Array, B), Inf) < sqrt(eps())\n    @test norm(convert(Array, A'*b) - convert(Array, A)'*convert(Array, b), Inf) < sqrt(eps())\n    @test norm(convert(Array, A'*B) - convert(Array, A)'*convert(Array, B), Inf) < sqrt(eps())\n    close(A)\n    close(b)\n    close(B)\n    d_closeall()  # close the temporaries created above\nend\n\ncheck_leaks()\n\n@testset \"dot product\" begin\n    A = drandn(20,20)\n    b = drandn(20)\n    c = A * b\n\n    @test dot(c, b) ≈ dot(convert(Array, c), convert(Array, b))\n    close(A)\n    close(b)\n    close(c)\nend\n\ncheck_leaks()\n\n@testset \"test norm\" begin\n    x = drandn(20)\n\n    @test abs(norm(x) - norm(convert(Array, x))) < sqrt(eps())\n    @test abs(norm(x, 1) - norm(convert(Array, x), 1)) < sqrt(eps())\n    @test abs(norm(x, 2) - norm(convert(Array, x), 2)) < sqrt(eps())\n    @test abs(norm(x, Inf) - norm(convert(Array, x), Inf)) < sqrt(eps())\n    close(x)\nend\n\ncheck_leaks()\n\n@testset \"test axpy!\" begin\n    for (x, y) in ((drandn(20), drandn(20)),\n                   (drandn(20, 2), drandn(20, 2)))\n\n        @test Array(axpy!(2.0, x, copy(y))) ≈ axpy!(2.0, Array(x), Array(y))\n        @test_throws DimensionMismatch axpy!(2.0, x, zeros(length(x) + 1))\n        close(x)\n        close(y)\n    end\n\n    d_closeall()  # close the temporaries created above\nend\n\ncheck_leaks()\n\n@testset \"test ppeval\" begin\n    A = drandn((10, 10, nworkers()), workers(), [1, 1, nworkers()])\n    B = drandn((10, nworkers()), workers(), [1, nworkers()])\n\n    R = zeros(10, nworkers())\n    for i = 1:nworkers()\n        R[:, i] = convert(Array, A)[:, :, i]*convert(Array, B)[:, i]\n    end\n    @test convert(Array, ppeval(*, A, B)) ≈ R\n    @test sum(ppeval(eigvals, A)) ≈ sum(ppeval(eigvals, A, Matrix{Float64}(I,10,10)))\n    close(A)\n    close(B)\n    d_closeall()  # close the temporaries created above\nend\n\ncheck_leaks()\n\n@testset \"test nnz\" begin\n    A = sprandn(10, 10, 0.5)\n    @test nnz(distribute(A)) == nnz(A)\nend\n\n@testset \"test matmatmul\" begin\n    A = drandn(30, 30)\n    B = drandn(30, 20)\n    a = convert(Array, A)\n    b = convert(Array, B)\n\n    AB = A * B\n    AtB = transpose(A) * B\n    AcB = A' * B\n\n    ab = a * b\n    atb = transpose(a) * b\n    acb = a' * b\n\n    @test AB ≈ ab\n    @test AtB ≈ atb\n    @test AcB ≈ acb\n    d_closeall()  # close the temporaries created above\nend\n\n@testset \"sort, T = $T, 10^$i elements\" for i in 0:6, T in [Int, Float64]\n    d = DistributedArrays.drand(T, 10^i)\n    @testset \"sample = $sample\" for sample in Any[true, false, (minimum(d),maximum(d)), rand(T, 10^i>512 ? 512 : 10^i)]\n        d2 = DistributedArrays.sort(d; sample=sample)\n        a  = convert(Array, d)\n        a2 = convert(Array, d2)\n        @test length(d) == length(d2)\n        @test sort(a) == a2\n    end\n    d_closeall()  # close the temporaries created above\nend\n\ncheck_leaks()\n\n@testset \"ddata\" begin\n    d = ddata(;T=Int, init=I->myid())\n    for p in workers()\n        @test p == remotecall_fetch(d->d[:L], p, d)\n    end\n    @test Int[workers()...] == gather(d)\n\n    close(d)\n\n    d = ddata(;T=Int, data=workers())\n    for p in workers()\n        @test p == remotecall_fetch(d->d[:L], p, d)\n    end\n    @test Int[workers()...] == gather(d)\n\n    close(d)\n\n    d = ddata(;T=Any, init=I->\"Hello World!\")\n    for p in workers()\n        @test \"Hello World!\" == remotecall_fetch(d->d[:L], p, d)\n    end\n    @test Any[\"Hello World!\" for p in workers()] == gather(d)\n\n    close(d)\nend\n\ncheck_leaks()\n\n@testset \"rand!\" begin\n    d = dzeros(30, 30)\n    rand!(d)\n\n    close(d)\nend\n\ncheck_leaks()\n\n@testset \"fill!\" begin\n    d = dzeros(30, 30)\n    fill!(d, 3.14)\n    @test all(x-> x == 3.14, d)\n\n    close(d)\nend\n\ncheck_leaks()\n\nd_closeall()\n\n@testset \"test for any leaks\" begin\n    sleep(1.0)     # allow time for any cleanup to complete\n    allrefszero = Bool[remotecall_fetch(()-> @lock(DistributedArrays.REFS.lock, isempty(DistributedArrays.REFS.data)), p) for p in procs()]\n    @test all(allrefszero)\n\n    allregistrieszero = Bool[remotecall_fetch(()-> @lock(DistributedArrays.REGISTRY.lock, isempty(DistributedArrays.REGISTRY.data)), p) for p in procs()]\n    @test all(allregistrieszero)\nend\n\n"
  },
  {
    "path": "test/explicit_imports.jl",
    "content": "using DistributedArrays, Test\nimport ExplicitImports\n\n@testset \"ExplicitImports\" begin\n    # No implicit imports in DistributedArrays (ie. no `using MyPkg`)\n    @test ExplicitImports.check_no_implicit_imports(DistributedArrays) === nothing\n\n    # No non-owning imports in DistributedArrays (ie. no `using LinearAlgebra: map`)\n    @test ExplicitImports.check_all_explicit_imports_via_owners(DistributedArrays) === nothing\n\n    # Limit non-public imports in DistributedArrays (ie. `using MyPkg: _non_public_internal_func`)\n    # to a few selected types and functions\n    @test ExplicitImports.check_all_explicit_imports_are_public(\n        DistributedArrays;\n        ignore = (\n            # Base\n            :Broadcasted,\n            :Callable,\n            (VERSION < v\"1.11\" ? (:tail,) : ())...,\n        ),\n    ) === nothing\n\n    # No stale imports in DistributedArrays (ie. no `using MyPkg: func` where `func` is not used in DistributedArrays)\n    @test ExplicitImports.check_no_stale_explicit_imports(DistributedArrays) === nothing\n\n    # No non-owning accesses in DistributedArrays (ie. no `... LinearAlgebra.map(...)`)\n    @test ExplicitImports.check_all_qualified_accesses_via_owners(DistributedArrays) === nothing\n\n    # Limit non-public accesses in DistributedArrays (ie. no `... MyPkg._non_public_internal_func(...)`)\n    # to a few selected types and methods from Base\n    @test ExplicitImports.check_all_qualified_accesses_are_public(\n        DistributedArrays;\n        ignore = (\n            # Base.Broadcast\n            :AbstractArrayStyle,\n            :DefaultArrayStyle,\n            :broadcasted,\n            :throwdm,\n            # Base\n            (VERSION < v\"1.11\" ? (Symbol(\"@propagate_inbounds\"),) : ())...,\n            :ReshapedArray,\n            :Slice,\n            :_all,\n            :_any,\n            :_mapreduce,\n            :check_reducedims,\n            :checkbounds_indices,\n            :index_lengths,\n            :mapreducedim!,\n            :promote_op,\n            :reducedim_initarray,\n            :reindex,\n            :setindex_shape_check,\n            :unalias,\n            # Serialization\n            :serialize_type,\n            # Statistics        \n            :_mean,\n        ),\n    ) === nothing\n\n    # No self-qualified accesses in DistributedArrays (ie. no `... DistributedArrays.func(...)`)\n    @test ExplicitImports.check_no_self_qualified_accesses(DistributedArrays) === nothing\nend\n"
  },
  {
    "path": "test/runtests.jl",
    "content": "using Test\nusing Distributed\nusing DistributedArrays\n\n# Disable scalar indexing to avoid falling back on generic methods\n# for AbstractArray\nDistributedArrays.allowscalar(false)\n\n# add at least 3 worker processes\nif nworkers() < 3\n    n = max(3, min(8, Sys.CPU_THREADS))\n    addprocs(n; exeflags=`--check-bounds=yes`)\nend\n@assert nprocs() > 3\n@assert nworkers() >= 3\n\n@everywhere using Distributed\n@everywhere using DistributedArrays\n@everywhere using DistributedArrays.SPMD\n@everywhere using Random\n@everywhere using LinearAlgebra\n\n@everywhere Random.seed!(1234 + myid())\n\nconst MYID = myid()\nconst OTHERIDS = filter(id-> id != MYID, procs())[rand(1:(nprocs()-1))]\n\nfunction check_leaks()\n    nrefs = @lock DistributedArrays.REFS.lock length(DistributedArrays.REFS.data)\n    if !iszero(nrefs)\n        sleep(0.1)  # allow time for any cleanup to complete and test again\n        nrefs = @lock DistributedArrays.REFS.lock length(DistributedArrays.REFS.data)\n        if !iszero(nrefs)\n            @warn(\"Probable leak of \", nrefs, \" darrays\")\n        end\n    end\nend\n\ninclude(\"aqua.jl\")\ninclude(\"explicit_imports.jl\")\ninclude(\"darray.jl\")\ninclude(\"spmd.jl\")\n\n"
  },
  {
    "path": "test/spmd.jl",
    "content": "@everywhere function spmd_test1()\n    barrier(;tag=:b1)\n\n    if myid() == 1\n        @assert SPMD.recvfrom(2) == \"Hello from 2\"\n        println(\"SPMD: Passed send/recv\")\n    elseif myid() == 2\n        data = \"Hello from 2\"\n        sendto(1, data)\n    end\n\n    stime = rand(1:5)\n#    println(\"Sleeping for $stime seconds\")\n    sleep(stime)\n    barrier(;tag=:b2)\n\n    bcast_val = nothing\n    if myid() == 1\n        bcast_val = rand(2)\n    end\n\n    bcast_val = bcast(bcast_val, 1)\n\n    if myid() == 1\n        @assert bcast_val == SPMD.recvfrom(2)\n        println(\"SPMD: Passed broadcast\")\n    elseif myid() == 2\n        sendto(1, bcast_val)\n    end\n\n    barrier()\n\n    scatter_data = nothing\n    if myid() == 1\n        scatter_data = rand(Int8, nprocs())\n    end\n    lp = scatter(scatter_data, 1, tag=1)\n\n    if myid() == 1\n        @assert scatter_data[2:2] == SPMD.recvfrom(2)\n        println(\"SPMD: Passed scatter 1\")\n    elseif myid() == 2\n        sendto(1, lp)\n    end\n\n    scatter_data = nothing\n    if myid() == 1\n        scatter_data = rand(Int8, nprocs()*2)\n    end\n    lp = scatter(scatter_data, 1, tag=2)\n\n    if myid() == 1\n        @assert scatter_data[3:4] == SPMD.recvfrom(2)\n        println(\"SPMD: Passed scatter 2\")\n    elseif myid() == 2\n        sendto(1, lp)\n    end\n\n    gathered_data = gather(myid(), 1, tag=3)\n    if myid() == 1\n        @assert gathered_data == procs()\n        println(\"SPMD: Passed gather 1\")\n    end\n\n    gathered_data = gather([myid(), myid()], 1, tag=4)\n    if myid() == 1\n        @assert gathered_data == [[p,p] for p in procs()]\n        println(\"SPMD: Passed gather 2\")\n    end\nend\n\nspmd(spmd_test1)\n\n# Test running only on the workers using the spmd function.\n\n# define the function everywhere\n@everywhere function foo_spmd(d_in, d_out, n)\n    pids=sort(vec(procs(d_in)))\n    pididx = findfirst(isequal(myid()), pids)\n    mylp = localpart(d_in)\n    localsum = 0\n\n    # Have each node exchange data with its neighbors\n    n_pididx = pididx+1 > length(pids) ? 1 : pididx+1\n    p_pididx = pididx-1 < 1 ? 
# Run foo_spmd on all workers: many invocations, all concurrent, each using an implicitly created context.\nin_arrays = map(x->DArray(I->fill(myid(), (map(length,I)...,)), (nworkers(), 2), workers(), [nworkers(),1]), 1:8)\nout_arrays = map(x->ddata(), 1:8)\n\n@sync for i in 1:8\n    @async spmd(foo_spmd, in_arrays[i], out_arrays[i], nworkers(); pids=workers())\nend\nfor i in 1:8\n    @test Any[sum(workers())*2 for i in 1:nworkers()] == gather(out_arrays[i])\nend\n\nprintln(\"SPMD: Passed testing of spmd function run concurrently\")\n\n# run concurrently with explicitly different contexts\n\n# define the function everywhere\n@everywhere function foo_spmd2(d_in, d_out, n)\n    pids=sort(vec(procs(d_in)))\n    pididx = findfirst(isequal(myid()), pids)\n    mylp = localpart(d_in)\n\n    # see if we have a value in the local store.\n    store = context_local_storage()\n\n    localsum = get!(store, :LOCALSUM, 0)\n\n    # Have each node exchange data with its neighbors\n    n_pididx = pididx+1 > length(pids) ? 1 : pididx+1\n    p_pididx = pididx-1 < 1 ? length(pids) : pididx-1\n\n    for i in 1:n\n        sendto(pids[n_pididx], mylp[2])\n        sendto(pids[p_pididx], mylp[1])\n\n        mylp[2] = SPMD.recvfrom(pids[p_pididx])\n        mylp[1] = SPMD.recvfrom(pids[n_pididx])\n\n        barrier(;pids=pids)\n        localsum = localsum + mylp[1] + mylp[2]\n    end\n\n    # finally store the sum in d_out\n    d_out[:L] = localsum\n    store[:LOCALSUM] = localsum\nend\n\nin_arrays = map(x->DArray(I->fill(myid(), (map(length,I)...,)), (nworkers(), 2), workers(), [nworkers(),1]), 1:8)\nout_arrays = map(x->ddata(), 1:8)\ncontexts = map(x->context(workers()), 1:8)\n\n@sync for i in 1:8\n    @async spmd(foo_spmd2, in_arrays[i], out_arrays[i], nworkers(); pids=workers(), context=contexts[i])\nend\n# The second run adds to the value stored by the previous run.\n@sync for i in 1:8\n    @async spmd(foo_spmd2, in_arrays[i], out_arrays[i], nworkers(); pids=workers(), context=contexts[i])\nend\n\nfor i in 1:8\n    @test Any[2*sum(workers())*2 for i in 1:nworkers()] == gather(out_arrays[i])\nend\n\n# verify that local stores with the appropriate context values exist.\n@everywhere begin\n    if myid() != 1\n        local n = 0\n        @lock DistributedArrays.SPMD.CONTEXTS.lock begin\n            for (k,v) in DistributedArrays.SPMD.CONTEXTS.data\n                store = v.store\n                localsum = store[:LOCALSUM]\n                if localsum != 2*sum(workers())*2\n                    println(\"localsum \", localsum, \" != $(2*sum(workers())*2)\")\n                    error(\"localsum mismatch\")\n                end\n                n += 1\n            end\n        end\n        @assert n == 8\n    end\nend\n\n# close the contexts\nforeach(close, contexts)\n\n# verify that the localstores have been deleted.\n@everywhere begin\n    @assert @lock DistributedArrays.SPMD.CONTEXTS.lock isempty(DistributedArrays.SPMD.CONTEXTS.data)\nend\n\nprintln(\"SPMD: Passed spmd function with explicit context run concurrently\")\n\n"
  }
]