Repository: JuliaParallel/DistributedArrays.jl
Branch: master
Commit: db355b31aefd
Files: 30
Total size: 138.8 KB

Directory structure:
gitextract_34pth6or/

├── .github/
│   ├── dependabot.yml
│   └── workflows/
│       ├── CI.yml
│       ├── CompatHelper.yml
│       └── TagBot.yml
├── .gitignore
├── LICENSE.md
├── Project.toml
├── README.md
├── codecov.yml
├── docs/
│   ├── .gitignore
│   ├── Project.toml
│   ├── make.jl
│   └── src/
│       ├── api.md
│       └── index.md
├── ext/
│   ├── SparseArraysExt.jl
│   └── StatisticsExt.jl
├── src/
│   ├── DistributedArrays.jl
│   ├── broadcast.jl
│   ├── core.jl
│   ├── darray.jl
│   ├── linalg.jl
│   ├── mapreduce.jl
│   ├── serialize.jl
│   ├── sort.jl
│   └── spmd.jl
└── test/
    ├── aqua.jl
    ├── darray.jl
    ├── explicit_imports.jl
    ├── runtests.jl
    └── spmd.jl

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/dependabot.yml
================================================
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
version: 2
updates:
  - package-ecosystem: "github-actions"
    directory: "/" # Location of package manifests
    schedule:
      interval: "weekly"


================================================
FILE: .github/workflows/CI.yml
================================================
name: CI
on:
  pull_request:
    branches:
      - master
  push:
    branches:
      - master
    tags: '*'
  workflow_dispatch:

concurrency:
  # Skip intermediate builds: all builds except for builds on the `master` branch
  # Cancel intermediate builds: only pull request builds
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref != 'refs/heads/master' || github.run_number }}
  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}

jobs:
  test:
    name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ github.event_name }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        version:
          - 'min'
          - 'lts'
          - '1'
          - 'pre'
        os:
          - ubuntu-latest
          - windows-latest
          - macOS-latest
    steps:
      - uses: actions/checkout@v6
      - uses: julia-actions/setup-julia@v3
        with:
          version: ${{ matrix.version }}
      - uses: julia-actions/cache@v3
      - uses: julia-actions/julia-buildpkg@v1
      - uses: julia-actions/julia-runtest@v1
      - uses: julia-actions/julia-processcoverage@v1
      - uses: codecov/codecov-action@v5
        with:
          files: lcov.info
          token: ${{ secrets.CODECOV_TOKEN }}
          fail_ci_if_error: true
  docs:
    name: Documentation
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - uses: julia-actions/setup-julia@v3
        with:
          version: '1'
      - uses: julia-actions/cache@v3
      - run: julia --project=docs -e 'import Pkg; Pkg.instantiate()'
      - run: |
          julia --project=docs -e '
            using Documenter: doctest
            using DistributedArrays
            doctest(DistributedArrays)'
      - run: julia --project=docs docs/make.jl
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}


================================================
FILE: .github/workflows/CompatHelper.yml
================================================
name: CompatHelper
on:
  schedule:
    - cron: 0 0 * * *
  workflow_dispatch:
jobs:
  CompatHelper:
    runs-on: ubuntu-latest
    steps:
      - name: "Add the General registry via Git"
        run: |
          import Pkg
          ENV["JULIA_PKG_SERVER"] = ""
          Pkg.Registry.add("General")
        shell: julia --color=yes {0}
      - name: "Install CompatHelper"
        run: |
          import Pkg
          name = "CompatHelper"
          uuid = "aa819f21-2bde-4658-8897-bab36330d9b7"
          version = "3"
          Pkg.add(; name, uuid, version)
        shell: julia --color=yes {0}
      - name: "Run CompatHelper"
        run: |
          import CompatHelper
          CompatHelper.main(; subdirs = ["", "docs"])
        shell: julia --color=yes {0}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }}


================================================
FILE: .github/workflows/TagBot.yml
================================================
name: TagBot
on:
  issue_comment:
    types:
      - created
  workflow_dispatch:
jobs:
  TagBot:
    if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot'
    runs-on: ubuntu-latest
    steps:
      - uses: JuliaRegistries/TagBot@v1
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          ssh: ${{ secrets.DOCUMENTER_KEY }}


================================================
FILE: .gitignore
================================================
Manifest.toml
*.jl.cov
*.jl.mem
.DS_Store
.vscode/

================================================
FILE: LICENSE.md
================================================
The DistributedArrays.jl package is licensed under the MIT "Expat" License:

> Copyright (c) 2015: Julia Parallel Contributors
>
> Permission is hereby granted, free of charge, to any person obtaining
> a copy of this software and associated documentation files (the
> "Software"), to deal in the Software without restriction, including
> without limitation the rights to use, copy, modify, merge, publish,
> distribute, sublicense, and/or sell copies of the Software, and to
> permit persons to whom the Software is furnished to do so, subject to
> the following conditions:
>
> The above copyright notice and this permission notice shall be
> included in all copies or substantial portions of the Software.
>
> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
> IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
> CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
> TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
> SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


================================================
FILE: Project.toml
================================================
name = "DistributedArrays"
uuid = "aaf54ef3-cdf8-58ed-94cc-d582ad619b94"
version = "0.6.9"

[deps]
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Primes = "27ebfcd6-29c5-5fa9-bf4b-fb8fc14df3ae"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"

[weakdeps]
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[extensions]
SparseArraysExt = "SparseArrays"
StatisticsExt = "Statistics"

[compat]
Aqua = "0.8.12"
Distributed = "<0.0.1, 1"
ExplicitImports = "1.13.2"
LinearAlgebra = "<0.0.1, 1"
Primes = "0.4, 0.5"
Random = "<0.0.1, 1"
Serialization = "<0.0.1, 1"
SparseArrays = "<0.0.1, 1"
SpecialFunctions = "0.8, 1, 2"
Statistics = "<0.0.1, 1"
Test = "<0.0.1, 1"
julia = "1.10"

[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
ExplicitImports = "7d51a73a-1435-4ff3-83d9-f097790105c7"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Aqua", "ExplicitImports", "SparseArrays", "SpecialFunctions", "Statistics", "Test"]


================================================
FILE: README.md
================================================
# DistributedArrays

*Distributed arrays for Julia.*

| **Documentation**                                                         | **Build Status**                                              |
|:-------------------------------------------------------------------------:|:-------------------------------------------------------------:|
| [![][docs-stable-img]][docs-stable-url] [![][docs-dev-img]][docs-dev-url] | [![][travis-img]][travis-url] [![][codecov-img]][codecov-url] |

## Introduction

`DistributedArrays.jl` uses the stdlib [`Distributed`][distributed-docs] to implement a *Global Array* interface.
A `DArray` is distributed across a set of workers. Each worker can read and write from its local portion of the array and each worker has read-only access to the portions of the array held by other workers.

## Installation

The package can be installed with the Julia package manager.
From the Julia REPL, type `]` to enter the Pkg REPL mode and run:

```
pkg> add DistributedArrays
```

Or, equivalently, via the `Pkg` API:

```julia
julia> import Pkg; Pkg.add("DistributedArrays")
```

## Documentation

- [**STABLE**][docs-stable-url] &mdash; **documentation of the most recently tagged version.**
- [**DEVEL**][docs-dev-url] &mdash; *documentation of the in-development version.*

## Project Status

The package is tested against
Julia 1.10.0 (oldest supported Julia version),
the Julia LTS version,
the latest stable release of Julia,
and the pre-release version of Julia.

## Questions and Contributions

Usage questions can be posted on the [Julia Discourse forum][discourse-tag-url] under the `Parallel/Distributed` category, in the #parallel channel of the [Julia Slack](https://julialang.org/community/).

Contributions are very welcome, as are feature requests and suggestions. Please open an [issue][issues-url] if you encounter any problems. In particular additions to documentation are encouraged!

[contrib-url]: https://juliadocs.github.io/Documenter.jl/latest/man/contributing/
[discourse-tag-url]: https://discourse.julialang.org/c/domain/parallel

[docs-dev-img]: https://img.shields.io/badge/docs-dev-blue.svg
[docs-dev-url]: https://juliaparallel.github.io/DistributedArrays.jl/dev

[docs-stable-img]: https://img.shields.io/badge/docs-stable-blue.svg
[docs-stable-url]: https://juliaparallel.github.io/DistributedArrays.jl/stable

[travis-img]: https://travis-ci.org/JuliaParallel/DistributedArrays.jl.svg?branch=master
[travis-url]: https://travis-ci.org/JuliaParallel/DistributedArrays.jl

[codecov-img]: https://codecov.io/gh/JuliaParallel/DistributedArrays.jl/branch/master/graph/badge.svg
[codecov-url]: https://codecov.io/gh/JuliaParallel/DistributedArrays.jl

[issues-url]: https://github.com/JuliaParallel/DistributedArrays.jl/issues
[distributed-docs]: https://docs.julialang.org/en/v1/manual/parallel-computing/#Multi-Core-or-Distributed-Processing-1


================================================
FILE: codecov.yml
================================================
 comment: off


================================================
FILE: docs/.gitignore
================================================
build/


================================================
FILE: docs/Project.toml
================================================
[deps]
DistributedArrays = "aaf54ef3-cdf8-58ed-94cc-d582ad619b94"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"

[compat]
DistributedArrays = "0.6"
Documenter = "1"

[sources.DistributedArrays]
path = ".."


================================================
FILE: docs/make.jl
================================================
using Documenter, DistributedArrays

makedocs(
    modules = [DistributedArrays],
    format = Documenter.HTML(),
    sitename = "DistributedArrays.jl",
    pages = [
        "Introduction" => "index.md"
        "API" => "api.md"
    ],
    doctest = true
)

deploydocs(
    repo = "github.com/JuliaParallel/DistributedArrays.jl.git",
)


================================================
FILE: docs/src/api.md
================================================
# API

```@autodocs
Modules = [DistributedArrays]
```


================================================
FILE: docs/src/index.md
================================================
# DistributedArrays.jl

```@contents
```

Distributed Arrays
------------------

Large computations are often organized around large arrays of data. In these
cases, a particularly natural way to obtain parallelism is to distribute arrays
among several processes. This combines the memory resources of multiple
machines, allowing use of arrays too large to fit on one machine. Each process
can read and write to the part of the array it owns and has read-only access to
the parts it doesn't own. This provides a ready answer to the question of how a
program should be divided among machines.

Julia distributed arrays are implemented by the `DArray` type. A
`DArray` has an element type and dimensions just like an `Array`.
A `DArray` can also use arbitrary array-like types to represent the local
chunks that store actual data. The data in a `DArray` is distributed by
dividing the index space into some number of blocks in each dimension.

Common kinds of arrays can be constructed with functions beginning with
`d`:

```julia
dzeros(100,100,10)
dones(100,100,10)
drand(100,100,10)
drandn(100,100,10)
dfill(x,100,100,10)
```

In the last case, each element will be initialized to the specified
value `x`. These functions automatically pick a distribution for you.
For more control, you can specify which processes to use, and how the
data should be distributed:

```julia
dzeros((100,100), workers()[1:4], [1,4])
```

The second argument specifies that the array should be created on the first
four workers. When dividing data among a large number of processes,
one often sees diminishing returns in performance. Placing `DArray`s
on a subset of processes allows multiple `DArray` computations to
happen at once, with a higher ratio of work to communication on each
process.

The third argument specifies a distribution; the nth element of
this array specifies how many pieces dimension n should be divided into.
In this example the first dimension will not be divided, and the second
dimension will be divided into 4 pieces. Therefore each local chunk will be
of size `(100,25)`. Note that the product of the distribution array must
equal the number of processes.

* `distribute(a::Array)` converts a local array to a distributed array.

* `localpart(d::DArray)` obtains the locally-stored portion
  of a  `DArray`.

* Localparts can be retrieved and set via the indexing syntax too.
  Indexing via symbols is used for this, specifically symbols `:L`,`:LP`,`:l`,`:lp` which
  are all equivalent. For example, `d[:L]` returns the localpart of `d`
  while `d[:L]=v` sets `v` as the localpart of `d`.

* `localindices(a::DArray)` gives a tuple of the index ranges owned by the
  local process.

* `convert(Array, a::DArray)` brings all the data to the local process.

Indexing a `DArray` (square brackets) with ranges of indices always
creates a `SubArray`, not copying any data.


Constructing Distributed Arrays
-------------------------------

The primitive `DArray` constructor has the following somewhat elaborate signature:

```julia
DArray(init, dims[, procs, dist])
```

`init` is a function that accepts a tuple of index ranges. This function should
allocate a local chunk of the distributed array and initialize it for the specified
indices. `dims` is the overall size of the distributed array.
`procs` optionally specifies a vector of process IDs to use.
`dist` is an integer vector specifying how many chunks the
distributed array should be divided into in each dimension.

The last two arguments are optional, and defaults will be used if they
are omitted.

As an example, here is how to turn the local array constructor `fill`
into a distributed array constructor:

```julia
dfill(v, args...) = DArray(I->fill(v, map(length,I)), args...)
```

In this case the `init` function only needs to call `fill` with the
dimensions of the local piece it is creating.

`DArray`s can also be constructed from multidimensional `Array` comprehensions with
the `@DArray` macro syntax.  This syntax is just sugar for the primitive `DArray` constructor:

```julia
julia> [i+j for i = 1:5, j = 1:5]
5x5 Array{Int64,2}:
 2  3  4  5   6
 3  4  5  6   7
 4  5  6  7   8
 5  6  7  8   9
 6  7  8  9  10

julia> @DArray [i+j for i = 1:5, j = 1:5]
5x5 DistributedArrays.DArray{Int64,2,Array{Int64,2}}:
 2  3  4  5   6
 3  4  5  6   7
 4  5  6  7   8
 5  6  7  8   9
 6  7  8  9  10
```

### Construction from arrays generated on separate processes
`DArray`s can also be constructed from arrays that have been constructed on separate processes, as demonstrated below:
```julia
ras = [@spawnat p rand(30,30) for p in workers()[1:4]]
ras = reshape(ras,(2,2))
D   = DArray(ras)
```
An alternative syntax is:
```julia
r1 = DistributedArrays.remotecall(() -> rand(10,10), workers()[1]) 
r2 = DistributedArrays.remotecall(() -> rand(10,10), workers()[2]) 
r3 = DistributedArrays.remotecall(() -> rand(10,10), workers()[3]) 
r4 = DistributedArrays.remotecall(() -> rand(10,10), workers()[4]) 
D  = DArray(reshape([r1 r2 r3 r4], (2,2))) 
```
The distribution of indices across workers can be checked with
```julia
[@fetchfrom p localindices(D) for p in workers()]
```


Distributed Array Operations
----------------------------

At this time, distributed arrays do not have much functionality. Their
major utility is allowing communication to be done via array indexing, which
is convenient for many problems. As an example, consider implementing the
"life" cellular automaton, where each cell in a grid is updated according
to its neighboring cells. To compute a chunk of the result of one iteration,
each process needs the immediate neighbor cells of its local chunk. The
following code accomplishes this:

```julia
function life_step(d::DArray)
    DArray(size(d),procs(d)) do I
        top   = mod(first(I[1])-2,size(d,1))+1
        bot   = mod( last(I[1])  ,size(d,1))+1
        left  = mod(first(I[2])-2,size(d,2))+1
        right = mod( last(I[2])  ,size(d,2))+1

        old = Array{Bool}(undef, length(I[1])+2, length(I[2])+2)
        old[1      , 1      ] = d[top , left]   # left side
        old[2:end-1, 1      ] = d[I[1], left]
        old[end    , 1      ] = d[bot , left]
        old[1      , 2:end-1] = d[top , I[2]]
        old[2:end-1, 2:end-1] = d[I[1], I[2]]   # middle
        old[end    , 2:end-1] = d[bot , I[2]]
        old[1      , end    ] = d[top , right]  # right side
        old[2:end-1, end    ] = d[I[1], right]
        old[end    , end    ] = d[bot , right]

        life_rule(old)
    end
end
```

As you can see, we use a series of indexing expressions to fetch
data into a local array `old`. Note that the `do` block syntax is
convenient for passing `init` functions to the `DArray` constructor.
Next, the serial function `life_rule` is called to apply the update rules
to the data, yielding the needed `DArray` chunk. Nothing about `life_rule`
is `DArray`-specific, but we list it here for completeness:

```julia
function life_rule(old)
    m, n = size(old)
    new = similar(old, m-2, n-2)
    for j = 2:n-1
        for i = 2:m-1
            nc = +(old[i-1,j-1], old[i-1,j], old[i-1,j+1],
                   old[i  ,j-1],             old[i  ,j+1],
                   old[i+1,j-1], old[i+1,j], old[i+1,j+1])
            new[i-1,j-1] = (nc == 3 || nc == 2 && old[i,j])
        end
    end
    new
end
```


Numerical Results of Distributed Computations
---------------------------------------------

Floating point arithmetic is not associative and this comes up
when performing distributed computations over `DArray`s.  All `DArray`
operations are performed over the `localpart` chunks and then aggregated.
The change in ordering of the operations will change the numeric result as
seen in this simple example:

```julia
julia> addprocs(8);

julia> using DistributedArrays

julia> A = fill(1.1, (100,100));

julia> sum(A)
11000.000000000013

julia> DA = distribute(A);

julia> sum(DA)
11000.000000000127

julia> sum(A) == sum(DA)
false
```

The ultimate ordering of operations will be dependent on how the `Array` is distributed.


Garbage Collection and `DArray`s
------------------------------

When a `DArray` is constructed (typically on the master process), the returned `DArray` objects stores information on how the
array is distributed, which processor holds which indices and so on. When the `DArray` object
on the master process is garbage collected, all participating workers are notified and
localparts of the `DArray` freed on each worker.

Since the size of the `DArray` object itself is small, a problem arises as `gc` on the master faces no memory pressure to
collect the `DArray` immediately. This results in a delay of the memory being released on the participating workers.

Therefore it is highly recommended to explicitly call `close(d::DArray)` as soon as user code
has finished working with the distributed array.

It is also important to note that the localparts of the `DArray` is collected from all participating workers
when the `DArray` object on the process creating the `DArray` is collected. It is therefore important to maintain
a reference to a `DArray` object on the creating process for as long as it is being computed upon.

`d_closeall()` is another useful function to manage distributed memory. It releases all `DArrays` created from
the calling process, including any temporaries created during computation.


Working with distributed non-array data (requires Julia 0.6)
------------------------------------------------------------

The function `ddata(;T::Type=Any, init::Function=I->nothing, pids=workers(), data::Vector=[])` can be used
to created a distributed vector whose localparts need not be Arrays.

It returns a `DArray{T,1,T}`, i.e., the element type and localtype of the array are the same.

`ddata()` constructs a distributed vector of length `nworkers()` where each localpart can hold any value,
initially initialized to `nothing`.

Argument `data` if supplied is distributed over the `pids`. `length(data)` must be a multiple of `length(pids)`.
If the multiple is 1, returns a `DArray{T,1,T}` where T is `eltype(data)`. If the multiple is greater than 1,
returns a `DArray{T,1,Array{T,1}}`, i.e., it is equivalent to calling `distribute(data)`.

`gather{T}(d::DArray{T,1,T})` returns an `Array{T,1}` consisting of all distributed elements of `d`.

Given a `DArray{T,1,T}` object `d`, `d[:L]` returns the localpart on a worker. `d[i]` returns the `localpart`
on the ith worker that `d` is distributed over.


SPMD Mode (An MPI Style SPMD mode with MPI like primitives, requires Julia 0.6)
-------------------------------------------------------------------------------
SPMD, i.e., a Single Program Multiple Data mode, is implemented by submodule `DistributedArrays.SPMD`. In this mode the same function is executed in parallel on all participating nodes. This is a typical style of MPI programs where the same program is executed on all processors. A basic subset of MPI-like primitives are currently supported. As a programming model it should be familiar to folks with an MPI background.

The same block of code is executed concurrently on all workers using the `spmd` function.

```julia
# define foo() on all workers
@everywhere function foo(arg1, arg2)
    ....
end

# call foo() everywhere using the `spmd` function
d_in=DArray(.....)
d_out=ddata()
spmd(foo,d_in,d_out; pids=workers()) # executes on all workers
```

`spmd` is defined as `spmd(f, args...; pids=procs(), context=nothing)`

`args` is one or more arguments to be passed to `f`. `pids` identifies the workers
that `f` needs to be run on. `context` identifies a run context, which is explained
later.

The following primitives can be used in SPMD mode.

- `sendto(pid, data; tag=nothing)` - sends `data` to `pid`

- `recvfrom(pid; tag=nothing)` - receives data from `pid`

- `recvfrom_any(; tag=nothing)` - receives data from any `pid`

- `barrier(;pids=procs(), tag=nothing)` - all tasks wait and then proceed

- `bcast(data, pid; tag=nothing, pids=procs())` - broadcasts the same data over `pids` from `pid`

- `scatter(x, pid; tag=nothing, pids=procs())` - distributes `x` over `pids` from `pid`

- `gather(x, pid; tag=nothing, pids=procs())` - collects data from `pids` onto worker `pid`

Tag `tag` should be used to differentiate between consecutive calls of the same type, for example,
consecutive `bcast` calls.

`spmd` and spmd related functions are defined in submodule `DistributedArrays.SPMD`. You will need to
import it explicitly, or prefix functions that can can only be used in spmd mode with `SPMD.`, for example,
`SPMD.sendto`.


Example
-------

This toy example exchanges data with each of its neighbors `n` times.

```julia
using Distributed
using DistributedArrays
addprocs(8)
@everywhere using DistributedArrays
@everywhere using DistributedArrays.SPMD

d_in=d=DArray(I->fill(myid(), (map(length,I)...,)), (nworkers(), 2), workers(), [nworkers(),1])
d_out=ddata();

# define the function everywhere
@everywhere function foo_spmd(d_in, d_out, n)
    pids = sort(vec(procs(d_in)))
    pididx = findfirst(isequal(myid()), pids)
    mylp = d_in[:L]
    localsum = 0

    # Have each worker exchange data with its neighbors
    n_pididx = pididx+1 > length(pids) ? 1 : pididx+1
    p_pididx = pididx-1 < 1 ? length(pids) : pididx-1

    for i in 1:n
        sendto(pids[n_pididx], mylp[2])
        sendto(pids[p_pididx], mylp[1])

        mylp[2] = recvfrom(pids[p_pididx])
        mylp[1] = recvfrom(pids[n_pididx])

        barrier(;pids=pids)
        localsum = localsum + mylp[1] + mylp[2]
    end

    # finally store the sum in d_out
    d_out[:L] = localsum
end

# run foo_spmd on all workers
spmd(foo_spmd, d_in, d_out, 10, pids=workers())

# print values of d_in and d_out after the run
println(d_in)
println(d_out)
```


SPMD Context
------------

Each SPMD run is implicitly executed in a different context. This allows for multiple `spmd` calls to
be active at the same time. A SPMD context can be explicitly specified via keyword arg `context` to `spmd`.

`context(pids=procs())` returns a new SPMD context.

A SPMD context also provides a context local storage, a dict, which can be used to store
key-value pairs between spmd runs under the same context.

`context_local_storage()` returns the dictionary associated with the context.

NOTE: Implicitly defined contexts, i.e., `spmd` calls without specifying a `context` create a context
which live only for the duration of the call. Explicitly created context objects can be released
early by calling `close(ctxt::SPMDContext)`. This will release the local storage dictionaries
on all participating `pids`. Else they will be released when the context object is gc'ed
on the node that created it.


Nested `spmd` calls
-------------------
As `spmd` executes the specified function on all participating nodes, we need to be careful with nesting `spmd` calls.

An example of an unsafe(wrong) way:
```julia
function foo(.....)
    ......
    spmd(bar, ......)
    ......
end

function bar(....)
    ......
    spmd(baz, ......)
    ......
end

spmd(foo,....)
```
In the above example, `foo`, `bar` and `baz` are all functions wishing to leverage distributed computation. However, they themselves may be currently part of a `spmd` call. A safe way to handle such a scenario is to only drive parallel computation from the master process.

The correct way (only have the driver process initiate `spmd` calls):
```julia
function foo()
    ......
    myid()==1 && spmd(bar, ......)
    ......
end

function bar()
    ......
    myid()==1 && spmd(baz, ......)
    ......
end

spmd(foo,....)
```

This is also true of functions which automatically distribute computation on DArrays.
```julia
function foo(d::DArray)
    ......
    myid()==1 && map!(bar, d)
    ......
end
spmd(foo,....)
```
Without the `myid()` check, the `spmd` call to `foo` would execute `map!` from all nodes, which is probably not what we want.

Similarly `@everywhere` from within a SPMD run should also be driven from the master node only.


================================================
FILE: ext/SparseArraysExt.jl
================================================
module SparseArraysExt

using DistributedArrays: DArray, SubDArray, SubOrDArray, localpart
using DistributedArrays.Distributed: remotecall_fetch
using SparseArrays: SparseArrays, nnz

function SparseArrays.nnz(A::DArray)
    B = asyncmap(A.pids) do p
        remotecall_fetch(nnz∘localpart, p, A)
    end
    return reduce(+, B)
end

# Fix method ambiguities
# TODO: Improve efficiency?
Base.copyto!(dest::SubOrDArray{<:Any,2}, src::SparseArrays.AbstractSparseMatrixCSC) = copyto!(dest, Matrix(src))
@static if isdefined(SparseArrays, :CHOLMOD)
    Base.copyto!(dest::SubOrDArray, src::SparseArrays.CHOLMOD.Dense) = copyto!(dest, Array(src))
    Base.copyto!(dest::SubOrDArray{T}, src::SparseArrays.CHOLMOD.Dense{T}) where {T<:Union{Float32,Float64,ComplexF32,ComplexF64}} = copyto!(dest, Array(src))
    Base.copyto!(dest::SubOrDArray{T,2}, src::SparseArrays.CHOLMOD.Dense{T}) where {T<:Union{Float32,Float64,ComplexF32,ComplexF64}} = copyto!(dest, Array(src))
end

# Fix method ambiguities
for T in (:DArray, :SubDArray)
    @eval begin
        Base.:(==)(d1::$T{<:Any,1}, d2::SparseArrays.ReadOnly) = d1 == parent(d2)
        Base.:(==)(d1::SparseArrays.ReadOnly, d2::$T{<:Any,1}) = parent(d1) == d2
    end
end

end


================================================
FILE: ext/StatisticsExt.jl
================================================
module StatisticsExt

using DistributedArrays: DArray
using Statistics: Statistics

Statistics._mean(f, A::DArray, region) = sum(f, A, dims = region) ./ prod((size(A, i) for i in region))

end


================================================
FILE: src/DistributedArrays.jl
================================================
module DistributedArrays

using Base: Callable
using Base.Broadcast: BroadcastStyle, Broadcasted

using Distributed: Distributed, RemoteChannel, Future, myid, nworkers, procs, remotecall, remotecall_fetch, remotecall_wait, worker_id_from_socket, workers
using LinearAlgebra: LinearAlgebra, Adjoint, Diagonal, I, Transpose, adjoint, adjoint!, axpy!, dot, lmul!, mul!, norm, rmul!, transpose, transpose!
using Random: Random, rand!
using Serialization: Serialization, AbstractSerializer, deserialize, serialize

using Primes: factor

# DArray exports
export DArray, SubDArray, SubOrDArray, @DArray
export dzeros, dones, dfill, drand, drandn, distribute, localpart, localindices, ppeval

# non-array distributed data
export ddata, gather

# immediate release of localparts
export d_closeall

include("darray.jl")
include("core.jl")
include("serialize.jl")
include("broadcast.jl")
include("mapreduce.jl")
include("linalg.jl")
include("sort.jl")

include("spmd.jl")
export SPMD

end # module


================================================
FILE: src/broadcast.jl
================================================
###
# Distributed broadcast implementation
##

# We define a custom ArrayStyle here since we need to keep track of
# the fact that it is Distributed and what kind of underlying broadcast behaviour
# we will encounter.
struct DArrayStyle{Style <: Union{Nothing,BroadcastStyle}} <: Broadcast.AbstractArrayStyle{Any} end
DArrayStyle(::S) where {S} = DArrayStyle{S}()
DArrayStyle(::S, ::Val{N}) where {S,N} = DArrayStyle(S(Val(N)))
DArrayStyle(::Val{N}) where N = DArrayStyle{Broadcast.DefaultArrayStyle{N}}()

Broadcast.BroadcastStyle(::Type{<:DArray{<:Any, N, A}}) where {N, A} = DArrayStyle(BroadcastStyle(A), Val(N))

# promotion rules
# TODO: test this
function Broadcast.BroadcastStyle(::DArrayStyle{AStyle}, ::DArrayStyle{BStyle}) where {AStyle, BStyle}
    DArrayStyle(BroadcastStyle(AStyle, BStyle))
end

function Broadcast.broadcasted(::DArrayStyle{Style}, f, args...) where Style
    inner = Broadcast.broadcasted(Style(), f, args...)
    if inner isa Broadcasted
        return Broadcasted{DArrayStyle{Style}}(inner.f, inner.args, inner.axes)
    else # eagerly evaluated
        return inner
    end
end

# # deal with one layer deep lazy arrays
# BroadcastStyle(::Type{<:LinearAlgebra.Transpose{<:Any,T}}) where T <: DArray = BroadcastStyle(T)
# BroadcastStyle(::Type{<:LinearAlgebra.Adjoint{<:Any,T}}) where T <: DArray = BroadcastStyle(T)
# BroadcastStyle(::Type{<:SubArray{<:Any,<:Any,<:T}}) where T <: DArray = BroadcastStyle(T)

# # This Union is a hack. Ideally Base would have a Transpose <: WrappedArray <: AbstractArray
# # and we could define our methods in terms of Union{DArray, WrappedArray{<:Any, <:DArray}}
# const DDestArray = Union{DArray,
#                          LinearAlgebra.Transpose{<:Any,<:DArray},
#                          LinearAlgebra.Adjoint{<:Any,<:DArray},
#                          SubArray{<:Any, <:Any, <:DArray}}
const DDestArray = DArray

# This method is responsible for selection the output type of broadcast
function Base.similar(bc::Broadcasted{<:DArrayStyle{Style}}, ::Type{ElType}) where {Style, ElType}
    DArray(map(length, axes(bc))) do I 
        # create fake Broadcasted for underlying ArrayStyle
        bc′ = Broadcasted{Style}(identity, (), map(length, I))
        similar(bc′, ElType)
    end
end

##
# Ref https://docs.julialang.org/en/v1/manual/interfaces/#extending-in-place-broadcast-2
#
# We purposefully only specialise `copyto!`,
# Broadcast implementation that defers to the underlying BroadcastStyle. We can't 
# assume that `getindex` is fast, furthermore  we can't assume that the distribution of
# DArray across workers is equal or that the underlying array type is consistent.
#
# Implementation:
#   - first distribute all arguments
#     - Q: How do decide on the cuts
#   - then localise arguments on each node
##
@inline function Base.copyto!(dest::DDestArray, bc::Broadcasted{Nothing})
    axes(dest) == axes(bc) || Broadcast.throwdm(axes(dest), axes(bc))

    # Distribute Broadcasted
    # This will turn local AbstractArrays into DArrays
    dbc = bcdistribute(bc)

    @sync for p in procs(dest)
        @async remotecall_wait(p) do
            # get the indices for the localpart
            lpidx = localpartindex(dest)
            @assert lpidx != 0
            # create a local version of the broadcast, by constructing views
            # Note: creates copies of the argument
            lbc = bclocal(dbc, dest.indices[lpidx])
            copyto!(localpart(dest), lbc)
        end
    end

    return dest
end

# Test
# a = Array
# a .= DArray(x,y)

@inline function Base.copy(bc::Broadcasted{<:DArrayStyle})
    dbc = bcdistribute(bc)
    # TODO: teach DArray about axes since this is wrong for OffsetArrays
    DArray(map(length, axes(bc))) do I
        lbc = bclocal(dbc, I)
        copy(lbc)
    end
end

# _bcview creates takes the shapes of a view and the shape of a broadcasted argument,
# and produces the view over that argument that constitutes part of the broadcast
# it is in a sense the inverse of _bcs in Base.Broadcast
_bcview(::Tuple{}, ::Tuple{}) = ()
_bcview(::Tuple{}, view::Tuple) = ()
_bcview(shape::Tuple, ::Tuple{}) = (shape[1], _bcview(tail(shape), ())...)
function _bcview(shape::Tuple, view::Tuple)
    return (_bcview1(shape[1], view[1]), _bcview(tail(shape), tail(view))...)
end

# _bcview1 handles the logic for a single dimension
function _bcview1(a, b)
    if a == 1 || a == 1:1
        return 1:1
    elseif first(a) <= first(b) <= last(a) &&
           first(a) <= last(b)  <= last(b)
        return b
    else
        throw(DimensionMismatch("broadcast view could not be constructed"))
    end
end

# Distribute broadcast
# TODO: How to decide on cuts
@inline bcdistribute(bc::Broadcasted{Style}) where Style<:Union{Nothing,BroadcastStyle} = Broadcasted{DArrayStyle{Style}}(bc.f, bcdistribute_args(bc.args), bc.axes)
@inline bcdistribute(bc::Broadcasted{Style}) where Style<:DArrayStyle = Broadcasted{Style}(bc.f, bcdistribute_args(bc.args), bc.axes)

# ask BroadcastStyle to decide if argument is in need of being distributed
bcdistribute(x::T) where T = _bcdistribute(BroadcastStyle(T), x)
_bcdistribute(::DArrayStyle, x) = x
# Don't bother distributing singletons
_bcdistribute(::Broadcast.AbstractArrayStyle{0}, x) = x
_bcdistribute(::Broadcast.AbstractArrayStyle, x) = distribute(x)
_bcdistribute(::Any, x) = x

@inline bcdistribute_args(args::Tuple) = (bcdistribute(args[1]), bcdistribute_args(tail(args))...)
bcdistribute_args(args::Tuple{Any}) = (bcdistribute(args[1]),)
bcdistribute_args(args::Tuple{}) = ()

# dropping axes here since recomputing is easier
@inline bclocal(bc::Broadcasted{DArrayStyle{Style}}, idxs) where Style<:Union{Nothing,BroadcastStyle} = Broadcasted{Style}(bc.f, bclocal_args(_bcview(axes(bc), idxs), bc.args))

# bclocal will do a view of the data and the copy it over
# except when the data already is local
function bclocal(x::DArray{T, N, AT}, idxs) where {T, N, AT}
    bcidxs = _bcview(axes(x), idxs)
    makelocal(x, bcidxs...)
end
bclocal(x, idxs) = x

@inline bclocal_args(idxs, args::Tuple) = (bclocal(args[1], idxs), bclocal_args(idxs, tail(args))...)
bclocal_args(idxs, args::Tuple{Any}) = (bclocal(args[1], idxs),)
bclocal_args(idxs, args::Tuple{}) = ()


================================================
FILE: src/core.jl
================================================
# Thread-safe registry of DArray references
struct DArrayRegistry
    data::Dict{Tuple{Int,Int}, Any}
    lock::ReentrantLock
    DArrayRegistry() = new(Dict{Tuple{Int,Int}, Any}(), ReentrantLock())
end
const REGISTRY = DArrayRegistry()

function Base.get(r::DArrayRegistry, id::Tuple{Int,Int}, default)
    @lock r.lock begin
        return get(r.data, id, default)
    end
end
function Base.getindex(r::DArrayRegistry, id::Tuple{Int,Int})
    @lock r.lock begin
        return r.data[id]
    end
end
function Base.setindex!(r::DArrayRegistry, val, id::Tuple{Int,Int})
    @lock r.lock begin
        r.data[id] = val
    end
    return r
end
function Base.delete!(r::DArrayRegistry, id::Tuple{Int,Int})
    @lock r.lock delete!(r.data, id)
    return r
end

# Thread-safe set of IDs of DArrays created on this node
struct DArrayRefs
    data::Set{Tuple{Int,Int}}
    lock::ReentrantLock
    DArrayRefs() = new(Set{Tuple{Int,Int}}(), ReentrantLock())
end
const REFS = DArrayRefs()

function Base.push!(r::DArrayRefs, id::Tuple{Int,Int})
    # Ensure id refers to a DArray created on this node
    if first(id) != myid()
        throw(
            ArgumentError(
                lazy"`DArray` is not created on the current worker: Only `DArray`s created on worker $(myid()) can be stored in this set but the `DArray` was created on worker $(first(id))."))
    end
    @lock r.lock begin
        return push!(r.data, id)
    end
end
function Base.delete!(r::DArrayRefs, id::Tuple{Int,Int})
    @lock r.lock delete!(r.data, id)
    return r
end

# Global counter to generate a unique ID for each DArray
const DID = Threads.Atomic{Int}(1)

"""
    next_did()

Increment a global counter and return a tuple of the current worker ID and the incremented
value of the counter.

This tuple is used as a unique ID for a new `DArray`.
"""
next_did() = (myid(), Threads.atomic_add!(DID, 1))

release_localpart(id::Tuple{Int,Int}) = (delete!(REGISTRY, id); nothing)
function release_allparts(id::Tuple{Int,Int}, pids::Array{Int})
    @sync begin
        released_myid = false
        for p in pids
            if p == myid()
                @async release_localpart(id)
                released_myid = true
            else
                @async remotecall_fetch(release_localpart, p, id)
            end
        end
        if !released_myid
            @async release_localpart(id)
        end
    end
    return nothing
end

function close_by_id(id::Tuple{Int,Int}, pids::Array{Int})
    release_allparts(id, pids)
    delete!(REFS, id)
    nothing
end

function d_closeall()
    @lock REFS.lock begin
        while !isempty(REFS.data)
            id = pop!(REFS.data)
            d = d_from_weakref_or_d(id)
            if d isa DArray
                finalize(d)
            end
        end
    end
    return nothing
end

Base.close(d::DArray) = finalize(d)

"""
    procs(d::DArray)

Get the vector of processes storing pieces of DArray `d`.
"""
Distributed.procs(d::DArray)    = d.pids
Distributed.procs(d::SubDArray) = procs(parent(d))

"""
    localpart(A)

The identity when input is not distributed
"""
localpart(A) = A


================================================
FILE: src/darray.jl
================================================
"""
    DArray(init, dims, [procs, dist])

Construct a distributed array.

The parameter `init` is a function that accepts a tuple of index ranges.
This function should allocate a local chunk of the distributed array and initialize it for the specified indices.

`dims` is the overall size of the distributed array.

`procs` optionally specifies a vector of process IDs to use.
If unspecified, the array is distributed over all worker processes only. Typically, when running in distributed mode,
i.e., nprocs() > 1, this would mean that no chunk of the distributed array exists on the process hosting the
interactive julia prompt.

`dist` is an integer vector specifying how many chunks the distributed array should be divided into in each dimension.

For example, the `dfill` function that creates a distributed array and fills it with a value `v` is implemented as:

### Example
```jl
dfill(v, args...) = DArray(I->fill(v, map(length,I)), args...)
```
"""
mutable struct DArray{T,N,A} <: AbstractArray{T,N}
    id::Tuple{Int,Int}
    dims::NTuple{N,Int}
    pids::Array{Int,N}                          # pids[i]==p ⇒ processor p has piece i
    indices::Array{NTuple{N,UnitRange{Int}},N}  # indices held by piece i
    cuts::Vector{Vector{Int}}                   # cuts[d][i] = first index of chunk i in dimension d
    localpart::Union{A,Nothing}

    function DArray{T,N,A}(id::Tuple{Int,Int}, dims::NTuple{N,Int}, pids, indices, cuts, lp) where {T,N,A}
        # check invariants
        if dims != map(last, last(indices))
            throw(ArgumentError("dimension of DArray (dim) and indices do not match"))
        end

        d = d_from_weakref_or_d(id)
        if d === nothing
            d = new(id, dims, pids, indices, cuts, lp)
        end

        if first(id) == myid()
            push!(REFS, id)
            REGISTRY[id] = WeakRef(d)
            finalizer(d) do d
                @async close_by_id(d.id, d.pids)
            end
        end
        d
    end

    DArray{T,N,A}() where {T,N,A} = new()
end

unpack_weakref(x) = x
unpack_weakref(x::WeakRef) = x.value
d_from_weakref_or_d(id::Tuple{Int,Int}) = unpack_weakref(get(REGISTRY, id, nothing))

Base.eltype(::Type{DArray{T}}) where {T} = T
empty_localpart(T,N,A) = A(Array{T}(undef, ntuple(zero, N)))

const SubDArray{T,N,D<:DArray} = SubArray{T,N,D}
const SubOrDArray{T,N} = Union{DArray{T,N}, SubDArray{T,N}}

localtype(::Type{DArray{T,N,S}}) where {T,N,S} = S
localtype(::Type{SubDArray{T,N,D}}) where {T,N,D} = localtype(D)
localtype(A::SubOrDArray) = localtype(typeof(A))
localtype(A::AbstractArray) = typeof(A)

Base.hash(d::DArray, h::UInt) = Base.hash(d.id, h)

## core constructors ##

function DArray(id::Tuple{Int,Int}, init::I, dims, pids, idxs, cuts) where {I}
    localtypes = Vector{DataType}(undef,length(pids))
    if init isa Function
        asyncmap!(localtypes, pids) do pid
            return remotecall_fetch(construct_localparts, pid, init, id, dims, pids, idxs, cuts)
        end
    else
        asyncmap!(localtypes, pids, init) do pid, pid_init
            # constructing from an array of remote refs.
            return remotecall_fetch(construct_localparts, pid, pid_init, id, dims, pids, idxs, cuts)
        end
    end

    if !allequal(localtypes)
        @sync for p in pids
            @async remotecall_wait(release_localpart, p, id)
        end
        throw(ErrorException(lazy"Constructed localparts have different `eltype`: $(localtypes)"))
    end
    A = first(localtypes)

    if myid() in pids
        return unpack_weakref(REGISTRY[id])
    else
        T = eltype(A)
        N = length(dims)
        return DArray{T,N,A}(id, dims, pids, idxs, cuts, empty_localpart(T,N,A))
    end
end

function construct_localparts(init, id, dims, pids, idxs, cuts; T=nothing, A=nothing)
    localpart = isa(init, Function) ? init(idxs[localpartindex(pids)]) : fetch(init)
    if A == nothing
        A = typeof(localpart)
    end
    if T == nothing
        T = eltype(A)
    end
    N = length(dims)
    d = DArray{T,N,A}(id, dims, pids, idxs, cuts, localpart)
    REGISTRY[id] = d
    A
end

function ddata(;T::Type=Any, init::Function=I->nothing, pids=workers(), data::Vector=[])
    pids=sort(vec(pids))
    id = next_did()
    npids = length(pids)
    ldata = length(data)
    idxs, cuts = chunk_idxs([npids], [npids])

    if ldata > 0
        @assert rem(ldata,npids) == 0
        if ldata == npids
            T = eltype(data)
            s = DestinationSerializer(pididx->data[pididx], pids)
            init = I->localpart(s)
        else
            # call the standard distribute function
            return distribute(data)
        end
    end

    @sync for p in pids
        @async remotecall_wait(construct_localparts, p, init, id, (npids,), pids, idxs, cuts; T=T, A=T)
    end

    if myid() in pids
        return unpack_weakref(REGISTRY[id])
    else
        return DArray{T,1,T}(id, (npids,), pids, idxs, cuts, nothing)
    end
end

function gather(d::DArray{T,1,T}) where T
    pids = procs(d)
    a = Vector{T}(undef, length(pids))
    asyncmap!(a, pids) do p
        remotecall_fetch(localpart, p, d)
    end
    a
end

function DArray(init, dims, procs, dist)
    np = prod(dist)
    procs = reshape(procs[1:np], ntuple(i->dist[i], length(dist)))
    idxs, cuts = chunk_idxs([dims...], dist)
    id = next_did()

    return DArray(id, init, dims, procs, idxs, cuts)
end

function DArray(init, dims, procs)
    if isempty(procs)
        throw(ArgumentError("no processors given"))
    end
    return DArray(init, dims, procs, defaultdist(dims, procs))
end
DArray(init, dims) = DArray(init, dims, workers()[1:min(nworkers(), maximum(dims))])

# Create a DArray from a collection of references
# The refs must have the same layout as the parts distributed.
# i.e.
#    size(refs) must specify the distribution of dimensions across processors
#    prod(size(refs)) must equal number of parts
# FIXME : Empty parts are currently not supported.
function DArray(refs)
    dimdist = size(refs)
    id = next_did()

    nsizes = Array{Tuple}(undef, dimdist)
    asyncmap!(nsizes, refs) do r
        remotecall_fetch(sz_localpart_ref, r.where, r, id)
    end

    nindices = Array{NTuple{length(dimdist),UnitRange{Int}}}(undef, dimdist...)

    for i in 1:length(nindices)
        subidx = CartesianIndices(dimdist)[i]
        nindices[i] = ntuple(length(subidx)) do x
            idx_in_dim = subidx[x]
            startidx = 1
            for j in 1:(idx_in_dim-1)
                prevsubidx = ntuple(y -> y == x ? j : subidx[y], length(subidx))
                prevsize = nsizes[prevsubidx...]
                startidx += prevsize[x]
            end
            startidx:startidx+(nsizes[i][x])-1
        end
    end

    lastidxs = hcat([Int[last(idx_in_d)+1 for idx_in_d in idx] for idx in nindices]...)
    ncuts = Array{Int,1}[pushfirst!(sort(unique(lastidxs[x,:])), 1) for x in 1:length(dimdist)]
    ndims = tuple([sort(unique(lastidxs[x,:]))[end]-1 for x in 1:length(dimdist)]...)

    DArray(id, refs, ndims, map(r -> r.where, refs), nindices, ncuts)
end

macro DArray(ex0::Expr)
    if ex0.head !== :comprehension
        throw(ArgumentError("invalid @DArray syntax"))
    end
    ex = ex0.args[1]
    if ex.head !== :generator
        throw(ArgumentError("invalid @DArray syntax"))
    end
    ex.args[1] = esc(ex.args[1])
    ndim = length(ex.args) - 1
    ranges = map(r->esc(r.args[2]), ex.args[2:end])
    for d = 1:ndim
        var = ex.args[d+1].args[1]
        ex.args[d+1] = :( $(esc(var)) = ($(ranges[d]))[I[$d]] )
    end
    return :( DArray((I::Tuple{Vararg{UnitRange{Int}}})->($ex0),
                tuple($(map(r->:(length($r)), ranges)...))) )
end

# new DArray similar to an existing one
DArray(init, d::DArray) = DArray(next_did(), init, size(d), procs(d), d.indices, d.cuts)

sz_localpart_ref(ref, id) = size(fetch(ref))

Base.similar(d::DArray, T::Type, dims::Dims) = DArray(I->Array{T}(undef, map(length,I)), dims, procs(d))
Base.similar(d::DArray, T::Type) = similar(d, T, size(d))
Base.similar(d::DArray{T}, dims::Dims) where {T} = similar(d, T, dims)
Base.similar(d::DArray{T}) where {T} = similar(d, T, size(d))

Base.size(d::DArray) = d.dims

chunktype(d::DArray{T,N,A}) where {T,N,A} = A

## chunk index utilities ##

# decide how to divide each dimension
# returns size of chunks array
function defaultdist(dims, pids)
    dims = [dims...]
    chunks = ones(Int, length(dims))
    np = length(pids)
    f = sort!(collect(keys(factor(np))), rev=true)
    k = 1
    while np > 1
        # repeatedly allocate largest factor to largest dim
        if np % f[k] != 0
            k += 1
            if k > length(f)
                break
            end
        end
        fac = f[k]
        (d, dno) = findmax(dims)
        # resolve ties to highest dim
        dno = findlast(isequal(d), dims)
        if dims[dno] >= fac
            dims[dno] = div(dims[dno], fac)
            chunks[dno] *= fac
        end
        np = div(np, fac)
    end
    return chunks
end

# get array of start indices for dividing sz into nc chunks
function defaultdist(sz::Int, nc::Int)
    if sz >= nc
        chunk_size = div(sz,nc)
        remainder = rem(sz,nc)
        grid = zeros(Int64, nc+1)
        for i = 1:(nc+1)
            grid[i] += (i-1)*chunk_size + 1
            if i<= remainder
                grid[i] += i-1
            else
                grid[i] += remainder
            end
        end
        return grid
    else
        return [[1:(sz+1);]; zeros(Int, nc-sz)]
    end
end

# compute indices array for dividing dims into chunks
function chunk_idxs(dims, chunks)
    cuts = map(defaultdist, dims, chunks)
    n = length(dims)
    idxs = Array{NTuple{n,UnitRange{Int}}}(undef, chunks...)
    for cidx in CartesianIndices(tuple(chunks...))
        idxs[cidx.I...] = ntuple(i -> (cuts[i][cidx[i]]:cuts[i][cidx[i] + 1] - 1), n)
    end
    return (idxs, cuts)
end

function localpartindex(pids::Array{Int})
    mi = myid()
    for i = 1:length(pids)
        if pids[i] == mi
            return i
        end
    end
    return 0
end
localpartindex(d::DArray) = localpartindex(procs(d))

"""
    localpart(d::DArray)

Get the local piece of a distributed array.
Returns an empty array if no local part exists on the calling process.

d[:L], d[:l], d[:LP], d[:lp] are an alternative means to get localparts.
This syntaxt can also be used for assignment. For example,
`d[:L]=v` will assign `v` to the localpart of `d`.
"""
function localpart(d::DArray{T,N,A}) where {T,N,A}
    lpidx = localpartindex(d)
    if lpidx == 0
        return empty_localpart(T,N,A)::A
    end

    return d.localpart::A
end

localpart(d::DArray, localidx...) = localpart(d)[localidx...]

_localindex(i::Integer, offset) = i - offset
_localindex(i::AbstractRange, offset) = (first(i)-offset):step(i):(last(i)-offset)
_localindex(i::AbstractUnitRange, offset) = (first(i)-offset):(last(i)-offset)

"""
    makelocal(A::DArray, I...)

Equivalent to `Array(view(A, I...))` but optimised for the case that the data is local.
Can return a view into `localpart(A)`
"""
@inline function makelocal(A::DArray{<:Any, <:Any, AT}, I::Vararg{Any, N}) where {N, AT}
    J = map(i->Base.unalias(A, i), to_indices(A, I))
    J = map(j-> isa(j, Base.Slice) ? j.indices : j, J)
    @boundscheck checkbounds(A, J...)

    lidcs = localindices(A)
    if Base.checkbounds_indices(Bool, lidcs, J)
        # data we want is local
        viewidcs = ntuple(i -> _localindex(J[i], first(lidcs[i]) - 1), ndims(A))
        view(localpart(A), viewidcs...)
    else
        # Make more efficient (?maybe) by allocating new memory
        # only for the remote part
        viewidcs = ntuple(i -> _localindex(J[i], 0), ndims(A))
        arr = similar(AT, map(length, viewidcs)...)
        copyto!(arr, view(A, viewidcs...))
    end
end

# shortcut to set/get localparts of a distributed object
Base.getindex(d::DArray, s::Symbol) = _getindex(d, s)
Base.getindex(d::DArray{<:Any, 1}, s::Symbol) = _getindex(d, s)
function _getindex(d::DArray, s::Symbol)
    @assert s in [:L, :l, :LP, :lp]
    return localpart(d)
end

function Base.setindex!(d::DArray{T,N,A}, new_lp::A, s::Symbol) where {T,N,A}
    @assert s in [:L, :l, :LP, :lp]
    d.localpart = new_lp
    new_lp
end


# fetch localpart of d at pids[i]
Base.fetch(d::DArray{T,N,A}, i) where {T,N,A} = remotecall_fetch(localpart, d.pids[i], d)

"""
    localindices(d)

A tuple describing the indices owned by the local process.
Returns a tuple with empty ranges if no local part exists on the calling process.
"""
function localindices(d::DArray)
    lpidx = localpartindex(d)
    if lpidx == 0
        return ntuple(i -> 1:0, ndims(d))
    end
    return d.indices[lpidx]
end

# Equality
function Base.:(==)(d::DArray{<:Any,<:Any,A}, a::AbstractArray) where A
    if size(d) != size(a)
        return false
    else
        b = asyncmap(procs(d)) do p
            remotecall_fetch(p) do
                localpart(d) == A(a[localindices(d)...])
            end
        end
        return all(b)
    end
end
function Base.:(==)(d::SubDArray, a::AbstractArray)
    cd = copy(d)
    t = cd == a
    finalize(cd)
    return t
end
Base.:(==)(a::AbstractArray, d::DArray) = d == a
Base.:(==)(a::AbstractArray, d::SubDArray) = d == a
Base.:(==)(d1::DArray, d2::DArray) = invoke(==, Tuple{DArray, AbstractArray}, d1, d2)
function Base.:(==)(d1::SubDArray, d2::DArray)
    cd1 = copy(d1)
    t = cd1 == d2
    finalize(cd1)
    return t
end
function Base.:(==)(d1::DArray, d2::SubDArray)
    cd2 = copy(d2)
    t = d1 == cd2
    finalize(cd2)
    return t
end
function Base.:(==)(d1::SubDArray, d2::SubDArray)
    cd1 = copy(d1)
    t = cd1 == d2
    finalize(cd1)
    return t
end

"""
    locate(d::DArray, I::Int...)

Determine the index of `procs(d)` that hold element `I`.
"""
function locate(d::DArray, I::Int...)
    ntuple(ndims(d)) do i
        fi = searchsortedlast(d.cuts[i], I[i])
        if fi >= length(d.cuts[i])
            throw(ArgumentError("element not contained in array"))
        end
        return fi
    end
end

chunk(d::DArray{T,N,A}, pid::Int) where {T,N,A} = remotecall_fetch(localpart, pid, d)::A

## convenience constructors ##

"""
     dzeros(dims, ...)

Construct a distributed array of zeros.
Trailing arguments are the same as those accepted by `DArray`.
"""
dzeros(dims::Dims, args...) = DArray(I->zeros(map(length,I)), dims, args...)
dzeros(::Type{T}, dims::Dims, args...) where {T} = DArray(I->zeros(T,map(length,I)), dims, args...)
dzeros(::Type{T}, d1::Integer, drest::Integer...) where {T} = dzeros(T, convert(Dims, tuple(d1, drest...)))
dzeros(d1::Integer, drest::Integer...) = dzeros(Float64, convert(Dims, tuple(d1, drest...)))
dzeros(d::Dims) = dzeros(Float64, d)


"""
    dones(dims, ...)

Construct a distributed array of ones.
Trailing arguments are the same as those accepted by `DArray`.
"""
dones(dims::Dims, args...) = DArray(I->ones(map(length,I)), dims, args...)
dones(::Type{T}, dims::Dims, args...) where {T} = DArray(I->ones(T,map(length,I)), dims, args...)
dones(::Type{T}, d1::Integer, drest::Integer...) where {T} = dones(T, convert(Dims, tuple(d1, drest...)))
dones(d1::Integer, drest::Integer...) = dones(Float64, convert(Dims, tuple(d1, drest...)))
dones(d::Dims) = dones(Float64, d)

"""
     dfill(x, dims, ...)

Construct a distributed array filled with value `x`.
Trailing arguments are the same as those accepted by `DArray`.
"""
dfill(v, dims::Dims, args...) = DArray(I->fill(v, map(length,I)), dims, args...)
dfill(v, d1::Integer, drest::Integer...) = dfill(v, convert(Dims, tuple(d1, drest...)))

"""
     drand(dims, ...)

Construct a distributed uniform random array.
Trailing arguments are the same as those accepted by `DArray`.
"""
drand(::Type{T}, dims::Dims) where {T} = DArray(I -> rand(T, map(length, I)), dims)
drand(X, dims::Dims) = DArray(I -> rand(X, map(length, I)), dims)
drand(dims::Dims) = drand(Float64, dims)

drand(::Type{T}, d1::Integer, drest::Integer...) where {T} = drand(T, Dims((d1, drest...)))
drand(X, d1::Integer, drest::Integer...) = drand(X, Dims((d1, drest...)))
drand(d1::Integer, drest::Integer...) = drand(Float64, Dims((d1, drest...)))

# With optional process IDs and number of chunks
for N in (1, 2)
    @eval begin
        drand(::Type{T}, dims::Dims, args::Vararg{Any,$N}) where {T} = DArray(I -> rand(T, map(length, I)), dims, args...)
        drand(X, dims::Dims, args::Vararg{Any,$N}) = DArray(I -> rand(X, map(length, I)), dims, args...)
        drand(dims::Dims, args::Vararg{Any,$N}) = drand(Float64, dims, args...)
    end
end

# Fix method ambiguities
drand(dims::Dims, procs::Tuple{Vararg{Int}}) = drand(Float64, dims, procs)
drand(dims::Dims, procs::Tuple{Vararg{Int}}, dist) = drand(Float64, dims, procs, dist)
drand(X::Tuple{Vararg{Int}}, dim::Integer) = drand(X, Dims((dim,)))
drand(X::Tuple{Vararg{Int}}, d1::Integer, d2::Integer) = drand(X, Dims((d1, d2)))

"""
     drandn(dims, ...)

Construct a distributed normal random array.
Trailing arguments are the same as those accepted by `DArray`.
"""
drandn(dims::Dims, args...) = DArray(I->randn(map(length,I)), dims, args...)
drandn(d1::Integer, drest::Integer...) = drandn(convert(Dims, tuple(d1, drest...)))

## conversions ##

"""
     distribute(A[; procs, dist])

Convert a local array to distributed.

`procs` optionally specifies an array of process IDs to use. (defaults to all workers)
`dist` optionally specifies a vector or tuple of the number of partitions in each dimension
"""
function distribute(A::AbstractArray;
    procs = workers()[1:min(nworkers(), maximum(size(A)))],
    dist = defaultdist(size(A), procs))
    np = prod(dist)
    procs_used = procs[1:np]
    idxs, _ = chunk_idxs([size(A)...], dist)

    s = verified_destination_serializer(reshape(procs_used, size(idxs)), size(idxs)) do pididx
        A[idxs[pididx]...]
    end
    return DArray(I->localpart(s), size(A), procs_used, dist)
end

"""
    distribute(A, DA)

Distribute a local array `A` like the distributed array `DA`.

"""
function distribute(A::AbstractArray, DA::DArray)
    size(DA) == size(A) || throw(DimensionMismatch("Distributed array has size $(size(DA)) but array has $(size(A))"))

    s = verified_destination_serializer(procs(DA), size(DA.indices)) do pididx
        A[DA.indices[pididx]...]
    end
    return DArray(I->localpart(s), DA)
end

DArray{T,N,S}(A::S) where {T,N,S<:AbstractArray} = distribute(convert(AbstractArray{T,N}, A))

function Array{S,N}(d::DArray{T,N}) where {S,T,N}
    a = Array{S}(undef, size(d))
    @sync for (pid, indices) in zip(d.pids, d.indices)
        if !any(isempty, indices)
            @async a[indices...] = chunk(d, pid)
        end
    end
    return a
end

function Array{S,N}(s::SubDArray{T,N}) where {S,T,N}
    I = s.indices
    d = parent(s)
    if isa(I,Tuple{Vararg{UnitRange{Int}}}) && S<:T && T<:S && !isempty(s)
        l = locate(d, map(first, I)...)
        if isequal(d.indices[l...], I)
            # SubDArray corresponds to a chunk
            return chunk(d, d.pids[l...])
        end
    end
    a = Array{S}(undef, size(s))
    copyto!(a, s)
end

function Base.copyto!(a::Array, s::SubDArray)
    N = ndims(a)
    a[[1:size(a,i) for i=1:N]...] = s
    return a
end

function DArray(SD::SubArray{T,N}) where {T,N}
    D = SD.parent
    DArray(size(SD), procs(D)) do I
        lindices = Base.reindex(SD.indices, I)
        convert(Array, D[lindices...])
    end
end

function Base.reshape(A::DArray{T,1,S}, d::Dims) where {T,S<:Array}
    if prod(d) != length(A)
        throw(DimensionMismatch("dimensions must be consistent with array size"))
    end
    return DArray(d) do I
        sz = map(length,I)
        d1offs = first(I[1])
        nd = length(I)

        B = Array{T}(undef, sz)
        nr = size(B,1)
        sztail = size(B)[2:end]

        for i=1:div(length(B),nr)
            i2 = CartesianIndices(sztail)[i]
            globalidx = [ I[j][i2[j-1]] for j=2:nd ]

            a = LinearIndices(d)[d1offs, globalidx...]

            B[:,i] = Array(A[a:(a+nr-1)])
        end
        B
    end
end

## indexing ##
const _allowscalar = Ref(true)
allowscalar(flag = true) = (_allowscalar[] = flag)
_scalarindexingallowed() = _allowscalar[] || throw(ErrorException("scalar indexing disabled"))

getlocalindex(d::DArray, idx...) = localpart(d)[idx...]
function getindex_tuple(d::DArray{T,N}, I::NTuple{N,Int}) where {T,N}
    chidx = locate(d, I...)
    idxs = d.indices[chidx...]
    localidx = ntuple(i -> (I[i] - first(idxs[i]) + 1), ndims(d))
    pid = d.pids[chidx...]
    return remotecall_fetch(getlocalindex, pid, d, localidx...)::T
end

function Base.getindex(d::DArray, i::Int)
    _scalarindexingallowed()
    return getindex_tuple(d, Tuple(CartesianIndices(d)[i]))
end
function Base.getindex(d::DArray{<:Any,N}, i::Vararg{Int,N}) where {N}
    _scalarindexingallowed()
    return getindex_tuple(d, i)
end
Base.getindex(d::DArray) = d[1]
Base.getindex(d::SubDArray, I::Int...) = invoke(getindex, Tuple{SubArray{<:Any,N},Vararg{Int,N}} where N, d, I...)
Base.getindex(d::SubOrDArray, I::Union{Int,UnitRange{Int},Colon,Vector{Int},StepRange{Int,Int}}...) = view(d, I...)

function Base.isassigned(D::DArray, i::Integer...)
    try
        getindex_tuple(D, i)
        true
    catch e
        if isa(e, BoundsError) || isa(e, UndefRefError)
            return false
        else
            rethrow(e)
        end
    end
end

Base.copy(d::SubDArray) = copyto!(similar(d), d)
Base.copy(d::SubDArray{<:Any,2}) = copyto!(similar(d), d)

function Base.copyto!(dest::SubOrDArray, src::AbstractArray)
    @sync for p in procs(dest)
        @async remotecall_wait(p) do
            ldest = localpart(dest)
            copyto!(ldest, view(src, localindices(dest)...))
        end
    end
    return dest
end

function Base.deepcopy(src::DArray)
    dest = similar(src)
    @sync for p in procs(src)
        @async remotecall_wait(p) do
            dest[:L] = deepcopy(src[:L])
        end
    end
    return dest
end
# We also want to optimize setindex! with a SubDArray source, but this is hard
# and only works on 0.5.

# Similar to Base.indexin, but just create a logical mask. Note that this
# must return a logical mask in order to support merging multiple masks
# together into one linear index since we need to know how many elements to
# skip at the end. In many cases range intersection would be much faster
# than generating a logical mask, but that loses the endpoint information.
indexin_mask(a, b::Number) = a .== b
indexin_mask(a, r::AbstractRange{Int}) = [i in r for i in a]
indexin_mask(a, b::AbstractArray{Int}) = indexin_mask(a, IntSet(b))
indexin_mask(a, b::AbstractArray) = indexin_mask(a, Set(b))
indexin_mask(a, b) = [i in b for i in a]

import Base: tail
# Given a tuple of indices and a tuple of masks, restrict the indices to the
# valid regions. This is, effectively, reversing Base.setindex_shape_check.
# We can't just use indexing into MergedIndices here because getindex is much
# pickier about singleton dimensions than setindex! is.
restrict_indices(::Tuple{}, ::Tuple{}) = ()
function restrict_indices(a::Tuple{Any, Vararg{Any}}, b::Tuple{Any, Vararg{Any}})
    if (length(a[1]) == length(b[1]) == 1) || (length(a[1]) > 1 && length(b[1]) > 1)
        (vec(a[1])[vec(b[1])], restrict_indices(tail(a), tail(b))...)
    elseif length(a[1]) == 1
        (a[1], restrict_indices(tail(a), b))
    elseif length(b[1]) == 1 && b[1][1]
        restrict_indices(a, tail(b))
    else
        throw(DimensionMismatch("this should be caught by setindex_shape_check; please submit an issue"))
    end
end
# The final indices are funky - they're allowed to accumulate together.
# An easy (albeit very inefficient) fix for too many masks is to use the
# outer product to merge them. But we can do that lazily with a custom type:
function restrict_indices(a::Tuple{Any}, b::Tuple{Any, Any, Vararg{Any}})
    (vec(a[1])[vec(ProductIndices(b, map(length, b)))],)
end
# But too many indices is much harder; this requires merging the indices
# in `a` before applying the final mask in `b`.
function restrict_indices(a::Tuple{Any, Any, Vararg{Any}}, b::Tuple{Any})
    if length(a[1]) == 1
        (a[1], restrict_indices(tail(a), b))
    else
        # When one mask spans multiple indices, we need to merge the indices
        # together. At this point, we can just use indexing to merge them since
        # there's no longer special handling of singleton dimensions
        (view(MergedIndices(a, map(length, a)), b[1]),)
    end
end

struct ProductIndices{I,N} <: AbstractArray{Bool, N}
    indices::I
    sz::NTuple{N,Int}
end
Base.size(P::ProductIndices) = P.sz
# This gets passed to map to avoid breaking propagation of inbounds
Base.@propagate_inbounds propagate_getindex(A, I...) = A[I...]
Base.@propagate_inbounds Base.getindex(P::ProductIndices{J,N}, I::Vararg{Int, N}) where {J,N} =
    Bool((&)(map(propagate_getindex, P.indices, I)...))

struct MergedIndices{I,N} <: AbstractArray{CartesianIndex{N}, N}
    indices::I
    sz::NTuple{N,Int}
end
Base.size(M::MergedIndices) = M.sz
Base.@propagate_inbounds Base.getindex(M::MergedIndices{J,N}, I::Vararg{Int, N}) where {J,N} =
    CartesianIndex(map(propagate_getindex, M.indices, I))
# Additionally, we optimize bounds checking when using MergedIndices as an
# array index since checking, e.g., A[1:500, 1:500] is *way* faster than
# checking an array of 500^2 elements of CartesianIndex{2}. This optimization
# also applies to reshapes of MergedIndices since the outer shape of the
# container doesn't affect the index elements themselves. We can go even
# farther and say that even restricted views of MergedIndices must be valid
# over the entire array. This is overly strict in general, but in this
# use-case all the merged indices must be valid at some point, so it's ok.
const ReshapedMergedIndices{T,N,M<:MergedIndices} = Base.ReshapedArray{T,N,M}
const SubMergedIndices{T,N,M<:Union{MergedIndices, ReshapedMergedIndices}} = SubArray{T,N,M}
const MergedIndicesOrSub = Union{MergedIndices, ReshapedMergedIndices, SubMergedIndices}
@inline Base.checkbounds_indices(::Type{Bool}, inds::Tuple{}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) =
    Base.checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...))
@inline Base.checkbounds_indices(::Type{Bool}, inds::Tuple{Any}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) =
    Base.checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...))
@inline Base.checkbounds_indices(::Type{Bool}, inds::Tuple, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) =
    Base.checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...))

# The tricky thing here is that we want to optimize the accesses into the
# distributed array, but in doing so, we lose track of which indices in I we
# should be using.
#
# I’ve come to the conclusion that the function is utterly insane.
# There are *6* flavors of indices with four different reference points:
# 1. Find the indices of each portion of the DArray.
# 2. Find the valid subset of indices for the SubArray into that portion.
# 3. Find the portion of the `I` indices that should be used when you access the
#    `K` indices in the subarray.  This guy is nasty.  It’s totally backwards
#    from all other arrays, wherein we simply iterate over the source array’s
#    elements.  You need to *both* know which elements in `J` were skipped
#    (`indexin_mask`) and which dimensions should match up (`restrict_indices`)
# 4. If `K` doesn't correspond to an entire chunk, reinterpret `K` in terms of
#    the local portion of the source array
function Base.setindex!(a::Array, s::SubDArray,
        I::Union{UnitRange{Int},Colon,Vector{Int},StepRange{Int,Int}}...)
    Inew = Base.to_indices(a, I)
    Base.setindex_shape_check(s, Base.index_lengths(Inew...)...)
    d = parent(s)
    J = Base.to_indices(d, s.indices)
    @sync for (pid, K_c) in zip(d.pids, d.indices)
        K = map(intersect, J, K_c)
        if !any(isempty, K)
            K_mask = map(indexin_mask, J, K_c)
            idxs = restrict_indices(Inew, K_mask)
            if isequal(K, K_c)
                # whole chunk
                @async a[idxs...] = chunk(d, pid)
            else
                # partial chunk
                localidxs = map((Kj, K_cj) -> Kj .- (first(K_cj) - 1), K, K_c)
                @async a[idxs...] = remotecall_fetch((d, idxs) -> localpart(d)[idxs...], pid, d, localidxs)
            end
        end
    end
    return a
end

function Base.fill!(A::DArray, x)
    @sync for p in procs(A)
        @async remotecall_wait((A,x)->fill!(localpart(A), x), p, A, x)
    end
    return A
end

function Random.rand!(A::DArray, ::Type{T}) where T
    @sync for p in procs(A)
        @async remotecall_wait((A, T)->rand!(localpart(A), T), p, A, T)
    end
    return A
end


================================================
FILE: src/linalg.jl
================================================
function Base.copy(Dadj::Adjoint{T,<:DArray{T,2}}) where T
    D = parent(Dadj)
    DArray(reverse(size(D)), procs(D)) do I
        lp = Array{T}(undef, map(length, I))
        rp = convert(Array, D[reverse(I)...])
        adjoint!(lp, rp)
    end
end

function Base.copy(Dtr::Transpose{T,<:DArray{T,2}}) where T
    D = parent(Dtr)
    DArray(reverse(size(D)), procs(D)) do I
        lp = Array{T}(undef, map(length, I))
        rp = convert(Array, D[reverse(I)...])
        transpose!(lp, rp)
    end
end

const DVector{T,A} = DArray{T,1,A}
const DMatrix{T,A} = DArray{T,2,A}

# Level 1

function LinearAlgebra.axpy!(α, x::DArray, y::DArray)
    if length(x) != length(y)
        throw(DimensionMismatch("vectors must have same length"))
    end
    @sync for p in procs(y)
        @async remotecall_wait(p) do
            axpy!(α, localpart(x), localpart(y))
        end
    end
    return y
end

function LinearAlgebra.dot(x::DVector, y::DVector)
    if length(x) != length(y)
        throw(DimensionMismatch(""))
    end

    results = asyncmap(procs(x)) do p
        remotecall_fetch((x, y) -> dot(localpart(x), makelocal(y, localindices(x)...)), p, x, y)
    end
    return reduce(+, results)
end

function LinearAlgebra.norm(x::DArray, p::Real = 2)
    results = asyncmap(procs(x)) do pp
        remotecall_fetch(() -> norm(localpart(x), p), pp)
    end
    return norm(results, p)
end

function LinearAlgebra.rmul!(A::DArray, x::Number)
    @sync for p in procs(A)
        @async remotecall_wait((A,x)->rmul!(localpart(A), x), p, A, x)
    end
    return A
end

# Level 2
function add!(dest, src, scale = one(dest[1]))
    if length(dest) != length(src)
        throw(DimensionMismatch("source and destination arrays must have same number of elements"))
    end
    if scale == one(scale)
        @simd for i = eachindex(dest)
            @inbounds dest[i] += src[i]
        end
    else
        @simd for i = eachindex(dest)
            @inbounds dest[i] += scale*src[i]
        end
    end
    return dest
end

function LinearAlgebra.mul!(y::DVector, A::DMatrix, x::AbstractVector, α::Number = 1, β::Number = 0)

    # error checks
    if size(A, 2) != length(x)
        throw(DimensionMismatch(""))
    end
    if y.cuts[1] != A.cuts[1]
        throw(ArgumentError("cuts of output vector must match cuts of first dimension of matrix"))
    end

    # Multiply on each tile of A
    R = Array{Future}(undef, size(A.pids))
    for j = 1:size(A.pids, 2)
        xj = x[A.cuts[2][j]:A.cuts[2][j + 1] - 1]
        for i = 1:size(A.pids, 1)
            R[i,j] = remotecall(procs(A)[i,j]) do
                localpart(A)*convert(localtype(x), xj)
            end
        end
    end

    # Scale y if necessary
    if β != one(β)
        asyncmap(procs(y)) do p
            remotecall_wait(p) do
                if !iszero(β)
                    rmul!(localpart(y), β)
                else
                    fill!(localpart(y), 0)
                end
            end
        end
    end

    # Update y
    @sync for i = 1:size(R, 1)
        p = y.pids[i]
        for j = 1:size(R, 2)
            rij = R[i,j]
            @async remotecall_wait(() -> add!(localpart(y), fetch(rij), α), p)
        end
    end

    return y
end

function LinearAlgebra.mul!(y::DVector, adjA::Adjoint{<:Number,<:DMatrix}, x::AbstractVector, α::Number = 1, β::Number = 0)

    A = parent(adjA)

    # error checks
    if size(A, 1) != length(x)
        throw(DimensionMismatch(""))
    end
    if y.cuts[1] != A.cuts[2]
        throw(ArgumentError("cuts of output vector must match cuts of second dimension of matrix"))
    end

    # Multiply on each tile of A
    R = Array{Future}(undef, reverse(size(A.pids)))
    for j = 1:size(A.pids, 1)
        xj = x[A.cuts[1][j]:A.cuts[1][j + 1] - 1]
        for i = 1:size(A.pids, 2)
            R[i,j] = remotecall(() -> localpart(A)'*convert(localtype(x), xj), procs(A)[j,i])
        end
    end

    # Scale y if necessary
    if β != one(β)
        @sync for p in procs(y)
            @async remotecall_wait(p) do
                if !iszero(β)
                    rmul!(localpart(y), β)
                else
                    fill!(localpart(y), 0)
                end
            end
        end
    end

    # Update y
    @sync for i = 1:size(R, 1)
        p = y.pids[i]
        for j = 1:size(R, 2)
            rij = R[i,j]
            @async remotecall_wait(() -> add!(localpart(y), fetch(rij), α), p)
        end
    end
    return y
end

function LinearAlgebra.lmul!(D::Diagonal, DA::DMatrix)
    d = D.diag
    s = verified_destination_serializer(procs(DA), size(DA.indices)) do pididx
        d[DA.indices[pididx][1]]
    end
    map_localparts!(DA) do lDA
        lmul!(Diagonal(localpart(s)), lDA)
    end
end

function LinearAlgebra.rmul!(DA::DMatrix, D::Diagonal)
    d = D.diag
    s = verified_destination_serializer(procs(DA), size(DA.indices)) do pididx
        d[DA.indices[pididx][2]]
    end
    map_localparts!(DA) do lDA
        rmul!(lDA, Diagonal(localpart(s)))
    end
end

# Level 3
function _matmatmul!(C::DMatrix, A::DMatrix, B::AbstractMatrix, α::Number, β::Number, tA)
    # error checks
    Ad1, Ad2 = (tA == 'N') ? (1,2) : (2,1)
    mA, nA = (size(A, Ad1), size(A, Ad2))
    mB, nB = size(B)
    if mB != nA
        throw(DimensionMismatch("matrix A has dimensions ($mA, $nA), matrix B has dimensions ($mB, $nB)"))
    end
    if size(C,1) != mA || size(C,2) != nB
        throw(DimensionMismatch("result C has dimensions $(size(C)), needs ($mA, $nB)"))
    end
    if C.cuts[1] != A.cuts[Ad1]
        throw(ArgumentError("cuts of the first dimension of the output matrix must match cuts of dimension $Ad1 of the first input matrix"))
    end

    # Multiply on each tile of A
    if tA == 'N'
        R = Array{Future}(undef, size(procs(A))..., size(procs(C), 2))
    else
        R = Array{Future}(undef, reverse(size(procs(A)))..., size(procs(C), 2))
    end
    for j = 1:size(A.pids, Ad2)
        for k = 1:size(C.pids, 2)
            Acuts = A.cuts[Ad2]
            Ccuts = C.cuts[2]
            Bjk = B[Acuts[j]:Acuts[j + 1] - 1, Ccuts[k]:Ccuts[k + 1] - 1]
            for i = 1:size(A.pids, Ad1)
                p = (tA == 'N') ? procs(A)[i,j] : procs(A)[j,i]
                R[i,j,k] = remotecall(p) do
                    if tA == 'T'
                        return transpose(localpart(A))*convert(localtype(B), Bjk)
                    elseif tA == 'C'
                        return adjoint(localpart(A))*convert(localtype(B), Bjk)
                    else
                        return localpart(A)*convert(localtype(B), Bjk)
                    end
                end
            end
        end
    end

    # Scale C if necessary
    if β != one(β)
        @sync for p in C.pids
            if iszero(β)
                @async remotecall_wait(() -> fill!(localpart(C), 0), p)
            else
                @async remotecall_wait(() -> rmul!(localpart(C), β), p)
            end
        end
    end

    # Update C
    @sync for i = 1:size(R, 1)
        for k = 1:size(C.pids, 2)
            p = C.pids[i,k]
            for j = 1:size(R, 2)
                rijk = R[i,j,k]
                @async remotecall_wait(d -> add!(localpart(d), fetch(rijk), α), p, C)
            end
        end
    end
    return C
end

LinearAlgebra.mul!(C::DMatrix, A::DMatrix, B::AbstractMatrix, α::Number = 1, β::Number = 0) = _matmatmul!(C, A, B, α, β, 'N')
LinearAlgebra.mul!(C::DMatrix, A::Adjoint{<:Number,<:DMatrix}, B::AbstractMatrix, α::Number = 1, β::Number = 0) = _matmatmul!(C, parent(A), B, α, β, 'C')
LinearAlgebra.mul!(C::DMatrix, A::Transpose{<:Number,<:DMatrix}, B::AbstractMatrix, α::Number = 1, β::Number = 0) = _matmatmul!(C, parent(A), B, α, β, 'T')

_matmul_op = (t,s) -> t*s + t*s

function Base.:*(A::DMatrix, x::AbstractVector)
    T = Base.promote_op(_matmul_op, eltype(A), eltype(x))
    y = DArray(I -> Array{T}(undef, map(length, I)), (size(A, 1),), procs(A)[:,1], (size(procs(A), 1),))
    return mul!(y, A, x)
end
function Base.:*(A::DMatrix, B::AbstractMatrix)
    T = Base.promote_op(_matmul_op, eltype(A), eltype(B))
    C = DArray(I -> Array{T}(undef, map(length, I)),
            (size(A, 1), size(B, 2)),
            procs(A)[:,1:min(size(procs(A), 2), size(procs(B), 2))],
            (size(procs(A), 1), min(size(procs(A), 2), size(procs(B), 2))))
    return mul!(C, A, B)
end

function Base.:*(adjA::Adjoint{<:Any,<:DMatrix}, x::AbstractVector)
    A = parent(adjA)
    T = Base.promote_op(_matmul_op, eltype(A), eltype(x))
    y = DArray(I -> Array{T}(undef, map(length, I)),
            (size(A, 2),),
            procs(A)[1,:],
            (size(procs(A), 2),))
    return mul!(y, adjA, x)
end
function Base.:*(adjA::Adjoint{<:Any,<:DMatrix}, B::AbstractMatrix)
    A = parent(adjA)
    T = Base.promote_op(_matmul_op, eltype(A), eltype(B))
    C = DArray(I -> Array{T}(undef, map(length, I)), (size(A, 2),
        size(B, 2)),
        procs(A)[1:min(size(procs(A), 1), size(procs(B), 2)),:],
        (size(procs(A), 2), min(size(procs(A), 1), size(procs(B), 2))))
    return mul!(C, adjA, B)
end

function Base.:*(trA::Transpose{<:Any,<:DMatrix}, x::AbstractVector)
    A = parent(trA)
    T = Base.promote_op(_matmul_op, eltype(A), eltype(x))
    y = DArray(I -> Array{T}(undef, map(length, I)),
            (size(A, 2),),
            procs(A)[1,:],
            (size(procs(A), 2),))
    return mul!(y, trA, x)
end
function Base.:*(trA::Transpose{<:Any,<:DMatrix}, B::AbstractMatrix)
    A = parent(trA)
    T = Base.promote_op(_matmul_op, eltype(A), eltype(B))
    C = DArray(I -> Array{T}(undef, map(length, I)), (size(A, 2),
        size(B, 2)),
        procs(A)[1:min(size(procs(A), 1), size(procs(B), 2)),:],
        (size(procs(A), 2), min(size(procs(A), 1), size(procs(B), 2))))
    return mul!(C, trA, B)
end


================================================
FILE: src/mapreduce.jl
================================================
## higher-order functions ##

Base.map(f, d0::DArray, ds::AbstractArray...) = broadcast(f, d0, ds...)

function Base.map!(f::F, dest::DArray, src::DArray{<:Any,<:Any,A}) where {F,A}
    @sync for p in procs(dest)
        @async remotecall_wait(p) do
            map!(f, localpart(dest), makelocal(src, localindices(dest)...))
        end
    end
    return dest
end

# Only defining `reduce(f, ::DArray)` causes method ambiguity issues with
# - `reduce(hcat, ::AbstractVector{<:AbstractVecOrMat})`
# - `reduce(vcat, ::AbstractVector{<:AbstractVecOrMat})`
Base.reduce(f, d::DArray) = _reduce(f, d)
Base.reduce(::typeof(hcat), d::DArray{<:AbstractVecOrMat, 1}) = _reduce(hcat, d)
Base.reduce(::typeof(vcat), d::DArray{<:AbstractVecOrMat, 1}) = _reduce(vcat, d)
function _reduce(f, d::DArray)
    results = asyncmap(procs(d)) do p
        remotecall_fetch(p) do
            return reduce(f, localpart(d))
        end
    end
    reduce(f, results)
end

function Base._mapreduce(f, op, ::IndexCartesian, d::DArray)
    results = asyncmap(procs(d)) do p
        remotecall_fetch((_f,_op,_d)->mapreduce(_f, _op, localpart(_d)), p, f, op, d)
    end

    reduce(op, results)
end
Base._mapreduce(f, op, ::IndexCartesian, d::SubDArray) = Base._mapreduce(f, op, IndexCartesian(), DArray(d))
# Base.mapreduce(f, opt::Union{typeof(|), typeof(&)}, d::DArray) = _mapreduce(f, opt, d)
# Base.mapreduce(f, opt::Function, d::DArray) = _mapreduce(f, opt, d)
# Base.mapreduce(f, opt, d::DArray) = _mapreduce(f, opt, d)

# mapreducedim
function Base.reducedim_initarray(A::DArray, region, v0, ::Type{R}) where {R}
    # Store reduction on lowest pids
    pids = A.pids[ntuple(i -> i in region ? (1:1) : (:), ndims(A))...]
    chunks = similar(pids, Future)
    asyncmap!(chunks, pids) do p
        remotecall_wait(() -> Base.reducedim_initarray(localpart(A), region, v0, R), p)
    end
    return DArray(chunks)
end
Base.reducedim_initarray(A::DArray, region, v0::T) where {T} = Base.reducedim_initarray(A, region, v0, T)

# Compute mapreducedim of each localpart and store the result in a new DArray
function mapreducedim_within(f, op, A::DArray, region)
    arraysize = [size(A)...]
    gridsize = [size(A.indices)...]
    arraysize[[region...]] = gridsize[[region...]]
    indx = similar(A.indices)

    for i in CartesianIndices(indx)
        indx[i] = ntuple(j -> j in region ? (i.I[j]:i.I[j]) : A.indices[i][j], ndims(A))
    end
    cuts = [i in region ? collect(1:arraysize[i] + 1) : A.cuts[i] for i in 1:ndims(A)]
    return DArray(next_did(), I -> mapreduce(f, op, localpart(A), dims=region),
        tuple(arraysize...), procs(A), indx, cuts)
end

# Compute mapreducedim across the processes. This should be done after mapreducedim
# has been run on each localpart with mapreducedim_within. Eventually, we might
# want to write mapreducedim_between! as a binary reduction.
function mapreducedim_between!(f, op, R::DArray, A::DArray, region)
    @sync for p in procs(R)
        @async remotecall_wait(p, f, op, R, A, region) do f, op, R, A, region
            localind = [r for r = localindices(A)]
            localind[[region...]] = [1:n for n = size(A)[[region...]]]
            B = convert(Array, A[localind...])
            Base.mapreducedim!(f, op, localpart(R), B)
        end
    end
    return R
end

function Base.mapreducedim!(f, op, R::DArray, A::DArray)
    lsize = Base.check_reducedims(R,A)
    if isempty(A)
        return copy(R)
    end
    region = tuple(collect(1:ndims(A))[[size(R)...] .!= [size(A)...]]...)
    if isempty(region)
        return copyto!(R, A)
    end
    B = mapreducedim_within(f, op, A, region)
    return mapreducedim_between!(identity, op, R, B, region)
end

## Some special cases
function Base._all(f, A::DArray, ::Colon)
    B = asyncmap(procs(A)) do p
        remotecall_fetch(p) do
            all(f, localpart(A))
        end
    end
    return all(B)
end

function Base._any(f, A::DArray, ::Colon)
    B = asyncmap(procs(A)) do p
        remotecall_fetch(p) do
            any(f, localpart(A))
        end
    end
    return any(B)
end

function Base.count(f, A::DArray)
    B = asyncmap(procs(A)) do p
        remotecall_fetch(p) do
            count(f, localpart(A))
        end
    end
    return sum(B)
end

function Base.extrema(d::DArray)
    r = asyncmap(procs(d)) do p
        remotecall_fetch(p) do
            extrema(localpart(d))
        end
    end
    return reduce((t,s) -> (min(t[1], s[1]), max(t[2], s[2])), r)
end

# Unary vector functions
Base.:(-)(D::DArray) = map(-, D)


map_localparts(f::Callable, d::DArray) = DArray(i->f(localpart(d)), d)
map_localparts(f::Callable, d1::DArray, d2::DArray) = DArray(d1) do I
    f(localpart(d1), localpart(d2))
end

function map_localparts(f::Callable, DA::DArray, A::Array)
    s = verified_destination_serializer(procs(DA), size(DA.indices)) do pididx
        A[DA.indices[pididx]...]
    end
    DArray(DA) do I
        f(localpart(DA), localpart(s))
    end
end

function map_localparts(f::Callable, A::Array, DA::DArray)
    s = verified_destination_serializer(procs(DA), size(DA.indices)) do pididx
        A[DA.indices[pididx]...]
    end
    DArray(DA) do I
        f(localpart(s), localpart(DA))
    end
end

function map_localparts!(f::Callable, d::DArray)
    @sync for p in procs(d)
        @async remotecall_wait((f,d)->f(localpart(d)), p, f, d)
    end
    return d
end

# Here we assume all the DArrays have
# the same size and distribution
map_localparts(f::Callable, As::DArray...) = DArray(I->f(map(localpart, As)...), As[1])


function samedist(A::DArray, B::DArray)
    (size(A) == size(B)) || throw(DimensionMismatch())
    if (procs(A) != procs(B)) || (A.cuts != B.cuts)
        B = DArray(x->B[x...], A)
    end
    B
end

for f in (:+, :-, :div, :mod, :rem, :&, :|, :xor)
    @eval begin
        function Base.$f(A::DArray{T}, B::DArray{T}) where T
            B = samedist(A, B)
            map_localparts($f, A, B)
        end
        Base.$f(A::DArray{T}, B::Array{T}) where {T} = map_localparts($f, A, B)
        Base.$f(A::Array{T}, B::DArray{T}) where {T} = map_localparts($f, A, B)
    end
end

function Base.mapslices(f, D::DArray{T,N,A}; dims) where {T,N,A}
    if !(dims isa AbstractVector)
        dims = [dims...]
    end
    if !all(t -> t == 1, size(D.indices)[dims])
        p = ones(Int, ndims(D))
        nondims = filter(t -> !(t in dims), 1:ndims(D))
        p[nondims] = defaultdist([size(D)...][[nondims...]], procs(D))
        DD = DArray(size(D), procs(D), p) do I
            return convert(A, D[I...])
        end
        return mapslices(f, DD, dims=dims)
    end

    refs = Future[remotecall((x,y,z)->mapslices(x,localpart(y),dims=z), p, f, D, dims) for p in procs(D)]

    DArray(reshape(refs, size(procs(D))))
end

function _ppeval(f, A...; dim = map(ndims, A))
    if length(dim) != length(A)
        throw(ArgumentError("dim argument has wrong length. length(dim) = $(length(dim)) but should be $(length(A))"))
    end
    narg = length(A)
    dimlength = size(A[1], dim[1])
    for i = 2:narg
        if dim[i] > 0 && dimlength != size(A[i], dim[i])
            throw(ArgumentError("lengths of broadcast dimensions must be the same. size(A[1], $(dim[1])) = $dimlength but size(A[$i], $(dim[i])) = $(size(A[i], dim[i]))"))
        end
    end
    dims = []
    idx  = []
    args = []
    for i = 1:narg
        push!(dims, ndims(A[i]))
        push!(idx, Any[Colon() for d in 1:dims[i]])
        if dim[i] > 0
            idx[i][dim[i]] = 1
            push!(args, view(A[i], idx[i]...))
        else
            push!(args, A[i])
        end
    end
    R1 = f(args...)
    ridx = Any[1:size(R1, d) for d in 1:ndims(R1)]
    push!(ridx, 1)
    Rsize = map(last, ridx)
    Rsize[end] = dimlength
    R = Array{eltype(R1)}(undef, Rsize...)

    for i = 1:dimlength
        for j = 1:narg
            if dim[j] > 0
                idx[j][dim[j]] = i
                args[j] = view(A[j], idx[j]...)
            else
                args[j] = A[j]
            end
        end
        ridx[end] = i
        R[ridx...] = f(args...)
    end

    return R
end

"""
     ppeval(f, D...; dim::NTuple)

Evaluates the callable argument `f` on slices of the elements of the `D` tuple.

#### Arguments
`f` can be any callable object that accepts sliced or broadcasted elements of `D`.
The result returned from `f` must be either an array or a scalar.

`D` has any number of elements and the elements can have any type. If an element
of `D` is a distributed array along the dimension specified by `dim`. If an
element of `D` is not distributed, the element is by default broadcasted and
applied on all evaluations of `f`.

`dim` is a tuple of integers specifying the dimension over which the elements
of `D` is slices. The length of the tuple must therefore be the same as the
number of arguments `D`. By default distributed arrays are slides along the
last dimension. If the value is less than or equal to zero the element are
broadcasted to all evaluations of `f`.

#### Result
`ppeval` returns a distributed array of dimension `p+1` where the first `p`
sizes correspond to the sizes of return values of `f`. The last dimension of
the return array from `ppeval` has the same length as the dimension over which
the input arrays are sliced.

#### Examples
```jl
addprocs(Sys.CPU_THREADS)

using DistributedArrays

A = drandn((10, 10, Sys.CPU_THREADS), workers(), [1, 1, Sys.CPU_THREADS])

ppeval(eigvals, A)

ppeval(eigvals, A, randn(10,10)) # broadcasting second argument

B = drandn((10, Sys.CPU_THREADS), workers(), [1, Sys.CPU_THREADS])

ppeval(*, A, B)
```
"""
function ppeval(f, D...; dim::NTuple = map(t -> isa(t, DArray) ? ndims(t) : 0, D))
    #Ensure that the complete DArray is available on the specified dims on all processors
    for i = 1:length(D)
        if isa(D[i], DArray)
            for idxs in D[i].indices
                for d in setdiff(1:ndims(D[i]), dim[i])
                    if length(idxs[d]) != size(D[i], d)
                        throw(DimensionMismatch(string("dimension $d is distributed. ",
                            "ppeval requires dimension $d to be completely available on all processors.")))
                    end
                end
            end
        end
    end

    refs = Future[remotecall((x, y, z) -> _ppeval(x, map(localpart, y)...; dim = z), p, f, D, dim) for p in procs(D[1])]

    # The array of Futures has to be reshaped for the DArray constructor to work correctly.
    # This requires a fetch and the DArray is also fetching so it might be better to modify
    # the DArray constructor.
    sd = [size(D[1].pids)...]
    nd = remotecall_fetch((r)->ndims(fetch(r)), refs[1].where, refs[1])
    DArray(reshape(refs, tuple([sd[1:nd - 1]; sd[end]]...)))
end


================================================
FILE: src/serialize.jl
================================================
function Serialization.serialize(S::AbstractSerializer, d::DArray{T,N,A}) where {T,N,A}
    # Only send the ident for participating workers - we expect the DArray to exist in the
    # remote registry. DO NOT send the localpart.
    destpid = worker_id_from_socket(S.io)
    Serialization.serialize_type(S, typeof(d))
    if (destpid in d.pids) || (destpid == d.id[1])
        serialize(S, (true, d.id))    # (id_only, id)
    else
        serialize(S, (false, d.id))
        for n in [:dims, :pids, :indices, :cuts]
            serialize(S, getfield(d, n))
        end
        serialize(S, A)
    end
end

function Serialization.deserialize(S::AbstractSerializer, t::Type{DT}) where DT<:DArray
    what = deserialize(S)
    id_only = what[1]
    id = what[2]

    if id_only
        d = d_from_weakref_or_d(id)
        if d === nothing
            # access to fields will throw an error, at least the deserialization process will not
            # result in worker death
            d = DT()
            d.id = id
        end
        return d
    else
        # We are not a participating worker, deser fields and instantiate locally.
        dims = deserialize(S)
        pids = deserialize(S)
        indices = deserialize(S)
        cuts = deserialize(S)
        A = deserialize(S)
        T=eltype(DT)
        N=length(dims)
        return DT(id, dims, pids, indices, cuts, empty_localpart(T,N,A))
    end
end

# Serialize only those parts of the object as required by the destination worker.
mutable struct DestinationSerializer
    generate::Union{Function,Nothing}  # Function to generate the part to be serialized
    pids::Union{Array,Nothing}         # MUST have the same shape as the distribution
    deser_obj::Any                     # Deserialized part

    DestinationSerializer(f,p,d) = new(f,p,d)
end

DestinationSerializer(f::Function, pids::Array) = DestinationSerializer(f, pids, nothing)

# constructs a DestinationSerializer after verifying that the shape of pids.
function verified_destination_serializer(f::Function, pids::Array, verify_size)
    @assert size(pids) == verify_size
    return DestinationSerializer(f, pids)
end

DestinationSerializer(deser_obj::Any) = DestinationSerializer(nothing, nothing, deser_obj)

function Serialization.serialize(S::AbstractSerializer, s::DestinationSerializer)
    pid = worker_id_from_socket(S.io)
    pididx = findfirst(isequal(pid), s.pids)
    @assert pididx !== nothing
    Serialization.serialize_type(S, typeof(s))
    serialize(S, s.generate(pididx))
end

function Serialization.deserialize(S::AbstractSerializer, t::Type{T}) where T<:DestinationSerializer
    lpart = deserialize(S)
    return DestinationSerializer(lpart)
end


function localpart(s::DestinationSerializer)
    if s.deser_obj !== nothing
        return s.deser_obj
    elseif s.generate !== nothing && (myid() in s.pids)
        # Handle the special case where myid() is part of s.pids.
        # In this case serialize/deserialize is not called as the remotecall is executed locally
        return s.generate(findfirst(isequal(myid()), s.pids))
    else
        throw(ErrorException(string("Invalid state in DestinationSerializer.")))
    end
end


================================================
FILE: src/sort.jl
================================================
# Sorting a DVector using samplesort

function sample_n_setup_ref(d::DVector, sample_size; kwargs...)
    lp = localpart(d)
    llp = length(lp)
    np = length(procs(d))
    sample_size = llp > sample_size ? sample_size : llp
    sorted = sort(lp; kwargs...)
    sample = sorted[collect(1:div(llp,sample_size):llp)]
    ref = RemoteChannel(()->Channel(np+1))             # To collect parts to be sorted locally later.
                                                       # First element is the locally sorted vector
    put!(ref, sorted)
    return (sample, ref)
end


function scatter_n_sort_localparts(d, myidx, refs, boundaries::Array{T}; by = identity, kwargs...) where T
    if d==nothing
        sorted = take!(refs[myidx])  # First entry in the remote channel is sorted localpart
    else
        sorted = sort(localpart(d); by = by, kwargs...)
    end

    # send respective parts to correct workers, iterate over sorted array
    p_sorted = 1
    for (i,r) in enumerate(refs)
        p_till = length(sorted)+1

        # calculate range to send to refs[i]
        ctr=1
        for x in sorted[p_sorted:end]
            if by(x) > by(boundaries[i+1])
                p_till = p_sorted+ctr-1
                break
            else
                ctr += 1
            end
        end

        if p_till == p_sorted
            @async put!(r, Array{T}(undef,0))
        else
            v = sorted[p_sorted:p_till-1]
            @async put!(r, v)
        end

        p_sorted = p_till
    end

    # wait to receive all of my parts from all other workers
    lp_sorting=T[]
    for _ in refs
        v = take!(refs[myidx])
        append!(lp_sorting, v)
    end

    sorted_ref=RemoteChannel()
    put!(sorted_ref, sort!(lp_sorting; by = by, kwargs...))
    return (sorted_ref, length(lp_sorting))
end

function compute_boundaries(d::DVector{T}; kwargs...) where T
    pids = procs(d)
    np = length(pids)
    sample_sz_on_wrkr = 512

    results = asyncmap(p -> remotecall_fetch(sample_n_setup_ref, p, d, sample_sz_on_wrkr; kwargs...), pids)

    samples = Array{T}(undef,0)
    for x in results
        append!(samples, x[1])
    end
    sort!(samples; kwargs...)
    samples[1] = typemin(T)

    refs=[x[2] for x in results]

    boundaries = samples[[1+(x-1)*div(length(samples), np) for x in 1:np]]
    push!(boundaries, typemax(T))

    return (boundaries, refs)
end

"""
    sort(d::DVector; sample=true, kwargs...) -> DVector

Sorts and returns a new distributed vector.

The sorted vector may not have the same distribution as the original.

Keyword argument `sample` can take values:

- `true`: A sample of max size 512 is first taken from all nodes. This is used to balance the distribution of the sorted array on participating workers. Default is `true`.

- `false`: No sampling is done. Assumes a uniform distribution between min(d) and max(d)

- 2-element tuple of the form `(min, max)`: No sampling is done. Assumes a uniform distribution between specified min and max values

- Array{T}: The passed array is assumed to be a sample of the distribution and is used to balance the sorted distribution.

Keyword argument `alg` takes the same options `Base.sort`
"""
function Base.sort(d::DVector{T}; sample=true, kwargs...) where T
    pids = procs(d)
    np = length(pids)

    # Only `alg` and `sample` are supported as keyword arguments
    if length(filter(x->!(x in (:alg, :by)), [x[1] for x in kwargs])) > 0
        throw(ArgumentError("Only `alg`, `by` and `sample` are supported as keyword arguments"))
    end

    if sample==true
        boundaries, refs = compute_boundaries(d; kwargs...)
        presorted=true

    elseif sample==false
        # Assume an uniform distribution between min and max values
        minmax=asyncmap(p->remotecall_fetch(d->(minimum(localpart(d)), maximum(localpart(d))), p, d), pids)
        min_d = minimum(T[x[1] for x in minmax])
        max_d = maximum(T[x[2] for x in minmax])

        return sort(d; sample=(min_d,max_d), kwargs...)

    elseif isa(sample, Tuple)
        # Assume an uniform distribution between min and max values in the tuple
        lb=sample[1]
        ub=sample[2]

        @assert lb<=ub

        s = Array{T}(undef,np)
        part = abs(ub - lb)/np
        (isnan(part) || isinf(part)) && throw(ArgumentError("lower and upper bounds must not be infinities"))

        for n in 1:np
            v = lb + (n-1)*part
            if T <: Integer
                s[n] = round(v)
            else
                s[n] = v
            end
        end
        return sort(d; sample=s, kwargs...)

    elseif isa(sample, Array)
        # Provided array is used as a sample
        samples = sort(copy(sample))
        samples[1] = typemin(T)
        boundaries = samples[[1+(x-1)*div(length(samples), np) for x in 1:np]]
        push!(boundaries, typemax(T))
        presorted=false

        refs=[RemoteChannel(p) for p in procs(d)]
    else
        throw(ArgumentError("keyword arg `sample` must be Boolean, Tuple(Min,Max) or an actual sample of data : " * string(sample)))
    end

    local_sort_results = Array{Tuple}(undef,np)

    Base.asyncmap!((i,p) -> remotecall_fetch(
            scatter_n_sort_localparts, p, presorted ? nothing : d, i, refs, boundaries; kwargs...),
                                    local_sort_results, 1:np, pids)

    # Construct a new DArray from the sorted refs. Remove parts with 0-length since
    # the DArray constructor_from_refs does not yet support it. This implies that
    # the participating workers for the sorted darray may be different from the original
    # for highly non-uniform distributions.
    local_sorted_refs = RemoteChannel[x[1] for x in filter(x->x[2]>0, local_sort_results)]
    return DArray(local_sorted_refs)
end


================================================
FILE: src/spmd.jl
================================================
module SPMD

using Distributed: RemoteChannel, myid, procs, remote_do, remotecall_fetch, remotecall_wait
using ..DistributedArrays: DistributedArrays, gather, next_did

export sendto, recvfrom, recvfrom_any, barrier, bcast, scatter, gather
export context_local_storage, context, spmd


mutable struct WorkerDataChannel
    pid::Int
    rc::Union{RemoteChannel,Nothing}
    lock::ReentrantLock

    WorkerDataChannel(pid) = new(pid, nothing, ReentrantLock())
end

mutable struct SPMDContext
    id::Tuple{Int,Int}
    chnl::Channel
    store::Dict{Any,Any}
    pids::Array{Int}

    function SPMDContext(id::Tuple{Int,Int}, pids::Vector{Int})
        ctxt = new(id, Channel(typemax(Int)), Dict{Any,Any}(), pids)
        if first(id) == myid()
            finalizer(ctxt) do ctxt
                for p in ctxt.pids
                    @async remote_do(delete_ctxt_id, p, ctxt.id)
                end
            end
        end
        return ctxt
    end
end


# Every worker is associated with its own RemoteChannel
struct WorkerChannelDict
    data::Dict{Int, WorkerDataChannel}
    lock::ReentrantLock
    WorkerChannelDict() = new(Dict{Int, WorkerDataChannel}(), ReentrantLock())
end
const WORKERCHANNELS = WorkerChannelDict()

Base.get!(f::Function, x::WorkerChannelDict, id::Int) = @lock x.lock get!(f, x.data, id)

# mapping between a context id and context object
struct SPMDContextDict
    data::Dict{Tuple{Int,Int}, SPMDContext}
    lock::ReentrantLock
    SPMDContextDict() = new(Dict{Tuple{Int,Int}, SPMDContext}(), ReentrantLock())
end
const CONTEXTS = SPMDContextDict()

Base.delete!(x::SPMDContextDict, id::Tuple{Int,Int}) = @lock x.lock delete!(x.data, id)
Base.get!(f::Function, x::SPMDContextDict, id::Tuple{Int,Int}) = @lock x.lock get!(f, x.data, id)

function context_local_storage()
    ctxt = get_ctxt_from_id(task_local_storage(:SPMD_CTXT))
    ctxt.store
end

context(pids::Vector{Int}=procs()) = SPMDContext(next_did(), pids)

# Multiple SPMD blocks can be executed concurrently,
# each in its own context. Messages are still sent as part of the
# same remote channels associated with each worker. They are
# read from the remote channel into local channels each associated
# with a different run of `spmd`.

function get_dc(wc::WorkerDataChannel)
    lock(wc.lock)
    try
        if wc.rc === nothing
            if wc.pid == myid()
                myrc = RemoteChannel(()->Channel(typemax(Int)))
                wc.rc = myrc

                # start a task to transfer incoming messages into local
                # channels based on the execution context
                @async begin
                    while true
                        msg = take!(myrc)
                        ctxt_id = msg[1] # First element of the message tuple is the context id.
                        ctxt = get_ctxt_from_id(ctxt_id)
                        put!(ctxt.chnl, msg[2:end]) # stripping the context_id
                    end
                end
            else
                wc.rc = remotecall_fetch(()->get_remote_dc(myid()), wc.pid)
            end
        end
    finally
        unlock(wc.lock)
    end
    return wc.rc
end

function get_ctxt_from_id(ctxt_id::Tuple{Int,Int})
    ctxt = get!(CONTEXTS, ctxt_id) do
        return SPMDContext(ctxt_id, Int[])
    end
    return ctxt
end

# Since modules may be loaded in any order on the workers,
# and workers may be dynamically added, pull in the remote channel
# handles when accessed for the first time.
function get_remote_dc(pid::Int)
    wc = get!(WORKERCHANNELS, pid) do
        return WorkerDataChannel(pid)
    end
    return get_dc(wc)
end

function send_msg(to, typ, data, tag)
    ctxt_id = task_local_storage(:SPMD_CTXT)
    @async begin
        dc = get_remote_dc(to)
        put!(dc, (ctxt_id, typ, myid(), data, tag))
#        println("Sent to ", dc)
    end
end

function get_msg(typ_check, from_check=false, tag_check=nothing)
    ctxt_id = task_local_storage(:SPMD_CTXT)
    chnl = get_ctxt_from_id(ctxt_id).chnl

    unexpected_msgs=[]
    while true
        typ, from, data, tag = take!(chnl)

        if (from_check != false && from_check != from) || (typ != typ_check) || (tag != tag_check)
            push!(unexpected_msgs, (typ, from, data, tag))
#            println("Unexpected in get_msg ", unexpected_msgs, " looking for ", typ_check, " ", from_check, " ", tag_check)
        else
            # put all the messages we read (but not expected) back to the local channel
            foreach(x->put!(chnl, x), unexpected_msgs)
            return (from, data)
        end
    end
end

function sendto(pid::Int, data::Any; tag=nothing)
    send_msg(pid, :sendto, data, tag)
end

function recvfrom(pid::Int; tag=nothing)
    _, data = get_msg(:sendto, pid, tag)
    return data
end

function recvfrom_any(; tag=nothing)
    from, data = get_msg(:sendto, false, tag)
    return (from,data)
end

function barrier(;pids=procs(), tag=nothing)
    # send a message to everyone
    for p in sort(pids)
        send_msg(p, :barrier, nothing, tag)
    end
    # make sure we recv a message from everyone
    pending=deepcopy(pids)
    unexpected_msgs=[]

    while length(pending) > 0
        from, _ = get_msg(:barrier, false, tag)
        if from in pending
            filter!(x->x!=from, pending)
        else
            # handle case of 2 (or more) consecutive barrier calls.
            push!(unexpected_msgs, (:barrier, from, nothing, tag))
#            println("Unexpected ", from)
        end
#        length(pending) == 1 && println("Waiting for ", pending)
    end

    ctxt_id = task_local_storage(:SPMD_CTXT)
    chnl = get_ctxt_from_id(ctxt_id).chnl
    foreach(x->put!(chnl, x), unexpected_msgs)
    return nothing
end

function bcast(data::Any, pid::Int; tag=nothing, pids=procs())
    if myid() == pid
        for p in filter(x->x!=pid, sort(pids))
            send_msg(p, :bcast, data, tag)
        end
        return data
    else
        from, data = get_msg(:bcast, pid, tag)
        return data
    end
end

function scatter(x, pid::Int; tag=nothing, pids=procs())
    if myid() == pid
        @assert rem(length(x), length(pids)) == 0
        cnt = div(length(x), length(pids))
        for (i,p) in enumerate(sort(pids))
            p == pid && continue
            send_msg(p, :scatter, x[cnt*(i-1)+1:cnt*i], tag)
        end
        myidx = findfirst(isequal(pid), sort(pids))
        return x[cnt*(myidx-1)+1:cnt*myidx]
    else
        _, data = get_msg(:scatter, pid, tag)
        return data
    end
end

function DistributedArrays.gather(x, pid::Int; tag=nothing, pids=procs())
    if myid() == pid
        gathered_data = Array{Any}(undef, length(pids))
        myidx = findfirst(isequal(pid), sort(pids))
        gathered_data[myidx] = x
        n = length(pids) - 1
        while n > 0
            from, data_x = get_msg(:gather, false, tag)
            fromidx = findfirst(isequal(from), sort(pids))
            gathered_data[fromidx] = data_x
            n=n-1
        end
        return gathered_data
    else
        send_msg(pid, :gather, x, tag)
        return x
    end
end

function spmd_local(f, ctxt_id, clear_ctxt)
    task_local_storage(:SPMD_CTXT, ctxt_id)
    f()
    clear_ctxt && delete_ctxt_id(ctxt_id)
    return nothing
end

function spmd(f, args...; pids=procs(), context=nothing)
    f_noarg = ()->f(args...)
    clear_ctxt = false
    if context == nothing
        ctxt_id = next_did()
        clear_ctxt = true    # temporary unique context created for this run.
                             # should be cleared at the end of the run.
    else
        ctxt_id = context.id
    end
    @sync for p in pids
        @async remotecall_wait(spmd_local, p, f_noarg, ctxt_id, clear_ctxt)
    end
    nothing
end

delete_ctxt_id(ctxt_id::Tuple{Int,Int}) = delete!(CONTEXTS, ctxt_id)

Base.close(ctxt::SPMDContext) = finalize(ctxt)

end


================================================
FILE: test/aqua.jl
================================================
using DistributedArrays, Test
import Aqua

@testset "Aqua" begin
    Aqua.test_all(DistributedArrays; ambiguities = (; broken = true))
end


================================================
FILE: test/darray.jl
================================================
using Test, LinearAlgebra, SpecialFunctions
using Statistics: mean
using SparseArrays: nnz
using Random
@everywhere using SparseArrays: sprandn

@testset "test distribute and other constructors" begin
    A = rand(1:100, (100,100))

    @testset "test default distribute" begin
        DA = distribute(A)
        @test length(procs(DA)) == nworkers()
        @test sum(DA) == sum(A)
        close(DA)
    end

    @testset "test distribute with procs arguments" begin
        DA = distribute(A, procs = procs())
        @test length(procs(DA)) == nprocs()
        @test sum(DA) == sum(A)
        close(DA)
    end

    @testset "test distribute with procs and dist arguments" begin
        DA = distribute(A, procs = [1, 2], dist = [1,2])
        @test size(procs(DA)) == (1,2)
        @test sum(DA) == sum(A)
        close(DA)
    end

    @testset "Create darray with unconventional distribution and distribute like it" begin
        block = 10
        Y = nworkers() * block
        X = nworkers() * block
        remote_parts = map(workers()) do wid
            remotecall(rand, wid, block, Y)
        end
        DA1 = DArray(reshape(remote_parts, (length(remote_parts), 1)))
        A = rand(X, Y)
        DA2 = distribute(A, DA1)

        @test size(DA1) == size(DA2)

        close(DA1)
        close(DA2)
    end

    @testset "Global DArray serialization issue #134" begin
        global A134 = drandn(1)
        D2 = DArray(I -> DistributedArrays.localpart(A134), A134)
        @test D2 == A134
        close(A134)
        close(D2)
    end

    @testset "empty_localpart should work when only constructor (not conversion is defined)" begin
        @test DistributedArrays.empty_localpart(Float64,2,LowerTriangular{Float64,Matrix{Float64}}) isa
                LowerTriangular
    end
    
    @testset "Consistent Uneven Distribution issue #166" begin
        DA = drand((2+length(OTHERIDS),), [MYID, OTHERIDS])
        @test fetch(@spawnat MYID length(localpart(DA)) == 2)
        @test fetch(@spawnat OTHERIDS length(localpart(DA)) == 1)
        close(DA)
        @test DistributedArrays.defaultdist(50,4) == [1,14,27,39,51]
    end
    
    @testset "Inhomogeneous typeof(localpart)" begin
        block = 10
        Y = nworkers() * block
        X = nworkers() * block

        @assert nworkers() > 1
        @test_throws ErrorException DArray((X, Y)) do I
            eltype = first(CartesianIndices(I)) == CartesianIndex(1, 1) ? Int64 : Float64
            zeros(eltype, map(length, I))
        end
    end
end

check_leaks()

@testset "test DArray equality/copy/deepcopy" begin
    D = drand((200,200), [MYID, OTHERIDS])

    @testset "test isequal(::DArray, ::DArray)" begin
        DC = copy(D)
        @test D == DC
        close(DC)
    end

    @testset "test [deep]copy(::DArray) does a copy of each localpart" begin
        DC = copy(D)
        @spawnat OTHERIDS localpart(DC)[1] = 0
        @test fetch(@spawnat OTHERIDS localpart(D)[1] != 0)
        DD = deepcopy(D)
        @spawnat OTHERIDS localpart(DD)[1] = 0
        @test fetch(@spawnat OTHERIDS localpart(D)[1] != 0)
        close(DC)
        close(DD)
    end

    @testset "test copy(::DArray) is shallow" begin
        DA = @DArray [rand(100) for i=1:10]
        DC = copy(DA)
        id = procs(DC)[1]
        @test DA == DC
        fetch(@spawnat id localpart(DC)[1] .= -1.0)
        @test DA == DC
        @test fetch(@spawnat id all(localpart(DA)[1] .== -1.0))
        close(DA)
        close(DC)
    end

    @testset "test deepcopy(::DArray) is not shallow" begin
        DA = @DArray [rand(100) for i=1:10]
        DC = deepcopy(DA)
        id = procs(DC)[1]
        @test DA == DC
        fetch(@spawnat id localpart(DC)[1] .= -1.0)
        @test DA != DC
        @test fetch(@spawnat id all(localpart(DA)[1] .>= 0.0))
        close(DA)
        close(DC)
    end

    close(D)
end

check_leaks()

@testset "test DArray similar" begin
    D = drand((200,200), [MYID, OTHERIDS])
    DS = similar(D,Float16)

    @testset "test eltype of a similar" begin
        @test eltype(DS) == Float16
    end

    @testset "test dims of a similar" begin
        @test size(D) == size(DS)
    end
    close(D)
    close(DS)
end

check_leaks()

@testset "test DArray reshape" begin
    D = drand((200,200), [MYID, OTHERIDS])

    @testset "Test error-throwing in reshape" begin
        @test_throws DimensionMismatch reshape(D,(100,100))
    end

    DR = reshape(D,(100,400))
    @testset "Test reshape" begin
        @test size(DR) == (100,400)
    end
    close(D)
end

check_leaks()

@testset "test @DArray comprehension constructor" begin

    @testset "test valid use of @DArray" begin
        D = @DArray [i+j for i=1:10, j=1:10]
        @test D == [i+j for i=1:10, j=1:10]
        close(D)
    end

    @testset "test invalid use of @DArray" begin
        #@test_throws ArgumentError eval(:((@DArray [1,2,3,4])))
        @test_throws LoadError eval(:((@DArray [1,2,3,4])))
    end
end

check_leaks()

@testset "test DArray / Array conversion" begin
    D = drand((200,200), [MYID, OTHERIDS])

    @testset "test construct Array from (Sub)DArray" begin
        S = Matrix{Float64}(D[1:150, 1:150])
        A = Matrix{Float64}(D)

        @test A[1:150,1:150] == S
        D2 = DArray{Float64,2,Matrix{Float64}}(A)
        @test D2 == D
        DistributedArrays.allowscalar(true)
        @test fetch(@spawnat MYID localpart(D)[1,1]) == D[1,1]
        @test fetch(@spawnat OTHERIDS localpart(D)[1,1]) == D[1,101]
        DistributedArrays.allowscalar(false)
        close(D2)

        S2 = Vector{Float64}(D[4, 23:176])
        @test A[4, 23:176] == S2

        S3 = Vector{Float64}(D[23:176, 197])
        @test A[23:176, 197] == S3

        S4 = zeros(4)
        setindex!(S4, D[3:4, 99:100], :)
        # FixMe! Hitting the AbstractArray fallback here is extremely unfortunate but vec() becomes a ReshapedArray which makes it diffuclt to hit DArray methods. Unless this can be fixed in Base, we might have to add special methods for ReshapedArray{DArray}
        DistributedArrays.allowscalar(true)
        @test S4 == vec(D[3:4, 99:100])
        @test S4 == vec(A[3:4, 99:100])
        DistributedArrays.allowscalar(false)

        S5 = zeros(2,2)
        setindex!(S5, D[1,1:4], :, 1:2)
        # FixMe! Hitting the AbstractArray fallback here is extremely unfortunate but vec() becomes a ReshapedArray which makes it diffuclt to hit DArray methods. Unless this can be fixed in Base, we might have to add special methods for ReshapedArray{DArray}
        DistributedArrays.allowscalar(true)
        @test vec(S5) == D[1, 1:4]
        @test vec(S5) == A[1, 1:4]
        DistributedArrays.allowscalar(false)
    end
    close(D)
end

check_leaks()

@testset "test copy!" begin
    D1 = dzeros((10,10))
    r1 = remotecall_wait(() -> randn(3,10), workers()[1])
    r2 = remotecall_wait(() -> randn(7,10), workers()[2])
    D2 = DArray(reshape([r1; r2], 2, 1))
    copyto!(D2, D1)
    @test D1 == D2
    close(D1)
    close(D2)
end

check_leaks()

@testset "test DArray reduce" begin
    D = DArray(id->fill(myid(), map(length,id)), (10,10), [MYID, OTHERIDS])

    @testset "test reduce" begin
        @test reduce(+, D) == ((50*MYID) + (50*OTHERIDS))
    end

    @testset "test map / reduce" begin
        D2 = map(x->1, D)
        @test D2 isa DArray
        @test reduce(+, D2) == 100
        close(D2)
    end

    @testset "test map! / reduce" begin
        map!(x->1, D, D)
        @test reduce(+, D) == 100
    end
    close(D)
end

check_leaks()

@testset "test rmul" begin
    A = randn(100,100)
    DA = distribute(A)
    @test rmul!(DA, 2) == rmul!(A, 2)
    close(DA)
end

check_leaks()

@testset "test rmul!(Diagonal, A)" begin
    A = randn(100, 100)
    b = randn(100)
    D = Diagonal(b)
    DA = distribute(A)
    @test lmul!(D, A) == lmul!(D, DA)
    close(DA)
    A = randn(100, 100)
    b = randn(100)
    DA = distribute(A)
    @test rmul!(A, D) == rmul!(DA, D)
    close(DA)
end

check_leaks()

@testset "test mapreduce on DArrays" begin
    for _ = 1:25, f = [x -> Int128(2x), x -> Int128(x^2), x -> Int128(x^2 + 2x - 1)], opt = [+, *]
        A = rand(1:5, rand(2:30))
        DA = distribute(A)
        @test DA isa DArray
        @test mapreduce(f, opt, DA) - mapreduce(f, opt, A) == 0
        close(DA)
    end
end

check_leaks()

@testset "test mapreducedim on DArrays" begin
    D = DArray(I->fill(myid(), map(length,I)), (73,73), [MYID, OTHERIDS])
    D2 = map(x->1, D)
    @test D2 isa DArray
    @test mapreduce(t -> t*t, +, D2, dims=1) == mapreduce(t -> t*t, +, convert(Array, D2), dims=1)
    @test mapreduce(t -> t*t, +, D2, dims=2) == mapreduce(t -> t*t, +, convert(Array, D2), dims=2)
    @test mapreduce(t -> t*t, +, D2, dims=(1,2)) == mapreduce(t -> t*t, +, convert(Array, D2), dims=(1,2))

    # Test non-regularly chunked DArrays
    r1 = DistributedArrays.remotecall(() -> sprandn(3, 10, 0.1), workers()[1])
    r2 = DistributedArrays.remotecall(() -> sprandn(7, 10, 0.1), workers()[2])
    D = DArray(reshape([r1; r2], (2,1)))
    @test Array(sum(D, dims=2)) == sum(Array(D), dims=2)

    # close(D)
    # close(D2)
    d_closeall()   # temp created by the mapreduce above
end

check_leaks()

@testset "test mapreducdim, reducedim on DArrays" begin
    dims = (20,20,20)
    DA = drandn(dims)
    A = convert(Array, DA)

    @testset "dimension $dms" for dms in (1, 2, 3, (1,2), (1,3), (2,3), (1,2,3))
        @test mapreduce(t -> t*t, +, A, dims=dms) ≈ mapreduce(t -> t*t, +, DA, dims=dms)
        @test mapreduce(t -> t*t, +, A, dims=dms, init=1.0) ≈ mapreduce(t -> t*t, +, DA, dims=dms, init=1.0)
        @test reduce(*, A, dims=dms) ≈ reduce(*, DA, dims=dms)
        @test reduce(*, A, dims=dms, init=2.0) ≈ reduce(*, DA, dims=dms, init=2.0)
    end
    close(DA)
    d_closeall()   # temp created by the mapreduce above
end

check_leaks()

@testset "test statistical functions on DArrays" begin
    dims = (20,20,20)
    DA = drandn(dims)
    A = Array(DA)

    @testset "test $f for dimension $dms" for f in (mean, ), dms in (1, 2, 3, (1,2), (1,3), (2,3), (1,2,3))
        # std is pending implementation
        @test f(DA, dims=dms) ≈ f(A, dims=dms)
    end

    close(DA)
    d_closeall()   # temporaries created above
end

check_leaks()

unpack(ex::Base.CapturedException) = unpack(ex.ex)
unpack(ex::Distributed.RemoteException) = unpack(ex.captured)
unpack(ex::Base.TaskFailedException) = unpack(ex.task.exception)
unpack(ex) = ex

@testset "test sum on DArrays" begin
    A = randn(100,100)
    DA = distribute(A)

    # sum either throws an ArgumentError, a CompositeException of ArgumentErrors,
    # or a RemoteException wrapping an ArgumentError
    try
        sum(DA, dims=-1)
    catch err
        if isa(err, CompositeException)
            @test !isempty(err.exceptions)
            for excep in err.exceptions
                # Unpack the remote exception
                orig_err = unpack(excep)
                @test isa(orig_err, ArgumentError)
            end
        elseif isa(err, RemoteException)
            @test err.captured isa CapturedException
            @test err.captured.ex isa ArgumentError
        else
            @test isa(err, ArgumentError)
        end
    end
    try
        sum(DA, dims=0)
    catch err
        if isa(err, CompositeException)
            @test !isempty(err.exceptions)
            for excep in err.exceptions
                # Unpack the remote exception
                orig_err = unpack(excep)
                @test isa(orig_err, ArgumentError)
            end
        elseif isa(err, RemoteException)
            @test err.captured isa CapturedException
            @test err.captured.ex isa ArgumentError
        else
            @test isa(err, ArgumentError)
        end
    end

    @test sum(DA) ≈ sum(A)
    @test sum(DA, dims=1) ≈ sum(A, dims=1)
    @test sum(DA, dims=2) ≈ sum(A, dims=2)
    @test sum(DA, dims=3) ≈ sum(A, dims=3)
    close(DA)
    d_closeall()   # temporaries created above
end

check_leaks()

@testset "test size on DArrays" begin

    A = randn(100,100)
    DA = distribute(A)

    @test_throws BoundsError size(DA, 0)
    @test size(DA,1) == size(A,1)
    @test size(DA,2) == size(A,2)
    @test size(DA,3) == size(A,3)
    close(DA)
end

check_leaks()

# test length / lastindex
@testset "test collections API" begin
    A = randn(23,23)
    DA = distribute(A)

    @testset "test length" begin
        @test length(DA) == length(A)
    end

    @testset "test lastindex" begin
        @test lastindex(DA) == lastindex(A)
    end
    close(DA)
end

check_leaks()

@testset "test max / min / sum" begin
    a = map(x -> Int(round(rand() * 100)) - 50, Array{Int}(undef,100,1000))
    d = distribute(a)

    @test sum(d)          == sum(a)
    @test maximum(d)      == maximum(a)
    @test minimum(d)      == minimum(a)
    @test maximum(abs, d) == maximum(abs, a)
    @test minimum(abs, d) == minimum(abs, a)
    @test sum(abs, d)     == sum(abs, a)
    @test sum(abs2, d)    == sum(abs2, a)
    @test extrema(d)      == extrema(a)
    close(d)
end

check_leaks()

@testset "test all / any" begin
    a = map(x->Int(round(rand() * 100)) - 50, Array{Int}(undef,100,1000))
    a = [true for i in 1:100]
    d = distribute(a)

    @test all(d)
    @test any(d)

    close(d)

    a[50] = false
    d = distribute(a)
    @test !all(d)
    @test any(d)

    close(d)

    a = [false for i in 1:100]
    d = distribute(a)
    @test !all(d)
    @test !any(d)

    close(d)

    d = dones(10,10)
    @test !all(x-> x>1.0, d)
    @test all(x-> x>0.0, d)

    close(d)

    a = ones(10,10)
    a[10] = 2.0
    d = distribute(a)
    @test any(x-> x == 1.0, d)
    @test any(x-> x == 2.0, d)
    @test !any(x-> x == 3.0, d)

    close(d)
end

check_leaks()

@testset "test count"  begin
    a = ones(10,10)
    a[10] = 2.0
    d = distribute(a)

    @test count(x-> x == 2.0, d) == 1
    @test count(x-> x == 1.0, d) == 99
    @test count(x-> x == 0.0, d) == 0

    close(d)
end

check_leaks()

@testset "test prod" begin
    a = fill(2, 10);
    d = distribute(a);
    @test prod(d) == 2^10

    close(d)
end

check_leaks()

@testset "test zeros" begin
    @testset "1D dzeros default element type" begin
        A = dzeros(10)
        @test A == zeros(10)
        @test eltype(A) == Float64
        @test size(A) == (10,)
        close(A)
    end

    @testset "1D dzeros with specified element type" begin
        A = dzeros(Int, 10)
        @test A == zeros(10)
        @test eltype(A) == Int
        @test size(A) == (10,)
        close(A)
    end

    @testset "2D dzeros default element type, Dims constructor" begin
        A = dzeros((10,10))
        @test A == zeros((10,10))
        @test eltype(A) == Float64
        @test size(A) == (10,10)
        close(A)
    end

    @testset "2D dzeros specified element type, Dims constructor" begin
        A = dzeros(Int, (10,10))
        @test A == zeros(Int, (10,10))
        @test eltype(A) == Int
        @test size(A) == (10,10)
        close(A)
    end

    @testset "2D dzeros, default element type" begin
        A = dzeros(10,10)
        @test A == zeros(10,10)
        @test eltype(A) == Float64
        @test size(A) == (10,10)
        close(A)
    end

    @testset "2D dzeros, specified element type" begin
        A = dzeros(Int, 10, 10)
        @test A == zeros(Int, 10, 10)
        @test eltype(A) == Int
        @test size(A) == (10,10)
        close(A)
    end
end

check_leaks()

@testset "test dones" begin
    @testset "1D dones default element type" begin
        A = dones(10)
        @test A == ones(10)
        @test eltype(A) == Float64
        @test size(A) == (10,)
        close(A)
    end

    @testset "1D dones with specified element type" begin
        A = dones(Int, 10)
        @test eltype(A) == Int
        @test size(A) == (10,)
        close(A)
    end

    @testset "2D dones default element type, Dims constructor" begin
        A = dones((10,10))
        @test A == ones((10,10))
        @test eltype(A) == Float64
        @test size(A) == (10,10)
        close(A)
    end

    @testset "2D dones specified element type, Dims constructor" begin
        A = dones(Int, (10,10))
        @test A == ones(Int, (10,10))
        @test eltype(A) == Int
        @test size(A) == (10,10)
        close(A)
    end

    @testset "2D dones, default element type" begin
        A = dones(10,10)
        @test A == ones(10,10)
        @test eltype(A) == Float64
        @test size(A) == (10,10)
        close(A)
    end

    @testset "2D dones, specified element type" begin
        A = dones(Int, 10, 10)
        @test A == ones(Int, 10, 10)
        @test eltype(A) == Int
        @test size(A) == (10,10)
        close(A)
    end
end

check_leaks()

@testset "test drand" begin
    @testset "1D drand" begin
        A = drand(100)
        @test eltype(A) == Float64
        @test size(A) == (100,)
        @test all(x-> x >= 0.0 && x <= 1.0, A)
        close(A)
    end

    @testset "1D drand, specified element type" begin
        A = drand(Int, 100)
        @test eltype(A) == Int
        @test size(A) == (100,)
        close(A)
    end

    @testset "1D drand, UnitRange" begin
        A = drand(1:10, 100)
        @test eltype(A) == Int
        @test size(A) == (100,)
        close(A)
    end

    @testset "1D drand, Array" begin
        A = drand([-1,0,1], 100)
        @test eltype(A) == Int
        @test size(A) == (100,)
        close(A)
    end

    @testset "2D drand, Dims constructor" begin
        A = drand((50,50))
        @test eltype(A) == Float64
        @test size(A) == (50,50)
        @test all(x-> x >= 0.0 && x <= 1.0, A)
        close(A)
    end

    @testset "2D drand" begin
        A = drand(100,100)
        @test eltype(A) == Float64
        @test size(A) == (100,100)
        @test all(x-> x >= 0.0 && x <= 1.0, A)
        close(A)
    end

    @testset "2D drand, Dims constructor, specified element type" begin
        A = drand(Int, (100,100))
        @test eltype(A) == Int
        @test size(A) == (100,100)
        close(A)
    end

    @testset "2D drand, specified element type" begin
        A = drand(Int, 100, 100)
        @test eltype(A) == Int
        @test size(A) == (100,100)
        close(A)
    end
end

check_leaks()

@testset "test randn" begin
    @testset "1D drandn" begin
        A = drandn(100)
        @test eltype(A) == Float64
        @test size(A) == (100,)
        close(A)
    end

    @testset "2D drandn, Dims constructor" begin
        A = drandn((50,50))
        @test eltype(A) == Float64
        @test size(A) == (50,50)
        close(A)
    end

    @testset "2D drandn" begin
        A = drandn(100,100)
        @test eltype(A) == Float64
        @test size(A) == (100,100)
        close(A)
    end
end

check_leaks()

@testset "test transpose/adjoint" begin
    @testset "test transpose real" begin
        A = drand(Float64, 100, 200)
        @test copy(transpose(A)) == transpose(Array(A))
        close(A)
    end
    @testset "test transpose complex" begin
        A = drand(ComplexF64, 200, 100)
        @test copy(transpose(A)) == transpose(Array(A))
        close(A)
    end
    @testset "test adjoint real" begin
        A = drand(Float64, 200, 100)
        @test copy(adjoint(A)) == adjoint(Array(A))
        close(A)
    end
    @testset "test adjoint complex" begin
        A = drand(ComplexF64, 100, 200)
        @test copy(adjoint(A)) == adjoint(Array(A))
        close(A)
    end

    d_closeall()  # close the temporaries created above
end

check_leaks()

@testset "makelocal" begin
    A = randn(5*nprocs(), 5*nprocs())
    dA = distribute(A, procs=procs())
    for i in 1:size(dA, 2)
        a = DistributedArrays.makelocal(dA, :, i)
        @test all(Array(view(dA, :, i)) .== a)
        @test all(      view( A, :, i) .== a)
    end
    for i in 1:size(dA, 1)
        a = DistributedArrays.makelocal(dA, i, :)
        @test all(Array(view(dA, i:i, :)) .== a)
        @test all(      view( A, i:i, :) .== a)
    end
    a = DistributedArrays.makelocal(dA, 1:5, 1:5)
    @test all(Array(view(dA, 1:5, 1:5)) .== a)
    @test all(      view( A, 1:5, 1:5) .== a)
    close(dA)
end

@testset "test convert from subdarray" begin
    a = drand(20, 20);

    s = view(a, 1:5, 5:8)
    @test isa(s, SubDArray)
    @test s == DArray(s)

    s = view(a, 6:5, 5:8)
    @test isa(s, SubDArray)
    @test s == DArray(s)
    close(a)
    d_closeall()  # close the temporaries created above
end

check_leaks()

@testset "test scalar math" begin
    a = drand(20, 20);
    b = convert(Array, a)
    @testset "$f" for f in (-, abs, abs2, acos, acosd, acot,
              acotd, acsch, angle, asech, asin,
              asind, asinh, atan, atand, atanh,
              big, cbrt, ceil, cis, complex, conj,
              cos, cosc, cosd, cosh, cospi, cot,
              cotd, coth, csc, cscd, csch, dawson,
              deg2rad, digamma, erf, erfc, erfcinv,
              erfcx, erfi, erfinv, exp, exp10, exp2,
              expm1, exponent, float, floor, gamma, imag,
              invdigamma, isfinite, isinf, isnan,
              loggamma, log, log10, log1p, log2, rad2deg, real,
              sec, secd, sech, sign, sin, sinc, sind,
              sinh, sinpi, sqrt, tan, tand, tanh, trigamma)
        @test f.(a) == f.(b)
    end
    a = a .+ 1
    b = b .+ 1
    @testset "$f" for f in (asec, asecd, acosh, acsc, acscd, acoth)
        @test f.(a) == f.(b)
    end
    close(a)
    d_closeall()  # close the temporaries created above
end

check_leaks()

@testset "test mapslices" begin
    A = randn(5,5,5)
    D = distribute(A, procs = workers(), dist = [1, 1, min(nworkers(), 5)])
    @test mapslices(svdvals, D, dims=(1,2)) ≈ mapslices(svdvals, A, dims=(1,2))
    @test mapslices(svdvals, D, dims=(1,3)) ≈ mapslices(svdvals, A, dims=(1,3))
    @test mapslices(svdvals, D, dims=(2,3)) ≈ mapslices(svdvals, A, dims=(2,3))
    @test mapslices(sort, D, dims=(1,)) ≈ mapslices(sort, A, dims=(1,))
    @test mapslices(sort, D, dims=(2,)) ≈ mapslices(sort, A, dims=(2,))
    @test mapslices(sort, D, dims=(3,)) ≈ mapslices(sort, A, dims=(3,))

    # issue #3613
    B = mapslices(sum, dones(Float64, (2,3,4), workers(), [1,1,min(nworkers(),4)]), dims=[1,2])
    @test size(B) == (1,1,4)
    @test all(B.==6)

    # issue #5141
    C1 = mapslices(x-> maximum(-x), D, dims=[])
    @test C1 == -D

    # issue #5177
    c = dones(Float64, (2,3,4,5), workers(), [1,1,1,min(nworkers(),5)])
    m1 = mapslices(x-> ones(2,3), c, dims=[1,2])
    m2 = mapslices(x-> ones(2,4), c, dims=[1,3])
    m3 = mapslices(x-> ones(3,4), c, dims=[2,3])
    @test size(m1) == size(m2) == size(m3) == size(c)

    n1 = mapslices(x-> ones(6), c, dims=[1,2])
    n2 = mapslices(x-> ones(6), c, dims=[1,3])
    n3 = mapslices(x-> ones(6), c, dims=[2,3])
    n1a = mapslices(x-> ones(1,6), c, dims=[1,2])
    n2a = mapslices(x-> ones(1,6), c, dims=[1,3])
    n3a = mapslices(x-> ones(1,6), c, dims=[2,3])
    @test (size(n1a) == (1,6,4,5) && size(n2a) == (1,3,6,5) && size(n3a) == (2,1,6,5))
    @test (size(n1) == (6,1,4,5) && size(n2) == (6,3,1,5) && size(n3) == (2,6,1,5))
    close(D)
    close(c)
    d_closeall()  # close the temporaries created above
end

check_leaks()

@testset "test scalar ops" begin
    a = drand(20,20)
    b = convert(Array, a)
    c = drand(20,20)
    d = convert(Array, c)

    @testset "$f" for f in (:+, :-, :*, :/, :%)
        x = rand()
        @test @eval ($f).($a, $x) == ($f).($b, $x)
        @test @eval ($f).($x, $a) == ($f).($x, $b)
        @test @eval ($f).($a, $c) == ($f).($b, $d)
    end

    close(a)
    close(c)

    a = dones(Int, 20, 20)
    b = convert(Array, a)
    @testset "$f" for f in (:<<, :>>)
        @test @eval ($f).($a, 2)  == ($f).($b, 2)
        @test @eval ($f).(2, $a)  == ($f).(2, $b)
        @test @eval ($f).($a, $a) == ($f).($b, $b)
    end

    @testset "$f" for f in (:rem,)
        x = rand()
        @test @eval ($f).($a, $x) == ($f).($b, $x)
    end
    close(a)
    close(c)
    d_closeall()  # close the temporaries created above
end

check_leaks()

@testset "test broadcast ops" begin
    wrkrs = workers()
    nwrkrs = length(wrkrs)
    nrows = 20 * nwrkrs
    ncols = 10 * nwrkrs
    a = drand((nrows,ncols), wrkrs, (1, nwrkrs))
    m = mean(a, dims=1)
    c = a .- m
    d = convert(Array, a) .- convert(Array, m)
    @test c == d
    e = @DArray [ones(10) for i=1:4]
    f = 2 .* e
    @test Array(f) == 2 .* Array(e)
    @test Array(map(x -> sum(x) .+ 2, e)) == map(x -> sum(x) .+ 2, e)

    @testset "test nested broadcast" begin
       g = a .- m .* sin.(c)
       @test Array(g) == Array(a) .- Array(m) .* sin.(Array(c))
    end

    @testset "Broadcasting into DArray" begin
        a .= ones(nrows, ncols)
        @test all(isone, a)
        a .= 3 .+ abs2.(@view(zeros(nrows, ncols + 5)[:, 6:end]))
        @test all(x -> x == 3, a)
    end

    # @testset "lazy wrapped broadcast" begin
    #    l = similar(a)
    #    l[1:10, :] .= view(a, 1:10, : )
    # end
    d_closeall()
end

check_leaks()

@testset "test matrix multiplication" begin
    A = drandn(20,20)
    b = drandn(20)
    B = drandn(20,20)

    @test norm(convert(Array, A*b) - convert(Array, A)*convert(Array, b), Inf) < sqrt(eps())
    @test norm(convert(Array, A*B) - convert(Array, A)*convert(Array, B), Inf) < sqrt(eps())
    @test norm(convert(Array, A'*b) - convert(Array, A)'*convert(Array, b), Inf) < sqrt(eps())
    @test norm(convert(Array, A'*B) - convert(Array, A)'*convert(Array, B), Inf) < sqrt(eps())
    close(A)
    close(b)
    close(B)
    d_closeall()  # close the temporaries created above
end

check_leaks()

@testset "dot product" begin
    A = drandn(20,20)
    b = drandn(20)
    c = A * b

    @test dot(c, b) ≈ dot(convert(Array, c), convert(Array, b))
    close(A)
    close(b)
    close(c)
end

check_leaks()

@testset "test norm" begin
    x = drandn(20)

    @test abs(norm(x) - norm(convert(Array, x))) < sqrt(eps())
    @test abs(norm(x, 1) - norm(convert(Array, x), 1)) < sqrt(eps())
    @test abs(norm(x, 2) - norm(convert(Array, x), 2)) < sqrt(eps())
    @test abs(norm(x, Inf) - norm(convert(Array, x), Inf)) < sqrt(eps())
    close(x)
end

check_leaks()

@testset "test axpy!" begin
    for (x, y) in ((drandn(20), drandn(20)),
                   (drandn(20, 2), drandn(20, 2)))

        @test Array(axpy!(2.0, x, copy(y))) ≈ axpy!(2.0, Array(x), Array(y))
        @test_throws DimensionMismatch axpy!(2.0, x, zeros(length(x) + 1))
        close(x)
        close(y)
    end

    d_closeall()  # close the temporaries created above
end

check_leaks()

@testset "test ppeval" begin
    A = drandn((10, 10, nworkers()), workers(), [1, 1, nworkers()])
    B = drandn((10, nworkers()), workers(), [1, nworkers()])

    R = zeros(10, nworkers())
    for i = 1:nworkers()
        R[:, i] = convert(Array, A)[:, :, i]*convert(Array, B)[:, i]
    end
    @test convert(Array, ppeval(*, A, B)) ≈ R
    @test sum(ppeval(eigvals, A)) ≈ sum(ppeval(eigvals, A, Matrix{Float64}(I,10,10)))
    close(A)
    close(B)
    d_closeall()  # close the temporaries created above
end

check_leaks()

@testset "test nnz" begin
    A = sprandn(10, 10, 0.5)
    @test nnz(distribute(A)) == nnz(A)
end

@testset "test matmatmul" begin
    A = drandn(30, 30)
    B = drandn(30, 20)
    a = convert(Array, A)
    b = convert(Array, B)

    AB = A * B
    AtB = transpose(A) * B
    AcB = A' * B

    ab = a * b
    atb = transpose(a) * b
    acb = a' * b

    @test AB ≈ ab
    @test AtB ≈ atb
    @test AcB ≈ acb
    d_closeall()  # close the temporaries created above
end

@testset "sort, T = $T, 10^$i elements" for i in 0:6, T in [Int, Float64]
    d = DistributedArrays.drand(T, 10^i)
    @testset "sample = $sample" for sample in Any[true, false, (minimum(d),maximum(d)), rand(T, 10^i>512 ? 512 : 10^i)]
        d2 = DistributedArrays.sort(d; sample=sample)
        a  = convert(Array, d)
        a2 = convert(Array, d2)
        @test length(d) == length(d2)
        @test sort(a) == a2
    end
    d_closeall()  # close the temporaries created above
end

check_leaks()

@testset "ddata" begin
    d = ddata(;T=Int, init=I->myid())
    for p in workers()
        @test p == remotecall_fetch(d->d[:L], p, d)
    end
    @test Int[workers()...] == gather(d)

    close(d)

    d = ddata(;T=Int, data=workers())
    for p in workers()
        @test p == remotecall_fetch(d->d[:L], p, d)
    end
    @test Int[workers()...] == gather(d)

    close(d)

    d = ddata(;T=Any, init=I->"Hello World!")
    for p in workers()
        @test "Hello World!" == remotecall_fetch(d->d[:L], p, d)
    end
    Any["Hello World!" for p in workers()] == gather(d)


    close(d)
end

check_leaks()

@testset "rand!" begin
    d = dzeros(30, 30)
    rand!(d)

    close(d)
end

check_leaks()

@testset "fill!" begin
    d = dzeros(30, 30)
    fill!(d, 3.14)
    @test all(x-> x == 3.14, d)

    close(d)
end

check_leaks()

d_closeall()

@testset "test for any leaks" begin
    sleep(1.0)     # allow time for any cleanup to complete
    allrefszero = Bool[remotecall_fetch(()-> @lock(DistributedArrays.REFS.lock, isempty(DistributedArrays.REFS.data)), p) for p in procs()]
    @test all(allrefszero)

    allregistrieszero = Bool[remotecall_fetch(()-> @lock(DistributedArrays.REGISTRY.lock, isempty(DistributedArrays.REGISTRY.data)), p) for p in procs()]
    @test all(allregistrieszero)
end


================================================
FILE: test/explicit_imports.jl
================================================
using DistributedArrays, Test
import ExplicitImports

@testset "ExplicitImports" begin
    # No implicit imports in DistributedArrays (ie. no `using MyPkg`)
    @test ExplicitImports.check_no_implicit_imports(DistributedArrays) === nothing

    # No non-owning imports in DistributedArrays (ie. no `using LinearAlgebra: map`)
    @test ExplicitImports.check_all_explicit_imports_via_owners(DistributedArrays) === nothing

    # Limit non-public imports in DistributedArrays (ie. `using MyPkg: _non_public_internal_func`)
    # to a few selected types and functions
    @test ExplicitImports.check_all_explicit_imports_are_public(
        DistributedArrays;
        ignore = (
            # Base
            :Broadcasted,
            :Callable,
            (VERSION < v"1.11" ? (:tail,) : ())...,
        ),
    ) === nothing

    # No stale imports in DistributedArrays (ie. no `using MyPkg: func` where `func` is not used in DistributedArrays)
    @test ExplicitImports.check_no_stale_explicit_imports(DistributedArrays) === nothing

    # No non-owning accesses in DistributedArrays (ie. no `... LinearAlgebra.map(...)`)
    @test ExplicitImports.check_all_qualified_accesses_via_owners(DistributedArrays) === nothing

    # Limit non-public accesses in DistributedArrays (ie. no `... MyPkg._non_public_internal_func(...)`)
    # to a few selected types and methods from Base
    @test ExplicitImports.check_all_qualified_accesses_are_public(
        DistributedArrays;
        ignore = (
            # Base.Broadcast
            :AbstractArrayStyle,
            :DefaultArrayStyle,
            :broadcasted,
            :throwdm,
            # Base
            (VERSION < v"1.11" ? (Symbol("@propagate_inbounds"),) : ())...,
            :ReshapedArray,
            :Slice,
            :_all,
            :_any,
            :_mapreduce,
            :check_reducedims,
            :checkbounds_indices,
            :index_lengths,
            :mapreducedim!,
            :promote_op,
            :reducedim_initarray,
            :reindex,
            :setindex_shape_check,
            :unalias,
            # Serialization
            :serialize_type,
            # Statistics        
            :_mean,
        ),
    ) === nothing

    # No self-qualified accesses in DistributedArrays (ie. no `... DistributedArrays.func(...)`)
    @test ExplicitImports.check_no_self_qualified_accesses(DistributedArrays) === nothing
end


================================================
FILE: test/runtests.jl
================================================
using Test
using Distributed
using DistributedArrays

# Disable scalar indexing to avoid falling back on generic methods
# for AbstractArray
DistributedArrays.allowscalar(false)

# add at least 3 worker processes
if nworkers() < 3
    n = max(3, min(8, Sys.CPU_THREADS))
    addprocs(n; exeflags=`--check-bounds=yes`)
end
@assert nprocs() > 3
@assert nworkers() >= 3

@everywhere using Distributed
@everywhere using DistributedArrays
@everywhere using DistributedArrays.SPMD
@everywhere using Random
@everywhere using LinearAlgebra

@everywhere Random.seed!(1234 + myid())

const MYID = myid()
const OTHERIDS = filter(id-> id != MYID, procs())[rand(1:(nprocs()-1))]

function check_leaks()
    nrefs = @lock DistributedArrays.REFS.lock length(DistributedArrays.REFS.data)
    if !iszero(nrefs)
        sleep(0.1)  # allow time for any cleanup to complete and test again
        nrefs = @lock DistributedArrays.REFS.lock length(DistributedArrays.REFS.data)
        if !iszero(nrefs)
            @warn("Probable leak of ", nrefs, " darrays")
        end
    end
end

include("aqua.jl")
include("explicit_imports.jl")
include("darray.jl")
include("spmd.jl")


================================================
FILE: test/spmd.jl
================================================
@everywhere function spmd_test1()
    barrier(;tag=:b1)

    if myid() == 1
        @assert SPMD.recvfrom(2) == "Hello from 2"
        println("SPMD: Passed send/recv")
    elseif myid() == 2
        data = "Hello from 2"
        sendto(1, data)
    end

    stime = rand(1:5)
#    println("Sleeping for $stime seconds")
    sleep(stime)
    barrier(;tag=:b2)

    bcast_val = nothing
    if myid() == 1
        bcast_val = rand(2)
    end

    bcast_val = bcast(bcast_val, 1)

    if myid() == 1
        @assert bcast_val == SPMD.recvfrom(2)
        println("SPMD: Passed broadcast")
    elseif myid() == 2
        sendto(1, bcast_val)
    end

    barrier()

    scatter_data = nothing
    if myid() == 1
        scatter_data = rand(Int8, nprocs())
    end
    lp = scatter(scatter_data, 1, tag=1)

    if myid() == 1
        @assert scatter_data[2:2] == SPMD.recvfrom(2)
        println("SPMD: Passed scatter 1")
    elseif myid() == 2
        sendto(1, lp)
    end

    scatter_data = nothing
    if myid() == 1
        scatter_data = rand(Int8, nprocs()*2)
    end
    lp = scatter(scatter_data, 1, tag=2)

    if myid() == 1
        @assert scatter_data[3:4] == SPMD.recvfrom(2)
        println("SPMD: Passed scatter 2")
    elseif myid() == 2
        sendto(1, lp)
    end

    gathered_data = gather(myid(), 1, tag=3)
    if myid() == 1
        @assert gathered_data == procs()
        println("SPMD: Passed gather 1")
    end

    gathered_data = gather([myid(), myid()], 1, tag=4)
    if myid() == 1
        @assert gathered_data == [[p,p] for p in procs()]
        println("SPMD: Passed gather 2")
    end
end

spmd(spmd_test1)

# Test running only on the workers using the spmd function.

# define the function everywhere
@everywhere function foo_spmd(d_in, d_out, n)
    pids=sort(vec(procs(d_in)))
    pididx = findfirst(isequal(myid()), pids)
    mylp = localpart(d_in)
    localsum = 0

    # Have each node exchange data with its neighbors
    n_pididx = pididx+1 > length(pids) ? 1 : pididx+1
    p_pididx = pididx-1 < 1 ? length(pids) : pididx-1

#    println(p_pididx, " p", pids[p_pididx], " ", n_pididx, " p", pids[n_pididx])
#    println(mylp)

    for i in 1:n
        sendto(pids[n_pididx], mylp[2])
        sendto(pids[p_pididx], mylp[1])

        mylp[2] = SPMD.recvfrom(pids[p_pididx])
        mylp[1] = SPMD.recvfrom(pids[n_pididx])

#        println(mylp)

        barrier(;pids=pids)
        localsum = localsum + mylp[1] + mylp[2]
    end

    # finally store the sum in d_out
    d_out[:L] = localsum
end

# run foo_spmd on all workers, many of them, all concurrently using implicitly different contexts.
in_arrays = map(x->DArray(I->fill(myid(), (map(length,I)...,)), (nworkers(), 2), workers(), [nworkers(),1]), 1:8)
out_arrays = map(x->ddata(), 1:8)

@sync for i in 1:8
    @async spmd(foo_spmd, in_arrays[i], out_arrays[i], nworkers(); pids=workers())
end
for i in 1:8
    @test Any[sum(workers())*2 for i in 1:nworkers()] == gather(out_arrays[i])
end

println("SPMD: Passed testing of spmd function run concurrently")

# run concurrently with explicitly different contexts

# define the function everywhere
@everywhere function foo_spmd2(d_in, d_out, n)
    pids=sort(vec(procs(d_in)))
    pididx = findfirst(isequal(myid()), pids)
    mylp = localpart(d_in)

    # see if we have a value in the local store.
    store = context_local_storage()

    localsum = get!(store, :LOCALSUM, 0)

    # Have each node exchange data with its neighbors
    n_pididx = pididx+1 > length(pids) ? 1 : pididx+1
    p_pididx = pididx-1 < 1 ? length(pids) : pididx-1

    for i in 1:n
        sendto(pids[n_pididx], mylp[2])
        sendto(pids[p_pididx], mylp[1])

        mylp[2] = SPMD.recvfrom(pids[p_pididx])
        mylp[1] = SPMD.recvfrom(pids[n_pididx])

        barrier(;pids=pids)
        localsum = localsum + mylp[1] + mylp[2]
    end

    # finally store the sum in d_out
    d_out[:L] = localsum
    store[:LOCALSUM] = localsum
end


in_arrays = map(x->DArray(I->fill(myid(), (map(length,I)...,)), (nworkers(), 2), workers(), [nworkers(),1]), 1:8)
out_arrays = map(x->ddata(), 1:8)
contexts = map(x->context(workers()), 1:8)

@sync for i in 1:8
    @async spmd(foo_spmd2, in_arrays[i], out_arrays[i], nworkers(); pids=workers(), context=contexts[i])
end
# Second run will add the value stored in the previous run.
@sync for i in 1:8
    @async spmd(foo_spmd2, in_arrays[i], out_arrays[i], nworkers(); pids=workers(), context=contexts[i])
end

for i in 1:8
    @test Any[2*sum(workers())*2 for i in 1:nworkers()] == gather(out_arrays[i])
end

# verify localstores with appropriate context store values exist.
@everywhere begin
    if myid() != 1
        local n = 0
        @lock DistributedArrays.SPMD.CONTEXTS.lock begin
            for (k,v) in DistributedArrays.SPMD.CONTEXTS.data
                store = v.store
                localsum = store[:LOCALSUM]
                if localsum != 2*sum(workers())*2
                    println("localsum ", localsum, " != $(2*sum(workers())*2)")
                    error("localsum mismatch")
                end
                n += 1
            end
        end
        @assert n == 8
    end
end

# close the contexts
foreach(close, contexts)

# verify that the localstores have been deleted.
@everywhere begin
    @assert @lock DistributedArrays.SPMD.CONTEXTS.lock isempty(DistributedArrays.SPMD.CONTEXTS.data)
end

println("SPMD: Passed spmd function with explicit context run concurrently")