Repository: JuliaParallel/DistributedArrays.jl
Branch: master
Commit: db355b31aefd
Files: 30
Total size: 138.8 KB
Directory structure:
gitextract_34pth6or/
├── .github/
│   ├── dependabot.yml
│   └── workflows/
│       ├── CI.yml
│       ├── CompatHelper.yml
│       └── TagBot.yml
├── .gitignore
├── LICENSE.md
├── Project.toml
├── README.md
├── codecov.yml
├── docs/
│   ├── .gitignore
│   ├── Project.toml
│   ├── make.jl
│   └── src/
│       ├── api.md
│       └── index.md
├── ext/
│   ├── SparseArraysExt.jl
│   └── StatisticsExt.jl
├── src/
│   ├── DistributedArrays.jl
│   ├── broadcast.jl
│   ├── core.jl
│   ├── darray.jl
│   ├── linalg.jl
│   ├── mapreduce.jl
│   ├── serialize.jl
│   ├── sort.jl
│   └── spmd.jl
└── test/
    ├── aqua.jl
    ├── darray.jl
    ├── explicit_imports.jl
    ├── runtests.jl
    └── spmd.jl
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/dependabot.yml
================================================
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
version: 2
updates:
  - package-ecosystem: "github-actions"
    directory: "/" # Location of package manifests
    schedule:
      interval: "weekly"
================================================
FILE: .github/workflows/CI.yml
================================================
name: CI
on:
  pull_request:
    branches:
      - master
  push:
    branches:
      - master
    tags: '*'
  workflow_dispatch:
concurrency:
  # Skip intermediate builds: all builds except for builds on the `master` branch
  # Cancel intermediate builds: only pull request builds
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref != 'refs/heads/master' || github.run_number }}
  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
jobs:
  test:
    name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ github.event_name }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        version:
          - 'min'
          - 'lts'
          - '1'
          - 'pre'
        os:
          - ubuntu-latest
          - windows-latest
          - macOS-latest
    steps:
      - uses: actions/checkout@v6
      - uses: julia-actions/setup-julia@v3
        with:
          version: ${{ matrix.version }}
      - uses: julia-actions/cache@v3
      - uses: julia-actions/julia-buildpkg@v1
      - uses: julia-actions/julia-runtest@v1
      - uses: julia-actions/julia-processcoverage@v1
      - uses: codecov/codecov-action@v5
        with:
          files: lcov.info
          token: ${{ secrets.CODECOV_TOKEN }}
          fail_ci_if_error: true
  docs:
    name: Documentation
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - uses: julia-actions/setup-julia@v3
        with:
          version: '1'
      - uses: julia-actions/cache@v3
      - run: julia --project=docs -e 'import Pkg; Pkg.instantiate()'
      - run: |
          julia --project=docs -e '
            using Documenter: doctest
            using DistributedArrays
            doctest(DistributedArrays)'
      - run: julia --project=docs docs/make.jl
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
================================================
FILE: .github/workflows/CompatHelper.yml
================================================
name: CompatHelper
on:
  schedule:
    - cron: 0 0 * * *
  workflow_dispatch:
jobs:
  CompatHelper:
    runs-on: ubuntu-latest
    steps:
      - name: "Add the General registry via Git"
        run: |
          import Pkg
          ENV["JULIA_PKG_SERVER"] = ""
          Pkg.Registry.add("General")
        shell: julia --color=yes {0}
      - name: "Install CompatHelper"
        run: |
          import Pkg
          name = "CompatHelper"
          uuid = "aa819f21-2bde-4658-8897-bab36330d9b7"
          version = "3"
          Pkg.add(; name, uuid, version)
        shell: julia --color=yes {0}
      - name: "Run CompatHelper"
        run: |
          import CompatHelper
          CompatHelper.main(; subdirs = ["", "docs"])
        shell: julia --color=yes {0}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }}
================================================
FILE: .github/workflows/TagBot.yml
================================================
name: TagBot
on:
  issue_comment:
    types:
      - created
  workflow_dispatch:
jobs:
  TagBot:
    if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot'
    runs-on: ubuntu-latest
    steps:
      - uses: JuliaRegistries/TagBot@v1
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          ssh: ${{ secrets.DOCUMENTER_KEY }}
================================================
FILE: .gitignore
================================================
Manifest.toml
*.jl.cov
*.jl.mem
.DS_Store
.vscode/
================================================
FILE: LICENSE.md
================================================
The DistributedArrays.jl package is licensed under the MIT "Expat" License:
> Copyright (c) 2015: Julia Parallel Contributors
>
> Permission is hereby granted, free of charge, to any person obtaining
> a copy of this software and associated documentation files (the
> "Software"), to deal in the Software without restriction, including
> without limitation the rights to use, copy, modify, merge, publish,
> distribute, sublicense, and/or sell copies of the Software, and to
> permit persons to whom the Software is furnished to do so, subject to
> the following conditions:
>
> The above copyright notice and this permission notice shall be
> included in all copies or substantial portions of the Software.
>
> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
> IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
> CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
> TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
> SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
================================================
FILE: Project.toml
================================================
name = "DistributedArrays"
uuid = "aaf54ef3-cdf8-58ed-94cc-d582ad619b94"
version = "0.6.9"
[deps]
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Primes = "27ebfcd6-29c5-5fa9-bf4b-fb8fc14df3ae"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
[weakdeps]
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
[extensions]
SparseArraysExt = "SparseArrays"
StatisticsExt = "Statistics"
[compat]
Aqua = "0.8.12"
Distributed = "<0.0.1, 1"
ExplicitImports = "1.13.2"
LinearAlgebra = "<0.0.1, 1"
Primes = "0.4, 0.5"
Random = "<0.0.1, 1"
Serialization = "<0.0.1, 1"
SparseArrays = "<0.0.1, 1"
SpecialFunctions = "0.8, 1, 2"
Statistics = "<0.0.1, 1"
Test = "<0.0.1, 1"
julia = "1.10"
[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
ExplicitImports = "7d51a73a-1435-4ff3-83d9-f097790105c7"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[targets]
test = ["Aqua", "ExplicitImports", "SparseArrays", "SpecialFunctions", "Statistics", "Test"]
================================================
FILE: README.md
================================================
# DistributedArrays
*Distributed arrays for Julia.*
| **Documentation** | **Build Status** |
|:-------------------------------------------------------------------------:|:-------------------------------------------------------------:|
| [![][docs-stable-img]][docs-stable-url] [![][docs-dev-img]][docs-dev-url] | [![][ci-img]][ci-url] [![][codecov-img]][codecov-url] |
## Introduction
`DistributedArrays.jl` uses the stdlib [`Distributed`][distributed-docs] to implement a *Global Array* interface.
A `DArray` is distributed across a set of workers. Each worker can read and write from its local portion of the array and each worker has read-only access to the portions of the array held by other workers.
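A minimal sketch of typical usage (the worker count and array sizes here are arbitrary, and the printed index ranges depend on how the array is cut):
```julia
using Distributed
addprocs(4)                          # four local worker processes
@everywhere using DistributedArrays  # load the package everywhere

d = dzeros(100, 100)                 # a DArray distributed over the four workers
fetch(@spawnat workers()[1] localindices(d))  # index ranges owned by that worker, e.g. (1:50, 1:50)
```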
## Installation
The package can be installed with the Julia package manager.
From the Julia REPL, type `]` to enter the Pkg REPL mode and run:
```
pkg> add DistributedArrays
```
Or, equivalently, via the `Pkg` API:
```julia
julia> import Pkg; Pkg.add("DistributedArrays")
```
## Documentation
- [**STABLE**][docs-stable-url] — **documentation of the most recently tagged version.**
- [**DEVEL**][docs-dev-url] — *documentation of the in-development version.*
## Project Status
The package is tested against
Julia 1.10.0 (oldest supported Julia version),
the Julia LTS version,
the latest stable release of Julia,
and the pre-release version of Julia.
## Questions and Contributions
Usage questions can be posted on the [Julia Discourse forum][discourse-tag-url] under the `Parallel/Distributed` category, or in the #parallel channel of the [Julia Slack](https://julialang.org/community/).
Contributions are very welcome, as are feature requests and suggestions. Please open an [issue][issues-url] if you encounter any problems. In particular, additions to the documentation are encouraged!
[contrib-url]: https://juliadocs.github.io/Documenter.jl/latest/man/contributing/
[discourse-tag-url]: https://discourse.julialang.org/c/domain/parallel
[docs-dev-img]: https://img.shields.io/badge/docs-dev-blue.svg
[docs-dev-url]: https://juliaparallel.github.io/DistributedArrays.jl/dev
[docs-stable-img]: https://img.shields.io/badge/docs-stable-blue.svg
[docs-stable-url]: https://juliaparallel.github.io/DistributedArrays.jl/stable
[ci-img]: https://github.com/JuliaParallel/DistributedArrays.jl/actions/workflows/CI.yml/badge.svg?branch=master
[ci-url]: https://github.com/JuliaParallel/DistributedArrays.jl/actions/workflows/CI.yml
[codecov-img]: https://codecov.io/gh/JuliaParallel/DistributedArrays.jl/branch/master/graph/badge.svg
[codecov-url]: https://codecov.io/gh/JuliaParallel/DistributedArrays.jl
[issues-url]: https://github.com/JuliaParallel/DistributedArrays.jl/issues
[distributed-docs]: https://docs.julialang.org/en/v1/manual/parallel-computing/#Multi-Core-or-Distributed-Processing-1
================================================
FILE: codecov.yml
================================================
comment: off
================================================
FILE: docs/.gitignore
================================================
build/
================================================
FILE: docs/Project.toml
================================================
[deps]
DistributedArrays = "aaf54ef3-cdf8-58ed-94cc-d582ad619b94"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
[compat]
DistributedArrays = "0.6"
Documenter = "1"
[sources.DistributedArrays]
path = ".."
================================================
FILE: docs/make.jl
================================================
using Documenter, DistributedArrays

makedocs(
    modules = [DistributedArrays],
    format = Documenter.HTML(),
    sitename = "DistributedArrays.jl",
    pages = [
        "Introduction" => "index.md",
        "API" => "api.md",
    ],
    doctest = true,
)

deploydocs(
    repo = "github.com/JuliaParallel/DistributedArrays.jl.git",
)
================================================
FILE: docs/src/api.md
================================================
# API
```@autodocs
Modules = [DistributedArrays]
```
================================================
FILE: docs/src/index.md
================================================
# DistributedArrays.jl
```@contents
```
Distributed Arrays
------------------
Large computations are often organized around large arrays of data. In these
cases, a particularly natural way to obtain parallelism is to distribute arrays
among several processes. This combines the memory resources of multiple
machines, allowing use of arrays too large to fit on one machine. Each process
can read and write to the part of the array it owns and has read-only access to
the parts it doesn't own. This provides a ready answer to the question of how a
program should be divided among machines.
Julia distributed arrays are implemented by the `DArray` type. A
`DArray` has an element type and dimensions just like an `Array`.
A `DArray` can also use arbitrary array-like types to represent the local
chunks that store actual data. The data in a `DArray` is distributed by
dividing the index space into some number of blocks in each dimension.
Common kinds of arrays can be constructed with functions beginning with
`d`:
```julia
dzeros(100,100,10)
dones(100,100,10)
drand(100,100,10)
drandn(100,100,10)
dfill(x,100,100,10)
```
In the last case, each element will be initialized to the specified
value `x`. These functions automatically pick a distribution for you.
For more control, you can specify which processes to use, and how the
data should be distributed:
```julia
dzeros((100,100), workers()[1:4], [1,4])
```
The second argument specifies that the array should be created on the first
four workers. When dividing data among a large number of processes,
one often sees diminishing returns in performance. Placing `DArray`s
on a subset of processes allows multiple `DArray` computations to
happen at once, with a higher ratio of work to communication on each
process.
The third argument specifies a distribution; the nth element of
this array specifies how many pieces dimension n should be divided into.
In this example the first dimension will not be divided, and the second
dimension will be divided into 4 pieces. Therefore each local chunk will be
of size `(100,25)`. Note that the product of the distribution array must
equal the number of processes.
* `distribute(a::Array)` converts a local array to a distributed array.
* `localpart(d::DArray)` obtains the locally-stored portion
of a `DArray`.
* Localparts can also be retrieved and set via indexing syntax, using the
symbols `:L`, `:LP`, `:l`, and `:lp`, which are all equivalent. For example,
`d[:L]` returns the localpart of `d`, while `d[:L] = v` sets `v` as the
localpart of `d`.
* `localindices(a::DArray)` gives a tuple of the index ranges owned by the
local process.
* `convert(Array, a::DArray)` brings all the data to the local process.
Indexing a `DArray` (square brackets) with ranges of indices always
creates a `SubArray`, not copying any data.
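The following short session ties these functions together (an illustrative sketch; it assumes workers have already been added with `addprocs` and that `DistributedArrays` is loaded everywhere):
```julia
a = rand(8, 8)
d = distribute(a)   # distribute the local array over the workers
localindices(d)     # (1:0, 1:0) on the master, which owns no chunk
d[:L]               # the localpart: an empty array on the master
v = d[1:4, 1:8]     # indexing with ranges returns a SubArray view; no data is copied
Array(d) == a       # bring all the data back to the local process
```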
Constructing Distributed Arrays
-------------------------------
The primitive `DArray` constructor has the following somewhat elaborate signature:
```julia
DArray(init, dims[, procs, dist])
```
`init` is a function that accepts a tuple of index ranges. This function should
allocate a local chunk of the distributed array and initialize it for the specified
indices. `dims` is the overall size of the distributed array.
`procs` optionally specifies a vector of process IDs to use.
`dist` is an integer vector specifying how many chunks the
distributed array should be divided into in each dimension.
The last two arguments are optional, and defaults will be used if they
are omitted.
As an example, here is how to turn the local array constructor `fill`
into a distributed array constructor:
```julia
dfill(v, args...) = DArray(I->fill(v, map(length,I)), args...)
```
In this case the `init` function only needs to call `fill` with the
dimensions of the local piece it is creating.
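The `do`-block form of the same constructor is often convenient. As a sketch (assuming at least four workers), this creates a 100×100 array of zeros that is kept whole in the first dimension and split into four chunks along the second, one per worker:
```julia
d = DArray((100, 100), workers()[1:4], [1, 4]) do I
    # I is the tuple of index ranges covered by this local chunk
    zeros(map(length, I))
end
```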
`DArray`s can also be constructed from multidimensional `Array` comprehensions with
the `@DArray` macro syntax. This syntax is just sugar for the primitive `DArray` constructor:
```julia
julia> [i+j for i = 1:5, j = 1:5]
5×5 Matrix{Int64}:
 2  3  4  5   6
 3  4  5  6   7
 4  5  6  7   8
 5  6  7  8   9
 6  7  8  9  10

julia> @DArray [i+j for i = 1:5, j = 1:5]
5×5 DArray{Int64, 2, Matrix{Int64}}:
 2  3  4  5   6
 3  4  5  6   7
 4  5  6  7   8
 5  6  7  8   9
 6  7  8  9  10
```
### Construction from arrays generated on separate processes
`DArray`s can also be constructed from arrays that have been constructed on separate processes, as demonstrated below:
```julia
ras = [@spawnat p rand(30,30) for p in workers()[1:4]]
ras = reshape(ras,(2,2))
D = DArray(ras)
```
An alternative syntax is:
```julia
r1 = DistributedArrays.remotecall(() -> rand(10,10), workers()[1])
r2 = DistributedArrays.remotecall(() -> rand(10,10), workers()[2])
r3 = DistributedArrays.remotecall(() -> rand(10,10), workers()[3])
r4 = DistributedArrays.remotecall(() -> rand(10,10), workers()[4])
D = DArray(reshape([r1 r2 r3 r4], (2,2)))
```
The distribution of indices across workers can be checked with
```julia
[@fetchfrom p localindices(D) for p in workers()]
```
Distributed Array Operations
----------------------------
At this time, distributed arrays do not have much functionality. Their
major utility is allowing communication to be done via array indexing, which
is convenient for many problems. As an example, consider implementing the
"life" cellular automaton, where each cell in a grid is updated according
to its neighboring cells. To compute a chunk of the result of one iteration,
each process needs the immediate neighbor cells of its local chunk. The
following code accomplishes this:
```julia
function life_step(d::DArray)
    DArray(size(d), procs(d)) do I
        top   = mod(first(I[1])-2, size(d,1)) + 1
        bot   = mod( last(I[1])  , size(d,1)) + 1
        left  = mod(first(I[2])-2, size(d,2)) + 1
        right = mod( last(I[2])  , size(d,2)) + 1
        old = Array{Bool}(undef, length(I[1])+2, length(I[2])+2)
        old[1      , 1      ] = d[top , left]   # left side
        old[2:end-1, 1      ] = d[I[1], left]
        old[end    , 1      ] = d[bot , left]
        old[1      , 2:end-1] = d[top , I[2]]
        old[2:end-1, 2:end-1] = d[I[1], I[2]]   # middle
        old[end    , 2:end-1] = d[bot , I[2]]
        old[1      , end    ] = d[top , right]  # right side
        old[2:end-1, end    ] = d[I[1], right]
        old[end    , end    ] = d[bot , right]
        life_rule(old)
    end
end
```
As you can see, we use a series of indexing expressions to fetch
data into a local array `old`. Note that the `do` block syntax is
convenient for passing `init` functions to the `DArray` constructor.
Next, the serial function `life_rule` is called to apply the update rules
to the data, yielding the needed `DArray` chunk. Nothing about `life_rule`
is `DArray`-specific, but we list it here for completeness:
```julia
function life_rule(old)
    m, n = size(old)
    new = similar(old, m-2, n-2)
    for j = 2:n-1
        for i = 2:m-1
            nc = +(old[i-1,j-1], old[i-1,j], old[i-1,j+1],
                   old[i  ,j-1],             old[i  ,j+1],
                   old[i+1,j-1], old[i+1,j], old[i+1,j+1])
            new[i-1,j-1] = (nc == 3 || nc == 2 && old[i,j])
        end
    end
    new
end
```
Numerical Results of Distributed Computations
---------------------------------------------
Floating-point arithmetic is not associative, and this comes up
when performing distributed computations over `DArray`s. All `DArray`
operations are performed over the `localpart` chunks and then aggregated.
This change in the ordering of the operations changes the numeric result,
as seen in this simple example:
```julia
julia> using Distributed
julia> addprocs(8);
julia> using DistributedArrays
julia> A = fill(1.1, (100,100));
julia> sum(A)
11000.000000000013
julia> DA = distribute(A);
julia> sum(DA)
11000.000000000127
julia> sum(A) == sum(DA)
false
```
The ultimate ordering of operations will be dependent on how the `Array` is distributed.
Garbage Collection and `DArray`s
------------------------------
When a `DArray` is constructed (typically on the master process), the returned `DArray` object stores information on how the
array is distributed, which processor holds which indices, and so on. When the `DArray` object
on the master process is garbage collected, all participating workers are notified and the
localparts of the `DArray` are freed on each worker.

Since the `DArray` object itself is small, the garbage collector on the master faces no memory pressure to
collect it promptly. This delays the release of memory on the participating workers.
Therefore it is highly recommended to explicitly call `close(d::DArray)` as soon as user code
has finished working with the distributed array.

Note also that the localparts of a `DArray` are freed on all participating workers only
when the `DArray` object is collected on the process that created it. It is therefore important to maintain
a reference to a `DArray` object on the creating process for as long as it is being computed upon.

`d_closeall()` is another useful function to manage distributed memory. It releases all `DArrays` created from
the calling process, including any temporaries created during computation.
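For example, a sketch of the recommended pattern:
```julia
d = dzeros(10_000, 10_000)
s = sum(d)       # compute with the distributed array
close(d)         # eagerly free the localparts on all workers

# or release every DArray created from this process, including temporaries:
d_closeall()
```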
Working with distributed non-array data
---------------------------------------
The function `ddata(;T::Type=Any, init::Function=I->nothing, pids=workers(), data::Vector=[])` can be used
to create a distributed vector whose localparts need not be arrays.
It returns a `DArray{T,1,T}`, i.e., the element type and localtype of the array are the same.

`ddata()` constructs a distributed vector of length `nworkers()` where each localpart can hold any value,
initially set to `nothing`.

The argument `data`, if supplied, is distributed over the `pids`. `length(data)` must be a multiple of `length(pids)`.
If the multiple is 1, it returns a `DArray{T,1,T}` where `T` is `eltype(data)`. If the multiple is greater than 1,
it returns a `DArray{T,1,Array{T,1}}`, i.e., it is equivalent to calling `distribute(data)`.

`gather(d::DArray{T,1,T}) where T` returns an `Array{T,1}` consisting of all distributed elements of `d`.

Given a `DArray{T,1,T}` object `d`, `d[:L]` returns the localpart on a worker. `d[i]` returns the localpart
on the `i`th worker that `d` is distributed over.
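An illustrative sketch (assuming `DistributedArrays` is loaded on all workers):
```julia
# one localpart per worker, each holding that worker's id
d = ddata(; T=Int, init=I->myid())
gather(d)        # e.g. [2, 3, 4, 5] with four workers

# distribute one element of `data` per worker
d2 = ddata(; data=collect(1:nworkers()))
d2[1]            # the value stored on the first worker: 1
```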
SPMD Mode (an MPI-style mode with MPI-like primitives)
-------------------------------------------------------
SPMD, i.e., Single Program Multiple Data, is implemented by the submodule `DistributedArrays.SPMD`. In this mode the same function is executed in parallel on all participating nodes, the typical style of MPI programs where the same program runs on every processor. A basic subset of MPI-like primitives is currently supported. As a programming model it should be familiar to anyone with an MPI background.
The same block of code is executed concurrently on all workers using the `spmd` function.
```julia
# define foo() on all workers
@everywhere function foo(arg1, arg2)
    ....
end

# call foo() everywhere using the `spmd` function
d_in = DArray(.....)
d_out = ddata()
spmd(foo, d_in, d_out; pids=workers()) # executes on all workers
```
`spmd` is defined as `spmd(f, args...; pids=procs(), context=nothing)`
`args` is one or more arguments to be passed to `f`. `pids` identifies the workers
that `f` needs to be run on. `context` identifies a run context, which is explained
later.
The following primitives can be used in SPMD mode.
- `sendto(pid, data; tag=nothing)` - sends `data` to `pid`
- `recvfrom(pid; tag=nothing)` - receives data from `pid`
- `recvfrom_any(; tag=nothing)` - receives data from any `pid`
- `barrier(;pids=procs(), tag=nothing)` - all tasks wait and then proceed
- `bcast(data, pid; tag=nothing, pids=procs())` - broadcasts the same data over `pids` from `pid`
- `scatter(x, pid; tag=nothing, pids=procs())` - distributes `x` over `pids` from `pid`
- `gather(x, pid; tag=nothing, pids=procs())` - collects data from `pids` onto worker `pid`
The `tag` keyword should be used to differentiate between consecutive calls of the same type, for example,
consecutive `bcast` calls.

`spmd` and the SPMD-related functions are defined in the submodule `DistributedArrays.SPMD`. You will need to
import it explicitly, or prefix functions that can only be used in SPMD mode with `SPMD.`, for example,
`SPMD.sendto`.
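For instance, a sketch of using tags inside a function run under `spmd` to keep two back-to-back exchanges with the same peer separate (`other`, `x` and `y` are placeholders, not part of the API):
```julia
# inside a function executed via spmd()
sendto(other, x; tag=:first)
sendto(other, y; tag=:second)
a = recvfrom(other; tag=:first)
b = recvfrom(other; tag=:second)
```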
Example
-------
This toy example exchanges data with each of its neighbors `n` times.
```julia
using Distributed
using DistributedArrays
addprocs(8)
@everywhere using DistributedArrays
@everywhere using DistributedArrays.SPMD

d_in = d = DArray(I->fill(myid(), (map(length,I)...,)), (nworkers(), 2), workers(), [nworkers(), 1])
d_out = ddata()

# define the function everywhere
@everywhere function foo_spmd(d_in, d_out, n)
    pids = sort(vec(procs(d_in)))
    pididx = findfirst(isequal(myid()), pids)
    mylp = d_in[:L]
    localsum = 0

    # Have each worker exchange data with its neighbors
    n_pididx = pididx+1 > length(pids) ? 1 : pididx+1
    p_pididx = pididx-1 < 1 ? length(pids) : pididx-1

    for i in 1:n
        sendto(pids[n_pididx], mylp[2])
        sendto(pids[p_pididx], mylp[1])
        mylp[2] = recvfrom(pids[p_pididx])
        mylp[1] = recvfrom(pids[n_pididx])
        barrier(; pids=pids)
        localsum = localsum + mylp[1] + mylp[2]
    end

    # finally store the sum in d_out
    d_out[:L] = localsum
end

# run foo_spmd on all workers
spmd(foo_spmd, d_in, d_out, 10; pids=workers())

# print values of d_in and d_out after the run
println(d_in)
println(d_out)
```
SPMD Context
------------
Each SPMD run is implicitly executed in a different context. This allows multiple `spmd` calls to
be active at the same time. An SPMD context can be explicitly specified via the keyword argument `context` to `spmd`.

`context(pids=procs())` returns a new SPMD context.

An SPMD context also provides context-local storage, a dictionary that can be used to store
key-value pairs between `spmd` runs under the same context.

`context_local_storage()` returns the dictionary associated with the context.

NOTE: Implicitly defined contexts, i.e., `spmd` calls without a `context` argument, create a context
that lives only for the duration of the call. Explicitly created context objects can be released
early by calling `close(ctxt::SPMDContext)`, which releases the local storage dictionaries
on all participating `pids`. Otherwise they are released when the context object is garbage collected
on the node that created it.
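A sketch of working with an explicit context, following the `context(pids=procs())` signature above (`foo_spmd`, `d_in` and `d_out` stand for any SPMD function and its arguments):
```julia
ctx = context(workers())

# two runs under the same context; context-local storage persists between them
spmd(foo_spmd, d_in, d_out, 10; pids=workers(), context=ctx)
spmd(foo_spmd, d_in, d_out, 10; pids=workers(), context=ctx)

# inside foo_spmd, per-context state can be kept via:
#     store = context_local_storage()    # the Dict local to this context
#     store[:iters] = get(store, :iters, 0) + 10

close(ctx)   # release the context's storage on all participating pids
```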
Nested `spmd` calls
-------------------
As `spmd` executes the specified function on all participating nodes, we need to be careful when nesting `spmd` calls.
An example of an unsafe (wrong) way:
```julia
function foo(.....)
    ......
    spmd(bar, ......)
    ......
end

function bar(....)
    ......
    spmd(baz, ......)
    ......
end

spmd(foo, ....)
```
In the above example, `foo`, `bar` and `baz` all wish to leverage distributed computation, yet each may itself already be running as part of an `spmd` call. A safe way to handle such a scenario is to only drive parallel computation from the master process.
The correct way (only have the driver process initiate `spmd` calls):
```julia
function foo()
    ......
    myid()==1 && spmd(bar, ......)
    ......
end

function bar()
    ......
    myid()==1 && spmd(baz, ......)
    ......
end

spmd(foo, ....)
```
This is also true of functions which automatically distribute computation on DArrays.
```julia
function foo(d::DArray)
    ......
    myid()==1 && map!(bar, d)
    ......
end

spmd(foo, ....)
```
Without the `myid()` check, the `spmd` call to `foo` would execute `map!` from all nodes, which is probably not what we want.
Similarly, `@everywhere` from within an SPMD run should also be driven from the master node only.
================================================
FILE: ext/SparseArraysExt.jl
================================================
module SparseArraysExt
using DistributedArrays: DArray, SubDArray, SubOrDArray, localpart
using DistributedArrays.Distributed: remotecall_fetch
using SparseArrays: SparseArrays, nnz
function SparseArrays.nnz(A::DArray)
    B = asyncmap(A.pids) do p
        remotecall_fetch(nnz∘localpart, p, A)
    end
    return reduce(+, B)
end
# Fix method ambiguities
# TODO: Improve efficiency?
Base.copyto!(dest::SubOrDArray{<:Any,2}, src::SparseArrays.AbstractSparseMatrixCSC) = copyto!(dest, Matrix(src))
@static if isdefined(SparseArrays, :CHOLMOD)
    Base.copyto!(dest::SubOrDArray, src::SparseArrays.CHOLMOD.Dense) = copyto!(dest, Array(src))
    Base.copyto!(dest::SubOrDArray{T}, src::SparseArrays.CHOLMOD.Dense{T}) where {T<:Union{Float32,Float64,ComplexF32,ComplexF64}} = copyto!(dest, Array(src))
    Base.copyto!(dest::SubOrDArray{T,2}, src::SparseArrays.CHOLMOD.Dense{T}) where {T<:Union{Float32,Float64,ComplexF32,ComplexF64}} = copyto!(dest, Array(src))
end
# Fix method ambiguities
for T in (:DArray, :SubDArray)
    @eval begin
        Base.:(==)(d1::$T{<:Any,1}, d2::SparseArrays.ReadOnly) = d1 == parent(d2)
        Base.:(==)(d1::SparseArrays.ReadOnly, d2::$T{<:Any,1}) = parent(d1) == d2
    end
end
end
================================================
FILE: ext/StatisticsExt.jl
================================================
module StatisticsExt
using DistributedArrays: DArray
using Statistics: Statistics
Statistics._mean(f, A::DArray, region) = sum(f, A, dims = region) ./ prod((size(A, i) for i in region))
end
================================================
FILE: src/DistributedArrays.jl
================================================
module DistributedArrays
using Base: Callable
using Base.Broadcast: BroadcastStyle, Broadcasted
using Distributed: Distributed, RemoteChannel, Future, myid, nworkers, procs, remotecall, remotecall_fetch, remotecall_wait, worker_id_from_socket, workers
using LinearAlgebra: LinearAlgebra, Adjoint, Diagonal, I, Transpose, adjoint, adjoint!, axpy!, dot, lmul!, mul!, norm, rmul!, transpose, transpose!
using Random: Random, rand!
using Serialization: Serialization, AbstractSerializer, deserialize, serialize
using Primes: factor
# DArray exports
export DArray, SubDArray, SubOrDArray, @DArray
export dzeros, dones, dfill, drand, drandn, distribute, localpart, localindices, ppeval
# non-array distributed data
export ddata, gather
# immediate release of localparts
export d_closeall
include("darray.jl")
include("core.jl")
include("serialize.jl")
include("broadcast.jl")
include("mapreduce.jl")
include("linalg.jl")
include("sort.jl")
include("spmd.jl")
export SPMD
end # module
================================================
FILE: src/broadcast.jl
================================================
###
# Distributed broadcast implementation
##
# We define a custom ArrayStyle here since we need to keep track of
# the fact that it is Distributed and what kind of underlying broadcast behaviour
# we will encounter.
struct DArrayStyle{Style <: Union{Nothing,BroadcastStyle}} <: Broadcast.AbstractArrayStyle{Any} end
DArrayStyle(::S) where {S} = DArrayStyle{S}()
DArrayStyle(::S, ::Val{N}) where {S,N} = DArrayStyle(S(Val(N)))
DArrayStyle(::Val{N}) where N = DArrayStyle{Broadcast.DefaultArrayStyle{N}}()
Broadcast.BroadcastStyle(::Type{<:DArray{<:Any, N, A}}) where {N, A} = DArrayStyle(BroadcastStyle(A), Val(N))
# promotion rules
# TODO: test this
function Broadcast.BroadcastStyle(::DArrayStyle{AStyle}, ::DArrayStyle{BStyle}) where {AStyle, BStyle}
    DArrayStyle(BroadcastStyle(AStyle, BStyle))
end
function Broadcast.broadcasted(::DArrayStyle{Style}, f, args...) where Style
    inner = Broadcast.broadcasted(Style(), f, args...)
    if inner isa Broadcasted
        return Broadcasted{DArrayStyle{Style}}(inner.f, inner.args, inner.axes)
    else # eagerly evaluated
        return inner
    end
end
# # deal with one layer deep lazy arrays
# BroadcastStyle(::Type{<:LinearAlgebra.Transpose{<:Any,T}}) where T <: DArray = BroadcastStyle(T)
# BroadcastStyle(::Type{<:LinearAlgebra.Adjoint{<:Any,T}}) where T <: DArray = BroadcastStyle(T)
# BroadcastStyle(::Type{<:SubArray{<:Any,<:Any,<:T}}) where T <: DArray = BroadcastStyle(T)
# # This Union is a hack. Ideally Base would have a Transpose <: WrappedArray <: AbstractArray
# # and we could define our methods in terms of Union{DArray, WrappedArray{<:Any, <:DArray}}
# const DDestArray = Union{DArray,
# LinearAlgebra.Transpose{<:Any,<:DArray},
# LinearAlgebra.Adjoint{<:Any,<:DArray},
# SubArray{<:Any, <:Any, <:DArray}}
const DDestArray = DArray
# This method is responsible for selecting the output type of broadcast
function Base.similar(bc::Broadcasted{<:DArrayStyle{Style}}, ::Type{ElType}) where {Style, ElType}
    DArray(map(length, axes(bc))) do I
        # create a fake Broadcasted for the underlying ArrayStyle
        bc′ = Broadcasted{Style}(identity, (), map(length, I))
        similar(bc′, ElType)
    end
end
##
# Ref https://docs.julialang.org/en/v1/manual/interfaces/#extending-in-place-broadcast-2
#
# We purposefully only specialise `copyto!`, with a broadcast implementation that
# defers to the underlying BroadcastStyle. We can't assume that `getindex` is fast,
# and furthermore we can't assume that the distribution of DArray arguments across
# workers is equal or that the underlying array type is consistent.
#
# Implementation:
# - first distribute all arguments
#   - Q: How do we decide on the cuts?
# - then localise the arguments on each node
##
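# Illustrative flow (a hedged sketch, not part of the implementation):
#
#     a = rand(10); d = distribute(a)
#     d .= d .+ a   # `a` is distributed via `bcdistribute`; on each worker
#                   # `bclocal` materialises the matching local views and the
#                   # final `copyto!` runs on localpart(d)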
@inline function Base.copyto!(dest::DDestArray, bc::Broadcasted{Nothing})
    axes(dest) == axes(bc) || Broadcast.throwdm(axes(dest), axes(bc))
    # Distribute Broadcasted
    # This will turn local AbstractArrays into DArrays
    dbc = bcdistribute(bc)
    @sync for p in procs(dest)
        @async remotecall_wait(p) do
            # get the indices for the localpart
            lpidx = localpartindex(dest)
            @assert lpidx != 0
            # create a local version of the broadcast, by constructing views
            # Note: creates copies of the argument
            lbc = bclocal(dbc, dest.indices[lpidx])
            copyto!(localpart(dest), lbc)
        end
    end
    return dest
end
# Test
# a = Array
# a .= DArray(x,y)
@inline function Base.copy(bc::Broadcasted{<:DArrayStyle})
    dbc = bcdistribute(bc)
    # TODO: teach DArray about axes since this is wrong for OffsetArrays
    DArray(map(length, axes(bc))) do I
        lbc = bclocal(dbc, I)
        copy(lbc)
    end
end
# _bcview takes the shape of a broadcasted argument and the indices of a view,
# and produces the indices into that argument that constitute its part of the
# broadcast; it is, in a sense, the inverse of _bcs in Base.Broadcast
_bcview(::Tuple{}, ::Tuple{}) = ()
_bcview(::Tuple{}, view::Tuple) = ()
_bcview(shape::Tuple, ::Tuple{}) = (shape[1], _bcview(tail(shape), ())...)
function _bcview(shape::Tuple, view::Tuple)
    return (_bcview1(shape[1], view[1]), _bcview(tail(shape), tail(view))...)
end

# _bcview1 handles the logic for a single dimension
function _bcview1(a, b)
    if a == 1 || a == 1:1
        return 1:1
    elseif first(a) <= first(b) <= last(a) &&
           first(a) <= last(b) <= last(a)
        return b
    else
        throw(DimensionMismatch("broadcast view could not be constructed"))
    end
end
# Distribute broadcast
# TODO: How to decide on cuts
@inline bcdistribute(bc::Broadcasted{Style}) where Style<:Union{Nothing,BroadcastStyle} = Broadcasted{DArrayStyle{Style}}(bc.f, bcdistribute_args(bc.args), bc.axes)
@inline bcdistribute(bc::Broadcasted{Style}) where Style<:DArrayStyle = Broadcasted{Style}(bc.f, bcdistribute_args(bc.args), bc.axes)
# ask BroadcastStyle to decide if argument is in need of being distributed
bcdistribute(x::T) where T = _bcdistribute(BroadcastStyle(T), x)
_bcdistribute(::DArrayStyle, x) = x
# Don't bother distributing singletons
_bcdistribute(::Broadcast.AbstractArrayStyle{0}, x) = x
_bcdistribute(::Broadcast.AbstractArrayStyle, x) = distribute(x)
_bcdistribute(::Any, x) = x
@inline bcdistribute_args(args::Tuple) = (bcdistribute(args[1]), bcdistribute_args(tail(args))...)
bcdistribute_args(args::Tuple{Any}) = (bcdistribute(args[1]),)
bcdistribute_args(args::Tuple{}) = ()
# dropping axes here since recomputing is easier
@inline bclocal(bc::Broadcasted{DArrayStyle{Style}}, idxs) where Style<:Union{Nothing,BroadcastStyle} = Broadcasted{Style}(bc.f, bclocal_args(_bcview(axes(bc), idxs), bc.args))
# bclocal creates a view of the data and then copies it over,
# except when the data is already local
function bclocal(x::DArray{T, N, AT}, idxs) where {T, N, AT}
    bcidxs = _bcview(axes(x), idxs)
    makelocal(x, bcidxs...)
end
bclocal(x, idxs) = x
@inline bclocal_args(idxs, args::Tuple) = (bclocal(args[1], idxs), bclocal_args(idxs, tail(args))...)
bclocal_args(idxs, args::Tuple{Any}) = (bclocal(args[1], idxs),)
bclocal_args(idxs, args::Tuple{}) = ()
================================================
FILE: src/core.jl
================================================
# Thread-safe registry of DArray references
struct DArrayRegistry
    data::Dict{Tuple{Int,Int}, Any}
    lock::ReentrantLock
    DArrayRegistry() = new(Dict{Tuple{Int,Int}, Any}(), ReentrantLock())
end

const REGISTRY = DArrayRegistry()

function Base.get(r::DArrayRegistry, id::Tuple{Int,Int}, default)
    @lock r.lock begin
        return get(r.data, id, default)
    end
end

function Base.getindex(r::DArrayRegistry, id::Tuple{Int,Int})
    @lock r.lock begin
        return r.data[id]
    end
end

function Base.setindex!(r::DArrayRegistry, val, id::Tuple{Int,Int})
    @lock r.lock begin
        r.data[id] = val
    end
    return r
end

function Base.delete!(r::DArrayRegistry, id::Tuple{Int,Int})
    @lock r.lock delete!(r.data, id)
    return r
end

# Thread-safe set of IDs of DArrays created on this node
struct DArrayRefs
    data::Set{Tuple{Int,Int}}
    lock::ReentrantLock
    DArrayRefs() = new(Set{Tuple{Int,Int}}(), ReentrantLock())
end

const REFS = DArrayRefs()

function Base.push!(r::DArrayRefs, id::Tuple{Int,Int})
    # Ensure id refers to a DArray created on this node
    if first(id) != myid()
        throw(
            ArgumentError(
                lazy"`DArray` is not created on the current worker: Only `DArray`s created on worker $(myid()) can be stored in this set but the `DArray` was created on worker $(first(id))."))
    end
    @lock r.lock begin
        return push!(r.data, id)
    end
end

function Base.delete!(r::DArrayRefs, id::Tuple{Int,Int})
    @lock r.lock delete!(r.data, id)
    return r
end
# Global counter to generate a unique ID for each DArray
const DID = Threads.Atomic{Int}(1)
"""
next_did()
Increment a global counter and return a tuple of the current worker ID and the incremented
value of the counter.
This tuple is used as a unique ID for a new `DArray`.
"""
next_did() = (myid(), Threads.atomic_add!(DID, 1))
release_localpart(id::Tuple{Int,Int}) = (delete!(REGISTRY, id); nothing)
function release_allparts(id::Tuple{Int,Int}, pids::Array{Int})
    @sync begin
        released_myid = false
        for p in pids
            if p == myid()
                @async release_localpart(id)
                released_myid = true
            else
                @async remotecall_fetch(release_localpart, p, id)
            end
        end
        if !released_myid
            @async release_localpart(id)
        end
    end
    return nothing
end

function close_by_id(id::Tuple{Int,Int}, pids::Array{Int})
    release_allparts(id, pids)
    delete!(REFS, id)
    nothing
end

function d_closeall()
    @lock REFS.lock begin
        while !isempty(REFS.data)
            id = pop!(REFS.data)
            d = d_from_weakref_or_d(id)
            if d isa DArray
                finalize(d)
            end
        end
    end
    return nothing
end
Base.close(d::DArray) = finalize(d)
"""
procs(d::DArray)
Get the vector of processes storing pieces of DArray `d`.
"""
Distributed.procs(d::DArray) = d.pids
Distributed.procs(d::SubDArray) = procs(parent(d))
"""
localpart(A)
The identity when input is not distributed
"""
localpart(A) = A
================================================
FILE: src/darray.jl
================================================
"""
DArray(init, dims, [procs, dist])
Construct a distributed array.
The parameter `init` is a function that accepts a tuple of index ranges.
This function should allocate a local chunk of the distributed array and initialize it for the specified indices.
`dims` is the overall size of the distributed array.
`procs` optionally specifies a vector of process IDs to use.
If unspecified, the array is distributed over all worker processes only. Typically, when running in distributed mode,
i.e., nprocs() > 1, this would mean that no chunk of the distributed array exists on the process hosting the
interactive julia prompt.
`dist` is an integer vector specifying how many chunks the distributed array should be divided into in each dimension.
For example, the `dfill` function that creates a distributed array and fills it with a value `v` is implemented as:
### Example
```jl
dfill(v, args...) = DArray(I->fill(v, map(length,I)), args...)
```
"""
mutable struct DArray{T,N,A} <: AbstractArray{T,N}
id::Tuple{Int,Int}
dims::NTuple{N,Int}
pids::Array{Int,N} # pids[i]==p ⇒ processor p has piece i
indices::Array{NTuple{N,UnitRange{Int}},N} # indices held by piece i
cuts::Vector{Vector{Int}} # cuts[d][i] = first index of chunk i in dimension d
localpart::Union{A,Nothing}
function DArray{T,N,A}(id::Tuple{Int,Int}, dims::NTuple{N,Int}, pids, indices, cuts, lp) where {T,N,A}
# check invariants
if dims != map(last, last(indices))
throw(ArgumentError("dimension of DArray (dim) and indices do not match"))
end
d = d_from_weakref_or_d(id)
if d === nothing
d = new(id, dims, pids, indices, cuts, lp)
end
if first(id) == myid()
push!(REFS, id)
REGISTRY[id] = WeakRef(d)
finalizer(d) do d
@async close_by_id(d.id, d.pids)
end
end
d
end
DArray{T,N,A}() where {T,N,A} = new()
end
unpack_weakref(x) = x
unpack_weakref(x::WeakRef) = x.value
d_from_weakref_or_d(id::Tuple{Int,Int}) = unpack_weakref(get(REGISTRY, id, nothing))
Base.eltype(::Type{DArray{T}}) where {T} = T
empty_localpart(T,N,A) = A(Array{T}(undef, ntuple(zero, N)))
const SubDArray{T,N,D<:DArray} = SubArray{T,N,D}
const SubOrDArray{T,N} = Union{DArray{T,N}, SubDArray{T,N}}
localtype(::Type{DArray{T,N,S}}) where {T,N,S} = S
localtype(::Type{SubDArray{T,N,D}}) where {T,N,D} = localtype(D)
localtype(A::SubOrDArray) = localtype(typeof(A))
localtype(A::AbstractArray) = typeof(A)
Base.hash(d::DArray, h::UInt) = Base.hash(d.id, h)
## core constructors ##
function DArray(id::Tuple{Int,Int}, init::I, dims, pids, idxs, cuts) where {I}
localtypes = Vector{DataType}(undef,length(pids))
if init isa Function
asyncmap!(localtypes, pids) do pid
return remotecall_fetch(construct_localparts, pid, init, id, dims, pids, idxs, cuts)
end
else
asyncmap!(localtypes, pids, init) do pid, pid_init
# constructing from an array of remote refs.
return remotecall_fetch(construct_localparts, pid, pid_init, id, dims, pids, idxs, cuts)
end
end
if !allequal(localtypes)
@sync for p in pids
@async remotecall_wait(release_localpart, p, id)
end
throw(ErrorException(lazy"Constructed localparts have different `eltype`: $(localtypes)"))
end
A = first(localtypes)
if myid() in pids
return unpack_weakref(REGISTRY[id])
else
T = eltype(A)
N = length(dims)
return DArray{T,N,A}(id, dims, pids, idxs, cuts, empty_localpart(T,N,A))
end
end
function construct_localparts(init, id, dims, pids, idxs, cuts; T=nothing, A=nothing)
    localpart = isa(init, Function) ? init(idxs[localpartindex(pids)]) : fetch(init)
    if A === nothing
        A = typeof(localpart)
    end
    if T === nothing
        T = eltype(A)
    end
    N = length(dims)
    d = DArray{T,N,A}(id, dims, pids, idxs, cuts, localpart)
    REGISTRY[id] = d
    A
end
function ddata(;T::Type=Any, init::Function=I->nothing, pids=workers(), data::Vector=[])
pids=sort(vec(pids))
id = next_did()
npids = length(pids)
ldata = length(data)
idxs, cuts = chunk_idxs([npids], [npids])
if ldata > 0
@assert rem(ldata,npids) == 0
if ldata == npids
T = eltype(data)
s = DestinationSerializer(pididx->data[pididx], pids)
init = I->localpart(s)
else
# call the standard distribute function
return distribute(data)
end
end
@sync for p in pids
@async remotecall_wait(construct_localparts, p, init, id, (npids,), pids, idxs, cuts; T=T, A=T)
end
if myid() in pids
return unpack_weakref(REGISTRY[id])
else
return DArray{T,1,T}(id, (npids,), pids, idxs, cuts, nothing)
end
end
function gather(d::DArray{T,1,T}) where T
pids = procs(d)
a = Vector{T}(undef, length(pids))
asyncmap!(a, pids) do p
remotecall_fetch(localpart, p, d)
end
a
end
function DArray(init, dims, procs, dist)
np = prod(dist)
procs = reshape(procs[1:np], ntuple(i->dist[i], length(dist)))
idxs, cuts = chunk_idxs([dims...], dist)
id = next_did()
return DArray(id, init, dims, procs, idxs, cuts)
end
function DArray(init, dims, procs)
if isempty(procs)
throw(ArgumentError("no processors given"))
end
return DArray(init, dims, procs, defaultdist(dims, procs))
end
DArray(init, dims) = DArray(init, dims, workers()[1:min(nworkers(), maximum(dims))])
# Create a DArray from a collection of references
# The refs must have the same layout as the parts distributed.
# i.e.
# size(refs) must specify the distribution of dimensions across processors
# prod(size(refs)) must equal number of parts
# FIXME : Empty parts are currently not supported.
function DArray(refs)
dimdist = size(refs)
id = next_did()
nsizes = Array{Tuple}(undef, dimdist)
asyncmap!(nsizes, refs) do r
remotecall_fetch(sz_localpart_ref, r.where, r, id)
end
nindices = Array{NTuple{length(dimdist),UnitRange{Int}}}(undef, dimdist...)
for i in 1:length(nindices)
subidx = CartesianIndices(dimdist)[i]
nindices[i] = ntuple(length(subidx)) do x
idx_in_dim = subidx[x]
startidx = 1
for j in 1:(idx_in_dim-1)
prevsubidx = ntuple(y -> y == x ? j : subidx[y], length(subidx))
prevsize = nsizes[prevsubidx...]
startidx += prevsize[x]
end
startidx:startidx+(nsizes[i][x])-1
end
end
lastidxs = hcat([Int[last(idx_in_d)+1 for idx_in_d in idx] for idx in nindices]...)
ncuts = Array{Int,1}[pushfirst!(sort(unique(lastidxs[x,:])), 1) for x in 1:length(dimdist)]
ndims = tuple([sort(unique(lastidxs[x,:]))[end]-1 for x in 1:length(dimdist)]...)
DArray(id, refs, ndims, map(r -> r.where, refs), nindices, ncuts)
end
macro DArray(ex0::Expr)
if ex0.head !== :comprehension
throw(ArgumentError("invalid @DArray syntax"))
end
ex = ex0.args[1]
if ex.head !== :generator
throw(ArgumentError("invalid @DArray syntax"))
end
ex.args[1] = esc(ex.args[1])
ndim = length(ex.args) - 1
ranges = map(r->esc(r.args[2]), ex.args[2:end])
for d = 1:ndim
var = ex.args[d+1].args[1]
ex.args[d+1] = :( $(esc(var)) = ($(ranges[d]))[I[$d]] )
end
return :( DArray((I::Tuple{Vararg{UnitRange{Int}}})->($ex0),
tuple($(map(r->:(length($r)), ranges)...))) )
end
# new DArray similar to an existing one
DArray(init, d::DArray) = DArray(next_did(), init, size(d), procs(d), d.indices, d.cuts)
sz_localpart_ref(ref, id) = size(fetch(ref))
Base.similar(d::DArray, T::Type, dims::Dims) = DArray(I->Array{T}(undef, map(length,I)), dims, procs(d))
Base.similar(d::DArray, T::Type) = similar(d, T, size(d))
Base.similar(d::DArray{T}, dims::Dims) where {T} = similar(d, T, dims)
Base.similar(d::DArray{T}) where {T} = similar(d, T, size(d))
Base.size(d::DArray) = d.dims
chunktype(d::DArray{T,N,A}) where {T,N,A} = A
## chunk index utilities ##
# decide how to divide each dimension
# returns size of chunks array
function defaultdist(dims, pids)
dims = [dims...]
chunks = ones(Int, length(dims))
np = length(pids)
f = sort!(collect(keys(factor(np))), rev=true)
k = 1
while np > 1
# repeatedly allocate largest factor to largest dim
if np % f[k] != 0
k += 1
if k > length(f)
break
end
end
fac = f[k]
(d, dno) = findmax(dims)
# resolve ties to highest dim
dno = findlast(isequal(d), dims)
if dims[dno] >= fac
dims[dno] = div(dims[dno], fac)
chunks[dno] *= fac
end
np = div(np, fac)
end
return chunks
end
# get array of start indices for dividing sz into nc chunks
function defaultdist(sz::Int, nc::Int)
if sz >= nc
chunk_size = div(sz,nc)
remainder = rem(sz,nc)
grid = zeros(Int64, nc+1)
for i = 1:(nc+1)
grid[i] += (i-1)*chunk_size + 1
if i<= remainder
grid[i] += i-1
else
grid[i] += remainder
end
end
return grid
else
return [[1:(sz+1);]; zeros(Int, nc-sz)]
end
end
# compute indices array for dividing dims into chunks
function chunk_idxs(dims, chunks)
cuts = map(defaultdist, dims, chunks)
n = length(dims)
idxs = Array{NTuple{n,UnitRange{Int}}}(undef, chunks...)
for cidx in CartesianIndices(tuple(chunks...))
idxs[cidx.I...] = ntuple(i -> (cuts[i][cidx[i]]:cuts[i][cidx[i] + 1] - 1), n)
end
return (idxs, cuts)
end
function localpartindex(pids::Array{Int})
mi = myid()
for i = 1:length(pids)
if pids[i] == mi
return i
end
end
return 0
end
localpartindex(d::DArray) = localpartindex(procs(d))
"""
localpart(d::DArray)
Get the local piece of a distributed array.
Returns an empty array if no local part exists on the calling process.
`d[:L]`, `d[:l]`, `d[:LP]`, and `d[:lp]` are an alternative means to get localparts.
This syntax can also be used for assignment. For example,
`d[:L] = v` will assign `v` to the localpart of `d`.
"""
function localpart(d::DArray{T,N,A}) where {T,N,A}
lpidx = localpartindex(d)
if lpidx == 0
return empty_localpart(T,N,A)::A
end
return d.localpart::A
end
localpart(d::DArray, localidx...) = localpart(d)[localidx...]
_localindex(i::Integer, offset) = i - offset
_localindex(i::AbstractRange, offset) = (first(i)-offset):step(i):(last(i)-offset)
_localindex(i::AbstractUnitRange, offset) = (first(i)-offset):(last(i)-offset)
"""
makelocal(A::DArray, I...)
Equivalent to `Array(view(A, I...))` but optimised for the case that the data is local.
Can return a view into `localpart(A)`
"""
@inline function makelocal(A::DArray{<:Any, <:Any, AT}, I::Vararg{Any, N}) where {N, AT}
J = map(i->Base.unalias(A, i), to_indices(A, I))
J = map(j-> isa(j, Base.Slice) ? j.indices : j, J)
@boundscheck checkbounds(A, J...)
lidcs = localindices(A)
if Base.checkbounds_indices(Bool, lidcs, J)
# data we want is local
viewidcs = ntuple(i -> _localindex(J[i], first(lidcs[i]) - 1), ndims(A))
view(localpart(A), viewidcs...)
else
# Make more efficient (?maybe) by allocating new memory
# only for the remote part
viewidcs = ntuple(i -> _localindex(J[i], 0), ndims(A))
arr = similar(AT, map(length, viewidcs)...)
copyto!(arr, view(A, viewidcs...))
end
end
# shortcut to set/get localparts of a distributed object
Base.getindex(d::DArray, s::Symbol) = _getindex(d, s)
Base.getindex(d::DArray{<:Any, 1}, s::Symbol) = _getindex(d, s)
function _getindex(d::DArray, s::Symbol)
@assert s in [:L, :l, :LP, :lp]
return localpart(d)
end
function Base.setindex!(d::DArray{T,N,A}, new_lp::A, s::Symbol) where {T,N,A}
@assert s in [:L, :l, :LP, :lp]
d.localpart = new_lp
new_lp
end
# fetch localpart of d at pids[i]
Base.fetch(d::DArray{T,N,A}, i) where {T,N,A} = remotecall_fetch(localpart, d.pids[i], d)
"""
localindices(d)
A tuple describing the indices owned by the local process.
Returns a tuple with empty ranges if no local part exists on the calling process.
"""
function localindices(d::DArray)
lpidx = localpartindex(d)
if lpidx == 0
return ntuple(i -> 1:0, ndims(d))
end
return d.indices[lpidx]
end
# Equality
function Base.:(==)(d::DArray{<:Any,<:Any,A}, a::AbstractArray) where A
if size(d) != size(a)
return false
else
b = asyncmap(procs(d)) do p
remotecall_fetch(p) do
localpart(d) == A(a[localindices(d)...])
end
end
return all(b)
end
end
function Base.:(==)(d::SubDArray, a::AbstractArray)
cd = copy(d)
t = cd == a
finalize(cd)
return t
end
Base.:(==)(a::AbstractArray, d::DArray) = d == a
Base.:(==)(a::AbstractArray, d::SubDArray) = d == a
Base.:(==)(d1::DArray, d2::DArray) = invoke(==, Tuple{DArray, AbstractArray}, d1, d2)
function Base.:(==)(d1::SubDArray, d2::DArray)
cd1 = copy(d1)
t = cd1 == d2
finalize(cd1)
return t
end
function Base.:(==)(d1::DArray, d2::SubDArray)
cd2 = copy(d2)
t = d1 == cd2
finalize(cd2)
return t
end
function Base.:(==)(d1::SubDArray, d2::SubDArray)
cd1 = copy(d1)
t = cd1 == d2
finalize(cd1)
return t
end
"""
locate(d::DArray, I::Int...)
Determine the index of `procs(d)` that hold element `I`.
"""
function locate(d::DArray, I::Int...)
ntuple(ndims(d)) do i
fi = searchsortedlast(d.cuts[i], I[i])
if fi >= length(d.cuts[i])
throw(ArgumentError("element not contained in array"))
end
return fi
end
end
chunk(d::DArray{T,N,A}, pid::Int) where {T,N,A} = remotecall_fetch(localpart, pid, d)::A
## convenience constructors ##
"""
dzeros(dims, ...)
Construct a distributed array of zeros.
Trailing arguments are the same as those accepted by `DArray`.
"""
dzeros(dims::Dims, args...) = DArray(I->zeros(map(length,I)), dims, args...)
dzeros(::Type{T}, dims::Dims, args...) where {T} = DArray(I->zeros(T,map(length,I)), dims, args...)
dzeros(::Type{T}, d1::Integer, drest::Integer...) where {T} = dzeros(T, convert(Dims, tuple(d1, drest...)))
dzeros(d1::Integer, drest::Integer...) = dzeros(Float64, convert(Dims, tuple(d1, drest...)))
dzeros(d::Dims) = dzeros(Float64, d)
"""
dones(dims, ...)
Construct a distributed array of ones.
Trailing arguments are the same as those accepted by `DArray`.
"""
dones(dims::Dims, args...) = DArray(I->ones(map(length,I)), dims, args...)
dones(::Type{T}, dims::Dims, args...) where {T} = DArray(I->ones(T,map(length,I)), dims, args...)
dones(::Type{T}, d1::Integer, drest::Integer...) where {T} = dones(T, convert(Dims, tuple(d1, drest...)))
dones(d1::Integer, drest::Integer...) = dones(Float64, convert(Dims, tuple(d1, drest...)))
dones(d::Dims) = dones(Float64, d)
"""
dfill(x, dims, ...)
Construct a distributed array filled with value `x`.
Trailing arguments are the same as those accepted by `DArray`.
"""
dfill(v, dims::Dims, args...) = DArray(I->fill(v, map(length,I)), dims, args...)
dfill(v, d1::Integer, drest::Integer...) = dfill(v, convert(Dims, tuple(d1, drest...)))
"""
drand(dims, ...)
Construct a distributed uniform random array.
Trailing arguments are the same as those accepted by `DArray`.
"""
drand(::Type{T}, dims::Dims) where {T} = DArray(I -> rand(T, map(length, I)), dims)
drand(X, dims::Dims) = DArray(I -> rand(X, map(length, I)), dims)
drand(dims::Dims) = drand(Float64, dims)
drand(::Type{T}, d1::Integer, drest::Integer...) where {T} = drand(T, Dims((d1, drest...)))
drand(X, d1::Integer, drest::Integer...) = drand(X, Dims((d1, drest...)))
drand(d1::Integer, drest::Integer...) = drand(Float64, Dims((d1, drest...)))
# With optional process IDs and number of chunks
for N in (1, 2)
@eval begin
drand(::Type{T}, dims::Dims, args::Vararg{Any,$N}) where {T} = DArray(I -> rand(T, map(length, I)), dims, args...)
drand(X, dims::Dims, args::Vararg{Any,$N}) = DArray(I -> rand(X, map(length, I)), dims, args...)
drand(dims::Dims, args::Vararg{Any,$N}) = drand(Float64, dims, args...)
end
end
# Fix method ambiguities
drand(dims::Dims, procs::Tuple{Vararg{Int}}) = drand(Float64, dims, procs)
drand(dims::Dims, procs::Tuple{Vararg{Int}}, dist) = drand(Float64, dims, procs, dist)
drand(X::Tuple{Vararg{Int}}, dim::Integer) = drand(X, Dims((dim,)))
drand(X::Tuple{Vararg{Int}}, d1::Integer, d2::Integer) = drand(X, Dims((d1, d2)))
"""
drandn(dims, ...)
Construct a distributed normal random array.
Trailing arguments are the same as those accepted by `DArray`.
"""
drandn(dims::Dims, args...) = DArray(I->randn(map(length,I)), dims, args...)
drandn(d1::Integer, drest::Integer...) = drandn(convert(Dims, tuple(d1, drest...)))
## conversions ##
"""
distribute(A[; procs, dist])
Convert a local array to distributed.
`procs` optionally specifies an array of process IDs to use. (defaults to all workers)
`dist` optionally specifies a vector or tuple of the number of partitions in each dimension
"""
function distribute(A::AbstractArray;
procs = workers()[1:min(nworkers(), maximum(size(A)))],
dist = defaultdist(size(A), procs))
np = prod(dist)
procs_used = procs[1:np]
idxs, _ = chunk_idxs([size(A)...], dist)
s = verified_destination_serializer(reshape(procs_used, size(idxs)), size(idxs)) do pididx
A[idxs[pididx]...]
end
return DArray(I->localpart(s), size(A), procs_used, dist)
end
"""
distribute(A, DA)
Distribute a local array `A` like the distributed array `DA`.
"""
function distribute(A::AbstractArray, DA::DArray)
size(DA) == size(A) || throw(DimensionMismatch("Distributed array has size $(size(DA)) but array has $(size(A))"))
s = verified_destination_serializer(procs(DA), size(DA.indices)) do pididx
A[DA.indices[pididx]...]
end
return DArray(I->localpart(s), DA)
end
DArray{T,N,S}(A::S) where {T,N,S<:AbstractArray} = distribute(convert(AbstractArray{T,N}, A))
function Array{S,N}(d::DArray{T,N}) where {S,T,N}
a = Array{S}(undef, size(d))
@sync for (pid, indices) in zip(d.pids, d.indices)
if !any(isempty, indices)
@async a[indices...] = chunk(d, pid)
end
end
return a
end
function Array{S,N}(s::SubDArray{T,N}) where {S,T,N}
I = s.indices
d = parent(s)
if isa(I,Tuple{Vararg{UnitRange{Int}}}) && S<:T && T<:S && !isempty(s)
l = locate(d, map(first, I)...)
if isequal(d.indices[l...], I)
# SubDArray corresponds to a chunk
return chunk(d, d.pids[l...])
end
end
a = Array{S}(undef, size(s))
copyto!(a, s)
end
function Base.copyto!(a::Array, s::SubDArray)
N = ndims(a)
a[[1:size(a,i) for i=1:N]...] = s
return a
end
function DArray(SD::SubArray{T,N}) where {T,N}
D = SD.parent
DArray(size(SD), procs(D)) do I
lindices = Base.reindex(SD.indices, I)
convert(Array, D[lindices...])
end
end
function Base.reshape(A::DArray{T,1,S}, d::Dims) where {T,S<:Array}
if prod(d) != length(A)
throw(DimensionMismatch("dimensions must be consistent with array size"))
end
return DArray(d) do I
sz = map(length,I)
d1offs = first(I[1])
nd = length(I)
B = Array{T}(undef, sz)
nr = size(B,1)
sztail = size(B)[2:end]
for i=1:div(length(B),nr)
i2 = CartesianIndices(sztail)[i]
globalidx = [ I[j][i2[j-1]] for j=2:nd ]
a = LinearIndices(d)[d1offs, globalidx...]
B[:,i] = Array(A[a:(a+nr-1)])
end
B
end
end
## indexing ##
const _allowscalar = Ref(true)
allowscalar(flag = true) = (_allowscalar[] = flag)
_scalarindexingallowed() = _allowscalar[] || throw(ErrorException("scalar indexing disabled"))
getlocalindex(d::DArray, idx...) = localpart(d)[idx...]
function getindex_tuple(d::DArray{T,N}, I::NTuple{N,Int}) where {T,N}
chidx = locate(d, I...)
idxs = d.indices[chidx...]
localidx = ntuple(i -> (I[i] - first(idxs[i]) + 1), ndims(d))
pid = d.pids[chidx...]
return remotecall_fetch(getlocalindex, pid, d, localidx...)::T
end
function Base.getindex(d::DArray, i::Int)
_scalarindexingallowed()
return getindex_tuple(d, Tuple(CartesianIndices(d)[i]))
end
function Base.getindex(d::DArray{<:Any,N}, i::Vararg{Int,N}) where {N}
_scalarindexingallowed()
return getindex_tuple(d, i)
end
Base.getindex(d::DArray) = d[1]
Base.getindex(d::SubDArray, I::Int...) = invoke(getindex, Tuple{SubArray{<:Any,N},Vararg{Int,N}} where N, d, I...)
Base.getindex(d::SubOrDArray, I::Union{Int,UnitRange{Int},Colon,Vector{Int},StepRange{Int,Int}}...) = view(d, I...)
function Base.isassigned(D::DArray, i::Integer...)
try
getindex_tuple(D, i)
true
catch e
if isa(e, BoundsError) || isa(e, UndefRefError)
return false
else
rethrow(e)
end
end
end
Base.copy(d::SubDArray) = copyto!(similar(d), d)
Base.copy(d::SubDArray{<:Any,2}) = copyto!(similar(d), d)
function Base.copyto!(dest::SubOrDArray, src::AbstractArray)
@sync for p in procs(dest)
@async remotecall_wait(p) do
ldest = localpart(dest)
copyto!(ldest, view(src, localindices(dest)...))
end
end
return dest
end
function Base.deepcopy(src::DArray)
dest = similar(src)
@sync for p in procs(src)
@async remotecall_wait(p) do
dest[:L] = deepcopy(src[:L])
end
end
return dest
end
# We also want to optimize setindex! with a SubDArray source, but this is hard
# and only works on 0.5.
# Similar to Base.indexin, but just create a logical mask. Note that this
# must return a logical mask in order to support merging multiple masks
# together into one linear index since we need to know how many elements to
# skip at the end. In many cases range intersection would be much faster
# than generating a logical mask, but that loses the endpoint information.
indexin_mask(a, b::Number) = a .== b
indexin_mask(a, r::AbstractRange{Int}) = [i in r for i in a]
indexin_mask(a, b::AbstractArray{Int}) = indexin_mask(a, BitSet(b))
indexin_mask(a, b::AbstractArray) = indexin_mask(a, Set(b))
indexin_mask(a, b) = [i in b for i in a]
import Base: tail
# Given a tuple of indices and a tuple of masks, restrict the indices to the
# valid regions. This is, effectively, reversing Base.setindex_shape_check.
# We can't just use indexing into MergedIndices here because getindex is much
# pickier about singleton dimensions than setindex! is.
restrict_indices(::Tuple{}, ::Tuple{}) = ()
function restrict_indices(a::Tuple{Any, Vararg{Any}}, b::Tuple{Any, Vararg{Any}})
if (length(a[1]) == length(b[1]) == 1) || (length(a[1]) > 1 && length(b[1]) > 1)
(vec(a[1])[vec(b[1])], restrict_indices(tail(a), tail(b))...)
elseif length(a[1]) == 1
(a[1], restrict_indices(tail(a), b))
elseif length(b[1]) == 1 && b[1][1]
restrict_indices(a, tail(b))
else
throw(DimensionMismatch("this should be caught by setindex_shape_check; please submit an issue"))
end
end
# The final indices are funky - they're allowed to accumulate together.
# An easy (albeit very inefficient) fix for too many masks is to use the
# outer product to merge them. But we can do that lazily with a custom type:
function restrict_indices(a::Tuple{Any}, b::Tuple{Any, Any, Vararg{Any}})
(vec(a[1])[vec(ProductIndices(b, map(length, b)))],)
end
# But too many indices is much harder; this requires merging the indices
# in `a` before applying the final mask in `b`.
function restrict_indices(a::Tuple{Any, Any, Vararg{Any}}, b::Tuple{Any})
if length(a[1]) == 1
(a[1], restrict_indices(tail(a), b))
else
# When one mask spans multiple indices, we need to merge the indices
# together. At this point, we can just use indexing to merge them since
# there's no longer special handling of singleton dimensions
(view(MergedIndices(a, map(length, a)), b[1]),)
end
end
struct ProductIndices{I,N} <: AbstractArray{Bool, N}
indices::I
sz::NTuple{N,Int}
end
Base.size(P::ProductIndices) = P.sz
# This gets passed to map to avoid breaking propagation of inbounds
Base.@propagate_inbounds propagate_getindex(A, I...) = A[I...]
Base.@propagate_inbounds Base.getindex(P::ProductIndices{J,N}, I::Vararg{Int, N}) where {J,N} =
Bool((&)(map(propagate_getindex, P.indices, I)...))
struct MergedIndices{I,N} <: AbstractArray{CartesianIndex{N}, N}
indices::I
sz::NTuple{N,Int}
end
Base.size(M::MergedIndices) = M.sz
Base.@propagate_inbounds Base.getindex(M::MergedIndices{J,N}, I::Vararg{Int, N}) where {J,N} =
CartesianIndex(map(propagate_getindex, M.indices, I))
# Additionally, we optimize bounds checking when using MergedIndices as an
# array index since checking, e.g., A[1:500, 1:500] is *way* faster than
# checking an array of 500^2 elements of CartesianIndex{2}. This optimization
# also applies to reshapes of MergedIndices since the outer shape of the
# container doesn't affect the index elements themselves. We can go even
# farther and say that even restricted views of MergedIndices must be valid
# over the entire array. This is overly strict in general, but in this
# use-case all the merged indices must be valid at some point, so it's ok.
const ReshapedMergedIndices{T,N,M<:MergedIndices} = Base.ReshapedArray{T,N,M}
const SubMergedIndices{T,N,M<:Union{MergedIndices, ReshapedMergedIndices}} = SubArray{T,N,M}
const MergedIndicesOrSub = Union{MergedIndices, ReshapedMergedIndices, SubMergedIndices}
@inline Base.checkbounds_indices(::Type{Bool}, inds::Tuple{}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) =
Base.checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...))
@inline Base.checkbounds_indices(::Type{Bool}, inds::Tuple{Any}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) =
Base.checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...))
@inline Base.checkbounds_indices(::Type{Bool}, inds::Tuple, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) =
Base.checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...))
# The tricky thing here is that we want to optimize the accesses into the
# distributed array, but in doing so, we lose track of which indices in I we
# should be using.
#
# I’ve come to the conclusion that the function is utterly insane.
# There are *6* flavors of indices with four different reference points:
# 1. Find the indices of each portion of the DArray.
# 2. Find the valid subset of indices for the SubArray into that portion.
# 3. Find the portion of the `I` indices that should be used when you access the
# `K` indices in the subarray. This guy is nasty. It’s totally backwards
# from all other arrays, wherein we simply iterate over the source array’s
# elements. You need to *both* know which elements in `J` were skipped
# (`indexin_mask`) and which dimensions should match up (`restrict_indices`)
# 4. If `K` doesn't correspond to an entire chunk, reinterpret `K` in terms of
# the local portion of the source array
function Base.setindex!(a::Array, s::SubDArray,
I::Union{UnitRange{Int},Colon,Vector{Int},StepRange{Int,Int}}...)
Inew = Base.to_indices(a, I)
Base.setindex_shape_check(s, Base.index_lengths(Inew...)...)
d = parent(s)
J = Base.to_indices(d, s.indices)
@sync for (pid, K_c) in zip(d.pids, d.indices)
K = map(intersect, J, K_c)
if !any(isempty, K)
K_mask = map(indexin_mask, J, K_c)
idxs = restrict_indices(Inew, K_mask)
if isequal(K, K_c)
# whole chunk
@async a[idxs...] = chunk(d, pid)
else
# partial chunk
localidxs = map((Kj, K_cj) -> Kj .- (first(K_cj) - 1), K, K_c)
@async a[idxs...] = remotecall_fetch((d, idxs) -> localpart(d)[idxs...], pid, d, localidxs)
end
end
end
return a
end
function Base.fill!(A::DArray, x)
@sync for p in procs(A)
@async remotecall_wait((A,x)->fill!(localpart(A), x), p, A, x)
end
return A
end
function Random.rand!(A::DArray, ::Type{T}) where T
@sync for p in procs(A)
@async remotecall_wait((A, T)->rand!(localpart(A), T), p, A, T)
end
return A
end
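# Hedged sketch: both methods above mutate each localpart in place on its
# owning worker, so no array data moves between processes.
#
#     A = dzeros((100, 100))
#     fill!(A, 1.0)          # every chunk becomes all-ones
#     using Random
#     rand!(A, Float64)      # refill each chunk with local random draws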
================================================
FILE: src/linalg.jl
================================================
function Base.copy(Dadj::Adjoint{T,<:DArray{T,2}}) where T
D = parent(Dadj)
DArray(reverse(size(D)), procs(D)) do I
lp = Array{T}(undef, map(length, I))
rp = convert(Array, D[reverse(I)...])
adjoint!(lp, rp)
end
end
function Base.copy(Dtr::Transpose{T,<:DArray{T,2}}) where T
D = parent(Dtr)
DArray(reverse(size(D)), procs(D)) do I
lp = Array{T}(undef, map(length, I))
rp = convert(Array, D[reverse(I)...])
transpose!(lp, rp)
end
end
const DVector{T,A} = DArray{T,1,A}
const DMatrix{T,A} = DArray{T,2,A}
# Level 1
function LinearAlgebra.axpy!(α, x::DArray, y::DArray)
if length(x) != length(y)
throw(DimensionMismatch("vectors must have same length"))
end
@sync for p in procs(y)
@async remotecall_wait(p) do
axpy!(α, localpart(x), localpart(y))
end
end
return y
end
function LinearAlgebra.dot(x::DVector, y::DVector)
if length(x) != length(y)
throw(DimensionMismatch(""))
end
results = asyncmap(procs(x)) do p
remotecall_fetch((x, y) -> dot(localpart(x), makelocal(y, localindices(x)...)), p, x, y)
end
return reduce(+, results)
end
function LinearAlgebra.norm(x::DArray, p::Real = 2)
results = asyncmap(procs(x)) do pp
remotecall_fetch(() -> norm(localpart(x), p), pp)
end
return norm(results, p)
end
function LinearAlgebra.rmul!(A::DArray, x::Number)
@sync for p in procs(A)
@async remotecall_wait((A,x)->rmul!(localpart(A), x), p, A, x)
end
return A
end
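# Hedged sketch of the Level 1 operations above (hypothetical session; assumes
# `using LinearAlgebra` and identically distributed vectors):
#
#     x = drandn(10_000); y = drandn(10_000)
#     axpy!(2.0, x, y)    # y := 2x + y, computed chunk by chunk
#     dot(x, y)           # per-worker dots, reduced with +
#     norm(x)             # per-worker norms, combined with norm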
# Level 2
function add!(dest, src, scale = one(dest[1]))
if length(dest) != length(src)
throw(DimensionMismatch("source and destination arrays must have same number of elements"))
end
if scale == one(scale)
@simd for i = eachindex(dest)
@inbounds dest[i] += src[i]
end
else
@simd for i = eachindex(dest)
@inbounds dest[i] += scale*src[i]
end
end
return dest
end
function LinearAlgebra.mul!(y::DVector, A::DMatrix, x::AbstractVector, α::Number = 1, β::Number = 0)
# error checks
if size(A, 2) != length(x)
throw(DimensionMismatch(""))
end
if y.cuts[1] != A.cuts[1]
throw(ArgumentError("cuts of output vector must match cuts of first dimension of matrix"))
end
# Multiply on each tile of A
R = Array{Future}(undef, size(A.pids))
for j = 1:size(A.pids, 2)
xj = x[A.cuts[2][j]:A.cuts[2][j + 1] - 1]
for i = 1:size(A.pids, 1)
R[i,j] = remotecall(procs(A)[i,j]) do
localpart(A)*convert(localtype(x), xj)
end
end
end
# Scale y if necessary
if β != one(β)
asyncmap(procs(y)) do p
remotecall_wait(p) do
if !iszero(β)
rmul!(localpart(y), β)
else
fill!(localpart(y), 0)
end
end
end
end
# Update y
@sync for i = 1:size(R, 1)
p = y.pids[i]
for j = 1:size(R, 2)
rij = R[i,j]
@async remotecall_wait(() -> add!(localpart(y), fetch(rij), α), p)
end
end
return y
end
function LinearAlgebra.mul!(y::DVector, adjA::Adjoint{<:Number,<:DMatrix}, x::AbstractVector, α::Number = 1, β::Number = 0)
A = parent(adjA)
# error checks
if size(A, 1) != length(x)
throw(DimensionMismatch(""))
end
if y.cuts[1] != A.cuts[2]
throw(ArgumentError("cuts of output vector must match cuts of second dimension of matrix"))
end
# Multiply on each tile of A
R = Array{Future}(undef, reverse(size(A.pids)))
for j = 1:size(A.pids, 1)
xj = x[A.cuts[1][j]:A.cuts[1][j + 1] - 1]
for i = 1:size(A.pids, 2)
R[i,j] = remotecall(() -> localpart(A)'*convert(localtype(x), xj), procs(A)[j,i])
end
end
# Scale y if necessary
if β != one(β)
@sync for p in procs(y)
@async remotecall_wait(p) do
if !iszero(β)
rmul!(localpart(y), β)
else
fill!(localpart(y), 0)
end
end
end
end
# Update y
@sync for i = 1:size(R, 1)
p = y.pids[i]
for j = 1:size(R, 2)
rij = R[i,j]
@async remotecall_wait(() -> add!(localpart(y), fetch(rij), α), p)
end
end
return y
end
function LinearAlgebra.lmul!(D::Diagonal, DA::DMatrix)
d = D.diag
s = verified_destination_serializer(procs(DA), size(DA.indices)) do pididx
d[DA.indices[pididx][1]]
end
map_localparts!(DA) do lDA
lmul!(Diagonal(localpart(s)), lDA)
end
end
function LinearAlgebra.rmul!(DA::DMatrix, D::Diagonal)
d = D.diag
s = verified_destination_serializer(procs(DA), size(DA.indices)) do pididx
d[DA.indices[pididx][2]]
end
map_localparts!(DA) do lDA
rmul!(lDA, Diagonal(localpart(s)))
end
end
# Level 3
function _matmatmul!(C::DMatrix, A::DMatrix, B::AbstractMatrix, α::Number, β::Number, tA)
# error checks
Ad1, Ad2 = (tA == 'N') ? (1,2) : (2,1)
mA, nA = (size(A, Ad1), size(A, Ad2))
mB, nB = size(B)
if mB != nA
throw(DimensionMismatch("matrix A has dimensions ($mA, $nA), matrix B has dimensions ($mB, $nB)"))
end
if size(C,1) != mA || size(C,2) != nB
throw(DimensionMismatch("result C has dimensions $(size(C)), needs ($mA, $nB)"))
end
if C.cuts[1] != A.cuts[Ad1]
throw(ArgumentError("cuts of the first dimension of the output matrix must match cuts of dimension $Ad1 of the first input matrix"))
end
# Multiply on each tile of A
if tA == 'N'
R = Array{Future}(undef, size(procs(A))..., size(procs(C), 2))
else
R = Array{Future}(undef, reverse(size(procs(A)))..., size(procs(C), 2))
end
for j = 1:size(A.pids, Ad2)
for k = 1:size(C.pids, 2)
Acuts = A.cuts[Ad2]
Ccuts = C.cuts[2]
Bjk = B[Acuts[j]:Acuts[j + 1] - 1, Ccuts[k]:Ccuts[k + 1] - 1]
for i = 1:size(A.pids, Ad1)
p = (tA == 'N') ? procs(A)[i,j] : procs(A)[j,i]
R[i,j,k] = remotecall(p) do
if tA == 'T'
return transpose(localpart(A))*convert(localtype(B), Bjk)
elseif tA == 'C'
return adjoint(localpart(A))*convert(localtype(B), Bjk)
else
return localpart(A)*convert(localtype(B), Bjk)
end
end
end
end
end
# Scale C if necessary
if β != one(β)
@sync for p in C.pids
if iszero(β)
@async remotecall_wait(() -> fill!(localpart(C), 0), p)
else
@async remotecall_wait(() -> rmul!(localpart(C), β), p)
end
end
end
# Update C
@sync for i = 1:size(R, 1)
for k = 1:size(C.pids, 2)
p = C.pids[i,k]
for j = 1:size(R, 2)
rijk = R[i,j,k]
@async remotecall_wait(d -> add!(localpart(d), fetch(rijk), α), p, C)
end
end
end
return C
end
LinearAlgebra.mul!(C::DMatrix, A::DMatrix, B::AbstractMatrix, α::Number = 1, β::Number = 0) = _matmatmul!(C, A, B, α, β, 'N')
LinearAlgebra.mul!(C::DMatrix, A::Adjoint{<:Number,<:DMatrix}, B::AbstractMatrix, α::Number = 1, β::Number = 0) = _matmatmul!(C, parent(A), B, α, β, 'C')
LinearAlgebra.mul!(C::DMatrix, A::Transpose{<:Number,<:DMatrix}, B::AbstractMatrix, α::Number = 1, β::Number = 0) = _matmatmul!(C, parent(A), B, α, β, 'T')
_matmul_op = (t,s) -> t*s + t*s
function Base.:*(A::DMatrix, x::AbstractVector)
T = Base.promote_op(_matmul_op, eltype(A), eltype(x))
y = DArray(I -> Array{T}(undef, map(length, I)), (size(A, 1),), procs(A)[:,1], (size(procs(A), 1),))
return mul!(y, A, x)
end
function Base.:*(A::DMatrix, B::AbstractMatrix)
T = Base.promote_op(_matmul_op, eltype(A), eltype(B))
C = DArray(I -> Array{T}(undef, map(length, I)),
(size(A, 1), size(B, 2)),
procs(A)[:,1:min(size(procs(A), 2), size(procs(B), 2))],
(size(procs(A), 1), min(size(procs(A), 2), size(procs(B), 2))))
return mul!(C, A, B)
end
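# Hedged example of the tiled multiply above (hypothetical session): `*`
# allocates a result DArray whose cuts match A's row distribution, then calls mul!.
#
#     A = drandn((200, 200)); B = randn(200, 100)
#     C = A * B
#     Array(C) ≈ Array(A) * B   # true, up to rounding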
function Base.:*(adjA::Adjoint{<:Any,<:DMatrix}, x::AbstractVector)
A = parent(adjA)
T = Base.promote_op(_matmul_op, eltype(A), eltype(x))
y = DArray(I -> Array{T}(undef, map(length, I)),
(size(A, 2),),
procs(A)[1,:],
(size(procs(A), 2),))
return mul!(y, adjA, x)
end
function Base.:*(adjA::Adjoint{<:Any,<:DMatrix}, B::AbstractMatrix)
A = parent(adjA)
T = Base.promote_op(_matmul_op, eltype(A), eltype(B))
C = DArray(I -> Array{T}(undef, map(length, I)), (size(A, 2),
size(B, 2)),
procs(A)[1:min(size(procs(A), 1), size(procs(B), 2)),:],
(size(procs(A), 2), min(size(procs(A), 1), size(procs(B), 2))))
return mul!(C, adjA, B)
end
function Base.:*(trA::Transpose{<:Any,<:DMatrix}, x::AbstractVector)
A = parent(trA)
T = Base.promote_op(_matmul_op, eltype(A), eltype(x))
y = DArray(I -> Array{T}(undef, map(length, I)),
(size(A, 2),),
procs(A)[1,:],
(size(procs(A), 2),))
return mul!(y, trA, x)
end
function Base.:*(trA::Transpose{<:Any,<:DMatrix}, B::AbstractMatrix)
A = parent(trA)
T = Base.promote_op(_matmul_op, eltype(A), eltype(B))
C = DArray(I -> Array{T}(undef, map(length, I)), (size(A, 2),
size(B, 2)),
procs(A)[1:min(size(procs(A), 1), size(procs(B), 2)),:],
(size(procs(A), 2), min(size(procs(A), 1), size(procs(B), 2))))
return mul!(C, trA, B)
end
================================================
FILE: src/mapreduce.jl
================================================
## higher-order functions ##
Base.map(f, d0::DArray, ds::AbstractArray...) = broadcast(f, d0, ds...)
function Base.map!(f::F, dest::DArray, src::DArray{<:Any,<:Any,A}) where {F,A}
@sync for p in procs(dest)
@async remotecall_wait(p) do
map!(f, localpart(dest), makelocal(src, localindices(dest)...))
end
end
return dest
end
# Only defining `reduce(f, ::DArray)` causes method ambiguity issues with
# - `reduce(hcat, ::AbstractVector{<:AbstractVecOrMat})`
# - `reduce(vcat, ::AbstractVector{<:AbstractVecOrMat})`
Base.reduce(f, d::DArray) = _reduce(f, d)
Base.reduce(::typeof(hcat), d::DArray{<:AbstractVecOrMat, 1}) = _reduce(hcat, d)
Base.reduce(::typeof(vcat), d::DArray{<:AbstractVecOrMat, 1}) = _reduce(vcat, d)
function _reduce(f, d::DArray)
results = asyncmap(procs(d)) do p
remotecall_fetch(p) do
return reduce(f, localpart(d))
end
end
reduce(f, results)
end
function Base._mapreduce(f, op, ::IndexCartesian, d::DArray)
results = asyncmap(procs(d)) do p
remotecall_fetch((_f,_op,_d)->mapreduce(_f, _op, localpart(_d)), p, f, op, d)
end
reduce(op, results)
end
Base._mapreduce(f, op, ::IndexCartesian, d::SubDArray) = Base._mapreduce(f, op, IndexCartesian(), DArray(d))
# Base.mapreduce(f, opt::Union{typeof(|), typeof(&)}, d::DArray) = _mapreduce(f, opt, d)
# Base.mapreduce(f, opt::Function, d::DArray) = _mapreduce(f, opt, d)
# Base.mapreduce(f, opt, d::DArray) = _mapreduce(f, opt, d)
# mapreducedim
function Base.reducedim_initarray(A::DArray, region, v0, ::Type{R}) where {R}
# Store reduction on lowest pids
pids = A.pids[ntuple(i -> i in region ? (1:1) : (:), ndims(A))...]
chunks = similar(pids, Future)
asyncmap!(chunks, pids) do p
remotecall_wait(() -> Base.reducedim_initarray(localpart(A), region, v0, R), p)
end
return DArray(chunks)
end
Base.reducedim_initarray(A::DArray, region, v0::T) where {T} = Base.reducedim_initarray(A, region, v0, T)
# Compute mapreducedim of each localpart and store the result in a new DArray
function mapreducedim_within(f, op, A::DArray, region)
arraysize = [size(A)...]
gridsize = [size(A.indices)...]
arraysize[[region...]] = gridsize[[region...]]
indx = similar(A.indices)
for i in CartesianIndices(indx)
indx[i] = ntuple(j -> j in region ? (i.I[j]:i.I[j]) : A.indices[i][j], ndims(A))
end
cuts = [i in region ? collect(1:arraysize[i] + 1) : A.cuts[i] for i in 1:ndims(A)]
return DArray(next_did(), I -> mapreduce(f, op, localpart(A), dims=region),
tuple(arraysize...), procs(A), indx, cuts)
end
# Compute mapreducedim across the processes. This should be done after mapreducedim
# has been run on each localpart with mapreducedim_within. Eventually, we might
# want to write mapreducedim_between! as a binary reduction.
function mapreducedim_between!(f, op, R::DArray, A::DArray, region)
@sync for p in procs(R)
@async remotecall_wait(p, f, op, R, A, region) do f, op, R, A, region
localind = [r for r = localindices(A)]
localind[[region...]] = [1:n for n = size(A)[[region...]]]
B = convert(Array, A[localind...])
Base.mapreducedim!(f, op, localpart(R), B)
end
end
return R
end
function Base.mapreducedim!(f, op, R::DArray, A::DArray)
lsize = Base.check_reducedims(R,A)
if isempty(A)
return copy(R)
end
region = tuple(collect(1:ndims(A))[[size(R)...] .!= [size(A)...]]...)
if isempty(region)
return copyto!(R, A)
end
B = mapreducedim_within(f, op, A, region)
return mapreducedim_between!(identity, op, R, B, region)
end
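# Hedged sketch of the two-phase reduction above: a dims-reduction first runs
# mapreducedim_within on every localpart, then mapreducedim_between! folds the
# per-chunk partials into R.
#
#     DA = drandn((20, 20))
#     Array(sum(DA, dims=1)) ≈ sum(Array(DA), dims=1)   # true, up to rounding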
## Some special cases
function Base._all(f, A::DArray, ::Colon)
B = asyncmap(procs(A)) do p
remotecall_fetch(p) do
all(f, localpart(A))
end
end
return all(B)
end
function Base._any(f, A::DArray, ::Colon)
B = asyncmap(procs(A)) do p
remotecall_fetch(p) do
any(f, localpart(A))
end
end
return any(B)
end
function Base.count(f, A::DArray)
B = asyncmap(procs(A)) do p
remotecall_fetch(p) do
count(f, localpart(A))
end
end
return sum(B)
end
function Base.extrema(d::DArray)
r = asyncmap(procs(d)) do p
remotecall_fetch(p) do
extrema(localpart(d))
end
end
return reduce((t,s) -> (min(t[1], s[1]), max(t[2], s[2])), r)
end
# Unary vector functions
Base.:(-)(D::DArray) = map(-, D)
map_localparts(f::Callable, d::DArray) = DArray(i->f(localpart(d)), d)
map_localparts(f::Callable, d1::DArray, d2::DArray) = DArray(d1) do I
f(localpart(d1), localpart(d2))
end
function map_localparts(f::Callable, DA::DArray, A::Array)
s = verified_destination_serializer(procs(DA), size(DA.indices)) do pididx
A[DA.indices[pididx]...]
end
DArray(DA) do I
f(localpart(DA), localpart(s))
end
end
function map_localparts(f::Callable, A::Array, DA::DArray)
s = verified_destination_serializer(procs(DA), size(DA.indices)) do pididx
A[DA.indices[pididx]...]
end
DArray(DA) do I
f(localpart(s), localpart(DA))
end
end
function map_localparts!(f::Callable, d::DArray)
@sync for p in procs(d)
@async remotecall_wait((f,d)->f(localpart(d)), p, f, d)
end
return d
end
# Here we assume all the DArrays have
# the same size and distribution
map_localparts(f::Callable, As::DArray...) = DArray(I->f(map(localpart, As)...), As[1])
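# Hedged sketch: map_localparts applies `f` to each chunk where it lives,
# avoiding elementwise communication (assumes matching distributions):
#
#     D = dones((10, 10))
#     D2 = map_localparts(lp -> 2 .* lp, D)   # each worker doubles its chunk
#     all(x -> x == 2.0, D2)                  # true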
function samedist(A::DArray, B::DArray)
(size(A) == size(B)) || throw(DimensionMismatch())
if (procs(A) != procs(B)) || (A.cuts != B.cuts)
B = DArray(x->B[x...], A)
end
B
end
for f in (:+, :-, :div, :mod, :rem, :&, :|, :xor)
@eval begin
function Base.$f(A::DArray{T}, B::DArray{T}) where T
B = samedist(A, B)
map_localparts($f, A, B)
end
Base.$f(A::DArray{T}, B::Array{T}) where {T} = map_localparts($f, A, B)
Base.$f(A::Array{T}, B::DArray{T}) where {T} = map_localparts($f, A, B)
end
end
function Base.mapslices(f, D::DArray{T,N,A}; dims) where {T,N,A}
if !(dims isa AbstractVector)
dims = [dims...]
end
if !all(t -> t == 1, size(D.indices)[dims])
p = ones(Int, ndims(D))
nondims = filter(t -> !(t in dims), 1:ndims(D))
p[nondims] = defaultdist([size(D)...][[nondims...]], procs(D))
DD = DArray(size(D), procs(D), p) do I
return convert(A, D[I...])
end
return mapslices(f, DD, dims=dims)
end
refs = Future[remotecall((x,y,z)->mapslices(x,localpart(y),dims=z), p, f, D, dims) for p in procs(D)]
DArray(reshape(refs, size(procs(D))))
end
function _ppeval(f, A...; dim = map(ndims, A))
if length(dim) != length(A)
throw(ArgumentError("dim argument has wrong length. length(dim) = $(length(dim)) but should be $(length(A))"))
end
narg = length(A)
dimlength = size(A[1], dim[1])
for i = 2:narg
if dim[i] > 0 && dimlength != size(A[i], dim[i])
throw(ArgumentError("lengths of broadcast dimensions must be the same. size(A[1], $(dim[1])) = $dimlength but size(A[$i], $(dim[i])) = $(size(A[i], dim[i]))"))
end
end
dims = []
idx = []
args = []
for i = 1:narg
push!(dims, ndims(A[i]))
push!(idx, Any[Colon() for d in 1:dims[i]])
if dim[i] > 0
idx[i][dim[i]] = 1
push!(args, view(A[i], idx[i]...))
else
push!(args, A[i])
end
end
R1 = f(args...)
ridx = Any[1:size(R1, d) for d in 1:ndims(R1)]
push!(ridx, 1)
Rsize = map(last, ridx)
Rsize[end] = dimlength
R = Array{eltype(R1)}(undef, Rsize...)
for i = 1:dimlength
for j = 1:narg
if dim[j] > 0
idx[j][dim[j]] = i
args[j] = view(A[j], idx[j]...)
else
args[j] = A[j]
end
end
ridx[end] = i
R[ridx...] = f(args...)
end
return R
end
"""
ppeval(f, D...; dim::NTuple)
Evaluates the callable argument `f` on slices of the elements of the `D` tuple.
#### Arguments
`f` can be any callable object that accepts sliced or broadcasted elements of `D`.
The result returned from `f` must be either an array or a scalar.
`D` has any number of elements and the elements can have any type. If an element
of `D` is a distributed array, it is sliced along the dimension specified by `dim`.
If an element of `D` is not distributed, it is by default broadcast and applied
on all evaluations of `f`.
`dim` is a tuple of integers specifying the dimension over which each element
of `D` is sliced. The length of the tuple must therefore equal the number of
arguments in `D`. By default, distributed arrays are sliced along the last
dimension. If a value is less than or equal to zero, the corresponding element
is broadcast to all evaluations of `f`.
#### Result
`ppeval` returns a distributed array of dimension `p+1` where the first `p`
sizes correspond to the sizes of return values of `f`. The last dimension of
the return array from `ppeval` has the same length as the dimension over which
the input arrays are sliced.
#### Examples
```jl
addprocs(Sys.CPU_THREADS)
using DistributedArrays
A = drandn((10, 10, Sys.CPU_THREADS), workers(), [1, 1, Sys.CPU_THREADS])
ppeval(eigvals, A)
ppeval(eigvals, A, randn(10,10)) # broadcasting second argument
B = drandn((10, Sys.CPU_THREADS), workers(), [1, Sys.CPU_THREADS])
ppeval(*, A, B)
```
"""
function ppeval(f, D...; dim::NTuple = map(t -> isa(t, DArray) ? ndims(t) : 0, D))
# Ensure that the complete DArray is available on the specified dims on all processors
for i = 1:length(D)
if isa(D[i], DArray)
for idxs in D[i].indices
for d in setdiff(1:ndims(D[i]), dim[i])
if length(idxs[d]) != size(D[i], d)
throw(DimensionMismatch(string("dimension $d is distributed. ",
"ppeval requires dimension $d to be completely available on all processors.")))
end
end
end
end
end
refs = Future[remotecall((x, y, z) -> _ppeval(x, map(localpart, y)...; dim = z), p, f, D, dim) for p in procs(D[1])]
# The array of Futures has to be reshaped for the DArray constructor to work correctly.
# This requires a fetch, and the DArray constructor fetches as well, so it might be
# better to modify the DArray constructor.
sd = [size(D[1].pids)...]
nd = remotecall_fetch((r)->ndims(fetch(r)), refs[1].where, refs[1])
DArray(reshape(refs, tuple([sd[1:nd - 1]; sd[end]]...)))
end
================================================
FILE: src/serialize.jl
================================================
function Serialization.serialize(S::AbstractSerializer, d::DArray{T,N,A}) where {T,N,A}
# Only send the ident for participating workers - we expect the DArray to exist in the
# remote registry. DO NOT send the localpart.
destpid = worker_id_from_socket(S.io)
Serialization.serialize_type(S, typeof(d))
if (destpid in d.pids) || (destpid == d.id[1])
serialize(S, (true, d.id)) # (id_only, id)
else
serialize(S, (false, d.id))
for n in [:dims, :pids, :indices, :cuts]
serialize(S, getfield(d, n))
end
serialize(S, A)
end
end
function Serialization.deserialize(S::AbstractSerializer, t::Type{DT}) where DT<:DArray
what = deserialize(S)
id_only = what[1]
id = what[2]
if id_only
d = d_from_weakref_or_d(id)
if d === nothing
# Accessing the fields will throw an error, but at least the deserialization
# process will not result in worker death
d = DT()
d.id = id
end
return d
else
# We are not a participating worker: deserialize the fields and instantiate locally.
dims = deserialize(S)
pids = deserialize(S)
indices = deserialize(S)
cuts = deserialize(S)
A = deserialize(S)
T=eltype(DT)
N=length(dims)
return DT(id, dims, pids, indices, cuts, empty_localpart(T,N,A))
end
end
# Serialize only those parts of the object as required by the destination worker.
mutable struct DestinationSerializer
generate::Union{Function,Nothing} # Function to generate the part to be serialized
pids::Union{Array,Nothing} # MUST have the same shape as the distribution
deser_obj::Any # Deserialized part
DestinationSerializer(f,p,d) = new(f,p,d)
end
DestinationSerializer(f::Function, pids::Array) = DestinationSerializer(f, pids, nothing)
# Constructs a DestinationSerializer after verifying the shape of `pids`.
function verified_destination_serializer(f::Function, pids::Array, verify_size)
@assert size(pids) == verify_size
return DestinationSerializer(f, pids)
end
DestinationSerializer(deser_obj::Any) = DestinationSerializer(nothing, nothing, deser_obj)
function Serialization.serialize(S::AbstractSerializer, s::DestinationSerializer)
pid = worker_id_from_socket(S.io)
pididx = findfirst(isequal(pid), s.pids)
@assert pididx !== nothing
Serialization.serialize_type(S, typeof(s))
serialize(S, s.generate(pididx))
end
function Serialization.deserialize(S::AbstractSerializer, t::Type{T}) where T<:DestinationSerializer
lpart = deserialize(S)
return DestinationSerializer(lpart)
end
function localpart(s::DestinationSerializer)
if s.deser_obj !== nothing
return s.deser_obj
elseif s.generate !== nothing && (myid() in s.pids)
# Handle the special case where myid() is part of s.pids.
# In this case serialize/deserialize is not called as the remotecall is executed locally
return s.generate(findfirst(isequal(myid()), s.pids))
else
throw(ErrorException(string("Invalid state in DestinationSerializer.")))
end
end
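# Hedged sketch of how the serializer above is used in this package: the closure
# generates, per destination index, only the slice that worker needs, and
# serialization picks the index matching the pid of the receiving socket.
#
#     pids = reshape(workers(), (nworkers(), 1))
#     s = verified_destination_serializer(pids, size(pids)) do pididx
#         A[idxs[pididx]...]   # hypothetical per-worker slice, as in distribute()
#     end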
================================================
FILE: src/sort.jl
================================================
# Sorting a DVector using samplesort
function sample_n_setup_ref(d::DVector, sample_size; kwargs...)
lp = localpart(d)
llp = length(lp)
np = length(procs(d))
sample_size = llp > sample_size ? sample_size : llp
sorted = sort(lp; kwargs...)
sample = sorted[collect(1:div(llp,sample_size):llp)]
ref = RemoteChannel(()->Channel(np+1)) # To collect parts to be sorted locally later.
# First element is the locally sorted vector
put!(ref, sorted)
return (sample, ref)
end
function scatter_n_sort_localparts(d, myidx, refs, boundaries::Array{T}; by = identity, kwargs...) where T
if d === nothing
sorted = take!(refs[myidx]) # First entry in the remote channel is sorted localpart
else
sorted = sort(localpart(d); by = by, kwargs...)
end
# send respective parts to correct workers, iterate over sorted array
p_sorted = 1
for (i,r) in enumerate(refs)
p_till = length(sorted)+1
# calculate range to send to refs[i]
ctr=1
for x in sorted[p_sorted:end]
if by(x) > by(boundaries[i+1])
p_till = p_sorted+ctr-1
break
else
ctr += 1
end
end
if p_till == p_sorted
@async put!(r, Array{T}(undef,0))
else
v = sorted[p_sorted:p_till-1]
@async put!(r, v)
end
p_sorted = p_till
end
# wait to receive all of my parts from all other workers
lp_sorting=T[]
for _ in refs
v = take!(refs[myidx])
append!(lp_sorting, v)
end
sorted_ref=RemoteChannel()
put!(sorted_ref, sort!(lp_sorting; by = by, kwargs...))
return (sorted_ref, length(lp_sorting))
end
function compute_boundaries(d::DVector{T}; kwargs...) where T
pids = procs(d)
np = length(pids)
sample_sz_on_wrkr = 512
results = asyncmap(p -> remotecall_fetch(sample_n_setup_ref, p, d, sample_sz_on_wrkr; kwargs...), pids)
samples = Array{T}(undef,0)
for x in results
append!(samples, x[1])
end
sort!(samples; kwargs...)
samples[1] = typemin(T)
refs=[x[2] for x in results]
boundaries = samples[[1+(x-1)*div(length(samples), np) for x in 1:np]]
push!(boundaries, typemax(T))
return (boundaries, refs)
end
"""
sort(d::DVector; sample=true, kwargs...) -> DVector
Sorts and returns a new distributed vector.
The sorted vector may not have the same distribution as the original.
Keyword argument `sample` can take values:
- `true`: A sample of max size 512 is first taken from all nodes. This is used to balance the distribution of the sorted array on participating workers. Default is `true`.
- `false`: No sampling is done. Assumes a uniform distribution between min(d) and max(d)
- 2-element tuple of the form `(min, max)`: No sampling is done. Assumes a uniform distribution between specified min and max values
- Array{T}: The passed array is assumed to be a sample of the distribution and is used to balance the sorted distribution.
Keyword argument `alg` takes the same options as `Base.sort`.
"""
function Base.sort(d::DVector{T}; sample=true, kwargs...) where T
pids = procs(d)
np = length(pids)
# Only `alg`, `by` and `sample` are supported as keyword arguments
if length(filter(x->!(x in (:alg, :by)), [x[1] for x in kwargs])) > 0
throw(ArgumentError("Only `alg`, `by` and `sample` are supported as keyword arguments"))
end
if sample==true
boundaries, refs = compute_boundaries(d; kwargs...)
presorted=true
elseif sample==false
# Assume a uniform distribution between min and max values
minmax=asyncmap(p->remotecall_fetch(d->(minimum(localpart(d)), maximum(localpart(d))), p, d), pids)
min_d = minimum(T[x[1] for x in minmax])
max_d = maximum(T[x[2] for x in minmax])
return sort(d; sample=(min_d,max_d), kwargs...)
elseif isa(sample, Tuple)
# Assume a uniform distribution between the min and max values in the tuple
lb=sample[1]
ub=sample[2]
@assert lb<=ub
s = Array{T}(undef,np)
part = abs(ub - lb)/np
(isnan(part) || isinf(part)) && throw(ArgumentError("lower and upper bounds must not be infinities"))
for n in 1:np
v = lb + (n-1)*part
if T <: Integer
s[n] = round(v)
else
s[n] = v
end
end
return sort(d; sample=s, kwargs...)
elseif isa(sample, Array)
# Provided array is used as a sample
samples = sort(copy(sample))
samples[1] = typemin(T)
boundaries = samples[[1+(x-1)*div(length(samples), np) for x in 1:np]]
push!(boundaries, typemax(T))
presorted=false
refs=[RemoteChannel(p) for p in procs(d)]
else
throw(ArgumentError("keyword arg `sample` must be Boolean, Tuple(Min,Max) or an actual sample of data : " * string(sample)))
end
local_sort_results = Array{Tuple}(undef,np)
Base.asyncmap!((i,p) -> remotecall_fetch(
scatter_n_sort_localparts, p, presorted ? nothing : d, i, refs, boundaries; kwargs...),
local_sort_results, 1:np, pids)
# Construct a new DArray from the sorted refs. Remove parts with 0-length since
# the DArray constructor_from_refs does not yet support it. This implies that
# the participating workers for the sorted darray may be different from the original
# for highly non-uniform distributions.
local_sorted_refs = RemoteChannel[x[1] for x in filter(x->x[2]>0, local_sort_results)]
return DArray(local_sorted_refs)
end
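# Hedged usage sketch for the `sample` keyword (hypothetical session):
#
#     d = drand(1_000_000)
#     sort(d)                        # default: sample up to 512 elements per worker
#     sort(d; sample = false)        # assume uniform between minimum and maximum
#     sort(d; sample = (0.0, 1.0))   # assume uniform on a known interval
#     sort(d; sample = rand(1000))   # balance using a user-provided sample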
================================================
FILE: src/spmd.jl
================================================
module SPMD
using Distributed: RemoteChannel, myid, procs, remote_do, remotecall_fetch, remotecall_wait
using ..DistributedArrays: DistributedArrays, gather, next_did
export sendto, recvfrom, recvfrom_any, barrier, bcast, scatter, gather
export context_local_storage, context, spmd
mutable struct WorkerDataChannel
pid::Int
rc::Union{RemoteChannel,Nothing}
lock::ReentrantLock
WorkerDataChannel(pid) = new(pid, nothing, ReentrantLock())
end
mutable struct SPMDContext
id::Tuple{Int,Int}
chnl::Channel
store::Dict{Any,Any}
pids::Array{Int}
function SPMDContext(id::Tuple{Int,Int}, pids::Vector{Int})
ctxt = new(id, Channel(typemax(Int)), Dict{Any,Any}(), pids)
if first(id) == myid()
finalizer(ctxt) do ctxt
for p in ctxt.pids
@async remote_do(delete_ctxt_id, p, ctxt.id)
end
end
end
return ctxt
end
end
# Every worker is associated with its own RemoteChannel
struct WorkerChannelDict
data::Dict{Int, WorkerDataChannel}
lock::ReentrantLock
WorkerChannelDict() = new(Dict{Int, WorkerDataChannel}(), ReentrantLock())
end
const WORKERCHANNELS = WorkerChannelDict()
Base.get!(f::Function, x::WorkerChannelDict, id::Int) = @lock x.lock get!(f, x.data, id)
# mapping between a context id and context object
struct SPMDContextDict
data::Dict{Tuple{Int,Int}, SPMDContext}
lock::ReentrantLock
SPMDContextDict() = new(Dict{Tuple{Int,Int}, SPMDContext}(), ReentrantLock())
end
const CONTEXTS = SPMDContextDict()
Base.delete!(x::SPMDContextDict, id::Tuple{Int,Int}) = @lock x.lock delete!(x.data, id)
Base.get!(f::Function, x::SPMDContextDict, id::Tuple{Int,Int}) = @lock x.lock get!(f, x.data, id)
function context_local_storage()
ctxt = get_ctxt_from_id(task_local_storage(:SPMD_CTXT))
ctxt.store
end
context(pids::Vector{Int}=procs()) = SPMDContext(next_did(), pids)
# Multiple SPMD blocks can be executed concurrently,
# each in its own context. Messages are still sent as part of the
# same remote channels associated with each worker. They are
# read from the remote channel into local channels each associated
# with a different run of `spmd`.
function get_dc(wc::WorkerDataChannel)
lock(wc.lock)
try
if wc.rc === nothing
if wc.pid == myid()
myrc = RemoteChannel(()->Channel(typemax(Int)))
wc.rc = myrc
# start a task to transfer incoming messages into local
# channels based on the execution context
@async begin
while true
msg = take!(myrc)
ctxt_id = msg[1] # First element of the message tuple is the context id.
ctxt = get_ctxt_from_id(ctxt_id)
put!(ctxt.chnl, msg[2:end]) # stripping the context_id
end
end
else
wc.rc = remotecall_fetch(()->get_remote_dc(myid()), wc.pid)
end
end
finally
unlock(wc.lock)
end
return wc.rc
end
function get_ctxt_from_id(ctxt_id::Tuple{Int,Int})
ctxt = get!(CONTEXTS, ctxt_id) do
return SPMDContext(ctxt_id, Int[])
end
return ctxt
end
# Since modules may be loaded in any order on the workers,
# and workers may be dynamically added, pull in the remote channel
# handles when accessed for the first time.
function get_remote_dc(pid::Int)
wc = get!(WORKERCHANNELS, pid) do
return WorkerDataChannel(pid)
end
return get_dc(wc)
end
function send_msg(to, typ, data, tag)
ctxt_id = task_local_storage(:SPMD_CTXT)
@async begin
dc = get_remote_dc(to)
put!(dc, (ctxt_id, typ, myid(), data, tag))
# println("Sent to ", dc)
end
end
function get_msg(typ_check, from_check=false, tag_check=nothing)
ctxt_id = task_local_storage(:SPMD_CTXT)
chnl = get_ctxt_from_id(ctxt_id).chnl
unexpected_msgs=[]
while true
typ, from, data, tag = take!(chnl)
if (from_check != false && from_check != from) || (typ != typ_check) || (tag != tag_check)
push!(unexpected_msgs, (typ, from, data, tag))
# println("Unexpected in get_msg ", unexpected_msgs, " looking for ", typ_check, " ", from_check, " ", tag_check)
else
# put all the messages we read (but did not expect) back on the local channel
foreach(x->put!(chnl, x), unexpected_msgs)
return (from, data)
end
end
end
function sendto(pid::Int, data::Any; tag=nothing)
send_msg(pid, :sendto, data, tag)
end
function recvfrom(pid::Int; tag=nothing)
_, data = get_msg(:sendto, pid, tag)
return data
end
function recvfrom_any(; tag=nothing)
from, data = get_msg(:sendto, false, tag)
return (from,data)
end
function barrier(;pids=procs(), tag=nothing)
# send a message to everyone
for p in sort(pids)
send_msg(p, :barrier, nothing, tag)
end
# make sure we recv a message from everyone
pending=deepcopy(pids)
unexpected_msgs=[]
while length(pending) > 0
from, _ = get_msg(:barrier, false, tag)
if from in pending
filter!(x->x!=from, pending)
else
# handle case of 2 (or more) consecutive barrier calls.
push!(unexpected_msgs, (:barrier, from, nothing, tag))
# println("Unexpected ", from)
end
# length(pending) == 1 && println("Waiting for ", pending)
end
ctxt_id = task_local_storage(:SPMD_CTXT)
chnl = get_ctxt_from_id(ctxt_id).chnl
foreach(x->put!(chnl, x), unexpected_msgs)
return nothing
end
function bcast(data::Any, pid::Int; tag=nothing, pids=procs())
if myid() == pid
for p in filter(x->x!=pid, sort(pids))
send_msg(p, :bcast, data, tag)
end
return data
else
from, data = get_msg(:bcast, pid, tag)
return data
end
end
function scatter(x, pid::Int; tag=nothing, pids=procs())
if myid() == pid
@assert rem(length(x), length(pids)) == 0
cnt = div(length(x), length(pids))
for (i,p) in enumerate(sort(pids))
p == pid && continue
send_msg(p, :scatter, x[cnt*(i-1)+1:cnt*i], tag)
end
myidx = findfirst(isequal(pid), sort(pids))
return x[cnt*(myidx-1)+1:cnt*myidx]
else
_, data = get_msg(:scatter, pid, tag)
return data
end
end
function DistributedArrays.gather(x, pid::Int; tag=nothing, pids=procs())
if myid() == pid
gathered_data = Array{Any}(undef, length(pids))
myidx = findfirst(isequal(pid), sort(pids))
gathered_data[myidx] = x
n = length(pids) - 1
while n > 0
from, data_x = get_msg(:gather, false, tag)
fromidx = findfirst(isequal(from), sort(pids))
gathered_data[fromidx] = data_x
n=n-1
end
return gathered_data
else
send_msg(pid, :gather, x, tag)
return x
end
end
function spmd_local(f, ctxt_id, clear_ctxt)
task_local_storage(:SPMD_CTXT, ctxt_id)
f()
clear_ctxt && delete_ctxt_id(ctxt_id)
return nothing
end
function spmd(f, args...; pids=procs(), context=nothing)
f_noarg = ()->f(args...)
clear_ctxt = false
if context === nothing
ctxt_id = next_did()
clear_ctxt = true # temporary unique context created for this run.
# should be cleared at the end of the run.
else
ctxt_id = context.id
end
@sync for p in pids
@async remotecall_wait(spmd_local, p, f_noarg, ctxt_id, clear_ctxt)
end
nothing
end
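# Hedged SPMD usage sketch (hypothetical session; assumes workers are present
# and the package is loaded everywhere): the same closure runs on every worker
# and coordinates through the message primitives defined above.
#
#     using DistributedArrays, DistributedArrays.SPMD
#     spmd(pids = workers()) do
#         root = first(sort(workers()))
#         x = bcast(myid() == root ? rand(3) : nothing, root; pids = workers())
#         barrier(; pids = workers())   # every worker has `x` past this point
#     end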
delete_ctxt_id(ctxt_id::Tuple{Int,Int}) = delete!(CONTEXTS, ctxt_id)
Base.close(ctxt::SPMDContext) = finalize(ctxt)
end
================================================
FILE: test/aqua.jl
================================================
using DistributedArrays, Test
import Aqua
@testset "Aqua" begin
Aqua.test_all(DistributedArrays; ambiguities = (; broken = true))
end
================================================
FILE: test/darray.jl
================================================
using Test, LinearAlgebra, SpecialFunctions
using Statistics: mean
using SparseArrays: nnz
using Random
@everywhere using SparseArrays: sprandn
@testset "test distribute and other constructors" begin
A = rand(1:100, (100,100))
@testset "test default distribute" begin
DA = distribute(A)
@test length(procs(DA)) == nworkers()
@test sum(DA) == sum(A)
close(DA)
end
@testset "test distribute with procs arguments" begin
DA = distribute(A, procs = procs())
@test length(procs(DA)) == nprocs()
@test sum(DA) == sum(A)
close(DA)
end
@testset "test distribute with procs and dist arguments" begin
DA = distribute(A, procs = [1, 2], dist = [1,2])
@test size(procs(DA)) == (1,2)
@test sum(DA) == sum(A)
close(DA)
end
@testset "Create darray with unconventional distribution and distribute like it" begin
block = 10
Y = nworkers() * block
X = nworkers() * block
remote_parts = map(workers()) do wid
remotecall(rand, wid, block, Y)
end
DA1 = DArray(reshape(remote_parts, (length(remote_parts), 1)))
A = rand(X, Y)
DA2 = distribute(A, DA1)
@test size(DA1) == size(DA2)
close(DA1)
close(DA2)
end
@testset "Global DArray serialization issue #134" begin
global A134 = drandn(1)
D2 = DArray(I -> DistributedArrays.localpart(A134), A134)
@test D2 == A134
close(A134)
close(D2)
end
@testset "empty_localpart should work when only constructor (not conversion is defined)" begin
@test DistributedArrays.empty_localpart(Float64,2,LowerTriangular{Float64,Matrix{Float64}}) isa
LowerTriangular
end
@testset "Consistent Uneven Distribution issue #166" begin
DA = drand((2+length(OTHERIDS),), [MYID, OTHERIDS])
@test fetch(@spawnat MYID length(localpart(DA)) == 2)
@test fetch(@spawnat OTHERIDS length(localpart(DA)) == 1)
close(DA)
@test DistributedArrays.defaultdist(50,4) == [1,14,27,39,51]
end
@testset "Inhomogeneous typeof(localpart)" begin
block = 10
Y = nworkers() * block
X = nworkers() * block
@assert nworkers() > 1
@test_throws ErrorException DArray((X, Y)) do I
eltype = first(CartesianIndices(I)) == CartesianIndex(1, 1) ? Int64 : Float64
zeros(eltype, map(length, I))
end
end
end
check_leaks()
@testset "test DArray equality/copy/deepcopy" begin
D = drand((200,200), [MYID, OTHERIDS])
@testset "test isequal(::DArray, ::DArray)" begin
DC = copy(D)
@test D == DC
close(DC)
end
@testset "test [deep]copy(::DArray) does a copy of each localpart" begin
DC = copy(D)
@spawnat OTHERIDS localpart(DC)[1] = 0
@test fetch(@spawnat OTHERIDS localpart(D)[1] != 0)
DD = deepcopy(D)
@spawnat OTHERIDS localpart(DD)[1] = 0
@test fetch(@spawnat OTHERIDS localpart(D)[1] != 0)
close(DC)
close(DD)
end
@testset "test copy(::DArray) is shallow" begin
DA = @DArray [rand(100) for i=1:10]
DC = copy(DA)
id = procs(DC)[1]
@test DA == DC
fetch(@spawnat id localpart(DC)[1] .= -1.0)
@test DA == DC
@test fetch(@spawnat id all(localpart(DA)[1] .== -1.0))
close(DA)
close(DC)
end
@testset "test deepcopy(::DArray) is not shallow" begin
DA = @DArray [rand(100) for i=1:10]
DC = deepcopy(DA)
id = procs(DC)[1]
@test DA == DC
fetch(@spawnat id localpart(DC)[1] .= -1.0)
@test DA != DC
@test fetch(@spawnat id all(localpart(DA)[1] .>= 0.0))
close(DA)
close(DC)
end
close(D)
end
check_leaks()
@testset "test DArray similar" begin
D = drand((200,200), [MYID, OTHERIDS])
DS = similar(D,Float16)
@testset "test eltype of a similar" begin
@test eltype(DS) == Float16
end
@testset "test dims of a similar" begin
@test size(D) == size(DS)
end
close(D)
close(DS)
end
check_leaks()
@testset "test DArray reshape" begin
D = drand((200,200), [MYID, OTHERIDS])
@testset "Test error-throwing in reshape" begin
@test_throws DimensionMismatch reshape(D,(100,100))
end
DR = reshape(D,(100,400))
@testset "Test reshape" begin
@test size(DR) == (100,400)
end
close(D)
end
check_leaks()
@testset "test @DArray comprehension constructor" begin
@testset "test valid use of @DArray" begin
D = @DArray [i+j for i=1:10, j=1:10]
@test D == [i+j for i=1:10, j=1:10]
close(D)
end
@testset "test invalid use of @DArray" begin
#@test_throws ArgumentError eval(:((@DArray [1,2,3,4])))
@test_throws LoadError eval(:((@DArray [1,2,3,4])))
end
end
check_leaks()
@testset "test DArray / Array conversion" begin
D = drand((200,200), [MYID, OTHERIDS])
@testset "test construct Array from (Sub)DArray" begin
S = Matrix{Float64}(D[1:150, 1:150])
A = Matrix{Float64}(D)
@test A[1:150,1:150] == S
D2 = DArray{Float64,2,Matrix{Float64}}(A)
@test D2 == D
DistributedArrays.allowscalar(true)
@test fetch(@spawnat MYID localpart(D)[1,1]) == D[1,1]
@test fetch(@spawnat OTHERIDS localpart(D)[1,1]) == D[1,101]
DistributedArrays.allowscalar(false)
close(D2)
S2 = Vector{Float64}(D[4, 23:176])
@test A[4, 23:176] == S2
S3 = Vector{Float64}(D[23:176, 197])
@test A[23:176, 197] == S3
S4 = zeros(4)
setindex!(S4, D[3:4, 99:100], :)
# FixMe! Hitting the AbstractArray fallback here is extremely unfortunate but vec() becomes a ReshapedArray which makes it difficult to hit DArray methods. Unless this can be fixed in Base, we might have to add special methods for ReshapedArray{DArray}
DistributedArrays.allowscalar(true)
@test S4 == vec(D[3:4, 99:100])
@test S4 == vec(A[3:4, 99:100])
DistributedArrays.allowscalar(false)
S5 = zeros(2,2)
setindex!(S5, D[1,1:4], :, 1:2)
# FixMe! Hitting the AbstractArray fallback here is extremely unfortunate but vec() becomes a ReshapedArray which makes it difficult to hit DArray methods. Unless this can be fixed in Base, we might have to add special methods for ReshapedArray{DArray}
DistributedArrays.allowscalar(true)
@test vec(S5) == D[1, 1:4]
@test vec(S5) == A[1, 1:4]
DistributedArrays.allowscalar(false)
end
close(D)
end
check_leaks()
@testset "test copy!" begin
D1 = dzeros((10,10))
r1 = remotecall_wait(() -> randn(3,10), workers()[1])
r2 = remotecall_wait(() -> randn(7,10), workers()[2])
D2 = DArray(reshape([r1; r2], 2, 1))
copyto!(D2, D1)
@test D1 == D2
close(D1)
close(D2)
end
check_leaks()
@testset "test DArray reduce" begin
D = DArray(id->fill(myid(), map(length,id)), (10,10), [MYID, OTHERIDS])
@testset "test reduce" begin
@test reduce(+, D) == ((50*MYID) + (50*OTHERIDS))
end
@testset "test map / reduce" begin
D2 = map(x->1, D)
@test D2 isa DArray
@test reduce(+, D2) == 100
close(D2)
end
@testset "test map! / reduce" begin
map!(x->1, D, D)
@test reduce(+, D) == 100
end
close(D)
end
check_leaks()
@testset "test rmul" begin
A = randn(100,100)
DA = distribute(A)
@test rmul!(DA, 2) == rmul!(A, 2)
close(DA)
end
check_leaks()
@testset "test rmul!(Diagonal, A)" begin
A = randn(100, 100)
b = randn(100)
D = Diagonal(b)
DA = distribute(A)
@test lmul!(D, A) == lmul!(D, DA)
close(DA)
A = randn(100, 100)
b = randn(100)
DA = distribute(A)
@test rmul!(A, D) == rmul!(DA, D)
close(DA)
end
check_leaks()
@testset "test mapreduce on DArrays" begin
for _ = 1:25, f = [x -> Int128(2x), x -> Int128(x^2), x -> Int128(x^2 + 2x - 1)], opt = [+, *]
A = rand(1:5, rand(2:30))
DA = distribute(A)
@test DA isa DArray
@test mapreduce(f, opt, DA) - mapreduce(f, opt, A) == 0
close(DA)
end
end
check_leaks()
@testset "test mapreducedim on DArrays" begin
D = DArray(I->fill(myid(), map(length,I)), (73,73), [MYID, OTHERIDS])
D2 = map(x->1, D)
@test D2 isa DArray
@test mapreduce(t -> t*t, +, D2, dims=1) == mapreduce(t -> t*t, +, convert(Array, D2), dims=1)
@test mapreduce(t -> t*t, +, D2, dims=2) == mapreduce(t -> t*t, +, convert(Array, D2), dims=2)
@test mapreduce(t -> t*t, +, D2, dims=(1,2)) == mapreduce(t -> t*t, +, convert(Array, D2), dims=(1,2))
# Test non-regularly chunked DArrays
r1 = DistributedArrays.remotecall(() -> sprandn(3, 10, 0.1), workers()[1])
r2 = DistributedArrays.remotecall(() -> sprandn(7, 10, 0.1), workers()[2])
D = DArray(reshape([r1; r2], (2,1)))
@test Array(sum(D, dims=2)) == sum(Array(D), dims=2)
# close(D)
# close(D2)
d_closeall() # temp created by the mapreduce above
end
check_leaks()
@testset "test mapreducdim, reducedim on DArrays" begin
dims = (20,20,20)
DA = drandn(dims)
A = convert(Array, DA)
@testset "dimension $dms" for dms in (1, 2, 3, (1,2), (1,3), (2,3), (1,2,3))
@test mapreduce(t -> t*t, +, A, dims=dms) ≈ mapreduce(t -> t*t, +, DA, dims=dms)
@test mapreduce(t -> t*t, +, A, dims=dms, init=1.0) ≈ mapreduce(t -> t*t, +, DA, dims=dms, init=1.0)
@test reduce(*, A, dims=dms) ≈ reduce(*, DA, dims=dms)
@test reduce(*, A, dims=dms, init=2.0) ≈ reduce(*, DA, dims=dms, init=2.0)
end
close(DA)
d_closeall() # temp created by the mapreduce above
end
check_leaks()
@testset "test statistical functions on DArrays" begin
dims = (20,20,20)
DA = drandn(dims)
A = Array(DA)
@testset "test $f for dimension $dms" for f in (mean, ), dms in (1, 2, 3, (1,2), (1,3), (2,3), (1,2,3))
# std is pending implementation
@test f(DA, dims=dms) ≈ f(A, dims=dms)
end
close(DA)
d_closeall() # temporaries created above
end
check_leaks()
unpack(ex::Base.CapturedException) = unpack(ex.ex)
unpack(ex::Distributed.RemoteException) = unpack(ex.captured)
unpack(ex::Base.TaskFailedException) = unpack(ex.task.exception)
unpack(ex) = ex
@testset "test sum on DArrays" begin
A = randn(100,100)
DA = distribute(A)
# sum either throws an ArgumentError, a CompositeException of ArgumentErrors,
# or a RemoteException wrapping an ArgumentError
try
sum(DA, dims=-1)
catch err
if isa(err, CompositeException)
@test !isempty(err.exceptions)
for excep in err.exceptions
# Unpack the remote exception
orig_err = unpack(excep)
@test isa(orig_err, ArgumentError)
end
elseif isa(err, RemoteException)
@test err.captured isa CapturedException
@test err.captured.ex isa ArgumentError
else
@test isa(err, ArgumentError)
end
end
try
sum(DA, dims=0)
catch err
if isa(err, CompositeException)
@test !isempty(err.exceptions)
for excep in err.exceptions
# Unpack the remote exception
orig_err = unpack(excep)
@test isa(orig_err, ArgumentError)
end
elseif isa(err, RemoteException)
@test err.captured isa CapturedException
@test err.captured.ex isa ArgumentError
else
@test isa(err, ArgumentError)
end
end
@test sum(DA) ≈ sum(A)
@test sum(DA, dims=1) ≈ sum(A, dims=1)
@test sum(DA, dims=2) ≈ sum(A, dims=2)
@test sum(DA, dims=3) ≈ sum(A, dims=3)
close(DA)
d_closeall() # temporaries created above
end
check_leaks()
@testset "test size on DArrays" begin
A = randn(100,100)
DA = distribute(A)
@test_throws BoundsError size(DA, 0)
@test size(DA,1) == size(A,1)
@test size(DA,2) == size(A,2)
@test size(DA,3) == size(A,3)
close(DA)
end
check_leaks()
# test length / lastindex
@testset "test collections API" begin
A = randn(23,23)
DA = distribute(A)
@testset "test length" begin
@test length(DA) == length(A)
end
@testset "test lastindex" begin
@test lastindex(DA) == lastindex(A)
end
close(DA)
end
check_leaks()
@testset "test max / min / sum" begin
a = map(x -> Int(round(rand() * 100)) - 50, Array{Int}(undef,100,1000))
d = distribute(a)
@test sum(d) == sum(a)
@test maximum(d) == maximum(a)
@test minimum(d) == minimum(a)
@test maximum(abs, d) == maximum(abs, a)
@test minimum(abs, d) == minimum(abs, a)
@test sum(abs, d) == sum(abs, a)
@test sum(abs2, d) == sum(abs2, a)
@test extrema(d) == extrema(a)
close(d)
end
check_leaks()
@testset "test all / any" begin
a = map(x->Int(round(rand() * 100)) - 50, Array{Int}(undef,100,1000))
a = [true for i in 1:100]
d = distribute(a)
@test all(d)
@test any(d)
close(d)
a[50] = false
d = distribute(a)
@test !all(d)
@test any(d)
close(d)
a = [false for i in 1:100]
d = distribute(a)
@test !all(d)
@test !any(d)
close(d)
d = dones(10,10)
@test !all(x-> x>1.0, d)
@test all(x-> x>0.0, d)
close(d)
a = ones(10,10)
a[10] = 2.0
d = distribute(a)
@test any(x-> x == 1.0, d)
@test any(x-> x == 2.0, d)
@test !any(x-> x == 3.0, d)
close(d)
end
check_leaks()
@testset "test count" begin
a = ones(10,10)
a[10] = 2.0
d = distribute(a)
@test count(x-> x == 2.0, d) == 1
@test count(x-> x == 1.0, d) == 99
@test count(x-> x == 0.0, d) == 0
close(d)
end
check_leaks()
@testset "test prod" begin
a = fill(2, 10);
d = distribute(a);
@test prod(d) == 2^10
close(d)
end
check_leaks()
@testset "test zeros" begin
@testset "1D dzeros default element type" begin
A = dzeros(10)
@test A == zeros(10)
@test eltype(A) == Float64
@test size(A) == (10,)
close(A)
end
@testset "1D dzeros with specified element type" begin
A = dzeros(Int, 10)
@test A == zeros(10)
@test eltype(A) == Int
@test size(A) == (10,)
close(A)
end
@testset "2D dzeros default element type, Dims constructor" begin
A = dzeros((10,10))
@test A == zeros((10,10))
@test eltype(A) == Float64
@test size(A) == (10,10)
close(A)
end
@testset "2D dzeros specified element type, Dims constructor" begin
A = dzeros(Int, (10,10))
@test A == zeros(Int, (10,10))
@test eltype(A) == Int
@test size(A) == (10,10)
close(A)
end
@testset "2D dzeros, default element type" begin
A = dzeros(10,10)
@test A == zeros(10,10)
@test eltype(A) == Float64
@test size(A) == (10,10)
close(A)
end
@testset "2D dzeros, specified element type" begin
A = dzeros(Int, 10, 10)
@test A == zeros(Int, 10, 10)
@test eltype(A) == Int
@test size(A) == (10,10)
close(A)
end
end
check_leaks()
@testset "test dones" begin
@testset "1D dones default element type" begin
A = dones(10)
@test A == ones(10)
@test eltype(A) == Float64
@test size(A) == (10,)
close(A)
end
@testset "1D dones with specified element type" begin
A = dones(Int, 10)
@test eltype(A) == Int
@test size(A) == (10,)
close(A)
end
@testset "2D dones default element type, Dims constructor" begin
A = dones((10,10))
@test A == ones((10,10))
@test eltype(A) == Float64
@test size(A) == (10,10)
close(A)
end
@testset "2D dones specified element type, Dims constructor" begin
A = dones(Int, (10,10))
@test A == ones(Int, (10,10))
@test eltype(A) == Int
@test size(A) == (10,10)
close(A)
end
@testset "2D dones, default element type" begin
A = dones(10,10)
@test A == ones(10,10)
@test eltype(A) == Float64
@test size(A) == (10,10)
close(A)
end
@testset "2D dones, specified element type" begin
A = dones(Int, 10, 10)
@test A == ones(Int, 10, 10)
@test eltype(A) == Int
@test size(A) == (10,10)
close(A)
end
end
check_leaks()
@testset "test drand" begin
@testset "1D drand" begin
A = drand(100)
@test eltype(A) == Float64
@test size(A) == (100,)
@test all(x-> x >= 0.0 && x <= 1.0, A)
close(A)
end
@testset "1D drand, specified element type" begin
A = drand(Int, 100)
@test eltype(A) == Int
@test size(A) == (100,)
close(A)
end
@testset "1D drand, UnitRange" begin
A = drand(1:10, 100)
@test eltype(A) == Int
@test size(A) == (100,)
close(A)
end
@testset "1D drand, Array" begin
A = drand([-1,0,1], 100)
@test eltype(A) == Int
@test size(A) == (100,)
close(A)
end
@testset "2D drand, Dims constructor" begin
A = drand((50,50))
@test eltype(A) == Float64
@test size(A) == (50,50)
@test all(x-> x >= 0.0 && x <= 1.0, A)
close(A)
end
@testset "2D drand" begin
A = drand(100,100)
@test eltype(A) == Float64
@test size(A) == (100,100)
@test all(x-> x >= 0.0 && x <= 1.0, A)
close(A)
end
@testset "2D drand, Dims constructor, specified element type" begin
A = drand(Int, (100,100))
@test eltype(A) == Int
@test size(A) == (100,100)
close(A)
end
@testset "2D drand, specified element type" begin
A = drand(Int, 100, 100)
@test eltype(A) == Int
@test size(A) == (100,100)
close(A)
end
end
check_leaks()
@testset "test randn" begin
@testset "1D drandn" begin
A = drandn(100)
@test eltype(A) == Float64
@test size(A) == (100,)
close(A)
end
@testset "2D drandn, Dims constructor" begin
A = drandn((50,50))
@test eltype(A) == Float64
@test size(A) == (50,50)
close(A)
end
@testset "2D drandn" begin
A = drandn(100,100)
@test eltype(A) == Float64
@test size(A) == (100,100)
close(A)
end
end
check_leaks()
@testset "test transpose/adjoint" begin
@testset "test transpose real" begin
A = drand(Float64, 100, 200)
@test copy(transpose(A)) == transpose(Array(A))
close(A)
end
@testset "test transpose complex" begin
A = drand(ComplexF64, 200, 100)
@test copy(transpose(A)) == transpose(Array(A))
close(A)
end
@testset "test adjoint real" begin
A = drand(Float64, 200, 100)
@test copy(adjoint(A)) == adjoint(Array(A))
close(A)
end
@testset "test adjoint complex" begin
A = drand(ComplexF64, 100, 200)
@test copy(adjoint(A)) == adjoint(Array(A))
close(A)
end
d_closeall() # close the temporaries created above
end
check_leaks()
@testset "makelocal" begin
A = randn(5*nprocs(), 5*nprocs())
dA = distribute(A, procs=procs())
for i in 1:size(dA, 2)
a = DistributedArrays.makelocal(dA, :, i)
@test all(Array(view(dA, :, i)) .== a)
@test all( view( A, :, i) .== a)
end
for i in 1:size(dA, 1)
a = DistributedArrays.makelocal(dA, i, :)
@test all(Array(view(dA, i:i, :)) .== a)
@test all( view( A, i:i, :) .== a)
end
a = DistributedArrays.makelocal(dA, 1:5, 1:5)
@test all(Array(view(dA, 1:5, 1:5)) .== a)
@test all( view( A, 1:5, 1:5) .== a)
close(dA)
end
@testset "test convert from subdarray" begin
a = drand(20, 20);
s = view(a, 1:5, 5:8)
@test isa(s, SubDArray)
@test s == DArray(s)
s = view(a, 6:5, 5:8)
@test isa(s, SubDArray)
@test s == DArray(s)
close(a)
d_closeall() # close the temporaries created above
end
check_leaks()
@testset "test scalar math" begin
a = drand(20, 20);
b = convert(Array, a)
@testset "$f" for f in (-, abs, abs2, acos, acosd, acot,
acotd, acsch, angle, asech, asin,
asind, asinh, atan, atand, atanh,
big, cbrt, ceil, cis, complex, conj,
cos, cosc, cosd, cosh, cospi, cot,
cotd, coth, csc, cscd, csch, dawson,
deg2rad, digamma, erf, erfc, erfcinv,
erfcx, erfi, erfinv, exp, exp10, exp2,
expm1, exponent, float, floor, gamma, imag,
invdigamma, isfinite, isinf, isnan,
loggamma, log, log10, log1p, log2, rad2deg, real,
sec, secd, sech, sign, sin, sinc, sind,
sinh, sinpi, sqrt, tan, tand, tanh, trigamma)
@test f.(a) == f.(b)
end
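# asec, acosh, and the other functions below are real-valued only for
# arguments >= 1, so shift the uniform [0, 1) samples into [1, 2) first.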
a = a .+ 1
b = b .+ 1
@testset "$f" for f in (asec, asecd, acosh, acsc, acscd, acoth)
@test f.(a) == f.(b)
end
close(a)
d_closeall() # close the temporaries created above
end
check_leaks()
@testset "test mapslices" begin
A = randn(5,5,5)
D = distribute(A, procs = workers(), dist = [1, 1, min(nworkers(), 5)])
@test mapslices(svdvals, D, dims=(1,2)) ≈ mapslices(svdvals, A, dims=(1,2))
@test mapslices(svdvals, D, dims=(1,3)) ≈ mapslices(svdvals, A, dims=(1,3))
@test mapslices(svdvals, D, dims=(2,3)) ≈ mapslices(svdvals, A, dims=(2,3))
@test mapslices(sort, D, dims=(1,)) ≈ mapslices(sort, A, dims=(1,))
@test mapslices(sort, D, dims=(2,)) ≈ mapslices(sort, A, dims=(2,))
@test mapslices(sort, D, dims=(3,)) ≈ mapslices(sort, A, dims=(3,))
# issue #3613
B = mapslices(sum, dones(Float64, (2,3,4), workers(), [1,1,min(nworkers(),4)]), dims=[1,2])
@test size(B) == (1,1,4)
@test all(B.==6)
# issue #5141
C1 = mapslices(x-> maximum(-x), D, dims=[])
@test C1 == -D
# issue #5177
c = dones(Float64, (2,3,4,5), workers(), [1,1,1,min(nworkers(),5)])
m1 = mapslices(x-> ones(2,3), c, dims=[1,2])
m2 = mapslices(x-> ones(2,4), c, dims=[1,3])
m3 = mapslices(x-> ones(3,4), c, dims=[2,3])
@test size(m1) == size(m2) == size(m3) == size(c)
n1 = mapslices(x-> ones(6), c, dims=[1,2])
n2 = mapslices(x-> ones(6), c, dims=[1,3])
n3 = mapslices(x-> ones(6), c, dims=[2,3])
n1a = mapslices(x-> ones(1,6), c, dims=[1,2])
n2a = mapslices(x-> ones(1,6), c, dims=[1,3])
n3a = mapslices(x-> ones(1,6), c, dims=[2,3])
@test (size(n1a) == (1,6,4,5) && size(n2a) == (1,3,6,5) && size(n3a) == (2,1,6,5))
@test (size(n1) == (6,1,4,5) && size(n2) == (6,3,1,5) && size(n3) == (2,6,1,5))
close(D)
close(c)
d_closeall() # close the temporaries created above
end
check_leaks()
@testset "test scalar ops" begin
a = drand(20,20)
b = convert(Array, a)
c = drand(20,20)
d = convert(Array, c)
@testset "$f" for f in (:+, :-, :*, :/, :%)
x = rand()
@test @eval ($f).($a, $x) == ($f).($b, $x)
@test @eval ($f).($x, $a) == ($f).($x, $b)
@test @eval ($f).($a, $c) == ($f).($b, $d)
end
close(a)
close(c)
a = dones(Int, 20, 20)
b = convert(Array, a)
@testset "$f" for f in (:<<, :>>)
@test @eval ($f).($a, 2) == ($f).($b, 2)
@test @eval ($f).(2, $a) == ($f).(2, $b)
@test @eval ($f).($a, $a) == ($f).($b, $b)
end
@testset "$f" for f in (:rem,)
x = rand()
@test @eval ($f).($a, $x) == ($f).($b, $x)
end
close(a)
d_closeall() # close the temporaries created above
end
check_leaks()
@testset "test broadcast ops" begin
wrkrs = workers()
nwrkrs = length(wrkrs)
nrows = 20 * nwrkrs
ncols = 10 * nwrkrs
a = drand((nrows,ncols), wrkrs, (1, nwrkrs))
m = mean(a, dims=1)
c = a .- m
d = convert(Array, a) .- convert(Array, m)
@test c == d
e = @DArray [ones(10) for i=1:4]
f = 2 .* e
@test Array(f) == 2 .* Array(e)
@test Array(map(x -> sum(x) .+ 2, e)) == map(x -> sum(x) .+ 2, e)
@testset "test nested broadcast" begin
g = a .- m .* sin.(c)
@test Array(g) == Array(a) .- Array(m) .* sin.(Array(c))
end
@testset "Broadcasting into DArray" begin
a .= ones(nrows, ncols)
@test all(isone, a)
a .= 3 .+ abs2.(@view(zeros(nrows, ncols + 5)[:, 6:end]))
@test all(x -> x == 3, a)
end
# @testset "lazy wrapped broadcast" begin
# l = similar(a)
# l[1:10, :] .= view(a, 1:10, : )
# end
d_closeall()
end
check_leaks()
@testset "test matrix multiplication" begin
A = drandn(20,20)
b = drandn(20)
B = drandn(20,20)
@test norm(convert(Array, A*b) - convert(Array, A)*convert(Array, b), Inf) < sqrt(eps())
@test norm(convert(Array, A*B) - convert(Array, A)*convert(Array, B), Inf) < sqrt(eps())
@test norm(convert(Array, A'*b) - convert(Array, A)'*convert(Array, b), Inf) < sqrt(eps())
@test norm(convert(Array, A'*B) - convert(Array, A)'*convert(Array, B), Inf) < sqrt(eps())
close(A)
close(b)
close(B)
d_closeall() # close the temporaries created above
end
check_leaks()
@testset "dot product" begin
A = drandn(20,20)
b = drandn(20)
c = A * b
@test dot(c, b) ≈ dot(convert(Array, c), convert(Array, b))
close(A)
close(b)
close(c)
end
check_leaks()
@testset "test norm" begin
x = drandn(20)
@test abs(norm(x) - norm(convert(Array, x))) < sqrt(eps())
@test abs(norm(x, 1) - norm(convert(Array, x), 1)) < sqrt(eps())
@test abs(norm(x, 2) - norm(convert(Array, x), 2)) < sqrt(eps())
@test abs(norm(x, Inf) - norm(convert(Array, x), Inf)) < sqrt(eps())
close(x)
end
check_leaks()
@testset "test axpy!" begin
for (x, y) in ((drandn(20), drandn(20)),
(drandn(20, 2), drandn(20, 2)))
@test Array(axpy!(2.0, x, copy(y))) ≈ axpy!(2.0, Array(x), Array(y))
@test_throws DimensionMismatch axpy!(2.0, x, zeros(length(x) + 1))
close(x)
close(y)
end
d_closeall() # close the temporaries created above
end
check_leaks()
@testset "test ppeval" begin
A = drandn((10, 10, nworkers()), workers(), [1, 1, nworkers()])
B = drandn((10, nworkers()), workers(), [1, nworkers()])
R = zeros(10, nworkers())
for i = 1:nworkers()
R[:, i] = convert(Array, A)[:, :, i]*convert(Array, B)[:, i]
end
@test convert(Array, ppeval(*, A, B)) ≈ R
@test sum(ppeval(eigvals, A)) ≈ sum(ppeval(eigvals, A, Matrix{Float64}(I,10,10)))
close(A)
close(B)
d_closeall() # close the temporaries created above
end
check_leaks()
@testset "test nnz" begin
A = sprandn(10, 10, 0.5)
@test nnz(distribute(A)) == nnz(A)
end
@testset "test matmatmul" begin
A = drandn(30, 30)
B = drandn(30, 20)
a = convert(Array, A)
b = convert(Array, B)
AB = A * B
AtB = transpose(A) * B
AcB = A' * B
ab = a * b
atb = transpose(a) * b
acb = a' * b
@test AB ≈ ab
@test AtB ≈ atb
@test AcB ≈ acb
d_closeall() # close the temporaries created above
end
@testset "sort, T = $T, 10^$i elements" for i in 0:6, T in [Int, Float64]
d = DistributedArrays.drand(T, 10^i)
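# The `sample` kwarg selects the splitter strategy for the samplesort
# (roughly, per src/sort.jl): `true` draws a sample from the data, `false`
# assumes values uniformly spread over the data's extrema, a 2-tuple supplies
# known (min, max) bounds, and a vector supplies precomputed sample values.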
@testset "sample = $sample" for sample in Any[true, false, (minimum(d),maximum(d)), rand(T, 10^i>512 ? 512 : 10^i)]
d2 = DistributedArrays.sort(d; sample=sample)
a = convert(Array, d)
a2 = convert(Array, d2)
@test length(d) == length(d2)
@test sort(a) == a2
end
d_closeall() # close the temporaries created above
end
check_leaks()
@testset "ddata" begin
d = ddata(;T=Int, init=I->myid())
for p in workers()
@test p == remotecall_fetch(d->d[:L], p, d)
end
@test Int[workers()...] == gather(d)
close(d)
d = ddata(;T=Int, data=workers())
for p in workers()
@test p == remotecall_fetch(d->d[:L], p, d)
end
@test Int[workers()...] == gather(d)
close(d)
d = ddata(;T=Any, init=I->"Hello World!")
for p in workers()
@test "Hello World!" == remotecall_fetch(d->d[:L], p, d)
end
Any["Hello World!" for p in workers()] == gather(d)
close(d)
end
check_leaks()
@testset "rand!" begin
d = dzeros(30, 30)
rand!(d)
close(d)
end
check_leaks()
@testset "fill!" begin
d = dzeros(30, 30)
fill!(d, 3.14)
@test all(x-> x == 3.14, d)
close(d)
end
check_leaks()
d_closeall()
@testset "test for any leaks" begin
sleep(1.0) # allow time for any cleanup to complete
allrefszero = Bool[remotecall_fetch(()-> @lock(DistributedArrays.REFS.lock, isempty(DistributedArrays.REFS.data)), p) for p in procs()]
@test all(allrefszero)
allregistrieszero = Bool[remotecall_fetch(()-> @lock(DistributedArrays.REGISTRY.lock, isempty(DistributedArrays.REGISTRY.data)), p) for p in procs()]
@test all(allregistrieszero)
end
================================================
FILE: test/explicit_imports.jl
================================================
using DistributedArrays, Test
import ExplicitImports
@testset "ExplicitImports" begin
# No implicit imports in DistributedArrays (i.e. no `using MyPkg`)
@test ExplicitImports.check_no_implicit_imports(DistributedArrays) === nothing
# No non-owning imports in DistributedArrays (i.e. no `using LinearAlgebra: map`)
@test ExplicitImports.check_all_explicit_imports_via_owners(DistributedArrays) === nothing
# Limit non-public imports in DistributedArrays (i.e. `using MyPkg: _non_public_internal_func`)
# to a few selected types and functions
@test ExplicitImports.check_all_explicit_imports_are_public(
DistributedArrays;
ignore = (
# Base
:Broadcasted,
:Callable,
(VERSION < v"1.11" ? (:tail,) : ())...,
),
) === nothing
# No stale imports in DistributedArrays (i.e. no `using MyPkg: func` where `func` is not used in DistributedArrays)
@test ExplicitImports.check_no_stale_explicit_imports(DistributedArrays) === nothing
# No non-owning accesses in DistributedArrays (i.e. no `... LinearAlgebra.map(...)`)
@test ExplicitImports.check_all_qualified_accesses_via_owners(DistributedArrays) === nothing
# Limit non-public accesses in DistributedArrays (i.e. `... MyPkg._non_public_internal_func(...)`)
# to a few selected types and methods from Base
@test ExplicitImports.check_all_qualified_accesses_are_public(
DistributedArrays;
ignore = (
# Base.Broadcast
:AbstractArrayStyle,
:DefaultArrayStyle,
:broadcasted,
:throwdm,
# Base
(VERSION < v"1.11" ? (Symbol("@propagate_inbounds"),) : ())...,
:ReshapedArray,
:Slice,
:_all,
:_any,
:_mapreduce,
:check_reducedims,
:checkbounds_indices,
:index_lengths,
:mapreducedim!,
:promote_op,
:reducedim_initarray,
:reindex,
:setindex_shape_check,
:unalias,
# Serialization
:serialize_type,
# Statistics
:_mean,
),
) === nothing
# No self-qualified accesses in DistributedArrays (i.e. no `... DistributedArrays.func(...)`)
@test ExplicitImports.check_no_self_qualified_accesses(DistributedArrays) === nothing
end
================================================
FILE: test/runtests.jl
================================================
using Test
using Distributed
using DistributedArrays
# Disable scalar indexing to avoid falling back on generic methods
# for AbstractArray
DistributedArrays.allowscalar(false)
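# e.g. with scalar indexing disabled, `d[1]` on a DArray is expected to throw
# instead of silently issuing a remote fetch for a single element.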
# add at least 3 worker processes
if nworkers() < 3
n = max(3, min(8, Sys.CPU_THREADS))
addprocs(n; exeflags=`--check-bounds=yes`)
end
@assert nprocs() > 3
@assert nworkers() >= 3
@everywhere using Distributed
@everywhere using DistributedArrays
@everywhere using DistributedArrays.SPMD
@everywhere using Random
@everywhere using LinearAlgebra
@everywhere Random.seed!(1234 + myid())
const MYID = myid()
const OTHERIDS = filter(id-> id != MYID, procs())[rand(1:(nprocs()-1))]
function check_leaks()
nrefs = @lock DistributedArrays.REFS.lock length(DistributedArrays.REFS.data)
if !iszero(nrefs)
sleep(0.1) # allow time for any cleanup to complete and test again
nrefs = @lock DistributedArrays.REFS.lock length(DistributedArrays.REFS.data)
if !iszero(nrefs)
@warn("Probable leak of ", nrefs, " darrays")
end
end
end
include("aqua.jl")
include("explicit_imports.jl")
include("darray.jl")
include("spmd.jl")
================================================
FILE: test/spmd.jl
================================================
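# A minimal usage sketch of the SPMD primitives exercised in this file
# (`spmd`, `barrier`, `bcast`, `gather`); illustrative only:
#
#     @everywhere function hello()
#         barrier()                                  # synchronize all participants
#         v = bcast(myid() == 1 ? 42 : nothing, 1)   # pid 1 broadcasts a value
#         vs = gather(v, 1)                          # pid 1 collects one value per pid
#     end
#     spmd(hello)                                    # run on all procs in SPMD mode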
@everywhere function spmd_test1()
barrier(;tag=:b1)
if myid() == 1
@assert SPMD.recvfrom(2) == "Hello from 2"
println("SPMD: Passed send/recv")
elseif myid() == 2
data = "Hello from 2"
sendto(1, data)
end
stime = rand(1:5)
# println("Sleeping for $stime seconds")
sleep(stime)
barrier(;tag=:b2)
bcast_val = nothing
if myid() == 1
bcast_val = rand(2)
end
bcast_val = bcast(bcast_val, 1)
if myid() == 1
@assert bcast_val == SPMD.recvfrom(2)
println("SPMD: Passed broadcast")
elseif myid() == 2
sendto(1, bcast_val)
end
barrier()
scatter_data = nothing
if myid() == 1
scatter_data = rand(Int8, nprocs())
end
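# scatter splits the root's array into equal contiguous chunks, one per pid;
# with nprocs() elements each pid receives exactly one (pid 2 gets scatter_data[2:2]).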
lp = scatter(scatter_data, 1, tag=1)
if myid() == 1
@assert scatter_data[2:2] == SPMD.recvfrom(2)
println("SPMD: Passed scatter 1")
elseif myid() == 2
sendto(1, lp)
end
scatter_data = nothing
if myid() == 1
scatter_data = rand(Int8, nprocs()*2)
end
lp = scatter(scatter_data, 1, tag=2)
if myid() == 1
@assert scatter_data[3:4] == SPMD.recvfrom(2)
println("SPMD: Passed scatter 2")
elseif myid() == 2
sendto(1, lp)
end
gathered_data = gather(myid(), 1, tag=3)
if myid() == 1
@assert gathered_data == procs()
println("SPMD: Passed gather 1")
end
gathered_data = gather([myid(), myid()], 1, tag=4)
if myid() == 1
@assert gathered_data == [[p,p] for p in procs()]
println("SPMD: Passed gather 2")
end
end
spmd(spmd_test1)
# Test running only on the workers using the spmd function.
# Define the function everywhere.
@everywhere function foo_spmd(d_in, d_out, n)
pids=sort(vec(procs(d_in)))
pididx = findfirst(isequal(myid()), pids)
mylp = localpart(d_in)
localsum = 0
# Have each node exchange data with its neighbors
n_pididx = pididx+1 > length(pids) ? 1 : pididx+1
p_pididx = pididx-1 < 1 ? length(pids) : pididx-1
# println(p_pididx, " p", pids[p_pididx], " ", n_pididx, " p", pids[n_pididx])
# println(mylp)
for i in 1:n
sendto(pids[n_pididx], mylp[2])
sendto(pids[p_pididx], mylp[1])
mylp[2] = SPMD.recvfrom(pids[p_pididx])
mylp[1] = SPMD.recvfrom(pids[n_pididx])
# println(mylp)
barrier(;pids=pids)
localsum = localsum + mylp[1] + mylp[2]
end
# finally store the sum in d_out
d_out[:L] = localsum
end
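# Each localpart starts filled with its owner's pid; after nworkers() ring
# steps every pid value has passed through both endpoints exactly once, so
# each worker's localsum should equal 2*sum(workers()), as tested below.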
# Run foo_spmd on all workers, eight instances at once, each using an implicitly created context.
in_arrays = map(x->DArray(I->fill(myid(), (map(length,I)...,)), (nworkers(), 2), workers(), [nworkers(),1]), 1:8)
out_arrays = map(x->ddata(), 1:8)
@sync for i in 1:8
@async spmd(foo_spmd, in_arrays[i], out_arrays[i], nworkers(); pids=workers())
end
for i in 1:8
@test Any[sum(workers())*2 for i in 1:nworkers()] == gather(out_arrays[i])
end
println("SPMD: Passed testing of spmd function run concurrently")
# Run concurrently with explicitly created contexts.
# Define the function everywhere.
@everywhere function foo_spmd2(d_in, d_out, n)
pids=sort(vec(procs(d_in)))
pididx = findfirst(isequal(myid()), pids)
mylp = localpart(d_in)
# see if we have a value in the local store.
store = context_local_storage()
localsum = get!(store, :LOCALSUM, 0)
# Have each node exchange data with its neighbors
n_pididx = pididx+1 > length(pids) ? 1 : pididx+1
p_pididx = pididx-1 < 1 ? length(pids) : pididx-1
for i in 1:n
sendto(pids[n_pididx], mylp[2])
sendto(pids[p_pididx], mylp[1])
mylp[2] = SPMD.recvfrom(pids[p_pididx])
mylp[1] = SPMD.recvfrom(pids[n_pididx])
barrier(;pids=pids)
localsum = localsum + mylp[1] + mylp[2]
end
# finally store the sum in d_out
d_out[:L] = localsum
store[:LOCALSUM] = localsum
end
in_arrays = map(x->DArray(I->fill(myid(), (map(length,I)...,)), (nworkers(), 2), workers(), [nworkers(),1]), 1:8)
out_arrays = map(x->ddata(), 1:8)
contexts = map(x->context(workers()), 1:8)
@sync for i in 1:8
@async spmd(foo_spmd2, in_arrays[i], out_arrays[i], nworkers(); pids=workers(), context=contexts[i])
end
# Second run will add the value stored in the previous run.
@sync for i in 1:8
@async spmd(foo_spmd2, in_arrays[i], out_arrays[i], nworkers(); pids=workers(), context=contexts[i])
end
for i in 1:8
@test Any[2*sum(workers())*2 for i in 1:nworkers()] == gather(out_arrays[i])
end
# Verify that local stores with the expected per-context values exist.
@everywhere begin
if myid() != 1
local n = 0
@lock DistributedArrays.SPMD.CONTEXTS.lock begin
for (k,v) in DistributedArrays.SPMD.CONTEXTS.data
store = v.store
localsum = store[:LOCALSUM]
if localsum != 2*sum(workers())*2
println("localsum ", localsum, " != $(2*sum(workers())*2)")
error("localsum mismatch")
end
n += 1
end
end
@assert n == 8
end
end
# close the contexts
foreach(close, contexts)
# verify that the localstores have been deleted.
@everywhere begin
@assert @lock DistributedArrays.SPMD.CONTEXTS.lock isempty(DistributedArrays.SPMD.CONTEXTS.data)
end
println("SPMD: Passed spmd function with explicit context run concurrently")