Repository: JuliaParallel/DistributedArrays.jl Branch: master Commit: db355b31aefd Files: 30 Total size: 138.8 KB Directory structure: gitextract_34pth6or/ ├── .github/ │ ├── dependabot.yml │ └── workflows/ │ ├── CI.yml │ ├── CompatHelper.yml │ └── TagBot.yml ├── .gitignore ├── LICENSE.md ├── Project.toml ├── README.md ├── codecov.yml ├── docs/ │ ├── .gitignore │ ├── Project.toml │ ├── make.jl │ └── src/ │ ├── api.md │ └── index.md ├── ext/ │ ├── SparseArraysExt.jl │ └── StatisticsExt.jl ├── src/ │ ├── DistributedArrays.jl │ ├── broadcast.jl │ ├── core.jl │ ├── darray.jl │ ├── linalg.jl │ ├── mapreduce.jl │ ├── serialize.jl │ ├── sort.jl │ └── spmd.jl └── test/ ├── aqua.jl ├── darray.jl ├── explicit_imports.jl ├── runtests.jl └── spmd.jl ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/dependabot.yml ================================================ # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates version: 2 updates: - package-ecosystem: "github-actions" directory: "/" # Location of package manifests schedule: interval: "weekly" ================================================ FILE: .github/workflows/CI.yml ================================================ name: CI on: pull_request: branches: - master push: branches: - master tags: '*' workflow_dispatch: concurrency: # Skip intermediate builds: all builds except for builds on the `master` branch # Cancel intermediate builds: only pull request builds group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref != 'refs/heads/master' || github.run_number }} cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} jobs: test: name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ github.event_name }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: version: - 'min' - 'lts' - '1' - 'pre' os: - ubuntu-latest - windows-latest - macOS-latest steps: - uses: actions/checkout@v6 - uses: julia-actions/setup-julia@v3 with: version: ${{ matrix.version }} - uses: julia-actions/cache@v3 - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 - uses: julia-actions/julia-processcoverage@v1 - uses: codecov/codecov-action@v5 with: files: lcov.info token: ${{ secrets.CODECOV_TOKEN }} fail_ci_if_error: true docs: name: Documentation runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - uses: julia-actions/setup-julia@v3 with: version: '1' - uses: julia-actions/cache@v3 - run: julia --project=docs -e 'import Pkg; Pkg.instantiate()' - run: | julia --project=docs -e ' using Documenter: doctest using DistributedArrays doctest(DistributedArrays)' - run: julia --project=docs docs/make.jl env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} ================================================ FILE: .github/workflows/CompatHelper.yml ================================================ name: CompatHelper on: schedule: - cron: 0 0 * * * workflow_dispatch: jobs: CompatHelper: runs-on: ubuntu-latest steps: - name: "Add the General registry via Git" run: | import Pkg ENV["JULIA_PKG_SERVER"] = "" Pkg.Registry.add("General") shell: julia --color=yes {0} - name: "Install CompatHelper" run: | import Pkg name = "CompatHelper" uuid = "aa819f21-2bde-4658-8897-bab36330d9b7" version = "3" Pkg.add(; name, uuid, version) shell: julia --color=yes {0} - name: "Run CompatHelper" run: | import CompatHelper CompatHelper.main(; 
subdirs = ["", "docs"]) shell: julia --color=yes {0} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }} ================================================ FILE: .github/workflows/TagBot.yml ================================================ name: TagBot on: issue_comment: types: - created workflow_dispatch: jobs: TagBot: if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot' runs-on: ubuntu-latest steps: - uses: JuliaRegistries/TagBot@v1 with: token: ${{ secrets.GITHUB_TOKEN }} ssh: ${{ secrets.DOCUMENTER_KEY }} ================================================ FILE: .gitignore ================================================ Manifest.toml *.jl.cov *.jl.mem .DS_Store .vscode/ ================================================ FILE: LICENSE.md ================================================ The DistributedArrays.jl package is licensed under the MIT "Expat" License: > Copyright (c) 2015: Julia Parallel Contributors > > Permission is hereby granted, free of charge, to any person obtaining > a copy of this software and associated documentation files (the > "Software"), to deal in the Software without restriction, including > without limitation the rights to use, copy, modify, merge, publish, > distribute, sublicense, and/or sell copies of the Software, and to > permit persons to whom the Software is furnished to do so, subject to > the following conditions: > > The above copyright notice and this permission notice shall be > included in all copies or substantial portions of the Software. > > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, > EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF > MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. > IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY > CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, > TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE > SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================
FILE: Project.toml
================================================
name = "DistributedArrays"
uuid = "aaf54ef3-cdf8-58ed-94cc-d582ad619b94"
version = "0.6.9"

[deps]
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Primes = "27ebfcd6-29c5-5fa9-bf4b-fb8fc14df3ae"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"

[weakdeps]
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[extensions]
SparseArraysExt = "SparseArrays"
StatisticsExt = "Statistics"

[compat]
Aqua = "0.8.12"
Distributed = "<0.0.1, 1"
ExplicitImports = "1.13.2"
LinearAlgebra = "<0.0.1, 1"
Primes = "0.4, 0.5"
Random = "<0.0.1, 1"
Serialization = "<0.0.1, 1"
SparseArrays = "<0.0.1, 1"
SpecialFunctions = "0.8, 1, 2"
Statistics = "<0.0.1, 1"
Test = "<0.0.1, 1"
julia = "1.10"

[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
ExplicitImports = "7d51a73a-1435-4ff3-83d9-f097790105c7"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Aqua", "ExplicitImports", "SparseArrays", "SpecialFunctions", "Statistics", "Test"]

================================================
FILE: README.md
================================================
# DistributedArrays

*Distributed arrays for Julia.*

| **Documentation** | **Build Status** |
|:-------------------------------------------------------------------------:|:-------------------------------------------------------------:|
| [![][docs-stable-img]][docs-stable-url] [![][docs-dev-img]][docs-dev-url] | [![][travis-img]][travis-url] [![][codecov-img]][codecov-url] |

## Introduction

`DistributedArrays.jl` uses the stdlib [`Distributed`][distributed-docs] to implement a *Global Array* interface. A `DArray` is distributed across a set of workers. Each worker can read from and write to its local portion of the array, and each worker has read-only access to the portions of the array held by other workers.

## Installation

The package can be installed with the Julia package manager. From the Julia REPL, type `]` to enter the Pkg REPL mode and run:

```
pkg> add DistributedArrays
```

Or, equivalently, via the `Pkg` API:

```julia
julia> import Pkg; Pkg.add("DistributedArrays")
```

## Documentation

- [**STABLE**][docs-stable-url] — **documentation of the most recently tagged version.**
- [**DEVEL**][docs-dev-url] — *documentation of the in-development version.*

## Project Status

The package is tested against Julia 1.10.0 (the oldest supported Julia version), the Julia LTS version, the latest stable release of Julia, and the pre-release version of Julia.

## Questions and Contributions

Usage questions can be posted on the [Julia Discourse forum][discourse-tag-url] under the `Parallel/Distributed` category, or in the #parallel channel of the [Julia Slack](https://julialang.org/community/).

Contributions are very welcome, as are feature requests and suggestions. Please open an [issue][issues-url] if you encounter any problems. In particular, additions to documentation are encouraged!
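## Example

A minimal sketch of typical usage (the worker count and array sizes here are arbitrary):

```julia
using Distributed
addprocs(4)                        # launch four local workers
@everywhere using DistributedArrays

d = dzeros(100, 100)               # a DArray distributed over the workers
fetch(@spawnat procs(d)[1] localindices(d))  # index ranges owned by the first piece
close(d)                           # eagerly release the remote localparts
```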
[contrib-url]: https://juliadocs.github.io/Documenter.jl/latest/man/contributing/ [discourse-tag-url]: https://discourse.julialang.org/c/domain/parallel [docs-dev-img]: https://img.shields.io/badge/docs-dev-blue.svg [docs-dev-url]: https://juliaparallel.github.io/DistributedArrays.jl/dev [docs-stable-img]: https://img.shields.io/badge/docs-stable-blue.svg [docs-stable-url]: https://juliaparallel.github.io/DistributedArrays.jl/stable [travis-img]: https://travis-ci.org/JuliaParallel/DistributedArrays.jl.svg?branch=master [travis-url]: https://travis-ci.org/JuliaParallel/DistributedArrays.jl [codecov-img]: https://codecov.io/gh/JuliaParallel/DistributedArrays.jl/branch/master/graph/badge.svg [codecov-url]: https://codecov.io/gh/JuliaParallel/DistributedArrays.jl [issues-url]: https://github.com/JuliaParallel/DistributedArrays.jl/issues [distributed-docs]: https://docs.julialang.org/en/v1/manual/parallel-computing/#Multi-Core-or-Distributed-Processing-1 ================================================ FILE: codecov.yml ================================================ comment: off ================================================ FILE: docs/.gitignore ================================================ build/ ================================================ FILE: docs/Project.toml ================================================ [deps] DistributedArrays = "aaf54ef3-cdf8-58ed-94cc-d582ad619b94" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" [compat] DistributedArrays = "0.6" Documenter = "1" [sources.DistributedArrays] path = ".." ================================================ FILE: docs/make.jl ================================================ using Documenter, DistributedArrays makedocs( modules = [DistributedArrays], format = Documenter.HTML(), sitename = "DistributedArrays.jl", pages = [ "Introduction" => "index.md" "API" => "api.md" ], doctest = true ) deploydocs( repo = "github.com/JuliaParallel/DistributedArrays.jl.git", ) ================================================ FILE: docs/src/api.md ================================================ # API ```@autodocs Modules = [DistributedArrays] ``` ================================================ FILE: docs/src/index.md ================================================ # DistributedArrays.jl ```@contents ``` Distributed Arrays ------------------ Large computations are often organized around large arrays of data. In these cases, a particularly natural way to obtain parallelism is to distribute arrays among several processes. This combines the memory resources of multiple machines, allowing use of arrays too large to fit on one machine. Each process can read and write to the part of the array it owns and has read-only access to the parts it doesn't own. This provides a ready answer to the question of how a program should be divided among machines. Julia distributed arrays are implemented by the `DArray` type. A `DArray` has an element type and dimensions just like an `Array`. A `DArray` can also use arbitrary array-like types to represent the local chunks that store actual data. The data in a `DArray` is distributed by dividing the index space into some number of blocks in each dimension. Common kinds of arrays can be constructed with functions beginning with `d`: ```julia dzeros(100,100,10) dones(100,100,10) drand(100,100,10) drandn(100,100,10) dfill(x,100,100,10) ``` In the last case, each element will be initialized to the specified value `x`. These functions automatically pick a distribution for you. 
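For example, one can inspect the distribution that was picked automatically (a sketch, assuming `Distributed` is loaded; the reported ranges depend on the number of workers in your session):

```julia
d = drand(100, 100, 10)
procs(d)                                           # workers holding pieces of `d`
[@fetchfrom p localindices(d) for p in procs(d)]   # index ranges owned by each piece
```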
For more control, you can specify which processes to use, and how the data should be distributed: ```julia dzeros((100,100), workers()[1:4], [1,4]) ``` The second argument specifies that the array should be created on the first four workers. When dividing data among a large number of processes, one often sees diminishing returns in performance. Placing `DArray`s on a subset of processes allows multiple `DArray` computations to happen at once, with a higher ratio of work to communication on each process. The third argument specifies a distribution; the nth element of this array specifies how many pieces dimension n should be divided into. In this example the first dimension will not be divided, and the second dimension will be divided into 4 pieces. Therefore each local chunk will be of size `(100,25)`. Note that the product of the distribution array must equal the number of processes. * `distribute(a::Array)` converts a local array to a distributed array. * `localpart(d::DArray)` obtains the locally-stored portion of a `DArray`. * Localparts can be retrieved and set via the indexing syntax too. Indexing via symbols is used for this, specifically symbols `:L`,`:LP`,`:l`,`:lp` which are all equivalent. For example, `d[:L]` returns the localpart of `d` while `d[:L]=v` sets `v` as the localpart of `d`. * `localindices(a::DArray)` gives a tuple of the index ranges owned by the local process. * `convert(Array, a::DArray)` brings all the data to the local process. Indexing a `DArray` (square brackets) with ranges of indices always creates a `SubArray`, not copying any data. Constructing Distributed Arrays ------------------------------- The primitive `DArray` constructor has the following somewhat elaborate signature: ```julia DArray(init, dims[, procs, dist]) ``` `init` is a function that accepts a tuple of index ranges. This function should allocate a local chunk of the distributed array and initialize it for the specified indices. `dims` is the overall size of the distributed array. `procs` optionally specifies a vector of process IDs to use. `dist` is an integer vector specifying how many chunks the distributed array should be divided into in each dimension. The last two arguments are optional, and defaults will be used if they are omitted. As an example, here is how to turn the local array constructor `fill` into a distributed array constructor: ```julia dfill(v, args...) = DArray(I->fill(v, map(length,I)), args...) ``` In this case the `init` function only needs to call `fill` with the dimensions of the local piece it is creating. `DArray`s can also be constructed from multidimensional `Array` comprehensions with the `@DArray` macro syntax. 
This syntax is just sugar for the primitive `DArray` constructor: ```julia julia> [i+j for i = 1:5, j = 1:5] 5x5 Array{Int64,2}: 2 3 4 5 6 3 4 5 6 7 4 5 6 7 8 5 6 7 8 9 6 7 8 9 10 julia> @DArray [i+j for i = 1:5, j = 1:5] 5x5 DistributedArrays.DArray{Int64,2,Array{Int64,2}}: 2 3 4 5 6 3 4 5 6 7 4 5 6 7 8 5 6 7 8 9 6 7 8 9 10 ``` ### Construction from arrays generated on separate processes `DArray`s can also be constructed from arrays that have been constructed on separate processes, as demonstrated below: ```julia ras = [@spawnat p rand(30,30) for p in workers()[1:4]] ras = reshape(ras,(2,2)) D = DArray(ras) ``` An alternative syntax is: ```julia r1 = DistributedArrays.remotecall(() -> rand(10,10), workers()[1]) r2 = DistributedArrays.remotecall(() -> rand(10,10), workers()[2]) r3 = DistributedArrays.remotecall(() -> rand(10,10), workers()[3]) r4 = DistributedArrays.remotecall(() -> rand(10,10), workers()[4]) D = DArray(reshape([r1 r2 r3 r4], (2,2))) ``` The distribution of indices across workers can be checked with ```julia [@fetchfrom p localindices(D) for p in workers()] ``` Distributed Array Operations ---------------------------- At this time, distributed arrays do not have much functionality. Their major utility is allowing communication to be done via array indexing, which is convenient for many problems. As an example, consider implementing the "life" cellular automaton, where each cell in a grid is updated according to its neighboring cells. To compute a chunk of the result of one iteration, each process needs the immediate neighbor cells of its local chunk. The following code accomplishes this: ```julia function life_step(d::DArray) DArray(size(d),procs(d)) do I top = mod(first(I[1])-2,size(d,1))+1 bot = mod( last(I[1]) ,size(d,1))+1 left = mod(first(I[2])-2,size(d,2))+1 right = mod( last(I[2]) ,size(d,2))+1 old = Array{Bool}(undef, length(I[1])+2, length(I[2])+2) old[1 , 1 ] = d[top , left] # left side old[2:end-1, 1 ] = d[I[1], left] old[end , 1 ] = d[bot , left] old[1 , 2:end-1] = d[top , I[2]] old[2:end-1, 2:end-1] = d[I[1], I[2]] # middle old[end , 2:end-1] = d[bot , I[2]] old[1 , end ] = d[top , right] # right side old[2:end-1, end ] = d[I[1], right] old[end , end ] = d[bot , right] life_rule(old) end end ``` As you can see, we use a series of indexing expressions to fetch data into a local array `old`. Note that the `do` block syntax is convenient for passing `init` functions to the `DArray` constructor. Next, the serial function `life_rule` is called to apply the update rules to the data, yielding the needed `DArray` chunk. Nothing about `life_rule` is `DArray`-specific, but we list it here for completeness: ```julia function life_rule(old) m, n = size(old) new = similar(old, m-2, n-2) for j = 2:n-1 for i = 2:m-1 nc = +(old[i-1,j-1], old[i-1,j], old[i-1,j+1], old[i ,j-1], old[i ,j+1], old[i+1,j-1], old[i+1,j], old[i+1,j+1]) new[i-1,j-1] = (nc == 3 || nc == 2 && old[i,j]) end end new end ``` Numerical Results of Distributed Computations --------------------------------------------- Floating point arithmetic is not associative and this comes up when performing distributed computations over `DArray`s. All `DArray` operations are performed over the `localpart` chunks and then aggregated. 
The change in ordering of the operations will change the numeric result, as seen in this simple example:

```julia
julia> addprocs(8);

julia> using DistributedArrays

julia> A = fill(1.1, (100,100));

julia> sum(A)
11000.000000000013

julia> DA = distribute(A);

julia> sum(DA)
11000.000000000127

julia> sum(A) == sum(DA)
false
```

The ultimate ordering of operations depends on how the `Array` is distributed.

Garbage Collection and `DArray`s
--------------------------------

When a `DArray` is constructed (typically on the master process), the returned `DArray` object stores information on how the array is distributed, which processor holds which indices, and so on. When the `DArray` object on the master process is garbage collected, all participating workers are notified and the localparts of the `DArray` are freed on each worker.

Since the size of the `DArray` object itself is small, a problem arises as `gc` on the master faces no memory pressure to collect the `DArray` immediately. This results in a delay of the memory being released on the participating workers.

Therefore, it is highly recommended to explicitly call `close(d::DArray)` as soon as user code has finished working with the distributed array.

It is also important to note that the localparts of a `DArray` are collected from all participating workers when the `DArray` object on the process that created it is collected. It is therefore important to maintain a reference to a `DArray` object on the creating process for as long as it is being computed upon.

`d_closeall()` is another useful function to manage distributed memory. It releases all `DArray`s created from the calling process, including any temporaries created during computation.

Working with distributed non-array data
---------------------------------------

The function `ddata(;T::Type=Any, init::Function=I->nothing, pids=workers(), data::Vector=[])` can be used to create a distributed vector whose localparts need not be `Array`s.

It returns a `DArray{T,1,T}`, i.e., the element type and localtype of the array are the same.

`ddata()` constructs a distributed vector of length `nworkers()` where each localpart can hold any value, initially initialized to `nothing`.

The argument `data`, if supplied, is distributed over the `pids`. `length(data)` must be a multiple of `length(pids)`. If the multiple is 1, this returns a `DArray{T,1,T}` where `T` is `eltype(data)`. If the multiple is greater than 1, it returns a `DArray{T,1,Array{T,1}}`, i.e., it is equivalent to calling `distribute(data)`.

`gather(d::DArray{T,1,T})` returns an `Array{T,1}` consisting of all distributed elements of `d`.

Given a `DArray{T,1,T}` object `d`, `d[:L]` returns the localpart on a worker. `d[i]` returns the localpart on the `i`th worker that `d` is distributed over.

SPMD Mode (an MPI-style mode with MPI-like primitives)
------------------------------------------------------

SPMD, i.e., Single Program Multiple Data mode, is implemented by the submodule `DistributedArrays.SPMD`. In this mode the same function is executed in parallel on all participating nodes. This is a typical style of MPI programs, where the same program is executed on all processors. A basic subset of MPI-like primitives is currently supported. As a programming model it should be familiar to anyone with an MPI background.

The same block of code is executed concurrently on all workers using the `spmd` function.
```julia
# define foo() on all workers
@everywhere function foo(arg1, arg2)
    ....
end

# call foo() everywhere using the `spmd` function
d_in = DArray(.....)
d_out = ddata()
spmd(foo, d_in, d_out; pids=workers()) # executes on all workers
```

`spmd` is defined as `spmd(f, args...; pids=procs(), context=nothing)`.

`args` is one or more arguments to be passed to `f`. `pids` identifies the workers that `f` needs to be run on. `context` identifies a run context, which is explained later.

The following primitives can be used in SPMD mode.

- `sendto(pid, data; tag=nothing)` - sends `data` to `pid`
- `recvfrom(pid; tag=nothing)` - receives data from `pid`
- `recvfrom_any(; tag=nothing)` - receives data from any `pid`
- `barrier(;pids=procs(), tag=nothing)` - all tasks wait and then proceed
- `bcast(data, pid; tag=nothing, pids=procs())` - broadcasts the same data over `pids` from `pid`
- `scatter(x, pid; tag=nothing, pids=procs())` - distributes `x` over `pids` from `pid`
- `gather(x, pid; tag=nothing, pids=procs())` - collects data from `pids` onto worker `pid`

The `tag` keyword should be used to differentiate between consecutive calls of the same type, for example, consecutive `bcast` calls.

`spmd` and related functions are defined in the submodule `DistributedArrays.SPMD`. You will need to import it explicitly, or prefix functions that can only be used in SPMD mode with `SPMD.`, for example, `SPMD.sendto`.

Example
-------

In this toy example each worker exchanges data with its neighbors `n` times.

```julia
using Distributed
using DistributedArrays
addprocs(8)
@everywhere using DistributedArrays
@everywhere using DistributedArrays.SPMD

d_in = DArray(I->fill(myid(), (map(length,I)...,)), (nworkers(), 2), workers(), [nworkers(),1])
d_out = ddata()

# define the function everywhere
@everywhere function foo_spmd(d_in, d_out, n)
    pids = sort(vec(procs(d_in)))
    pididx = findfirst(isequal(myid()), pids)
    mylp = d_in[:L]
    localsum = 0

    # Have each worker exchange data with its neighbors
    n_pididx = pididx+1 > length(pids) ? 1 : pididx+1
    p_pididx = pididx-1 < 1 ? length(pids) : pididx-1

    for i in 1:n
        sendto(pids[n_pididx], mylp[2])
        sendto(pids[p_pididx], mylp[1])

        mylp[2] = recvfrom(pids[p_pididx])
        mylp[1] = recvfrom(pids[n_pididx])

        barrier(; pids=pids)
        localsum = localsum + mylp[1] + mylp[2]
    end

    # finally store the sum in d_out
    d_out[:L] = localsum
end

# run foo_spmd on all workers
spmd(foo_spmd, d_in, d_out, 10; pids=workers())

# print values of d_in and d_out after the run
println(d_in)
println(d_out)
```

SPMD Context
------------

Each SPMD run is implicitly executed in a different context. This allows multiple `spmd` calls to be active at the same time. An SPMD context can be explicitly specified via the keyword argument `context` to `spmd`.

`context(pids=procs())` returns a new SPMD context.

An SPMD context also provides context-local storage, a dict, which can be used to store key-value pairs between `spmd` runs under the same context. `context_local_storage()` returns the dictionary associated with the context.

NOTE: Implicitly defined contexts, i.e., `spmd` calls without a specified `context`, create a context which lives only for the duration of the call. Explicitly created context objects can be released early by calling `close(ctxt::SPMDContext)`. This will release the local storage dictionaries on all participating `pids`. Otherwise, they are released when the context object is garbage-collected on the node that created it.
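As a sketch of how an explicit context can be reused across runs (the `store_state`/`read_state` helpers and the `:state` key below are illustrative, not part of the API):

```julia
@everywhere using DistributedArrays.SPMD

# hypothetical helpers: stash a value in context-local storage, then read it back
@everywhere store_state() = (context_local_storage()[:state] = myid(); nothing)
@everywhere read_state() = println(context_local_storage()[:state])

ctx = context(pids=workers())
spmd(store_state; pids=workers(), context=ctx)  # first run populates the storage
spmd(read_state; pids=workers(), context=ctx)   # second run sees the same storage
close(ctx)  # eagerly release the storage dictionaries on all pids
```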
Nested `spmd` calls
-------------------

As `spmd` executes the specified function on all participating nodes, we need to be careful with nesting `spmd` calls.

An example of an unsafe (wrong) way:

```julia
function foo(.....)
    ......
    spmd(bar, ......)
    ......
end

function bar(....)
    ......
    spmd(baz, ......)
    ......
end

spmd(foo,....)
```

In the above example, `foo`, `bar` and `baz` are all functions wishing to leverage distributed computation. However, they themselves may currently be part of an `spmd` call. A safe way to handle such a scenario is to only drive parallel computation from the master process.

The correct way (only have the driver process initiate `spmd` calls):

```julia
function foo()
    ......
    myid()==1 && spmd(bar, ......)
    ......
end

function bar()
    ......
    myid()==1 && spmd(baz, ......)
    ......
end

spmd(foo,....)
```

This is also true of functions which automatically distribute computation over `DArray`s.

```julia
function foo(d::DArray)
    ......
    myid()==1 && map!(bar, d)
    ......
end
spmd(foo,....)
```

Without the `myid()` check, the `spmd` call to `foo` would execute `map!` from all nodes, which is probably not what we want.

Similarly, `@everywhere` from within an SPMD run should also be driven from the master node only.

================================================
FILE: ext/SparseArraysExt.jl
================================================
module SparseArraysExt

using DistributedArrays: DArray, SubDArray, SubOrDArray, localpart
using DistributedArrays.Distributed: remotecall_fetch
using SparseArrays: SparseArrays, nnz

function SparseArrays.nnz(A::DArray)
    B = asyncmap(A.pids) do p
        remotecall_fetch(nnz∘localpart, p, A)
    end
    return reduce(+, B)
end

# Fix method ambiguities
# TODO: Improve efficiency?
Base.copyto!(dest::SubOrDArray{<:Any,2}, src::SparseArrays.AbstractSparseMatrixCSC) =
    copyto!(dest, Matrix(src))
@static if isdefined(SparseArrays, :CHOLMOD)
    Base.copyto!(dest::SubOrDArray, src::SparseArrays.CHOLMOD.Dense) =
        copyto!(dest, Array(src))
    Base.copyto!(dest::SubOrDArray{T}, src::SparseArrays.CHOLMOD.Dense{T}) where {T<:Union{Float32,Float64,ComplexF32,ComplexF64}} =
        copyto!(dest, Array(src))
    Base.copyto!(dest::SubOrDArray{T,2}, src::SparseArrays.CHOLMOD.Dense{T}) where {T<:Union{Float32,Float64,ComplexF32,ComplexF64}} =
        copyto!(dest, Array(src))
end

# Fix method ambiguities
for T in (:DArray, :SubDArray)
    @eval begin
        Base.:(==)(d1::$T{<:Any,1}, d2::SparseArrays.ReadOnly) = d1 == parent(d2)
        Base.:(==)(d1::SparseArrays.ReadOnly, d2::$T{<:Any,1}) = parent(d1) == d2
    end
end

end

================================================
FILE: ext/StatisticsExt.jl
================================================
module StatisticsExt

using DistributedArrays: DArray
using Statistics: Statistics

Statistics._mean(f, A::DArray, region) = sum(f, A, dims = region) ./ prod((size(A, i) for i in region))

end

================================================
FILE: src/DistributedArrays.jl
================================================
module DistributedArrays

using Base: Callable
using Base.Broadcast: BroadcastStyle, Broadcasted

using Distributed: Distributed, RemoteChannel, Future, myid, nworkers, procs,
                   remotecall, remotecall_fetch, remotecall_wait, worker_id_from_socket, workers
using LinearAlgebra: LinearAlgebra, Adjoint, Diagonal, I, Transpose, adjoint, adjoint!,
                     axpy!, dot, lmul!, mul!, norm, rmul!, transpose, transpose!
using Random: Random, rand!
using Serialization: Serialization, AbstractSerializer, deserialize, serialize
using Primes: factor

# DArray exports
export DArray, SubDArray, SubOrDArray, @DArray
export dzeros, dones, dfill, drand, drandn, distribute, localpart, localindices, ppeval

# non-array distributed data
export ddata, gather

# immediate release of localparts
export d_closeall

include("darray.jl")
include("core.jl")
include("serialize.jl")
include("broadcast.jl")
include("mapreduce.jl")
include("linalg.jl")
include("sort.jl")
include("spmd.jl")

export SPMD

end # module

================================================
FILE: src/broadcast.jl
================================================
###
# Distributed broadcast implementation
##

# We define a custom ArrayStyle here since we need to keep track of
# the fact that it is Distributed and what kind of underlying broadcast behaviour
# we will encounter.
struct DArrayStyle{Style <: Union{Nothing,BroadcastStyle}} <: Broadcast.AbstractArrayStyle{Any} end
DArrayStyle(::S) where {S} = DArrayStyle{S}()
DArrayStyle(::S, ::Val{N}) where {S,N} = DArrayStyle(S(Val(N)))
DArrayStyle(::Val{N}) where N = DArrayStyle{Broadcast.DefaultArrayStyle{N}}()

Broadcast.BroadcastStyle(::Type{<:DArray{<:Any, N, A}}) where {N, A} = DArrayStyle(BroadcastStyle(A), Val(N))

# promotion rules
# TODO: test this
function Broadcast.BroadcastStyle(::DArrayStyle{AStyle}, ::DArrayStyle{BStyle}) where {AStyle, BStyle}
    DArrayStyle(BroadcastStyle(AStyle, BStyle))
end

function Broadcast.broadcasted(::DArrayStyle{Style}, f, args...) where Style
    inner = Broadcast.broadcasted(Style(), f, args...)
    if inner isa Broadcasted
        return Broadcasted{DArrayStyle{Style}}(inner.f, inner.args, inner.axes)
    else # eagerly evaluated
        return inner
    end
end

# # deal with one layer deep lazy arrays
# BroadcastStyle(::Type{<:LinearAlgebra.Transpose{<:Any,T}}) where T <: DArray = BroadcastStyle(T)
# BroadcastStyle(::Type{<:LinearAlgebra.Adjoint{<:Any,T}}) where T <: DArray = BroadcastStyle(T)
# BroadcastStyle(::Type{<:SubArray{<:Any,<:Any,<:T}}) where T <: DArray = BroadcastStyle(T)

# # This Union is a hack. Ideally Base would have a Transpose <: WrappedArray <: AbstractArray
# # and we could define our methods in terms of Union{DArray, WrappedArray{<:Any, <:DArray}}
# const DDestArray = Union{DArray,
#                          LinearAlgebra.Transpose{<:Any,<:DArray},
#                          LinearAlgebra.Adjoint{<:Any,<:DArray},
#                          SubArray{<:Any, <:Any, <:DArray}}
const DDestArray = DArray

# This method is responsible for selecting the output type of a broadcast
function Base.similar(bc::Broadcasted{<:DArrayStyle{Style}}, ::Type{ElType}) where {Style, ElType}
    DArray(map(length, axes(bc))) do I
        # create fake Broadcasted for underlying ArrayStyle
        bc′ = Broadcasted{Style}(identity, (), map(length, I))
        similar(bc′, ElType)
    end
end

##
# Ref https://docs.julialang.org/en/v1/manual/interfaces/#extending-in-place-broadcast-2
#
# We purposefully only specialise `copyto!`; the broadcast implementation defers
# to the underlying BroadcastStyle. We can't assume that `getindex` is fast,
# furthermore we can't assume that the distribution of the DArray across workers
# is equal or that the underlying array type is consistent.
#
# Implementation:
#  - first distribute all arguments
#    - Q: How do we decide on the cuts?
#  - then localise arguments on each node
##
@inline function Base.copyto!(dest::DDestArray, bc::Broadcasted{Nothing})
    axes(dest) == axes(bc) || Broadcast.throwdm(axes(dest), axes(bc))

    # Distribute Broadcasted
    # This will turn local AbstractArrays into DArrays
    dbc = bcdistribute(bc)

    @sync for p in procs(dest)
        @async remotecall_wait(p) do
            # get the indices for the localpart
            lpidx = localpartindex(dest)
            @assert lpidx != 0
            # create a local version of the broadcast, by constructing views
            # Note: creates copies of the argument
            lbc = bclocal(dbc, dest.indices[lpidx])
            copyto!(localpart(dest), lbc)
        end
    end
    return dest
end

# Test
# a = Array
# a .= DArray(x,y)

@inline function Base.copy(bc::Broadcasted{<:DArrayStyle})
    dbc = bcdistribute(bc)
    # TODO: teach DArray about axes since this is wrong for OffsetArrays
    DArray(map(length, axes(bc))) do I
        lbc = bclocal(dbc, I)
        copy(lbc)
    end
end

# _bcview takes the shape of a view and the shape of a broadcasted argument,
# and produces the view over that argument that constitutes part of the broadcast;
# it is in a sense the inverse of _bcs in Base.Broadcast
_bcview(::Tuple{}, ::Tuple{}) = ()
_bcview(::Tuple{}, view::Tuple) = ()
_bcview(shape::Tuple, ::Tuple{}) = (shape[1], _bcview(tail(shape), ())...)
function _bcview(shape::Tuple, view::Tuple)
    return (_bcview1(shape[1], view[1]), _bcview(tail(shape), tail(view))...)
end

# _bcview1 handles the logic for a single dimension
function _bcview1(a, b)
    if a == 1 || a == 1:1
        return 1:1
    elseif first(a) <= first(b) <= last(a) &&
           first(a) <= last(b) <= last(a)   # b must lie within the extent of a
        return b
    else
        throw(DimensionMismatch("broadcast view could not be constructed"))
    end
end

# Distribute broadcast
# TODO: How to decide on cuts
@inline bcdistribute(bc::Broadcasted{Style}) where Style<:Union{Nothing,BroadcastStyle} = Broadcasted{DArrayStyle{Style}}(bc.f, bcdistribute_args(bc.args), bc.axes)
@inline bcdistribute(bc::Broadcasted{Style}) where Style<:DArrayStyle = Broadcasted{Style}(bc.f, bcdistribute_args(bc.args), bc.axes)

# ask BroadcastStyle to decide if argument is in need of being distributed
bcdistribute(x::T) where T = _bcdistribute(BroadcastStyle(T), x)
_bcdistribute(::DArrayStyle, x) = x
# Don't bother distributing singletons
_bcdistribute(::Broadcast.AbstractArrayStyle{0}, x) = x
_bcdistribute(::Broadcast.AbstractArrayStyle, x) = distribute(x)
_bcdistribute(::Any, x) = x

@inline bcdistribute_args(args::Tuple) = (bcdistribute(args[1]), bcdistribute_args(tail(args))...)
bcdistribute_args(args::Tuple{Any}) = (bcdistribute(args[1]),)
bcdistribute_args(args::Tuple{}) = ()

# dropping axes here since recomputing is easier
@inline bclocal(bc::Broadcasted{DArrayStyle{Style}}, idxs) where Style<:Union{Nothing,BroadcastStyle} = Broadcasted{Style}(bc.f, bclocal_args(_bcview(axes(bc), idxs), bc.args))

# bclocal will take a view of the data and then copy it over,
# except when the data is already local
function bclocal(x::DArray{T, N, AT}, idxs) where {T, N, AT}
    bcidxs = _bcview(axes(x), idxs)
    makelocal(x, bcidxs...)
end
bclocal(x, idxs) = x

@inline bclocal_args(idxs, args::Tuple) = (bclocal(args[1], idxs), bclocal_args(idxs, tail(args))...)
bclocal_args(idxs, args::Tuple{Any}) = (bclocal(args[1], idxs),) bclocal_args(idxs, args::Tuple{}) = () ================================================ FILE: src/core.jl ================================================ # Thread-safe registry of DArray references struct DArrayRegistry data::Dict{Tuple{Int,Int}, Any} lock::ReentrantLock DArrayRegistry() = new(Dict{Tuple{Int,Int}, Any}(), ReentrantLock()) end const REGISTRY = DArrayRegistry() function Base.get(r::DArrayRegistry, id::Tuple{Int,Int}, default) @lock r.lock begin return get(r.data, id, default) end end function Base.getindex(r::DArrayRegistry, id::Tuple{Int,Int}) @lock r.lock begin return r.data[id] end end function Base.setindex!(r::DArrayRegistry, val, id::Tuple{Int,Int}) @lock r.lock begin r.data[id] = val end return r end function Base.delete!(r::DArrayRegistry, id::Tuple{Int,Int}) @lock r.lock delete!(r.data, id) return r end # Thread-safe set of IDs of DArrays created on this node struct DArrayRefs data::Set{Tuple{Int,Int}} lock::ReentrantLock DArrayRefs() = new(Set{Tuple{Int,Int}}(), ReentrantLock()) end const REFS = DArrayRefs() function Base.push!(r::DArrayRefs, id::Tuple{Int,Int}) # Ensure id refers to a DArray created on this node if first(id) != myid() throw( ArgumentError( lazy"`DArray` is not created on the current worker: Only `DArray`s created on worker $(myid()) can be stored in this set but the `DArray` was created on worker $(first(id)).")) end @lock r.lock begin return push!(r.data, id) end end function Base.delete!(r::DArrayRefs, id::Tuple{Int,Int}) @lock r.lock delete!(r.data, id) return r end # Global counter to generate a unique ID for each DArray const DID = Threads.Atomic{Int}(1) """ next_did() Increment a global counter and return a tuple of the current worker ID and the incremented value of the counter. This tuple is used as a unique ID for a new `DArray`. """ next_did() = (myid(), Threads.atomic_add!(DID, 1)) release_localpart(id::Tuple{Int,Int}) = (delete!(REGISTRY, id); nothing) function release_allparts(id::Tuple{Int,Int}, pids::Array{Int}) @sync begin released_myid = false for p in pids if p == myid() @async release_localpart(id) released_myid = true else @async remotecall_fetch(release_localpart, p, id) end end if !released_myid @async release_localpart(id) end end return nothing end function close_by_id(id::Tuple{Int,Int}, pids::Array{Int}) release_allparts(id, pids) delete!(REFS, id) nothing end function d_closeall() @lock REFS.lock begin while !isempty(REFS.data) id = pop!(REFS.data) d = d_from_weakref_or_d(id) if d isa DArray finalize(d) end end end return nothing end Base.close(d::DArray) = finalize(d) """ procs(d::DArray) Get the vector of processes storing pieces of DArray `d`. """ Distributed.procs(d::DArray) = d.pids Distributed.procs(d::SubDArray) = procs(parent(d)) """ localpart(A) The identity when input is not distributed """ localpart(A) = A ================================================ FILE: src/darray.jl ================================================ """ DArray(init, dims, [procs, dist]) Construct a distributed array. The parameter `init` is a function that accepts a tuple of index ranges. This function should allocate a local chunk of the distributed array and initialize it for the specified indices. `dims` is the overall size of the distributed array. `procs` optionally specifies a vector of process IDs to use. If unspecified, the array is distributed over all worker processes only. 
Typically, when running in distributed mode, i.e., nprocs() > 1, this would mean that no chunk of the distributed array exists on the process hosting the interactive julia prompt. `dist` is an integer vector specifying how many chunks the distributed array should be divided into in each dimension. For example, the `dfill` function that creates a distributed array and fills it with a value `v` is implemented as: ### Example ```jl dfill(v, args...) = DArray(I->fill(v, map(length,I)), args...) ``` """ mutable struct DArray{T,N,A} <: AbstractArray{T,N} id::Tuple{Int,Int} dims::NTuple{N,Int} pids::Array{Int,N} # pids[i]==p ⇒ processor p has piece i indices::Array{NTuple{N,UnitRange{Int}},N} # indices held by piece i cuts::Vector{Vector{Int}} # cuts[d][i] = first index of chunk i in dimension d localpart::Union{A,Nothing} function DArray{T,N,A}(id::Tuple{Int,Int}, dims::NTuple{N,Int}, pids, indices, cuts, lp) where {T,N,A} # check invariants if dims != map(last, last(indices)) throw(ArgumentError("dimension of DArray (dim) and indices do not match")) end d = d_from_weakref_or_d(id) if d === nothing d = new(id, dims, pids, indices, cuts, lp) end if first(id) == myid() push!(REFS, id) REGISTRY[id] = WeakRef(d) finalizer(d) do d @async close_by_id(d.id, d.pids) end end d end DArray{T,N,A}() where {T,N,A} = new() end unpack_weakref(x) = x unpack_weakref(x::WeakRef) = x.value d_from_weakref_or_d(id::Tuple{Int,Int}) = unpack_weakref(get(REGISTRY, id, nothing)) Base.eltype(::Type{DArray{T}}) where {T} = T empty_localpart(T,N,A) = A(Array{T}(undef, ntuple(zero, N))) const SubDArray{T,N,D<:DArray} = SubArray{T,N,D} const SubOrDArray{T,N} = Union{DArray{T,N}, SubDArray{T,N}} localtype(::Type{DArray{T,N,S}}) where {T,N,S} = S localtype(::Type{SubDArray{T,N,D}}) where {T,N,D} = localtype(D) localtype(A::SubOrDArray) = localtype(typeof(A)) localtype(A::AbstractArray) = typeof(A) Base.hash(d::DArray, h::UInt) = Base.hash(d.id, h) ## core constructors ## function DArray(id::Tuple{Int,Int}, init::I, dims, pids, idxs, cuts) where {I} localtypes = Vector{DataType}(undef,length(pids)) if init isa Function asyncmap!(localtypes, pids) do pid return remotecall_fetch(construct_localparts, pid, init, id, dims, pids, idxs, cuts) end else asyncmap!(localtypes, pids, init) do pid, pid_init # constructing from an array of remote refs. return remotecall_fetch(construct_localparts, pid, pid_init, id, dims, pids, idxs, cuts) end end if !allequal(localtypes) @sync for p in pids @async remotecall_wait(release_localpart, p, id) end throw(ErrorException(lazy"Constructed localparts have different `eltype`: $(localtypes)")) end A = first(localtypes) if myid() in pids return unpack_weakref(REGISTRY[id]) else T = eltype(A) N = length(dims) return DArray{T,N,A}(id, dims, pids, idxs, cuts, empty_localpart(T,N,A)) end end function construct_localparts(init, id, dims, pids, idxs, cuts; T=nothing, A=nothing) localpart = isa(init, Function) ? 
init(idxs[localpartindex(pids)]) : fetch(init) if A == nothing A = typeof(localpart) end if T == nothing T = eltype(A) end N = length(dims) d = DArray{T,N,A}(id, dims, pids, idxs, cuts, localpart) REGISTRY[id] = d A end function ddata(;T::Type=Any, init::Function=I->nothing, pids=workers(), data::Vector=[]) pids=sort(vec(pids)) id = next_did() npids = length(pids) ldata = length(data) idxs, cuts = chunk_idxs([npids], [npids]) if ldata > 0 @assert rem(ldata,npids) == 0 if ldata == npids T = eltype(data) s = DestinationSerializer(pididx->data[pididx], pids) init = I->localpart(s) else # call the standard distribute function return distribute(data) end end @sync for p in pids @async remotecall_wait(construct_localparts, p, init, id, (npids,), pids, idxs, cuts; T=T, A=T) end if myid() in pids return unpack_weakref(REGISTRY[id]) else return DArray{T,1,T}(id, (npids,), pids, idxs, cuts, nothing) end end function gather(d::DArray{T,1,T}) where T pids = procs(d) a = Vector{T}(undef, length(pids)) asyncmap!(a, pids) do p remotecall_fetch(localpart, p, d) end a end function DArray(init, dims, procs, dist) np = prod(dist) procs = reshape(procs[1:np], ntuple(i->dist[i], length(dist))) idxs, cuts = chunk_idxs([dims...], dist) id = next_did() return DArray(id, init, dims, procs, idxs, cuts) end function DArray(init, dims, procs) if isempty(procs) throw(ArgumentError("no processors given")) end return DArray(init, dims, procs, defaultdist(dims, procs)) end DArray(init, dims) = DArray(init, dims, workers()[1:min(nworkers(), maximum(dims))]) # Create a DArray from a collection of references # The refs must have the same layout as the parts distributed. # i.e. # size(refs) must specify the distribution of dimensions across processors # prod(size(refs)) must equal number of parts # FIXME : Empty parts are currently not supported. function DArray(refs) dimdist = size(refs) id = next_did() nsizes = Array{Tuple}(undef, dimdist) asyncmap!(nsizes, refs) do r remotecall_fetch(sz_localpart_ref, r.where, r, id) end nindices = Array{NTuple{length(dimdist),UnitRange{Int}}}(undef, dimdist...) for i in 1:length(nindices) subidx = CartesianIndices(dimdist)[i] nindices[i] = ntuple(length(subidx)) do x idx_in_dim = subidx[x] startidx = 1 for j in 1:(idx_in_dim-1) prevsubidx = ntuple(y -> y == x ? j : subidx[y], length(subidx)) prevsize = nsizes[prevsubidx...] startidx += prevsize[x] end startidx:startidx+(nsizes[i][x])-1 end end lastidxs = hcat([Int[last(idx_in_d)+1 for idx_in_d in idx] for idx in nindices]...) ncuts = Array{Int,1}[pushfirst!(sort(unique(lastidxs[x,:])), 1) for x in 1:length(dimdist)] ndims = tuple([sort(unique(lastidxs[x,:]))[end]-1 for x in 1:length(dimdist)]...) 
DArray(id, refs, ndims, map(r -> r.where, refs), nindices, ncuts)
end

macro DArray(ex0::Expr)
    if ex0.head !== :comprehension
        throw(ArgumentError("invalid @DArray syntax"))
    end
    ex = ex0.args[1]
    if ex.head !== :generator
        throw(ArgumentError("invalid @DArray syntax"))
    end
    ex.args[1] = esc(ex.args[1])
    ndim = length(ex.args) - 1
    ranges = map(r->esc(r.args[2]), ex.args[2:end])
    for d = 1:ndim
        var = ex.args[d+1].args[1]
        ex.args[d+1] = :( $(esc(var)) = ($(ranges[d]))[I[$d]] )
    end
    return :( DArray((I::Tuple{Vararg{UnitRange{Int}}})->($ex0),
                tuple($(map(r->:(length($r)), ranges)...))) )
end

# new DArray similar to an existing one
DArray(init, d::DArray) = DArray(next_did(), init, size(d), procs(d), d.indices, d.cuts)

sz_localpart_ref(ref, id) = size(fetch(ref))

Base.similar(d::DArray, T::Type, dims::Dims) = DArray(I->Array{T}(undef, map(length,I)), dims, procs(d))
Base.similar(d::DArray, T::Type) = similar(d, T, size(d))
Base.similar(d::DArray{T}, dims::Dims) where {T} = similar(d, T, dims)
Base.similar(d::DArray{T}) where {T} = similar(d, T, size(d))

Base.size(d::DArray) = d.dims

chunktype(d::DArray{T,N,A}) where {T,N,A} = A

## chunk index utilities ##

# decide how to divide each dimension
# returns size of chunks array
function defaultdist(dims, pids)
    dims = [dims...]
    chunks = ones(Int, length(dims))
    np = length(pids)
    f = sort!(collect(keys(factor(np))), rev=true)
    k = 1
    while np > 1
        # repeatedly allocate largest factor to largest dim
        if np % f[k] != 0
            k += 1
            if k > length(f)
                break
            end
        end
        fac = f[k]
        (d, dno) = findmax(dims)
        # resolve ties to highest dim
        dno = findlast(isequal(d), dims)
        if dims[dno] >= fac
            dims[dno] = div(dims[dno], fac)
            chunks[dno] *= fac
        end
        np = div(np, fac)
    end
    return chunks
end

# get array of start indices for dividing sz into nc chunks
function defaultdist(sz::Int, nc::Int)
    if sz >= nc
        chunk_size = div(sz,nc)
        remainder = rem(sz,nc)
        grid = zeros(Int64, nc+1)
        for i = 1:(nc+1)
            grid[i] += (i-1)*chunk_size + 1
            if i<= remainder
                grid[i] += i-1
            else
                grid[i] += remainder
            end
        end
        return grid
    else
        return [[1:(sz+1);]; zeros(Int, nc-sz)]
    end
end

# compute indices array for dividing dims into chunks
function chunk_idxs(dims, chunks)
    cuts = map(defaultdist, dims, chunks)
    n = length(dims)
    idxs = Array{NTuple{n,UnitRange{Int}}}(undef, chunks...)
    for cidx in CartesianIndices(tuple(chunks...))
        idxs[cidx.I...] = ntuple(i -> (cuts[i][cidx[i]]:cuts[i][cidx[i] + 1] - 1), n)
    end
    return (idxs, cuts)
end

function localpartindex(pids::Array{Int})
    mi = myid()
    for i = 1:length(pids)
        if pids[i] == mi
            return i
        end
    end
    return 0
end
localpartindex(d::DArray) = localpartindex(procs(d))

"""
    localpart(d::DArray)

Get the local piece of a distributed array.
Returns an empty array if no local part exists on the calling process.

`d[:L]`, `d[:l]`, `d[:LP]`, `d[:lp]` are alternative means to get localparts.
This syntax can also be used for assignment. For example,
`d[:L]=v` will assign `v` to the localpart of `d`.
"""
function localpart(d::DArray{T,N,A}) where {T,N,A}
    lpidx = localpartindex(d)
    if lpidx == 0
        return empty_localpart(T,N,A)::A
    end

    return d.localpart::A
end

localpart(d::DArray, localidx...) = localpart(d)[localidx...]

_localindex(i::Integer, offset) = i - offset
_localindex(i::AbstractRange, offset) = (first(i)-offset):step(i):(last(i)-offset)
_localindex(i::AbstractUnitRange, offset) = (first(i)-offset):(last(i)-offset)

"""
    makelocal(A::DArray, I...)

Equivalent to `Array(view(A, I...))` but optimised for the case that the data is local.
Can return a view into `localpart(A)`.
"""
@inline function makelocal(A::DArray{<:Any, <:Any, AT}, I::Vararg{Any, N}) where {N, AT}
    J = map(i->Base.unalias(A, i), to_indices(A, I))
    J = map(j-> isa(j, Base.Slice) ? j.indices : j, J)
    @boundscheck checkbounds(A, J...)
    lidcs = localindices(A)
    if Base.checkbounds_indices(Bool, lidcs, J)
        # data we want is local
        viewidcs = ntuple(i -> _localindex(J[i], first(lidcs[i]) - 1), ndims(A))
        view(localpart(A), viewidcs...)
    else
        # Make more efficient (?maybe) by allocating new memory
        # only for the remote part
        viewidcs = ntuple(i -> _localindex(J[i], 0), ndims(A))
        arr = similar(AT, map(length, viewidcs)...)
        copyto!(arr, view(A, viewidcs...))
    end
end

# shortcut to set/get localparts of a distributed object
Base.getindex(d::DArray, s::Symbol) = _getindex(d, s)
Base.getindex(d::DArray{<:Any, 1}, s::Symbol) = _getindex(d, s)
function _getindex(d::DArray, s::Symbol)
    @assert s in [:L, :l, :LP, :lp]
    return localpart(d)
end

function Base.setindex!(d::DArray{T,N,A}, new_lp::A, s::Symbol) where {T,N,A}
    @assert s in [:L, :l, :LP, :lp]
    d.localpart = new_lp
    new_lp
end

# fetch localpart of d at pids[i]
Base.fetch(d::DArray{T,N,A}, i) where {T,N,A} = remotecall_fetch(localpart, d.pids[i], d)

"""
    localindices(d)

A tuple describing the indices owned by the local process.
Returns a tuple with empty ranges if no local part exists on the calling process.
"""
function localindices(d::DArray)
    lpidx = localpartindex(d)
    if lpidx == 0
        return ntuple(i -> 1:0, ndims(d))
    end
    return d.indices[lpidx]
end

# Equality
function Base.:(==)(d::DArray{<:Any,<:Any,A}, a::AbstractArray) where A
    if size(d) != size(a)
        return false
    else
        b = asyncmap(procs(d)) do p
            remotecall_fetch(p) do
                localpart(d) == A(a[localindices(d)...])
            end
        end
        return all(b)
    end
end
function Base.:(==)(d::SubDArray, a::AbstractArray)
    cd = copy(d)
    t = cd == a
    finalize(cd)
    return t
end
Base.:(==)(a::AbstractArray, d::DArray) = d == a
Base.:(==)(a::AbstractArray, d::SubDArray) = d == a
Base.:(==)(d1::DArray, d2::DArray) = invoke(==, Tuple{DArray, AbstractArray}, d1, d2)
function Base.:(==)(d1::SubDArray, d2::DArray)
    cd1 = copy(d1)
    t = cd1 == d2
    finalize(cd1)
    return t
end
function Base.:(==)(d1::DArray, d2::SubDArray)
    cd2 = copy(d2)
    t = d1 == cd2
    finalize(cd2)
    return t
end
function Base.:(==)(d1::SubDArray, d2::SubDArray)
    cd1 = copy(d1)
    t = cd1 == d2
    finalize(cd1)
    return t
end

"""
    locate(d::DArray, I::Int...)

Determine the index of `procs(d)` that holds element `I`.
"""
function locate(d::DArray, I::Int...)
    ntuple(ndims(d)) do i
        fi = searchsortedlast(d.cuts[i], I[i])
        if fi >= length(d.cuts[i])
            throw(ArgumentError("element not contained in array"))
        end
        return fi
    end
end

chunk(d::DArray{T,N,A}, pid::Int) where {T,N,A} = remotecall_fetch(localpart, pid, d)::A

## convenience constructors ##

"""
    dzeros(dims, ...)

Construct a distributed array of zeros.
Trailing arguments are the same as those accepted by `DArray`.
"""
dzeros(dims::Dims, args...) = DArray(I->zeros(map(length,I)), dims, args...)
dzeros(::Type{T}, dims::Dims, args...) where {T} = DArray(I->zeros(T,map(length,I)), dims, args...)
dzeros(::Type{T}, d1::Integer, drest::Integer...) where {T} = dzeros(T, convert(Dims, tuple(d1, drest...)))
dzeros(d1::Integer, drest::Integer...) = dzeros(Float64, convert(Dims, tuple(d1, drest...)))
dzeros(d::Dims) = dzeros(Float64, d)

"""
    dones(dims, ...)

Construct a distributed array of ones.
Trailing arguments are the same as those accepted by `DArray`.
"""
dones(dims::Dims, args...) = DArray(I->ones(map(length,I)), dims, args...)
dones(::Type{T}, dims::Dims, args...) where {T} = DArray(I->ones(T,map(length,I)), dims, args...) dones(::Type{T}, d1::Integer, drest::Integer...) where {T} = dones(T, convert(Dims, tuple(d1, drest...))) dones(d1::Integer, drest::Integer...) = dones(Float64, convert(Dims, tuple(d1, drest...))) dones(d::Dims) = dones(Float64, d) """ dfill(x, dims, ...) Construct a distributed array filled with value `x`. Trailing arguments are the same as those accepted by `DArray`. """ dfill(v, dims::Dims, args...) = DArray(I->fill(v, map(length,I)), dims, args...) dfill(v, d1::Integer, drest::Integer...) = dfill(v, convert(Dims, tuple(d1, drest...))) """ drand(dims, ...) Construct a distributed uniform random array. Trailing arguments are the same as those accepted by `DArray`. """ drand(::Type{T}, dims::Dims) where {T} = DArray(I -> rand(T, map(length, I)), dims) drand(X, dims::Dims) = DArray(I -> rand(X, map(length, I)), dims) drand(dims::Dims) = drand(Float64, dims) drand(::Type{T}, d1::Integer, drest::Integer...) where {T} = drand(T, Dims((d1, drest...))) drand(X, d1::Integer, drest::Integer...) = drand(X, Dims((d1, drest...))) drand(d1::Integer, drest::Integer...) = drand(Float64, Dims((d1, drest...))) # With optional process IDs and number of chunks for N in (1, 2) @eval begin drand(::Type{T}, dims::Dims, args::Vararg{Any,$N}) where {T} = DArray(I -> rand(T, map(length, I)), dims, args...) drand(X, dims::Dims, args::Vararg{Any,$N}) = DArray(I -> rand(X, map(length, I)), dims, args...) drand(dims::Dims, args::Vararg{Any,$N}) = drand(Float64, dims, args...) end end # Fix method ambiguities drand(dims::Dims, procs::Tuple{Vararg{Int}}) = drand(Float64, dims, procs) drand(dims::Dims, procs::Tuple{Vararg{Int}}, dist) = drand(Float64, dims, procs, dist) drand(X::Tuple{Vararg{Int}}, dim::Integer) = drand(X, Dims((dim,))) drand(X::Tuple{Vararg{Int}}, d1::Integer, d2::Integer) = drand(X, Dims((d1, d2))) """ drandn(dims, ...) Construct a distributed normal random array. Trailing arguments are the same as those accepted by `DArray`. """ drandn(dims::Dims, args...) = DArray(I->randn(map(length,I)), dims, args...) drandn(d1::Integer, drest::Integer...) = drandn(convert(Dims, tuple(d1, drest...))) ## conversions ## """ distribute(A[; procs, dist]) Convert a local array to distributed. `procs` optionally specifies an array of process IDs to use. (defaults to all workers) `dist` optionally specifies a vector or tuple of the number of partitions in each dimension """ function distribute(A::AbstractArray; procs = workers()[1:min(nworkers(), maximum(size(A)))], dist = defaultdist(size(A), procs)) np = prod(dist) procs_used = procs[1:np] idxs, _ = chunk_idxs([size(A)...], dist) s = verified_destination_serializer(reshape(procs_used, size(idxs)), size(idxs)) do pididx A[idxs[pididx]...] end return DArray(I->localpart(s), size(A), procs_used, dist) end """ distribute(A, DA) Distribute a local array `A` like the distributed array `DA`. """ function distribute(A::AbstractArray, DA::DArray) size(DA) == size(A) || throw(DimensionMismatch("Distributed array has size $(size(DA)) but array has $(size(A))")) s = verified_destination_serializer(procs(DA), size(DA.indices)) do pididx A[DA.indices[pididx]...] end return DArray(I->localpart(s), DA) end DArray{T,N,S}(A::S) where {T,N,S<:AbstractArray} = distribute(convert(AbstractArray{T,N}, A)) function Array{S,N}(d::DArray{T,N}) where {S,T,N} a = Array{S}(undef, size(d)) @sync for (pid, indices) in zip(d.pids, d.indices) if !any(isempty, indices) @async a[indices...] 
= chunk(d, pid) end end return a end function Array{S,N}(s::SubDArray{T,N}) where {S,T,N} I = s.indices d = parent(s) if isa(I,Tuple{Vararg{UnitRange{Int}}}) && S<:T && T<:S && !isempty(s) l = locate(d, map(first, I)...) if isequal(d.indices[l...], I) # SubDArray corresponds to a chunk return chunk(d, d.pids[l...]) end end a = Array{S}(undef, size(s)) copyto!(a, s) end function Base.copyto!(a::Array, s::SubDArray) N = ndims(a) a[[1:size(a,i) for i=1:N]...] = s return a end function DArray(SD::SubArray{T,N}) where {T,N} D = SD.parent DArray(size(SD), procs(D)) do I lindices = Base.reindex(SD.indices, I) convert(Array, D[lindices...]) end end function Base.reshape(A::DArray{T,1,S}, d::Dims) where {T,S<:Array} if prod(d) != length(A) throw(DimensionMismatch("dimensions must be consistent with array size")) end return DArray(d) do I sz = map(length,I) d1offs = first(I[1]) nd = length(I) B = Array{T}(undef, sz) nr = size(B,1) sztail = size(B)[2:end] for i=1:div(length(B),nr) i2 = CartesianIndices(sztail)[i] globalidx = [ I[j][i2[j-1]] for j=2:nd ] a = LinearIndices(d)[d1offs, globalidx...] B[:,i] = Array(A[a:(a+nr-1)]) end B end end ## indexing ## const _allowscalar = Ref(true) allowscalar(flag = true) = (_allowscalar[] = flag) _scalarindexingallowed() = _allowscalar[] || throw(ErrorException("scalar indexing disabled")) getlocalindex(d::DArray, idx...) = localpart(d)[idx...] function getindex_tuple(d::DArray{T,N}, I::NTuple{N,Int}) where {T,N} chidx = locate(d, I...) idxs = d.indices[chidx...] localidx = ntuple(i -> (I[i] - first(idxs[i]) + 1), ndims(d)) pid = d.pids[chidx...] return remotecall_fetch(getlocalindex, pid, d, localidx...)::T end function Base.getindex(d::DArray, i::Int) _scalarindexingallowed() return getindex_tuple(d, Tuple(CartesianIndices(d)[i])) end function Base.getindex(d::DArray{<:Any,N}, i::Vararg{Int,N}) where {N} _scalarindexingallowed() return getindex_tuple(d, i) end Base.getindex(d::DArray) = d[1] Base.getindex(d::SubDArray, I::Int...) = invoke(getindex, Tuple{SubArray{<:Any,N},Vararg{Int,N}} where N, d, I...) Base.getindex(d::SubOrDArray, I::Union{Int,UnitRange{Int},Colon,Vector{Int},StepRange{Int,Int}}...) = view(d, I...) function Base.isassigned(D::DArray, i::Integer...) try getindex_tuple(D, i) true catch e if isa(e, BoundsError) || isa(e, UndefRefError) return false else rethrow(e) end end end Base.copy(d::SubDArray) = copyto!(similar(d), d) Base.copy(d::SubDArray{<:Any,2}) = copyto!(similar(d), d) function Base.copyto!(dest::SubOrDArray, src::AbstractArray) @sync for p in procs(dest) @async remotecall_wait(p) do ldest = localpart(dest) copyto!(ldest, view(src, localindices(dest)...)) end end return dest end function Base.deepcopy(src::DArray) dest = similar(src) @sync for p in procs(src) @async remotecall_wait(p) do dest[:L] = deepcopy(src[:L]) end end return dest end # We also want to optimize setindex! with a SubDArray source, but this is hard # and only works on 0.5. # Similar to Base.indexin, but just create a logical mask. Note that this # must return a logical mask in order to support merging multiple masks # together into one linear index since we need to know how many elements to # skip at the end. In many cases range intersection would be much faster # than generating a logical mask, but that loses the endpoint information. 
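# An illustrative example of the mask produced by the definitions below:
#   indexin_mask(1:5, 2:3) == [false, true, true, false, false]
# i.e. a logical mask over `a` marking the positions whose values occur in `b`.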
indexin_mask(a, b::Number) = a .== b
indexin_mask(a, r::AbstractRange{Int}) = [i in r for i in a]
indexin_mask(a, b::AbstractArray{Int}) = indexin_mask(a, BitSet(b))
indexin_mask(a, b::AbstractArray) = indexin_mask(a, Set(b))
indexin_mask(a, b) = [i in b for i in a]

import Base: tail
# Given a tuple of indices and a tuple of masks, restrict the indices to the
# valid regions. This is, effectively, reversing Base.setindex_shape_check.
# We can't just use indexing into MergedIndices here because getindex is much
# pickier about singleton dimensions than setindex! is.
restrict_indices(::Tuple{}, ::Tuple{}) = ()
function restrict_indices(a::Tuple{Any, Vararg{Any}}, b::Tuple{Any, Vararg{Any}})
    if (length(a[1]) == length(b[1]) == 1) || (length(a[1]) > 1 && length(b[1]) > 1)
        (vec(a[1])[vec(b[1])], restrict_indices(tail(a), tail(b))...)
    elseif length(a[1]) == 1
        (a[1], restrict_indices(tail(a), b)...)
    elseif length(b[1]) == 1 && b[1][1]
        restrict_indices(a, tail(b))
    else
        throw(DimensionMismatch("this should be caught by setindex_shape_check; please submit an issue"))
    end
end
# The final indices are funky - they're allowed to accumulate together.
# An easy (albeit very inefficient) fix for too many masks is to use the
# outer product to merge them. But we can do that lazily with a custom type:
function restrict_indices(a::Tuple{Any}, b::Tuple{Any, Any, Vararg{Any}})
    (vec(a[1])[vec(ProductIndices(b, map(length, b)))],)
end
# But too many indices is much harder; this requires merging the indices
# in `a` before applying the final mask in `b`.
function restrict_indices(a::Tuple{Any, Any, Vararg{Any}}, b::Tuple{Any})
    if length(a[1]) == 1
        (a[1], restrict_indices(tail(a), b)...)
    else
        # When one mask spans multiple indices, we need to merge the indices
        # together. At this point, we can just use indexing to merge them since
        # there's no longer special handling of singleton dimensions
        (view(MergedIndices(a, map(length, a)), b[1]),)
    end
end

struct ProductIndices{I,N} <: AbstractArray{Bool, N}
    indices::I
    sz::NTuple{N,Int}
end
Base.size(P::ProductIndices) = P.sz
# This gets passed to map to avoid breaking propagation of inbounds
Base.@propagate_inbounds propagate_getindex(A, I...) = A[I...]
Base.@propagate_inbounds Base.getindex(P::ProductIndices{J,N}, I::Vararg{Int, N}) where {J,N} =
    Bool((&)(map(propagate_getindex, P.indices, I)...))

struct MergedIndices{I,N} <: AbstractArray{CartesianIndex{N}, N}
    indices::I
    sz::NTuple{N,Int}
end
Base.size(M::MergedIndices) = M.sz
Base.@propagate_inbounds Base.getindex(M::MergedIndices{J,N}, I::Vararg{Int, N}) where {J,N} =
    CartesianIndex(map(propagate_getindex, M.indices, I))
# Additionally, we optimize bounds checking when using MergedIndices as an
# array index since checking, e.g., A[1:500, 1:500] is *way* faster than
# checking an array of 500^2 elements of CartesianIndex{2}. This optimization
# also applies to reshapes of MergedIndices since the outer shape of the
# container doesn't affect the index elements themselves. We can go even
# farther and say that even restricted views of MergedIndices must be valid
# over the entire array. This is overly strict in general, but in this
# use-case all the merged indices must be valid at some point, so it's ok.
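# A small sketch of the semantics of the two lazy index types just defined
# (both internal): ProductIndices is a lazy outer product of masks, and
# MergedIndices is a lazy array of CartesianIndex built from per-dimension indices.
#
#     P = ProductIndices(([true, false], [false, true]), (2, 2))
#     P[1, 2]   # true exactly when each component mask is true at its coordinate
#
#     M = MergedIndices((1:2, 3:4), (2, 2))
#     M[2, 1]   # CartesianIndex(2, 3)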
const ReshapedMergedIndices{T,N,M<:MergedIndices} = Base.ReshapedArray{T,N,M} const SubMergedIndices{T,N,M<:Union{MergedIndices, ReshapedMergedIndices}} = SubArray{T,N,M} const MergedIndicesOrSub = Union{MergedIndices, ReshapedMergedIndices, SubMergedIndices} @inline Base.checkbounds_indices(::Type{Bool}, inds::Tuple{}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = Base.checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...)) @inline Base.checkbounds_indices(::Type{Bool}, inds::Tuple{Any}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = Base.checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...)) @inline Base.checkbounds_indices(::Type{Bool}, inds::Tuple, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = Base.checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...)) # The tricky thing here is that we want to optimize the accesses into the # distributed array, but in doing so, we lose track of which indices in I we # should be using. # # I’ve come to the conclusion that the function is utterly insane. # There are *6* flavors of indices with four different reference points: # 1. Find the indices of each portion of the DArray. # 2. Find the valid subset of indices for the SubArray into that portion. # 3. Find the portion of the `I` indices that should be used when you access the # `K` indices in the subarray. This guy is nasty. It’s totally backwards # from all other arrays, wherein we simply iterate over the source array’s # elements. You need to *both* know which elements in `J` were skipped # (`indexin_mask`) and which dimensions should match up (`restrict_indices`) # 4. If `K` doesn't correspond to an entire chunk, reinterpret `K` in terms of # the local portion of the source array function Base.setindex!(a::Array, s::SubDArray, I::Union{UnitRange{Int},Colon,Vector{Int},StepRange{Int,Int}}...) Inew = Base.to_indices(a, I) Base.setindex_shape_check(s, Base.index_lengths(Inew...)...) d = parent(s) J = Base.to_indices(d, s.indices) @sync for (pid, K_c) in zip(d.pids, d.indices) K = map(intersect, J, K_c) if !any(isempty, K) K_mask = map(indexin_mask, J, K_c) idxs = restrict_indices(Inew, K_mask) if isequal(K, K_c) # whole chunk @async a[idxs...] = chunk(d, pid) else # partial chunk localidxs = map((Kj, K_cj) -> Kj .- (first(K_cj) - 1), K, K_c) @async a[idxs...] 
= remotecall_fetch((d, idxs) -> localpart(d)[idxs...], pid, d, localidxs) end end end return a end function Base.fill!(A::DArray, x) @sync for p in procs(A) @async remotecall_wait((A,x)->fill!(localpart(A), x), p, A, x) end return A end function Random.rand!(A::DArray, ::Type{T}) where T @sync for p in procs(A) @async remotecall_wait((A, T)->rand!(localpart(A), T), p, A, T) end return A end ================================================ FILE: src/linalg.jl ================================================ function Base.copy(Dadj::Adjoint{T,<:DArray{T,2}}) where T D = parent(Dadj) DArray(reverse(size(D)), procs(D)) do I lp = Array{T}(undef, map(length, I)) rp = convert(Array, D[reverse(I)...]) adjoint!(lp, rp) end end function Base.copy(Dtr::Transpose{T,<:DArray{T,2}}) where T D = parent(Dtr) DArray(reverse(size(D)), procs(D)) do I lp = Array{T}(undef, map(length, I)) rp = convert(Array, D[reverse(I)...]) transpose!(lp, rp) end end const DVector{T,A} = DArray{T,1,A} const DMatrix{T,A} = DArray{T,2,A} # Level 1 function LinearAlgebra.axpy!(α, x::DArray, y::DArray) if length(x) != length(y) throw(DimensionMismatch("vectors must have same length")) end @sync for p in procs(y) @async remotecall_wait(p) do axpy!(α, localpart(x), localpart(y)) end end return y end function LinearAlgebra.dot(x::DVector, y::DVector) if length(x) != length(y) throw(DimensionMismatch("")) end results = asyncmap(procs(x)) do p remotecall_fetch((x, y) -> dot(localpart(x), makelocal(y, localindices(x)...)), p, x, y) end return reduce(+, results) end function LinearAlgebra.norm(x::DArray, p::Real = 2) results = asyncmap(procs(x)) do pp remotecall_fetch(() -> norm(localpart(x), p), pp) end return norm(results, p) end function LinearAlgebra.rmul!(A::DArray, x::Number) @sync for p in procs(A) @async remotecall_wait((A,x)->rmul!(localpart(A), x), p, A, x) end return A end # Level 2 function add!(dest, src, scale = one(dest[1])) if length(dest) != length(src) throw(DimensionMismatch("source and destination arrays must have same number of elements")) end if scale == one(scale) @simd for i = eachindex(dest) @inbounds dest[i] += src[i] end else @simd for i = eachindex(dest) @inbounds dest[i] += scale*src[i] end end return dest end function LinearAlgebra.mul!(y::DVector, A::DMatrix, x::AbstractVector, α::Number = 1, β::Number = 0) # error checks if size(A, 2) != length(x) throw(DimensionMismatch("")) end if y.cuts[1] != A.cuts[1] throw(ArgumentError("cuts of output vector must match cuts of first dimension of matrix")) end # Multiply on each tile of A R = Array{Future}(undef, size(A.pids)) for j = 1:size(A.pids, 2) xj = x[A.cuts[2][j]:A.cuts[2][j + 1] - 1] for i = 1:size(A.pids, 1) R[i,j] = remotecall(procs(A)[i,j]) do localpart(A)*convert(localtype(x), xj) end end end # Scale y if necessary if β != one(β) asyncmap(procs(y)) do p remotecall_wait(p) do if !iszero(β) rmul!(localpart(y), β) else fill!(localpart(y), 0) end end end end # Update y @sync for i = 1:size(R, 1) p = y.pids[i] for j = 1:size(R, 2) rij = R[i,j] @async remotecall_wait(() -> add!(localpart(y), fetch(rij), α), p) end end return y end function LinearAlgebra.mul!(y::DVector, adjA::Adjoint{<:Number,<:DMatrix}, x::AbstractVector, α::Number = 1, β::Number = 0) A = parent(adjA) # error checks if size(A, 1) != length(x) throw(DimensionMismatch("")) end if y.cuts[1] != A.cuts[2] throw(ArgumentError("cuts of output vector must match cuts of second dimension of matrix")) end # Multiply on each tile of A R = Array{Future}(undef, reverse(size(A.pids))) for j = 
1:size(A.pids, 1) xj = x[A.cuts[1][j]:A.cuts[1][j + 1] - 1] for i = 1:size(A.pids, 2) R[i,j] = remotecall(() -> localpart(A)'*convert(localtype(x), xj), procs(A)[j,i]) end end # Scale y if necessary if β != one(β) @sync for p in procs(y) @async remotecall_wait(p) do if !iszero(β) rmul!(localpart(y), β) else fill!(localpart(y), 0) end end end end # Update y @sync for i = 1:size(R, 1) p = y.pids[i] for j = 1:size(R, 2) rij = R[i,j] @async remotecall_wait(() -> add!(localpart(y), fetch(rij), α), p) end end return y end function LinearAlgebra.lmul!(D::Diagonal, DA::DMatrix) d = D.diag s = verified_destination_serializer(procs(DA), size(DA.indices)) do pididx d[DA.indices[pididx][1]] end map_localparts!(DA) do lDA lmul!(Diagonal(localpart(s)), lDA) end end function LinearAlgebra.rmul!(DA::DMatrix, D::Diagonal) d = D.diag s = verified_destination_serializer(procs(DA), size(DA.indices)) do pididx d[DA.indices[pididx][2]] end map_localparts!(DA) do lDA rmul!(lDA, Diagonal(localpart(s))) end end # Level 3 function _matmatmul!(C::DMatrix, A::DMatrix, B::AbstractMatrix, α::Number, β::Number, tA) # error checks Ad1, Ad2 = (tA == 'N') ? (1,2) : (2,1) mA, nA = (size(A, Ad1), size(A, Ad2)) mB, nB = size(B) if mB != nA throw(DimensionMismatch("matrix A has dimensions ($mA, $nA), matrix B has dimensions ($mB, $nB)")) end if size(C,1) != mA || size(C,2) != nB throw(DimensionMismatch("result C has dimensions $(size(C)), needs ($mA, $nB)")) end if C.cuts[1] != A.cuts[Ad1] throw(ArgumentError("cuts of the first dimension of the output matrix must match cuts of dimension $Ad1 of the first input matrix")) end # Multiply on each tile of A if tA == 'N' R = Array{Future}(undef, size(procs(A))..., size(procs(C), 2)) else R = Array{Future}(undef, reverse(size(procs(A)))..., size(procs(C), 2)) end for j = 1:size(A.pids, Ad2) for k = 1:size(C.pids, 2) Acuts = A.cuts[Ad2] Ccuts = C.cuts[2] Bjk = B[Acuts[j]:Acuts[j + 1] - 1, Ccuts[k]:Ccuts[k + 1] - 1] for i = 1:size(A.pids, Ad1) p = (tA == 'N') ? 
procs(A)[i,j] : procs(A)[j,i] R[i,j,k] = remotecall(p) do if tA == 'T' return transpose(localpart(A))*convert(localtype(B), Bjk) elseif tA == 'C' return adjoint(localpart(A))*convert(localtype(B), Bjk) else return localpart(A)*convert(localtype(B), Bjk) end end end end end # Scale C if necessary if β != one(β) @sync for p in C.pids if iszero(β) @async remotecall_wait(() -> fill!(localpart(C), 0), p) else @async remotecall_wait(() -> rmul!(localpart(C), β), p) end end end # Update C @sync for i = 1:size(R, 1) for k = 1:size(C.pids, 2) p = C.pids[i,k] for j = 1:size(R, 2) rijk = R[i,j,k] @async remotecall_wait(d -> add!(localpart(d), fetch(rijk), α), p, C) end end end return C end LinearAlgebra.mul!(C::DMatrix, A::DMatrix, B::AbstractMatrix, α::Number = 1, β::Number = 0) = _matmatmul!(C, A, B, α, β, 'N') LinearAlgebra.mul!(C::DMatrix, A::Adjoint{<:Number,<:DMatrix}, B::AbstractMatrix, α::Number = 1, β::Number = 0) = _matmatmul!(C, parent(A), B, α, β, 'C') LinearAlgebra.mul!(C::DMatrix, A::Transpose{<:Number,<:DMatrix}, B::AbstractMatrix, α::Number = 1, β::Number = 0) = _matmatmul!(C, parent(A), B, α, β, 'T') _matmul_op = (t,s) -> t*s + t*s function Base.:*(A::DMatrix, x::AbstractVector) T = Base.promote_op(_matmul_op, eltype(A), eltype(x)) y = DArray(I -> Array{T}(undef, map(length, I)), (size(A, 1),), procs(A)[:,1], (size(procs(A), 1),)) return mul!(y, A, x) end function Base.:*(A::DMatrix, B::AbstractMatrix) T = Base.promote_op(_matmul_op, eltype(A), eltype(B)) C = DArray(I -> Array{T}(undef, map(length, I)), (size(A, 1), size(B, 2)), procs(A)[:,1:min(size(procs(A), 2), size(procs(B), 2))], (size(procs(A), 1), min(size(procs(A), 2), size(procs(B), 2)))) return mul!(C, A, B) end function Base.:*(adjA::Adjoint{<:Any,<:DMatrix}, x::AbstractVector) A = parent(adjA) T = Base.promote_op(_matmul_op, eltype(A), eltype(x)) y = DArray(I -> Array{T}(undef, map(length, I)), (size(A, 2),), procs(A)[1,:], (size(procs(A), 2),)) return mul!(y, adjA, x) end function Base.:*(adjA::Adjoint{<:Any,<:DMatrix}, B::AbstractMatrix) A = parent(adjA) T = Base.promote_op(_matmul_op, eltype(A), eltype(B)) C = DArray(I -> Array{T}(undef, map(length, I)), (size(A, 2), size(B, 2)), procs(A)[1:min(size(procs(A), 1), size(procs(B), 2)),:], (size(procs(A), 2), min(size(procs(A), 1), size(procs(B), 2)))) return mul!(C, adjA, B) end function Base.:*(trA::Transpose{<:Any,<:DMatrix}, x::AbstractVector) A = parent(trA) T = Base.promote_op(_matmul_op, eltype(A), eltype(x)) y = DArray(I -> Array{T}(undef, map(length, I)), (size(A, 2),), procs(A)[1,:], (size(procs(A), 2),)) return mul!(y, trA, x) end function Base.:*(trA::Transpose{<:Any,<:DMatrix}, B::AbstractMatrix) A = parent(trA) T = Base.promote_op(_matmul_op, eltype(A), eltype(B)) C = DArray(I -> Array{T}(undef, map(length, I)), (size(A, 2), size(B, 2)), procs(A)[1:min(size(procs(A), 1), size(procs(B), 2)),:], (size(procs(A), 2), min(size(procs(A), 1), size(procs(B), 2)))) return mul!(C, trA, B) end ================================================ FILE: src/mapreduce.jl ================================================ ## higher-order functions ## Base.map(f, d0::DArray, ds::AbstractArray...) = broadcast(f, d0, ds...) 
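Since `map` on a `DArray` simply lowers to `broadcast`, and the reductions defined below all run localpart-by-localpart before combining the per-worker partial results, the generic Base API works unchanged on distributed data. A minimal usage sketch (the `addprocs` call and the `@everywhere` load are setup assumptions, not part of this file):

```jl
using Distributed; addprocs(2)
@everywhere using DistributedArrays

d = distribute(collect(1:100))
map(x -> x^2, d)       # elementwise via broadcast; result stays distributed
reduce(+, d)           # each worker reduces its localpart, then the partials are reduced
mapreduce(abs2, +, d)  # same two-phase pattern
```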
function Base.map!(f::F, dest::DArray, src::DArray{<:Any,<:Any,A}) where {F,A} @sync for p in procs(dest) @async remotecall_wait(p) do map!(f, localpart(dest), makelocal(src, localindices(dest)...)) end end return dest end # Only defining `reduce(f, ::DArray)` causes method ambiguity issues with # - `reduce(hcat, ::AbstractVector{<:AbstractVecOrMat})` # - `reduce(vcat, ::AbstractVector{<:AbstractVecOrMat})` Base.reduce(f, d::DArray) = _reduce(f, d) Base.reduce(::typeof(hcat), d::DArray{<:AbstractVecOrMat, 1}) = _reduce(hcat, d) Base.reduce(::typeof(vcat), d::DArray{<:AbstractVecOrMat, 1}) = _reduce(vcat, d) function _reduce(f, d::DArray) results = asyncmap(procs(d)) do p remotecall_fetch(p) do return reduce(f, localpart(d)) end end reduce(f, results) end function Base._mapreduce(f, op, ::IndexCartesian, d::DArray) results = asyncmap(procs(d)) do p remotecall_fetch((_f,_op,_d)->mapreduce(_f, _op, localpart(_d)), p, f, op, d) end reduce(op, results) end Base._mapreduce(f, op, ::IndexCartesian, d::SubDArray) = Base._mapreduce(f, op, IndexCartesian(), DArray(d)) # Base.mapreduce(f, opt::Union{typeof(|), typeof(&)}, d::DArray) = _mapreduce(f, opt, d) # Base.mapreduce(f, opt::Function, d::DArray) = _mapreduce(f, opt, d) # Base.mapreduce(f, opt, d::DArray) = _mapreduce(f, opt, d) # mapreducedim function Base.reducedim_initarray(A::DArray, region, v0, ::Type{R}) where {R} # Store reduction on lowest pids pids = A.pids[ntuple(i -> i in region ? (1:1) : (:), ndims(A))...] chunks = similar(pids, Future) asyncmap!(chunks, pids) do p remotecall_wait(() -> Base.reducedim_initarray(localpart(A), region, v0, R), p) end return DArray(chunks) end Base.reducedim_initarray(A::DArray, region, v0::T) where {T} = Base.reducedim_initarray(A, region, v0, T) # Compute mapreducedim of each localpart and store the result in a new DArray function mapreducedim_within(f, op, A::DArray, region) arraysize = [size(A)...] gridsize = [size(A.indices)...] arraysize[[region...]] = gridsize[[region...]] indx = similar(A.indices) for i in CartesianIndices(indx) indx[i] = ntuple(j -> j in region ? (i.I[j]:i.I[j]) : A.indices[i][j], ndims(A)) end cuts = [i in region ? collect(1:arraysize[i] + 1) : A.cuts[i] for i in 1:ndims(A)] return DArray(next_did(), I -> mapreduce(f, op, localpart(A), dims=region), tuple(arraysize...), procs(A), indx, cuts) end # Compute mapreducedim across the processes. This should be done after mapreducedim # has been run on each localpart with mapreducedim_within. Eventually, we might # want to write mapreducedim_between! as a binary reduction. function mapreducedim_between!(f, op, R::DArray, A::DArray, region) @sync for p in procs(R) @async remotecall_wait(p, f, op, R, A, region) do f, op, R, A, region localind = [r for r = localindices(A)] localind[[region...]] = [1:n for n = size(A)[[region...]]] B = convert(Array, A[localind...]) Base.mapreducedim!(f, op, localpart(R), B) end end return R end function Base.mapreducedim!(f, op, R::DArray, A::DArray) lsize = Base.check_reducedims(R,A) if isempty(A) return copy(R) end region = tuple(collect(1:ndims(A))[[size(R)...] .!= [size(A)...]]...) 
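# `region` collects the dimensions along which R is singleton but A is not, i.e.
# the dimensions actually being reduced. The reduction then runs in two phases:
# mapreducedim_within reduces each localpart on its owning worker, and
# mapreducedim_between! combines those partial results into R.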
if isempty(region) return copyto!(R, A) end B = mapreducedim_within(f, op, A, region) return mapreducedim_between!(identity, op, R, B, region) end ## Some special cases function Base._all(f, A::DArray, ::Colon) B = asyncmap(procs(A)) do p remotecall_fetch(p) do all(f, localpart(A)) end end return all(B) end function Base._any(f, A::DArray, ::Colon) B = asyncmap(procs(A)) do p remotecall_fetch(p) do any(f, localpart(A)) end end return any(B) end function Base.count(f, A::DArray) B = asyncmap(procs(A)) do p remotecall_fetch(p) do count(f, localpart(A)) end end return sum(B) end function Base.extrema(d::DArray) r = asyncmap(procs(d)) do p remotecall_fetch(p) do extrema(localpart(d)) end end return reduce((t,s) -> (min(t[1], s[1]), max(t[2], s[2])), r) end # Unary vector functions Base.:(-)(D::DArray) = map(-, D) map_localparts(f::Callable, d::DArray) = DArray(i->f(localpart(d)), d) map_localparts(f::Callable, d1::DArray, d2::DArray) = DArray(d1) do I f(localpart(d1), localpart(d2)) end function map_localparts(f::Callable, DA::DArray, A::Array) s = verified_destination_serializer(procs(DA), size(DA.indices)) do pididx A[DA.indices[pididx]...] end DArray(DA) do I f(localpart(DA), localpart(s)) end end function map_localparts(f::Callable, A::Array, DA::DArray) s = verified_destination_serializer(procs(DA), size(DA.indices)) do pididx A[DA.indices[pididx]...] end DArray(DA) do I f(localpart(s), localpart(DA)) end end function map_localparts!(f::Callable, d::DArray) @sync for p in procs(d) @async remotecall_wait((f,d)->f(localpart(d)), p, f, d) end return d end # Here we assume all the DArrays have # the same size and distribution map_localparts(f::Callable, As::DArray...) = DArray(I->f(map(localpart, As)...), As[1]) function samedist(A::DArray, B::DArray) (size(A) == size(B)) || throw(DimensionMismatch()) if (procs(A) != procs(B)) || (A.cuts != B.cuts) B = DArray(x->B[x...], A) end B end for f in (:+, :-, :div, :mod, :rem, :&, :|, :xor) @eval begin function Base.$f(A::DArray{T}, B::DArray{T}) where T B = samedist(A, B) map_localparts($f, A, B) end Base.$f(A::DArray{T}, B::Array{T}) where {T} = map_localparts($f, A, B) Base.$f(A::Array{T}, B::DArray{T}) where {T} = map_localparts($f, A, B) end end function Base.mapslices(f, D::DArray{T,N,A}; dims) where {T,N,A} if !(dims isa AbstractVector) dims = [dims...] end if !all(t -> t == 1, size(D.indices)[dims]) p = ones(Int, ndims(D)) nondims = filter(t -> !(t in dims), 1:ndims(D)) p[nondims] = defaultdist([size(D)...][[nondims...]], procs(D)) DD = DArray(size(D), procs(D), p) do I return convert(A, D[I...]) end return mapslices(f, DD, dims=dims) end refs = Future[remotecall((x,y,z)->mapslices(x,localpart(y),dims=z), p, f, D, dims) for p in procs(D)] DArray(reshape(refs, size(procs(D)))) end function _ppeval(f, A...; dim = map(ndims, A)) if length(dim) != length(A) throw(ArgumentError("dim argument has wrong length. length(dim) = $(length(dim)) but should be $(length(A))")) end narg = length(A) dimlength = size(A[1], dim[1]) for i = 2:narg if dim[i] > 0 && dimlength != size(A[i], dim[i]) throw(ArgumentError("lengths of broadcast dimensions must be the same. size(A[1], $(dim[1])) = $dimlength but size(A[$i], $(dim[i])) = $(size(A[i], dim[i]))")) end end dims = [] idx = [] args = [] for i = 1:narg push!(dims, ndims(A[i])) push!(idx, Any[Colon() for d in 1:dims[i]]) if dim[i] > 0 idx[i][dim[i]] = 1 push!(args, view(A[i], idx[i]...)) else push!(args, A[i]) end end R1 = f(args...) 
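# f is evaluated once on the first slice to learn the result's size and eltype;
# the output R is then preallocated with one extra trailing dimension indexing
# the slices.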
    ridx = Any[1:size(R1, d) for d in 1:ndims(R1)]
    push!(ridx, 1)
    Rsize = map(last, ridx)
    Rsize[end] = dimlength
    R = Array{eltype(R1)}(undef, Rsize...)

    for i = 1:dimlength
        for j = 1:narg
            if dim[j] > 0
                idx[j][dim[j]] = i
                args[j] = view(A[j], idx[j]...)
            else
                args[j] = A[j]
            end
        end
        ridx[end] = i
        R[ridx...] = f(args...)
    end

    return R
end

"""
    ppeval(f, D...; dim::NTuple)

Evaluates the callable argument `f` on slices of the elements of the `D` tuple.

#### Arguments

`f` can be any callable object that accepts sliced or broadcasted elements of `D`.
The result returned from `f` must be either an array or a scalar.

`D` has any number of elements and the elements can have any type. If an element
of `D` is a distributed array, it is sliced along the dimension specified by `dim`.
If an element of `D` is not distributed, the element is by default broadcasted and
applied on all evaluations of `f`.

`dim` is a tuple of integers specifying the dimensions over which the elements
of `D` are sliced. The length of the tuple must therefore be the same as the
number of arguments in `D`. By default, distributed arrays are sliced along the
last dimension. If the value is less than or equal to zero, the element is
broadcasted to all evaluations of `f`.

#### Result

`ppeval` returns a distributed array of dimension `p+1` where the first `p`
sizes correspond to the sizes of return values of `f`. The last dimension of
the return array from `ppeval` has the same length as the dimension over which
the input arrays are sliced.

#### Examples

```jl
addprocs(Sys.CPU_THREADS)

using DistributedArrays

A = drandn((10, 10, Sys.CPU_THREADS), workers(), [1, 1, Sys.CPU_THREADS])

ppeval(eigvals, A)

ppeval(eigvals, A, randn(10,10)) # broadcasting second argument

B = drandn((10, Sys.CPU_THREADS), workers(), [1, Sys.CPU_THREADS])

ppeval(*, A, B)
```
"""
function ppeval(f, D...; dim::NTuple = map(t -> isa(t, DArray) ? ndims(t) : 0, D))
    # Ensure that the complete DArray is available on the specified dims on all processors
    for i = 1:length(D)
        if isa(D[i], DArray)
            for idxs in D[i].indices
                for d in setdiff(1:ndims(D[i]), dim[i])
                    if length(idxs[d]) != size(D[i], d)
                        throw(DimensionMismatch(string("dimension $d is distributed. ",
                            "ppeval requires dimension $d to be completely available on all processors.")))
                    end
                end
            end
        end
    end

    refs = Future[remotecall((x, y, z) -> _ppeval(x, map(localpart, y)...; dim = z), p, f, D, dim) for p in procs(D[1])]

    # The array of Futures has to be reshaped for the DArray constructor to work correctly.
    # This requires a fetch and the DArray is also fetching so it might be better to modify
    # the DArray constructor.
    sd = [size(D[1].pids)...]
    nd = remotecall_fetch((r)->ndims(fetch(r)), refs[1].where, refs[1])
    DArray(reshape(refs, tuple([sd[1:nd - 1]; sd[end]]...)))
end

================================================
FILE: src/serialize.jl
================================================

function Serialization.serialize(S::AbstractSerializer, d::DArray{T,N,A}) where {T,N,A}
    # Only send the ident for participating workers - we expect the DArray to exist in the
    # remote registry. DO NOT send the localpart.
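    # Two wire formats: a participating worker (or the creating process, d.id[1])
    # receives only (true, id) and resolves the array from its local registry;
    # any other destination receives (false, id) followed by the metadata fields
    # and the storage type, enough to reconstruct a stub without local data.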
    destpid = worker_id_from_socket(S.io)
    Serialization.serialize_type(S, typeof(d))
    if (destpid in d.pids) || (destpid == d.id[1])
        serialize(S, (true, d.id))    # (id_only, id)
    else
        serialize(S, (false, d.id))
        for n in [:dims, :pids, :indices, :cuts]
            serialize(S, getfield(d, n))
        end
        serialize(S, A)
    end
end

function Serialization.deserialize(S::AbstractSerializer, t::Type{DT}) where DT<:DArray
    what = deserialize(S)
    id_only = what[1]
    id = what[2]

    if id_only
        d = d_from_weakref_or_d(id)
        if d === nothing
            # access to fields will throw an error, but at least the deserialization
            # process will not result in worker death
            d = DT()
            d.id = id
        end
        return d
    else
        # We are not a participating worker: deserialize the fields and instantiate locally.
        dims = deserialize(S)
        pids = deserialize(S)
        indices = deserialize(S)
        cuts = deserialize(S)
        A = deserialize(S)
        T = eltype(DT)
        N = length(dims)
        return DT(id, dims, pids, indices, cuts, empty_localpart(T,N,A))
    end
end

# Serialize only those parts of the object as required by the destination worker.
mutable struct DestinationSerializer
    generate::Union{Function,Nothing}   # Function to generate the part to be serialized
    pids::Union{Array,Nothing}          # MUST have the same shape as the distribution
    deser_obj::Any                      # Deserialized part

    DestinationSerializer(f,p,d) = new(f,p,d)
end

DestinationSerializer(f::Function, pids::Array) = DestinationSerializer(f, pids, nothing)

# Constructs a DestinationSerializer after verifying the shape of `pids`.
function verified_destination_serializer(f::Function, pids::Array, verify_size)
    @assert size(pids) == verify_size
    return DestinationSerializer(f, pids)
end

DestinationSerializer(deser_obj::Any) = DestinationSerializer(nothing, nothing, deser_obj)

function Serialization.serialize(S::AbstractSerializer, s::DestinationSerializer)
    pid = worker_id_from_socket(S.io)
    pididx = findfirst(isequal(pid), s.pids)
    @assert pididx !== nothing
    Serialization.serialize_type(S, typeof(s))
    serialize(S, s.generate(pididx))
end

function Serialization.deserialize(S::AbstractSerializer, t::Type{T}) where T<:DestinationSerializer
    lpart = deserialize(S)
    return DestinationSerializer(lpart)
end

function localpart(s::DestinationSerializer)
    if s.deser_obj !== nothing
        return s.deser_obj
    elseif s.generate !== nothing && (myid() in s.pids)
        # Handle the special case where myid() is part of s.pids.
        # In this case serialize/deserialize is not called as the remotecall is executed locally.
        return s.generate(findfirst(isequal(myid()), s.pids))
    else
        throw(ErrorException(string("Invalid state in DestinationSerializer.")))
    end
end

================================================
FILE: src/sort.jl
================================================

# Sorting a DVector using samplesort
function sample_n_setup_ref(d::DVector, sample_size; kwargs...)
    lp = localpart(d)
    llp = length(lp)
    np = length(procs(d))
    sample_size = llp > sample_size ? sample_size : llp
    sorted = sort(lp; kwargs...)
    sample = sorted[collect(1:div(llp,sample_size):llp)]
    ref = RemoteChannel(()->Channel(np+1))   # To collect parts to be sorted locally later.
                                             # First element is the locally sorted vector.
    put!(ref, sorted)
    return (sample, ref)
end

function scatter_n_sort_localparts(d, myidx, refs, boundaries::Array{T}; by = identity, kwargs...) where T
    if d === nothing
        sorted = take!(refs[myidx])   # First entry in the remote channel is the sorted localpart
    else
        sorted = sort(localpart(d); by = by, kwargs...)
    end

    # send respective parts to the correct workers, iterating over the sorted array
    p_sorted = 1
    for (i,r) in enumerate(refs)
        p_till = length(sorted)+1

        # calculate range to send to refs[i]
        ctr = 1
        for x in sorted[p_sorted:end]
            if by(x) > by(boundaries[i+1])
                p_till = p_sorted+ctr-1
                break
            else
                ctr += 1
            end
        end

        if p_till == p_sorted
            @async put!(r, Array{T}(undef,0))
        else
            v = sorted[p_sorted:p_till-1]
            @async put!(r, v)
        end

        p_sorted = p_till
    end

    # wait to receive all of my parts from all other workers
    lp_sorting = T[]
    for _ in refs
        v = take!(refs[myidx])
        append!(lp_sorting, v)
    end

    sorted_ref = RemoteChannel()
    put!(sorted_ref, sort!(lp_sorting; by = by, kwargs...))
    return (sorted_ref, length(lp_sorting))
end

function compute_boundaries(d::DVector{T}; kwargs...) where T
    pids = procs(d)
    np = length(pids)
    sample_sz_on_wrkr = 512

    results = asyncmap(p -> remotecall_fetch(sample_n_setup_ref, p, d, sample_sz_on_wrkr; kwargs...), pids)

    samples = Array{T}(undef,0)
    for x in results
        append!(samples, x[1])
    end
    sort!(samples; kwargs...)
    samples[1] = typemin(T)

    refs = [x[2] for x in results]

    boundaries = samples[[1+(x-1)*div(length(samples), np) for x in 1:np]]
    push!(boundaries, typemax(T))
    return (boundaries, refs)
end

"""
    sort(d::DVector; sample=true, kwargs...) -> DVector

Sorts and returns a new distributed vector. The sorted vector may not have
the same distribution as the original.

Keyword argument `sample` can take values:

- `true`: A sample of max size 512 is first taken from all nodes. This is used to balance the distribution of the sorted array on participating workers. Default is `true`.
- `false`: No sampling is done. Assumes a uniform distribution between min(d) and max(d).
- 2-element tuple of the form `(min, max)`: No sampling is done. Assumes a uniform distribution between the specified min and max values.
- `Array{T}`: The passed array is assumed to be a sample of the distribution and is used to balance the sorted distribution.

Keyword argument `alg` takes the same options as `Base.sort`.
"""
function Base.sort(d::DVector{T}; sample=true, kwargs...) where T
    pids = procs(d)
    np = length(pids)

    # Only `alg`, `by` and `sample` are supported as keyword arguments
    if length(filter(x->!(x in (:alg, :by)), [x[1] for x in kwargs])) > 0
        throw(ArgumentError("Only `alg`, `by` and `sample` are supported as keyword arguments"))
    end

    if sample == true
        boundaries, refs = compute_boundaries(d; kwargs...)
        presorted = true

    elseif sample == false
        # Assume a uniform distribution between min and max values
        minmax = asyncmap(p->remotecall_fetch(d->(minimum(localpart(d)), maximum(localpart(d))), p, d), pids)
        min_d = minimum(T[x[1] for x in minmax])
        max_d = maximum(T[x[2] for x in minmax])
        return sort(d; sample=(min_d,max_d), kwargs...)

    elseif isa(sample, Tuple)
        # Assume a uniform distribution between the min and max values in the tuple
        lb = sample[1]
        ub = sample[2]
        @assert lb <= ub

        s = Array{T}(undef,np)
        part = abs(ub - lb)/np
        (isnan(part) || isinf(part)) && throw(ArgumentError("lower and upper bounds must not be infinities"))

        for n in 1:np
            v = lb + (n-1)*part
            if T <: Integer
                s[n] = round(v)
            else
                s[n] = v
            end
        end
        return sort(d; sample=s, kwargs...)
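        # The (min, max) tuple has been expanded into np evenly spaced boundary
        # values, and the recursive call above re-enters through the Array branch
        # below, so both "uniform" modes share a single code path.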
elseif isa(sample, Array) # Provided array is used as a sample samples = sort(copy(sample)) samples[1] = typemin(T) boundaries = samples[[1+(x-1)*div(length(samples), np) for x in 1:np]] push!(boundaries, typemax(T)) presorted=false refs=[RemoteChannel(p) for p in procs(d)] else throw(ArgumentError("keyword arg `sample` must be Boolean, Tuple(Min,Max) or an actual sample of data : " * string(sample))) end local_sort_results = Array{Tuple}(undef,np) Base.asyncmap!((i,p) -> remotecall_fetch( scatter_n_sort_localparts, p, presorted ? nothing : d, i, refs, boundaries; kwargs...), local_sort_results, 1:np, pids) # Construct a new DArray from the sorted refs. Remove parts with 0-length since # the DArray constructor_from_refs does not yet support it. This implies that # the participating workers for the sorted darray may be different from the original # for highly non-uniform distributions. local_sorted_refs = RemoteChannel[x[1] for x in filter(x->x[2]>0, local_sort_results)] return DArray(local_sorted_refs) end ================================================ FILE: src/spmd.jl ================================================ module SPMD using Distributed: RemoteChannel, myid, procs, remote_do, remotecall_fetch, remotecall_wait using ..DistributedArrays: DistributedArrays, gather, next_did export sendto, recvfrom, recvfrom_any, barrier, bcast, scatter, gather export context_local_storage, context, spmd mutable struct WorkerDataChannel pid::Int rc::Union{RemoteChannel,Nothing} lock::ReentrantLock WorkerDataChannel(pid) = new(pid, nothing, ReentrantLock()) end mutable struct SPMDContext id::Tuple{Int,Int} chnl::Channel store::Dict{Any,Any} pids::Array{Int} function SPMDContext(id::Tuple{Int,Int}, pids::Vector{Int}) ctxt = new(id, Channel(typemax(Int)), Dict{Any,Any}(), pids) if first(id) == myid() finalizer(ctxt) do ctxt for p in ctxt.pids @async remote_do(delete_ctxt_id, p, ctxt.id) end end end return ctxt end end # Every worker is associated with its own RemoteChannel struct WorkerChannelDict data::Dict{Int, WorkerDataChannel} lock::ReentrantLock WorkerChannelDict() = new(Dict{Int, WorkerDataChannel}(), ReentrantLock()) end const WORKERCHANNELS = WorkerChannelDict() Base.get!(f::Function, x::WorkerChannelDict, id::Int) = @lock x.lock get!(f, x.data, id) # mapping between a context id and context object struct SPMDContextDict data::Dict{Tuple{Int,Int}, SPMDContext} lock::ReentrantLock SPMDContextDict() = new(Dict{Tuple{Int,Int}, SPMDContext}(), ReentrantLock()) end const CONTEXTS = SPMDContextDict() Base.delete!(x::SPMDContextDict, id::Tuple{Int,Int}) = @lock x.lock delete!(x.data, id) Base.get!(f::Function, x::SPMDContextDict, id::Tuple{Int,Int}) = @lock x.lock get!(f, x.data, id) function context_local_storage() ctxt = get_ctxt_from_id(task_local_storage(:SPMD_CTXT)) ctxt.store end context(pids::Vector{Int}=procs()) = SPMDContext(next_did(), pids) # Multiple SPMD blocks can be executed concurrently, # each in its own context. Messages are still sent as part of the # same remote channels associated with each worker. They are # read from the remote channel into local channels each associated # with a different run of `spmd`. function get_dc(wc::WorkerDataChannel) lock(wc.lock) try if wc.rc === nothing if wc.pid == myid() myrc = RemoteChannel(()->Channel(typemax(Int))) wc.rc = myrc # start a task to transfer incoming messages into local # channels based on the execution context @async begin while true msg = take!(myrc) ctxt_id = msg[1] # First element of the message tuple is the context id. 
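# Demultiplexing step: every worker owns a single incoming RemoteChannel, but each
# concurrent `spmd` run has its own context. Routing messages by context id keeps
# one run from consuming messages that belong to another.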
ctxt = get_ctxt_from_id(ctxt_id) put!(ctxt.chnl, msg[2:end]) # stripping the context_id end end else wc.rc = remotecall_fetch(()->get_remote_dc(myid()), wc.pid) end end finally unlock(wc.lock) end return wc.rc end function get_ctxt_from_id(ctxt_id::Tuple{Int,Int}) ctxt = get!(CONTEXTS, ctxt_id) do return SPMDContext(ctxt_id, Int[]) end return ctxt end # Since modules may be loaded in any order on the workers, # and workers may be dynamically added, pull in the remote channel # handles when accessed for the first time. function get_remote_dc(pid::Int) wc = get!(WORKERCHANNELS, pid) do return WorkerDataChannel(pid) end return get_dc(wc) end function send_msg(to, typ, data, tag) ctxt_id = task_local_storage(:SPMD_CTXT) @async begin dc = get_remote_dc(to) put!(dc, (ctxt_id, typ, myid(), data, tag)) # println("Sent to ", dc) end end function get_msg(typ_check, from_check=false, tag_check=nothing) ctxt_id = task_local_storage(:SPMD_CTXT) chnl = get_ctxt_from_id(ctxt_id).chnl unexpected_msgs=[] while true typ, from, data, tag = take!(chnl) if (from_check != false && from_check != from) || (typ != typ_check) || (tag != tag_check) push!(unexpected_msgs, (typ, from, data, tag)) # println("Unexpected in get_msg ", unexpected_msgs, " looking for ", typ_check, " ", from_check, " ", tag_check) else # put all the messages we read (but not expected) back to the local channel foreach(x->put!(chnl, x), unexpected_msgs) return (from, data) end end end function sendto(pid::Int, data::Any; tag=nothing) send_msg(pid, :sendto, data, tag) end function recvfrom(pid::Int; tag=nothing) _, data = get_msg(:sendto, pid, tag) return data end function recvfrom_any(; tag=nothing) from, data = get_msg(:sendto, false, tag) return (from,data) end function barrier(;pids=procs(), tag=nothing) # send a message to everyone for p in sort(pids) send_msg(p, :barrier, nothing, tag) end # make sure we recv a message from everyone pending=deepcopy(pids) unexpected_msgs=[] while length(pending) > 0 from, _ = get_msg(:barrier, false, tag) if from in pending filter!(x->x!=from, pending) else # handle case of 2 (or more) consecutive barrier calls. 
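# A fast worker may already have entered the *next* barrier; such messages are
# buffered here and requeued on the local channel once this barrier completes,
# so the following barrier call still sees them.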
push!(unexpected_msgs, (:barrier, from, nothing, tag)) # println("Unexpected ", from) end # length(pending) == 1 && println("Waiting for ", pending) end ctxt_id = task_local_storage(:SPMD_CTXT) chnl = get_ctxt_from_id(ctxt_id).chnl foreach(x->put!(chnl, x), unexpected_msgs) return nothing end function bcast(data::Any, pid::Int; tag=nothing, pids=procs()) if myid() == pid for p in filter(x->x!=pid, sort(pids)) send_msg(p, :bcast, data, tag) end return data else from, data = get_msg(:bcast, pid, tag) return data end end function scatter(x, pid::Int; tag=nothing, pids=procs()) if myid() == pid @assert rem(length(x), length(pids)) == 0 cnt = div(length(x), length(pids)) for (i,p) in enumerate(sort(pids)) p == pid && continue send_msg(p, :scatter, x[cnt*(i-1)+1:cnt*i], tag) end myidx = findfirst(isequal(pid), sort(pids)) return x[cnt*(myidx-1)+1:cnt*myidx] else _, data = get_msg(:scatter, pid, tag) return data end end function DistributedArrays.gather(x, pid::Int; tag=nothing, pids=procs()) if myid() == pid gathered_data = Array{Any}(undef, length(pids)) myidx = findfirst(isequal(pid), sort(pids)) gathered_data[myidx] = x n = length(pids) - 1 while n > 0 from, data_x = get_msg(:gather, false, tag) fromidx = findfirst(isequal(from), sort(pids)) gathered_data[fromidx] = data_x n=n-1 end return gathered_data else send_msg(pid, :gather, x, tag) return x end end function spmd_local(f, ctxt_id, clear_ctxt) task_local_storage(:SPMD_CTXT, ctxt_id) f() clear_ctxt && delete_ctxt_id(ctxt_id) return nothing end function spmd(f, args...; pids=procs(), context=nothing) f_noarg = ()->f(args...) clear_ctxt = false if context == nothing ctxt_id = next_did() clear_ctxt = true # temporary unique context created for this run. # should be cleared at the end of the run. else ctxt_id = context.id end @sync for p in pids @async remotecall_wait(spmd_local, p, f_noarg, ctxt_id, clear_ctxt) end nothing end delete_ctxt_id(ctxt_id::Tuple{Int,Int}) = delete!(CONTEXTS, ctxt_id) Base.close(ctxt::SPMDContext) = finalize(ctxt) end ================================================ FILE: test/aqua.jl ================================================ using DistributedArrays, Test import Aqua @testset "Aqua" begin Aqua.test_all(DistributedArrays; ambiguities = (; broken = true)) end ================================================ FILE: test/darray.jl ================================================ using Test, LinearAlgebra, SpecialFunctions using Statistics: mean using SparseArrays: nnz using Random @everywhere using SparseArrays: sprandn @testset "test distribute and other constructors" begin A = rand(1:100, (100,100)) @testset "test default distribute" begin DA = distribute(A) @test length(procs(DA)) == nworkers() @test sum(DA) == sum(A) close(DA) end @testset "test distribute with procs arguments" begin DA = distribute(A, procs = procs()) @test length(procs(DA)) == nprocs() @test sum(DA) == sum(A) close(DA) end @testset "test distribute with procs and dist arguments" begin DA = distribute(A, procs = [1, 2], dist = [1,2]) @test size(procs(DA)) == (1,2) @test sum(DA) == sum(A) close(DA) end @testset "Create darray with unconventional distribution and distribute like it" begin block = 10 Y = nworkers() * block X = nworkers() * block remote_parts = map(workers()) do wid remotecall(rand, wid, block, Y) end DA1 = DArray(reshape(remote_parts, (length(remote_parts), 1))) A = rand(X, Y) DA2 = distribute(A, DA1) @test size(DA1) == size(DA2) close(DA1) close(DA2) end @testset "Global DArray serialization issue #134" begin global A134 = 
drandn(1) D2 = DArray(I -> DistributedArrays.localpart(A134), A134) @test D2 == A134 close(A134) close(D2) end @testset "empty_localpart should work when only constructor (not conversion is defined)" begin @test DistributedArrays.empty_localpart(Float64,2,LowerTriangular{Float64,Matrix{Float64}}) isa LowerTriangular end @testset "Consistent Uneven Distribution issue #166" begin DA = drand((2+length(OTHERIDS),), [MYID, OTHERIDS]) @test fetch(@spawnat MYID length(localpart(DA)) == 2) @test fetch(@spawnat OTHERIDS length(localpart(DA)) == 1) close(DA) @test DistributedArrays.defaultdist(50,4) == [1,14,27,39,51] end @testset "Inhomogeneous typeof(localpart)" begin block = 10 Y = nworkers() * block X = nworkers() * block @assert nworkers() > 1 @test_throws ErrorException DArray((X, Y)) do I eltype = first(CartesianIndices(I)) == CartesianIndex(1, 1) ? Int64 : Float64 zeros(eltype, map(length, I)) end end end check_leaks() @testset "test DArray equality/copy/deepcopy" begin D = drand((200,200), [MYID, OTHERIDS]) @testset "test isequal(::DArray, ::DArray)" begin DC = copy(D) @test D == DC close(DC) end @testset "test [deep]copy(::DArray) does a copy of each localpart" begin DC = copy(D) @spawnat OTHERIDS localpart(DC)[1] = 0 @test fetch(@spawnat OTHERIDS localpart(D)[1] != 0) DD = deepcopy(D) @spawnat OTHERIDS localpart(DD)[1] = 0 @test fetch(@spawnat OTHERIDS localpart(D)[1] != 0) close(DC) close(DD) end @testset "test copy(::DArray) is shallow" begin DA = @DArray [rand(100) for i=1:10] DC = copy(DA) id = procs(DC)[1] @test DA == DC fetch(@spawnat id localpart(DC)[1] .= -1.0) @test DA == DC @test fetch(@spawnat id all(localpart(DA)[1] .== -1.0)) close(DA) close(DC) end @testset "test deepcopy(::DArray) is not shallow" begin DA = @DArray [rand(100) for i=1:10] DC = deepcopy(DA) id = procs(DC)[1] @test DA == DC fetch(@spawnat id localpart(DC)[1] .= -1.0) @test DA != DC @test fetch(@spawnat id all(localpart(DA)[1] .>= 0.0)) close(DA) close(DC) end close(D) end check_leaks() @testset "test DArray similar" begin D = drand((200,200), [MYID, OTHERIDS]) DS = similar(D,Float16) @testset "test eltype of a similar" begin @test eltype(DS) == Float16 end @testset "test dims of a similar" begin @test size(D) == size(DS) end close(D) close(DS) end check_leaks() @testset "test DArray reshape" begin D = drand((200,200), [MYID, OTHERIDS]) @testset "Test error-throwing in reshape" begin @test_throws DimensionMismatch reshape(D,(100,100)) end DR = reshape(D,(100,400)) @testset "Test reshape" begin @test size(DR) == (100,400) end close(D) end check_leaks() @testset "test @DArray comprehension constructor" begin @testset "test valid use of @DArray" begin D = @DArray [i+j for i=1:10, j=1:10] @test D == [i+j for i=1:10, j=1:10] close(D) end @testset "test invalid use of @DArray" begin #@test_throws ArgumentError eval(:((@DArray [1,2,3,4]))) @test_throws LoadError eval(:((@DArray [1,2,3,4]))) end end check_leaks() @testset "test DArray / Array conversion" begin D = drand((200,200), [MYID, OTHERIDS]) @testset "test construct Array from (Sub)DArray" begin S = Matrix{Float64}(D[1:150, 1:150]) A = Matrix{Float64}(D) @test A[1:150,1:150] == S D2 = DArray{Float64,2,Matrix{Float64}}(A) @test D2 == D DistributedArrays.allowscalar(true) @test fetch(@spawnat MYID localpart(D)[1,1]) == D[1,1] @test fetch(@spawnat OTHERIDS localpart(D)[1,1]) == D[1,101] DistributedArrays.allowscalar(false) close(D2) S2 = Vector{Float64}(D[4, 23:176]) @test A[4, 23:176] == S2 S3 = Vector{Float64}(D[23:176, 197]) @test A[23:176, 197] == S3 S4 = 
zeros(4)
setindex!(S4, D[3:4, 99:100], :)
# FixMe! Hitting the AbstractArray fallback here is extremely unfortunate but vec()
# becomes a ReshapedArray which makes it difficult to hit DArray methods. Unless this
# can be fixed in Base, we might have to add special methods for ReshapedArray{DArray}
DistributedArrays.allowscalar(true)
@test S4 == vec(D[3:4, 99:100])
@test S4 == vec(A[3:4, 99:100])
DistributedArrays.allowscalar(false)

S5 = zeros(2,2)
setindex!(S5, D[1,1:4], :, 1:2)
# FixMe! Hitting the AbstractArray fallback here is extremely unfortunate but vec()
# becomes a ReshapedArray which makes it difficult to hit DArray methods. Unless this
# can be fixed in Base, we might have to add special methods for ReshapedArray{DArray}
DistributedArrays.allowscalar(true)
@test vec(S5) == D[1, 1:4]
@test vec(S5) == A[1, 1:4]
DistributedArrays.allowscalar(false)
end
close(D)
end

check_leaks()

@testset "test copy!" begin
    D1 = dzeros((10,10))
    r1 = remotecall_wait(() -> randn(3,10), workers()[1])
    r2 = remotecall_wait(() -> randn(7,10), workers()[2])
    D2 = DArray(reshape([r1; r2], 2, 1))
    copyto!(D2, D1)
    @test D1 == D2
    close(D1)
    close(D2)
end

check_leaks()

@testset "test DArray reduce" begin
    D = DArray(id->fill(myid(), map(length,id)), (10,10), [MYID, OTHERIDS])
    @testset "test reduce" begin
        @test reduce(+, D) == ((50*MYID) + (50*OTHERIDS))
    end
    @testset "test map / reduce" begin
        D2 = map(x->1, D)
        @test D2 isa DArray
        @test reduce(+, D2) == 100
        close(D2)
    end
    @testset "test map! / reduce" begin
        map!(x->1, D, D)
        @test reduce(+, D) == 100
    end
    close(D)
end

check_leaks()

@testset "test rmul" begin
    A = randn(100,100)
    DA = distribute(A)
    @test rmul!(DA, 2) == rmul!(A, 2)
    close(DA)
end

check_leaks()

@testset "test rmul!(Diagonal, A)" begin
    A = randn(100, 100)
    b = randn(100)
    D = Diagonal(b)
    DA = distribute(A)
    @test lmul!(D, A) == lmul!(D, DA)
    close(DA)
    A = randn(100, 100)
    b = randn(100)
    DA = distribute(A)
    @test rmul!(A, D) == rmul!(DA, D)
    close(DA)
end

check_leaks()

@testset "test mapreduce on DArrays" begin
    for _ = 1:25, f = [x -> Int128(2x), x -> Int128(x^2), x -> Int128(x^2 + 2x - 1)], opt = [+, *]
        A = rand(1:5, rand(2:30))
        DA = distribute(A)
        @test DA isa DArray
        @test mapreduce(f, opt, DA) - mapreduce(f, opt, A) == 0
        close(DA)
    end
end

check_leaks()

@testset "test mapreducedim on DArrays" begin
    D = DArray(I->fill(myid(), map(length,I)), (73,73), [MYID, OTHERIDS])
    D2 = map(x->1, D)
    @test D2 isa DArray
    @test mapreduce(t -> t*t, +, D2, dims=1) == mapreduce(t -> t*t, +, convert(Array, D2), dims=1)
    @test mapreduce(t -> t*t, +, D2, dims=2) == mapreduce(t -> t*t, +, convert(Array, D2), dims=2)
    @test mapreduce(t -> t*t, +, D2, dims=(1,2)) == mapreduce(t -> t*t, +, convert(Array, D2), dims=(1,2))

    # Test non-regularly chunked DArrays
    r1 = DistributedArrays.remotecall(() -> sprandn(3, 10, 0.1), workers()[1])
    r2 = DistributedArrays.remotecall(() -> sprandn(7, 10, 0.1), workers()[2])
    D = DArray(reshape([r1; r2], (2,1)))
    @test Array(sum(D, dims=2)) == sum(Array(D), dims=2)

    # close(D)
    # close(D2)
    d_closeall()   # temp created by the mapreduce above
end

check_leaks()

@testset "test mapreducedim, reducedim on DArrays" begin
    dims = (20,20,20)
    DA = drandn(dims)
    A = convert(Array, DA)
    @testset "dimension $dms" for dms in (1, 2, 3, (1,2), (1,3), (2,3), (1,2,3))
        @test mapreduce(t -> t*t, +, A, dims=dms) ≈ mapreduce(t -> t*t, +, DA, dims=dms)
        @test mapreduce(t -> t*t, +, A, dims=dms, init=1.0) ≈ mapreduce(t -> t*t, +, DA, dims=dms, init=1.0)
        @test reduce(*, A, dims=dms) ≈ reduce(*, DA, dims=dms)
        @test reduce(*, A, dims=dms, init=2.0) ≈ reduce(*,
DA, dims=dms, init=2.0) end close(DA) d_closeall() # temp created by the mapreduce above end check_leaks() @testset "test statistical functions on DArrays" begin dims = (20,20,20) DA = drandn(dims) A = Array(DA) @testset "test $f for dimension $dms" for f in (mean, ), dms in (1, 2, 3, (1,2), (1,3), (2,3), (1,2,3)) # std is pending implementation @test f(DA, dims=dms) ≈ f(A, dims=dms) end close(DA) d_closeall() # temporaries created above end check_leaks() unpack(ex::Base.CapturedException) = unpack(ex.ex) unpack(ex::Distributed.RemoteException) = unpack(ex.captured) unpack(ex::Base.TaskFailedException) = unpack(ex.task.exception) unpack(ex) = ex @testset "test sum on DArrays" begin A = randn(100,100) DA = distribute(A) # sum either throws an ArgumentError, a CompositeException of ArgumentErrors, # or a RemoteException wrapping an ArgumentError try sum(DA, dims=-1) catch err if isa(err, CompositeException) @test !isempty(err.exceptions) for excep in err.exceptions # Unpack the remote exception orig_err = unpack(excep) @test isa(orig_err, ArgumentError) end elseif isa(err, RemoteException) @test err.captured isa CapturedException @test err.captured.ex isa ArgumentError else @test isa(err, ArgumentError) end end try sum(DA, dims=0) catch err if isa(err, CompositeException) @test !isempty(err.exceptions) for excep in err.exceptions # Unpack the remote exception orig_err = unpack(excep) @test isa(orig_err, ArgumentError) end elseif isa(err, RemoteException) @test err.captured isa CapturedException @test err.captured.ex isa ArgumentError else @test isa(err, ArgumentError) end end @test sum(DA) ≈ sum(A) @test sum(DA, dims=1) ≈ sum(A, dims=1) @test sum(DA, dims=2) ≈ sum(A, dims=2) @test sum(DA, dims=3) ≈ sum(A, dims=3) close(DA) d_closeall() # temporaries created above end check_leaks() @testset "test size on DArrays" begin A = randn(100,100) DA = distribute(A) @test_throws BoundsError size(DA, 0) @test size(DA,1) == size(A,1) @test size(DA,2) == size(A,2) @test size(DA,3) == size(A,3) close(DA) end check_leaks() # test length / lastindex @testset "test collections API" begin A = randn(23,23) DA = distribute(A) @testset "test length" begin @test length(DA) == length(A) end @testset "test lastindex" begin @test lastindex(DA) == lastindex(A) end close(DA) end check_leaks() @testset "test max / min / sum" begin a = map(x -> Int(round(rand() * 100)) - 50, Array{Int}(undef,100,1000)) d = distribute(a) @test sum(d) == sum(a) @test maximum(d) == maximum(a) @test minimum(d) == minimum(a) @test maximum(abs, d) == maximum(abs, a) @test minimum(abs, d) == minimum(abs, a) @test sum(abs, d) == sum(abs, a) @test sum(abs2, d) == sum(abs2, a) @test extrema(d) == extrema(a) close(d) end check_leaks() @testset "test all / any" begin a = map(x->Int(round(rand() * 100)) - 50, Array{Int}(undef,100,1000)) a = [true for i in 1:100] d = distribute(a) @test all(d) @test any(d) close(d) a[50] = false d = distribute(a) @test !all(d) @test any(d) close(d) a = [false for i in 1:100] d = distribute(a) @test !all(d) @test !any(d) close(d) d = dones(10,10) @test !all(x-> x>1.0, d) @test all(x-> x>0.0, d) close(d) a = ones(10,10) a[10] = 2.0 d = distribute(a) @test any(x-> x == 1.0, d) @test any(x-> x == 2.0, d) @test !any(x-> x == 3.0, d) close(d) end check_leaks() @testset "test count" begin a = ones(10,10) a[10] = 2.0 d = distribute(a) @test count(x-> x == 2.0, d) == 1 @test count(x-> x == 1.0, d) == 99 @test count(x-> x == 0.0, d) == 0 close(d) end check_leaks() @testset "test prod" begin a = fill(2, 10); d = distribute(a); 
@test prod(d) == 2^10 close(d) end check_leaks() @testset "test zeros" begin @testset "1D dzeros default element type" begin A = dzeros(10) @test A == zeros(10) @test eltype(A) == Float64 @test size(A) == (10,) close(A) end @testset "1D dzeros with specified element type" begin A = dzeros(Int, 10) @test A == zeros(10) @test eltype(A) == Int @test size(A) == (10,) close(A) end @testset "2D dzeros default element type, Dims constructor" begin A = dzeros((10,10)) @test A == zeros((10,10)) @test eltype(A) == Float64 @test size(A) == (10,10) close(A) end @testset "2D dzeros specified element type, Dims constructor" begin A = dzeros(Int, (10,10)) @test A == zeros(Int, (10,10)) @test eltype(A) == Int @test size(A) == (10,10) close(A) end @testset "2D dzeros, default element type" begin A = dzeros(10,10) @test A == zeros(10,10) @test eltype(A) == Float64 @test size(A) == (10,10) close(A) end @testset "2D dzeros, specified element type" begin A = dzeros(Int, 10, 10) @test A == zeros(Int, 10, 10) @test eltype(A) == Int @test size(A) == (10,10) close(A) end end check_leaks() @testset "test dones" begin @testset "1D dones default element type" begin A = dones(10) @test A == ones(10) @test eltype(A) == Float64 @test size(A) == (10,) close(A) end @testset "1D dones with specified element type" begin A = dones(Int, 10) @test eltype(A) == Int @test size(A) == (10,) close(A) end @testset "2D dones default element type, Dims constructor" begin A = dones((10,10)) @test A == ones((10,10)) @test eltype(A) == Float64 @test size(A) == (10,10) close(A) end @testset "2D dones specified element type, Dims constructor" begin A = dones(Int, (10,10)) @test A == ones(Int, (10,10)) @test eltype(A) == Int @test size(A) == (10,10) close(A) end @testset "2D dones, default element type" begin A = dones(10,10) @test A == ones(10,10) @test eltype(A) == Float64 @test size(A) == (10,10) close(A) end @testset "2D dones, specified element type" begin A = dones(Int, 10, 10) @test A == ones(Int, 10, 10) @test eltype(A) == Int @test size(A) == (10,10) close(A) end end check_leaks() @testset "test drand" begin @testset "1D drand" begin A = drand(100) @test eltype(A) == Float64 @test size(A) == (100,) @test all(x-> x >= 0.0 && x <= 1.0, A) close(A) end @testset "1D drand, specified element type" begin A = drand(Int, 100) @test eltype(A) == Int @test size(A) == (100,) close(A) end @testset "1D drand, UnitRange" begin A = drand(1:10, 100) @test eltype(A) == Int @test size(A) == (100,) close(A) end @testset "1D drand, Array" begin A = drand([-1,0,1], 100) @test eltype(A) == Int @test size(A) == (100,) close(A) end @testset "2D drand, Dims constructor" begin A = drand((50,50)) @test eltype(A) == Float64 @test size(A) == (50,50) @test all(x-> x >= 0.0 && x <= 1.0, A) close(A) end @testset "2D drand" begin A = drand(100,100) @test eltype(A) == Float64 @test size(A) == (100,100) @test all(x-> x >= 0.0 && x <= 1.0, A) close(A) end @testset "2D drand, Dims constructor, specified element type" begin A = drand(Int, (100,100)) @test eltype(A) == Int @test size(A) == (100,100) close(A) end @testset "2D drand, specified element type" begin A = drand(Int, 100, 100) @test eltype(A) == Int @test size(A) == (100,100) close(A) end end check_leaks() @testset "test randn" begin @testset "1D drandn" begin A = drandn(100) @test eltype(A) == Float64 @test size(A) == (100,) close(A) end @testset "2D drandn, Dims constructor" begin A = drandn((50,50)) @test eltype(A) == Float64 @test size(A) == (50,50) close(A) end @testset "2D drandn" begin A = 
drandn(100,100) @test eltype(A) == Float64 @test size(A) == (100,100) close(A) end end check_leaks() @testset "test transpose/adjoint" begin @testset "test transpose real" begin A = drand(Float64, 100, 200) @test copy(transpose(A)) == transpose(Array(A)) close(A) end @testset "test transpose complex" begin A = drand(ComplexF64, 200, 100) @test copy(transpose(A)) == transpose(Array(A)) close(A) end @testset "test adjoint real" begin A = drand(Float64, 200, 100) @test copy(adjoint(A)) == adjoint(Array(A)) close(A) end @testset "test adjoint complex" begin A = drand(ComplexF64, 100, 200) @test copy(adjoint(A)) == adjoint(Array(A)) close(A) end d_closeall() # close the temporaries created above end check_leaks() @testset "makelocal" begin A = randn(5*nprocs(), 5*nprocs()) dA = distribute(A, procs=procs()) for i in 1:size(dA, 2) a = DistributedArrays.makelocal(dA, :, i) @test all(Array(view(dA, :, i)) .== a) @test all( view( A, :, i) .== a) end for i in 1:size(dA, 1) a = DistributedArrays.makelocal(dA, i, :) @test all(Array(view(dA, i:i, :)) .== a) @test all( view( A, i:i, :) .== a) end a = DistributedArrays.makelocal(dA, 1:5, 1:5) @test all(Array(view(dA, 1:5, 1:5)) .== a) @test all( view( A, 1:5, 1:5) .== a) close(dA) end @testset "test convert from subdarray" begin a = drand(20, 20); s = view(a, 1:5, 5:8) @test isa(s, SubDArray) @test s == DArray(s) s = view(a, 6:5, 5:8) @test isa(s, SubDArray) @test s == DArray(s) close(a) d_closeall() # close the temporaries created above end check_leaks() @testset "test scalar math" begin a = drand(20, 20); b = convert(Array, a) @testset "$f" for f in (-, abs, abs2, acos, acosd, acot, acotd, acsch, angle, asech, asin, asind, asinh, atan, atand, atanh, big, cbrt, ceil, cis, complex, conj, cos, cosc, cosd, cosh, cospi, cot, cotd, coth, csc, cscd, csch, dawson, deg2rad, digamma, erf, erfc, erfcinv, erfcx, erfi, erfinv, exp, exp10, exp2, expm1, exponent, float, floor, gamma, imag, invdigamma, isfinite, isinf, isnan, loggamma, log, log10, log1p, log2, rad2deg, real, sec, secd, sech, sign, sin, sinc, sind, sinh, sinpi, sqrt, tan, tand, tanh, trigamma) @test f.(a) == f.(b) end a = a .+ 1 b = b .+ 1 @testset "$f" for f in (asec, asecd, acosh, acsc, acscd, acoth) @test f.(a) == f.(b) end close(a) d_closeall() # close the temporaries created above end check_leaks() @testset "test mapslices" begin A = randn(5,5,5) D = distribute(A, procs = workers(), dist = [1, 1, min(nworkers(), 5)]) @test mapslices(svdvals, D, dims=(1,2)) ≈ mapslices(svdvals, A, dims=(1,2)) @test mapslices(svdvals, D, dims=(1,3)) ≈ mapslices(svdvals, A, dims=(1,3)) @test mapslices(svdvals, D, dims=(2,3)) ≈ mapslices(svdvals, A, dims=(2,3)) @test mapslices(sort, D, dims=(1,)) ≈ mapslices(sort, A, dims=(1,)) @test mapslices(sort, D, dims=(2,)) ≈ mapslices(sort, A, dims=(2,)) @test mapslices(sort, D, dims=(3,)) ≈ mapslices(sort, A, dims=(3,)) # issue #3613 B = mapslices(sum, dones(Float64, (2,3,4), workers(), [1,1,min(nworkers(),4)]), dims=[1,2]) @test size(B) == (1,1,4) @test all(B.==6) # issue #5141 C1 = mapslices(x-> maximum(-x), D, dims=[]) @test C1 == -D # issue #5177 c = dones(Float64, (2,3,4,5), workers(), [1,1,1,min(nworkers(),5)]) m1 = mapslices(x-> ones(2,3), c, dims=[1,2]) m2 = mapslices(x-> ones(2,4), c, dims=[1,3]) m3 = mapslices(x-> ones(3,4), c, dims=[2,3]) @test size(m1) == size(m2) == size(m3) == size(c) n1 = mapslices(x-> ones(6), c, dims=[1,2]) n2 = mapslices(x-> ones(6), c, dims=[1,3]) n3 = mapslices(x-> ones(6), c, dims=[2,3]) n1a = mapslices(x-> ones(1,6), c, dims=[1,2]) n2a = 
@testset "test mapslices" begin
    A = randn(5,5,5)
    D = distribute(A, procs = workers(), dist = [1, 1, min(nworkers(), 5)])
    @test mapslices(svdvals, D, dims=(1,2)) ≈ mapslices(svdvals, A, dims=(1,2))
    @test mapslices(svdvals, D, dims=(1,3)) ≈ mapslices(svdvals, A, dims=(1,3))
    @test mapslices(svdvals, D, dims=(2,3)) ≈ mapslices(svdvals, A, dims=(2,3))
    @test mapslices(sort, D, dims=(1,)) ≈ mapslices(sort, A, dims=(1,))
    @test mapslices(sort, D, dims=(2,)) ≈ mapslices(sort, A, dims=(2,))
    @test mapslices(sort, D, dims=(3,)) ≈ mapslices(sort, A, dims=(3,))

    # issue #3613
    B = mapslices(sum, dones(Float64, (2,3,4), workers(), [1,1,min(nworkers(),4)]), dims=[1,2])
    @test size(B) == (1,1,4)
    @test all(B .== 6)

    # issue #5141
    C1 = mapslices(x -> maximum(-x), D, dims=[])
    @test C1 == -D

    # issue #5177
    c = dones(Float64, (2,3,4,5), workers(), [1,1,1,min(nworkers(),5)])
    m1 = mapslices(x -> ones(2,3), c, dims=[1,2])
    m2 = mapslices(x -> ones(2,4), c, dims=[1,3])
    m3 = mapslices(x -> ones(3,4), c, dims=[2,3])
    @test size(m1) == size(m2) == size(m3) == size(c)

    n1 = mapslices(x -> ones(6), c, dims=[1,2])
    n2 = mapslices(x -> ones(6), c, dims=[1,3])
    n3 = mapslices(x -> ones(6), c, dims=[2,3])
    n1a = mapslices(x -> ones(1,6), c, dims=[1,2])
    n2a = mapslices(x -> ones(1,6), c, dims=[1,3])
    n3a = mapslices(x -> ones(1,6), c, dims=[2,3])
    @test (size(n1a) == (1,6,4,5) && size(n2a) == (1,3,6,5) && size(n3a) == (2,1,6,5))
    @test (size(n1) == (6,1,4,5) && size(n2) == (6,3,1,5) && size(n3) == (2,6,1,5))
    close(D)
    close(c)
    d_closeall()  # close the temporaries created above
end

check_leaks()

@testset "test scalar ops" begin
    a = drand(20,20)
    b = convert(Array, a)
    c = drand(20,20)
    d = convert(Array, c)
    @testset "$f" for f in (:+, :-, :*, :/, :%)
        x = rand()
        @test @eval ($f).($a, $x) == ($f).($b, $x)
        @test @eval ($f).($x, $a) == ($f).($x, $b)
        @test @eval ($f).($a, $c) == ($f).($b, $d)
    end
    close(a)
    close(c)

    a = dones(Int, 20, 20)
    b = convert(Array, a)
    @testset "$f" for f in (:<<, :>>)
        @test @eval ($f).($a, 2) == ($f).($b, 2)
        @test @eval ($f).(2, $a) == ($f).(2, $b)
        @test @eval ($f).($a, $a) == ($f).($b, $b)
    end
    @testset "$f" for f in (:rem,)
        x = rand()
        @test @eval ($f).($a, $x) == ($f).($b, $x)
    end
    close(a)
    close(c)
    d_closeall()  # close the temporaries created above
end

check_leaks()

@testset "test broadcast ops" begin
    wrkrs = workers()
    nwrkrs = length(wrkrs)
    nrows = 20 * nwrkrs
    ncols = 10 * nwrkrs
    a = drand((nrows,ncols), wrkrs, (1, nwrkrs))
    m = mean(a, dims=1)
    c = a .- m
    d = convert(Array, a) .- convert(Array, m)
    @test c == d
    e = @DArray [ones(10) for i = 1:4]
    f = 2 .* e
    @test Array(f) == 2 .* Array(e)
    @test Array(map(x -> sum(x) .+ 2, e)) == map(x -> sum(x) .+ 2, e)
    @testset "test nested broadcast" begin
        g = a .- m .* sin.(c)
        @test Array(g) == Array(a) .- Array(m) .* sin.(Array(c))
    end
    @testset "Broadcasting into DArray" begin
        a .= ones(nrows, ncols)
        @test all(isone, a)
        a .= 3 .+ abs2.(@view(zeros(nrows, ncols + 5)[:, 6:end]))
        @test all(x -> x == 3, a)
    end
    # @testset "lazy wrapped broadcast" begin
    #     l = similar(a)
    #     l[1:10, :] .= view(a, 1:10, :)
    # end
    d_closeall()
end

check_leaks()

@testset "test matrix multiplication" begin
    A = drandn(20,20)
    b = drandn(20)
    B = drandn(20,20)
    @test norm(convert(Array, A*b) - convert(Array, A)*convert(Array, b), Inf) < sqrt(eps())
    @test norm(convert(Array, A*B) - convert(Array, A)*convert(Array, B), Inf) < sqrt(eps())
    @test norm(convert(Array, A'*b) - convert(Array, A)'*convert(Array, b), Inf) < sqrt(eps())
    @test norm(convert(Array, A'*B) - convert(Array, A)'*convert(Array, B), Inf) < sqrt(eps())
    close(A)
    close(b)
    close(B)
    d_closeall()  # close the temporaries created above
end

check_leaks()

@testset "dot product" begin
    A = drandn(20,20)
    b = drandn(20)
    c = A * b
    @test dot(c, b) ≈ dot(convert(Array, c), convert(Array, b))
    close(A)
    close(b)
    close(c)
end

check_leaks()

@testset "test norm" begin
    x = drandn(20)
    @test abs(norm(x) - norm(convert(Array, x))) < sqrt(eps())
    @test abs(norm(x, 1) - norm(convert(Array, x), 1)) < sqrt(eps())
    @test abs(norm(x, 2) - norm(convert(Array, x), 2)) < sqrt(eps())
    @test abs(norm(x, Inf) - norm(convert(Array, x), Inf)) < sqrt(eps())
    close(x)
end

check_leaks()
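# Sketch (illustrative only): as the linear-algebra testsets above do,
# correctness of a distributed operation is checked by comparing against
# the same computation on plain `Array` conversions.
let A = drandn(20, 20), b = drandn(20)
    @test norm(Array(A * b) - Array(A) * Array(b), Inf) < sqrt(eps())
    close(A)
    close(b)
    d_closeall()  # A * b allocated an intermediate DArray
end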
@testset "test axpy!" begin
    for (x, y) in ((drandn(20), drandn(20)), (drandn(20, 2), drandn(20, 2)))
        @test Array(axpy!(2.0, x, copy(y))) ≈ axpy!(2.0, Array(x), Array(y))
        @test_throws DimensionMismatch axpy!(2.0, x, zeros(length(x) + 1))
        close(x)
        close(y)
    end
    d_closeall()  # close the temporaries created above
end

check_leaks()

@testset "test ppeval" begin
    A = drandn((10, 10, nworkers()), workers(), [1, 1, nworkers()])
    B = drandn((10, nworkers()), workers(), [1, nworkers()])

    R = zeros(10, nworkers())
    for i = 1:nworkers()
        R[:, i] = convert(Array, A)[:, :, i] * convert(Array, B)[:, i]
    end
    @test convert(Array, ppeval(*, A, B)) ≈ R
    @test sum(ppeval(eigvals, A)) ≈ sum(ppeval(eigvals, A, Matrix{Float64}(I,10,10)))
    close(A)
    close(B)
    d_closeall()  # close the temporaries created above
end

check_leaks()

@testset "test nnz" begin
    A = sprandn(10, 10, 0.5)
    @test nnz(distribute(A)) == nnz(A)
end

@testset "test matmatmul" begin
    A = drandn(30, 30)
    B = drandn(30, 20)
    a = convert(Array, A)
    b = convert(Array, B)

    AB = A * B
    AtB = transpose(A) * B
    AcB = A' * B

    ab = a * b
    atb = transpose(a) * b
    acb = a' * b

    @test AB ≈ ab
    @test AtB ≈ atb
    @test AcB ≈ acb
    d_closeall()  # close the temporaries created above
end

@testset "sort, T = $T, 10^$i elements" for i in 0:6, T in [Int, Float64]
    d = DistributedArrays.drand(T, 10^i)
    @testset "sample = $sample" for sample in Any[true, false, (minimum(d), maximum(d)), rand(T, 10^i > 512 ? 512 : 10^i)]
        d2 = DistributedArrays.sort(d; sample=sample)

        a  = convert(Array, d)
        a2 = convert(Array, d2)
        @test length(d) == length(d2)
        @test sort(a) == a2
    end
    d_closeall()  # close the temporaries created above
end

check_leaks()

@testset "ddata" begin
    d = ddata(;T=Int, init=I->myid())
    for p in workers()
        @test p == remotecall_fetch(d -> d[:L], p, d)
    end
    @test Int[workers()...] == gather(d)
    close(d)

    d = ddata(;T=Int, data=workers())
    for p in workers()
        @test p == remotecall_fetch(d -> d[:L], p, d)
    end
    @test Int[workers()...] == gather(d)
    close(d)

    d = ddata(;T=Any, init=I->"Hello World!")
    for p in workers()
        @test "Hello World!" == remotecall_fetch(d -> d[:L], p, d)
    end
    @test Any["Hello World!" for p in workers()] == gather(d)
    close(d)
end

check_leaks()
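# Sketch (illustrative): `ddata` creates one element per worker; each worker
# reads its own element with the `d[:L]` localpart syntax, and `gather`
# collects all elements on the caller.
let d = ddata(; T=Int, init=I -> myid())
    @test gather(d) == workers()
    close(d)
end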
@testset "rand!" begin
    d = dzeros(30, 30)
    rand!(d)
    close(d)
end

check_leaks()

@testset "fill!" begin
    d = dzeros(30, 30)
    fill!(d, 3.14)
    @test all(x -> x == 3.14, d)
    close(d)
end

check_leaks()

d_closeall()

@testset "test for any leaks" begin
    sleep(1.0)  # allow time for any cleanup to complete
    allrefszero = Bool[remotecall_fetch(() -> @lock(DistributedArrays.REFS.lock, isempty(DistributedArrays.REFS.data)), p) for p in procs()]
    @test all(allrefszero)
    allregistrieszero = Bool[remotecall_fetch(() -> @lock(DistributedArrays.REGISTRY.lock, isempty(DistributedArrays.REGISTRY.data)), p) for p in procs()]
    @test all(allregistrieszero)
end

================================================
FILE: test/explicit_imports.jl
================================================
using DistributedArrays, Test

import ExplicitImports

@testset "ExplicitImports" begin
    # No implicit imports in DistributedArrays (i.e. no `using MyPkg`)
    @test ExplicitImports.check_no_implicit_imports(DistributedArrays) === nothing

    # No non-owning imports in DistributedArrays (i.e. no `using LinearAlgebra: map`)
    @test ExplicitImports.check_all_explicit_imports_via_owners(DistributedArrays) === nothing

    # Limit non-public imports in DistributedArrays (i.e. `using MyPkg: _non_public_internal_func`)
    # to a few selected types and functions
    @test ExplicitImports.check_all_explicit_imports_are_public(
        DistributedArrays;
        ignore = (
            # Base
            :Broadcasted,
            :Callable,
            (VERSION < v"1.11" ? (:tail,) : ())...,
        ),
    ) === nothing

    # No stale imports in DistributedArrays (i.e. no `using MyPkg: func` where `func` is not used in DistributedArrays)
    @test ExplicitImports.check_no_stale_explicit_imports(DistributedArrays) === nothing

    # No non-owning accesses in DistributedArrays (i.e. no `... LinearAlgebra.map(...)`)
    @test ExplicitImports.check_all_qualified_accesses_via_owners(DistributedArrays) === nothing

    # Limit non-public accesses in DistributedArrays (i.e. no `... MyPkg._non_public_internal_func(...)`)
    # to a few selected types and methods from Base
    @test ExplicitImports.check_all_qualified_accesses_are_public(
        DistributedArrays;
        ignore = (
            # Base.Broadcast
            :AbstractArrayStyle,
            :DefaultArrayStyle,
            :broadcasted,
            :throwdm,
            # Base
            (VERSION < v"1.11" ? (Symbol("@propagate_inbounds"),) : ())...,
            :ReshapedArray,
            :Slice,
            :_all,
            :_any,
            :_mapreduce,
            :check_reducedims,
            :checkbounds_indices,
            :index_lengths,
            :mapreducedim!,
            :promote_op,
            :reducedim_initarray,
            :reindex,
            :setindex_shape_check,
            :unalias,
            # Serialization
            :serialize_type,
            # Statistics
            :_mean,
        ),
    ) === nothing

    # No self-qualified accesses in DistributedArrays (i.e. no `... DistributedArrays.func(...)`)
    @test ExplicitImports.check_no_self_qualified_accesses(DistributedArrays) === nothing
end

================================================
FILE: test/runtests.jl
================================================
using Test
using Distributed
using DistributedArrays

# Disable scalar indexing to avoid falling back on generic methods
# for AbstractArray
DistributedArrays.allowscalar(false)

# add at least 3 worker processes
if nworkers() < 3
    n = max(3, min(8, Sys.CPU_THREADS))
    addprocs(n; exeflags=`--check-bounds=yes`)
end
@assert nprocs() > 3
@assert nworkers() >= 3

@everywhere using Distributed
@everywhere using DistributedArrays
@everywhere using DistributedArrays.SPMD
@everywhere using Random
@everywhere using LinearAlgebra

@everywhere Random.seed!(1234 + myid())

const MYID = myid()
const OTHERIDS = filter(id -> id != MYID, procs())[rand(1:(nprocs()-1))]

function check_leaks()
    nrefs = @lock DistributedArrays.REFS.lock length(DistributedArrays.REFS.data)
    if !iszero(nrefs)
        sleep(0.1)  # allow time for any cleanup to complete, then test again
        nrefs = @lock DistributedArrays.REFS.lock length(DistributedArrays.REFS.data)
        if !iszero(nrefs)
            @warn "Probable leak of $nrefs darrays"
        end
    end
end

include("aqua.jl")
include("explicit_imports.jl")
include("darray.jl")
include("spmd.jl")

================================================
FILE: test/spmd.jl
================================================
@everywhere function spmd_test1()
    barrier(;tag=:b1)
    if myid() == 1
        @assert SPMD.recvfrom(2) == "Hello from 2"
        println("SPMD: Passed send/recv")
    elseif myid() == 2
        data = "Hello from 2"
        sendto(1, data)
    end

    stime = rand(1:5)
    # println("Sleeping for $stime seconds")
    sleep(stime)

    barrier(;tag=:b2)

    bcast_val = nothing
    if myid() == 1
        bcast_val = rand(2)
    end
    bcast_val = bcast(bcast_val, 1)
    if myid() == 1
        @assert bcast_val == SPMD.recvfrom(2)
        println("SPMD: Passed broadcast")
    elseif myid() == 2
        sendto(1, bcast_val)
    end

    barrier()

    scatter_data = nothing
    if myid() == 1
        scatter_data = rand(Int8, nprocs())
    end
    lp = scatter(scatter_data, 1, tag=1)
    if myid() == 1
        @assert scatter_data[2:2] == SPMD.recvfrom(2)
        println("SPMD: Passed scatter 1")
    elseif myid() == 2
        sendto(1, lp)
    end

    scatter_data = nothing
    if myid() == 1
        scatter_data = rand(Int8, nprocs()*2)
    end
    lp = scatter(scatter_data, 1, tag=2)
    if myid() == 1
        @assert scatter_data[3:4] == SPMD.recvfrom(2)
        println("SPMD: Passed scatter 2")
    elseif myid() == 2
        sendto(1, lp)
    end

    gathered_data = gather(myid(), 1, tag=3)
    if myid() == 1
        @assert gathered_data == procs()
        println("SPMD: Passed gather 1")
    end

    gathered_data = gather([myid(), myid()], 1, tag=4)
    if myid() == 1
        @assert gathered_data == [[p,p] for p in procs()]
        println("SPMD: Passed gather 2")
    end
end

spmd(spmd_test1)
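# Minimal SPMD sketch (illustrative, reusing only primitives exercised in
# `spmd_test1` above): every participating pid runs the same function, and
# `bcast` moves a value from pid 1 to all of them.
@everywhere function bcast_demo()
    val = myid() == 1 ? 42 : nothing
    val = bcast(val, 1)
    @assert val == 42
end
spmd(bcast_demo)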
# Test running only on the workers using the spmd function.

# Define the function everywhere.
@everywhere function foo_spmd(d_in, d_out, n)
    pids = sort(vec(procs(d_in)))
    pididx = findfirst(isequal(myid()), pids)
    mylp = localpart(d_in)
    localsum = 0

    # Have each node exchange data with its neighbors
    n_pididx = pididx+1 > length(pids) ? 1 : pididx+1
    p_pididx = pididx-1 < 1 ? length(pids) : pididx-1

    # println(p_pididx, " p", pids[p_pididx], " ", n_pididx, " p", pids[n_pididx])
    # println(mylp)

    for i in 1:n
        sendto(pids[n_pididx], mylp[2])
        sendto(pids[p_pididx], mylp[1])

        mylp[2] = SPMD.recvfrom(pids[p_pididx])
        mylp[1] = SPMD.recvfrom(pids[n_pididx])

        # println(mylp)

        barrier(;pids=pids)
        localsum = localsum + mylp[1] + mylp[2]
    end

    # finally store the sum in d_out
    d_out[:L] = localsum
end

# Run eight instances of foo_spmd on all workers, all concurrently; each
# call implicitly gets its own context.
in_arrays = map(x -> DArray(I -> fill(myid(), (map(length,I)...,)), (nworkers(), 2), workers(), [nworkers(),1]), 1:8)
out_arrays = map(x -> ddata(), 1:8)

@sync for i in 1:8
    @async spmd(foo_spmd, in_arrays[i], out_arrays[i], nworkers(); pids=workers())
end

for i in 1:8
    @test Any[sum(workers())*2 for i in 1:nworkers()] == gather(out_arrays[i])
end
println("SPMD: Passed testing of spmd function run concurrently")

# Run concurrently with explicitly different contexts.

# Define the function everywhere.
@everywhere function foo_spmd2(d_in, d_out, n)
    pids = sort(vec(procs(d_in)))
    pididx = findfirst(isequal(myid()), pids)
    mylp = localpart(d_in)

    # see if we have a value in the local store.
    store = context_local_storage()
    localsum = get!(store, :LOCALSUM, 0)

    # Have each node exchange data with its neighbors
    n_pididx = pididx+1 > length(pids) ? 1 : pididx+1
    p_pididx = pididx-1 < 1 ? length(pids) : pididx-1

    for i in 1:n
        sendto(pids[n_pididx], mylp[2])
        sendto(pids[p_pididx], mylp[1])

        mylp[2] = SPMD.recvfrom(pids[p_pididx])
        mylp[1] = SPMD.recvfrom(pids[n_pididx])

        barrier(;pids=pids)
        localsum = localsum + mylp[1] + mylp[2]
    end

    # finally store the sum in d_out and in the local store
    d_out[:L] = localsum
    store[:LOCALSUM] = localsum
end

in_arrays = map(x -> DArray(I -> fill(myid(), (map(length,I)...,)), (nworkers(), 2), workers(), [nworkers(),1]), 1:8)
out_arrays = map(x -> ddata(), 1:8)
contexts = map(x -> context(workers()), 1:8)

@sync for i in 1:8
    @async spmd(foo_spmd2, in_arrays[i], out_arrays[i], nworkers(); pids=workers(), context=contexts[i])
end

# The second run adds to the values stored by the previous run.
@sync for i in 1:8
    @async spmd(foo_spmd2, in_arrays[i], out_arrays[i], nworkers(); pids=workers(), context=contexts[i])
end

for i in 1:8
    @test Any[2*sum(workers())*2 for i in 1:nworkers()] == gather(out_arrays[i])
end
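# Why the totals double: `context_local_storage()` hands back storage that is
# local to each (worker, context) pair, so the :LOCALSUM left behind by the
# first run seeds the `get!` call in the second run over the same context.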
# verify that localstores with the appropriate context store values exist.
@everywhere begin
    if myid() != 1
        local n = 0
        @lock DistributedArrays.SPMD.CONTEXTS.lock begin
            for (k,v) in DistributedArrays.SPMD.CONTEXTS.data
                store = v.store
                localsum = store[:LOCALSUM]
                if localsum != 2*sum(workers())*2
                    println("localsum ", localsum, " != $(2*sum(workers())*2)")
                    error("localsum mismatch")
                end
                n += 1
            end
        end
        @assert n == 8
    end
end

# close the contexts
foreach(close, contexts)

# verify that the localstores have been deleted.
@everywhere begin
    @assert @lock DistributedArrays.SPMD.CONTEXTS.lock isempty(DistributedArrays.SPMD.CONTEXTS.data)
end

println("SPMD: Passed spmd function with explicit context run concurrently")
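# Illustrative follow-up sketch (not from the original suite): a fresh
# context shows the same store persistence in isolation; state written by
# the first run is incremented by the second, then released by `close`.
@everywhere function counter_demo()
    store = context_local_storage()
    store[:COUNT] = get!(store, :COUNT, 0) + 1
    @assert store[:COUNT] in (1, 2)
end
let ctx = context(workers())
    spmd(counter_demo; pids=workers(), context=ctx)
    spmd(counter_demo; pids=workers(), context=ctx)
    close(ctx)
end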