Repository: JuliaApproximation/FastTransforms.jl
Branch: master
Commit: 14a311816ffb
Files: 59
Total size: 331.9 KB

Directory structure:
gitextract_cc1mg2bt/

├── .github/
│   └── workflows/
│       ├── CIWindows.yml
│       ├── CompatHelper.yml
│       ├── TagBot.yml
│       ├── ci.yml
│       ├── docs.yml
│       └── downstream.yml
├── .gitignore
├── LICENSE.md
├── Project.toml
├── README.md
├── deps/
│   └── build.jl
├── docs/
│   ├── Project.toml
│   ├── make.jl
│   └── src/
│       ├── dev.md
│       └── index.md
├── examples/
│   ├── annulus.jl
│   ├── automaticdifferentiation.jl
│   ├── chebyshev.jl
│   ├── disk.jl
│   ├── halfrange.jl
│   ├── nonlocaldiffusion.jl
│   ├── padua.jl
│   ├── sphere.jl
│   ├── sphericalisometries.jl
│   ├── spinweighted.jl
│   ├── subspaceangles.jl
│   └── triangle.jl
├── src/
│   ├── FastTransforms.jl
│   ├── GramMatrix.jl
│   ├── PaduaTransform.jl
│   ├── ToeplitzPlusHankel.jl
│   ├── arrays.jl
│   ├── chebyshevtransform.jl
│   ├── clenshawcurtis.jl
│   ├── docstrings.jl
│   ├── elliptic.jl
│   ├── fejer.jl
│   ├── gaunt.jl
│   ├── hermite.jl
│   ├── inufft.jl
│   ├── libfasttransforms.jl
│   ├── nufft.jl
│   ├── specialfunctions.jl
│   ├── toeplitzhankel.jl
│   └── toeplitzplans.jl
└── test/
    ├── arraystests.jl
    ├── chebyshevtests.jl
    ├── gaunttests.jl
    ├── grammatrixtests.jl
    ├── hermitetests.jl
    ├── libfasttransformstests.jl
    ├── nuffttests.jl
    ├── paduatests.jl
    ├── quadraturetests.jl
    ├── runtests.jl
    ├── specialfunctionstests.jl
    ├── toeplitzhankeltests.jl
    ├── toeplitzplanstests.jl
    └── toeplitzplushankeltests.jl

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/workflows/CIWindows.yml
================================================
# Windows counterpart of the main CI workflow (ci.yml). The action versions and
# the Codecov upload configuration mirror ci.yml so both workflows behave alike.
name: CI Windows
on:
  - push
  - pull_request
jobs:
  testwindows:
    name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }}
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        version:
          - '1'
        os:
          - windows-latest
        arch:
          - x86
          - x64
    steps:
      - uses: actions/checkout@v4
      - uses: julia-actions/setup-julia@v2
        with:
          version: ${{ matrix.version }}
          arch: ${{ matrix.arch }}
          show-versioninfo: true
      - uses: actions/cache@v3
        env:
          cache-name: cache-artifacts
        with:
          path: ~/.julia/artifacts
          key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}
          restore-keys: |
            ${{ runner.os }}-test-${{ env.cache-name }}-
            ${{ runner.os }}-test-
            ${{ runner.os }}-
      - uses: julia-actions/julia-buildpkg@latest
      - uses: julia-actions/julia-runtest@latest
      - uses: julia-actions/julia-processcoverage@v1
      # Same upload step as ci.yml: v4 of the action with the repository token.
      - uses: codecov/codecov-action@v4
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: lcov.info


================================================
FILE: .github/workflows/CompatHelper.yml
================================================
name: CompatHelper
on:
  schedule:
    - cron: 0 0 * * *
  workflow_dispatch:
permissions:
  contents: write
  pull-requests: write
jobs:
  CompatHelper:
    runs-on: ubuntu-latest
    steps:
      - name: Check if Julia is already available in the PATH
        id: julia_in_path
        run: which julia
        continue-on-error: true
      - name: Install Julia, but only if it is not already available in the PATH
        uses: julia-actions/setup-julia@v1
        with:
          version: '1'
          arch: ${{ runner.arch }}
        if: steps.julia_in_path.outcome != 'success'
      - name: "Add the General registry via Git"
        run: |
          import Pkg
          ENV["JULIA_PKG_SERVER"] = ""
          Pkg.Registry.add("General")
        shell: julia --color=yes {0}
      - name: "Install CompatHelper"
        run: |
          import Pkg
          name = "CompatHelper"
          uuid = "aa819f21-2bde-4658-8897-bab36330d9b7"
          version = "3"
          Pkg.add(; name, uuid, version)
        shell: julia --color=yes {0}
      - name: "Run CompatHelper"
        run: |
          import CompatHelper
          CompatHelper.main()
        shell: julia --color=yes {0}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }}
          # COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }}


================================================
FILE: .github/workflows/TagBot.yml
================================================
name: TagBot
on:
  issue_comment:
    types:
      - created
  workflow_dispatch:
jobs:
  TagBot:
    if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot'
    runs-on: ubuntu-latest
    steps:
      - uses: JuliaRegistries/TagBot@v1
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          ssh: ${{ secrets.DOCUMENTER_KEY }}


================================================
FILE: .github/workflows/ci.yml
================================================
name: CI
on:
  - push
  - pull_request
jobs:
  test:
    name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        version:
          - 'lts'
          - '1'
        os:
          - ubuntu-latest
#          - macOS-latest
        arch:
          - x86
          - x64
        exclude:
          - os: macOS-latest
            arch: x86
    steps:
      - uses: actions/checkout@v4
      - uses: julia-actions/setup-julia@v2
        with:
          version: ${{ matrix.version }}
          arch: ${{ matrix.arch }}
          show-versioninfo: true
      - uses: actions/cache@v3
        env:
          cache-name: cache-artifacts
        with:
          path: ~/.julia/artifacts
          key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}
          restore-keys: |
            ${{ runner.os }}-test-${{ env.cache-name }}-
            ${{ runner.os }}-test-
            ${{ runner.os }}-
      - uses: julia-actions/julia-buildpkg@latest
      - uses: julia-actions/julia-runtest@latest
      - uses: julia-actions/julia-processcoverage@v1
      - uses: codecov/codecov-action@v4
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: lcov.info


================================================
FILE: .github/workflows/docs.yml
================================================
name: Documentation
on:
  - push
  - pull_request
jobs:
  docs:
    name: Documentation
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: julia-actions/setup-julia@v1
        with:
          version: '1'
      - uses: julia-actions/julia-docdeploy@releases/v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}


================================================
FILE: .github/workflows/downstream.yml
================================================
name: IntegrationTest
on:
  push:
    branches: [master]
    tags: [v*]
    paths-ignore:
      - 'LICENSE'
      - 'README.md'
      - '.github/workflows/TagBot.yml'
  pull_request:
    paths-ignore:
      - 'LICENSE'
      - 'README.md'
      - '.github/workflows/TagBot.yml'

concurrency:
  group: build-${{ github.event.pull_request.number || github.ref }}-${{ github.workflow }}
  cancel-in-progress: true

jobs:
  pre_job:
    # continue-on-error: true # Uncomment once integration is finished
    runs-on: ubuntu-latest
    # Map a step output to a job output
    outputs:
      should_skip: ${{ steps.skip_check.outputs.should_skip }}
    steps:
      - id: skip_check
        uses: fkirc/skip-duplicate-actions@v5
  test:
    needs: pre_job
    if: needs.pre_job.outputs.should_skip != 'true'
    name: ${{ matrix.package.group }}/${{ matrix.package.repo }}/${{ matrix.julia-version }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        julia-version: ['1']
        os: [ubuntu-latest]
        package:
          - {repo: ClassicalOrthogonalPolynomials.jl, group: JuliaApproximation}
          - {repo: MultivariateOrthogonalPolynomials.jl, group: JuliaApproximation}
          - {repo: ApproxFun.jl, group: JuliaApproximation}

    steps:
      - uses: actions/checkout@v4
      - uses: julia-actions/setup-julia@v2
        with:
          version: ${{ matrix.julia-version }}
          arch: x64
      - uses: julia-actions/julia-buildpkg@latest
      - name: Clone Downstream
        uses: actions/checkout@v4
        with:
          repository: ${{ matrix.package.group }}/${{ matrix.package.repo }}
          path: downstream
      - name: Load this and run the downstream tests
        shell: julia --color=yes --project=downstream {0}
        run: |
          using Pkg
          try
            # force it to use this PR's version of the package
            Pkg.develop(PackageSpec(path="."))  # resolver may fail with main deps
            Pkg.update()
            Pkg.test(; coverage = true)  # resolver may fail with test time deps
          catch err
            err isa Pkg.Resolve.ResolverError || rethrow()
            # If we can't resolve that means this is incompatible by SemVer and this is fine
            # It means we marked this as a breaking change, so we don't need to worry about
            # Mistakenly introducing a breaking change, as we have intentionally made one
            @info "Not compatible with this release. No problem." exception=err
            exit(0)  # Exit immediately, as a success
          end
      - uses: julia-actions/julia-processcoverage@v1
      - uses: codecov/codecov-action@v4
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          files: lcov.info


================================================
FILE: .gitignore
================================================
docs/build/
docs/src/generated
deps/build.log
deps/libfasttransforms.*
.DS_Store
deps/FastTransforms/
Manifest.toml


================================================
FILE: LICENSE.md
================================================
The FastTransforms.jl package is licensed under the MIT "Expat" License:

> Copyright (c) 2016-2019: Richard Mikael Slevinsky and other contributors:
>
> https://github.com/JuliaApproximation/FastTransforms.jl/graphs/contributors
>
> Permission is hereby granted, free of charge, to any person obtaining
> a copy of this software and associated documentation files (the
> "Software"), to deal in the Software without restriction, including
> without limitation the rights to use, copy, modify, merge, publish,
> distribute, sublicense, and/or sell copies of the Software, and to
> permit persons to whom the Software is furnished to do so, subject to
> the following conditions:
>
> The above copyright notice and this permission notice shall be
> included in all copies or substantial portions of the Software.
>
> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
> IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
> CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
> TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
> SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


================================================
FILE: Project.toml
================================================
name = "FastTransforms"
uuid = "057dd010-8810-581a-b7be-e3fc3b93f78c"
version = "0.17.1"


[deps]
AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
ArrayLayouts = "4c555306-a7a7-4459-81d9-ec55ddd5c99a"
BandedMatrices = "aae01518-5342-5314-be14-df237901396f"
FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
FastGaussQuadrature = "442a2c76-b920-505d-bb47-c5924d526838"
FastTransforms_jll = "34b6f7d7-08f9-5794-9e10-3819e4c7e49a"
FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
GenericFFT = "a8297547-1b15-4a5a-a998-a2ac5f1cef28"
LazyArrays = "5078a376-72f3-5289-bfd5-ec5146d43c02"
Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
RecurrenceRelationships = "807425ed-42ea-44d6-a357-6771516d7b2c"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
ToeplitzMatrices = "c751599d-da0a-543b-9d20-d0a503d91d24"

[compat]
AbstractFFTs = "1.0"
ArrayLayouts = "1.10"
BandedMatrices = "1.5"
FFTW = "1.7"
FastGaussQuadrature = "0.4, 0.5, 1"
FastTransforms_jll = "0.6.2"
FillArrays = "0.9, 0.10, 0.11, 0.12, 0.13, 1"
GenericFFT = "0.1"
LazyArrays = "2.2"
RecurrenceRelationships = "0.2"
SpecialFunctions = "0.10, 1, 2"
ToeplitzMatrices = "0.7.1, 0.8"
julia = "1.7"

[extras]
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test", "Random"]


================================================
FILE: README.md
================================================
# FastTransforms.jl

[![Build Status](https://github.com/JuliaApproximation/FastTransforms.jl/workflows/CI/badge.svg)](https://github.com/JuliaApproximation/FastTransforms.jl/actions?query=workflow%3ACI) [![codecov](https://codecov.io/gh/JuliaApproximation/FastTransforms.jl/branch/master/graph/badge.svg?token=BxTvSNgmLL)](https://codecov.io/gh/JuliaApproximation/FastTransforms.jl) [![](https://img.shields.io/badge/docs-stable-blue.svg)](https://JuliaApproximation.github.io/FastTransforms.jl/stable) [![](https://img.shields.io/badge/docs-dev-blue.svg)](https://JuliaApproximation.github.io/FastTransforms.jl/dev)
[![pkgeval](https://juliahub.com/docs/General/FastTransforms/stable/pkgeval.svg)](https://juliaci.github.io/NanosoldierReports/pkgeval_badges/report.html)

`FastTransforms.jl` allows the user to conveniently work with orthogonal polynomials with degrees well into the millions.

This package provides a Julia wrapper for the [C library](https://github.com/MikaelSlevinsky/FastTransforms) of the same name. Additionally, all three types of nonuniform fast Fourier transforms are available, as well as the Padua transform.

## Installation

Installation, which uses [BinaryBuilder](https://github.com/JuliaPackaging/BinaryBuilder.jl) for all of Julia's supported platforms (in particular Sandybridge Intel processors and beyond), may be as straightforward as:

```julia
pkg> add FastTransforms

julia> using FastTransforms, LinearAlgebra

```

## Fast orthogonal polynomial transforms

The orthogonal polynomial transforms are listed in `FastTransforms.Transforms` or `FastTransforms.kind2string.(instances(FastTransforms.Transforms))`. Univariate transforms may be planned with the standard normalization or with orthonormalization. For multivariate transforms, the standard normalization may be too severe for floating-point computations, so it is omitted. Here are two examples:

### The Chebyshev--Legendre transform

```julia
julia> c = rand(8192);

julia> leg2cheb(c);

julia> cheb2leg(c);

julia> norm(cheb2leg(leg2cheb(c; normcheb=true); normcheb=true)-c)/norm(c)
1.1866591414786334e-14

```

The implementation separates pre-computation into an `FTPlan`. This type is constructed with either `plan_leg2cheb` or `plan_cheb2leg`. Let's see how much faster it is if we pre-compute.

```julia
julia> p1 = plan_leg2cheb(c);

julia> p2 = plan_cheb2leg(c);

julia> @time leg2cheb(c);
  0.433938 seconds (9 allocations: 64.641 KiB)

julia> @time p1*c;
  0.005713 seconds (8 allocations: 64.594 KiB)

julia> @time cheb2leg(c);
  0.423865 seconds (9 allocations: 64.641 KiB)

julia> @time p2*c;
  0.005829 seconds (8 allocations: 64.594 KiB)

```

Furthermore, for orthogonal polynomial connection problems that are degree-preserving, we should expect to be able to apply the transforms in-place:

```julia
julia> lmul!(p1, c);

julia> lmul!(p2, c);

julia> ldiv!(p1, c);

julia> ldiv!(p2, c);

```

### The spherical harmonic transform

Let `F` be an array of spherical harmonic expansion coefficients with columns arranged by increasing order in absolute value, alternating between negative and positive orders. Then `sph2fourier` converts the representation into a bivariate Fourier series, and `fourier2sph` converts it back. Once in a bivariate Fourier series on the sphere, `plan_sph_synthesis` converts the coefficients to function samples on an equiangular grid that does not sample the poles, and `plan_sph_analysis` converts them back.

```julia
julia> F = sphrandn(Float64, 1024, 2047); # convenience method

julia> P = plan_sph2fourier(F);

julia> PS = plan_sph_synthesis(F);

julia> PA = plan_sph_analysis(F);

julia> @time G = PS*(P*F);
  0.090767 seconds (12 allocations: 31.985 MiB, 1.46% gc time)

julia> @time H = P\(PA*G);
  0.092726 seconds (12 allocations: 31.985 MiB, 1.63% gc time)

julia> norm(F-H)/norm(F)
2.1541073345177038e-15

```

Due to the structure of the spherical harmonic connection problem, these transforms may also be performed in-place with `lmul!` and `ldiv!`.

See also [FastSphericalHarmonics.jl](https://github.com/eschnett/FastSphericalHarmonics.jl) for a simpler interface to the spherical harmonic transforms defined in this package.

## Nonuniform fast Fourier transforms

The NUFFTs are implemented thanks to [Alex Townsend](https://github.com/ajt60gaibb):
 - `nufft1` assumes uniform samples and noninteger frequencies;
 - `nufft2` assumes nonuniform samples and integer frequencies;
 - `nufft3 ( = nufft)` assumes nonuniform samples and noninteger frequencies;
 - `inufft1` inverts an `nufft1`; and,
 - `inufft2` inverts an `nufft2`.

Here is an example:

```julia
julia> n = 10^4;

julia> c = complex(rand(n));

julia> ω = collect(0:n-1) + rand(n);

julia> nufft1(c, ω, eps());

julia> p1 = plan_nufft1(ω, eps());

julia> @time p1*c;
  0.002383 seconds (6 allocations: 156.484 KiB)

julia> x = (collect(0:n-1) + 3rand(n))/n;

julia> nufft2(c, x, eps());

julia> p2 = plan_nufft2(x, eps());

julia> @time p2*c;
  0.001478 seconds (6 allocations: 156.484 KiB)

julia> nufft3(c, x, ω, eps());

julia> p3 = plan_nufft3(x, ω, eps());

julia> @time p3*c;
  0.058999 seconds (6 allocations: 156.484 KiB)

```

## The Padua Transform

The Padua transform and its inverse are implemented thanks to [Michael Clarke](https://github.com/MikeAClarke). These are optimized methods designed for computing the bivariate Chebyshev coefficients by interpolating a bivariate function at the Padua points on `[-1,1]^2`.

```julia
julia> n = 200;

julia> N = div((n+1)*(n+2), 2);

julia> v = rand(N); # The length of v is the number of Padua points

julia> @time norm(ipaduatransform(paduatransform(v)) - v)/norm(v)
  0.007373 seconds (543 allocations: 1.733 MiB)
3.925164683252905e-16

```

# References

[1]  D. Ruiz-Antolín and A. Townsend, [A nonuniform fast Fourier transform based on low rank approximation](https://doi.org/10.1137/17M1134822), *SIAM J. Sci. Comput.*, **40**:A529–A547, 2018.

[2] K. Gumerov, S. Rigg, and R. M. Slevinsky, [Fast measure modification of orthogonal polynomials via matrices with displacement structure](https://arxiv.org/abs/2412.17663), arXiv:2412.17663, 2024.

[3] T. S. Gutleb, S. Olver and R. M. Slevinsky, [Polynomial and rational measure modifications of orthogonal polynomials via infinite-dimensional banded matrix factorizations](https://arxiv.org/abs/2302.08448), arXiv:2302.08448, 2023.

[4] S. Olver, R. M. Slevinsky, and A. Townsend, [Fast algorithms using orthogonal polynomials](https://doi.org/10.1017/S0962492920000045), *Acta Numerica*, **29**:573–699, 2020.

[5]  R. M. Slevinsky, [Fast and backward stable transforms between spherical harmonic expansions and bivariate Fourier series](https://doi.org/10.1016/j.acha.2017.11.001), *Appl. Comput. Harmon. Anal.*, **47**:585–606, 2019.

[6]  R. M. Slevinsky, [Conquering the pre-computation in two-dimensional harmonic polynomial transforms](https://arxiv.org/abs/1711.07866), arXiv:1711.07866, 2017.


================================================
FILE: deps/build.jl
================================================
# Build script for the package. When the environment variable
# FT_BUILD_FROM_SOURCE is set to "true", the C library is cloned (or updated) and
# compiled through a bash script; otherwise only an informational message is printed.
if get(ENV, "FT_BUILD_FROM_SOURCE", "false") == "true"
    # Shared-library file extension for the current platform (empty if unrecognized).
    extension = Sys.isapple() ? "dylib" : Sys.islinux() ? "so" : Sys.iswindows() ? "dll" : ""
    # Name of the make executable to invoke.
    make = Sys.iswindows() ? "mingw32-make" : "make"
    # Extra make variables passed to the `lib` target on macOS and Windows.
    flags = Sys.isapple() ? "FT_USE_APPLEBLAS=1" : Sys.iswindows() ? "FT_FFTW_WITH_COMBINED_THREADS=1" : ""
    # Bash script: update an existing `FastTransforms` checkout to the latest master
    # (or clone it), build the `assembly` and `lib` targets, and move the resulting
    # shared library next to this script. `set -e` aborts on the first failing command.
    script = """
        set -e
        set -x
        if [ -d "FastTransforms" ]; then
            cd FastTransforms
            git fetch
            git checkout master
            git pull
            cd ..
        else
            git clone https://github.com/MikaelSlevinsky/FastTransforms.git FastTransforms
        fi
        cd FastTransforms
        $make assembly
        $make lib $flags
        cd ..
        mv -f FastTransforms/libfasttransforms.$extension libfasttransforms.$extension
    """
    try
        run(`bash -c $(script)`)
    catch
        # Any failure of the script is reported as one installation error that
        # carries platform-specific hints for installing the build dependencies.
        error(
            "FastTransforms could not be properly installed.\n Please check that you have all dependencies installed. " *
            "Sample installation of dependencies:\n" *
            (Sys.isapple() ? "On MacOS\n\tbrew install libomp fftw mpfr\n" :
             Sys.islinux() ? "On Linux\n\tsudo apt-get install libomp-dev libblas-dev libopenblas-base libfftw3-dev libmpfr-dev\n" :
             Sys.iswindows() ? "On Windows\n\tvcpkg install openblas:x64-windows fftw3[core,threads]:x64-windows mpir:x64-windows mpfr:x64-windows\n" :
             "On your platform, please consider opening a pull request to add support to build from source.\n")
        )
    end
    # Reached only when the script completed without raising an error.
    println("FastTransforms built from source.")
else
    println("FastTransforms using precompiled binaries.")
end


================================================
FILE: docs/Project.toml
================================================
[deps]
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
FastTransforms = "057dd010-8810-581a-b7be-e3fc3b93f78c"
LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
LazyArrays = "5078a376-72f3-5289-bfd5-ec5146d43c02"
Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
PlotlyJS = "f0f68f2c-4968-5e81-91da-67840de0976a"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"

[compat]
Documenter = "~0.24"
Literate = "~2.8"


================================================
FILE: docs/make.jl
================================================
using Documenter, FastTransforms, Literate, Plots

# Switch Plots to its PlotlyJS backend before any example is executed.
plotlyjs()

# Location of the example scripts and of the markdown generated from them.
const EXAMPLES_DIR = joinpath(@__DIR__, "..", "examples")
const OUTPUT_DIR   = joinpath(@__DIR__, "src/generated")

# Example scripts (file names inside EXAMPLES_DIR) that are turned into
# documentation pages.
examples = [
    "annulus.jl",
    "automaticdifferentiation.jl",
    "chebyshev.jl",
    "disk.jl",
    "halfrange.jl",
    "nonlocaldiffusion.jl",
    "padua.jl",
    "sphere.jl",
    "spinweighted.jl",
    "subspaceangles.jl",
    "triangle.jl",
]

# Post-process generated markdown: strip the `###` prefix that hides raw-HTML
# `<object>` embeds in the example sources. The substitutions are applied one after
# another, in the listed order, each on the result of the previous one.
function uncomment_objects(str)
    substitutions = (
        "###```@raw" => "```\n\n```@raw",
        "###<object" => "<object",
        "###```\n```" => "```",
    )
    return foldl((text, rule) -> replace(text, rule), substitutions; init = str)
end

# Convert every example script into markdown inside OUTPUT_DIR, executing its code
# and post-processing the generated text with `uncomment_objects`.
foreach(examples) do example
    Literate.markdown(
        joinpath(EXAMPLES_DIR, example),
        OUTPUT_DIR;
        execute = true,
        postprocess = uncomment_objects,
    )
end

# Build the HTML documentation. Doctests are disabled, and the page tree consists
# of the two hand-written pages followed by the markdown generated from the examples.
makedocs(
            doctest = false,
            format = Documenter.HTML(),
            sitename = "FastTransforms.jl",
            authors = "Richard Mikael Slevinsky",
            pages = Any[
                    "Home" => "index.md",
                    "Development" => "dev.md",
                    "Examples" => [
                        "generated/annulus.md",
                        "generated/automaticdifferentiation.md",
                        "generated/chebyshev.md",
                        "generated/disk.md",
                        "generated/halfrange.md",
                        "generated/nonlocaldiffusion.md",
                        "generated/padua.md",
                        "generated/sphere.md",
                        "generated/spinweighted.md",
                        "generated/subspaceangles.md",
                        "generated/triangle.md",
                        ],
                    ]
        )


# Deploy the built documentation for the given GitHub repository.
deploydocs(
    repo   = "github.com/JuliaApproximation/FastTransforms.jl.git",
    )


================================================
FILE: docs/src/dev.md
================================================
# Development Documentation

The core of [`FastTransforms.jl`](https://github.com/JuliaApproximation/FastTransforms.jl) is developed in parallel with the [C library](https://github.com/MikaelSlevinsky/FastTransforms) of the same name. Julia and C interoperability is enhanced by the [BinaryBuilder](https://github.com/JuliaPackaging/BinaryBuilder.jl) infrastructure, which provides the user a safe and seamless experience using a package in a different language.

## Why two packages?

Orthogonal polynomial transforms are performance-sensitive imperative tasks. Yet, many of Julia's rich and evolving language features are simply unnecessary for defining these computational routines. Moreover, rapid language changes in Julia (as compared to C) have been more than a perturbation to this repository in the past.

The C library generates assembly for vectorized operations such as single instruction multiple data (SIMD) that is more efficient than that generated by a compiler without human intervention. It also uses OpenMP to introduce shared memory parallelism for large tasks. Finally, calling into precompiled binaries reduces the Julia package's pre-compilation and dependencies, improving the user experience. Some of these capabilities also exist in Julia, but with C there is frankly more control over performance.

C libraries are easier to call from any other language, partly explaining why the Python package manager Spack [already supports the C library](https://spack.readthedocs.io/en/latest/package_list.html#fasttransforms) through third-party efforts.

In Julia, a parametric composite type with unrestricted type parameters is just about as big as `Any`. Such a type allows the Julia API to far exceed the C API in its ability to unify all of the orthogonal polynomial transforms and present them as linear operators. The `mutable struct FTPlan{T, N, K}`, together with `AdjointFTPlan` and `TransposeFTPlan`, are the core Julia types in this repository. Whereas `T` is understood to represent the element type of the plan and `N` represents the number of leading dimensions of the array on which it operates, `K` is a mere enumeration which serves to distinguish the orthogonal polynomials at play. For example, `FTPlan{Float64, 1, LEG2CHEB}` represents the necessary pre-computations to convert 64-bit Legendre series to Chebyshev series (of the first kind). `N == 1` because Chebyshev and Legendre series are naturally represented with vectors of coefficients. However, this particular plan may operate not only on vectors but also on matrices, column-by-column.

## The developer's right to build from source

Precompiled binaries are important for users, but development in C may be greatly accelerated by coupling it with a dynamic language such as Julia. For this reason, the repository preserves the developer's right to build the C library from source by setting an environment variable to trigger the build script:

```julia
julia> ENV["FT_BUILD_FROM_SOURCE"] = "true"
"true"

(@v1.5) pkg> build FastTransforms
   Building FFTW ──────────→ `~/.julia/packages/FFTW/ayqyZ/deps/build.log`
   Building TimeZones ─────→ `~/.julia/packages/TimeZones/K98G0/deps/build.log`
   Building FastTransforms → `~/.julia/dev/FastTransforms/deps/build.log`

julia> using FastTransforms
[ Info: Precompiling FastTransforms [057dd010-8810-581a-b7be-e3fc3b93f78c]

```

This lets the developer experiment with new features through `ccall`ing into bleeding edge source code. Customizing the build script further allows the developer to track a different branch or even a fork.

## From release to release to release

To get from a C library release to a Julia package release, the developer needs to update Yggdrasil's [build_tarballs.jl](https://github.com/JuliaPackaging/Yggdrasil/blob/master/F/FastTransforms/build_tarballs.jl) script for the new version and its 256-bit SHA. On macOS, the SHA can be found by:

```julia
shell> curl https://codeload.github.com/MikaelSlevinsky/FastTransforms/tar.gz/v0.6.2 --output FastTransforms.tar.gz
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  168k    0  168k    0     0   429k      0 --:--:-- --:--:-- --:--:--  429k

shell> shasum -a 256 FastTransforms.tar.gz
fd00befcb0c20ba962a8744a7b9139355071ee95be70420de005b7c0f6e023aa  FastTransforms.tar.gz

shell> rm -f FastTransforms.tar.gz

```

Using [SHA.jl](https://github.com/JuliaCrypto/SHA.jl), the SHA can also be found by:

```julia
shell> curl https://codeload.github.com/MikaelSlevinsky/FastTransforms/tar.gz/v0.6.2 --output FastTransforms.tar.gz
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  168k    0  168k    0     0   442k      0 --:--:-- --:--:-- --:--:--  443k

julia> using SHA

julia> open("FastTransforms.tar.gz") do f
           bytes2hex(sha256(f))
       end
"fd00befcb0c20ba962a8744a7b9139355071ee95be70420de005b7c0f6e023aa"

shell> rm -f FastTransforms.tar.gz

```

Then we wait for the friendly folks at [JuliaPackaging](https://github.com/JuliaPackaging) to merge the pull request to Yggdrasil, triggering a new release of the [FastTransforms_jll.jl](https://github.com/JuliaBinaryWrappers/FastTransforms_jll.jl) meta package that stores all precompiled binaries. With this release, we update the FastTransforms.jl [Project.toml](https://github.com/JuliaApproximation/FastTransforms.jl/blob/master/Project.toml) to point to the latest release and register the new version.

Since development of Yggdrasil is quite rapid, a fork may easily become stale. Git permits the developer to forcibly make a master branch on a fork even with upstream master:

```
git fetch upstream
git checkout master
git reset --hard upstream/master
git push origin master --force
```


================================================
FILE: docs/src/index.md
================================================
# FastTransforms.jl Documentation

## Introduction

[`FastTransforms.jl`](https://github.com/JuliaApproximation/FastTransforms.jl) allows the user to conveniently work with orthogonal polynomials with degrees well into the millions.

This package provides a Julia wrapper for the [C library](https://github.com/MikaelSlevinsky/FastTransforms) of the same name. Additionally, all three types of nonuniform fast Fourier transforms are available, as well as the Padua transform.

## Fast orthogonal polynomial transforms

For this documentation, please see the documentation for [FastTransforms](https://github.com/MikaelSlevinsky/FastTransforms). Most transforms have separate forward and inverse plans. In some instances, however, the inverse is in the sense of least-squares, and therefore only the forward transform is planned.

### Modified orthogonal polynomials via fast Cholesky factorization of the Gram matrix

```@docs
GramMatrix
ChebyshevGramMatrix
```

## Nonuniform fast Fourier transforms

```@docs
nufft1
nufft2
nufft3
inufft1
inufft2
paduatransform
ipaduatransform
```

## Other Exported Methods

```@docs
gaunt
paduapoints
sphevaluate
```

## Internal Methods

### Miscellaneous Special Functions

```@docs
FastTransforms.half
FastTransforms.two
FastTransforms.δ
FastTransforms.Λ
FastTransforms.lambertw
FastTransforms.pochhammer
FastTransforms.stirlingseries
```

### Modified Chebyshev Moment-Based Quadrature

```@docs
FastTransforms.clenshawcurtisnodes
FastTransforms.clenshawcurtisweights
FastTransforms.fejernodes1
FastTransforms.fejerweights1
FastTransforms.fejernodes2
FastTransforms.fejerweights2
FastTransforms.chebyshevmoments1
FastTransforms.chebyshevjacobimoments1
FastTransforms.chebyshevlogmoments1
FastTransforms.chebyshevmoments2
FastTransforms.chebyshevjacobimoments2
FastTransforms.chebyshevlogmoments2
```

### Elliptic Submodule

```@docs
FastTransforms.Elliptic
```


================================================
FILE: examples/annulus.jl
================================================
# # Integration on an annulus
# In this example, we explore integration of the function:
# ```math
#   f(x,y) = \frac{x^3}{x^2+y^2-\frac{1}{4}},
# ```
# over the annulus defined by $\{(r,\theta) : \rho < r < 1, 0 < \theta < 2\pi\}$
# with parameter $\rho = \frac{2}{3}$. We will calculate the integral:
# ```math
#   \int_0^{2\pi}\int_{\frac{2}{3}}^1 f(r\cos\theta,r\sin\theta)^2r{\rm\,d}r{\rm\,d}\theta,
# ```
# by analyzing the function in an annulus polynomial series.
# We analyze the function on an $N\times M$ tensor product grid defined by:
# ```math
# \begin{aligned}
# r_n & = \sqrt{\cos^2\left[(n+\tfrac{1}{2})\pi/2N\right] + \rho^2 \sin^2\left[(n+\tfrac{1}{2})\pi/2N\right]},\quad{\rm for}\quad 0\le n < N,\quad{\rm and}\\
# \theta_m & = 2\pi m/M,\quad{\rm for}\quad 0\le m < M;
# \end{aligned}
# ```
# we convert the function samples to Chebyshev×Fourier coefficients using
# `plan_annulus_analysis`; and finally, we transform the Chebyshev×Fourier
# coefficients to annulus polynomial coefficients using `plan_ann2cxf`.
#
# For the storage pattern of the arrays, please consult the
# [documentation](https://MikaelSlevinsky.github.io/FastTransforms).

using FastTransforms, LinearAlgebra, Plots
# The figures are saved in the generated documentation directory; `mkpath` creates it
# (along with any missing parent directories) and does nothing if it already exists.
const GENFIGS = joinpath(pkgdir(FastTransforms), "docs/src/generated")
mkpath(GENFIGS)
plotlyjs()

# The function $f$ that we sample on the annulus:
f = (x, y) -> x^3 / (x^2 + y^2 - 1/4)

# The annulus polynomial degree and the number of angular samples:
N = 8
M = 4N - 3

# The inner radius of the annulus:
ρ = 2/3

# The radial grid:
r = map(0:N-1) do n
    t = (N - n - 0.5) / (2N)
    ct, st = sinpi(t), cospi(t)
    sqrt(ct^2 + ρ^2 * st^2)
end

# The angular grid (mod $\pi$):
θ = (0:M-1) * 2 / M

# On the mapped tensor product grid, the samples of $f$ are:
F = [f(rad * cospi(ang), rad * sinpi(ang)) for rad in r, ang in θ]

# We superpose a surface plot of $f$ on top of the grid:
X = [rad * cospi(ang) for rad in r, ang in θ]
Y = [rad * sinpi(ang) for rad in r, ang in θ]
scatter3d(vec(X), vec(Y), vec(0F); markersize = 0.75, markercolor = :red)
surface!(X, Y, F; legend = false, xlabel = "x", ylabel = "y", zlabel = "f")
savefig(joinpath(GENFIGS, "annulus.html"))
###```@raw html
###<object type="text/html" data="../annulus.html" style="width:100%;height:400px;"></object>
###```

# We precompute an Annulus--Chebyshev×Fourier plan:
α, β, γ = 0, 0, 0
P = plan_ann2cxf(F, α, β, γ, ρ)

# And an FFTW Chebyshev×Fourier analysis plan on the annulus:
PA = plan_annulus_analysis(F, ρ)

# The annulus polynomial coefficients of $f$ are:
U = P \ (PA * F)

# The annulus coefficients are useful for integration.
# The integral of $[f(x,y)]^2$ over the annulus is
# approximately the square of the 2-norm of the coefficients:
(norm(U)^2, 5π/8 * (1675/4536 + 9 * log(3)/32 - 3 * log(7)/32))


================================================
FILE: examples/automaticdifferentiation.jl
================================================
# # Automatic differentiation through spherical harmonic transforms
# This example finds a positive value of $\lambda$ in:
# ```math
# f(r) = \sin[\lambda (k\cdot r)],
# ```
# for some $k,r\in\mathbb{S}^2$ such that $\int_{\mathbb{S}^2} f^2 {\rm\,d}\Omega = 1$.
# We do this by using derivative information through:
# ```math
# \dfrac{\partial f}{\partial \lambda} = (k\cdot r) \cos[\lambda (k\cdot r)].
# ```

using FastTransforms, LinearAlgebra

# The colatitudinal grid (mod $\pi$):
N = 15
θ = (0.5:N-0.5) / N

# The longitudinal grid (mod $\pi$):
M = 2N - 1
φ = (0:M-1) * 2 / M

# We precompute a spherical harmonic--Fourier plan:
P = plan_sph2fourier(Float64, N)

# And an FFTW Fourier analysis plan on $\mathbb{S}^2$:
PA = plan_sph_analysis(Float64, N, M)

# Our choice of $k$ and the angular parametrization of $r$:
k = [2/7, 3/7, 6/7]
r = (θ, φ) -> [sinpi(θ) * cospi(φ), sinpi(θ) * sinpi(φ), cospi(θ)]

# Our initial guess for $\lambda$:
λ = 1.0

# Then we run Newton iteration and grab an espresso:
for _ in 1:7
    kr = [k ⋅ r(col, lon) for col in θ, lon in φ]
    F = sin.(λ .* kr)
    Fλ = kr .* cos.(λ .* kr)
    U = P \ (PA * F)
    Uλ = P \ (PA * Fλ)
    global λ = λ - (norm(U)^2 - 1) / (2 * sum(U .* Uλ))
    println("λ: $(rpad(λ, 18)) and the 2-norm: $(rpad(norm(U), 18))")
end


================================================
FILE: examples/chebyshev.jl
================================================
# # Chebyshev transform
# This demonstrates the Chebyshev transform and inverse transform,
# explaining precisely the normalization and points.

using FastTransforms
n = 20

# First kind points $\to$ first kind polynomials
p_1 = chebyshevpoints(Float64, n, Val(1))
f = map(exp, p_1)
f̌ = chebyshevtransform(f, Val(1))
f̃ = x -> begin
    t = acos(x)
    [cos(j * t) for j in 0:n-1]' * f̌
end
isapprox(f̃(0.1), exp(0.1))

# First kind polynomials $\to$ first kind points
isapprox(ichebyshevtransform(f̌, Val(1)), exp.(p_1))

# Second kind points $\to$ first kind polynomials
p_2 = chebyshevpoints(Float64, n, Val(2))
f = map(exp, p_2)
f̌ = chebyshevtransform(f, Val(2))
f̃ = x -> begin
    t = acos(x)
    [cos(j * t) for j in 0:n-1]' * f̌
end
isapprox(f̃(0.1), exp(0.1))

# First kind polynomials $\to$ second kind points
isapprox(ichebyshevtransform(f̌, Val(2)), exp.(p_2))

# First kind points $\to$ second kind polynomials
p_1 = chebyshevpoints(Float64, n, Val(1))
f = map(exp, p_1)
f̌ = chebyshevutransform(f, Val(1))
f̃ = x -> begin
    t = acos(x)
    [sin((j + 1) * t) / sin(t) for j in 0:n-1]' * f̌
end
isapprox(f̃(0.1), exp(0.1))

# Second kind polynomials $\to$ first kind points
isapprox(ichebyshevutransform(f̌, Val(1)), exp.(p_1))

# Second kind points $\to$ second kind polynomials
p_2 = chebyshevpoints(Float64, n, Val(2))[2:n-1]
f = map(exp, p_2)
f̌ = chebyshevutransform(f, Val(2))
f̃ = x -> begin
    t = acos(x)
    [sin((j + 1) * t) / sin(t) for j in 0:n-3]' * f̌
end
isapprox(f̃(0.1), exp(0.1))

# Second kind polynomials $\to$ second kind points
isapprox(ichebyshevutransform(f̌, Val(2)), exp.(p_2))


================================================
FILE: examples/disk.jl
================================================
# # Holomorphic integration on the unit disk
# In this example, we explore integration of a harmonic function:
# ```math
#   f(x,y) = \frac{x^2-y^2+1}{(x^2-y^2+1)^2+(2xy+1)^2},
# ```
# over the unit disk. In this case, we know from complex analysis (the mean value
# property) that the integral of a harmonic function over the unit disk is equal to $\pi \times f(0,0)$.
# We analyze the function on an $N\times M$ tensor product grid defined by:
# ```math
# \begin{aligned}
# r_n & = \cos\left[(n+\tfrac{1}{2})\pi/2N\right],\quad{\rm for}\quad 0\le n < N,\quad{\rm and}\\
# \theta_m & = 2\pi m/M,\quad{\rm for}\quad 0\le m < M;
# \end{aligned}
# ```
# we convert the function samples to Chebyshev×Fourier coefficients using
# `plan_disk_analysis`; and finally, we transform the Chebyshev×Fourier
# coefficients to Zernike polynomial coefficients using `plan_disk2cxf`.
#
# For the storage pattern of the arrays, please consult the
# [documentation](https://MikaelSlevinsky.github.io/FastTransforms).

using FastTransforms, LinearAlgebra, Plots
# The figures are saved in the generated documentation directory; `mkpath` creates it
# (along with any missing parent directories) and does nothing if it already exists.
const GENFIGS = joinpath(pkgdir(FastTransforms), "docs/src/generated")
mkpath(GENFIGS)
plotlyjs()

# The function $f$ that we sample on the disk:
f = (x, y) -> (x^2 - y^2 + 1) / ((x^2 - y^2 + 1)^2 + (2x * y + 1)^2)

# The Zernike polynomial degree and the number of angular samples:
N = 15
M = 4N - 3

# The radial grid:
r = map(n -> sinpi((N - n - 0.5) / (2N)), 0:N-1)

# The angular grid (mod $\pi$):
θ = (0:M-1) * 2 / M

# On the mapped tensor product grid, the samples of $f$ are:
F = [f(rad * cospi(ang), rad * sinpi(ang)) for rad in r, ang in θ]

# We superpose a surface plot of $f$ on top of the grid:
X = [rad * cospi(ang) for rad in r, ang in θ]
Y = [rad * sinpi(ang) for rad in r, ang in θ]
scatter3d(vec(X), vec(Y), vec(0F); markersize = 0.75, markercolor = :red)
surface!(X, Y, F; legend = false, xlabel = "x", ylabel = "y", zlabel = "f")
savefig(joinpath(GENFIGS, "zernike.html"))
###```@raw html
###<object type="text/html" data="../zernike.html" style="width:100%;height:400px;"></object>
###```

# We precompute a (generalized) Zernike--Chebyshev×Fourier plan:
α, β = 0, 0
P = plan_disk2cxf(F, α, β)

# And an FFTW Chebyshev×Fourier analysis plan on the disk:
PA = plan_disk_analysis(F)

# The Zernike coefficients of $f$ are:
U = P \ (PA * F)

# The Zernike coefficients are useful for integration. The integral of $f(x,y)$
# over the disk should be $\pi/2$ by harmonicity. The coefficient of $Z_{0,0}$
# multiplied by `√π` is:
U[1, 1] * sqrt(π)

# Using an orthonormal basis, the integral of $[f(x,y)]^2$ over the disk is
# approximately the square of the 2-norm of the coefficients:
(norm(U)^2, π / (2 * sqrt(2)) * log1p(sqrt(2)))

# But there's more! Next, we repeat the experiment using the Dunkl-Xu
# orthonormal polynomials supported on the rectangularized disk.
N = 2N
M = N

# We analyze the function on an $N\times M$ mapped tensor product $xy$-grid defined by:
# ```math
# \begin{aligned}
# x_n & = \cos\left(\frac{2n+1}{2N}\pi\right) = \sin\left(\frac{N-2n-1}{2N}\pi\right),\quad {\rm for} \quad 0 \le n < N,\quad{\rm and}\\
# z_m & = \cos\left(\frac{2m+1}{2M}\pi\right) = \sin\left(\frac{M-2m-1}{2M}\pi\right),\quad {\rm for} \quad 0 \le m < M,\\
# y_{n,m} & = \sqrt{1-x_n^2}z_m.
# \end{aligned}
# ```
# Slightly more accuracy can be expected by using an auxiliary array:
# ```math
#   w_n = \sin\left(\frac{2n+1}{2N}\pi\right),\quad {\rm for} \quad 0 \le n < N,
# ```
# so that $y_{n,m} = w_nz_m$.
#
# The $x$ grid and the auxiliary array $w$:
w = map(n -> sinpi((n + 0.5) / N), 0:N-1)
x = map(n -> sinpi((N - 2n - 1) / (2N)), 0:N-1)

# The $z$ grid:
z = map(m -> sinpi((M - 2m - 1) / (2M)), 0:M-1)

# On the mapped tensor product grid, the samples of $f$ are:
F = [f(x[i], w[i] * zm) for i in 1:N, zm in z]

# We superpose a surface plot of $f$ on top of the grid:
X = [xn for xn in x, zm in z]
Y = [wn * zm for wn in w, zm in z]
scatter3d(vec(X), vec(Y), vec(0F); markersize = 0.75, markercolor = :green)
surface!(X, Y, F; legend = false, xlabel = "x", ylabel = "y", zlabel = "f")
savefig(joinpath(GENFIGS, "dunklxu.html"))
###```@raw html
###<object type="text/html" data="../dunklxu.html" style="width:100%;height:400px;"></object>
###```

# We precompute a Dunkl-Xu--Chebyshev plan:
P = plan_rectdisk2cheb(F, β)

# And an FFTW Chebyshev² analysis plan on the rectangularized disk:
PA = plan_rectdisk_analysis(F)

# The Dunkl-Xu coefficients of $f$ are:
U = P \ (PA * F)

# The Dunkl-Xu coefficients are useful for integration. The integral of $f(x,y)$
# over the disk should be $\pi/2$ by harmonicity. The coefficient of $P_{0,0}$
# multiplied by `√π` is:
U[1, 1] * sqrt(π)

# Using an orthonormal basis, the integral of $[f(x,y)]^2$ over the disk is
# approximately the square of the 2-norm of the coefficients:
(norm(U)^2, π / (2 * sqrt(2)) * log1p(sqrt(2)))


================================================
FILE: examples/halfrange.jl
================================================
# # Half-range Chebyshev polynomials
# In [this paper](https://doi.org/10.1137/090752456), [Daan Huybrechs](https://github.com/daanhb) introduced the so-called half-range Chebyshev polynomials
# as the semi-classical orthogonal polynomials with respect to the inner product:
# ```math
# \langle f, g \rangle = \int_0^1 f(x) g(x)\frac{{\rm d} x}{\sqrt{1-x^2}}.
# ```
# By the variable transformation $y = 2x-1$, the resulting polynomials can be related to
# orthogonal polynomials on $(-1,1)$ with the Jacobi weight $(1-y)^{-\frac{1}{2}}$ modified by the weight $(3+y)^{-\frac{1}{2}}$.
#
# We shall use the fact that:
# ```math
# \frac{1}{\sqrt{3+y}} = \sqrt{\frac{2}{3+\sqrt{8}}}\sum_{n=0}^\infty P_n(y) \left(\frac{-1}{3+\sqrt{8}}\right)^n,
# ```
# and results from [this paper](https://arxiv.org/abs/2302.08448) to consider the half-range Chebyshev polynomials as
# modifications of the Jacobi polynomials $P_n^{(-\frac{1}{2},0)}(y)$.

using FastTransforms, LinearAlgebra, Plots, LaTeXStrings
# The figures are saved in the generated documentation directory; `mkpath` creates it
# (along with any missing parent directories) and does nothing if it already exists.
const GENFIGS = joinpath(pkgdir(FastTransforms), "docs/src/generated")
mkpath(GENFIGS)
plotlyjs()

# We truncate the generating function so that the relative error is less than `eps()` in the uniform norm on $(-1,1)$:
z = -1 / (3 + sqrt(8))
K = sqrt(-2z)
N = ceil(Int, log(abs(z), eps() / 2 * (1 - abs(z)) / K) - 1)
d = K .* z .^ (0:N)

# Then, we convert this representation to the expansion in Jacobi polynomials $P_n^{(-\frac{1}{2}, 0)}(y)$:
u = jac2jac(d, 0.0, 0.0, -0.5, 0.0; norm1 = false, norm2 = true)

# Our working polynomial degree will be:
n = 100

# We compute the connection coefficients between the modified orthogonal polynomials and the Jacobi polynomials:
P = plan_modifiedjac2jac(Float64, n + 1, -0.5, 0.0, u)

# We store the connection to first kind Chebyshev polynomials:
P1 = plan_jac2cheb(Float64, n + 1, -0.5, 0.0; normjac = true)

# We compute the Chebyshev series for the degree-$k\le n$ modified polynomial and its values at the Chebyshev points:
q = function (k)
    e = zeros(n + 1)
    e[k+1] = 1.0
    return lmul!(P1, lmul!(P, e))
end
qvals = k -> ichebyshevtransform(q(k))

# With the symmetric Jacobi matrix for $P_n^{(-\frac{1}{2}, 0)}(y)$ and the modified plan, we may compute the modified Jacobi matrix and the corresponding roots (as eigenvalues):
XP = SymTridiagonal(
    [-inv((4j - 1) * (4j - 5)) for j in 1:n+1],
    [4j * (2j - 1) / (4j - 1) / sqrt((4j - 3) * (4j + 1)) for j in 1:n],
)
XQ = FastTransforms.modified_jacobi_matrix(P, XP)
SymTridiagonal(XQ.dv[1:10], XQ.ev[1:9])

# And we plot:
x = (chebyshevpoints(Float64, n + 1, Val(1)) .+ 1) ./ 2
p = plot(x, qvals(0);
         linewidth = 2.0, legend = false, xlim = (0, 1), xlabel = L"x",
         ylabel = L"T^h_n(x)", title = "Half-Range Chebyshev Polynomials and Their Roots",
         extra_plot_kwargs = KW(:include_mathjax => "cdn"))
for k in 1:10
    col = palette(:default)[k+1]
    λ = (eigvals(SymTridiagonal(XQ.dv[1:k], XQ.ev[1:k-1])) .+ 1) ./ 2
    plot!(x, qvals(k); linewidth = 2.0, color = col)
    scatter!(λ, zero(λ); markersize = 2.5, color = col)
end
p
savefig(joinpath(GENFIGS, "halfrange.html"))
###```@raw html
###<object type="text/html" data="../halfrange.html" style="width:100%;height:400px;"></object>
###```

# By [Theorem 2.20](https://arxiv.org/abs/2302.08448) it turns out that the *derivatives* of the half-range Chebyshev polynomials are a linear combination of at most two polynomials orthogonal with respect to $\sqrt{(3+y)(1-y)}(1+y)$ on $(-1,1)$. This fact enables us to compute the banded differentiation matrix:
v̂ = 3 * vcat(u, 0) + XP[1:N+2, 1:N+1] * u
v = jac2jac(v̂, -0.5, 0.0, 0.5, 1.0; norm1=true, norm2=true)
"""
    threshold!(A::AbstractArray, ϵ)

Set to zero, in place, every entry of `A` whose absolute value is strictly
less than `ϵ`, and return `A`.
"""
function threshold!(A::AbstractArray, ϵ)
    for idx in eachindex(A)
        abs(A[idx]) < ϵ && (A[idx] = 0)
    end
    return A
end
P′ = plan_modifiedjac2jac(Float64, n + 1, 0.5, 1.0, v)
DP = UpperTriangular(diagm(1 => [sqrt(j * (j + 1/2)) for j in 1:n])) # The classical differentiation matrix representing 𝒟 P^{(-1/2,0)}(y) = P^{(1/2,1)}(y) D_P.
DQ = UpperTriangular(threshold!(P′ \ (DP * (P * I)), 100eps())) # The semi-classical differentiation matrix representing 𝒟 Q(y) = Q̂(y) D_Q.
UpperTriangular(DQ[1:10, 1:10])


================================================
FILE: examples/nonlocaldiffusion.jl
================================================
# # Nonlocal diffusion on $\mathbb{S}^2$
# This example calculates the spectrum of the nonlocal diffusion operator:
# ```math
# \mathcal{L}_\delta u = \int_{\mathbb{S}^2} \rho_\delta(|\mathbf{x}-\mathbf{y}|)\left[u(\mathbf{x}) - u(\mathbf{y})\right] \,\mathrm{d}\Omega(\mathbf{y}),
# ```
# defined in Eq. (2) of
#
# R. M. Slevinsky, H. Montanelli, and Q. Du, [A spectral method for nonlocal diffusion operators on the sphere](https://doi.org/10.1016/j.jcp.2018.06.024), *J. Comp. Phys.*, **372**:893--911, 2018.
#
# In the above, $0<\delta<2$, $-1<\alpha<1$, and the kernel:
# ```math
# \rho_\delta(|\mathbf{x}-\mathbf{y}|) = \frac{4(1+\alpha)}{\pi \delta^{2+2\alpha}} \frac{\chi_{[0,\delta]}(|\mathbf{x}-\mathbf{y}|)}{|\mathbf{x}-\mathbf{y}|^{2-2\alpha}},
# ```
# where $\chi_I(\cdot)$ is the indicator function on the set $I$.
#
# This nonlocal operator is diagonalized by spherical harmonics:
# ```math
# \mathcal{L}_\delta Y_\ell^m(\mathbf{x}) = \lambda_\ell(\alpha, \delta) Y_\ell^m(\mathbf{x}),
# ```
# and its eigenvalues are given by the generalized Funk--Hecke formula:
# ```math
# \lambda_\ell(\alpha, \delta) = \frac{(1+\alpha) 2^{2+\alpha}}{\delta^{2+2\alpha}}\int_{1-\delta^2/2}^1 \left[P_\ell(t)-1\right] (1-t)^{\alpha-1} \,\mathrm{d} t.
# ```
# In the paper, the authors use Clenshaw--Curtis quadrature and asymptotic evaluation of Legendre polynomials to achieve $\mathcal{O}(n^2\log n)$ complexity for the evaluation of the first $n$ eigenvalues. With a change of basis, this complexity can be reduced to $\mathcal{O}(n\log n)$.
#
# First, we represent:
# ```math
# P_n(t) - 1 = \sum_{j=0}^{n-1} \left[P_{j+1}(t) - P_j(t)\right] = -\sum_{j=0}^{n-1} (1-t) P_j^{(1,0)}(t).
# ```
# Then, we represent $P_j^{(1,0)}(t)$ with Jacobi polynomials $P_i^{(\alpha,0)}(t)$ and we integrate using [DLMF 18.9.16](https://dlmf.nist.gov/18.9.16):
# ```math
# \int_x^1 P_i^{(\alpha,0)}(t)(1-t)^\alpha\,\mathrm{d}t = \left\{ \begin{array}{cc} \frac{(1-x)^{\alpha+1}}{\alpha+1} & \mathrm{for~}i=0,\\ \frac{1}{2i}(1-x)^{\alpha+1}(1+x)P_{i-1}^{(\alpha+1,1)}(x), & \mathrm{for~}i>0.\end{array}\right.
# ```
# The code below implements this algorithm, making use of the Jacobi--Jacobi transform `plan_jac2jac`.
# For numerical stability, the conversion from Jacobi polynomials $P_j^{(1,0)}(t)$ to $P_i^{(\alpha,0)}(t)$ is divided into conversion from $P_j^{(1,0)}(t)$ to $P_k^{(0,0)}(t)$, before conversion from $P_k^{(0,0)}(t)$ to $P_i^{(\alpha,0)}(t)$.

using FastTransforms, LinearAlgebra

"""
    oprec!(n::Integer, v::AbstractVector, alpha::Real, delta2::Real)

Fill the first `n` entries of `v` in place by a three-term recurrence whose
coefficients depend on `alpha` and `delta2`, and return `v`.

The first two entries are seeded with `1` and
`(4alpha + 8 - (alpha + 4)*delta2)/4`; each later entry is a linear
combination of the two entries before it. Nothing is written when `n ≤ 0`.
"""
function oprec!(n::Integer, v::AbstractVector, alpha::Real, delta2::Real)
    n > 0 && (v[1] = 1)
    n > 1 && (v[2] = (4*alpha+8-(alpha+4)*delta2)/4)
    for i in 1:n-2
        # Coefficient pieces, evaluated in the same order as one long expression.
        a = ((2*i+alpha+2)*(2*i+alpha+4)+alpha*(alpha+2))/(2*(i+1)*(2*i+alpha+2))*(2*i+alpha+3)/(i+alpha+3)
        b = delta2/4*(2*i+alpha+3)/(i+1)*(2*i+alpha+4)/(i+alpha+3)
        c = (i+alpha+1)/(i+alpha+3)*(2*i+alpha+4)/(2*i+alpha+2)
        v[i+2] = (a - b)*v[i+1] - c*v[i]
    end
    return v
end

"""
    evaluate_lambda(n::Integer, alpha::T, delta::T) where T

Return a `Vector{T}` of length `n` holding the eigenvalues
``\\lambda_\\ell(\\alpha, \\delta)`` described in the text above.

The vector is seeded with `0` and `-2`, its tail is filled by [`oprec!`](@ref)
and rescaled, the adjoint of a Jacobi--Jacobi plan is applied in place to
entries `2:n`, and two running recurrences over the entries finish the
computation.
"""
function evaluate_lambda(n::Integer, alpha::T, delta::T) where T
    delta2 = delta*delta
    scl = (1+alpha)*(2-delta2/2)

    lambda = Vector{T}(undef, n)
    n > 0 && (lambda[1] = 0)
    n > 1 && (lambda[2] = -2)

    # Entries 3:n come from the three-term recurrence, then get rescaled.
    oprec!(n-2, @view(lambda[3:n]), alpha, delta2)
    for j in 3:n
        lambda[j] *= -scl/(j-2)
    end

    # Apply the adjoint of the (0,0) -> (alpha,0) Jacobi connection in place.
    p = plan_jac2jac(T, n-1, zero(T), zero(T), alpha, zero(T))
    lmul!(adjoint(p), @view(lambda[2:n]))

    # Both sweeps are sequential: each entry uses the updated previous entry.
    for j in 3:n
        lambda[j] = ((2j-3)*lambda[j] + (j-2)*lambda[j-1])/(j-1)
    end
    for j in 3:n
        lambda[j] += lambda[j-1]
    end

    return lambda
end

# The first ten eigenvalues in `Float64`:
lambda = evaluate_lambda(10, -0.5, 1.0)

# The same eigenvalues in `BigFloat`:
lambdabf = evaluate_lambda(10, parse(BigFloat, "-0.5"), parse(BigFloat, "1.0"))

# The relative error in the $\infty$-norm:
norm(lambda - lambdabf, Inf) / norm(lambda, Inf)


================================================
FILE: examples/padua.jl
================================================
# # Padua transform
# This demonstrates the Padua transform and inverse transform,
# explaining precisely the normalization and points.

using FastTransforms

# We define the Padua points and extract their Cartesian components:
N = 15
pts = paduapoints(N)
x = pts[:, 1]
y = pts[:, 2];

# We take the Padua transform of the function:
f = (x, y) -> exp(x + cos(y))
f̌ = paduatransform(map(f, x, y));

# and use the coefficients to create an approximation to the function $f$:
f̃ = function (x, y)
    tx, ty = acos(x), acos(y)
    acc = 0.0
    idx = 0
    for n in 0:N
        for k in 0:n
            idx += 1
            acc += f̌[idx]*cos((n-k)*tx) * cos(k*ty)
        end
    end
    return acc
end

# At a particular point, is the function well-approximated?
isapprox(f̃(0.1, 0.2), f(0.1, 0.2))

# Does the inverse transform bring us back to the grid?
isapprox(ipaduatransform(f̌), map(f̃, x, y))


================================================
FILE: examples/sphere.jl
================================================
# # Spherical harmonic addition theorem
# This example confirms numerically that
# ```math
# f(z) = \frac{P_n(z\cdot y) - P_n(x\cdot y)}{z\cdot y - x\cdot y},
# ```
# is actually a degree-$(n-1)$ polynomial on $\mathbb{S}^2$, where $P_n$ is the degree-$n$
# Legendre polynomial, and $x,y,z \in \mathbb{S}^2$.
# To verify, we sample the function on a $N\times M$ equiangular grid
# defined by:
# ```math
# \begin{aligned}
# \theta_n & = (n+\tfrac{1}{2})\pi/N,\quad{\rm for}\quad 0\le n < N,\quad{\rm and}\\
# \varphi_m & = 2\pi m/M,\quad{\rm for}\quad 0\le m < M;
# \end{aligned}
# ```
# we convert the function samples to Fourier coefficients using
# `plan_sph_analysis`; and finally, we transform
# the Fourier coefficients to spherical harmonic coefficients using
# `plan_sph2fourier`.
#
# In the basis of spherical harmonics, it is plain to see the
# addition theorem in action, since $P_n(x\cdot y)$ should only consist of
# exact-degree-$n$ harmonics.
#
# For the storage pattern of the arrays, please consult the
# [documentation](https://MikaelSlevinsky.github.io/FastTransforms).

"""
    threshold!(A::AbstractArray, ϵ)

Replace by zero, in place, each entry of `A` with magnitude strictly below
`ϵ`; entries with `abs(A[i]) ≥ ϵ` are left alone. Return `A`.
"""
function threshold!(A::AbstractArray, ϵ)
    for k in eachindex(A)
        if abs(A[k]) < ϵ
            A[k] = 0
        end
    end
    return A
end

using FastTransforms, LinearAlgebra, Plots
# The figures are saved in the generated documentation directory; `mkpath` creates it
# (along with any missing parent directories) and does nothing if it already exists.
const GENFIGS = joinpath(pkgdir(FastTransforms), "docs/src/generated")
mkpath(GENFIGS)
plotlyjs()

# The colatitudinal grid (mod $\pi$):
N = 15
θ = (0.5:N-0.5) / N

# The longitudinal grid (mod $\pi$):
M = 2N - 1
φ = (0:M-1) * 2 / M

# Arbitrarily, we place $x$ at the North pole:
x = [0, 0, 1]

# Another vector is completely free:
y = normalize([.123, .456, .789])

# Thus $z \in \mathbb{S}^2$ is our variable vector, parameterized in spherical coordinates:
z = (θ, φ) -> [sinpi(θ) * cospi(φ), sinpi(θ) * sinpi(φ), cospi(θ)]

# On the tensor product grid, the Legendre polynomial $P_n(z\cdot y)$ is:
A = [(2j + 1) / (j + 1) for j in 0:N-1]
B = zeros(N)
C = [j / (j + 1) for j in 0:N]
c = zeros(N)
c[N] = 1
pts = vec([z(col, lon) ⋅ y for col in θ, lon in φ])
phi0 = ones(N * M)
F = reshape(FastTransforms.clenshaw!(zeros(N * M), c, A, B, C, pts, phi0), N, M)

# We superpose a surface plot of $f$ on top of the grid:
X = [sinpi(col) * cospi(lon) for col in θ, lon in φ]
Y = [sinpi(col) * sinpi(lon) for col in θ, lon in φ]
Z = [cospi(col) for col in θ, lon in φ]
scatter3d(vec(X), vec(Y), vec(Z); markersize = 1.25, markercolor = :violetred)
surface!(X, Y, Z; surfacecolor = F, legend = false, xlabel = "x", ylabel = "y", zlabel = "f")
savefig(joinpath(GENFIGS, "sphere1.html"))
###```@raw html
###<object type="text/html" data="../sphere1.html" style="width:100%;height:400px;"></object>
###```

# We show the cut in the surface to help illustrate the definition of the grid.
# In particular, we do not sample the poles.
#
# We precompute a spherical harmonic--Fourier plan:
P = plan_sph2fourier(F)

# And an FFTW Fourier analysis plan on $\mathbb{S}^2$:
PA = plan_sph_analysis(F)

# Its spherical harmonic coefficients demonstrate that it is exact-degree-$n$:
V = PA * F
U = threshold!(P \ V, 400 * eps())

# The $L^2(\mathbb{S}^2)$ norm of the function is:
nrm1 = norm(U)

# Similarly, on the tensor product grid, our function samples are:
Pnxy = FastTransforms.clenshaw!([0.0], c, A, B, C, [x ⋅ y], [1.0])[1]
F = [(F[i, j] - Pnxy) / (z(θ[i], φ[j]) ⋅ y - x ⋅ y) for i in 1:N, j in 1:M]

# We superpose a surface plot of $f$ on top of the grid:
scatter3d(vec(X), vec(Y), vec(Z); markersize = 1.25, markercolor = :violetred)
surface!(X, Y, Z; surfacecolor = F, legend = false, xlabel = "x", ylabel = "y", zlabel = "f")
savefig(joinpath(GENFIGS, "sphere2.html"))
###```@raw html
###<object type="text/html" data="../sphere2.html" style="width:100%;height:400px;"></object>
###```

# Its spherical harmonic coefficients demonstrate that it is degree-$(n-1)$:
V = PA * F
U = threshold!(P \ V, 400 * eps())

# Finally, the Legendre polynomial $P_n(z\cdot x)$ is aligned with the grid:
pts = vec([z(col, lon) ⋅ x for col in θ, lon in φ])
F = reshape(FastTransforms.clenshaw!(zeros(N * M), c, A, B, C, pts, phi0), N, M)

# We superpose a surface plot of $f$ on top of the grid:
scatter3d(vec(X), vec(Y), vec(Z); markersize = 1.25, markercolor = :violetred)
surface!(X, Y, Z; surfacecolor = F, legend = false, xlabel = "x", ylabel = "y", zlabel = "f")
savefig(joinpath(GENFIGS, "sphere3.html"))
###```@raw html
###<object type="text/html" data="../sphere3.html" style="width:100%;height:400px;"></object>
###```

# It only has one nonnegligible spherical harmonic coefficient.
# Can you spot it?
V = PA * F
U = threshold!(P \ V, 400 * eps())

# That nonnegligible coefficient should be
ret = "√(2π/($(N-1)+1/2))"

# which is approximately
eval(Meta.parse(ret))

# since the convention in this library is to orthonormalize.
nrm2 = norm(U)

# Note that the integrals of both functions $P_n(z\cdot y)$ and $P_n(z\cdot x)$ and their
# $L^2(\mathbb{S}^2)$ norms are the same because of rotational invariance. The integral of
# either is perhaps not interesting as it is mathematically zero, but the norms
# of either should be approximately the same.
isapprox(nrm1, nrm2)


================================================
FILE: examples/sphericalisometries.jl
================================================
# Zero out, in place, every entry of `A` whose absolute value is strictly
# smaller than `ϵ`; return the (mutated) array `A`.
function threshold!(A::AbstractArray, ϵ)
    for j in eachindex(A)
        abs(A[j]) < ϵ || continue
        A[j] = 0
    end
    return A
end

using FastTransforms, LinearAlgebra, Random, Test

# The colatitudinal grid (mod π):
N = 10
θ = (0.5:N-0.5) / N

# The longitudinal grid (mod π):
M = 2N - 1
φ = (0:M-1) * 2 / M

# Cartesian coordinates of the grid points on the unit sphere:
x = [cospi(lon) * sinpi(col) for col in θ, lon in φ]
y = [sinpi(lon) * sinpi(col) for col in θ, lon in φ]
z = [cospi(col) for col in θ, lon in φ]

# Synthesis/analysis plans and the isometry plan:
P = plan_sph2fourier(Float64, N)
PA = plan_sph_analysis(Float64, N, M)
J = FastTransforms.plan_sph_isometry(Float64, N)


# The test function, a polynomial in the Cartesian coordinates:
f = (x, y, z) -> x^2 + y^4 + x^2*y*z^3 - x*y*z^2


# Exchange the y and z axes on the coefficients and compare against the
# coefficients of the function sampled with transformed arguments.
F = map(f, x, y, z)
V = PA * F
U = threshold!(P \ V, 100eps())
FastTransforms.execute_sph_yz_axis_exchange!(J, U)
FR = map(f, x, -z, -y)
VR = PA * FR
UR = threshold!(P \ VR, 100eps())
@test isapprox(U, UR)
norm(U - UR)


α, β, γ = 0.123, 0.456, 0.789

# Isometry built up from ZYZR
A = [cos(α) -sin(α) 0
     sin(α) cos(α) 0
     0 0 1]
B = [cos(β) 0 -sin(β)
     0 1 0
     sin(β) 0 cos(β)]
C = [cos(γ) -sin(γ) 0
     sin(γ) cos(γ) 0
     0 0 1]
R = diagm([1, 1, 1.0])
Q = A * B * C * R

# Transform the sampling grid. Note that `Q` is transposed here:
# column `j` of `Q` supplies the weights of the `j`-th new coordinate.
u, v, w = ntuple(j -> Q[1,j]*x + Q[2,j]*y + Q[3,j]*z, 3)

# Rotate the coefficients and compare with the coefficients of the
# function sampled on the transformed grid.
F = map(f, x, y, z)
V = PA * F
U = threshold!(P \ V, 100eps())
FastTransforms.execute_sph_rotation!(J, α, β, γ, U)
FR = map(f, u, v, w)
VR = PA * FR
UR = threshold!(P \ VR, 100eps())
@test isapprox(U, UR)
norm(U - UR)


# Polar reflection: compare the reflected coefficients with the
# coefficients of the function sampled with `z` negated.
F = map(f, x, y, z)
V = PA * F
U = threshold!(P \ V, 100eps())
FastTransforms.execute_sph_polar_reflection!(U)
FR = map(f, x, y, -z)
VR = PA * FR
UR = threshold!(P \ VR, 100eps())
@test isapprox(U, UR)
norm(U - UR)


# Isometry built up from planar reflection
W = [0.123, 0.456, 0.789]
H = n -> I - 2/(n'n)*n*n'
Q = H(W)

# Transform the sampling grid. Note that `Q` is transposed here:
# column `j` of `Q` supplies the weights of the `j`-th new coordinate.
u, v, w = ntuple(j -> Q[1,j]*x + Q[2,j]*y + Q[3,j]*z, 3)

# Reflection specified by a vector:
F = map(f, x, y, z)
V = PA * F
U = threshold!(P \ V, 100eps())
FastTransforms.execute_sph_reflection!(J, W, U)
FR = map(f, u, v, w)
VR = PA * FR
UR = threshold!(P \ VR, 100eps())
@test isapprox(U, UR)
norm(U - UR)

# The same reflection specified by a tuple of components:
F = map(f, x, y, z)
V = PA * F
U = threshold!(P \ V, 100eps())
FastTransforms.execute_sph_reflection!(J, (W[1], W[2], W[3]), U)
FR = map(f, u, v, w)
VR = PA * FR
UR = threshold!(P \ VR, 100eps())
@test isapprox(U, UR)
norm(U - UR)

# Random orthogonal transformation
Random.seed!(0)
Q = qr(rand(3, 3)).Q

# Transform the sampling grid. Note that `Q` is transposed here:
# column `j` of `Q` supplies the weights of the `j`-th new coordinate.
u, v, w = ntuple(j -> Q[1,j]*x + Q[2,j]*y + Q[3,j]*z, 3)

# Apply the orthogonal transformation to the coefficients and compare with
# the coefficients of the function sampled on the transformed grid.
F = map(f, x, y, z)
V = PA * F
U = threshold!(P \ V, 100eps())
FastTransforms.execute_sph_orthogonal_transformation!(J, Q, U)
FR = map(f, u, v, w)
VR = PA * FR
UR = threshold!(P \ VR, 100eps())
@test isapprox(U, UR)
norm(U - UR)


================================================
FILE: examples/spinweighted.jl
================================================
# # Spin-weighted spherical harmonics
# This example plays with analysis of:
# ```math
# f(r) = e^{{\rm i} k\cdot r},
# ```
# for some $k\in\mathbb{R}^3$ and where $r\in\mathbb{S}^2$, using spin-$0$ spherical harmonics.
#
# It applies ð, the spin-raising operator,
# both on the spin-$0$ coefficients as well as the original function,
# followed by a spin-$1$ analysis to compare coefficients.
#
# For the storage pattern of the arrays, please consult the
# [documentation](https://MikaelSlevinsky.github.io/FastTransforms).

using FastTransforms, LinearAlgebra

# The colatitudinal grid (mod $\pi$):
N = 10
θ = (0.5:N-0.5) / N

# The longitudinal grid (mod $\pi$):
M = 2N - 1
φ = (0:M-1) * 2 / M

# Our choice of $k$ and the angular parametrization of $r$:
k = [2/7, 3/7, 6/7]
r = (θ, φ) -> [sinpi(θ) * cospi(φ), sinpi(θ) * sinpi(φ), cospi(θ)]

# On the tensor product grid, our function samples are:
F = [exp(im * (k ⋅ r(col, lon))) for col in θ, lon in φ]

# We precompute a spin-$0$ spherical harmonic--Fourier plan:
P = plan_spinsph2fourier(F, 0)

# And an FFTW Fourier analysis plan on $\mathbb{S}^2$:
PA = plan_spinsph_analysis(F, 0)

# Its spin-$0$ spherical harmonic coefficients are:
U⁰ = P \ (PA * F)

# We can check its $L^2(\mathbb{S}^2)$ norm against an exact result:
isapprox(norm(U⁰), sqrt(4π))

# Spin can be incremented by applying ð, either on the spin-$0$ coefficients:
U¹c = zero(U⁰)
for row in 1:N-1
    U¹c[row, 1] = sqrt(row * (row + 1)) * U⁰[row+1, 1]
end
for m in 1:M÷2, n in 0:N-1
    s = sqrt((n + m) * (n + m + 1))
    U¹c[n+1, 2m] = -s * U⁰[n+1, 2m]
    U¹c[n+1, 2m+1] = s * U⁰[n+1, 2m+1]
end

# or on the original function through analysis with spin-$1$ spherical harmonics:
F = [-(k[1]*(im*cospi(col)*cospi(lon) + sinpi(lon)) + k[2]*(im*cospi(col)*sinpi(lon)-cospi(lon)) - im*k[3]*sinpi(col))*exp(im*(k⋅r(col,lon))) for col in θ, lon in φ]

# We change plans with spin-$1$ now and reanalyze:
P = plan_spinsph2fourier(F, 1)
PA = plan_spinsph_analysis(F, 1)
U¹s = P \ (PA * F)

# Finally, we check $L^2(\mathbb{S}^2)$ norms against another exact result:
norm(U¹c) ≈ norm(U¹s) ≈ sqrt(8π/3 * (k ⋅ k))


================================================
FILE: examples/subspaceangles.jl
================================================
# # Subspace angles
# This example considers the angles between neighbouring Laguerre polynomials with a perturbed measure:
# ```math
# \cos\theta_n = \frac{\langle L_n, L_{n+k}\rangle}{\|L_n\|_2 \|L_{n+k}\|_2},\quad{\rm for}\quad 0\le n < N-k,
# ```
# where the inner product is defined by $\langle f, g\rangle = \int_0^\infty f(x) g(x) x^\beta e^{-x}{\rm\,d}x$.
#
# We do so by connecting Laguerre polynomials to the normalized generalized Laguerre polynomials associated with the perturbed measure. It follows by the inner product of the connection coefficients that:
# ```math
# \cos\theta_n = \frac{(V^\top V)_{n, n+k}}{\sqrt{(V^\top V)_{n, n}(V^\top V)_{n+k, n+k}}}.
# ```
#
using FastTransforms, LinearAlgebra

# The neighbouring index `k` and the maximum degree `N-1`:
k, N = 1, 11

# The Laguerre connection parameters:
α, β = 0.0, 0.125

# We precompute a Laguerre--Laguerre plan:
P = plan_lag2lag(Float64, N, α, β; norm2 = true)

# We apply the plan to the identity, followed by the adjoint plan:
VtV = parent(P * I)
lmul!(P', VtV)

# The angles are then recovered from the entries of this matrix:
θ = map(i -> acos(VtV[i, i+k] / sqrt(VtV[i, i] * VtV[i+k, i+k])), 1:N-k)


================================================
FILE: examples/triangle.jl
================================================
# # Calculus on the reference triangle
# In this example, we sample a bivariate function:
# ```math
# f(x,y) = \frac{1}{1+x^2+y^2},
# ```
# on the reference triangle with vertices $(0,0)$, $(0,1)$, and $(1,0)$ and analyze it
# in a Proriol series. Then, we find Proriol series for each component of its
# gradient by term-by-term differentiation of our expansion, and we compare them
# with the true Proriol series by sampling an exact expression for the gradient.
#
# We analyze $f(x,y)$ on an $N\times M$ mapped tensor product grid defined by:
# ```math
# \begin{aligned}
# x & = (1+u)/2,\quad{\rm and}\quad y = (1-u)(1+v)/4,\quad {\rm where:}\\
# u_n & = \cos\left[(n+\tfrac{1}{2})\pi/N\right],\quad{\rm for}\quad 0\le n < N,\quad{\rm and}\\
# v_m & = \cos\left[(m+\tfrac{1}{2})\pi/M\right],\quad{\rm for}\quad 0\le m < M;
# \end{aligned}
# ```
# we convert the function samples to mapped Chebyshev² coefficients using
# `plan_tri_analysis`; and finally, we transform the mapped Chebyshev²
# coefficients to Proriol coefficients using `plan_tri2cheb`.
#
# For the storage pattern of the arrays, please consult the
# [documentation](https://MikaelSlevinsky.github.io/FastTransforms).

using FastTransforms, LinearAlgebra, Plots
const GENFIGS = joinpath(pkgdir(FastTransforms), "docs/src/generated")
!isdir(GENFIGS) && mkdir(GENFIGS)
plotlyjs()

# Our function $f$ and the Cartesian components of its gradient:
f = (x,y) -> 1/(1+x^2+y^2)
fx = (x,y) -> -2x/(1+x^2+y^2)^2
fy = (x,y) -> -2y/(1+x^2+y^2)^2

# The polynomial degree:
N = 15
M = N

# The parameters of the Proriol series:
α, β, γ = 0, 0, 0

# The $u$ grid:
u = [sinpi((N-2n-1)/(2N)) for n in 0:N-1]

# And the $v$ grid:
v = [sinpi((M-2m-1)/(2M)) for m in 0:M-1]

# Instead of using the $u\times v$ grid, we use one with more accuracy near the origin.
# Defining $x$ by:
x = [sinpi((2N-2n-1)/(4N))^2 for n in 0:N-1]

# And $w$ by:
w = [sinpi((2M-2m-1)/(4M))^2 for m in 0:M-1]

# We see how the two grids are related:
((1 .+ u)./2 ≈ x) * ((1 .- u).*(1 .+ v')/4 ≈ reverse(x).*w')

# On the mapped tensor product grid, our function samples are:
F = [f(x[n+1], x[N-n]*w[m+1]) for n in 0:N-1, m in 0:M-1]

# We superpose a surface plot of $f$ on top of the grid:
X = [x for x in x, w in w]
Y = [x[N-n]*w[m+1] for n in 0:N-1, m in 0:M-1]
scatter3d(vec(X), vec(Y), vec(0F); markersize=0.75, markercolor=:blue)
surface!(X, Y, F; legend=false, xlabel="x", ylabel="y", zlabel="f")
savefig(joinpath(GENFIGS, "proriol.html"))
###```@raw html
###<object type="text/html" data="../proriol.html" style="width:100%;height:400px;"></object>
###```

# We precompute a Proriol--Chebyshev² plan:
P = plan_tri2cheb(F, α, β, γ)

# And an FFTW Chebyshev² plan on the triangle:
PA = plan_tri_analysis(F)

# Its Proriol-$(α,β,γ)$ coefficients are:
U = P\(PA*F)

# Similarly, our function's gradient samples are:
Fx = [fx(x[n+1], x[N-n]*w[m+1]) for n in 0:N-1, m in 0:M-1]

# and:
Fy = [fy(x[n+1], x[N-n]*w[m+1]) for n in 0:N-1, m in 0:M-1]

# For the partial derivative with respect to $x$, [Olver et al.](https://doi.org/10.1137/19M1245888)
# derive simple expressions for the representation of this component
# using a Proriol-$(α+1,β,γ+1)$ series.
## Term-by-term x-derivative coefficients built from the expansion U. The loops
## stop at N-2 and M-2, so the last row and column of Gx keep their zero fill.
Gx = zeros(Float64, N, M)
for m = 0:M-2
    for n = 0:N-2
        ## cf1 multiplies U[n+2, m+1]; the m == 0 case uses a shorter product.
        cf1 = m == 0 ? sqrt((n+1)*(n+2m+α+β+γ+3)/(2m+β+γ+2)*(m+γ+1)*8) : sqrt((n+1)*(n+2m+α+β+γ+3)/(2m+β+γ+1)*(m+β+γ+1)/(2m+β+γ+2)*(m+γ+1)*8)
        ## cf2 multiplies U[n+1, m+2].
        cf2 = sqrt((n+α+1)*(m+1)/(2m+β+γ+2)*(m+β+1)/(2m+β+γ+3)*(n+2m+β+γ+3)*8)
        Gx[n+1, m+1] = cf1*U[n+2, m+1] + cf2*U[n+1, m+2]
    end
end
## Reference coefficients: transform the sampled derivative Fx with shifted parameters.
Px = plan_tri2cheb(Fx, α+1, β, γ+1)
Ux = Px\(PA*Fx)

# For the partial derivative with respect to $y$, the analogous formulae result
# in a Proriol-$(α,β+1,γ+1)$ series.
## Term-by-term y-derivative coefficients: each entry uses only U[n+1, m+2].
## The last row and column of Gy keep their zero fill.
Gy = zeros(Float64, N, M)
for m = 0:M-2
    for n = 0:N-2
        Gy[n+1, m+1] = 4*sqrt((m+1)*(m+β+γ+2))*U[n+1, m+2]
    end
end
## Reference coefficients: transform the sampled derivative Fy with shifted parameters.
Py = plan_tri2cheb(Fy, α, β+1, γ+1)
Uy = Py\(PA*Fy)

# The $2$-norm relative error in differentiating the Proriol series
# for $f(x,y)$ term-by-term and its sampled gradient is:
hypot(norm(Ux-Gx), norm(Uy-Gy))/hypot(norm(Ux), norm(Uy))

# This error can be improved upon by increasing $N$ and $M$.


================================================
FILE: src/FastTransforms.jl
================================================
module FastTransforms

using ArrayLayouts, BandedMatrices, FastGaussQuadrature, FillArrays, LazyArrays, LinearAlgebra,
      SpecialFunctions, ToeplitzMatrices, RecurrenceRelationships

using AbstractFFTs
using FFTW
using GenericFFT

import Base: convert, unsafe_convert, eltype, ndims, adjoint, transpose, show,
             *, \, inv, length, size, view, getindex, tail, OneTo

import Base.GMP: Limb

import AbstractFFTs: Plan, ScaledPlan,
                     fft, ifft, bfft, fft!, ifft!, bfft!, rfft, irfft, brfft,
                     plan_fft, plan_ifft, plan_bfft, plan_fft!, plan_ifft!,
                     plan_bfft!, plan_rfft, plan_irfft, plan_brfft,
                     fftshift, ifftshift, rfft_output_size, brfft_output_size,
                     normalization

import ArrayLayouts: rowsupport, colsupport, LayoutMatrix, MemoryLayout, AbstractBandedLayout

import BandedMatrices: bandwidths, BandedLayout

import FFTW: dct, dct!, idct, idct!, plan_dct!, plan_idct!,
             plan_dct, plan_idct, fftwNumber

import FastGaussQuadrature: unweightedgausshermite

import FillArrays: AbstractFill, getindex_value

import LinearAlgebra: cholesky, issymmetric, isposdef, mul!, lmul!, ldiv!

import GenericFFT: interlace # imported in downstream packages

import RecurrenceRelationships: check_clenshaw_recurrences


export leg2cheb, cheb2leg, ultra2ultra, jac2jac,
       lag2lag, jac2ultra, ultra2jac, jac2cheb,
       cheb2jac, ultra2cheb, cheb2ultra, associatedjac2jac,
       modifiedjac2jac, modifiedlag2lag, modifiedherm2herm,
       sph2fourier, sphv2fourier, disk2cxf, ann2cxf, rectdisk2cheb,
       tri2cheb, tet2cheb,fourier2sph, fourier2sphv, cxf2disk, cxf2ann,
       cheb2rectdisk, cheb2tri, cheb2tet

export plan_leg2cheb, plan_cheb2leg, plan_ultra2ultra, plan_jac2jac,
       plan_lag2lag, plan_jac2ultra, plan_ultra2jac, plan_jac2cheb,
       plan_cheb2jac, plan_ultra2cheb, plan_cheb2ultra, plan_associatedjac2jac,
       plan_modifiedjac2jac, plan_modifiedlag2lag, plan_modifiedherm2herm,
       plan_sph2fourier, plan_sph_synthesis, plan_sph_analysis,
       plan_sphv2fourier, plan_sphv_synthesis, plan_sphv_analysis,
       plan_disk2cxf, plan_disk_synthesis, plan_disk_analysis,
       plan_ann2cxf, plan_annulus_synthesis, plan_annulus_analysis,
       plan_rectdisk2cheb, plan_rectdisk_synthesis, plan_rectdisk_analysis,
       plan_tri2cheb, plan_tri_synthesis, plan_tri_analysis,
       plan_tet2cheb, plan_tet_synthesis, plan_tet_analysis,
       plan_spinsph2fourier, plan_spinsph_synthesis, plan_spinsph_analysis


include("libfasttransforms.jl")
include("elliptic.jl")

export nufft, nufft1, nufft2, nufft3, inufft1, inufft2

export plan_nufft, plan_nufft1, plan_nufft2, plan_nufft3,
       plan_inufft1, plan_inufft2

include("nufft.jl")
include("inufft.jl")

export paduatransform, ipaduatransform, paduatransform!, ipaduatransform!,
       paduapoints

export plan_paduatransform!, plan_ipaduatransform!

include("PaduaTransform.jl")

export chebyshevtransform, ichebyshevtransform,
       chebyshevtransform!, ichebyshevtransform!,
       chebyshevutransform, ichebyshevutransform,
       chebyshevutransform!, ichebyshevutransform!, chebyshevpoints

export plan_chebyshevtransform, plan_ichebyshevtransform,
       plan_chebyshevtransform!, plan_ichebyshevtransform!,
       plan_chebyshevutransform, plan_ichebyshevutransform,
       plan_chebyshevutransform!, plan_ichebyshevutransform!

include("chebyshevtransform.jl")

export clenshawcurtisnodes, clenshawcurtisweights, fejernodes1, fejerweights1,
       fejernodes2, fejerweights2

export plan_clenshawcurtis, plan_fejer1, plan_fejer2

include("clenshawcurtis.jl")
include("fejer.jl")

export gaunt

include("gaunt.jl")

export GramMatrix, ChebyshevGramMatrix

include("GramMatrix.jl")

export weightedhermitetransform, iweightedhermitetransform

include("hermite.jl")

export sphones, sphzeros, sphrand, sphrandn, sphevaluate,
       sphvones, sphvzeros, sphvrand, sphvrandn,
       diskones, diskzeros, diskrand, diskrandn,
       rectdiskones, rectdiskzeros, rectdiskrand, rectdiskrandn,
       triones, trizeros, trirand, trirandn, trievaluate,
       tetones, tetzeros, tetrand, tetrandn,
       spinsphones, spinsphzeros, spinsphrand, spinsphrandn

include("specialfunctions.jl")

include("toeplitzplans.jl")
include("toeplitzhankel.jl")

export ToeplitzPlusHankel

include("ToeplitzPlusHankel.jl")

# following use libfasttransforms by default
# For every transform name `f` listed below, define a method `f(x::AbstractArray, ...)`
# that forwards all positional and keyword arguments to the function named `lib_f`.
for f in (:jac2jac,
    :lag2lag, :jac2ultra, :ultra2jac, :jac2cheb,
    :cheb2jac, :ultra2cheb, :cheb2ultra, :associatedjac2jac,
    :modifiedjac2jac, :modifiedlag2lag, :modifiedherm2herm,
    :sph2fourier, :sphv2fourier, :disk2cxf, :ann2cxf,
    :rectdisk2cheb, :tri2cheb, :tet2cheb,
    :leg2cheb, :cheb2leg, :ultra2ultra)
    # Name of the implementation to forward to, e.g. :lib_jac2jac.
    lib_f = Symbol("lib_", f)
    @eval $f(x::AbstractArray, y...; z...) = $lib_f(x, y...; z...)
end

include("arrays.jl")
# following use Toeplitz-Hankel to avoid expensive plans
# for f in (:leg2cheb, :cheb2leg, :ultra2ultra)
#     th_f = Symbol("th_", f)
#     lib_f = Symbol("lib_", f)
#     @eval begin
#         $f(x::AbstractArray, y...; z...) = $th_f(x, y...; z...)
#         # $f(x::AbstractArray, y...; z...) = $lib_f(x, y...; z...)
#     end
# end

include("docstrings.jl")

end # module


================================================
FILE: src/GramMatrix.jl
================================================
# Common supertype of the Gram matrix types defined in this file.
abstract type AbstractGramMatrix{T} <: LayoutMatrix{T} end

# Every AbstractGramMatrix reports itself as symmetric and positive definite
# without inspecting its entries.
@inline issymmetric(G::AbstractGramMatrix) = true
@inline isposdef(G::AbstractGramMatrix) = true

# Gram matrix data `W` paired with the tridiagonal matrix `X`.
# The inner constructor rejects inputs whose sizes differ, a nonsymmetric `W`,
# or an `X` whose bandwidths are not (1, 1).
struct GramMatrix{T, WT <: AbstractMatrix{T}, XT <: AbstractMatrix{T}} <: AbstractGramMatrix{T}
    W::WT
    X::XT
    function GramMatrix{T, WT, XT}(W::WT, X::XT) where {T, WT, XT}
        size(W) == size(X) ||
            throw(ArgumentError("Cannot construct a GramMatrix with W and X of different sizes."))
        issymmetric(W) ||
            throw(ArgumentError("Cannot construct a GramMatrix with a nonsymmetric W."))
        bandwidths(X) == (1, 1) ||
            throw(ArgumentError("Cannot construct a GramMatrix with a nontridiagonal X."))
        return new{T, WT, XT}(W, X)
    end
end

"""
    GramMatrix(W::AbstractMatrix, X::AbstractMatrix)

Construct a symmetric positive-definite Gram matrix with data stored in ``W``.
Given a family of orthogonal polynomials ``𝐏(x) = {p₀(x), p₁(x),…}``
and a continuous inner product ``⟨f, g⟩``, the Gram matrix is defined by:
```math
W[i, j] = ⟨p_{i-1}, p_{j-1}⟩.
```
Moreover, given ``X``, the transposed Jacobi matrix that satisfies ``x 𝐏(x) = 𝐏(x) X``,
the Gram matrix satisfies the skew-symmetric rank-2 displacement equation (``X = X[1:n, 1:n]``):
```math
XᵀW - WX = GJGᵀ,
```
where ``J = [0 1; -1 0]`` and where:
```math
G[:, 1] = 𝐞_n, \\quad  G[:, 2] = W[:, n-1]X[n-1, n] + W[:, n]X[n, n] - Xᵀ W[:, n].
```
Fast (``O(n^2)``) Cholesky factorization of the Gram matrix returns the
connection coefficients between ``𝐏(x)`` and the polynomials ``𝐐(x)``
orthogonal in the modified inner product, ``𝐏(x) = 𝐐(x) R``.

The constructor throws an `ArgumentError` if ``W`` and ``X`` differ in size,
if ``W`` is not symmetric, or if ``X`` does not have bandwidths `(1, 1)`.

See also [`ChebyshevGramMatrix`](@ref) for a special case.

> K. Gumerov, S. Rigg, and R. M. Slevinsky, [Fast measure modification of orthogonal polynomials via matrices with displacement structure](https://arxiv.org/abs/2412.17663), arXiv:2412.17663, 2024.
"""
GramMatrix(W::WT, X::XT) where {T, WT <: AbstractMatrix{T}, XT <: AbstractMatrix{T}} = GramMatrix{T, WT, XT}(W, X)

# Array-interface and layout queries for a GramMatrix are all answered by the
# stored matrix `G.W`; `G.X` plays no role here.
@inline size(G::GramMatrix) = size(G.W)
@inline getindex(G::GramMatrix, i::Integer, j::Integer) = G.W[i, j]
@inline bandwidths(G::GramMatrix) = bandwidths(G.W)
@inline MemoryLayout(G::GramMatrix) = MemoryLayout(G.W)
# Support queries dispatch on the layout of `G.W` and are evaluated on `G.W` itself.
@inline rowsupport(G::GramMatrix, j) = rowsupport(MemoryLayout(G), G.W, j)
@inline colsupport(G::GramMatrix, j) = colsupport(MemoryLayout(G), G.W, j)

"""
    GramMatrix(μ::AbstractVector, X::AbstractMatrix)
    GramMatrix(μ::AbstractVector, X::AbstractMatrix, p0)

Construct a GramMatrix from modified orthogonal polynomial moments and the multiplication operator.
In the standard (classical) normalization, ``p₀(x) = 1``, so that the moments
``µ[n] = ⟨ pₙ₋₁, 1⟩`` are in fact the first column of the Gram matrix.
The recurrence is built from ``XᵀW = WX``.

With `N = length(μ)`, `X` must be `N×N` with bandwidths `(1, 1)`; the result has
size `(N+1)÷2`. The optional `p0` (default `one(T)`) scales the moments when
filling the first column.
"""
GramMatrix(μ::AbstractVector{T}, X::XT) where {T, XT <: AbstractMatrix{T}} = GramMatrix(μ, X, one(T))
function GramMatrix(μ::AbstractVector{T}, X::XT, p0::T) where {T, XT <: AbstractMatrix{T}}
    N = length(μ)
    n = (N+1)÷2
    @assert N == size(X, 1) == size(X, 2)
    @assert bandwidths(X) == (1, 1)
    W = LowerTriangular(Matrix{T}(undef, N, N))
    # Column 1: the (scaled) moments.
    if n > 0
        @inbounds for m in 1:N
            W[m, 1] = p0*μ[m]
        end
    end
    # Column 2: two-term start of the recurrence (there is no column 0).
    if n > 1
        @inbounds for m in 2:N-1
            W[m, 2] = (X[m-1, m]*W[m-1, 1] + (X[m, m]-X[1, 1])*W[m, 1] + X[m+1, m]*W[m+1, 1])/X[2, 1]
        end
    end
    # Column j is computed from columns j-1 and j-2, so the column loop is a
    # sequential recurrence. It must not be annotated with `@simd`, which asserts
    # that iterations may run in arbitrary or overlapping order.
    @inbounds for j in 3:n
        for m in j:N-j+1
            W[m, j] = (X[m-1, m]*W[m-1, j-1] + (X[m, m]-X[j-1, j-1])*W[m, j-1] + X[m+1, m]*W[m+1, j-1] - X[j-2, j-1]*W[m, j-2])/X[j, j-1]
        end
    end
    # Keep the leading n×n block; X is re-wrapped with the constructor named after its type.
    return GramMatrix(Symmetric(W[1:n, 1:n], :L), eval(XT.name.name)(view(X, 1:n, 1:n)))
end

# Banded variant of the moment-based constructor: when the moments are a
# PaddedVector, only the leading `length(μ.args[2])` entries are read for column 1,
# and W is stored as a lower-banded matrix with `b = length(μ.args[2])-1` subdiagonals.
# The result has size (N+1)÷2 with N = length(μ).
function GramMatrix(μ::PaddedVector{T}, X::XT, p0::T) where {T, XT <: AbstractMatrix{T}}
    N = length(μ)
    b = length(μ.args[2])-1
    n = (N+1)÷2
    @assert N == size(X, 1) == size(X, 2)
    @assert bandwidths(X) == (1, 1)
    W = BandedMatrix{T}(undef, (N, N), (b, 0))
    # Column 1: the (scaled) moments inside the band.
    if n > 0
        @inbounds for m in 1:min(N, b+1)
            W[m, 1] = p0*μ[m]
        end
    end
    # Column 2: two-term start of the recurrence.
    if n > 1
        @inbounds for m in 2:min(N-1, b+2)
            W[m, 2] = (X[m-1, m]*W[m-1, 1] + (X[m, m]-X[1, 1])*W[m, 1] + X[m+1, m]*W[m+1, 1])/X[2, 1]
        end
    end
    # Column j is computed from columns j-1 and j-2, so the column loop is a
    # sequential recurrence. It must not be annotated with `@simd`, which asserts
    # that iterations may run in arbitrary or overlapping order.
    @inbounds for j in 3:n
        for m in j:min(N-j+1, b+j)
            W[m, j] = (X[m-1, m]*W[m-1, j-1] + (X[m, m]-X[j-1, j-1])*W[m, j-1] + X[m+1, m]*W[m+1, j-1] - X[j-2, j-1]*W[m, j-2])/X[j, j-1]
        end
    end
    # Keep the leading n×n block; X is re-wrapped with the constructor named after its type.
    return GramMatrix(Symmetric(W[1:n, 1:n], :L), eval(XT.name.name)(view(X, 1:n, 1:n)))
end

"""
    GramMatrix(cnm1::AbstractVector, cn::AbstractVector, X::AbstractMatrix)

Construct a GramMatrix from its last two columns and the multiplication operator.
The recurrence is built from ``XᵀW = WX`` and is used in case the moment method is unstable (such as with Laguerre).

`cn` is the last column and `cnm1` the second-to-last; both must have the same
length `N` as the side of the square matrix `X`, whose bandwidths must be `(1, 1)`.
The remaining columns are filled from right to left.
"""
function GramMatrix(cnm1::AbstractVector{T}, cn::AbstractVector{T}, X::XT) where {T, XT <: AbstractMatrix{T}}
    N = length(cn)
    @assert N == length(cnm1) == size(X, 1) == size(X, 2)
    @assert bandwidths(X) == (1, 1)
    W = Matrix{T}(undef, N, N)
    # Last column and, by symmetry, last row.
    if N > 0
        @inbounds for m in 1:N
            W[N, m] = W[m, N] = cn[m]
        end
    end
    # Second-to-last column and row (this overwrites W[N-1, N] and W[N, N-1] with cnm1[N]).
    if N > 1
        @inbounds for m in 1:N
            W[N-1, m] = W[m, N-1] = cnm1[m]
        end
    end
    # Column n-2 is computed from columns n-1 and n, so this backward sweep is a
    # sequential recurrence. It must not be annotated with `@simd`, which asserts
    # that iterations may run in arbitrary or overlapping order.
    @inbounds for n in N:-1:3
        W[1, n-2]  = ((X[1, 1]-X[n-1, n-1])*W[1, n-1] + X[2, 1]*W[2, n-1] - X[n, n-1]*W[1, n])/X[n-2, n-1]
        for m in 2:n-2
            W[m, n-2]  = (X[m-1, m]*W[m-1, n-1] + (X[m, m]-X[n-1, n-1])*W[m, n-1] + X[m+1, m]*W[m+1, n-1] - X[n, n-1]*W[m, n])/X[n-2, n-1]
        end
        # Rows n-1 .. N-2 of this column are copied from the transposed entries.
        for m in n-1:N-2
            W[m, n-2] = W[n-2, m]
        end
    end
    return GramMatrix(W, X)
end

#
# X'W-W*X = G*J*G'
# This returns G, where J = [0 1; -1 0], respecting the skew-symmetry of the right-hand side.
#
# G is n×2: column 1 is the last unit vector and column 2 is assembled from the
# last two columns of W. A 1×1 Gram matrix has no column n-1, so in that case the
# second generator is left at zero instead of indexing W[:, 0].
#
function compute_skew_generators(W::GramMatrix{T}) where T
    X = W.X
    n = size(W, 1)
    G = zeros(T, n, 2)
    G[n, 1] = one(T)
    if n > 1
        G[:, 2] .= W[:, n-1]*X[n-1, n] + W[:, n]*X[n, n] - X'W[:, n]
    end
    return G
end

# Entry point: pick the factorization method from the memory layout of `W`.
cholesky(W::GramMatrix{T}) where {T} = cholesky(MemoryLayout(W), W)

# Fallback (any layout): allocate a dense lower factor plus the work vectors and
# run the fast Cholesky kernel on the skew generators of `W`.
function cholesky(_, W::GramMatrix{T}) where T
    dim = size(W, 1)
    gens = compute_skew_generators(W)
    factor = zeros(T, dim, dim)
    firstcol = W[:, 1]
    nextcol = zeros(T, dim)
    lcol = zeros(T, dim)
    skew = zeros(T, dim)
    toprow = zeros(T, dim)
    fastcholesky!(factor, W.X, gens, firstcol, nextcol, lcol, skew, toprow, dim)
    return Cholesky(factor, 'L', 0)
end

#
# In-place fast Cholesky kernel for a dense GramMatrix.
#
#   L    - n×n output: column k receives c[k:n]/sqrt(c[k]); L[n, n] is set last.
#   X    - the matrix stored in the GramMatrix (indexed on its three central diagonals).
#   G    - n×2 generators (see `compute_skew_generators`), downdated at every step.
#   c    - on entry the first column of the matrix; overwritten with the next working column.
#   ĉ, l, v, row1 - length-n work vectors, overwritten.
#   n    - matrix dimension.
#
# The function's value is that of its final assignment, `sqrt(c[n])`.
#
function fastcholesky!(L::Matrix{T}, X, G, c, ĉ, l, v, row1, n) where T
    # Step k reads c, row1 and G as left by step k-1: the outer loop is a sequential
    # recurrence, so it must not be annotated with `@simd` (which asserts that
    # iterations may run in arbitrary or overlapping order).
    @inbounds for k in 1:n-1
        d = sqrt(c[k])
        # Column k of the factor.
        for j in k:n
            L[j, k] = l[j] = c[j]/d
        end
        # Column k of the skew-symmetric term G*J*G'.
        for j in k:n
            v[j] = G[j, 1]*G[k, 2] - G[j, 2]*G[k, 1]
        end
        # Three-term recurrence for the next column; row n has no j+1 neighbour.
        for j in k+1:n-1
            ĉ[j] = (X[j-1, j]*c[j-1] + (X[j, j]-X[k, k])*c[j] + X[j+1, j]*c[j+1] + c[k]*row1[j] - row1[k]*c[j] - v[j])/X[k+1, k]
        end
        ĉ[n] = (X[n-1, n]*c[n-1] + (X[n, n]-X[k, k])*c[n] + c[k]*row1[n] - row1[k]*c[n] - v[n])/X[k+1, k]
        cst = X[k+1, k]/d
        for j in k+1:n
            row1[j] = -cst*l[j]
        end
        cst = c[k+1]/d
        for j in k:n
            c[j] = ĉ[j] - cst*l[j]
        end
        # Downdate the generators with the column just produced.
        gd1 = G[k, 1]/d
        gd2 = G[k, 2]/d
        for j in k:n
            G[j, 1] -= l[j]*gd1
            G[j, 2] -= l[j]*gd2
        end
    end
    L[n, n] = sqrt(c[n])
end

# Banded layouts: same driver as the dense fallback, but the lower factor is
# allocated as a BandedMatrix with the lower bandwidth of `W` and no superdiagonals.
function cholesky(::Union{AbstractBandedLayout, SymmetricLayout{<: AbstractBandedLayout}}, W::GramMatrix{T}) where T
    dim = size(W, 1)
    gens = compute_skew_generators(W)
    factor = BandedMatrix{T}(undef, (dim, dim), (bandwidth(W, 1), 0))
    firstcol = W[:, 1]
    nextcol = zeros(T, dim)
    lcol = zeros(T, dim)
    skew = zeros(T, dim)
    toprow = zeros(T, dim)
    fastcholesky!(factor, W.X, gens, firstcol, nextcol, lcol, skew, toprow, dim)
    return Cholesky(factor, 'L', 0)
end

#
# In-place fast Cholesky kernel for a banded GramMatrix.
#
# Same arguments as the dense kernel, with L a lower-banded matrix. With
# b = bandwidth(L, 1), every inner loop is clipped to the rows inside the band
# (k .. k+b or k+b+1) or to the trailing rows max(k, n-b-1) .. n.
# The function's value is that of its final assignment, `sqrt(c[n])`.
#
function fastcholesky!(L::BandedMatrix{T}, X, G, c, ĉ, l, v, row1, n) where T
    b = bandwidth(L, 1)
    # Step k reads c, row1 and G as left by step k-1: the outer loop is a sequential
    # recurrence, so it must not be annotated with `@simd` (which asserts that
    # iterations may run in arbitrary or overlapping order).
    @inbounds for k in 1:n-1
        d = sqrt(c[k])
        # Column k of the factor, inside the band.
        for j in k:min(k+b, n)
            L[j, k] = l[j] = c[j]/d
        end
        # Column k of the skew-symmetric term G*J*G', trailing rows only.
        for j in max(k, n-b-1):n
            v[j] = G[j, 1]*G[k, 2] - G[j, 2]*G[k, 1]
        end
        # Three-term recurrence for the next column; row n has no j+1 neighbour.
        for j in k+1:min(k+b+1, n-1)
            ĉ[j] = (X[j-1, j]*c[j-1] + (X[j, j]-X[k, k])*c[j] + X[j+1, j]*c[j+1] + c[k]*row1[j] - row1[k]*c[j] - v[j])/X[k+1, k]
        end
        if k ≥ n-b-1
            ĉ[n] = (X[n-1, n]*c[n-1] + (X[n, n]-X[k, k])*c[n] + c[k]*row1[n] - row1[k]*c[n] - v[n])/X[k+1, k]
        end
        cst = X[k+1, k]/d
        for j in k+1:min(k+b+1, n)
            row1[j] = -cst*l[j]
        end
        cst = c[k+1]/d
        for j in k:min(k+b+1, n)
            c[j] = ĉ[j] - cst*l[j]
        end
        # Downdate the generators with the column just produced.
        gd1 = G[k, 1]/d
        gd2 = G[k, 2]/d
        for j in max(k, n-b-1):n
            G[j, 1] -= l[j]*gd1
            G[j, 2] -= l[j]*gd2
        end
    end
    L[n, n] = sqrt(c[n])
end

# Gram matrix defined implicitly by a moment vector; entries are computed on
# demand by `getindex` rather than stored.
struct ChebyshevGramMatrix{T, V <: AbstractVector{T}} <: AbstractGramMatrix{T}
    # Moment vector from which the entries are formed.
    μ::V
    # Side length of the square matrix (reported by `size`).
    n::Int
end

"""
    ChebyshevGramMatrix(μ::AbstractVector)

Construct a Chebyshev--Gram matrix of size `(length(μ)+1)÷2` with entries:
```math
2 W[i, j] = µ[|i-j|+1] + µ[i+j-1].
```
Because a product of two first-kind Chebyshev polynomials linearizes,
the matrix is determined by the modified Chebyshev moments:
```math
µ[n] = ⟨ Tₙ₋₁, 1⟩.
```
This type has its own construction and Cholesky factorization.

See also [`GramMatrix`](@ref) for the general case.
"""
function ChebyshevGramMatrix(μ::V) where {T, V <: AbstractVector{T}}
    return ChebyshevGramMatrix{T, V}(μ, (length(μ)+1)÷2)
end

# Square matrix of side `G.n`.
@inline size(G::ChebyshevGramMatrix) = (G.n, G.n)
# Entry (i, j) is the average of a Toeplitz-indexed and a Hankel-indexed moment.
@inline function getindex(G::ChebyshevGramMatrix, i::Integer, j::Integer)
    toeplitz = G.μ[abs(i-j)+1]
    hankel = G.μ[i+j-1]
    return (toeplitz + hankel)/2
end
# With padded moments, both bandwidths are one less than the length of the stored data.
@inline function bandwidths(G::ChebyshevGramMatrix{T, <: PaddedVector{T}}) where T
    b = length(G.μ.args[2])-1
    return (b, b)
end
@inline MemoryLayout(G::ChebyshevGramMatrix{T, <: PaddedVector{T}}) where T = BandedLayout()

#
# 2X'W-W*2X = G*J*G'
# This returns G, where J = [0 1; -1 0], respecting the skew-symmetry of the right-hand side.
# We use twice the Chebyshev Jacobi matrix so that subsequent arithmetic is easier.
#
function compute_skew_generators(W::ChebyshevGramMatrix{T}) where T
    moments = W.μ
    dim = size(W, 1)
    gens = zeros(T, dim, 2)
    # First generator: the last unit vector.
    gens[dim, 1] = one(T)
    # Second generator: filled in rows 1 .. dim-1; the last row stays zero.
    @inbounds @simd for row in 1:dim-1
        gens[row, 2] = -(moments[dim+2-row] + moments[dim+row])/2
    end
    return gens
end

# Dense Cholesky of a ChebyshevGramMatrix: allocate the lower factor and work
# vectors, then run the Chebyshev-specific fast kernel on the skew generators.
function cholesky(W::ChebyshevGramMatrix{T}) where T
    dim = size(W, 1)
    gens = compute_skew_generators(W)
    factor = zeros(T, dim, dim)
    firstcol = W[:, 1]
    nextcol = zeros(T, dim)
    lcol = zeros(T, dim)
    skew = zeros(T, dim)
    toprow = zeros(T, dim)
    fastcholesky!(factor, gens, firstcol, nextcol, lcol, skew, toprow, dim)
    return Cholesky(factor, 'L', 0)
end

#
# In-place fast Cholesky kernel for a dense ChebyshevGramMatrix.
#
#   L    - n×n output: column k receives c[k:n]/sqrt(c[k]); L[n, n] is set last.
#   G    - n×2 generators (see `compute_skew_generators`), downdated at every step.
#   c    - on entry the first column of the matrix; overwritten with the next working column.
#   ĉ, l, v, row1 - length-n work vectors, overwritten.
#   n    - matrix dimension.
#
# The first step (k == 1) uses a halved recurrence and a doubled `row1` scaling
# compared with the later steps.
# The function's value is that of its final assignment, `sqrt(c[n])`.
#
function fastcholesky!(L::Matrix{T}, G, c, ĉ, l, v, row1, n) where T
    # Step k reads c, row1 and G as left by step k-1: the outer loop is a sequential
    # recurrence, so it must not be annotated with `@simd` (which asserts that
    # iterations may run in arbitrary or overlapping order).
    @inbounds for k in 1:n-1
        d = sqrt(c[k])
        # Column k of the factor.
        for j in k:n
            L[j, k] = l[j] = c[j]/d
        end
        # Column k of the skew-symmetric term G*J*G'.
        for j in k:n
            v[j] = G[j, 1]*G[k, 2] - G[j, 2]*G[k, 1]
        end
        if k == 1
            for j in 2:n-1
                ĉ[j] = (c[j+1] + c[j-1] + c[1]*row1[j] - row1[1]*c[j] - v[j])/2
            end
            ĉ[n] = (c[n-1] + c[1]*row1[n] - row1[1]*c[n] - v[n])/2
            cst = 2/d
        else
            for j in k+1:n-1
                ĉ[j] = c[j+1] + c[j-1] + c[k]*row1[j] - row1[k]*c[j] - v[j]
            end
            ĉ[n] = c[n-1] + c[k]*row1[n] - row1[k]*c[n] - v[n]
            cst = 1/d
        end
        for j in k+1:n
            row1[j] = -cst*l[j]
        end
        cst = c[k+1]/d
        for j in k:n
            c[j] = ĉ[j] - cst*l[j]
        end
        # Downdate the generators with the column just produced.
        gd1 = G[k, 1]/d
        gd2 = G[k, 2]/d
        for j in k:n
            G[j, 1] -= l[j]*gd1
            G[j, 2] -= l[j]*gd2
        end
    end
    L[n, n] = sqrt(c[n])
end

# Banded Cholesky of a ChebyshevGramMatrix with padded moments: the lower factor
# is a BandedMatrix with the lower bandwidth of `W` and no superdiagonals.
function cholesky(W::ChebyshevGramMatrix{T, <: PaddedVector{T}}) where T
    dim = size(W, 1)
    gens = compute_skew_generators(W)
    factor = BandedMatrix{T}(undef, (dim, dim), (bandwidth(W, 1), 0))
    firstcol = W[:, 1]
    nextcol = zeros(T, dim)
    lcol = zeros(T, dim)
    skew = zeros(T, dim)
    toprow = zeros(T, dim)
    fastcholesky!(factor, gens, firstcol, nextcol, lcol, skew, toprow, dim)
    return Cholesky(factor, 'L', 0)
end

#
# In-place fast Cholesky kernel for a banded ChebyshevGramMatrix.
#
# Same arguments as the dense Chebyshev kernel, with L a lower-banded matrix.
# With b = bandwidth(L, 1), every inner loop is clipped to the rows inside the
# band (k .. k+b or k+b+1) or to the trailing rows max(k, n-b-1) .. n.
# The function's value is that of its final assignment, `sqrt(c[n])`.
#
function fastcholesky!(L::BandedMatrix{T}, G, c, ĉ, l, v, row1, n) where T
    b = bandwidth(L, 1)
    # Step k reads c, row1 and G as left by step k-1: the outer loop is a sequential
    # recurrence, so it must not be annotated with `@simd` (which asserts that
    # iterations may run in arbitrary or overlapping order).
    @inbounds for k in 1:n-1
        d = sqrt(c[k])
        # Column k of the factor, inside the band.
        for j in k:min(k+b, n)
            L[j, k] = l[j] = c[j]/d
        end
        # Column k of the skew-symmetric term G*J*G', trailing rows only.
        for j in max(k, n-b-1):n
            v[j] = G[j, 1]*G[k, 2] - G[j, 2]*G[k, 1]
        end
        if k == 1
            for j in 2:min(b+2, n-1)
                ĉ[j] = (c[j+1] + c[j-1] + c[1]*row1[j] - row1[1]*c[j] - v[j])/2
            end
            if 1 ≥ n-b-1
                ĉ[n] = (c[n-1] + c[1]*row1[n] - row1[1]*c[n] - v[n])/2
            end
            cst = 2/d
        else
            for j in k+1:min(k+b+1, n-1)
                ĉ[j] = c[j+1] + c[j-1] + c[k]*row1[j] - row1[k]*c[j] - v[j]
            end
            if k ≥ n-b-1
                ĉ[n] = c[n-1] + c[k]*row1[n] - row1[k]*c[n] - v[n]
            end
            cst = 1/d
        end
        for j in k+1:min(k+b+1, n)
            row1[j] = -cst*l[j]
        end
        cst = c[k+1]/d
        for j in k:min(k+b+1, n)
            c[j] = ĉ[j] - cst*l[j]
        end
        # Downdate the generators with the column just produced.
        gd1 = G[k, 1]/d
        gd2 = G[k, 2]/d
        for j in max(k, n-b-1):n
            G[j, 1] -= l[j]*gd1
            G[j, 2] -= l[j]*gd2
        end
    end
    L[n, n] = sqrt(c[n])
end


================================================
FILE: src/PaduaTransform.jl
================================================

# lex indicates whether the ordering is lexicographical (i.e., x, y) or reverse (y, x)
# If in lexicographical order the coefficient vector's entries
# correspond to the following basis polynomials:
# [T0(x) * T0(y), T1(x) * T0(y), T0(x) * T1(y), T2(x) * T0(y), T1(x) * T1(y), T0(x) * T2(y), ...]
# else, if not in lexicographical order:
# [T0(x) * T0(y), T0(x) * T1(y), T1(x) * T0(y), T0(x) * T2(y), T1(x) * T1(y), T2(x) * T0(y), ...]
"""
Pre-plan an Inverse Padua Transform.

The type parameter `lex` selects which `trianglecfsmat` method unpacks the
coefficient vector; `cfsmat` is the work matrix that method fills, and `idctplan`
is the plan applied to it.
"""
struct IPaduaTransformPlan{lex,IDCTPLAN,T}
    cfsmat::Matrix{T}
    idctplan::IDCTPLAN
end

# Convenience constructor: `lex` is passed as the type `Val{lex}` and the plan
# type is taken from the `idctplan` argument.
IPaduaTransformPlan(cfsmat::Matrix{T},idctplan,::Type{Val{lex}}) where {T,lex} =
    IPaduaTransformPlan{lex,typeof(idctplan),T}(cfsmat,idctplan)

"""
    plan_ipaduatransform!(::Type{T}, N::Integer, lex)

Pre-plan an Inverse Padua Transform for coefficient vectors of length `N` and
element type `T`.

`N` must equal `(n+1)*(n+2)/2` for some integer `n`, otherwise an error is raised.
The plan holds an `(n+2)×(n+1)` work matrix and an in-place FFTW `REDFT00` plan
created for a matrix of the same shape. `lex` is forwarded to `IPaduaTransformPlan`.
"""
function plan_ipaduatransform!(::Type{T},N::Integer,lex) where T
    n = Int(cld(-3+sqrt(1+8N),2))
    N == div((n+1)*(n+2),2) ||
        error("Padua transforms can only be applied to vectors of length (n+1)*(n+2)/2.")
    workspace = Array{T}(undef,n+2,n+1)
    r2rplan = FFTW.plan_r2r!(Array{T}(undef,n+2,n+1),FFTW.REDFT00)
    return IPaduaTransformPlan(workspace,r2rplan,lex)
end


# Convenience methods: default `lex` to `Val{true}`, or take the element type and
# length from an existing vector.
function plan_ipaduatransform!(::Type{T},N::Integer) where {T}
    return plan_ipaduatransform!(T,N,Val{true})
end
function plan_ipaduatransform!(v::AbstractVector{T},lex...) where {T}
    return plan_ipaduatransform!(eltype(v),length(v),lex...)
end


# Apply an inverse Padua plan: unpack `v` into the plan's coefficient matrix,
# halve its interior columns and interior rows, apply the planned transform, and
# write the values at the Padua points back into `v`.
function *(P::IPaduaTransformPlan,v::AbstractVector{T}) where T
    coeffs = trianglecfsmat(P,v)
    nrows, ncols = size(coeffs)
    rmul!(view(coeffs,:,2:ncols-1),0.5)
    rmul!(view(coeffs,2:nrows-1,:),0.5)
    gridvals = P.idctplan*coeffs
    return paduavec!(v,P,gridvals)
end

# In-place variant: plans for `v` and overwrites it with the result.
function ipaduatransform!(v::AbstractVector,lex...)
    return plan_ipaduatransform!(v,lex...)*v
end
"""
    ipaduatransform(v::AbstractVector, lex...)

Inverse Padua Transform: map 2D Chebyshev coefficients to the values of the
interpolation polynomial at the Padua points. Operates on a copy of `v`.
"""
function ipaduatransform(v::AbstractVector,lex...)
    return plan_ipaduatransform!(v,lex...)*copy(v)
end

"""
Creates ``(n+2)x(n+1)`` Chebyshev coefficient matrix from triangle coefficients.

The plan's work matrix is zeroed and then filled one anti-diagonal at a time,
walking each anti-diagonal with increasing row index. Filling stops as soon as
the last entry of `cfs` has been placed.
"""
function trianglecfsmat(P::IPaduaTransformPlan{true},cfs::AbstractVector)
    N = length(cfs)
    n = Int(cld(-3+sqrt(1+8N),2))
    cfsmat = fill!(P.cfsmat,0)
    pos = 0
    for d in 1:n+1, k in 1:d
        pos += 1
        @inbounds cfsmat[k,d-k+1] = cfs[pos]
        pos == N && return cfsmat
    end
    return cfsmat
end

# Reverse-order counterpart of the `{true}` method: the plan's work matrix is
# zeroed and filled one anti-diagonal at a time, walking each anti-diagonal with
# decreasing row index. Filling stops once the last entry of `cfs` is placed.
function trianglecfsmat(P::IPaduaTransformPlan{false},cfs::AbstractVector)
    N = length(cfs)
    n = Int(cld(-3+sqrt(1+8N),2))
    cfsmat = fill!(P.cfsmat,0)
    pos = 0
    for d in 1:n+1, k in d:-1:1
        pos += 1
        @inbounds cfsmat[k,d-k+1] = cfs[pos]
        pos == N && return cfsmat
    end
    return cfsmat
end

"""
Vectorizes the function values at the Padua points.

Every other entry of the tensor-grid matrix `padmat` is copied into `v`: column
by column with an alternating row offset when `n = size(padmat, 2) - 1` is even,
and as a single stride-2 sweep over the linear indices when `n` is odd.
"""
function paduavec!(v,P::IPaduaTransformPlan,padmat::Matrix)
    n = size(padmat,2)-1
    if iseven(n)
        d = div(n+2,2)
        @inbounds for i in 1:n+1
            offset = (i-1)*d
            shift = mod(i,2)
            v[offset+1:offset+d] = view(padmat,1+shift:2:n+1+shift,i)
        end
    else
        N = (n+1)*(n+2)
        @inbounds v[:] = view(padmat,1:2:N-1)
    end
    return v
end

"""
Pre-plan a Padua Transform.

The type parameter `lex` selects which `trianglecfsvec!` method packs the
coefficients; `vals` is the work matrix filled by `paduavalsmat`, and `dctplan`
is the plan applied to it.
"""
struct PaduaTransformPlan{lex,DCTPLAN,T}
    vals::Matrix{T}
    dctplan::DCTPLAN
end

# Convenience constructor: `lex` is passed as the type `Val{lex}` and the plan
# type is taken from the `dctplan` argument.
PaduaTransformPlan(vals::Matrix{T},dctplan,::Type{Val{lex}}) where {T,lex} =
    PaduaTransformPlan{lex,typeof(dctplan),T}(vals,dctplan)

"""
    plan_paduatransform!(::Type{T}, N::Integer, lex)

Pre-plan a Padua Transform for value vectors of length `N` and element type `T`.

`N` must equal `(n+1)*(n+2)/2` for some integer `n`, otherwise an error is raised.
The plan holds an `(n+2)×(n+1)` work matrix and an in-place FFTW `REDFT00` plan
created for a matrix of the same shape. `lex` is forwarded to `PaduaTransformPlan`.
"""
function plan_paduatransform!(::Type{T},N::Integer,lex) where T
    n = Int(cld(-3+sqrt(1+8N),2))
    N == ((n+1)*(n+2))÷2 ||
        error("Padua transforms can only be applied to vectors of length (n+1)*(n+2)/2.")
    workspace = Array{T}(undef,n+2,n+1)
    r2rplan = FFTW.plan_r2r!(Array{T}(undef,n+2,n+1),FFTW.REDFT00)
    return PaduaTransformPlan(workspace,r2rplan,lex)
end

# Convenience methods: default `lex` to `Val{true}`, or take the element type and
# length from an existing vector.
function plan_paduatransform!(::Type{T},N::Integer) where {T}
    return plan_paduatransform!(T,N,Val{true})
end
function plan_paduatransform!(v::AbstractVector{T},lex...) where {T}
    return plan_paduatransform!(eltype(v),length(v),lex...)
end

# Apply a Padua plan: scatter `v` onto the tensor grid, apply the planned
# transform, scale by 2/(n(n+1)), halve the four boundary rows/columns (so the
# corners are halved twice), and pack the triangle of coefficients back into `v`.
function *(P::PaduaTransformPlan,v::AbstractVector{T}) where T
    N = length(v)
    n = Int(cld(-3+sqrt(1+8N),2))
    gridvals = paduavalsmat(P,v)
    tensorcfs = P.dctplan*gridvals
    nrows, ncols = size(tensorcfs)
    rmul!(tensorcfs,T(2)/(n*(n+1)))
    edges = (view(tensorcfs,1,:), view(tensorcfs,:,1),
             view(tensorcfs,nrows,:), view(tensorcfs,:,ncols))
    for edge in edges
        rmul!(edge,0.5)
    end
    return trianglecfsvec!(v,P,tensorcfs)
end

# In-place variant: plans for `v` and overwrites it with the result.
function paduatransform!(v::AbstractVector,lex...)
    return plan_paduatransform!(v,lex...)*v
end
"""
    paduatransform(v::AbstractVector, lex...)

Padua Transform: map interpolant values at the Padua points to the 2D Chebyshev
coefficients. Operates on a copy of `v`.
"""
function paduatransform(v::AbstractVector,lex...)
    return plan_paduatransform!(v,lex...)*copy(v)
end

"""
Creates ``(n+2)x(n+1)`` matrix of interpolant values on the tensor grid at the ``(n+1)(n+2)/2`` Padua points.

The plan's work matrix is zeroed, then the entries of `v` are written to every
other position: column by column with an alternating row offset when `n` is
even, and as a single stride-2 sweep over the linear indices when `n` is odd.
"""
function paduavalsmat(P::PaduaTransformPlan,v::AbstractVector)
    N = length(v)
    n = Int(cld(-3+sqrt(1+8N),2))
    vals = fill!(P.vals,0.)
    if iseven(n)
        d = div(n+2,2)
        @inbounds for i in 1:n+1
            offset = (i-1)*d
            shift = mod(i,2)
            vals[1+shift:2:n+1+shift,i] = view(v,offset+1:offset+d)
        end
    else
        @inbounds vals[1:2:end] = view(v,:)
    end
    return vals
end

"""
Creates length ``(n+1)(n+2)/2`` vector from matrix of triangle Chebyshev coefficients.

The anti-diagonals of `cfs` are read in order, each with increasing row index,
and written consecutively into `v`.
"""
function trianglecfsvec!(v,P::PaduaTransformPlan{true},cfs::Matrix)
    ncols = size(cfs,2)
    pos = 0
    @inbounds for d in 1:ncols, k in 1:d
        pos += 1
        v[pos] = cfs[k,d-k+1]
    end
    return v
end

# Reverse-order counterpart of the `{true}` method: the anti-diagonals of `cfs`
# are read in order, each with decreasing row index, and written consecutively into `v`.
function trianglecfsvec!(v,P::PaduaTransformPlan{false},cfs::Matrix)
    ncols = size(cfs,2)
    pos = 0
    @inbounds for d in 1:ncols, k in d:-1:1
        pos += 1
        v[pos] = cfs[k,d-k+1]
    end
    return v
end

"""
    paduapoints([T=Float64,] n::Integer)

Returns coordinates of the ``(n+1)(n+2)/2`` Padua points.

The result is a `Matrix{T}` with one row per point: column 1 holds the `x`
coordinates and column 2 the `y` coordinates.

Throws an `ArgumentError` when `n < 1`: the construction below divides by `n`
and its index arithmetic assumes at least two groups of points.
"""
function paduapoints(::Type{T}, n::Integer) where T
    # For n ≤ 0 the y-coordinate loop below would write past the end of MM
    # (under @inbounds), and n == 0 additionally evaluates 0/0.
    n ≥ 1 || throw(ArgumentError("paduapoints requires n ≥ 1, got n = $n."))
    N=div((n+1)*(n+2),2)
    MM=Matrix{T}(undef,N,2)
    m=0
    delta=0
    NN=div(n,2)+1
    # x coordinates
    for k=n:-1:0
        # For odd n the group size alternates with the parity of k.
        if isodd(n)
            delta = Int(isodd(k))
        end
        x = -cospi(T(k)/n)
        @inbounds for j=NN+delta:-1:1
            m+=1
            MM[m,1]=x
        end
    end
    # y coordinates
    # populate the first two sets, and copy the rest
    m=0
    for k=n:-1:n-1
        if isodd(n)
            delta = Int(isodd(k))
        end
        for j=NN+delta:-1:1
            m+=1
            @inbounds if isodd(n-k)
                MM[m,2]=-cospi((2j-one(T))/(n+1))
            else
                MM[m,2]=-cospi(T(2j-2)/(n+1))
            end
        end
    end
    m += 1
    # number of y coordinates between k=n and k=n-2
    Ny_shift = 2NN+isodd(n)
    for k in n-2:-1:0
        if isodd(n)
            delta = Int(isodd(k))
        end
        # Each remaining group repeats the y values found Ny_shift rows earlier.
        for j in range(m, length=NN+delta)
            @inbounds MM[j,2] = MM[j-Ny_shift,2]
        end
        m += NN+delta
    end
    return MM
end

paduapoints(n::Integer) = paduapoints(Float64,n)


================================================
FILE: src/ToeplitzPlusHankel.jl
================================================
# Sum of a Toeplitz and a Hankel matrix, together with the transformed data and
# workspace used by the `mul!` methods.
struct ToeplitzPlusHankel{T, S, P1 <: Plan{S}, P2 <: Plan{S}} <: AbstractMatrix{T}
    # Toeplitz part: `tc[i-j+1]` is used where i ≥ j (see `getindex`).
    tc::Vector{T}
    # Toeplitz part: `tr[j-i+1]` is used where i < j.
    tr::Vector{T}
    # Hankel part: entry (i, j) adds `h[i+j-1]`.
    h::Vector{T}
    # Transformed data multiplied into the workspace by `mul!` with the matrix itself.
    th_dft::Matrix{S}
    # Transformed data multiplied into the workspace by `mul!` with the adjoint.
    tht_dft::Matrix{S}
    # Workspace overwritten by the vector `mul!` methods.
    temp::Matrix{S}
    # Forward plan and its inverse, applied to two-column matrices like `temp`.
    plan::P1
    iplan::P2
    # (rows, columns) reported by `size`.
    size::NTuple{2, Int}
end

# enforces tr[1] == tc[1]
# Build a ToeplitzPlusHankel from the Toeplitz data `tc` (length m), `tr`
# (length n) and the Hankel data `h` (length m+n-1). Note that `tr[1]` of the
# caller's vector is overwritten with `tc[1]`. The result has size (m, n).
function ToeplitzPlusHankel(tc::Vector{T}, tr::Vector{T}, h::Vector{T}) where T
    m = length(tc)
    n = length(tr)
    @assert length(h) == m+n-1
    tr[1] = tc[1]
    mn = m+n
    # Complex working type wide enough for both float(T) and Complex{Float32}.
    S = promote_type(float(T), Complex{Float32})
    # th_dft is (m+n)×2 and filled by linear index.
    # Column 1: tc, a zero in row m+1, then tr[n], ..., tr[2].
    th_dft = Matrix{S}(undef, mn, 2)
    copyto!(th_dft, 1, tc, 1, m)
    th_dft[m+1, 1] = zero(T)
    copyto!(th_dft, m+2, Iterators.reverse(tr), 1, n-1)
    # Column 2: h[n:n+m-1], a zero in row m+1, then h[1:n-1].
    copyto!(th_dft, mn+1, h, n, m)
    th_dft[m+1, 2] = zero(T)
    copyto!(th_dft, mn+m+2, h, 1, n-1)
    # tht_dft mirrors the layout above with the roles of (tc, m) and (tr, n) swapped.
    tht_dft = Matrix{S}(undef, mn, 2)
    copyto!(tht_dft, 1, tr, 1, n)
    tht_dft[n+1, 1] = zero(T)
    copyto!(tht_dft, n+2, Iterators.reverse(tc), 1, m-1)
    copyto!(tht_dft, mn+1, h, m, n)
    tht_dft[n+1, 2] = zero(T)
    copyto!(tht_dft, mn+n+2, h, 1, m-1)

    # One in-place FFT plan along the first dimension, applied to both matrices.
    plan = plan_fft!(th_dft, 1)
    plan*th_dft
    plan*tht_dft
    temp = zeros(S, mn, 2)
    iplan = inv(plan)

    ToeplitzPlusHankel{T, S, typeof(plan), typeof(iplan)}(tc, tr, h, th_dft, tht_dft, temp, plan, iplan, (m, n))
end

# A ChebyshevGramMatrix isa (symmetric positive-definite) ToeplitzPlusHankel matrix.
# Both Toeplitz vectors are the first `n` moments halved (two separate arrays);
# the Hankel vector is the full moment vector halved.
function ToeplitzPlusHankel(G::ChebyshevGramMatrix)
    n = size(G, 1)
    tc = G.μ[1:n]/2
    tr = G.μ[1:n]/2
    h = G.μ/2
    return ToeplitzPlusHankel(tc, tr, h)
end

# Stored (rows, columns) tuple.
size(A::ToeplitzPlusHankel) = A.size
# Entry (i, j): Toeplitz term from `tc` on and below the diagonal, from `tr`
# above it, plus the Hankel term `h[i+j-1]`.
function getindex(A::ToeplitzPlusHankel, i::Integer, j::Integer)
    toeplitz = if i ≥ j
        A.tc[i-j+1]
    else
        A.tr[j-i+1]
    end
    return toeplitz + A.h[i+j-1]
end

# A view of a T+H is also T+H.
# Indexing with two unit ranges builds a new ToeplitzPlusHankel whose Toeplitz
# vectors are spliced from `tc`/`tr` and whose Hankel vector is a slice of `h`.
function getindex(A::ToeplitzPlusHankel, ir::UnitRange{Int}, jr::UnitRange{Int})
    i0, i1 = first(ir), last(ir)
    j0, j1 = first(jr), last(jr)
    if i0 ≥ j0
        # The top-left entry lies on or below the diagonal of A.
        newtc = A.tc[i0-j0+1:i1-j0+1]
        newtr = [A.tc[i0-j0+1:-1:max(1, i0-j1+1)]; A.tr[2:j1-i0+1]]
    else
        # The top-left entry lies above the diagonal of A.
        newtc = [A.tr[j0-i0+1:-1:max(1, j0-i1+1)]; A.tc[2:i1-j0+1]]
        newtr = A.tr[j0-i0+1:j1-i0+1]
    end
    newh = A.h[i0+j0-1:i1+j1-1]
    return ToeplitzPlusHankel(newtc, newtr, newh)
end


# y ← A x α + y β
# The product is formed in the workspace `A.temp`: `x` goes into column 1 and
# reversed `x` into column 2 (both zero-padded), the columns are transformed,
# multiplied entrywise by `A.th_dft`, transformed back, and the real parts of
# the first m rows of the two columns are added.
function mul!(y::StridedVector{T}, A::ToeplitzPlusHankel{T}, x::StridedVector{T}, α::S, β::S) where {T <: Real, S <: Real}
    m, n = size(A)
    @assert m == length(y)
    @assert n == length(x)
    mn = m+n
    work = A.temp

    # Load x (column 1) and reversed x (column 2), then clear the padding rows.
    copyto!(work, 1, x, 1, n)
    copyto!(work, mn+1, Iterators.reverse(x), 1, n)
    @inbounds for row in n+1:mn
        work[row, 1] = zero(T)
        work[row, 2] = zero(T)
    end
    A.plan*work
    work .*= A.th_dft
    A.iplan*work

    # When β is zero the old contents of y are not read at all.
    if iszero(β)
        @inbounds @simd for i in 1:m
            y[i] = α * (real(work[i, 1])+real(work[i, 2]))
        end
    else
        @inbounds @simd for i in 1:m
            y[i] = α * (real(work[i, 1])+real(work[i, 2])) + β*y[i]
        end
    end
    return y
end

# y ← A' x α + y β
# Same scheme as the non-adjoint product, using the parent's workspace and plans
# together with its `tht_dft` data; m and n are the dimensions of the adjoint.
function mul!(y::StridedVector{T}, A::Adjoint{T, <:ToeplitzPlusHankel{T}}, x::StridedVector{T}, α::S, β::S) where {T <: Real, S <: Real}
    m, n = size(A)
    @assert m == length(y)
    @assert n == length(x)
    mn = m+n
    base = parent(A)
    work = base.temp

    # Load x (column 1) and reversed x (column 2), then clear the padding rows.
    copyto!(work, 1, x, 1, n)
    copyto!(work, mn+1, Iterators.reverse(x), 1, n)
    @inbounds for row in n+1:mn
        work[row, 1] = zero(T)
        work[row, 2] = zero(T)
    end
    base.plan*work
    work .*= base.tht_dft
    base.iplan*work

    # When β is zero the old contents of y are not read at all.
    if iszero(β)
        @inbounds @simd for i in 1:m
            y[i] = α * (real(work[i, 1])+real(work[i, 2]))
        end
    else
        @inbounds @simd for i in 1:m
            y[i] = α * (real(work[i, 1])+real(work[i, 2])) + β*y[i]
        end
    end
    return y
end


# C ← A B α + C β
#
# Column k of B occupies workspace columns 2k-1 (as is) and 2k (reversed), zero-padded to
# length m+n. All columns are transformed together, each pair is multiplied entrywise by
# `A.th_dft`, and after the inverse transform the real parts of each pair are summed.
# A fresh workspace and FFT plan are allocated on every call.
# When β is zero the previous contents of `C` are never read.
# Throws DimensionMismatch if `C` or `B` does not conform to `size(A)`.
function mul!(C::StridedMatrix{T}, A::ToeplitzPlusHankel{T}, B::StridedMatrix{T}, α::S, β::S) where {T <: Real, S <: Real}
    m, n = size(A)
    # Validate explicitly: the output loops run under @inbounds and @assert is not
    # guaranteed to stay enabled.
    m == size(C, 1) || throw(DimensionMismatch("C has $(size(C, 1)) rows, but A has $m rows"))
    n == size(B, 1) || throw(DimensionMismatch("B has $(size(B, 1)) rows, but A has $n columns"))
    p = size(B, 2)
    if size(C, 2) != p
        throw(DimensionMismatch("input and output matrices must have same number of columns"))
    end

    th_dft = A.th_dft
    TC = promote_type(float(T), Complex{Float32})
    temp = zeros(TC, m+n, 2p)
    plan = plan_fft!(temp, 1)

    for k in 1:p
        copyto!(view(temp, :, 2k-1), 1, view(B, :, k), 1, n)
        copyto!(view(temp, :, 2k), 1, Iterators.reverse(view(B, :, k)), 1, n)
    end
    plan*temp
    for k in 1:p
        vt = view(temp, :, 2k-1:2k)
        vt .*= th_dft
    end
    plan\temp

    if iszero(β)
        @inbounds for k in 1:p
            for i in 1:m
                C[i, k] = α * (real(temp[i, 2k-1])+real(temp[i, 2k]))
            end
        end
    else
        @inbounds for k in 1:p
            for i in 1:m
                C[i, k] = α * (real(temp[i, 2k-1])+real(temp[i, 2k])) + β*C[i, k]
            end
        end
    end
    return C
end

# C ← A B α + C β with the Toeplitz-plus-Hankel matrix on the right.
# Morally equivalent to mul!(C', B', A', α, β)' with StridedMatrix replaced by AbstractMatrix below
#
# Row k of A occupies workspace columns 2k-1 (as is) and 2k (reversed); each pair is
# multiplied entrywise by `B.tht_dft` in Fourier space and the result is written to row k of C.
# A fresh workspace and FFT plan are allocated on every call.
# When β is zero the previous contents of `C` are never read.
# Throws DimensionMismatch if `C` or `A` does not conform to `size(B)`.
function mul!(C::StridedMatrix{T}, A::StridedMatrix{T}, B::ToeplitzPlusHankel{T}, α::S, β::S) where {T <: Real, S <: Real}
    n, m = size(B)
    # Validate explicitly: the output loops run under @inbounds and @assert is not
    # guaranteed to stay enabled.
    m == size(C, 2) || throw(DimensionMismatch("C has $(size(C, 2)) columns, but B has $m columns"))
    n == size(A, 2) || throw(DimensionMismatch("A has $(size(A, 2)) columns, but B has $n rows"))
    p = size(A, 1)
    if size(C, 1) != p
        throw(DimensionMismatch("input and output matrices must have same number of rows"))
    end

    tht_dft = B.tht_dft
    TC = promote_type(float(T), Complex{Float32})
    temp = zeros(TC, m+n, 2p)
    plan = plan_fft!(temp, 1)

    for k in 1:p
        copyto!(view(temp, :, 2k-1), 1, view(A, k, :), 1, n)
        copyto!(view(temp, :, 2k), 1, Iterators.reverse(view(A, k, :)), 1, n)
    end
    plan*temp
    for k in 1:p
        vt = view(temp, :, 2k-1:2k)
        vt .*= tht_dft
    end
    plan\temp

    if iszero(β)
        @inbounds for k in 1:p
            for i in 1:m
                C[k, i] = α * (real(temp[i, 2k-1])+real(temp[i, 2k]))
            end
        end
    else
        @inbounds for k in 1:p
            for i in 1:m
                C[k, i] = α * (real(temp[i, 2k-1])+real(temp[i, 2k])) + β*C[k, i]
            end
        end
    end
    return C
end

# C ← A' B α + C β
#
# Same scheme as the non-adjoint matrix product, but each workspace column pair is
# multiplied by the parent's `tht_dft`.
# A fresh workspace and FFT plan are allocated on every call.
# When β is zero the previous contents of `C` are never read.
# Throws DimensionMismatch if `C` or `B` does not conform to `size(A)`.
function mul!(C::StridedMatrix{T}, A::Adjoint{T, <:ToeplitzPlusHankel{T}}, B::StridedMatrix{T}, α::S, β::S) where {T <: Real, S <: Real}
    m, n = size(A)
    # Validate explicitly: the output loops run under @inbounds and @assert is not
    # guaranteed to stay enabled.
    m == size(C, 1) || throw(DimensionMismatch("C has $(size(C, 1)) rows, but A' has $m rows"))
    n == size(B, 1) || throw(DimensionMismatch("B has $(size(B, 1)) rows, but A' has $n columns"))
    p = size(B, 2)
    if size(C, 2) != p
        throw(DimensionMismatch("input and output matrices must have same number of columns"))
    end

    tht_dft = A.parent.tht_dft
    TC = promote_type(float(T), Complex{Float32})
    temp = zeros(TC, m+n, 2p)
    plan = plan_fft!(temp, 1)

    for k in 1:p
        copyto!(view(temp, :, 2k-1), 1, view(B, :, k), 1, n)
        copyto!(view(temp, :, 2k), 1, Iterators.reverse(view(B, :, k)), 1, n)
    end
    plan*temp
    for k in 1:p
        vt = view(temp, :, 2k-1:2k)
        vt .*= tht_dft
    end
    plan\temp

    if iszero(β)
        @inbounds for k in 1:p
            for i in 1:m
                C[i, k] = α * (real(temp[i, 2k-1])+real(temp[i, 2k]))
            end
        end
    else
        @inbounds for k in 1:p
            for i in 1:m
                C[i, k] = α * (real(temp[i, 2k-1])+real(temp[i, 2k])) + β*C[i, k]
            end
        end
    end
    return C
end

# Estimate the Frobenius norm of the Toeplitz-plus-Hankel matrix by working with the symbols.
# Every symbol entry is weighted by the number of times it occurs on its (anti-)diagonal;
# the result is the sum of the Frobenius norms of the Toeplitz and Hankel parts.
function normest(A::ToeplitzPlusHankel{T}) where T
    m, n = size(A)
    tc, tr, h = A.tc, A.tr, A.h

    # Toeplitz part: the shorter side determines which symbol carries the main diagonal.
    toeplitz = zero(T)
    if m ≤ n
        for i in 1:m
            toeplitz += (m+1-i)*abs2(tc[i])
        end
        for i in 2:n-m
            toeplitz += m*abs2(tr[i])
        end
        for i in max(n-m+1, 2):n
            toeplitz += (n+1-i)*abs2(tr[i])
        end
    else
        for i in 1:n
            toeplitz += (n+1-i)*abs2(tr[i])
        end
        for i in 2:m-n
            toeplitz += n*abs2(tc[i])
        end
        for i in max(m-n+1, 2):m
            toeplitz += (m+1-i)*abs2(tc[i])
        end
    end

    # Hankel part: anti-diagonals grow, plateau at the shorter side, then shrink.
    short, long = minmax(m, n)
    hankel = zero(T)
    for i in 1:short
        hankel += i*abs2(h[i])
    end
    for i in short+1:long
        hankel += short*abs2(h[i])
    end
    for i in long+1:m+n-1
        hankel += (m+n-i)*abs2(h[i])
    end

    return sqrt(toeplitz) + sqrt(hankel)
end

# Symmetric/Hermitian wrappers are estimated through the wrapped matrix.
function normest(A::Symmetric{T, <: ToeplitzPlusHankel{T}}) where T
    return normest(parent(A))
end
function normest(A::Hermitian{T, <: ToeplitzPlusHankel{T}}) where T
    return normest(parent(A))
end
# A Chebyshev Gram matrix is estimated through its Toeplitz-plus-Hankel representation.
function normest(A::ChebyshevGramMatrix{T}) where T
    return normest(ToeplitzPlusHankel(A))
end


================================================
FILE: src/arrays.jl
================================================
# Plan that applies the wrapped plan `F` along a single dimension of an array of fixed size.
struct ArrayPlan{T, FF<:FTPlan{<:T}, Szs<:Tuple, Dims<:Tuple{<:Int}} <: Plan{T}
    F::FF       # underlying plan; applied to a matrix whose rows index the planned dimension
    szs::Szs    # size of the arrays the plan may be applied to
    dims::Dims  # 1-tuple holding the dimension to transform
end
# The plan reports the array size it was created for.
size(P::ArrayPlan) = P.szs

# Build an ArrayPlan for arrays shaped like `c`, transforming along `dims[1]` (default 1).
# The length of `F` must equal the extent of `c` along that dimension.
function ArrayPlan(F::FTPlan{<:T}, c::AbstractArray{T}, dims::Tuple{<:Int}=(1,)) where T
    szs = size(c)
    @assert F.n == szs[dims[1]]
    return ArrayPlan(F, szs, dims)
end

# Apply the plan along its dimension: move that dimension to the front, flatten the
# rest into columns, apply `F` to the resulting matrix, and undo the reshuffle.
function *(P::ArrayPlan, f::AbstractArray)
    F, dims, szs = P.F, P.dims, P.szs
    @assert length(dims) == 1
    @assert szs == size(f)
    d = first(dims)

    # d first, remaining dimensions in their original order
    perm = (d, ntuple(i -> i < d ? i : i + 1, ndims(f) - 1)...)
    front = permutedims(f, perm)
    columns = reshape(front, size(front, 1), :)
    transformed = reshape(F*columns, size(front)...)
    return permutedims(transformed, invperm(perm))
end

# Apply the inverse of the plan along its dimension: move that dimension to the front,
# flatten the rest into columns, apply `F\` to the resulting matrix, and undo the reshuffle.
function \(P::ArrayPlan, f::AbstractArray)
    F, dims, szs = P.F, P.dims, P.szs
    @assert length(dims) == 1
    @assert szs == size(f)
    d = first(dims)

    # d first, remaining dimensions in their original order
    perm = (d, ntuple(i -> i < d ? i : i + 1, ndims(f) - 1)...)
    front = permutedims(f, perm)
    columns = reshape(front, size(front, 1), :)
    transformed = reshape(F\columns, size(front)...)
    return permutedims(transformed, invperm(perm))
end

# Plan that applies one ArrayPlan in turn along each of several dimensions of an array.
struct NDimsPlan{T, FF<:ArrayPlan{<:T}, Szs<:Tuple, Dims<:Tuple} <: Plan{T}
    F::FF       # one-dimensional array plan reused for every dimension in `dims`
    szs::Szs    # size of the arrays the plan may be applied to
    dims::Dims  # dimensions to transform
    function NDimsPlan(F, szs, dims)
        # The same plan is reused for every dimension, so their extents must agree.
        if length(Set(szs[[dims...]])) > 1
            error("Different size in dims axes not yet implemented in N-dimensional transform.")
        end
        new{eltype(F), typeof(F), typeof(szs), typeof(dims)}(F, szs, dims)
    end
end

# The plan reports the array size it was created for.
size(P::NDimsPlan) = P.szs

# Wrap a raw FTPlan: the inner ArrayPlan is built for the first requested dimension.
function NDimsPlan(F::FTPlan, szs::Tuple, dims::Tuple)
    leading = (first(dims),)
    inner = ArrayPlan(F, szs, leading)
    return NDimsPlan(inner, szs, dims)
end

# Apply the plan along every dimension in `P.dims`, one at a time, on a copy of `f`.
function *(P::NDimsPlan, f::AbstractArray)
    F, dims = P.F, P.dims
    @assert size(P) == size(f)
    out = copy(f)
    ids = 1:ndims(out)
    lead = dims[1]
    for d in dims
        # Swap `d` with the leading planned dimension so the inner plan acts along `d`.
        swap = ntuple(ndims(out)) do k
            k == lead ? ids[d] : k == d ? ids[lead] : ids[k]
        end
        swapped = permutedims(out, swap)
        out = permutedims(F*swapped, invperm(swap))
    end
    return out
end

# Apply the inverse plan along every dimension in `P.dims`, one at a time, on a copy of `f`.
function \(P::NDimsPlan, f::AbstractArray)
    F, dims = P.F, P.dims
    @assert size(P) == size(f)
    out = copy(f)
    ids = 1:ndims(out)
    lead = dims[1]
    for d in dims
        # Swap `d` with the leading planned dimension so the inner plan acts along `d`.
        swap = ntuple(ndims(out)) do k
            k == lead ? ids[d] : k == d ? ids[lead] : ids[k]
        end
        swapped = permutedims(out, swap)
        out = permutedims(F\swapped, invperm(swap))
    end
    return out
end

================================================
FILE: src/chebyshevtransform.jl
================================================
## Transforms take values at Chebyshev points of the first and second kinds and produce Chebyshev coefficients

# Common supertype of all Chebyshev transform plans.
abstract type ChebyshevPlan{T} <: Plan{T} end

# Fallback reached when no more specific method matches the array.
function *(P::ChebyshevPlan{T}, x::AbstractArray{T}) where T
    error("Plan applied to wrong size array")
end

# Plans without a stored FFTW plan report an empty size and zero length.
function size(P::ChebyshevPlan)
    isdefined(P, :plan) || return (0,)
    return size(P.plan)
end
function length(P::ChebyshevPlan)
    isdefined(P, :plan) || return 0
    return length(P.plan)
end


# FFTW real-to-real kinds used by the forward Chebyshev (T) transforms:
# REDFT10 (FFTW's DCT-II) for the Val(1) planners, REDFT00 (FFTW's DCT-I) for Val(2).
const FIRSTKIND = FFTW.REDFT10
const SECONDKIND = FFTW.REDFT00

# Forward Chebyshev transform plan (values -> coefficients).
# `kind` selects first- or second-kind points; the remaining parameters mirror those of
# the wrapped FFTW r2r plan.
struct ChebyshevTransformPlan{T,kind,K,inplace,N,R} <: ChebyshevPlan{T}
    plan::FFTW.r2rFFTWPlan{T,K,inplace,N,R}
    ChebyshevTransformPlan{T,kind,K,inplace,N,R}(plan) where {T,kind,K,inplace,N,R} = new{T,kind,K,inplace,N,R}(plan)
    # Leaves `plan` undefined; used for empty arrays and for the non-FFTW fallbacks.
    ChebyshevTransformPlan{T,kind,K,inplace,N,R}() where {T,kind,K,inplace,N,R} = new{T,kind,K,inplace,N,R}()
end

# Convenience constructor: take every parameter except `kind` from the FFTW plan.
ChebyshevTransformPlan{T,kind}(plan::FFTW.r2rFFTWPlan{T,K,inplace,N,R}) where {T,kind,K,inplace,N,R} =
    ChebyshevTransformPlan{T,kind,K,inplace,N,R}(plan)

# jump through some hoops to make inferrable

# In-place first-kind plan; an empty array gets a plan with no FFTW plan attached.
function plan_chebyshevtransform!(x::AbstractArray{T,N}, ::Val{1}, dims...; kws...) where {T<:fftwNumber,N}
    if !isempty(x)
        return ChebyshevTransformPlan{T,1}(FFTW.plan_r2r!(x, FIRSTKIND, dims...; kws...))
    end
    return ChebyshevTransformPlan{T,1,Vector{Int32},true,N,isempty(dims) ? NTuple{N,Int} : typeof(dims[1])}()
end
# In-place second-kind plan; every dimension of `x` must have at least two entries.
function plan_chebyshevtransform!(x::AbstractArray{T,N}, ::Val{2}, dims...; kws...) where {T<:fftwNumber,N}
    if any(≤(1), size(x))
        throw(ArgumentError("Array must contain at least 2 entries"))
    end
    return ChebyshevTransformPlan{T,2}(FFTW.plan_r2r!(x, SECONDKIND, dims...; kws...))
end


# Out-of-place first-kind plan; an empty array gets a plan with no FFTW plan attached.
function plan_chebyshevtransform(x::AbstractArray{T,N}, ::Val{1}, dims...; kws...) where {T<:fftwNumber,N}
    if !isempty(x)
        return ChebyshevTransformPlan{T,1}(FFTW.plan_r2r(x, FIRSTKIND, dims...; kws...))
    end
    return ChebyshevTransformPlan{T,1,Vector{Int32},false,N,isempty(dims) ? NTuple{N,Int} : typeof(dims[1])}()
end
# Out-of-place second-kind plan; every dimension of `x` must have at least two entries.
function plan_chebyshevtransform(x::AbstractArray{T,N}, ::Val{2}, dims...; kws...) where {T<:fftwNumber,N}
    if any(≤(1), size(x))
        throw(ArgumentError("Array must contain at least 2 entries"))
    end
    return ChebyshevTransformPlan{T,2}(FFTW.plan_r2r(x, SECONDKIND, dims...; kws...))
end


# convert x if necessary
# A strided array that already has element type T is passed through untouched ...
function _maybemutablecopy(x::StridedArray{T}, ::Type{T}) where {T}
    return x
end
# ... anything else is materialised as an `Array{T}`.
function _maybemutablecopy(x, T)
    return Array{T}(x)
end
# Apply `P` into `y`, first converting `x` to a compatible array if needed.
@inline function _plan_mul!(y::AbstractArray{T}, P::Plan{T}, x::AbstractArray) where T
    return mul!(y, P, _maybemutablecopy(x, T))
end

# Call `op!` on the view `X[Ipre, ind, Ipost]` for every combination of leading and
# trailing indices, and return `X`.
function applydim!(op!, X::AbstractArray, Rpre, Rpost, ind)
    for Ipost in Rpost
        for Ipre in Rpre
            op!(view(X, Ipre, ind, Ipost))
        end
    end
    return X
end
# Same, with the leading/trailing index sets derived from the axes around dimension `d`.
function applydim!(op!, X::AbstractArray, d::Integer, ind)
    ax = axes(X)
    before = CartesianIndices(ax[1:d-1])
    after = CartesianIndices(ax[d+1:end])
    return applydim!(op!, X, before, after, ind)
end

# Generate ldiv_dim_begin!, ldiv_dim_end!, lmul_dim_begin! and lmul_dim_end!.
# Each applies ldiv!(α, ·) or lmul!(α, ·) to a single slice of `y` along dimension `d`.
for op in (:ldiv, :lmul)
    op_dim_begin! = Symbol(op, :_dim_begin!)
    op_dim_end! = Symbol(op, :_dim_end!)
    op! = Symbol(op, :!)
    @eval begin
        function $op_dim_begin!(α, d::Number, y::AbstractArray)
            # scale only the slice at index 1 along the d-th dimension
            d ∈ 1:ndims(y) || throw(ArgumentError("dimension $d must lie between 1 and $(ndims(y))"))
            applydim!(v -> $op!(α, v), y, d, 1)
        end

        function $op_dim_end!(α, d::Number, y::AbstractArray)
            # scale only the slice at index size(y, d) along the d-th dimension
            d ∈ 1:ndims(y) || throw(ArgumentError("dimension $d must lie between 1 and $(ndims(y))"))
            applydim!(v -> $op!(α, v), y, d, size(y, d))
        end
    end
end


# First-kind normalisation along one dimension: halve the leading slice,
# then divide everything by the extent of that dimension.
@inline function _cheb1_rescale!(d::Number, y::AbstractArray)
    ldiv_dim_begin!(2, d, y)
    extent = size(y, d)
    return ldiv!(extent, y)
end

# Product of the entries of `sz` selected by the indices in `d`, accumulated left to
# right starting from the integer 1.
function _prod_size(sz, d)
    return foldl((acc, k) -> acc * sz[k], d; init=1)
end


# First-kind normalisation along several dimensions: halve the leading slice of each,
# then divide everything by the product of their extents.
@inline function _cheb1_rescale!(d, y::AbstractArray)
    foreach(k -> ldiv_dim_begin!(2, k, y), d)
    return ldiv!(_prod_size(size(y), d), y)
end


# In-place first-kind transform: apply the FFTW plan, then normalise.
function *(P::ChebyshevTransformPlan{T,1,K,true,N}, x::AbstractArray{T,N}) where {T,K,N}
    if isempty(x)
        return x
    end
    transformed = P.plan*x # will be  === x if in-place
    return _cheb1_rescale!(P.plan.region, transformed)
end

# Out-of-place first-kind transform of `x` into `y`; sizes must match.
function mul!(y::AbstractArray{T,N}, P::ChebyshevTransformPlan{T,1,K,false,N}, x::AbstractArray{<:Any,N}) where {T,K,N}
    if size(y) != size(x)
        throw(DimensionMismatch("output must match dimension"))
    end
    isempty(x) && return y
    _plan_mul!(y, P.plan, x)
    return _cheb1_rescale!(P.plan.region, y)
end


# Second-kind normalisation along one dimension: halve the first and last slices,
# then divide everything by the extent of that dimension minus one.
function _cheb2_rescale!(d::Number, y::AbstractArray)
    ldiv_dim_begin!(2, d, y)
    ldiv_dim_end!(2, d, y)
    intervals = size(y, d) - 1
    return ldiv!(intervals, y)
end

# TODO: higher dimensional arrays
# Second-kind normalisation along several dimensions: halve the first and last slice of
# each, then divide everything by the product of (extent - 1) over those dimensions.
function _cheb2_rescale!(d, y::AbstractArray)
    foreach(d) do k
        ldiv_dim_begin!(2, k, y)
        ldiv_dim_end!(2, k, y)
    end

    return ldiv!(_prod_size(size(y) .- 1, d), y)
end

# In-place second-kind transform: apply the FFTW plan, then normalise.
function *(P::ChebyshevTransformPlan{T,2,K,true,N}, x::AbstractArray{T,N}) where {T,K,N}
    transformed = P.plan*x # will be  === x if in-place
    return _cheb2_rescale!(P.plan.region, transformed)
end

# Out-of-place second-kind transform of `x` into `y`; lengths must match.
function mul!(y::AbstractArray{T,N}, P::ChebyshevTransformPlan{T,2,K,false,N}, x::AbstractArray{<:Any,N}) where {T,K,N}
    if length(y) != length(x)
        throw(DimensionMismatch("output must match dimension"))
    end
    _plan_mul!(y, P.plan, x)
    return _cheb2_rescale!(P.plan.region, y)
end

# Out-of-place application: allocate the output and delegate to `mul!`.
function *(P::ChebyshevTransformPlan{T,kind,K,false,N}, x::AbstractArray{T,N}) where {T,kind,K,N}
    out = similar(x)
    return mul!(out, P, x)
end

"""
    chebyshevtransform!(x, kind=Val(1))

transforms from values on a Chebyshev grid of the first or second kind to Chebyshev
coefficients, in-place
"""
function chebyshevtransform!(x, dims...; kws...)
    plan = plan_chebyshevtransform!(x, dims...; kws...)
    return plan*x
end


"""
    chebyshevtransform(x, kind=Val(1))

transforms from values on a Chebyshev grid of the first or second kind to Chebyshev
coefficients.
"""
function chebyshevtransform(x, dims...; kws...)
    plan = plan_chebyshevtransform(x, dims...; kws...)
    return plan * x
end


## Inverse transforms take Chebyshev coefficients and produce values at Chebyshev points of the first and second kinds


# FFTW real-to-real kind used by the inverse first-kind planners: REDFT01 (FFTW's DCT-III).
const IFIRSTKIND = FFTW.REDFT01

# Inverse Chebyshev transform plan (coefficients -> values).
# `kind` selects first- or second-kind points; the remaining parameters mirror those of
# the wrapped FFTW r2r plan.
struct IChebyshevTransformPlan{T,kind,K,inplace,N,R} <: ChebyshevPlan{T}
    plan::FFTW.r2rFFTWPlan{T,K,inplace,N,R}
    IChebyshevTransformPlan{T,kind,K,inplace,N,R}(plan) where {T,kind,K,inplace,N,R} = new{T,kind,K,inplace,N,R}(plan)
    # Leaves `plan` undefined; used for empty arrays and for the non-FFTW fallbacks.
    IChebyshevTransformPlan{T,kind,K,inplace,N,R}() where {T,kind,K,inplace,N,R} = new{T,kind,K,inplace,N,R}()
end

# Convenience constructor: take every parameter except `kind` from the FFTW plan.
IChebyshevTransformPlan{T,kind}(F::FFTW.r2rFFTWPlan{T,K,inplace,N,R}) where {T,kind,K,inplace,N,R} =
    IChebyshevTransformPlan{T,kind,K,inplace,N,R}(F)


# second kind Chebyshev transforms share a plan with their inverse
# so we support this via inv
function inv(P::ChebyshevTransformPlan{T,2}) where {T}
    return IChebyshevTransformPlan{T,2}(P.plan)
end
function inv(P::IChebyshevTransformPlan{T,2}) where {T}
    return ChebyshevTransformPlan{T,2}(P.plan)
end

# first kind: wrap the FFTW plan held by the inverse of the stored plan
function inv(P::ChebyshevTransformPlan{T,1}) where {T}
    return IChebyshevTransformPlan{T,1}(inv(P.plan).p)
end
function inv(P::IChebyshevTransformPlan{T,1}) where {T}
    return ChebyshevTransformPlan{T,1}(inv(P.plan).p)
end


# Left division by a plan applies the inverse plan.
function \(P::ChebyshevTransformPlan, x::AbstractArray)
    return inv(P) * x
end
function \(P::IChebyshevTransformPlan, x::AbstractArray)
    return inv(P) * x
end


# In-place inverse first-kind plan; an empty array gets a plan with no FFTW plan attached.
function plan_ichebyshevtransform!(x::AbstractArray{T,N}, ::Val{1}, dims...; kws...) where {T<:fftwNumber,N}
    if !isempty(x)
        return IChebyshevTransformPlan{T,1}(FFTW.plan_r2r!(x, IFIRSTKIND, dims...; kws...))
    end
    return IChebyshevTransformPlan{T,1,Vector{Int32},true,N,isempty(dims) ? NTuple{N,Int} : typeof(dims[1])}()
end

# In-place inverse second-kind plan, obtained by inverting the forward plan.
function plan_ichebyshevtransform!(x::AbstractArray{T}, ::Val{2}, dims...; kws...) where T<:fftwNumber
    forward = plan_chebyshevtransform!(x, Val(2), dims...; kws...)
    return inv(forward)
end

# Out-of-place inverse first-kind plan; an empty array gets a plan with no FFTW plan attached.
function plan_ichebyshevtransform(x::AbstractArray{T,N}, ::Val{1}, dims...; kws...) where {T<:fftwNumber,N}
    if !isempty(x)
        return IChebyshevTransformPlan{T,1}(FFTW.plan_r2r(x, IFIRSTKIND, dims...; kws...))
    end
    return IChebyshevTransformPlan{T,1,Vector{Int32},false,N,isempty(dims) ? NTuple{N,Int} : typeof(dims[1])}()
end

# Out-of-place inverse second-kind plan, obtained by inverting the forward plan.
function plan_ichebyshevtransform(x::AbstractArray{T}, ::Val{2}, dims...; kws...) where T<:fftwNumber
    forward = plan_chebyshevtransform(x, Val(2), dims...; kws...)
    return inv(forward)
end

# Double the leading slice of `x` along dimension `d` ...
@inline function _icheb1_prescale!(d::Number, x::AbstractArray)
    lmul_dim_begin!(2, d, x)
    return x
end
# ... or along every dimension in `d`.
@inline function _icheb1_prescale!(d, x::AbstractArray)
    foreach(k -> _icheb1_prescale!(k, x), d)
    return x
end
# Halve the leading slice of `x` along dimension `d` (undoes the prescale) ...
@inline function _icheb1_postscale!(d::Number, x::AbstractArray)
    ldiv_dim_begin!(2, d, x)
    return x
end

# ... or along every dimension in `d`.
@inline function _icheb1_postscale!(d, x::AbstractArray)
    foreach(k -> _icheb1_postscale!(k, x), d)
    return x
end

# In-place inverse first-kind transform: prescale, apply the FFTW plan, and divide by
# 2 raised to the number of transformed dimensions.
function *(P::IChebyshevTransformPlan{T,1,K,true,N}, x::AbstractArray{T,N}) where {T<:fftwNumber,K,N}
    isempty(x) && return x

    region = P.plan.region
    _icheb1_prescale!(region, x)
    return ldiv!(2^length(region), P.plan*x)
end

# Out-of-place inverse first-kind transform of `x` into `y`; sizes must match.
# `x` is scaled temporarily and restored before returning.
function mul!(y::AbstractArray{T,N}, P::IChebyshevTransformPlan{T,1,K,false,N}, x::AbstractArray{T,N}) where {T<:fftwNumber,K,N}
    if size(y) != size(x)
        throw(DimensionMismatch("output must match dimension"))
    end
    isempty(x) && return y

    region = P.plan.region
    _icheb1_prescale!(region, x) # TODO: don't mutate x
    _plan_mul!(y, P.plan, x)
    _icheb1_postscale!(region, x)
    return ldiv!(2^length(region), y)
end

# Double the first and last slices of `x` along dimension `d` ...
@inline function _icheb2_prescale!(d::Number, x::AbstractArray)
    lmul_dim_begin!(2, d, x)
    lmul_dim_end!(2, d, x)
    return x
end
# ... or along every dimension in `d`.
@inline function _icheb2_prescale!(d, x::AbstractArray)
    foreach(k -> _icheb2_prescale!(k, x), d)
    return x
end

# Halve the first and last slices of `x` along dimension `d` (undoes the prescale) ...
@inline function _icheb2_postrescale!(d::Number, x::AbstractArray)
    ldiv_dim_begin!(2, d, x)
    ldiv_dim_end!(2, d, x)
    return x
end
# ... or along every dimension in `d`.
@inline function _icheb2_postrescale!(d, x::AbstractArray)
    foreach(k -> _icheb2_postrescale!(k, x), d)
    return x
end
# Prescale `y` along `d`, then multiply everything by (extent - 1)/2.
@inline function _icheb2_rescale!(d::Number, y::AbstractArray{T}) where T
    _icheb2_prescale!(d, y)
    factor = convert(T, size(y,d) - 1)/2
    lmul!(factor, y)
    return y
end
# Multi-dimensional version: the factor is the product of (extent - 1)/2 over `d`.
@inline function _icheb2_rescale!(d, y::AbstractArray{T}) where T
    _icheb2_prescale!(d, y)
    factor = _prod_size(convert.(T, size(y) .- 1)./2, d)
    lmul!(factor, y)
    return y
end

# In-place inverse second-kind transform: prescale, apply the forward plan, rescale.
function *(P::IChebyshevTransformPlan{T,2,K,true,N}, x::AbstractArray{T,N}) where {T<:fftwNumber,K,N}
    region = P.plan.region
    _icheb2_prescale!(region, x)
    transformed = inv(P)*x
    return _icheb2_rescale!(region, transformed)
end

# Out-of-place inverse second-kind transform of `x` into `y`; lengths must match.
# `x` is scaled temporarily and restored before the output is rescaled.
function mul!(y::AbstractArray{T,N}, P::IChebyshevTransformPlan{T,2,K,false,N}, x::AbstractArray{<:Any,N}) where {T<:fftwNumber,K,N}
    if length(y) != length(x)
        throw(DimensionMismatch("output must match dimension"))
    end

    region = P.plan.region
    _icheb2_prescale!(region, x)
    _plan_mul!(y, inv(P), x)
    _icheb2_postrescale!(region, x)
    return _icheb2_rescale!(region, y)
end

# Out-of-place application: allocate the output and delegate to `mul!`.
function *(P::IChebyshevTransformPlan{T,kind,K,false,N}, x::AbstractArray{T,N}) where {T,kind,K,N}
    out = similar(x)
    return mul!(out, P, _maybemutablecopy(x, T))
end
# Plan and apply the inverse transform in place.
function ichebyshevtransform!(x::AbstractArray, dims...; kwds...)
    plan = plan_ichebyshevtransform!(x, dims...; kwds...)
    return plan*x
end
# Plan and apply the inverse transform out of place.
function ichebyshevtransform(x, dims...; kwds...)
    plan = plan_ichebyshevtransform(x, dims...; kwds...)
    return plan*x
end


#######
# Chebyshev U
#######

# FFTW real-to-real kinds used by the Chebyshev U planners:
# RODFT10 (FFTW's DST-II) for Val(1), RODFT00 (FFTW's DST-I) for Val(2).
const UFIRSTKIND = FFTW.RODFT10
const USECONDKIND = FFTW.RODFT00

# Forward Chebyshev U transform plan (values -> second-kind polynomial coefficients).
# `kind` selects first- or second-kind points; the remaining parameters mirror those of
# the wrapped FFTW r2r plan.
struct ChebyshevUTransformPlan{T,kind,K,inplace,N,R} <: ChebyshevPlan{T}
    plan::FFTW.r2rFFTWPlan{T,K,inplace,N,R}
    ChebyshevUTransformPlan{T,kind,K,inplace,N,R}(plan) where {T,kind,K,inplace,N,R} = new{T,kind,K,inplace,N,R}(plan)
    # Leaves `plan` undefined; used for empty arrays.
    ChebyshevUTransformPlan{T,kind,K,inplace,N,R}() where {T,kind,K,inplace,N,R} = new{T,kind,K,inplace,N,R}()
end

# Convenience constructor: take every parameter except `kind` from the FFTW plan.
ChebyshevUTransformPlan{T,kind}(plan::FFTW.r2rFFTWPlan{T,K,inplace,N,R}) where {T,kind,K,inplace,N,R} =
    ChebyshevUTransformPlan{T,kind,K,inplace,N,R}(plan)


# In-place first-kind U plan; an empty array gets a plan with no FFTW plan attached.
function plan_chebyshevutransform!(x::AbstractArray{T,N}, ::Val{1}, dims...; kws...) where {T<:fftwNumber,N}
    if !isempty(x)
        return ChebyshevUTransformPlan{T,1}(FFTW.plan_r2r!(x, UFIRSTKIND, dims...; kws...))
    end
    return ChebyshevUTransformPlan{T,1,Vector{Int32},true,N,isempty(dims) ? NTuple{N,Int} : typeof(dims[1])}()
end
# In-place second-kind U plan; every dimension of `x` must have at least two entries.
function plan_chebyshevutransform!(x::AbstractArray{T,N}, ::Val{2}, dims...; kws...) where {T<:fftwNumber,N}
    if any(≤(1), size(x))
        throw(ArgumentError("Array must contain at least 2 entries"))
    end
    return ChebyshevUTransformPlan{T,2}(FFTW.plan_r2r!(x, USECONDKIND, dims...; kws...))
end

# Out-of-place first-kind U plan; an empty array gets a plan with no FFTW plan attached.
function plan_chebyshevutransform(x::AbstractArray{T,N}, ::Val{1}, dims...; kws...) where {T<:fftwNumber,N}
    if !isempty(x)
        return ChebyshevUTransformPlan{T,1}(FFTW.plan_r2r(x, UFIRSTKIND, dims...; kws...))
    end
    return ChebyshevUTransformPlan{T,1,Vector{Int32},false,N,isempty(dims) ? NTuple{N,Int} : typeof(dims[1])}()
end
# Out-of-place second-kind U plan. Only the transformed dimensions (all of them when
# `dims` is omitted) are required to have at least two entries.
function plan_chebyshevutransform(x::AbstractArray{T,N}, ::Val{2}, dims...; kws...) where {T<:fftwNumber,N}
    if !isempty(dims)
        for d in dims[1]
            size(x, d) > 1 || throw(ArgumentError("Array must contain at least 2 entries"))
        end
    elseif any(≤(1), size(x))
        throw(ArgumentError("Array must contain at least 2 entries"))
    end
    return ChebyshevUTransformPlan{T,2}(FFTW.plan_r2r(x, USECONDKIND, dims...; kws...))
end

# Generate the public-facing scaling wrappers. For each name `f`, the kernel `_f` (one
# more leading underscore) does the work along a single dimension; the wrappers add a
# dimension check and a method that loops over a collection of dimensions.
for f in [:_chebu1_prescale!, :_chebu1_postscale!, :_chebu2_prescale!, :_chebu2_postscale!,
            :_ichebu1_postscale!]
    _f = Symbol(:_, f)
    @eval begin
        @inline function $f(d::Number, X::AbstractArray)
            # Throw a proper exception type rather than a bare string.
            d ∈ 1:ndims(X) || throw(ArgumentError("dimension $d must lie between 1 and $(ndims(X))"))
            $_f(d, X)
            X
        end
        @inline function $f(d, y::AbstractArray)
            for k in d
                $f(k, y)
            end
            y
        end
    end
end

# Multiply every fibre of `X` along dimension `d` by sinpi((2k-1)/(2m))/m, k = 1:m.
function __chebu1_prescale!(d::Number, X::AbstractArray{T}) where {T}
    m = size(X,d)
    r = one(T)/(2m) .+ ((1:m) .- one(T))./m
    weights = sinpi.(r) ./ m
    return applydim!(v -> v .*= weights, X, d, :)
end

# Divide every fibre of `X` along dimension `d` by the same weights (undoes the prescale).
@inline function __chebu1_postscale!(d::Number, X::AbstractArray{T}) where {T}
    m = size(X,d)
    r = one(T)/(2m) .+ ((1:m) .- one(T))./m
    weights = sinpi.(r) ./ m
    return applydim!(v -> v ./= weights, X, d, :)
end

# In-place first-kind U transform: weight the values, then apply the FFTW plan.
# Arrays with at most one entry are returned unchanged.
function *(P::ChebyshevUTransformPlan{T,1,K,true,N}, x::AbstractArray{T,N}) where {T,K,N}
    if length(x) ≤ 1
        return x
    end
    _chebu1_prescale!(P.plan.region, x)
    return P.plan * x
end

# Out-of-place first-kind U transform of `x` into `y`; sizes must match.
# `x` is weighted temporarily and restored afterwards.
function mul!(y::AbstractArray{T}, P::ChebyshevUTransformPlan{T,1,K,false}, x::AbstractArray{T}) where {T,K}
    if size(y) != size(x)
        throw(DimensionMismatch("output must match dimension"))
    end
    isempty(x) && return y
    region = P.plan.region
    _chebu1_prescale!(region, x) # Todo don't mutate x
    _plan_mul!(y, P.plan, x)
    _chebu1_postscale!(region, x)
    for d in region
        if size(y,d) == 1
            ldiv!(2, y) # fix doubling
        end
    end
    return y
end


# Multiply every fibre of `X` along dimension `d` by sinpi(k/(m+1)), k = 1:m.
@inline function __chebu2_prescale!(d, X::AbstractArray{T}) where {T}
    m = size(X,d)
    c = one(T)/ (m+1)
    weights = sinpi.((1:m) .* c)
    return applydim!(v -> v .*= weights, X, d, :)
end

# Divide every fibre of `X` along dimension `d` by the same weights.
@inline function __chebu2_postscale!(d::Number, X::AbstractArray{T}) where {T}
    m = size(X,d)
    c = one(T)/ (m+1)
    weights = sinpi.((1:m) .* c)
    return applydim!(v -> v ./= weights, X, d, :)
end

# In-place second-kind U transform: weight the values, apply the FFTW plan, and scale by
# the product of 1/(extent + 1) over the transformed dimensions.
function *(P::ChebyshevUTransformPlan{T,2,K,true,N}, x::AbstractArray{T,N}) where {T,K,N}
    region = P.plan.region
    scale = one(T)
    for d in region
        scale *= one(T)/(size(x,d)+1)
    end
    _chebu2_prescale!(region, x)
    return lmul!(scale, P.plan * x)
end

# Out-of-place second-kind U transform of `x` into `y`.
# `x` is weighted temporarily and restored afterwards.
function mul!(y::AbstractArray{T}, P::ChebyshevUTransformPlan{T,2,K,false}, x::AbstractArray{T}) where {T,K}
    region = P.plan.region
    scale = one(T)
    for d in region
        scale *= one(T)/(size(x,d)+1)
    end
    _chebu2_prescale!(region, x) # TODO don't mutate x
    _plan_mul!(y, P.plan, x)
    _chebu2_postscale!(region, x)
    return lmul!(scale, y)
end

# Out-of-place application: allocate the output and delegate to `mul!`.
function *(P::ChebyshevUTransformPlan{T,kind,K,false,N}, x::AbstractArray{T,N}) where {T,kind,K,N}
    out = similar(x)
    return mul!(out, P, x)
end

# Plan and apply the U transform in place.
function chebyshevutransform!(x::AbstractArray{T}, dims...; kws...) where {T<:fftwNumber}
    plan = plan_chebyshevutransform!(x, dims...; kws...)
    return plan*x
end


"""
    chebyshevutransform(x, ::Val{kind}=Val(1))

transforms from values on a Chebyshev grid of the first or second kind to Chebyshev
coefficients of the 2nd kind (Chebyshev U expansion).
"""
function chebyshevutransform(x, dims...; kws...)
    plan = plan_chebyshevutransform(x, dims...; kws...)
    return plan*x
end


## Inverse transforms take ChebyshevU coefficients and produce values at ChebyshevU points of the first and second kinds
# FFTW real-to-real kind used by the inverse first-kind U planners: RODFT01 (FFTW's DST-III).
const IUFIRSTKIND = FFTW.RODFT01

# Inverse Chebyshev U transform plan (second-kind polynomial coefficients -> values).
# `kind` selects first- or second-kind points; the remaining parameters mirror those of
# the wrapped FFTW r2r plan.
struct IChebyshevUTransformPlan{T,kind,K,inplace,N,R} <: ChebyshevPlan{T}
    plan::FFTW.r2rFFTWPlan{T,K,inplace,N,R}
    IChebyshevUTransformPlan{T,kind,K,inplace,N,R}(plan) where {T,kind,K,inplace,N,R} = new{T,kind,K,inplace,N,R}(plan)
    # Leaves `plan` undefined; used for empty arrays.
    IChebyshevUTransformPlan{T,kind,K,inplace,N,R}() where {T,kind,K,inplace,N,R} = new{T,kind,K,inplace,N,R}()
end

# Convenience constructor: take every parameter except `kind` from the FFTW plan.
IChebyshevUTransformPlan{T,kind}(F::FFTW.r2rFFTWPlan{T,K,inplace,N,R}) where {T,kind,K,inplace,N,R} =
    IChebyshevUTransformPlan{T,kind,K,inplace,N,R}(F)

# In-place inverse first-kind U plan; an empty array gets a plan with no FFTW plan attached.
function plan_ichebyshevutransform!(x::AbstractArray{T,N}, ::Val{1}, dims...; kws...) where {T<:fftwNumber,N}
    if !isempty(x)
        return IChebyshevUTransformPlan{T,1}(FFTW.plan_r2r!(x, IUFIRSTKIND, dims...; kws...))
    end
    return IChebyshevUTransformPlan{T,1,Vector{Int32},true,N,isempty(dims) ? NTuple{N,Int} : typeof(dims[1])}()
end
# In-place inverse second-kind U plan; every dimension of `x` must have at least two entries.
# Keyword arguments are forwarded to FFTW, as in every other planner in this file.
function plan_ichebyshevutransform!(x::AbstractArray{T,N}, ::Val{2}, dims...; kws...) where {T<:fftwNumber,N}
    any(≤(1),size(x)) && throw(ArgumentError("Array must contain at least 2 entries"))
    IChebyshevUTransformPlan{T,2}(FFTW.plan_r2r!(x, USECONDKIND, dims...; kws...))
end

# Out-of-place inverse first-kind U plan; an empty array gets a plan with no FFTW plan attached.
function plan_ichebyshevutransform(x::AbstractArray{T,N}, ::Val{1}, dims...; kws...) where {T<:fftwNumber,N}
    if !isempty(x)
        return IChebyshevUTransformPlan{T,1}(FFTW.plan_r2r(x, IUFIRSTKIND, dims...; kws...))
    end
    return IChebyshevUTransformPlan{T,1,Vector{Int32},false,N,isempty(dims) ? NTuple{N,Int} : typeof(dims[1])}()
end
# Out-of-place inverse second-kind U plan; every dimension of `x` must have at least two entries.
function plan_ichebyshevutransform(x::AbstractArray{T,N}, ::Val{2}, dims...; kws...) where {T<:fftwNumber,N}
    if any(≤(1), size(x))
        throw(ArgumentError("Array must contain at least 2 entries"))
    end
    return IChebyshevUTransformPlan{T,2}(FFTW.plan_r2r(x, USECONDKIND, dims...; kws...))
end


# second kind Chebyshev transforms share a plan with their inverse
# so we support this via inv
function inv(P::ChebyshevUTransformPlan{T,2}) where {T}
    return IChebyshevUTransformPlan{T,2}(P.plan)
end
function inv(P::IChebyshevUTransformPlan{T,2}) where {T}
    return ChebyshevUTransformPlan{T,2}(P.plan)
end

# first kind: wrap the FFTW plan held by the inverse of the stored plan
function inv(P::ChebyshevUTransformPlan{T,1}) where {T}
    return IChebyshevUTransformPlan{T,1}(inv(P.plan).p)
end
function inv(P::IChebyshevUTransformPlan{T,1}) where {T}
    return ChebyshevUTransformPlan{T,1}(inv(P.plan).p)
end

# Divide every fibre of `X` along dimension `d` by 2*sinpi((2k-1)/(2m)), k = 1:m.
@inline function __ichebu1_postscale!(d::Number, X::AbstractArray{T}) where {T}
    m = size(X,d)
    r = one(T)/(2m) .+ ((1:m) .- one(T))/m
    weights = 2 .* sinpi.(r)
    return applydim!(v -> v ./= weights, X, d, :)
end

# In-place inverse first-kind U transform: apply the FFTW plan, then unweight.
# Arrays with at most one entry are returned unchanged.
function *(P::IChebyshevUTransformPlan{T,1,K,true}, x::AbstractArray{T}) where {T<:fftwNumber,K}
    if length(x) ≤ 1
        return x
    end
    transformed = P.plan * x
    return _ichebu1_postscale!(P.plan.region, transformed)
end

# Out-of-place inverse first-kind U transform of `x` into `y`; sizes must match.
function mul!(y::AbstractArray{T}, P::IChebyshevUTransformPlan{T,1,K,false}, x::AbstractArray{T}) where {T<:fftwNumber,K}
    if size(y) != size(x)
        throw(DimensionMismatch("output must match dimension"))
    end
    isempty(x) && return y
    region = P.plan.region
    _plan_mul!(y, P.plan, x)
    _ichebu1_postscale!(region, y)
    for d in region
        if size(y,d) == 1
            lmul!(2, y) # fix doubling
        end
    end
    return y
end

# Unweight `x` along dimension `d` and halve the whole array ...
function _ichebu2_rescale!(d::Number, x::AbstractArray{T}) where T
    _chebu2_postscale!(d, x)
    ldiv!(2, x)
    return x
end

# ... or do so once for every dimension in `d`.
@inline function _ichebu2_rescale!(d, y::AbstractArray)
    foreach(k -> _ichebu2_rescale!(k, y), d)
    return y
end

# In-place inverse second-kind U transform: apply the FFTW plan, then rescale.
# Arrays with at most one entry are returned unchanged.
function *(P::IChebyshevUTransformPlan{T,2,K,true}, x::AbstractArray{T}) where {T<:fftwNumber,K}
    if length(x) ≤ 1
        return x
    end

    transformed = P.plan * x
    return _ichebu2_rescale!(P.plan.region, transformed)
end

# Out-of-place inverse second-kind U transform of `x` into `y`; sizes must match.
# Always returns the destination `y`.
function mul!(y::AbstractArray{T}, P::IChebyshevUTransformPlan{T,2,K,false}, x::AbstractArray{T}) where {T<:fftwNumber,K}
    size(y) == size(x) || throw(DimensionMismatch("output must match dimension"))
    # The in-place method leaves arrays with at most one entry unchanged; mirror that
    # here by copying into the destination instead of returning the input.
    length(x) ≤ 1 && return copyto!(y, x)

    _plan_mul!(y, P.plan, x)
    _ichebu2_rescale!(P.plan.region, y)
end

# Plan and apply the inverse U transform in place.
function ichebyshevutransform!(x::AbstractArray{T}, dims...; kwds...) where {T<:fftwNumber}
    plan = plan_ichebyshevutransform!(x, dims...; kwds...)
    return plan*x
end

# Plan and apply the inverse U transform out of place.
function ichebyshevutransform(x, dims...; kwds...)
    plan = plan_ichebyshevutransform(x, dims...; kwds...)
    return plan*x
end

# Out-of-place application: allocate the output and delegate to `mul!`.
function *(P::IChebyshevUTransformPlan{T,k,K,false,N}, x::AbstractArray{T,N}) where {T,k,K,N}
    out = similar(x)
    return mul!(out, P, x)
end


## Code generation for integer inputs
# Integer vectors are converted to the matching floating-point type before transforming.

for func in (:chebyshevtransform, :ichebyshevtransform, :chebyshevutransform, :ichebyshevutransform)
    @eval function $func(x::AbstractVector{T}, dims...; kwds...) where {T<:Integer}
        return $func(convert(AbstractVector{float(T)}, x), dims...; kwds...)
    end
end


## points

# Lazy vector of `n` Chebyshev points; `kind` is 1 or 2 and `T` is the element type.
# Only the number of points is stored.
struct ChebyshevGrid{kind,T} <: AbstractVector{T}
    n::Int
    # First-kind grids may be empty.
    function ChebyshevGrid{1,T}(n::Int) where T
        n ≥ 0 || throw(ArgumentError("Number of points must be nonnegative"))
        new{1,T}(n)
    end
    # Second-kind grids need at least two points.
    function ChebyshevGrid{2,T}(n::Int) where T
        n ≥ 2 || throw(ArgumentError("Number of points must be at least 2"))
        new{2,T}(n)
    end
end

# Default element type is Float64.
function ChebyshevGrid{kind}(n::Integer) where kind
    return ChebyshevGrid{kind,Float64}(n)
end

function size(g::ChebyshevGrid)
    return (g.n,)
end
# k-th first-kind point: sinpi((n-2k+1)/(2n)).
function getindex(g::ChebyshevGrid{1,T}, k::Integer) where T
    numerator = convert(T, g.n-2k+1)
    return sinpi(numerator/(2g.n))
end

# k-th second-kind point: sinpi((n-2k+1)/(2n-2)).
function getindex(g::ChebyshevGrid{2,T}, k::Integer) where T
    numerator = convert(T, g.n-2k+1)
    return sinpi(numerator/(2g.n-2))
end

# Grid of `n` Chebyshev points of the given kind with element type `T`.
function chebyshevpoints(::Type{T}, n::Integer, ::Val{kind}) where {T<:Number,kind}
    return ChebyshevGrid{kind,T}(n)
end
# The kind defaults to Val(1) ...
function chebyshevpoints(::Type{T}, n::Integer) where T
    return chebyshevpoints(T, n, Val(1))
end
# ... and the element type to Float64.
function chebyshevpoints(n::Integer, kind=Val(1))
    return chebyshevpoints(Float64, n, kind)
end


# sin(nθ) coefficients to values at Clenshaw-Curtis nodes except ±1
#
# struct DSTPlan{T,kind,inplace,P} <: Plan{T}
#     plan::P
# end
#
# DSTPlan{k,inp}(plan) where {k,inp} =
#     DSTPlan{eltype(plan),k,inp,typeof(plan)}(plan)
#
#
# plan_DSTI!(x) = length(x) > 0 ? DSTPlan{1,true}(FFTW.FFTW.plan_r2r!(x, FFTW.FFTW.RODFT00)) :
#                                 fill(one(T),1,length(x))
#
# function *(P::DSTPlan{T,1}, x::AbstractArray) where {T}
#     x = P.plan*x
#     rmul!(x,half(T))
# end


###
# BigFloat
# Use `Nothing` and fall back to FFT
###


# Generic planners for element types without an FFTW plan: the returned plan carries
# `Nothing` in the slot that otherwise holds the FFTW kind-vector type and stores no plan.
function plan_chebyshevtransform(x::AbstractArray{T,N}, ::Val{kind}, dims...; kws...) where {T,N,kind}
    return ChebyshevTransformPlan{T,kind,Nothing,false,N,UnitRange{Int}}()
end
function plan_ichebyshevtransform(x::AbstractArray{T,N}, ::Val{kind}, dims...; kws...) where {T,N,kind}
    return IChebyshevTransformPlan{T,kind,Nothing,false,N,UnitRange{Int}}()
end

function plan_chebyshevtransform!(x::AbstractArray{T,N}, ::Val{kind}, dims...; kws...) where {T,N,kind}
    return ChebyshevTransformPlan{T,kind,Nothing,true,N,UnitRange{Int}}()
end
function plan_ichebyshevtransform!(x::AbstractArray{T,N}, ::Val{kind}, dims...; kws...) where {T,N,kind}
    return IChebyshevTransformPlan{T,kind,Nothing,true,N,UnitRange{Int}}()
end


#following Chebfun's @Chebtech1/vals2coeffs.m and @Chebtech2/vals2coeffs.m
# First-kind transform for plans without an FFTW plan, computed with a generic FFT.
# A single value is returned as is (the input itself, not a copy).
function *(P::ChebyshevTransformPlan{T,1,Nothing,false}, x::AbstractVector{T}) where T
    n = length(x)
    if n == 1
        x
    else
        # Phase factors applied to the first n entries of the length-2n inverse FFT.
        w = [2exp(im*convert(T,π)*k/2n) for k=0:n-1]
        # Mirror the values to length 2n before transforming.
        ret = w.*ifft([x;reverse(x)])[1:n]
        ret = T<:Real ? real(ret) : ret
        # The leading coefficient carries half the weight of the others.
        ret[1] /= 2
        ret
    end
end


# function *(P::ChebyshevTransformPlan{T,1,K,Nothing,false}, x::AbstractVector{T}) where {T,K}
#     n = length(x)
#     if n == 1
#         x
#     else
#         ret = ifft([x;x[end:-1:2]])[1:n]
#         ret = T<:Real ? real(ret) : ret
#         ret[2:n-1] *= 2
#         ret
#     end
# end


# In-place variant: compute with the out-of-place plan and copy the result back into `x`.
function *(P::ChebyshevTransformPlan{T,1,Nothing,true,N,R}, x::AbstractVector{T}) where {T,N,R}
    outofplace = ChebyshevTransformPlan{T,1,Nothing,false,N,R}()
    return copyto!(x, outofplace * x)
end
# *(P::ChebyshevTransformPlan{T,2,true,Nothing}, x::AbstractVector{T}) where T =
#     copyto!(x, ChebyshevTransformPlan{T,2,false,Nothing}() * x)


# Out-of-place kind-1 inverse Chebyshev transform for plans without an FFTW plan.
# NOTE(review): the original comment cited Chebfun's @Chebtech1/vals2coeffs.m and
# @Chebtech2/vals2coeffs.m; for the inverse direction the matching Chebfun routine
# is presumably coeffs2vals.m — confirm.
# The input is extended to length 2n as [x; 1; x[n], ..., x[2]], weighted, and passed
# through `fft`; the filler entry at position n+1 receives weight zero.
# A length-1 input is returned as is (the same array, not a copy).
function *(P::IChebyshevTransformPlan{T,1,Nothing,false}, x::AbstractVector{T}) where T
    n = length(x)
    n == 1 && return x
    twiddle = [exp(-im*convert(T,π)*k/2n)/2 for k=0:2n-1]
    twiddle[1] *= 2
    twiddle[n+1] *= 0
    twiddle[n+2:end] *= -1
    vals = fft(twiddle.*[x;one(T);x[end:-1:2]])
    # real coefficients give real values; drop the zero imaginary parts
    vals = T<:Real ? real(vals) : vals
    return vals[1:n]
end

# function *(P::IChebyshevTransformPlan{T,2,K,Nothing,true}, x::AbstractVector{T}) where {T,K}
#     n = length(x)
#     if n == 1
#         x
#     else
#         x[1] *= 2; x[end] *= 2
#         chebyshevtransform!(x, Val(2))
#         x[1] *= 2; x[end] *= 2
#         lmul!(convert(T,n-1)/2, x)
#         x
#     end
# end

# In-place kind-1 inverse transform without an FFTW plan: run the out-of-place
# method and copy its result back into `x`.
function *(P::IChebyshevTransformPlan{T,1,Nothing,true,N,R}, x::AbstractVector{T}) where {T,N,R}
    outofplace = IChebyshevTransformPlan{T,1,Nothing,false,N,R}()
    return copyto!(x, outofplace * x)
end
# *(P::IChebyshevTransformPlan{T,SECONDKIND,false,Nothing}, x::AbstractVector{T}) where T =
#     IChebyshevTransformPlan{T,SECONDKIND,true,Nothing}() * copy(x)


# For every Chebyshev planner listed below, generate two convenience methods:
#   1. `pln(x, dims...; kws...)`      — no kind given: forwards with `Val(1)`.
#   2. `pln(T, szs, dims...; kwds...)` — element type and sizes given: forwards with
#      an uninitialised `Array{T}` of size `szs` in place of the data array.
for pln in (:plan_chebyshevtransform!, :plan_chebyshevtransform, 
            :plan_chebyshevutransform!, :plan_chebyshevutransform, 
            :plan_ichebyshevutransform, :plan_ichebyshevutransform!, 
            :plan_ichebyshevtransform, :plan_ichebyshevtransform!)
    @eval begin
        $pln(x::AbstractArray, dims...; kws...) = $pln(x, Val(1), dims...; kws...)
        $pln(::Type{T}, szs, dims...; kwds...) where T = $pln(Array{T}(undef, szs...), dims...; kwds...)
    end
end


================================================
FILE: src/clenshawcurtis.jl
================================================
# Plan handed to `clenshawcurtisweights!`: an in-place FFTW REDFT00 r2r plan when `μ`
# has more than one entry, otherwise the adjoint of a ones-filled array shaped like `μ`.
function plan_clenshawcurtis(μ)
    if length(μ) > 1
        return FFTW.plan_r2r!(μ, FFTW.REDFT00)
    else
        return fill!(similar(μ),1)'
    end
end

"""
    clenshawcurtisnodes(T, N)

Return the nodes of the `N`-point Clenshaw–Curtis quadrature rule, computed as
`chebyshevpoints(T, N, Val(2))`.
"""
function clenshawcurtisnodes(::Type{T}, N::Int) where T
    return chebyshevpoints(T, N, Val(2))
end

"""
    clenshawcurtisweights(μ)
    clenshawcurtisweights!(μ)
    clenshawcurtisweights!(μ, plan)

Compute weights of the Clenshaw–Curtis quadrature rule with modified Chebyshev moments of the first kind ``\\mu``.

`clenshawcurtisweights` works on a copy of `μ`; the `!` methods overwrite `μ` and
return it. `plan` must transform `μ` in place when applied as `plan*μ`; by default
it is `plan_clenshawcurtis(μ)`.

With fewer than two moments `μ` is returned unchanged: the usual scaling by
`1/(N-1)` is undefined for `N == 1`, and the single weight is taken to be the
zeroth moment itself.
"""
clenshawcurtisweights(μ::Vector) = clenshawcurtisweights!(copy(μ))
clenshawcurtisweights!(μ::Vector) = clenshawcurtisweights!(μ, plan_clenshawcurtis(μ))
function clenshawcurtisweights!(μ::Vector{T}, plan) where T
    N = length(μ)
    # Degenerate sizes: inv(N-1) is a division by zero for N == 1 (giving Inf/NaN
    # weights) and μ[1] is out of bounds for N == 0.
    N ≤ 1 && return μ
    rmul!(μ, inv(N-one(T)))
    # the plan is in place, so the product's return value is not needed
    plan*μ
    # the two endpoint weights are halved
    μ[1] *= half(T); μ[N] *= half(T)
    return μ
end


================================================
FILE: src/docstrings.jl
================================================
"""
	leg2cheb(v::AbstractVector; normleg::Bool=false, normcheb::Bool=false)

Convert the vector of expansion coefficients `v` from a Legendre to a Chebyshev basis.
The keyword arguments denote whether the bases are normalized.
"""
leg2cheb

"""
	cheb2leg(v::AbstractVector; normcheb::Bool=false, normleg::Bool=false)

Convert the vector of expansion coefficients `v` from a Chebyshev to a Legendre basis.
The keyword arguments denote whether the bases are normalized.
"""
cheb2leg

"""
	ultra2ultra(v::AbstractVector, λ, μ; norm1::Bool=false, norm2::Bool=false)

Convert the vector of expansion coefficients `v` from an Ultraspherical basis of
order `λ` to an Ultraspherical basis of order `μ`.
The keyword arguments denote whether the bases are normalized.
"""
ultra2ultra

"""
	jac2jac(v::AbstractVector, α, β, γ, δ; norm1::Bool=false, norm2::Bool=false)

Convert the vector of expansion coefficients `v` from a Jacobi basis of
order `(α,β)` to a Jacobi basis of order `(γ,δ)`.
The keyword arguments denote whether the bases are normalized.
"""
jac2jac

"""
	lag2lag(v::AbstractVector, α, β; norm1::Bool=false, norm2::Bool=false)

Convert the vector of expansion coefficients `v` from a Laguerre basis of
order `α` to a Laguerre basis of order `β`.
The keyword arguments denote whether the bases are normalized."""
lag2lag

"""
	jac2ultra(v::AbstractVector, α, β, λ; normjac::Bool=false, normultra::Bool=false)

Convert the vector of expansion coefficients `v` from a Jacobi basis of
order `(α,β)` to an Ultraspherical basis of order `λ`.
The keyword arguments denote whether the bases are normalized."""
jac2ultra

"""
	ultra2jac(v::AbstractVector, λ, α, β; normultra::Bool=false, normjac::Bool=false)

Convert the vector of expansion coefficients `v` from an Ultraspherical basis of
order `λ` to a Jacobi basis of order `(α,β)`.
The keyword arguments denote whether the bases are normalized.
"""
ultra2jac

"""
	jac2cheb(v::AbstractVector, α, β; normjac::Bool=false, normcheb::Bool=false)

Convert the vector of expansion coefficients `v` from a Jacobi basis of
order `(α,β)` to a Chebyshev basis.
The keyword arguments denote whether the bases are normalized.
"""
jac2cheb

"""
	cheb2jac(v::AbstractVector, α, β; normcheb::Bool=false, normjac::Bool=false)

Convert the vector of expansion coefficients `v` from a Chebyshev basis to a
Jacobi basis of order `(α,β)`.
The keyword arguments denote whether the bases are normalized.
"""
cheb2jac

"""
	ultra2cheb(v::AbstractVector, λ; normultra::Bool=false, normcheb::Bool=false)

Convert the vector of expansion coefficients `v` from an Ultraspherical basis of
order `λ` to a Chebyshev basis.
The keyword arguments denote whether the bases are normalized.
"""
ultra2cheb

"""
	cheb2ultra(v::AbstractVector, λ; normcheb::Bool=false, normultra::Bool=false)

Convert the vector of expansion coefficients `v` from a Chebyshev basis
to an Ultraspherical basis of order `λ`.
The keyword arguments denote whether the bases are normalized.
"""
cheb2ultra

"""
	associatedjac2jac(v::AbstractVector, c::Integer, α, β, γ, δ; norm1::Bool=false, norm2::Bool=false)

Convert the vector of expansion coefficients `v` from an associated Jacobi basis
of orders `(α,β)` to a Jacobi basis of order `(γ,δ)`.
The keyword arguments denote whether the bases are normalized.
"""
associatedjac2jac

"""
	modifiedjac2jac(v::AbstractVector{T}, α, β, u::Vector{T}; verbose::Bool=false) where {T}
	modifiedjac2jac(v::AbstractVector{T}, α, β, u::Vector{T}, v::Vector{T}; verbose::Bool=false) where {T}
"""
modifiedjac2jac

"""
	modifiedlag2lag(v::AbstractVector{T}, α, u::Vector{T}; verbose::Bool=false)
	modifiedlag2lag(v::AbstractVector{T}, α, u::Vector{T}, v::Vector{T}; verbose::Bool=false) where {T}
"""
modifiedlag2lag

"""
	modifiedherm2herm(v::AbstractVector{T}, u::Vector{T}; verbose::Bool=false)
	modifiedherm2herm(v::AbstractVector{T}, u::Vector{T}, v::Vector{T}; verbose::Bool=false) where {T}
"""
modifiedherm2herm


================================================
FILE: src/elliptic.jl
================================================
"""
`FastTransforms` submodule for the computation of some elliptic integrals and functions.

Complete elliptic integrals of the first and second kinds:
```math
K(k) = \\int_0^{\\frac{\\pi}{2}} \\frac{{\\rm d}\\theta}{\\sqrt{1-k^2\\sin^2\\theta}},\\quad{\\rm and},
```
```math
E(k) = \\int_0^{\\frac{\\pi}{2}} \\sqrt{1-k^2\\sin^2\\theta} {\\rm\\,d}\\theta.
```

Jacobian elliptic functions:
```math
x = \\int_0^{\\operatorname{sn}(x,k)} \\frac{{\\rm d}t}{\\sqrt{(1-t^2)(1-k^2t^2)}},
```
```math
x = \\int_{\\operatorname{cn}(x,k)}^1 \\frac{{\\rm d}t}{\\sqrt{(1-t^2)[1-k^2(1-t^2)]}},
```
```math
x = \\int_{\\operatorname{dn}(x,k)}^1 \\frac{{\\rm d}t}{\\sqrt{(1-t^2)(t^2-1+k^2)}},
```
and the remaining nine are defined by:
```math
\\operatorname{pq}(x,k) = \\frac{\\operatorname{pr}(x,k)}{\\operatorname{qr}(x,k)} = \\frac{1}{\\operatorname{qp}(x,k)}.
```
"""
module Elliptic

import FastTransforms: libfasttransforms

export K, E,
       sn, cn, dn, ns, nc, nd,
       sc, cs, sd, ds, cd, dc

# Complete elliptic integrals. Both kinds go through the same C routine; its first
# argument is the character '1' (for K) or '2' (for E), passed as a Cint.
for (fC, elty) in ((:ft_complete_elliptic_integralf, :Float32), (:ft_complete_elliptic_integral, :Float64))
    @eval begin
        function K(k::$elty)
            return ccall(($(string(fC)), libfasttransforms), $elty, (Cint, $elty), '1', k)
        end
        function E(k::$elty)
            return ccall(($(string(fC)), libfasttransforms), $elty, (Cint, $elty), '2', k)
        end
    end
end

# Single-bit flags passed as the last argument of the Jacobian elliptic C routine,
# one per output (sn, cn, dn). Several outputs are requested by OR-ing the flags.
const SN = UInt(1)
const CN = UInt(2)
const DN = UInt(4)

# Jacobian elliptic functions. Every method calls the C routine with output slots
# (sn, cn, dn): a `Ref` for each requested value and `C_NULL` for the others.
# The two-output methods must combine their flags with `|`; `&` of two distinct
# single-bit flags is 0, which requests nothing and leaves the `Ref`s unset.
for (fC, elty) in ((:ft_jacobian_elliptic_functionsf, :Float32), (:ft_jacobian_elliptic_functions, :Float64))
    cname = string(fC)
    # literal argument-type tuple spliced into each ccall below
    sig = :(($elty, $elty, Ptr{$elty}, Ptr{$elty}, Ptr{$elty}, UInt))
    @eval begin
        function sn(x::$elty, k::$elty)
            retsn = Ref{$elty}()
            ccall(($cname, libfasttransforms), Cvoid, $sig, x, k, retsn, C_NULL, C_NULL, SN)
            retsn[]
        end
        function cn(x::$elty, k::$elty)
            retcn = Ref{$elty}()
            ccall(($cname, libfasttransforms), Cvoid, $sig, x, k, C_NULL, retcn, C_NULL, CN)
            retcn[]
        end
        function dn(x::$elty, k::$elty)
            retdn = Ref{$elty}()
            ccall(($cname, libfasttransforms), Cvoid, $sig, x, k, C_NULL, C_NULL, retdn, DN)
            retdn[]
        end
        function ns(x::$elty, k::$elty)
            retsn = Ref{$elty}()
            ccall(($cname, libfasttransforms), Cvoid, $sig, x, k, retsn, C_NULL, C_NULL, SN)
            inv(retsn[])
        end
        function nc(x::$elty, k::$elty)
            retcn = Ref{$elty}()
            ccall(($cname, libfasttransforms), Cvoid, $sig, x, k, C_NULL, retcn, C_NULL, CN)
            inv(retcn[])
        end
        function nd(x::$elty, k::$elty)
            retdn = Ref{$elty}()
            ccall(($cname, libfasttransforms), Cvoid, $sig, x, k, C_NULL, C_NULL, retdn, DN)
            inv(retdn[])
        end
        function sc(x::$elty, k::$elty)
            retsn = Ref{$elty}()
            retcn = Ref{$elty}()
            ccall(($cname, libfasttransforms), Cvoid, $sig, x, k, retsn, retcn, C_NULL, SN | CN)
            retsn[]/retcn[]
        end
        function cs(x::$elty, k::$elty)
            retsn = Ref{$elty}()
            retcn = Ref{$elty}()
            ccall(($cname, libfasttransforms), Cvoid, $sig, x, k, retsn, retcn, C_NULL, SN | CN)
            retcn[]/retsn[]
        end
        function sd(x::$elty, k::$elty)
            retsn = Ref{$elty}()
            retdn = Ref{$elty}()
            ccall(($cname, libfasttransforms), Cvoid, $sig, x, k, retsn, C_NULL, retdn, SN | DN)
            retsn[]/retdn[]
        end
        function ds(x::$elty, k::$elty)
            retsn = Ref{$elty}()
            retdn = Ref{$elty}()
            ccall(($cname, libfasttransforms), Cvoid, $sig, x, k, retsn, C_NULL, retdn, SN | DN)
            retdn[]/retsn[]
        end
        function cd(x::$elty, k::$elty)
            retcn = Ref{$elty}()
            retdn = Ref{$elty}()
            ccall(($cname, libfasttransforms), Cvoid, $sig, x, k, C_NULL, retcn, retdn, CN | DN)
            retcn[]/retdn[]
        end
        function dc(x::$elty, k::$elty)
            retcn = Ref{$elty}()
            retdn = Ref{$elty}()
            ccall(($cname, libfasttransforms), Cvoid, $sig, x, k, C_NULL, retcn, retdn, CN | DN)
            retdn[]/retcn[]
        end
    end
end

end # module


================================================
FILE: src/fejer.jl
================================================
# In-place FFTW REDFT01 r2r plan used by `fejerweights1!`.
function plan_fejer1(μ)
    return FFTW.plan_r2r!(μ, FFTW.REDFT01)
end

"""
    fejernodes1(T, N)

Return the nodes of Fejér's first quadrature rule with `N` points, computed as
`chebyshevpoints(T, N, Val(1))`.
"""
function fejernodes1(::Type{T}, N::Int) where T
    return chebyshevpoints(T, N, Val(1))
end

"""
    fejerweights1(μ)
    fejerweights1!(μ)
    fejerweights1!(μ, plan)

Compute weights of Fejér's first quadrature rule with modified Chebyshev moments of the first kind ``\\mu``.

`fejerweights1` works on a copy of `μ`; the `!` methods scale `μ` in place by
`1/length(μ)` and return `plan*μ` (by default `plan = plan_fejer1(μ)`).
"""
function fejerweights1(μ::Vector)
    return fejerweights1!(copy(μ))
end
function fejerweights1!(μ::Vector)
    return fejerweights1!(μ, plan_fejer1(μ))
end
function fejerweights1!(μ::Vector{T}, plan) where T
    scale = inv(T(length(μ)))
    rmul!(μ, scale)
    return plan*μ
end


# In-place FFTW RODFT00 r2r plan used by `fejerweights2!`.
function plan_fejer2(μ)
    return FFTW.plan_r2r!(μ, FFTW.RODFT00)
end

"""
    fejernodes2(T, N)

Return the `N` nodes of Fejér's second quadrature rule,
`sinpi((N-2k-1)/(2N+2))` for `k = 0, …, N-1`, as a `Vector{T}`.
"""
function fejernodes2(::Type{T}, N::Int) where T
    return T[sinpi((N-2k-one(T))/(2N+two(T))) for k=0:N-1]
end

"""
    fejerweights2(μ)
    fejerweights2!(μ)
    fejerweights2!(μ, plan)

Compute weights of Fejér's second quadrature rule with modified Chebyshev moments of the second kind ``\\mu``.

`fejerweights2` works on a copy of `μ`; the `!` methods overwrite `μ` and return it.
`plan` is applied as `plan*μ` with its return value discarded, so it must act in
place (by default `plan = plan_fejer2(μ)`).
"""
function fejerweights2(μ::Vector)
    return fejerweights2!(copy(μ))
end
function fejerweights2!(μ::Vector)
    return fejerweights2!(μ, plan_fejer2(μ))
end
function fejerweights2!(μ::Vector{T}, plan) where T
    N = length(μ)
    Np1 = N+one(T)
    rmul!(μ, inv(Np1))
    plan*μ
    # multiply entry i by sin(πi/(N+1))
    @inbounds for i in 1:N
        μ[i] *= sinpi(i/Np1)
    end
    return μ
end


================================================
FILE: src/gaunt.jl
================================================
"""
    gaunt(T, m, n, μ, ν; normalized=false)

Calculates the Gaunt coefficients, defined by:

```math
a(m,n,\\mu,\\nu,q) = \\frac{2(n+\\nu-2q)+1}{2} \\frac{(n+\\nu-2q-m-\\mu)!}{(n+\\nu-2q+m+\\mu)!} \\int_{-1}^{+1} P_n^m(x) P_\\nu^\\mu(x) P_{n+\\nu-2q}^{m+\\mu}(x) {\\rm\\,d}x.
```
or defined by:

```math
P_n^m(x) P_\\nu^\\mu(x) = \\sum_{q=0}^{q_{\\rm max}} a(m,n,\\mu,\\nu,q) P_{n+\\nu-2q}^{m+\\mu}(x)
```

With `normalized=true` the result of `normalizedgaunt` is returned directly;
otherwise that vector is multiplied by `normalization(T,m,n,μ,ν)`.

This is a Julia implementation of the stable recurrence described in:

Y.-l. Xu, Fast evaluation of Gaunt coefficients: recursive approach, *J. Comp. Appl. Math.*, **85**:53–65, 1997.
"""
function gaunt(::Type{T},m::Integer,n::Integer,μ::Integer,ν::Integer;normalized::Bool=false) where T
    normalized && return normalizedgaunt(T,m,n,μ,ν)
    scale = normalization(T,m,n,μ,ν)
    coeffs = gaunt(T,m,n,μ,ν;normalized=true)
    return lmul!(scale,coeffs)
end
"""
    gaunt(m, n, μ, ν; kwds...)

Calculates the Gaunt coefficients in 64-bit floating-point arithmetic, i.e.
`gaunt(Float64, m, n, μ, ν; kwds...)`.
"""
function gaunt(m::Integer,n::Integer,μ::Integer,ν::Integer;kwds...)
    return gaunt(Float64,m,n,μ,ν;kwds...)
end

# Int32 indices are widened to Int64 before the computation.
function gaunt(::Type{T},m::Int32,n::Int32,μ::Int32,ν::Int32;normalized::Bool=false) where T
    return gaunt(T,Int64(m),Int64(n),Int64(μ),Int64(ν);normalized=normalized)
end


# Scale factor applied by `gaunt` to the normalized coefficients: a ratio of
# Pochhammer symbols in (n, ν) times a ratio of gamma functions in (n-m, ν-μ).
function normalization(::Type{T},m::Integer,n::Integer,μ::Integer,ν::Integer) where T
    unit = one(T)
    pochratio = pochhammer(n+unit,n)*pochhammer(ν+unit,ν)/pochhammer(n+ν+unit,n+ν)
    return pochratio*gamma(n+ν-m-μ+unit)/gamma(n-m+unit)/gamma(ν-μ+unit)
end

# Float64 specialization: the factor is split into the product of
# `normalization1` (depends on n, ν) and `normalization2` (depends on n-m, ν-μ).
function normalization(::Type{Float64},m::Integer,n::Integer,μ::Integer,ν::Integer)
    return normalization1(Float64,n,ν)*normalization2(Float64,n-m,ν-μ)
end

# First factor of the Float64 `normalization`. The final branch (both n and ν below 8)
# evaluates pochhammer(n+1,n)*pochhammer(ν+1,ν)/pochhammer(n+ν+1,n+ν) directly.
# The other branches are taken when n and/or ν is at least 8 and use
# exp/log1p/stirlingseries expressions instead.
# NOTE(review): these look like asymptotic rewrites of the same ratio for large
# arguments — confirm against the reference.
function normalization1(::Type{Float64},n::Integer,ν::Integer)
    if n ≥ 8
        if ν ≥ 8
            # both indices large
            return exp((n+0.5)*log1p(n/(n+1))+(ν+0.5)*log1p(ν/(ν+1))+(n+ν+0.5)*log1p(-(n+ν)/(2n+2ν+1))+n*log1p(-2ν/(2n+2ν+1))+ν*log1p(-2n/(2n+2ν+1)))*stirlingseries(2n+1.0)*stirlingseries(2ν+1.0)*stirlingseries(n+ν+1.0)/stirlingseries(n+1.0)/stirlingseries(ν+1.0)/stirlingseries(2n+2ν+1.0)
        else
            # only n large: the ν-dependent Pochhammer symbol is kept explicit
            return pochhammer(ν+1.0,ν)/(2n+2ν+1.0)^ν*exp(ν+(n+0.5)*log1p(n/(n+1))+(n+ν+0.5)*log1p(-(n+ν)/(2n+2ν+1))+n*log1p(-2ν/(2n+2ν+1)))*stirlingseries(2n+1.0)*stirlingseries(n+ν+1.0)/stirlingseries(n+1.0)/stirlingseries(2n+2ν+1.0)
        end
    elseif ν ≥ 8
        # only ν large: the n-dependent Pochhammer symbol is kept explicit
        return pochhammer(n+1.0,n)/(2n+2ν+1.0)^n*exp(n+(ν+0.5)*log1p(ν/(ν+1))+(n+ν+0.5)*log1p(-(n+ν)/(2n+2ν+1))+ν*log1p(-2n/(2n+2ν+1)))*stirlingseries(2ν+1.0)*stirlingseries(n+ν+1.0)/stirlingseries(ν+1.0)/stirlingseries(2n+2ν+1.0)
    else
        # both indices small: direct evaluation
        return pochhammer(n+1.0,n)*pochhammer(ν+1.0,ν)/pochhammer(n+ν+1.0,n+ν)
    end
end

# Second factor of the Float64 `normalization`, called with nm = n-m and νμ = ν-μ.
# The final branch (both arguments below 8) evaluates
# gamma(nm+νμ+1)/gamma(nm+1)/gamma(νμ+1) directly. The other branches are taken when
# nm and/or νμ is at least 8 and use exp/log1p/stirlingseries expressions instead.
# NOTE(review): these look like asymptotic rewrites of the same ratio for large
# arguments — confirm against the reference.
function normalization2(::Type{Float64},nm::Integer,νμ::Integer)
    if nm ≥ 8
        if νμ ≥ 8
            # both arguments large
            return edivsqrt2pi*exp((nm+0.5)*log1p(νμ/(nm+1))+(νμ+0.5)*log1p(nm/(νμ+1)))/sqrt(nm+νμ+1.0)*stirlingseries(nm+νμ+1.0)/stirlingseries(nm+1.0)/stirlingseries(νμ+1.0)
        else
            # only nm large: gamma(νμ+1) is kept explicit
            return (nm+νμ+1.0)^νμ*exp(-νμ+(nm+0.5)*log1p(νμ/(nm+1)))*stirlingseries(nm+νμ+1.0)/stirlingseries(nm+1.0)/gamma(νμ+1.0)
        end
    elseif νμ ≥ 8
        # only νμ large: gamma(nm+1) is kept explicit
        return (nm+νμ+1.0)^nm*exp(-nm+(νμ+0.5)*log1p(nm/(νμ+1)))*stirlingseries(nm+νμ+1.0)/stirlingseries(νμ+1.0)/gamma(nm+1.0)
    else
        # both arguments small: direct evaluation
        return gamma(nm+νμ+1.0)/gamma(nm+1.0)/gamma(νμ+1.0)
    end
end

# Compute the normalized Gaunt coefficients as a Vector{T} of length qmax+1, where
# entry q+1 holds the coefficient with index q and the first entry is fixed to one.
# Later entries are obtained by recurrence in q, with p = n+ν-2q, p₁ = p-m-μ and
# p₂ = p+m+μ. Which recurrence is used depends on whether the helper `A` vanishes.
function normalizedgaunt(::Type{T},m::Integer,n::Integer,μ::Integer,ν::Integer) where T
    qmax = min(n,ν,(n+ν-abs(m+μ))÷2)
    a = Vector{T}(undef, qmax+1)
    a[1] = one(T)
    if μ == m && ν == n # zero class (i) of Aₚ
        if μ == m == 0
            # two-term recurrence using only the α ratio
            for q = 1:qmax
                p = n+ν-2q
                a[q+1] = α(T,n,ν,p+2)/α(T,n,ν,p+1)*a[q]
            end
        else
            # two-term recurrence with the additional p, p₁, p₂ factors
            for q = 1:qmax
                p = n+ν-2q
                p₁,p₂ = p-m-μ,p+m+μ
                a[q+1] = (p+1)*(p₂+2)*α(T,n,ν,p+2)/(p+2)/(p₁+1)/α(T,n,ν,p+1)*a[q]
            end
        end
    else
        # general case: a[2] (and possibly a[3]) come from closed-form initial conditions
        qmax > 0 && (a[2] = secondinitialcondition(T,m,n,μ,ν))
        q = 2
        if qmax > 1
            p = n+ν-2q
            p₁,p₂ = p-m-μ,p+m+μ
            if A(T,m,n,μ,ν,p+4) != 0
                # three-term recurrence (coefficients c₀, c₁, c₂)
                a[q+1] = (c₁(T,m,n,μ,ν,p,p₁,p₂)*a[q] + c₂(T,m,n,μ,ν,p,p₂)*a[q-1])/c₀(T,m,n,μ,ν,p,p₁)
            else
                # c₀ would vanish here, so use the closed form for the third coefficient
                a[q+1] = thirdinitialcondition(T,m,n,μ,ν)
            end
            q+=1
        end
        while q ≤ qmax
            p = n+ν-2q
            p₁,p₂ = p-m-μ,p+m+μ
            if A(T,m,n,μ,ν,p+4) != 0
                # three-term recurrence (coefficients c₀, c₁, c₂)
                a[q+1] = (c₁(T,m,n,μ,ν,p,p₁,p₂)*a[q] + c₂(T,m,n,μ,ν,p,p₂)*a[q-1])/c₀(T,m,n,μ,ν,p,p₁)
            elseif A(T,m,n,μ,ν,p+6) != 0
                # four-term recurrence (coefficients d₀, d₁, d₂, d₃)
                a[q+1] = (d₁(T,m,n,μ,ν,p,p₁,p₂)*a[q] + d₂(T,m,n,μ,ν,p,p₁,p₂)*a[q-1] + d₃(T,m,n,μ,ν,p,p₂)*a[q-2])/d₀(T,m,n,μ,ν,p,p₁)
            else
                # both A values vanish: fall back to the two-term recurrence
                a[q+1] = (p+1)*(p₂+2)*α(T,n,ν,p+2)/(p+2)/(p₁+1)/α(T,n,ν,p+1)*a[q]
            end
            q+=1
        end
    end
    a
end

# Closed-form value used by `normalizedgaunt` for its second coefficient (a[2]).
function secondinitialcondition(::Type{T},m::Integer,n::Integer,μ::Integer,ν::Integer) where T
    n₄ = n+ν-m-μ
    mn, μν = m-n, μ-ν
    t = 2n+2ν-one(T)
    bracket = mn*(mn+one(T))/(2n-1)+μν*(μν+one(T))/(2ν-1)
    return (t-2)/2*(1-t/n₄/(n₄-1)*bracket)
end

# Closed-form value used by `normalizedgaunt` for its third coefficient (a[3])
# when the three-term recurrence cannot be applied.
function thirdinitialcondition(::Type{T},m::Integer,n::Integer,μ::Integer,ν::Integer) where T
    n₄ = n+ν-m-μ
    mn, μν = m-n, μ-ν
    t = 2n+2ν-one(T)
    quartic = mn*(mn+one(T))*(mn+2)*(mn+3)/(2n-1)/(2n-3) + 2mn*(mn+one(T))*μν*(μν+one(T))/(2n-1)/(2ν-1) + μν*(μν+one(T))*(μν+2)*(μν+3)/(2ν-1)/(2ν-3)
    inner = (t-4)/(2(n₄-2)*(n₄-3))*quartic - mn*(mn+one(T))/(2n-1)-μν*(μν+one(T))/(2ν-1)
    return t*(t-6)/4*( (t-2)/n₄/(n₄-1)*inner + one(T)/2 )
end

# α(n,ν,p) = (p² - (n+ν+1)²)(p² - (n-ν)²) / (4p² - 1), evaluated in type T.
function α(::Type{T},n::Integer,ν::Integer,p::Integer) where T
    psq = p^2
    numerator = (psq-(n+ν+1)^2)*(psq-(n-ν)^2)
    return numerator/(4psq-one(T))
end
# A(m,n,μ,ν,p) = p(p-1)(m-μ) - (m+μ)(n-ν)(n+ν+1), evaluated in type T.
function A(::Type{T},m::Integer,n::Integer,μ::Integer,ν::Integer,p::Integer) where T
    quadratic = p*(p-one(T))*(m-μ)
    offset = (m+μ)*(n-ν)*(n+ν+one(T))
    return quadratic-offset
end

# Coefficients of the three-term recurrence used in `normalizedgaunt`:
#     a[q+1] = (c₁*a[q] + c₂*a[q-1]) / c₀
# Each is a product of integer factors in p, p₁, p₂ with values of `A` and `α`.
# c₀ carries the factor A(…,p+4), which is why the caller only uses this recurrence
# when that value is nonzero.
c₀(::Type{T},m::Integer,n::Integer,μ::Integer,ν::Integer,p::Integer,p₁::Integer)  where T =
    (p+2)*(p+3)*(p₁+1)*(p₁+2)*A(T,m,n,μ,ν,p+4)*α(T,n,ν,p+1)
c₁(::Type{T},m::Integer,n::Integer,μ::Integer,ν::Integer,p::Integer,p₁::Integer,p₂::Integer)  where T =
    A(T,m,n,μ,ν,p+2)*A(T,m,n,μ,ν,p+3)*A(T,m,n,μ,ν,p+4) + (p+1)*(p+3)*(p₁+2)*(p₂+2)*A(T,m,n,μ,ν,p+4)*α(T,n,ν,p+2) + (p+2)*(p+4)*(p₁+3)*(p₂+3)*A(T,m,n,μ,ν,p+2)*α(T,n,ν,p+3)
c₂(::Type{T},m::Integer,n::Integer,μ::Integer,ν::Integer,p::Integer,p₂::Integer)  where T =
    -(p+2)*(p+3)*(p₂+3)*(p₂+4)*A(T,m,n,μ,ν,p+2)*α(T,n,ν,p+4)

# Coefficients of the four-term recurrence used in `normalizedgaunt`:
#     a[q+1] = (d₁*a[q] + d₂*a[q-1] + d₃*a[q-2]) / d₀
# d₀ carries the factor A(…,p+6), which is why the caller only uses this recurrence
# when that value is nonzero.
d₀(::Type{T},m::Integer,n::Integer,μ::Integer,ν::Integer,p::Integer,p₁::Integer)  where T =
    (p+2)*(p+3)*(p+5)*(p₁+2)*(p₁+4)*A(T,m,n,μ,ν,p+6)*α(T,n,ν,p+1)
d₁(::Type{T},m::Integer,n::Integer,μ::Integer,ν::Integer,p::Integer,p₁::Integer,p₂::Integer) where T =
    (p+5)*(p₁+4)*A(T,m,n,μ,ν,p+6)*( A(T,m,n,μ,ν,p+2)*A(T,m,n,μ,ν,p+3) + (p+1)*(p+3)*(p₁+2)*(p₂+2)*α(T,n,ν,p+2) )
d₂(::Type{T},m::Integer,n::Integer,μ::Integer,ν::Integer,p::Integer,p₁::Integer,p₂::Integer) where T =
    (p+2)*(p₂+3)*A(T,m,n,μ,ν,p+2)*( A(T,m,n,μ,ν,p+5)*A(T,m,n,μ,ν,p+6) + (p+4)*(p+6)*(p₁+5)*(p₂+5)*α(T,n,ν,p+5) )
d₃(::Type{T},m::Integer,n::Integer,μ::Integer,ν::Integer,p::Integer,p₂::Integer) where T =
    -(p+2)*(p+4)*(p+5)*(p₂+3)*(p₂+5)*(p₂+6)*A(T,m,n,μ,ν,p+2)*α(T,n,ν,p+6)


================================================
FILE: src/hermite.jl
================================================
# exp(-x^2/2) H_n(x) / sqrt(π*prod(1:n))

# Plan for the forward weighted Hermite transform; applying it to a vector is a
# dense matrix-vector product with `Vtw`.
struct ForwardWeightedHermitePlan{T}
    Vtw::Matrix{T} # adjoint of the Vandermonde matrix times the diagonal of scaled quadrature weights
end

# Plan for the inverse weighted Hermite transform; applying it to a vector is a
# dense matrix-vector product with `V`.
struct BackwardWeightedHermitePlan{T}
    V::Matrix{T} # vandermonde
end

# Build the n×n Float64 Vandermonde-type matrix whose row k holds
# `hermpoly_rec(0:n-1, sqrt(2)*x[k])` for the nodes `x` returned by
# `unweightedgausshermite(n)`. Returns the matrix together with the weights.
function _weightedhermite_vandermonde(n)
    nodes, weights = unweightedgausshermite(n)
    vander = Array{Float64}(undef, n, n)
    degrees = 0:n-1
    for row in 1:n
        vander[row,:] = FastGaussQuadrature.hermpoly_rec(degrees, sqrt(2)*nodes[row])
    end
    return vander, weights
end

# Forward plan of size n: V' * Diagonal(w/√π), with V and w from
# `_weightedhermite_vandermonde(n)`.
function ForwardWeightedHermitePlan(n::Integer)
    vander, weights = _weightedhermite_vandermonde(n)
    scaled = Diagonal(weights / sqrt(π))
    return ForwardWeightedHermitePlan(vander' * scaled)
end

# Backward plan of size n: stores the Vandermonde matrix itself (weights are unused).
function BackwardWeightedHermitePlan(n::Integer)
    vander = _weightedhermite_vandermonde(n)[1]
    return BackwardWeightedHermitePlan(vander)
end

# Applying either plan is a matrix-vector product with the stored matrix.
function *(P::ForwardWeightedHermitePlan, v::AbstractVector)
    return P.Vtw*v
end
function *(P::BackwardWeightedHermitePlan, v::AbstractVector)
    return P.V*v
end

# One-shot transforms: build a plan sized to `v` and apply it.
function weightedhermitetransform(v)
    plan = ForwardWeightedHermitePlan(length(v))
    return plan*v
end
function iweightedhermitetransform(v)
    plan = BackwardWeightedHermitePlan(length(v))
    return plan*v
end


================================================
FILE: src/inufft.jl
================================================
"""
Pre-computes an inverse nonuniform fast Fourier transform of type `N`.

For best performance, choose the right number of threads by `FFTW.set_num_threads(4)`, for example.

# Fields
- `pt`: a nonuniform FFT plan applied before or after the linear solve.
- `TP`: factorized Toeplitz matrix of the system solved by conjugate gradients.
- `r`, `p`, `Ap`: work vectors handed to `cg_for_inufft`.
- `ϵ`: accuracy parameter; the solver tolerance is `100ϵ`.
"""
struct iNUFFTPlan{N,T,S,PT,TF} <: Plan{T}
    pt::PT
    TP::TF
    r::Vector{T}
    p::Vector{T}
    Ap::Vector{T}
    ϵ::S
end

"""
    plan_inufft1(ω, ϵ)

Pre-computes an inverse nonuniform fast Fourier transform of type I.

The Toeplitz system matrix is built from a type-I NUFFT of a vector of ones (first
column) and its conjugate (first row), and factorized once; three zeroed work
vectors are allocated for the conjugate gradient solver.
"""
function plan_inufft1(ω::AbstractVector{T}, ϵ::T) where T<:AbstractFloat
    N = length(ω)
    forward = plan_nufft1(ω, ϵ)
    pt = plan_nufft2(ω/N, ϵ)
    col = forward*ones(Complex{T}, N)
    row = conj(col)
    # first column and first row share the (1,1) entry: use their average for both
    corner = (row[1]+col[1])/2
    row[1] = corner
    col[1] = corner
    TP = factorize(Toeplitz(col, row))
    resid = zero(col)
    dir = zero(col)
    Adir = zero(col)

    return iNUFFTPlan{1, eltype(TP), typeof(ϵ), typeof(pt), typeof(TP)}(pt, TP, resid, dir, Adir, ϵ)
end

"""
    plan_inufft2(x, ϵ)

Pre-computes an inverse nonuniform fast Fourier transform of type II.

The Toeplitz system matrix is built from a type-I NUFFT (at frequencies `N*x`) of a
vector of ones (first row) and its conjugate (first column), and factorized once;
three zeroed work vectors are allocated for the conjugate gradient solver.
"""
function plan_inufft2(x::AbstractVector{T}, ϵ::T) where T<:AbstractFloat
    N = length(x)
    pt = plan_nufft1(N*x, ϵ)
    row = pt*ones(Complex{T}, N)
    col = conj(row)
    # first column and first row share the (1,1) entry: use their average for both
    corner = (row[1]+col[1])/2
    row[1] = corner
    col[1] = corner
    TP = factorize(Toeplitz(col, row))
    resid = zero(col)
    dir = zero(col)
    Adir = zero(col)

    return iNUFFTPlan{2, eltype(TP), typeof(ϵ), typeof(pt), typeof(TP)}(pt, TP, resid, dir, Adir, ϵ)
end


# Out-of-place application: allocate a zeroed output of the promoted element type
# and defer to `mul!`.
function (*)(p::iNUFFTPlan{N,T}, x::AbstractVector{V}) where {N,T,V}
    out = zeros(promote_type(T,V), length(x))
    return mul!(out, p, x)
end

# Type-I inverse: solve the Toeplitz system for `f` by conjugate gradients (at most
# 50 iterations, tolerance 100ϵ) into `c`, then apply `pt` to the conjugated
# solution and conjugate the result.
function mul!(c::AbstractVector{T}, P::iNUFFTPlan{1,T}, f::AbstractVector{T}) where T
    cg_for_inufft(P.TP, c, f, P.r, P.p, P.Ap, 50, 100*P.ϵ)
    transformed = mul!(c, P.pt, conj!(c))
    return conj!(transformed)
end


# Type-II inverse: form the right-hand side conj(pt*conj(f)) and solve the Toeplitz
# system by conjugate gradients (at most 50 iterations, tolerance 100ϵ) into `c`.
# `f` is conjugated in place while the right-hand side is built and restored afterwards.
function mul!(c::AbstractVector{T}, P::iNUFFTPlan{2,T}, f::AbstractVector{T}) where T
    rhs = conj!(P.pt*conj!(f))
    cg_for_inufft(P.TP, c, rhs, P.r, P.p, P.Ap, 50, 100*P.ϵ)
    conj!(f)
    return c
end

"""
    inufft1(c, ω, ϵ)

Computes an inverse nonuniform fast Fourier transform of type I by building
`plan_inufft1(ω, ϵ)` and applying it to `c`.
"""
function inufft1(c::AbstractVector, ω::AbstractVector{T}, ϵ::T) where {T<:AbstractFloat}
    plan = plan_inufft1(ω, ϵ)
    return plan*c
end

"""
    inufft2(c, x, ϵ)

Computes an inverse nonuniform fast Fourier transform of type II by building
`plan_inufft2(x, ϵ)` and applying it to `c`.
"""
function inufft2(c::AbstractVector, x::AbstractVector{T}, ϵ::T) where {T<:AbstractFloat}
    plan = plan_inufft2(x, ϵ)
    return plan*c
end

# Conjugate gradient solve of A*x = b for a factorized Toeplitz matrix `A`.
#
# - `x` receives the solution; it is initialised with `b` as the starting guess.
# - `r`, `p`, `Ap` are caller-provided work vectors (residual, search direction, A*p).
# - Iteration stops after `max_it` steps or once the residual norm drops to
#   `tol` times the norm of `b` (with norm(b) replaced by one when `b` is zero).
#
# Returns `x`.
function cg_for_inufft(A::ToeplitzMatrices.ToeplitzFactorization{T}, x::AbstractVector{T}, b::AbstractVector{T}, r::AbstractVector{T}, p::AbstractVector{T}, Ap::AbstractVector{T}, max_it::Integer, tol::Real) where T
    n = length(b)
    nrmb = norm(b)
    if nrmb == 0 nrmb = one(typeof(nrmb)) end
    copyto!(x, b)
    fill!(r, zero(T))
    fill!(p, zero(T))
    fill!(Ap, zero(T))
    # r = b - A*x
    copyto!(r, b)
    mul!(r, A, x, -one(T), one(T))
    copyto!(p, r)
    nrm2 = r⋅r
    # The starting guess is already exact (e.g. b == 0): stepping would compute
    # α = 0/0 and fill x with NaN, so return it as is.
    iszero(nrm2) && return x
    for k = 1:max_it
        # Ap = A*p
        mul!(Ap, A, p)
        α = nrm2/(p⋅Ap)
        @inbounds @simd for l = 1:n
            x[l] += α*p[l]
            r[l] -= α*Ap[l]
        end
        nrm2new = r⋅r
        cst = nrm2new/nrm2
        # new direction: p = r + cst*p
        @inbounds @simd for l = 1:n
            p[l] = muladd(cst, p[l], r[l])
        end
        nrm2 = nrm2new
        if sqrt(abs(nrm2)) ≤ tol*nrmb break end
    end
    return x
end


================================================
FILE: src/libfasttransforms.jl
================================================
# Select the shared library. When the environment variable FT_BUILD_FROM_SOURCE is
# "true", look for `libfasttransforms` in the package's `deps` directory and fail
# with an instructive error if it is not found; otherwise load FastTransforms_jll.
if get(ENV, "FT_BUILD_FROM_SOURCE", "false") == "true"
    using Libdl
    const libfasttransforms = find_library("libfasttransforms", [joinpath(dirname(@__DIR__), "deps")])
    # the guard covers both a `nothing` and an empty result
    if libfasttransforms ≡ nothing || length(libfasttransforms) == 0
        error("FastTransforms is not properly installed. Please run Pkg.build(\"FastTransforms\") ",
              "and restart Julia.")
    end
else
    using FastTransforms_jll
end

# Thin wrappers around the C library's thread-count setters; `n` is passed as a Cint.
function ft_set_num_threads(n::Integer)
    return ccall((:ft_set_num_threads, libfasttransforms), Cvoid, (Cint, ), n)
end
function ft_fftw_plan_with_nthreads(n::Integer)
    return ccall((:ft_fftw_plan_with_nthreads, libfasttransforms), Cvoid, (Cint, ), n)
end

# Module initialisation: use half of the CPU threads (rounded up) for the C library,
# initialise its FFTW threading, and plan FFTW transforms with the same thread count.
function __init__()
    nthreads = cld(Sys.CPU_THREADS, 2)
    ft_set_num_threads(nthreads)
    ccall((:ft_fftw_init_threads, libfasttransforms), Cint, ())
    return ft_fftw_plan_with_nthreads(nthreads)
end


"""
    mpfr_t <: AbstractFloat

A Julia struct that exactly matches `mpfr_t`.

The fields are, in order, the precision, the sign, the exponent and a pointer to
the limbs.
"""
struct mpfr_t <: AbstractFloat
    prec::Clong
    sign::Cint
    exp::Clong
    d::Ptr{Limb}
end

"""
    renew!(x::AbstractArray{BigFloat})

Replace every entry of `x` by a `deepcopy` of itself and return `x`.

`BigFloat` is a mutable struct and there is no guarantee that each entry in an
`AbstractArray{BigFloat}` is unique. For example, looking at the `Limb`s,

    Id = Matrix{BigFloat}(I, 3, 3)
    map(x->x.d, Id)

shows that the ones and the zeros all share the same pointers. If a C function
assumes unicity of each datum, then the array must be renewed with a `deepcopy`.
"""
function renew!(x::AbstractArray{BigFloat})
    map!(deepcopy, x, x)
    return x
end

# Evaluate the polynomial with coefficients `c` at the points `x` using the C
# library's `ft_horner`, writing the values into `f` and returning `f`.
# `c` may be strided; its length and first-dimension stride are passed along.
# Throws an `ArgumentError` when `x` and `f` differ in length. The check is an
# explicit `throw` rather than an `@assert` because the C routine writes one value
# per point into `f` and must never run with mismatched lengths.
function horner!(f::Vector{Float64}, c::StridedVector{Float64}, x::Vector{Float64})
    length(x) == length(f) || throw(ArgumentError("Dimensions must match"))
    ccall((:ft_horner, libfasttransforms), Cvoid, (Cint, Ptr{Float64}, Cint, Cint, Ptr{Float64}, Ptr{Float64}), length(c), c, stride(c, 1), length(x), x, f)
    f
end

# Single-precision variant: evaluate the polynomial with coefficients `c` at the
# points `x` using the C library's `ft_hornerf`, writing the values into `f` and
# returning `f`.
# Throws an `ArgumentError` when `x` and `f` differ in length. The check is an
# explicit `throw` rather than an `@assert` because the C routine writes one value
# per point into `f` and must never run with mismatched lengths.
function horner!(f::Vector{Float32}, c::StridedVector{Float32}, x::Vector{Float32})
    length(x) == length(f) || throw(ArgumentError("Dimensions must match"))
    ccall((:ft_hornerf, libfasttransforms), Cvoid, (Cint, Ptr{Float32}, Cint, Cint, Ptr{Float32}, Ptr{Float32}), length(c), c, stride(c, 1), length(x), x, f)
    f
end

# Return `true` when `x`, `ϕ₀` and `f` all have the same length; otherwise throw
# an `ArgumentError`.
function check_clenshaw_points(x, ϕ₀, f)
    if !(length(x) == length(ϕ₀) == length(f))
        throw(ArgumentError("Dimensions must match"))
    end
    return true
end

# Return `true` when `x` and `f` have the same length; otherwise throw an
# `ArgumentError`.
function check_clenshaw_points(x, f)
    if length(x) != length(f)
        throw(ArgumentError("Dimensions must match"))
    end
    return true
end

# Call the C library's `ft_clenshaw` with coefficients `c` (possibly strided) and
# points `x`, writing the results into `f` and returning `f`.
# The length check on `x` and `f` is wrapped in `@boundscheck`.
function clenshaw!(f::Vector{Float64}, c::StridedVector{Float64}, x::Vector{Float64})
    @boundscheck check_clenshaw_points(x, f)
    ncoeffs, cstride, npoints = length(c), stride(c, 1), length(x)
    ccall((:ft_clenshaw, libfasttransforms), Cvoid,
          (Cint, Ptr{Float64}, Cint, Cint, Ptr{Float64}, Ptr{Float64}),
          ncoeffs, c, cstride, npoints, x, f)
    return f
end

# Single-precision variant: call the C library's `ft_clenshawf` with coefficients
# `c` (possibly strided) and points `x`, writing the results into `f` and returning `f`.
# The length check on `x` and `f` is wrapped in `@boundscheck`.
function clenshaw!(f::Vector{Float32}, c::StridedVector{Float32}, x::Vector{Float32})
    @boundscheck check_clenshaw_points(x, f)
    ncoeffs, cstride, npoints = length(c), stride(c, 1), length(x)
    ccall((:ft_clenshawf, libfasttransforms), Cvoid,
          (Cint, Ptr{Float32}, Cint, Cint, Ptr{Float32}, Ptr{Float32}),
          ncoeffs, c, cstride, npoints, x, f)
    return f
end

# Call the C library's `ft_orthogonal_polynomial_clenshaw` with coefficients `c`
# (possibly strided), recurrence vectors `A`, `B`, `C`, points `x` and the values
# `ϕ₀`, writing the results into `f` and returning `f`.
# The recurrence-length and point-length checks are wrapped in `@boundscheck`.
function clenshaw!(f::Vector{Float64}, c::StridedVector{Float64}, A::Vector{Float64}, B::Vector{Float64}, C::Vector{Float64}, x::Vector{Float64}, ϕ₀::Vector{Float64})
    N = length(c)
    @boundscheck check_clenshaw_recurrences(N, A, B, C)
    @boundscheck check_clenshaw_points(x, ϕ₀, f)
    cstride, npoints = stride(c, 1), length(x)
    ccall((:ft_orthogonal_polynomial_clenshaw, libfasttransforms), Cvoid,
          (Cint, Ptr{Float64}, Cint, Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Cint, Ptr{Float64}, Ptr{Float64}, Ptr{Float64}),
          N, c, cstride, A, B, C, npoints, x, ϕ₀, f)
    return f
end

# Single-precision variant: call the C library's `ft_orthogonal_polynomial_clenshawf`
# with coefficients `c` (possibly strided), recurrence vectors `A`, `B`, `C`, points
# `x` and the values `ϕ₀`, writing the results into `f` and returning `f`.
# The recurrence-length and point-length checks are wrapped in `@boundscheck`.
function clenshaw!(f::Vector{Float32}, c::StridedVector{Float32}, A::Vector{Float32}, B::Vector{Float32}, C::Vector{Float32}, x::Vector{Float32}, ϕ₀::Vector{Float32})
    N = length(c)
    @boundscheck check_clenshaw_recurrences(N, A, B, C)
    @boundscheck check_clenshaw_points(x, ϕ₀, f)
    cstride, npoints = stride(c, 1), length(x)
    ccall((:ft_orthogonal_polynomial_clenshawf, libfasttransforms), Cvoid,
          (Cint, Ptr{Float32}, Cint, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Cint, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}),
          N, c, cstride, A, B, C, npoints, x, ϕ₀, f)
    return f
end

# Identifiers for the kinds of plan, stored as Cint. Values are consecutive,
# starting at 0 with LEG2CHEB, so the order of the entries is significant.
# NOTE(review): presumably this order mirrors an enum in the C library — confirm
# before inserting or reordering entries.
@enum Transforms::Cint begin
    LEG2CHEB=0
    CHEB2LEG
    ULTRA2ULTRA
    JAC2JAC
    LAG2LAG
    JAC2ULTRA
    ULTRA2JAC
    JAC2CHEB
    CHEB2JAC
    ULTRA2CHEB
    CHEB2ULTRA
    ASSOCIATEDJAC2JAC
    MODIFIEDJAC2JAC
    MODIFIEDLAG2LAG
    MODIFIEDHERM2HERM
    SPHERE
    SPHEREV
    DISK
    ANNULUS
    RECTDISK
    TRIANGLE
    TETRAHEDRON
    SPINSPHERE
    SPHERESYNTHESIS
    SPHEREANALYSIS
    SPHEREVSYNTHESIS
    SPHEREVANALYSIS
    DISKSYNTHESIS
    DISKANALYSIS
    ANNULUSSYNTHESIS
    ANNULUSANALYSIS
    RECTDISKSYNTHESIS
    RECTDISKANALYSIS
    TRIANGLESYNTHESIS
    TRIANGLEANALYSIS
    TETRAHEDRONSYNTHESIS
    TETRAHEDRONANALYSIS
    SPINSPHERESYNTHESIS
    SPINSPHEREANALYSIS
    SPHERICALISOMETRY
end

# Identity conversion, so `Transforms(k)` accepts both integers and enum values.
function Transforms(t::Transforms)
    return t
end

# `kind2string(k)` returns the human-readable name of a transform kind, given either
# as a `Transforms` value or as an integer. The lookup table lives in a `let` block
# so that it is visible only to `kind2string`.
let k2s = Dict(LEG2CHEB             => "Legendre--Chebyshev",
               CHEB2LEG             => "Chebyshev--Legendre",
               ULTRA2ULTRA          => "ultraspherical--ultraspherical",
               JAC2JAC              => "Jacobi--Jacobi",
               LAG2LAG              => "Laguerre--Laguerre",
               JAC2ULTRA            => "Jacobi--ultraspherical",
               ULTRA2JAC            => "ultraspherical--Jacobi",
               JAC2CHEB             => "Jacobi--Chebyshev",
               CHEB2JAC             => "Chebyshev--Jacobi",
               ULTRA2CHEB           => "ultraspherical--Chebyshev",
               CHEB2ULTRA           => "Chebyshev--ultraspherical",
               ASSOCIATEDJAC2JAC    => "Associated Jacobi--Jacobi",
               MODIFIEDJAC2JAC      => "Modified Jacobi--Jacobi",
               MODIFIEDLAG2LAG      => "Modified Laguerre--Laguerre",
               MODIFIEDHERM2HERM    => "Modified Hermite--Hermite",
               SPHERE               => "Spherical harmonic--Fourier",
               SPHEREV              => "Spherical vector field--Fourier",
               DISK                 => "Zernike--Chebyshev×Fourier",
               ANNULUS              => "Annulus--Chebyshev×Fourier",
               RECTDISK             => "Dunkl-Xu--Chebyshev²",
               TRIANGLE             => "Proriol--Chebyshev²",
               TETRAHEDRON          => "Proriol--Chebyshev³",
               SPINSPHERE           => "Spin-weighted spherical harmonic--Fourier",
               SPHERESYNTHESIS      => "FFTW Fourier synthesis on the sphere",
               SPHEREANALYSIS       => "FFTW Fourier analysis on the sphere",
               SPHEREVSYNTHESIS     => "FFTW Fourier synthesis on the sphere (vector field)",
               SPHEREVANALYSIS      => "FFTW Fourier analysis on the sphere (vector field)",
               DISKSYNTHESIS        => "FFTW Chebyshev×Fourier synthesis on the disk",
               DISKANALYSIS         => "FFTW Chebyshev×Fourier analysis on the disk",
               ANNULUSSYNTHESIS     => "FFTW Chebyshev×Fourier synthesis on the annulus",
               ANNULUSANALYSIS      => "FFTW Chebyshev×Fourier analysis on the annulus",
               RECTDISKSYNTHESIS    => "FFTW Chebyshev synthesis on the rectangularized disk",
               RECTDISKANALYSIS     => "FFTW Chebyshev analysis on the rectangularized disk",
               TRIANGLESYNTHESIS    => "FFTW Chebyshev synthesis on the triangle",
               TRIANGLEANALYSIS     => "FFTW Chebyshev analysis on the triangle",
               TETRAHEDRONSYNTHESIS => "FFTW Chebyshev synthesis on the tetrahedron",
               TETRAHEDRONANALYSIS  => "FFTW Chebyshev analysis on the tetrahedron",
               SPINSPHERESYNTHESIS  => "FFTW Fourier synthesis on the sphere (spin-weighted)",
               SPINSPHEREANALYSIS   => "FFTW Fourier analysis on the sphere (spin-weighted)",
               SPHERICALISOMETRY    => "Spherical isometry")
    global kind2string
    # integers are first converted to the enum, then looked up
    kind2string(k::Union{Integer, Transforms}) = k2s[Transforms(k)]
end

# Empty marker type. It is used as the pointee of `Ptr{ft_plan_struct}`, the
# pointer type returned by the planning `ccall`s and stored in `FTPlan`.
struct ft_plan_struct end

# Julia-side handle for a plan allocated by the C library.
#
# Type parameters: `T` is what `eltype` reports, `N` is what `ndims` reports, and
# `K` is the transform kind used for dispatch (`show`, `checksize`, `destroy_plan`).
# Fields: `plan` is the raw C pointer; `n`, `l` and `m` are the stored sizes.
# A size that a constructor does not receive is stored as -1.
#
# Every constructor registers `destroy_plan` as a finalizer on the new object.
mutable struct FTPlan{T, N, K}
    plan::Ptr{ft_plan_struct}
    n::Int
    l::Int
    m::Int
    # The sizes are accepted as any `Integer` (and converted to `Int` by `new`)
    # because the `plan_*` functions take `n::Integer`; requiring `Int` here would
    # throw a `MethodError` only after the C plan had already been allocated.
    function FTPlan{T, N, K}(plan::Ptr{ft_plan_struct}, n::Integer) where {T, N, K}
        # Fill the unused sizes explicitly rather than leaving them uninitialized.
        p = new(plan, n, -1, -1)
        finalizer(destroy_plan, p)
        p
    end
    function FTPlan{T, N, K}(plan::Ptr{ft_plan_struct}, n::Integer, m::Integer) where {T, N, K}
        p = new(plan, n, -1, m)
        finalizer(destroy_plan, p)
        p
    end
    function FTPlan{T, N, K}(plan::Ptr{ft_plan_struct}, n::Integer, l::Integer, m::Integer) where {T, N, K}
        p = new(plan, n, l, m)
        finalizer(destroy_plan, p)
        p
    end
end

# Element type and dimensionality are read straight off the type parameters.
function eltype(p::FTPlan{T}) where {T}
    return T
end
function ndims(p::FTPlan{T, N}) where {T, N}
    return N
end

# One-dimensional plans print only their length.
function show(io::IO, p::FTPlan{T, 1, K}) where {T, K}
    print(io, "FastTransforms ", kind2string(K), " plan for $(p.n)-element array of ", T)
end

# Kinds printed as n × (2n-1).
for kind in (SPHERE, SPHEREV, SPINSPHERE, SPHERICALISOMETRY)
    @eval function show(io::IO, p::FTPlan{T, 2, $kind}) where T
        print(io, "FastTransforms ", kind2string($kind), " plan for $(p.n)×$(2p.n-1)-element array of ", T)
    end
end

# Kinds printed as n × (4n-3).
for kind in (DISK, ANNULUS)
    @eval function show(io::IO, p::FTPlan{T, 2, $kind}) where T
        print(io, "FastTransforms ", kind2string($kind), " plan for $(p.n)×$(4p.n-3)-element array of ", T)
    end
end

# Kinds printed as n × n.
for kind in (RECTDISK, TRIANGLE)
    @eval function show(io::IO, p::FTPlan{T, 2, $kind}) where T
        print(io, "FastTransforms ", kind2string($kind), " plan for $(p.n)×$(p.n)-element array of ", T)
    end
end

function show(io::IO, p::FTPlan{T, 3, TETRAHEDRON}) where T
    print(io, "FastTransforms ", kind2string(TETRAHEDRON), " plan for $(p.n)×$(p.n)×$(p.n)-element array of ", T)
end

# Fallbacks for every other two- and three-dimensional kind: print the stored sizes.
function show(io::IO, p::FTPlan{T, 2, K}) where {T, K}
    print(io, "FastTransforms plan for ", kind2string(K), " for $(p.n)×$(p.m)-element array of ", T)
end
function show(io::IO, p::FTPlan{T, 3, K}) where {T, K}
    print(io, "FastTransforms plan for ", kind2string(K), " for $(p.n)×$(p.l)×$(p.m)-element array of ", T)
end

# Throw a `DimensionMismatch` unless the leading dimension of `x` equals `p.n`.
function checksize(p::FTPlan{T, 1}, x::StridedArray{T}) where T
    p.n == size(x, 1) && return nothing
    throw(DimensionMismatch("FTPlan has dimensions $(p.n) × $(p.n), x has leading dimension $(size(x, 1))"))
end

# Raise an error unless `x` has unit stride along its first dimension.
function checkstride(p::FTPlan{T, 1}, x::StridedArray{T}) where T
    stride(x, 1) == 1 && return nothing
    error("FTPlan requires unit stride in the leading dimension, x has stride $(stride(x, 1)) in the leading dimension.")
end

# For these kinds only the leading dimension of `x` is compared with `p.n`.
for (rank, kind) in ((2, RECTDISK), (2, TRIANGLE), (3, TETRAHEDRON))
    @eval function checksize(p::FTPlan{T, $rank, $kind}, x::Array{T, $rank}) where T
        p.n == size(x, 1) && return nothing
        throw(DimensionMismatch("FTPlan has dimensions $(p.n) × $(p.n), x has leading dimension $(size(x, 1))"))
    end
end

# For these kinds the leading dimension must equal `p.n` and the number of
# columns must be odd; the row check is performed first.
for kind in (SPHERE, SPHEREV, DISK, ANNULUS, SPINSPHERE)
    @eval function checksize(p::FTPlan{T, 2, $kind}, x::Matrix{T}) where T
        p.n == size(x, 1) || throw(DimensionMismatch("FTPlan has dimensions $(p.n) × $(p.n), x has leading dimension $(size(x, 1))"))
        isodd(size(x, 2)) || throw(DimensionMismatch("This FTPlan only operates on arrays with an odd number of columns."))
        return nothing
    end
end

# Generic two-dimensional check: both stored sizes must match `size(x)`.
function checksize(p::FTPlan{T, 2}, x::Array{T, 2}) where T
    sizes_agree = p.n == size(x, 1) && p.m == size(x, 2)
    sizes_agree || throw(DimensionMismatch("FTPlan has dimensions $(p.n) × $(p.m), x has dimensions $(size(x, 1)) × $(size(x, 2))"))
    return nothing
end

# Generic three-dimensional check: all three stored sizes must match `size(x)`.
function checksize(p::FTPlan{T, 3}, x::Array{T, 3}) where T
    sizes_agree = p.n == size(x, 1) && p.l == size(x, 2) && p.m == size(x, 3)
    sizes_agree || throw(DimensionMismatch("FTPlan has dimensions $(p.n) × $(p.l) × $(p.m), x has dimensions $(size(x, 1)) × $(size(x, 2)) × $(size(x, 3))"))
    return nothing
end

# Spherical isometry plans require `x` to be exactly n × (2n-1).
function checksize(p::FTPlan{T, 2, SPHERICALISOMETRY}, x::Matrix{T}) where T
    sizes_agree = p.n == size(x, 1) && 2p.n-1 == size(x, 2)
    sizes_agree || throw(DimensionMismatch("This FTPlan must operate on arrays of size $(p.n) × $(2p.n-1)."))
    return nothing
end

# Hand the stored C pointer to `ccall`, either as-is or converted to `Ptr{mpfr_t}`.
function unsafe_convert(::Type{Ptr{ft_plan_struct}}, p::FTPlan)
    return p.plan
end
function unsafe_convert(::Type{Ptr{mpfr_t}}, p::FTPlan)
    return unsafe_convert(Ptr{mpfr_t}, p.plan)
end

# `destroy_plan` is the finalizer registered by the `FTPlan` constructors. Each
# method passes the plan to one C destructor selected by element type, rank and kind.

# Default destructors for one-dimensional Float32/Float64 plans.
for (T, cfun) in ((Float32, :ft_destroy_tb_eigen_FMMf),
                  (Float64, :ft_destroy_tb_eigen_FMM))
    @eval function destroy_plan(p::FTPlan{$T, 1})
        ccall(($(QuoteNode(cfun)), libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
    end
end

# One-dimensional BigFloat plans: the destructor also receives `p.n`.
function destroy_plan(p::FTPlan{BigFloat, 1})
    ccall((:ft_mpfr_destroy_plan, libfasttransforms), Cvoid, (Ptr{mpfr_t}, Cint), p, p.n)
end

# Fallback for all remaining Float64 plans.
function destroy_plan(p::FTPlan{Float64})
    ccall((:ft_destroy_harmonic_plan, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
end

# Kind-specific destructors: (element type, rank, kind, C symbol).
for (T, N, K, cfun) in (
        (Float32, 1, ASSOCIATEDJAC2JAC, :ft_destroy_btb_eigen_FMMf),
        (Float64, 1, ASSOCIATEDJAC2JAC, :ft_destroy_btb_eigen_FMM),
        (Float32, 1, MODIFIEDJAC2JAC, :ft_destroy_modified_planf),
        (Float64, 1, MODIFIEDJAC2JAC, :ft_destroy_modified_plan),
        (Float32, 1, MODIFIEDLAG2LAG, :ft_destroy_modified_planf),
        (Float64, 1, MODIFIEDLAG2LAG, :ft_destroy_modified_plan),
        (Float32, 1, MODIFIEDHERM2HERM, :ft_destroy_modified_planf),
        (Float64, 1, MODIFIEDHERM2HERM, :ft_destroy_modified_plan),
        (Complex{Float64}, 2, SPINSPHERE, :ft_destroy_spin_harmonic_plan),
        (Float64, 2, SPHERESYNTHESIS, :ft_destroy_sphere_fftw_plan),
        (Float64, 2, SPHEREANALYSIS, :ft_destroy_sphere_fftw_plan),
        (Float64, 2, SPHEREVSYNTHESIS, :ft_destroy_sphere_fftw_plan),
        (Float64, 2, SPHEREVANALYSIS, :ft_destroy_sphere_fftw_plan),
        (Float64, 2, DISKSYNTHESIS, :ft_destroy_disk_fftw_plan),
        (Float64, 2, DISKANALYSIS, :ft_destroy_disk_fftw_plan),
        (Float64, 2, ANNULUSSYNTHESIS, :ft_destroy_annulus_fftw_plan),
        (Float64, 2, ANNULUSANALYSIS, :ft_destroy_annulus_fftw_plan),
        (Float64, 2, RECTDISKSYNTHESIS, :ft_destroy_rectdisk_fftw_plan),
        (Float64, 2, RECTDISKANALYSIS, :ft_destroy_rectdisk_fftw_plan),
        (Float64, 2, TRIANGLESYNTHESIS, :ft_destroy_triangle_fftw_plan),
        (Float64, 2, TRIANGLEANALYSIS, :ft_destroy_triangle_fftw_plan),
        (Float64, 3, TETRAHEDRONSYNTHESIS, :ft_destroy_tetrahedron_fftw_plan),
        (Float64, 3, TETRAHEDRONANALYSIS, :ft_destroy_tetrahedron_fftw_plan),
        (Complex{Float64}, 2, SPINSPHERESYNTHESIS, :ft_destroy_spinsphere_fftw_plan),
        (Complex{Float64}, 2, SPINSPHEREANALYSIS, :ft_destroy_spinsphere_fftw_plan),
        (Float64, 2, SPHERICALISOMETRY, :ft_destroy_sph_isometry_plan))
    @eval function destroy_plan(p::FTPlan{$T, $N, $K})
        ccall(($(QuoteNode(cfun)), libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
    end
end

# Wrapper marking a plan as adjointed. `parent` is the wrapped plan; `adjoint` is
# an optional second plan. The one-argument constructor leaves `adjoint` unset.
struct AdjointFTPlan{T, S, R}
    parent::S
    adjoint::R
    AdjointFTPlan{T, S, R}(parent::S) where {T, S, R} = new{T, S, R}(parent)
    AdjointFTPlan{T, S, R}(parent::S, adjoint::R) where {T, S, R} = new{T, S, R}(parent, adjoint)
end

# Wrap `p` alone; the `adjoint` field is left unset.
function AdjointFTPlan(p::FTPlan)
    P = typeof(p)
    return AdjointFTPlan{eltype(p), P, P}(p)
end

# Wrap `p` together with a second plan `q`; the element type is taken from `q`.
function AdjointFTPlan(p::FTPlan, q::FTPlan)
    return AdjointFTPlan{eltype(q), typeof(p), typeof(q)}(p, q)
end

# Taking the adjoint wraps a plan; taking it again returns the wrapped plan.
function adjoint(p::FTPlan)
    return AdjointFTPlan(p)
end
function adjoint(p::AdjointFTPlan)
    return p.parent
end

function eltype(p::AdjointFTPlan{T}) where T
    return T
end
function ndims(p::AdjointFTPlan)
    return ndims(p.parent)
end
# Print the wrapped plan prefixed with "Adjoint ".
show(io::IO, p::AdjointFTPlan) = (print(io, "Adjoint "); show(io, p.parent))

# Check the size of `x` against the `adjoint` plan when that field is set, and
# against the `parent` plan otherwise.
#
# The field is tested with `isdefined` rather than by catching the error raised on
# access to an unset field, so no exception handling runs on the normal path and
# unrelated errors are not swallowed.
function checksize(p::AdjointFTPlan, x)
    if isdefined(p, :adjoint)
        return checksize(p.adjoint, x)
    else
        return checksize(p.parent, x)
    end
end

# Check the stride of `x` against the `adjoint` plan when that field is set, and
# against the `parent` plan otherwise.
#
# The field is tested with `isdefined` rather than by catching the error raised on
# access to an unset field, so unrelated errors are not swallowed.
function checkstride(p::AdjointFTPlan, x)
    if isdefined(p, :adjoint)
        return checkstride(p.adjoint, x)
    else
        return checkstride(p.parent, x)
    end
end

# Return the C pointer of the `adjoint` plan when that field is set, and of the
# `parent` plan otherwise.
#
# The field is tested with `isdefined` instead of a `try`/`catch`, which keeps
# exception handling out of `ccall` argument conversion.
function unsafe_convert(::Type{Ptr{ft_plan_struct}}, p::AdjointFTPlan)
    if isdefined(p, :adjoint)
        return unsafe_convert(Ptr{ft_plan_struct}, p.adjoint)
    else
        return unsafe_convert(Ptr{ft_plan_struct}, p.parent)
    end
end

# Return the `Ptr{mpfr_t}` of the `adjoint` plan when that field is set, and of
# the `parent` plan otherwise.
#
# The field is tested with `isdefined` instead of a `try`/`catch`, which keeps
# exception handling out of `ccall` argument conversion.
function unsafe_convert(::Type{Ptr{mpfr_t}}, p::AdjointFTPlan)
    if isdefined(p, :adjoint)
        return unsafe_convert(Ptr{mpfr_t}, p.adjoint)
    else
        return unsafe_convert(Ptr{mpfr_t}, p.parent)
    end
end

# Wrapper marking a plan as transposed. `parent` is the wrapped plan; `transpose`
# is an optional second plan. The one-argument constructor leaves `transpose` unset.
struct TransposeFTPlan{T, S, R}
    parent::S
    transpose::R
    TransposeFTPlan{T, S, R}(parent::S) where {T, S, R} = new{T, S, R}(parent)
    TransposeFTPlan{T, S, R}(parent::S, transpose::R) where {T, S, R} = new{T, S, R}(parent, transpose)
end

# Wrap `p` alone; the `transpose` field is left unset.
function TransposeFTPlan(p::FTPlan)
    P = typeof(p)
    return TransposeFTPlan{eltype(p), P, P}(p)
end

# Wrap `p` together with a second plan `q`; the element type is taken from `q`.
function TransposeFTPlan(p::FTPlan, q::FTPlan)
    return TransposeFTPlan{eltype(q), typeof(p), typeof(q)}(p, q)
end

# Transposing wraps a plan; transposing again returns the wrapped plan.
function transpose(p::FTPlan)
    return TransposeFTPlan(p)
end
function transpose(p::TransposeFTPlan)
    return p.parent
end

function eltype(p::TransposeFTPlan{T}) where T
    return T
end
function ndims(p::TransposeFTPlan)
    return ndims(p.parent)
end
# Print the wrapped plan prefixed with "Transpose ".
show(io::IO, p::TransposeFTPlan) = (print(io, "Transpose "); show(io, p.parent))

# Check the size of `x` against the `transpose` plan when that field is set, and
# against the `parent` plan otherwise.
#
# The field is tested with `isdefined` rather than by catching the error raised on
# access to an unset field, so no exception handling runs on the normal path and
# unrelated errors are not swallowed.
function checksize(p::TransposeFTPlan, x)
    if isdefined(p, :transpose)
        return checksize(p.transpose, x)
    else
        return checksize(p.parent, x)
    end
end

# Check the stride of `x` against the `transpose` plan when that field is set, and
# against the `parent` plan otherwise.
#
# The field is tested with `isdefined` rather than by catching the error raised on
# access to an unset field, so unrelated errors are not swallowed.
function checkstride(p::TransposeFTPlan, x)
    if isdefined(p, :transpose)
        return checkstride(p.transpose, x)
    else
        return checkstride(p.parent, x)
    end
end

# Return the C pointer of the `transpose` plan when that field is set, and of the
# `parent` plan otherwise.
#
# The field is tested with `isdefined` instead of a `try`/`catch`, which keeps
# exception handling out of `ccall` argument conversion.
function unsafe_convert(::Type{Ptr{ft_plan_struct}}, p::TransposeFTPlan)
    if isdefined(p, :transpose)
        return unsafe_convert(Ptr{ft_plan_struct}, p.transpose)
    else
        return unsafe_convert(Ptr{ft_plan_struct}, p.parent)
    end
end

# Return the `Ptr{mpfr_t}` of the `transpose` plan when that field is set, and of
# the `parent` plan otherwise.
#
# The field is tested with `isdefined` instead of a `try`/`catch`, which keeps
# exception handling out of `ccall` argument conversion.
function unsafe_convert(::Type{Ptr{mpfr_t}}, p::TransposeFTPlan)
    if isdefined(p, :transpose)
        return unsafe_convert(Ptr{mpfr_t}, p.transpose)
    else
        return unsafe_convert(Ptr{mpfr_t}, p.parent)
    end
end

# Alias covering the three one-dimensional "modified" plan kinds.
const ModifiedFTPlan{T} = Union{
    FTPlan{T, 1, MODIFIEDJAC2JAC},
    FTPlan{T, 1, MODIFIEDLAG2LAG},
    FTPlan{T, 1, MODIFIEDHERM2HERM},
}

# For every transform name, generate three convenience methods:
#   plan_<name>(x, ...)       plans from the element type and leading size of `x`;
#   plan_<name>(Complex{T})   forwards to the real element type `T`;
#   lib_<name>(x, ...)        plans from `x` and applies the plan to `x`.
for name in (:leg2cheb, :cheb2leg, :ultra2ultra, :jac2jac,
             :lag2lag, :jac2ultra, :ultra2jac, :jac2cheb,
             :cheb2jac, :ultra2cheb, :cheb2ultra, :associatedjac2jac,
             :modifiedjac2jac, :modifiedlag2lag, :modifiedherm2herm,
             :sph2fourier, :sphv2fourier, :disk2cxf, :ann2cxf,
             :rectdisk2cheb, :tri2cheb, :tet2cheb)
    planner = Symbol(:plan_, name)
    applier = Symbol(:lib_, name)
    @eval begin
        function $planner(x::AbstractArray{T}, y...; z...) where T
            return $planner(T, size(x, 1), y...; z...)
        end
        function $planner(::Type{Complex{T}}, y...; z...) where T <: Real
            return $planner(T, y...; z...)
        end
        function $applier(x::AbstractArray, y...; z...)
            return $planner(x, y...; z...)*x
        end
    end
end

# Generate the reverse transforms: each one builds the forward plan from `x` and
# applies it with `\`.
for (inverse, planner) in ((:fourier2sph, :plan_sph2fourier),
                           (:fourier2sphv, :plan_sphv2fourier),
                           (:cxf2disk, :plan_disk2cxf),
                           (:cxf2ann, :plan_ann2cxf),
                           (:cheb2rectdisk, :plan_rectdisk2cheb),
                           (:cheb2tri, :plan_tri2cheb),
                           (:cheb2tet, :plan_tet2cheb))
    @eval function $inverse(x::AbstractArray, y...; z...)
        return $planner(x, y...; z...)\x
    end
end

# Spin-weighted convenience methods: plan from the element type and leading size
# of `x`, then apply the plan forwards (`*`) or backwards (`\`).
function plan_spinsph2fourier(x::AbstractArray{T}, y...; z...) where T
    return plan_spinsph2fourier(T, size(x, 1), y...; z...)
end
function spinsph2fourier(x::AbstractArray, y...; z...)
    return plan_spinsph2fourier(x, y...; z...)*x
end
function fourier2spinsph(x::AbstractArray, y...; z...)
    return plan_spinsph2fourier(x, y...; z...)\x
end

# Float32 LEG2CHEB plan of size `n`, built by `ft_plan_legendre_to_chebyshevf`.
function plan_leg2cheb(::Type{Float32}, n::Integer; normleg::Bool=false, normcheb::Bool=false)
    ptr = ccall((:ft_plan_legendre_to_chebyshevf, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint),
                normleg, normcheb, n)
    FTPlan{Float32, 1, LEG2CHEB}(ptr, n)
end

# Float32 CHEB2LEG plan of size `n`, built by `ft_plan_chebyshev_to_legendref`.
function plan_cheb2leg(::Type{Float32}, n::Integer; normcheb::Bool=false, normleg::Bool=false)
    ptr = ccall((:ft_plan_chebyshev_to_legendref, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint),
                normcheb, normleg, n)
    FTPlan{Float32, 1, CHEB2LEG}(ptr, n)
end

# Float32 ULTRA2ULTRA plan of size `n` with parameters `λ`, `μ`, built by
# `ft_plan_ultraspherical_to_ultrasphericalf`.
function plan_ultra2ultra(::Type{Float32}, n::Integer, λ, μ; norm1::Bool=false, norm2::Bool=false)
    ptr = ccall((:ft_plan_ultraspherical_to_ultrasphericalf, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Float32, Float32),
                norm1, norm2, n, λ, μ)
    FTPlan{Float32, 1, ULTRA2ULTRA}(ptr, n)
end

# Float32 JAC2JAC plan of size `n` with parameters `α`, `β`, `γ`, `δ`, built by
# `ft_plan_jacobi_to_jacobif`.
function plan_jac2jac(::Type{Float32}, n::Integer, α, β, γ, δ; norm1::Bool=false, norm2::Bool=false)
    ptr = ccall((:ft_plan_jacobi_to_jacobif, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Float32, Float32, Float32, Float32),
                norm1, norm2, n, α, β, γ, δ)
    FTPlan{Float32, 1, JAC2JAC}(ptr, n)
end

# Float32 LAG2LAG plan of size `n` with parameters `α`, `β`, built by
# `ft_plan_laguerre_to_laguerref`.
function plan_lag2lag(::Type{Float32}, n::Integer, α, β; norm1::Bool=false, norm2::Bool=false)
    ptr = ccall((:ft_plan_laguerre_to_laguerref, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Float32, Float32),
                norm1, norm2, n, α, β)
    FTPlan{Float32, 1, LAG2LAG}(ptr, n)
end

# Float32 JAC2ULTRA plan of size `n` with parameters `α`, `β`, `λ`, built by
# `ft_plan_jacobi_to_ultrasphericalf`.
function plan_jac2ultra(::Type{Float32}, n::Integer, α, β, λ; normjac::Bool=false, normultra::Bool=false)
    ptr = ccall((:ft_plan_jacobi_to_ultrasphericalf, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Float32, Float32, Float32),
                normjac, normultra, n, α, β, λ)
    FTPlan{Float32, 1, JAC2ULTRA}(ptr, n)
end

# Float32 ULTRA2JAC plan of size `n` with parameters `λ`, `α`, `β`, built by
# `ft_plan_ultraspherical_to_jacobif`.
function plan_ultra2jac(::Type{Float32}, n::Integer, λ, α, β; normultra::Bool=false, normjac::Bool=false)
    ptr = ccall((:ft_plan_ultraspherical_to_jacobif, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Float32, Float32, Float32),
                normultra, normjac, n, λ, α, β)
    FTPlan{Float32, 1, ULTRA2JAC}(ptr, n)
end

# Float32 JAC2CHEB plan of size `n` with parameters `α`, `β`, built by
# `ft_plan_jacobi_to_chebyshevf`.
function plan_jac2cheb(::Type{Float32}, n::Integer, α, β; normjac::Bool=false, normcheb::Bool=false)
    ptr = ccall((:ft_plan_jacobi_to_chebyshevf, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Float32, Float32),
                normjac, normcheb, n, α, β)
    FTPlan{Float32, 1, JAC2CHEB}(ptr, n)
end

# Float32 CHEB2JAC plan of size `n` with parameters `α`, `β`, built by
# `ft_plan_chebyshev_to_jacobif`.
function plan_cheb2jac(::Type{Float32}, n::Integer, α, β; normcheb::Bool=false, normjac::Bool=false)
    ptr = ccall((:ft_plan_chebyshev_to_jacobif, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Float32, Float32),
                normcheb, normjac, n, α, β)
    FTPlan{Float32, 1, CHEB2JAC}(ptr, n)
end

# Float32 ULTRA2CHEB plan of size `n` with parameter `λ`, built by
# `ft_plan_ultraspherical_to_chebyshevf`.
function plan_ultra2cheb(::Type{Float32}, n::Integer, λ; normultra::Bool=false, normcheb::Bool=false)
    ptr = ccall((:ft_plan_ultraspherical_to_chebyshevf, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Float32),
                normultra, normcheb, n, λ)
    FTPlan{Float32, 1, ULTRA2CHEB}(ptr, n)
end

# Float32 CHEB2ULTRA plan of size `n` with parameter `λ`, built by
# `ft_plan_chebyshev_to_ultrasphericalf`.
function plan_cheb2ultra(::Type{Float32}, n::Integer, λ; normcheb::Bool=false, normultra::Bool=false)
    ptr = ccall((:ft_plan_chebyshev_to_ultrasphericalf, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Float32),
                normcheb, normultra, n, λ)
    FTPlan{Float32, 1, CHEB2ULTRA}(ptr, n)
end

# Float32 ASSOCIATEDJAC2JAC plan of size `n` with integer `c` and parameters
# `α`, `β`, `γ`, `δ`, built by `ft_plan_associated_jacobi_to_jacobif`.
function plan_associatedjac2jac(::Type{Float32}, n::Integer, c::Integer, α, β, γ, δ; norm1::Bool=false, norm2::Bool=false)
    ptr = ccall((:ft_plan_associated_jacobi_to_jacobif, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Cint, Float32, Float32, Float32, Float32),
                norm1, norm2, n, c, α, β, γ, δ)
    FTPlan{Float32, 1, ASSOCIATEDJAC2JAC}(ptr, n)
end

# Float32 MODIFIEDJAC2JAC plan from the vector `u` only: the second vector is
# passed to `ft_plan_modified_jacobi_to_jacobif` as length 0 with a null pointer.
function plan_modifiedjac2jac(::Type{Float32}, n::Integer, α, β, u::Vector{Float32}; verbose::Bool=false)
    ptr = ccall((:ft_plan_modified_jacobi_to_jacobif, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Float32, Float32, Cint, Ptr{Float32}, Cint, Ptr{Float32}, Cint),
                n, α, β, length(u), u, 0, C_NULL, verbose)
    FTPlan{Float32, 1, MODIFIEDJAC2JAC}(ptr, n)
end

# Float32 MODIFIEDJAC2JAC plan from both vectors `u` and `v`, built by
# `ft_plan_modified_jacobi_to_jacobif`.
function plan_modifiedjac2jac(::Type{Float32}, n::Integer, α, β, u::Vector{Float32}, v::Vector{Float32}; verbose::Bool=false)
    ptr = ccall((:ft_plan_modified_jacobi_to_jacobif, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Float32, Float32, Cint, Ptr{Float32}, Cint, Ptr{Float32}, Cint),
                n, α, β, length(u), u, length(v), v, verbose)
    FTPlan{Float32, 1, MODIFIEDJAC2JAC}(ptr, n)
end

# Float32 MODIFIEDLAG2LAG plan from the vector `u` only: the second vector is
# passed to `ft_plan_modified_laguerre_to_laguerref` as length 0 with a null pointer.
function plan_modifiedlag2lag(::Type{Float32}, n::Integer, α, u::Vector{Float32}; verbose::Bool=false)
    ptr = ccall((:ft_plan_modified_laguerre_to_laguerref, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Float32, Cint, Ptr{Float32}, Cint, Ptr{Float32}, Cint),
                n, α, length(u), u, 0, C_NULL, verbose)
    FTPlan{Float32, 1, MODIFIEDLAG2LAG}(ptr, n)
end

# Float32 MODIFIEDLAG2LAG plan from both vectors `u` and `v`, built by
# `ft_plan_modified_laguerre_to_laguerref`.
function plan_modifiedlag2lag(::Type{Float32}, n::Integer, α, u::Vector{Float32}, v::Vector{Float32}; verbose::Bool=false)
    ptr = ccall((:ft_plan_modified_laguerre_to_laguerref, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Float32, Cint, Ptr{Float32}, Cint, Ptr{Float32}, Cint),
                n, α, length(u), u, length(v), v, verbose)
    FTPlan{Float32, 1, MODIFIEDLAG2LAG}(ptr, n)
end

# Float32 MODIFIEDHERM2HERM plan from the vector `u` only: the second vector is
# passed to `ft_plan_modified_hermite_to_hermitef` as length 0 with a null pointer.
function plan_modifiedherm2herm(::Type{Float32}, n::Integer, u::Vector{Float32}; verbose::Bool=false)
    ptr = ccall((:ft_plan_modified_hermite_to_hermitef, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Ptr{Float32}, Cint, Ptr{Float32}, Cint),
                n, length(u), u, 0, C_NULL, verbose)
    FTPlan{Float32, 1, MODIFIEDHERM2HERM}(ptr, n)
end

# Float32 MODIFIEDHERM2HERM plan from both vectors `u` and `v`, built by
# `ft_plan_modified_hermite_to_hermitef`.
function plan_modifiedherm2herm(::Type{Float32}, n::Integer, u::Vector{Float32}, v::Vector{Float32}; verbose::Bool=false)
    ptr = ccall((:ft_plan_modified_hermite_to_hermitef, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Ptr{Float32}, Cint, Ptr{Float32}, Cint),
                n, length(u), u, length(v), v, verbose)
    FTPlan{Float32, 1, MODIFIEDHERM2HERM}(ptr, n)
end


# Float64 LEG2CHEB plan of size `n`, built by `ft_plan_legendre_to_chebyshev`.
function plan_leg2cheb(::Type{Float64}, n::Integer; normleg::Bool=false, normcheb::Bool=false)
    ptr = ccall((:ft_plan_legendre_to_chebyshev, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint),
                normleg, normcheb, n)
    FTPlan{Float64, 1, LEG2CHEB}(ptr, n)
end

# Float64 CHEB2LEG plan of size `n`, built by `ft_plan_chebyshev_to_legendre`.
function plan_cheb2leg(::Type{Float64}, n::Integer; normcheb::Bool=false, normleg::Bool=false)
    ptr = ccall((:ft_plan_chebyshev_to_legendre, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint),
                normcheb, normleg, n)
    FTPlan{Float64, 1, CHEB2LEG}(ptr, n)
end

# Float64 ULTRA2ULTRA plan of size `n` with parameters `λ`, `μ`, built by
# `ft_plan_ultraspherical_to_ultraspherical`.
function plan_ultra2ultra(::Type{Float64}, n::Integer, λ, μ; norm1::Bool=false, norm2::Bool=false)
    ptr = ccall((:ft_plan_ultraspherical_to_ultraspherical, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Float64, Float64),
                norm1, norm2, n, λ, μ)
    FTPlan{Float64, 1, ULTRA2ULTRA}(ptr, n)
end

# Float64 JAC2JAC plan of size `n` with parameters `α`, `β`, `γ`, `δ`, built by
# `ft_plan_jacobi_to_jacobi`.
function plan_jac2jac(::Type{Float64}, n::Integer, α, β, γ, δ; norm1::Bool=false, norm2::Bool=false)
    ptr = ccall((:ft_plan_jacobi_to_jacobi, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Float64, Float64, Float64, Float64),
                norm1, norm2, n, α, β, γ, δ)
    FTPlan{Float64, 1, JAC2JAC}(ptr, n)
end

# Float64 LAG2LAG plan of size `n` with parameters `α`, `β`, built by
# `ft_plan_laguerre_to_laguerre`.
function plan_lag2lag(::Type{Float64}, n::Integer, α, β; norm1::Bool=false, norm2::Bool=false)
    ptr = ccall((:ft_plan_laguerre_to_laguerre, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Float64, Float64),
                norm1, norm2, n, α, β)
    FTPlan{Float64, 1, LAG2LAG}(ptr, n)
end

# Float64 JAC2ULTRA plan of size `n` with parameters `α`, `β`, `λ`, built by
# `ft_plan_jacobi_to_ultraspherical`.
function plan_jac2ultra(::Type{Float64}, n::Integer, α, β, λ; normjac::Bool=false, normultra::Bool=false)
    ptr = ccall((:ft_plan_jacobi_to_ultraspherical, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Float64, Float64, Float64),
                normjac, normultra, n, α, β, λ)
    FTPlan{Float64, 1, JAC2ULTRA}(ptr, n)
end

# Float64 ULTRA2JAC plan of size `n` with parameters `λ`, `α`, `β`, built by
# `ft_plan_ultraspherical_to_jacobi`.
function plan_ultra2jac(::Type{Float64}, n::Integer, λ, α, β; normultra::Bool=false, normjac::Bool=false)
    ptr = ccall((:ft_plan_ultraspherical_to_jacobi, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Float64, Float64, Float64),
                normultra, normjac, n, λ, α, β)
    FTPlan{Float64, 1, ULTRA2JAC}(ptr, n)
end

# Float64 JAC2CHEB plan of size `n` with parameters `α`, `β`, built by
# `ft_plan_jacobi_to_chebyshev`.
function plan_jac2cheb(::Type{Float64}, n::Integer, α, β; normjac::Bool=false, normcheb::Bool=false)
    ptr = ccall((:ft_plan_jacobi_to_chebyshev, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Float64, Float64),
                normjac, normcheb, n, α, β)
    FTPlan{Float64, 1, JAC2CHEB}(ptr, n)
end

# Float64 CHEB2JAC plan of size `n` with parameters `α`, `β`, built by
# `ft_plan_chebyshev_to_jacobi`.
function plan_cheb2jac(::Type{Float64}, n::Integer, α, β; normcheb::Bool=false, normjac::Bool=false)
    ptr = ccall((:ft_plan_chebyshev_to_jacobi, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Float64, Float64),
                normcheb, normjac, n, α, β)
    FTPlan{Float64, 1, CHEB2JAC}(ptr, n)
end

# Float64 ULTRA2CHEB plan of size `n` with parameter `λ`, built by
# `ft_plan_ultraspherical_to_chebyshev`.
function plan_ultra2cheb(::Type{Float64}, n::Integer, λ; normultra::Bool=false, normcheb::Bool=false)
    ptr = ccall((:ft_plan_ultraspherical_to_chebyshev, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Float64),
                normultra, normcheb, n, λ)
    FTPlan{Float64, 1, ULTRA2CHEB}(ptr, n)
end

# Float64 CHEB2ULTRA plan of size `n` with parameter `λ`, built by
# `ft_plan_chebyshev_to_ultraspherical`.
function plan_cheb2ultra(::Type{Float64}, n::Integer, λ; normcheb::Bool=false, normultra::Bool=false)
    ptr = ccall((:ft_plan_chebyshev_to_ultraspherical, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Float64),
                normcheb, normultra, n, λ)
    FTPlan{Float64, 1, CHEB2ULTRA}(ptr, n)
end

# Float64 ASSOCIATEDJAC2JAC plan of size `n` with integer `c` and parameters
# `α`, `β`, `γ`, `δ`, built by `ft_plan_associated_jacobi_to_jacobi`.
function plan_associatedjac2jac(::Type{Float64}, n::Integer, c::Integer, α, β, γ, δ; norm1::Bool=false, norm2::Bool=false)
    ptr = ccall((:ft_plan_associated_jacobi_to_jacobi, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Cint, Float64, Float64, Float64, Float64),
                norm1, norm2, n, c, α, β, γ, δ)
    FTPlan{Float64, 1, ASSOCIATEDJAC2JAC}(ptr, n)
end

# Float64 MODIFIEDJAC2JAC plan from the vector `u` only: the second vector is
# passed to `ft_plan_modified_jacobi_to_jacobi` as length 0 with a null pointer.
function plan_modifiedjac2jac(::Type{Float64}, n::Integer, α, β, u::Vector{Float64}; verbose::Bool=false)
    ptr = ccall((:ft_plan_modified_jacobi_to_jacobi, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Float64, Float64, Cint, Ptr{Float64}, Cint, Ptr{Float64}, Cint),
                n, α, β, length(u), u, 0, C_NULL, verbose)
    FTPlan{Float64, 1, MODIFIEDJAC2JAC}(ptr, n)
end

# Float64 MODIFIEDJAC2JAC plan from both vectors `u` and `v`, built by
# `ft_plan_modified_jacobi_to_jacobi`.
function plan_modifiedjac2jac(::Type{Float64}, n::Integer, α, β, u::Vector{Float64}, v::Vector{Float64}; verbose::Bool=false)
    ptr = ccall((:ft_plan_modified_jacobi_to_jacobi, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Float64, Float64, Cint, Ptr{Float64}, Cint, Ptr{Float64}, Cint),
                n, α, β, length(u), u, length(v), v, verbose)
    FTPlan{Float64, 1, MODIFIEDJAC2JAC}(ptr, n)
end

# Float64 MODIFIEDLAG2LAG plan from the vector `u` only: the second vector is
# passed to `ft_plan_modified_laguerre_to_laguerre` as length 0 with a null pointer.
function plan_modifiedlag2lag(::Type{Float64}, n::Integer, α, u::Vector{Float64}; verbose::Bool=false)
    ptr = ccall((:ft_plan_modified_laguerre_to_laguerre, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Float64, Cint, Ptr{Float64}, Cint, Ptr{Float64}, Cint),
                n, α, length(u), u, 0, C_NULL, verbose)
    FTPlan{Float64, 1, MODIFIEDLAG2LAG}(ptr, n)
end

# Float64 MODIFIEDLAG2LAG plan from both vectors `u` and `v`, built by
# `ft_plan_modified_laguerre_to_laguerre`.
function plan_modifiedlag2lag(::Type{Float64}, n::Integer, α, u::Vector{Float64}, v::Vector{Float64}; verbose::Bool=false)
    ptr = ccall((:ft_plan_modified_laguerre_to_laguerre, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Float64, Cint, Ptr{Float64}, Cint, Ptr{Float64}, Cint),
                n, α, length(u), u, length(v), v, verbose)
    FTPlan{Float64, 1, MODIFIEDLAG2LAG}(ptr, n)
end

# Float64 MODIFIEDHERM2HERM plan from the vector `u` only: the second vector is
# passed to `ft_plan_modified_hermite_to_hermite` as length 0 with a null pointer.
function plan_modifiedherm2herm(::Type{Float64}, n::Integer, u::Vector{Float64}; verbose::Bool=false)
    ptr = ccall((:ft_plan_modified_hermite_to_hermite, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Ptr{Float64}, Cint, Ptr{Float64}, Cint),
                n, length(u), u, 0, C_NULL, verbose)
    FTPlan{Float64, 1, MODIFIEDHERM2HERM}(ptr, n)
end

# Float64 MODIFIEDHERM2HERM plan from both vectors `u` and `v`, built by
# `ft_plan_modified_hermite_to_hermite`.
function plan_modifiedherm2herm(::Type{Float64}, n::Integer, u::Vector{Float64}, v::Vector{Float64}; verbose::Bool=false)
    ptr = ccall((:ft_plan_modified_hermite_to_hermite, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Ptr{Float64}, Cint, Ptr{Float64}, Cint),
                n, length(u), u, length(v), v, verbose)
    FTPlan{Float64, 1, MODIFIEDHERM2HERM}(ptr, n)
end


# BigFloat LEG2CHEB plan of size `n`, built by `ft_mpfr_plan_legendre_to_chebyshev`
# with the current `BigFloat` precision and MPFR rounding mode.
function plan_leg2cheb(::Type{BigFloat}, n::Integer; normleg::Bool=false, normcheb::Bool=false)
    prec = precision(BigFloat)
    rnd = Base.MPFR.ROUNDING_MODE[]
    ptr = ccall((:ft_mpfr_plan_legendre_to_chebyshev, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Clong, Int32),
                normleg, normcheb, n, prec, rnd)
    FTPlan{BigFloat, 1, LEG2CHEB}(ptr, n)
end

# BigFloat CHEB2LEG plan of size `n`, built by `ft_mpfr_plan_chebyshev_to_legendre`
# with the current `BigFloat` precision and MPFR rounding mode.
function plan_cheb2leg(::Type{BigFloat}, n::Integer; normcheb::Bool=false, normleg::Bool=false)
    prec = precision(BigFloat)
    rnd = Base.MPFR.ROUNDING_MODE[]
    ptr = ccall((:ft_mpfr_plan_chebyshev_to_legendre, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Clong, Int32),
                normcheb, normleg, n, prec, rnd)
    FTPlan{BigFloat, 1, CHEB2LEG}(ptr, n)
end

# BigFloat ULTRA2ULTRA plan of size `n` with parameters `λ`, `μ`, built by
# `ft_mpfr_plan_ultraspherical_to_ultraspherical` with the current `BigFloat`
# precision and MPFR rounding mode.
function plan_ultra2ultra(::Type{BigFloat}, n::Integer, λ, μ; norm1::Bool=false, norm2::Bool=false)
    prec = precision(BigFloat)
    rnd = Base.MPFR.ROUNDING_MODE[]
    ptr = ccall((:ft_mpfr_plan_ultraspherical_to_ultraspherical, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Ref{BigFloat}, Ref{BigFloat}, Clong, Int32),
                norm1, norm2, n, λ, μ, prec, rnd)
    FTPlan{BigFloat, 1, ULTRA2ULTRA}(ptr, n)
end

# BigFloat JAC2JAC plan of size `n` with parameters `α`, `β`, `γ`, `δ`, built by
# `ft_mpfr_plan_jacobi_to_jacobi` with the current `BigFloat` precision and MPFR
# rounding mode.
function plan_jac2jac(::Type{BigFloat}, n::Integer, α, β, γ, δ; norm1::Bool=false, norm2::Bool=false)
    prec = precision(BigFloat)
    rnd = Base.MPFR.ROUNDING_MODE[]
    ptr = ccall((:ft_mpfr_plan_jacobi_to_jacobi, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Ref{BigFloat}, Ref{BigFloat}, Ref{BigFloat}, Ref{BigFloat}, Clong, Int32),
                norm1, norm2, n, α, β, γ, δ, prec, rnd)
    FTPlan{BigFloat, 1, JAC2JAC}(ptr, n)
end

# BigFloat LAG2LAG plan of size `n` with parameters `α`, `β`, built by
# `ft_mpfr_plan_laguerre_to_laguerre` with the current `BigFloat` precision and
# MPFR rounding mode.
function plan_lag2lag(::Type{BigFloat}, n::Integer, α, β; norm1::Bool=false, norm2::Bool=false)
    prec = precision(BigFloat)
    rnd = Base.MPFR.ROUNDING_MODE[]
    ptr = ccall((:ft_mpfr_plan_laguerre_to_laguerre, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Ref{BigFloat}, Ref{BigFloat}, Clong, Int32),
                norm1, norm2, n, α, β, prec, rnd)
    FTPlan{BigFloat, 1, LAG2LAG}(ptr, n)
end

# BigFloat JAC2ULTRA plan of size `n` with parameters `α`, `β`, `λ`, built by
# `ft_mpfr_plan_jacobi_to_ultraspherical` with the current `BigFloat` precision
# and MPFR rounding mode.
function plan_jac2ultra(::Type{BigFloat}, n::Integer, α, β, λ; normjac::Bool=false, normultra::Bool=false)
    prec = precision(BigFloat)
    rnd = Base.MPFR.ROUNDING_MODE[]
    ptr = ccall((:ft_mpfr_plan_jacobi_to_ultraspherical, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Ref{BigFloat}, Ref{BigFloat}, Ref{BigFloat}, Clong, Int32),
                normjac, normultra, n, α, β, λ, prec, rnd)
    FTPlan{BigFloat, 1, JAC2ULTRA}(ptr, n)
end

# BigFloat ULTRA2JAC plan of size `n` with parameters `λ`, `α`, `β`, built by
# `ft_mpfr_plan_ultraspherical_to_jacobi` with the current `BigFloat` precision
# and MPFR rounding mode.
function plan_ultra2jac(::Type{BigFloat}, n::Integer, λ, α, β; normultra::Bool=false, normjac::Bool=false)
    prec = precision(BigFloat)
    rnd = Base.MPFR.ROUNDING_MODE[]
    ptr = ccall((:ft_mpfr_plan_ultraspherical_to_jacobi, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Ref{BigFloat}, Ref{BigFloat}, Ref{BigFloat}, Clong, Int32),
                normultra, normjac, n, λ, α, β, prec, rnd)
    FTPlan{BigFloat, 1, ULTRA2JAC}(ptr, n)
end

# BigFloat JAC2CHEB plan of size `n` with parameters `α`, `β`, built by
# `ft_mpfr_plan_jacobi_to_chebyshev` with the current `BigFloat` precision and
# MPFR rounding mode.
function plan_jac2cheb(::Type{BigFloat}, n::Integer, α, β; normjac::Bool=false, normcheb::Bool=false)
    prec = precision(BigFloat)
    rnd = Base.MPFR.ROUNDING_MODE[]
    ptr = ccall((:ft_mpfr_plan_jacobi_to_chebyshev, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Ref{BigFloat}, Ref{BigFloat}, Clong, Int32),
                normjac, normcheb, n, α, β, prec, rnd)
    FTPlan{BigFloat, 1, JAC2CHEB}(ptr, n)
end

# BigFloat CHEB2JAC plan of size `n` with parameters `α`, `β`, built by
# `ft_mpfr_plan_chebyshev_to_jacobi` with the current `BigFloat` precision and
# MPFR rounding mode.
function plan_cheb2jac(::Type{BigFloat}, n::Integer, α, β; normcheb::Bool=false, normjac::Bool=false)
    prec = precision(BigFloat)
    rnd = Base.MPFR.ROUNDING_MODE[]
    ptr = ccall((:ft_mpfr_plan_chebyshev_to_jacobi, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Ref{BigFloat}, Ref{BigFloat}, Clong, Int32),
                normcheb, normjac, n, α, β, prec, rnd)
    FTPlan{BigFloat, 1, CHEB2JAC}(ptr, n)
end

# BigFloat ULTRA2CHEB plan of size `n` with parameter `λ`, built by
# `ft_mpfr_plan_ultraspherical_to_chebyshev` with the current `BigFloat`
# precision and MPFR rounding mode.
function plan_ultra2cheb(::Type{BigFloat}, n::Integer, λ; normultra::Bool=false, normcheb::Bool=false)
    prec = precision(BigFloat)
    rnd = Base.MPFR.ROUNDING_MODE[]
    ptr = ccall((:ft_mpfr_plan_ultraspherical_to_chebyshev, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Ref{BigFloat}, Clong, Int32),
                normultra, normcheb, n, λ, prec, rnd)
    FTPlan{BigFloat, 1, ULTRA2CHEB}(ptr, n)
end

# BigFloat CHEB2ULTRA plan of size `n` with parameter `λ`, built by
# `ft_mpfr_plan_chebyshev_to_ultraspherical` with the current `BigFloat`
# precision and MPFR rounding mode.
function plan_cheb2ultra(::Type{BigFloat}, n::Integer, λ; normcheb::Bool=false, normultra::Bool=false)
    prec = precision(BigFloat)
    rnd = Base.MPFR.ROUNDING_MODE[]
    ptr = ccall((:ft_mpfr_plan_chebyshev_to_ultraspherical, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Ref{BigFloat}, Clong, Int32),
                normcheb, normultra, n, λ, prec, rnd)
    FTPlan{BigFloat, 1, CHEB2ULTRA}(ptr, n)
end


# Float64 SPHERE plan of size `n`, built by `ft_plan_sph2fourier`.
function plan_sph2fourier(::Type{Float64}, n::Integer)
    ptr = ccall((:ft_plan_sph2fourier, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, ),
                n)
    FTPlan{Float64, 2, SPHERE}(ptr, n)
end

# Float64 SPHEREV plan of size `n`. It calls the same C routine as
# `plan_sph2fourier` (`ft_plan_sph2fourier`) and differs only in the kind tag.
function plan_sphv2fourier(::Type{Float64}, n::Integer)
    ptr = ccall((:ft_plan_sph2fourier, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, ),
                n)
    FTPlan{Float64, 2, SPHEREV}(ptr, n)
end

# Float64 DISK plan of size `n` with parameters `α`, `β`, built by `ft_plan_disk2cxf`.
function plan_disk2cxf(::Type{Float64}, n::Integer, α, β)
    ptr = ccall((:ft_plan_disk2cxf, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Float64, Float64),
                n, α, β)
    FTPlan{Float64, 2, DISK}(ptr, n)
end

# Float64 ANNULUS plan of size `n` with parameters `α`, `β`, `γ`, `ρ`, built by
# `ft_plan_ann2cxf`.
function plan_ann2cxf(::Type{Float64}, n::Integer, α, β, γ, ρ)
    ptr = ccall((:ft_plan_ann2cxf, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Float64, Float64, Float64, Float64),
                n, α, β, γ, ρ)
    FTPlan{Float64, 2, ANNULUS}(ptr, n)
end

# Float64 RECTDISK plan of size `n` with parameter `β`, built by
# `ft_plan_rectdisk2cheb`.
function plan_rectdisk2cheb(::Type{Float64}, n::Integer, β)
    ptr = ccall((:ft_plan_rectdisk2cheb, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Float64),
                n, β)
    FTPlan{Float64, 2, RECTDISK}(ptr, n)
end

# Float64 TRIANGLE plan of size `n` with parameters `α`, `β`, `γ`, built by
# `ft_plan_tri2cheb`.
function plan_tri2cheb(::Type{Float64}, n::Integer, α, β, γ)
    ptr = ccall((:ft_plan_tri2cheb, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Float64, Float64, Float64),
                n, α, β, γ)
    FTPlan{Float64, 2, TRIANGLE}(ptr, n)
end

# Float64 TETRAHEDRON plan of size `n` with parameters `α`, `β`, `γ`, `δ`, built
# by `ft_plan_tet2cheb`.
function plan_tet2cheb(::Type{Float64}, n::Integer, α, β, γ, δ)
    ptr = ccall((:ft_plan_tet2cheb, libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Float64, Float64, Float64, Float64),
                n, α, β, γ, δ)
    FTPlan{Float64, 3, TETRAHEDRON}(ptr, n)
end

# Build a complex-valued `SPINSPHERE` plan of size `n` for the integer spin `s`.
function plan_spinsph2fourier(::Type{Complex{Float64}}, n::Integer, s::Integer)
    ptr = ccall(
        (:ft_plan_spinsph2fourier, libfasttransforms),
        Ptr{ft_plan_struct},
        (Cint, Cint),
        n, s
    )
    return FTPlan{Complex{Float64}, 2, SPINSPHERE}(ptr, n)
end

# Convenience methods: every omitted trailing parameter is filled in with 0 by
# forwarding to the method that takes one more argument.

# Disk.
function plan_disk2cxf(::Type{Float64}, n::Integer, α)
    return plan_disk2cxf(Float64, n, α, 0)
end
function plan_disk2cxf(::Type{Float64}, n::Integer)
    return plan_disk2cxf(Float64, n, 0)
end

# Annulus. With three parameters `ρ` defaults to 0.
# NOTE(review): with fewer than three parameters these forward to `plan_disk2cxf`
# and therefore return a `DISK` plan rather than an `ANNULUS` one — presumably
# intentional; confirm with callers.
function plan_ann2cxf(::Type{Float64}, n::Integer, α, β, γ)
    return plan_ann2cxf(Float64, n, α, β, γ, 0)
end
function plan_ann2cxf(::Type{Float64}, n::Integer, α, β)
    return plan_disk2cxf(Float64, n, α, β)
end
function plan_ann2cxf(::Type{Float64}, n::Integer, α)
    return plan_disk2cxf(Float64, n, α)
end
function plan_ann2cxf(::Type{Float64}, n::Integer)
    return plan_disk2cxf(Float64, n)
end

# Rectangularized disk.
function plan_rectdisk2cheb(::Type{Float64}, n::Integer)
    return plan_rectdisk2cheb(Float64, n, 0)
end

# Triangle.
function plan_tri2cheb(::Type{Float64}, n::Integer, α, β)
    return plan_tri2cheb(Float64, n, α, β, 0)
end
function plan_tri2cheb(::Type{Float64}, n::Integer, α)
    return plan_tri2cheb(Float64, n, α, 0)
end
function plan_tri2cheb(::Type{Float64}, n::Integer)
    return plan_tri2cheb(Float64, n, 0)
end

# Tetrahedron.
function plan_tet2cheb(::Type{Float64}, n::Integer, α, β, γ)
    return plan_tet2cheb(Float64, n, α, β, γ, 0)
end
function plan_tet2cheb(::Type{Float64}, n::Integer, α, β)
    return plan_tet2cheb(Float64, n, α, β, 0)
end
function plan_tet2cheb(::Type{Float64}, n::Integer, α)
    return plan_tet2cheb(Float64, n, α, 0)
end
function plan_tet2cheb(::Type{Float64}, n::Integer)
    return plan_tet2cheb(Float64, n, 0)
end

# For every 2D synthesis/analysis kind listed below, generate:
#   * the planners (from a matrix, from a `Complex` element type, and the concrete
#     `Float64` one that calls into C),
#   * `adjoint`/`transpose`, which pair the plan with a freshly built plan from the
#     opposite-direction planner, and
#   * the in-place `lmul!` executors; the plan itself passes 'N' to the C routine,
#     its adjoint and transpose wrappers pass 'T'.
for (planner, dualplanner, cplan, cexec, kind) in (
        (:plan_sph_synthesis, :plan_sph_analysis, :ft_plan_sph_synthesis, :ft_execute_sph_synthesis, SPHERESYNTHESIS),
        (:plan_sph_analysis, :plan_sph_synthesis, :ft_plan_sph_analysis, :ft_execute_sph_analysis, SPHEREANALYSIS),
        (:plan_sphv_synthesis, :plan_sphv_analysis, :ft_plan_sphv_synthesis, :ft_execute_sphv_synthesis, SPHEREVSYNTHESIS),
        (:plan_sphv_analysis, :plan_sphv_synthesis, :ft_plan_sphv_analysis, :ft_execute_sphv_analysis, SPHEREVANALYSIS),
        (:plan_disk_synthesis, :plan_disk_analysis, :ft_plan_disk_synthesis, :ft_execute_disk_synthesis, DISKSYNTHESIS),
        (:plan_disk_analysis, :plan_disk_synthesis, :ft_plan_disk_analysis, :ft_execute_disk_analysis, DISKANALYSIS),
        (:plan_rectdisk_synthesis, :plan_rectdisk_analysis, :ft_plan_rectdisk_synthesis, :ft_execute_rectdisk_synthesis, RECTDISKSYNTHESIS),
        (:plan_rectdisk_analysis, :plan_rectdisk_synthesis, :ft_plan_rectdisk_analysis, :ft_execute_rectdisk_analysis, RECTDISKANALYSIS),
        (:plan_tri_synthesis, :plan_tri_analysis, :ft_plan_tri_synthesis, :ft_execute_tri_synthesis, TRIANGLESYNTHESIS),
        (:plan_tri_analysis, :plan_tri_synthesis, :ft_plan_tri_analysis, :ft_execute_tri_analysis, TRIANGLEANALYSIS),
    )
    @eval begin
        # Plan from an example matrix: element type and both dimensions are read off `x`.
        function $planner(x::Matrix{T}; y...) where T
            return $planner(T, size(x, 1), size(x, 2); y...)
        end
        # Complex element types reuse the plan of the underlying real type.
        function $planner(::Type{Complex{T}}, x...; y...) where T <: Real
            return $planner(T, x...; y...)
        end
        function $planner(::Type{Float64}, n::Integer, m::Integer; flags::Integer=FFTW.ESTIMATE)
            ptr = ccall(
                ($(string(cplan)), libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cuint),
                n, m, flags
            )
            return FTPlan{Float64, 2, $kind}(ptr, n, m)
        end
        function adjoint(p::FTPlan{T, 2, $kind}) where T
            return AdjointFTPlan(p, $dualplanner(T, p.n, p.m))
        end
        function transpose(p::FTPlan{T, 2, $kind}) where T
            return TransposeFTPlan(p, $dualplanner(T, p.n, p.m))
        end
        function lmul!(p::FTPlan{Float64, 2, $kind}, x::Matrix{Float64})
            checksize(p, x)
            nrow, ncol = size(x)
            ccall(
                ($(string(cexec)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint),
                'N', p, x, nrow, ncol
            )
            return x
        end
        function lmul!(p::AdjointFTPlan{Float64, FTPlan{Float64, 2, $kind}}, x::Matrix{Float64})
            checksize(p, x)
            nrow, ncol = size(x)
            ccall(
                ($(string(cexec)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint),
                'T', p, x, nrow, ncol
            )
            return x
        end
        function lmul!(p::TransposeFTPlan{Float64, FTPlan{Float64, 2, $kind}}, x::Matrix{Float64})
            checksize(p, x)
            nrow, ncol = size(x)
            ccall(
                ($(string(cexec)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint),
                'T', p, x, nrow, ncol
            )
            return x
        end
    end
end

# Query the C library for the `ρ` value stored in an annulus synthesis plan.
function ft_get_rho_annulus_fftw_plan(p::FTPlan{Float64, 2, ANNULUSSYNTHESIS})
    return ccall(
        (:ft_get_rho_annulus_fftw_plan, libfasttransforms),
        Float64,
        (Ptr{ft_plan_struct}, ),
        p
    )
end
# Same query for an annulus analysis plan.
function ft_get_rho_annulus_fftw_plan(p::FTPlan{Float64, 2, ANNULUSANALYSIS})
    return ccall(
        (:ft_get_rho_annulus_fftw_plan, libfasttransforms),
        Float64,
        (Ptr{ft_plan_struct}, ),
        p
    )
end

# Annulus synthesis/analysis: same layout as the other 2D kinds, except that the
# planners take the extra parameter `ρ`. `adjoint`/`transpose` read `ρ` back from
# the existing plan in order to build the opposite-direction plan.
for (planner, dualplanner, cplan, cexec, kind) in (
        (:plan_annulus_synthesis, :plan_annulus_analysis, :ft_plan_annulus_synthesis, :ft_execute_annulus_synthesis, ANNULUSSYNTHESIS),
        (:plan_annulus_analysis, :plan_annulus_synthesis, :ft_plan_annulus_analysis, :ft_execute_annulus_analysis, ANNULUSANALYSIS),
    )
    @eval begin
        # Plan from an example matrix: element type and both dimensions are read off `x`.
        function $planner(x::Matrix{T}, ρ; y...) where T
            return $planner(T, size(x, 1), size(x, 2), ρ; y...)
        end
        # Complex element types reuse the plan of the underlying real type.
        function $planner(::Type{Complex{T}}, x...; y...) where T <: Real
            return $planner(T, x...; y...)
        end
        function $planner(::Type{Float64}, n::Integer, m::Integer, ρ; flags::Integer=FFTW.ESTIMATE)
            ptr = ccall(
                ($(string(cplan)), libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Float64, Cuint),
                n, m, ρ, flags
            )
            return FTPlan{Float64, 2, $kind}(ptr, n, m)
        end
        function adjoint(p::FTPlan{T, 2, $kind}) where T
            return AdjointFTPlan(p, $dualplanner(T, p.n, p.m, ft_get_rho_annulus_fftw_plan(p)))
        end
        function transpose(p::FTPlan{T, 2, $kind}) where T
            return TransposeFTPlan(p, $dualplanner(T, p.n, p.m, ft_get_rho_annulus_fftw_plan(p)))
        end
        function lmul!(p::FTPlan{Float64, 2, $kind}, x::Matrix{Float64})
            checksize(p, x)
            nrow, ncol = size(x)
            ccall(
                ($(string(cexec)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint),
                'N', p, x, nrow, ncol
            )
            return x
        end
        function lmul!(p::AdjointFTPlan{Float64, FTPlan{Float64, 2, $kind}}, x::Matrix{Float64})
            checksize(p, x)
            nrow, ncol = size(x)
            ccall(
                ($(string(cexec)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint),
                'T', p, x, nrow, ncol
            )
            return x
        end
        function lmul!(p::TransposeFTPlan{Float64, FTPlan{Float64, 2, $kind}}, x::Matrix{Float64})
            checksize(p, x)
            nrow, ncol = size(x)
            ccall(
                ($(string(cexec)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint),
                'T', p, x, nrow, ncol
            )
            return x
        end
    end
end

# Tetrahedron synthesis/analysis: the three-dimensional counterpart of the 2D
# synthesis/analysis plans. Plans carry the three sizes `n`, `l`, `m`, and the
# executors act in place on rank-3 `Float64` arrays.
for (planner, dualplanner, cplan, cexec, kind) in (
        (:plan_tet_synthesis, :plan_tet_analysis, :ft_plan_tet_synthesis, :ft_execute_tet_synthesis, TETRAHEDRONSYNTHESIS),
        (:plan_tet_analysis, :plan_tet_synthesis, :ft_plan_tet_analysis, :ft_execute_tet_analysis, TETRAHEDRONANALYSIS),
    )
    @eval begin
        # Plan from an example array: element type and all three dimensions are read off `x`.
        function $planner(x::Array{T, 3}; y...) where T
            return $planner(T, size(x, 1), size(x, 2), size(x, 3); y...)
        end
        # Complex element types reuse the plan of the underlying real type.
        function $planner(::Type{Complex{T}}, x...; y...) where T <: Real
            return $planner(T, x...; y...)
        end
        function $planner(::Type{Float64}, n::Integer, l::Integer, m::Integer; flags::Integer=FFTW.ESTIMATE)
            ptr = ccall(
                ($(string(cplan)), libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Cuint),
                n, l, m, flags
            )
            return FTPlan{Float64, 3, $kind}(ptr, n, l, m)
        end
        function adjoint(p::FTPlan{T, 3, $kind}) where T
            return AdjointFTPlan(p, $dualplanner(T, p.n, p.l, p.m))
        end
        function transpose(p::FTPlan{T, 3, $kind}) where T
            return TransposeFTPlan(p, $dualplanner(T, p.n, p.l, p.m))
        end
        function lmul!(p::FTPlan{Float64, 3, $kind}, x::Array{Float64, 3})
            checksize(p, x)
            d1, d2, d3 = size(x)
            ccall(
                ($(string(cexec)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint, Cint),
                'N', p, x, d1, d2, d3
            )
            return x
        end
        function lmul!(p::AdjointFTPlan{Float64, FTPlan{Float64, 3, $kind}}, x::Array{Float64, 3})
            checksize(p, x)
            d1, d2, d3 = size(x)
            ccall(
                ($(string(cexec)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint, Cint),
                'T', p, x, d1, d2, d3
            )
            return x
        end
        function lmul!(p::TransposeFTPlan{Float64, FTPlan{Float64, 3, $kind}}, x::Array{Float64, 3})
            checksize(p, x)
            d1, d2, d3 = size(x)
            ccall(
                ($(string(cexec)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint, Cint),
                'T', p, x, d1, d2, d3
            )
            return x
        end
    end
end

# Spin-weighted sphere synthesis/analysis on complex data. The planners take the
# integer spin `s`; `get_spin` reads it back from an existing plan so that
# `adjoint`/`transpose` can build the opposite-direction plan with the same spin.
for (planner, dualplanner, cplan, cexec, kind) in (
        (:plan_spinsph_synthesis, :plan_spinsph_analysis, :ft_plan_spinsph_synthesis, :ft_execute_spinsph_synthesis, SPINSPHERESYNTHESIS),
        (:plan_spinsph_analysis, :plan_spinsph_synthesis, :ft_plan_spinsph_analysis, :ft_execute_spinsph_analysis, SPINSPHEREANALYSIS),
    )
    @eval begin
        # Plan from an example matrix: element type and both dimensions are read off `x`.
        function $planner(x::Matrix{T}, s::Integer; y...) where T
            return $planner(T, size(x, 1), size(x, 2), s; y...)
        end
        function $planner(::Type{Complex{Float64}}, n::Integer, m::Integer, s::Integer; flags::Integer=FFTW.ESTIMATE)
            ptr = ccall(
                ($(string(cplan)), libfasttransforms), Ptr{ft_plan_struct},
                (Cint, Cint, Cint, Cuint),
                n, m, s, flags
            )
            return FTPlan{Complex{Float64}, 2, $kind}(ptr, n, m)
        end
        function get_spin(p::FTPlan{T, 2, $kind}) where T
            return ccall(
                (:ft_get_spin_spinsphere_fftw_plan, libfasttransforms), Cint,
                (Ptr{ft_plan_struct},),
                p
            )
        end
        function adjoint(p::FTPlan{T, 2, $kind}) where T
            return AdjointFTPlan(p, $dualplanner(T, p.n, p.m, get_spin(p)))
        end
        function transpose(p::FTPlan{T, 2, $kind}) where T
            return TransposeFTPlan(p, $dualplanner(T, p.n, p.m, get_spin(p)))
        end
        # The complex buffer is handed to C through a `Ptr{Float64}` argument slot.
        function lmul!(p::FTPlan{Complex{Float64}, 2, $kind}, x::Matrix{Complex{Float64}})
            checksize(p, x)
            nrow, ncol = size(x)
            ccall(
                ($(string(cexec)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint),
                'N', p, x, nrow, ncol
            )
            return x
        end
        # The adjoint passes 'C' to the C routine.
        function lmul!(p::AdjointFTPlan{Complex{Float64}, FTPlan{Complex{Float64}, 2, $kind}}, x::Matrix{Complex{Float64}})
            checksize(p, x)
            nrow, ncol = size(x)
            ccall(
                ($(string(cexec)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint),
                'C', p, x, nrow, ncol
            )
            return x
        end
        # The transpose also calls the 'C' routine, conjugating `x` in place
        # before and after the call.
        function lmul!(p::TransposeFTPlan{Complex{Float64}, FTPlan{Complex{Float64}, 2, $kind}}, x::Matrix{Complex{Float64}})
            checksize(p, x)
            nrow, ncol = size(x)
            conj!(x)
            ccall(
                ($(string(cexec)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint),
                'C', p, x, nrow, ncol
            )
            conj!(x)
            return x
        end
    end
end

# Build a `SPHERICALISOMETRY` plan of size `n`, the plan type consumed by the
# `execute_sph_*!` isometry routines.
function plan_sph_isometry(::Type{Float64}, n::Integer)
    ptr = ccall(
        (:ft_plan_sph_isometry, libfasttransforms),
        Ptr{ft_plan_struct},
        (Cint, ),
        n
    )
    return FTPlan{Float64, 2, SPHERICALISOMETRY}(ptr, n)
end

# Out-of-place application (`*`) and solve (`\`): copy `x` into a dense `Array`,
# then run the in-place kernel on that copy. Defined identically for a plan and
# for its adjoint and transpose wrappers.
for P in (:FTPlan, :AdjointFTPlan, :TransposeFTPlan)
    @eval begin
        *(p::$P{T}, x::AbstractArray{T}) where T = lmul!(p, Array(x))
        \(p::$P{T}, x::AbstractArray{T}) where T = ldiv!(p, Array(x))
    end
end

# Applying a 1D plan to a `UniformScaling` materialises the scaling as a dense
# `n × n` matrix (`n` taken from the plan), runs the in-place kernel on it and
# wraps the result: `UpperTriangular` for the plan itself, `LowerTriangular` for
# its adjoint and transpose wrappers.
function (*)(p::FTPlan{T, 1}, x::UniformScaling{S}) where {T, S}
    dense = Matrix{promote_type(T, S)}(x, p.n, p.n)
    return UpperTriangular(lmul!(p, dense))
end
function (*)(p::AdjointFTPlan{T, FTPlan{T, 1, K}}, x::UniformScaling{S}) where {T, S, K}
    dense = Matrix{promote_type(T, S)}(x, p.parent.n, p.parent.n)
    return LowerTriangular(lmul!(p, dense))
end
function (*)(p::TransposeFTPlan{T, FTPlan{T, 1, K}}, x::UniformScaling{S}) where {T, S, K}
    dense = Matrix{promote_type(T, S)}(x, p.parent.n, p.parent.n)
    return LowerTriangular(lmul!(p, dense))
end
function (\)(p::FTPlan{T, 1}, x::UniformScaling{S}) where {T, S}
    dense = Matrix{promote_type(T, S)}(x, p.n, p.n)
    return UpperTriangular(ldiv!(p, dense))
end
function (\)(p::AdjointFTPlan{T, FTPlan{T, 1, K}}, x::UniformScaling{S}) where {T, S, K}
    dense = Matrix{promote_type(T, S)}(x, p.parent.n, p.parent.n)
    return LowerTriangular(ldiv!(p, dense))
end
function (\)(p::TransposeFTPlan{T, FTPlan{T, 1, K}}, x::UniformScaling{S}) where {T, S, K}
    dense = Matrix{promote_type(T, S)}(x, p.parent.n, p.parent.n)
    return LowerTriangular(ldiv!(p, dense))
end

# Aliases covering both the ordinary and the unit-diagonal triangular wrappers.
const AbstractUpperTriangular{T, S <: AbstractMatrix} = Union{UpperTriangular{T, S}, UnitUpperTriangular{T, S}}
const AbstractLowerTriangular{T, S <: AbstractMatrix} = Union{LowerTriangular{T, S}, UnitLowerTriangular{T, S}}

# Applying a 1D plan to an upper-triangular matrix (or its adjoint/transpose to a
# lower-triangular matrix) runs the in-place kernel on a dense copy and re-wraps
# the result in the matching triangular type.
#
# The adjoint/transpose wrappers are matched through their parent plan type,
# `FTPlan{T, 1, K}`, exactly as in the `UniformScaling` methods: the second type
# parameter of `AdjointFTPlan`/`TransposeFTPlan` is the parent plan type, so a
# literal `1` in that slot can never match an actual wrapper.
*(p::FTPlan{T, 1}, x::AbstractUpperTriangular) where T = UpperTriangular(lmul!(p, Array(x)))
*(p::AdjointFTPlan{T, FTPlan{T, 1, K}}, x::AbstractLowerTriangular) where {T, K} = LowerTriangular(lmul!(p, Array(x)))
*(p::TransposeFTPlan{T, FTPlan{T, 1, K}}, x::AbstractLowerTriangular) where {T, K} = LowerTriangular(lmul!(p, Array(x)))
\(p::FTPlan{T, 1}, x::AbstractUpperTriangular) where T = UpperTriangular(ldiv!(p, Array(x)))
\(p::AdjointFTPlan{T, FTPlan{T, 1, K}}, x::AbstractLowerTriangular) where {T, K} = LowerTriangular(ldiv!(p, Array(x)))
\(p::TransposeFTPlan{T, FTPlan{T, 1, K}}, x::AbstractLowerTriangular) where {T, K} = LowerTriangular(ldiv!(p, Array(x)))

# In-place application (`lmul!`) and solve (`ldiv!`) of 1D plans on strided vectors,
# in single and double precision. The plan passes 'N' to the C routine; its adjoint
# and transpose wrappers pass 'T'.
for (op, csym, elty) in (
        (:lmul!, :ft_bfmvf, :Float32),
        (:ldiv!, :ft_bfsvf, :Float32),
        (:lmul!, :ft_bfmv , :Float64),
        (:ldiv!, :ft_bfsv , :Float64),
    )
    @eval begin
        function $op(p::FTPlan{$elty, 1}, x::StridedVector{$elty})
            checksize(p, x)
            checkstride(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{$elty}),
                'N', p, x
            )
            return x
        end
        function $op(p::AdjointFTPlan{$elty, FTPlan{$elty, 1, K}}, x::StridedVector{$elty}) where K
            checksize(p, x)
            checkstride(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{$elty}),
                'T', p, x
            )
            return x
        end
        function $op(p::TransposeFTPlan{$elty, FTPlan{$elty, 1, K}}, x::StridedVector{$elty}) where K
            checksize(p, x)
            checkstride(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{$elty}),
                'T', p, x
            )
            return x
        end
    end
end

# In-place application of `ASSOCIATEDJAC2JAC` plans on strided vectors. The C routine
# takes two extra character flags after the transpose flag: ('2', '1') for the plan
# itself and ('1', '2') for its adjoint and transpose wrappers.
for (op, csym, elty) in (
        (:lmul!, :ft_bbbfmvf, :Float32),
        (:lmul!, :ft_bbbfmv , :Float64),
    )
    @eval begin
        function $op(p::FTPlan{$elty, 1, ASSOCIATEDJAC2JAC}, x::StridedVector{$elty})
            checksize(p, x)
            checkstride(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Cint, Cint, Ptr{ft_plan_struct}, Ptr{$elty}),
                'N', '2', '1', p, x
            )
            return x
        end
        function $op(p::AdjointFTPlan{$elty, FTPlan{$elty, 1, ASSOCIATEDJAC2JAC}}, x::StridedVector{$elty})
            checksize(p, x)
            checkstride(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Cint, Cint, Ptr{ft_plan_struct}, Ptr{$elty}),
                'T', '1', '2', p, x
            )
            return x
        end
        function $op(p::TransposeFTPlan{$elty, FTPlan{$elty, 1, ASSOCIATEDJAC2JAC}}, x::StridedVector{$elty})
            checksize(p, x)
            checkstride(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Cint, Cint, Ptr{ft_plan_struct}, Ptr{$elty}),
                'T', '1', '2', p, x
            )
            return x
        end
    end
end

# In-place application (`lmul!`) and solve (`ldiv!`) of `ModifiedFTPlan`s on strided
# vectors, in single and double precision. The plan passes 'N' to the C routine; its
# adjoint and transpose wrappers pass 'T'.
for (op, csym, elty) in (
        (:lmul!, :ft_mpmvf, :Float32),
        (:ldiv!, :ft_mpsvf, :Float32),
        (:lmul!, :ft_mpmv , :Float64),
        (:ldiv!, :ft_mpsv , :Float64),
    )
    @eval begin
        function $op(p::ModifiedFTPlan{$elty}, x::StridedVector{$elty})
            checksize(p, x)
            checkstride(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{$elty}),
                'N', p, x
            )
            return x
        end
        function $op(p::AdjointFTPlan{$elty, ModifiedFTPlan{$elty}}, x::StridedVector{$elty})
            checksize(p, x)
            checkstride(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{$elty}),
                'T', p, x
            )
            return x
        end
        function $op(p::TransposeFTPlan{$elty, ModifiedFTPlan{$elty}}, x::StridedVector{$elty})
            checksize(p, x)
            checkstride(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{$elty}),
                'T', p, x
            )
            return x
        end
    end
end

# In-place application (`lmul!`) and solve (`ldiv!`) of 1D `BigFloat` plans on strided
# vectors via the MPFR triangular routines. The plan size `n` is passed twice (second
# and fourth argument), `x` goes through `renew!` before being handed to C, and the
# current MPFR rounding mode is the last argument.
for (op, csym) in (
        (:lmul!, :ft_mpfr_trmv_ptr),
        (:ldiv!, :ft_mpfr_trsv_ptr),
    )
    @eval begin
        function $op(p::FTPlan{BigFloat, 1}, x::StridedVector{BigFloat})
            checksize(p, x)
            checkstride(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Cint, Ptr{mpfr_t}, Cint, Ptr{BigFloat}, Int32),
                'N', p.n, p, p.n, renew!(x), Base.MPFR.ROUNDING_MODE[]
            )
            return x
        end
        function $op(p::AdjointFTPlan{BigFloat, FTPlan{BigFloat, 1, K}}, x::StridedVector{BigFloat}) where K
            checksize(p, x)
            checkstride(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Cint, Ptr{mpfr_t}, Cint, Ptr{BigFloat}, Int32),
                'T', p.parent.n, p, p.parent.n, renew!(x), Base.MPFR.ROUNDING_MODE[]
            )
            return x
        end
        function $op(p::TransposeFTPlan{BigFloat, FTPlan{BigFloat, 1, K}}, x::StridedVector{BigFloat}) where K
            checksize(p, x)
            checkstride(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Cint, Ptr{mpfr_t}, Cint, Ptr{BigFloat}, Int32),
                'T', p.parent.n, p, p.parent.n, renew!(x), Base.MPFR.ROUNDING_MODE[]
            )
            return x
        end
    end
end

# In-place application (`lmul!`) and solve (`ldiv!`) of 1D plans on strided matrices,
# in single and double precision. The C routine receives the column stride
# `stride(x, 2)` and the number of columns `size(x, 2)`. The plan passes 'N'; its
# adjoint and transpose wrappers pass 'T'.
for (op, csym, elty) in (
        (:lmul!, :ft_bfmmf, :Float32),
        (:ldiv!, :ft_bfsmf, :Float32),
        (:lmul!, :ft_bfmm , :Float64),
        (:ldiv!, :ft_bfsm , :Float64),
    )
    @eval begin
        function $op(p::FTPlan{$elty, 1}, x::StridedMatrix{$elty})
            checksize(p, x)
            checkstride(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{$elty}, Cint, Cint),
                'N', p, x, stride(x, 2), size(x, 2)
            )
            return x
        end
        function $op(p::AdjointFTPlan{$elty, FTPlan{$elty, 1, K}}, x::StridedMatrix{$elty}) where K
            checksize(p, x)
            checkstride(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{$elty}, Cint, Cint),
                'T', p, x, stride(x, 2), size(x, 2)
            )
            return x
        end
        function $op(p::TransposeFTPlan{$elty, FTPlan{$elty, 1, K}}, x::StridedMatrix{$elty}) where K
            checksize(p, x)
            checkstride(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{$elty}, Cint, Cint),
                'T', p, x, stride(x, 2), size(x, 2)
            )
            return x
        end
    end
end

# In-place application of `ASSOCIATEDJAC2JAC` plans on strided matrices. As in the
# vector case, the C routine takes two extra character flags: ('2', '1') for the plan
# and ('1', '2') for its adjoint and transpose wrappers. The column stride and the
# number of columns of `x` follow the data pointer.
for (op, csym, elty) in (
        (:lmul!, :ft_bbbfmmf, :Float32),
        (:lmul!, :ft_bbbfmm , :Float64),
    )
    @eval begin
        function $op(p::FTPlan{$elty, 1, ASSOCIATEDJAC2JAC}, x::StridedMatrix{$elty})
            checksize(p, x)
            checkstride(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Cint, Cint, Ptr{ft_plan_struct}, Ptr{$elty}, Cint, Cint),
                'N', '2', '1', p, x, stride(x, 2), size(x, 2)
            )
            return x
        end
        function $op(p::AdjointFTPlan{$elty, FTPlan{$elty, 1, ASSOCIATEDJAC2JAC}}, x::StridedMatrix{$elty})
            checksize(p, x)
            checkstride(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Cint, Cint, Ptr{ft_plan_struct}, Ptr{$elty}, Cint, Cint),
                'T', '1', '2', p, x, stride(x, 2), size(x, 2)
            )
            return x
        end
        function $op(p::TransposeFTPlan{$elty, FTPlan{$elty, 1, ASSOCIATEDJAC2JAC}}, x::StridedMatrix{$elty})
            checksize(p, x)
            checkstride(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Cint, Cint, Ptr{ft_plan_struct}, Ptr{$elty}, Cint, Cint),
                'T', '1', '2', p, x, stride(x, 2), size(x, 2)
            )
            return x
        end
    end
end

# In-place application (`lmul!`) and solve (`ldiv!`) of `ModifiedFTPlan`s on strided
# matrices, in single and double precision. The C routine receives the column stride
# and the number of columns of `x`. The plan passes 'N'; its adjoint and transpose
# wrappers pass 'T'.
for (op, csym, elty) in (
        (:lmul!, :ft_mpmmf, :Float32),
        (:ldiv!, :ft_mpsmf, :Float32),
        (:lmul!, :ft_mpmm , :Float64),
        (:ldiv!, :ft_mpsm , :Float64),
    )
    @eval begin
        function $op(p::ModifiedFTPlan{$elty}, x::StridedMatrix{$elty})
            checksize(p, x)
            checkstride(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{$elty}, Cint, Cint),
                'N', p, x, stride(x, 2), size(x, 2)
            )
            return x
        end
        function $op(p::AdjointFTPlan{$elty, ModifiedFTPlan{$elty}}, x::StridedMatrix{$elty})
            checksize(p, x)
            checkstride(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{$elty}, Cint, Cint),
                'T', p, x, stride(x, 2), size(x, 2)
            )
            return x
        end
        function $op(p::TransposeFTPlan{$elty, ModifiedFTPlan{$elty}}, x::StridedMatrix{$elty})
            checksize(p, x)
            checkstride(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{$elty}, Cint, Cint),
                'T', p, x, stride(x, 2), size(x, 2)
            )
            return x
        end
    end
end

# In-place application (`lmul!`) and solve (`ldiv!`) of 1D `BigFloat` plans on strided
# matrices via the MPFR triangular routines. Compared with the vector case, the call
# additionally passes the column stride and the number of columns of `x` before the
# rounding mode.
for (op, csym) in (
        (:lmul!, :ft_mpfr_trmm_ptr),
        (:ldiv!, :ft_mpfr_trsm_ptr),
    )
    @eval begin
        function $op(p::FTPlan{BigFloat, 1}, x::StridedMatrix{BigFloat})
            checksize(p, x)
            checkstride(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Cint, Ptr{mpfr_t}, Cint, Ptr{BigFloat}, Cint, Cint, Int32),
                'N', p.n, p, p.n, renew!(x), stride(x, 2), size(x, 2), Base.MPFR.ROUNDING_MODE[]
            )
            return x
        end
        function $op(p::AdjointFTPlan{BigFloat, FTPlan{BigFloat, 1, K}}, x::StridedMatrix{BigFloat}) where K
            checksize(p, x)
            checkstride(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Cint, Ptr{mpfr_t}, Cint, Ptr{BigFloat}, Cint, Cint, Int32),
                'T', p.parent.n, p, p.parent.n, renew!(x), stride(x, 2), size(x, 2), Base.MPFR.ROUNDING_MODE[]
            )
            return x
        end
        function $op(p::TransposeFTPlan{BigFloat, FTPlan{BigFloat, 1, K}}, x::StridedMatrix{BigFloat}) where K
            checksize(p, x)
            checkstride(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Cint, Ptr{mpfr_t}, Cint, Ptr{BigFloat}, Cint, Cint, Int32),
                'T', p.parent.n, p, p.parent.n, renew!(x), stride(x, 2), size(x, 2), Base.MPFR.ROUNDING_MODE[]
            )
            return x
        end
    end
end

# In-place executors for the 2D harmonic plans: `lmul!` runs the forward C routine and
# `ldiv!` the reverse one. The plan passes 'N'; its adjoint and transpose wrappers
# pass 'T' (this includes the complex `SPINSPHERE` entries).
for (op, csym, elty, rank, kind) in (
        (:lmul!, :ft_execute_sph2fourier, Float64, 2, SPHERE),
        (:ldiv!, :ft_execute_fourier2sph, Float64, 2, SPHERE),
        (:lmul!, :ft_execute_sphv2fourier, Float64, 2, SPHEREV),
        (:ldiv!, :ft_execute_fourier2sphv, Float64, 2, SPHEREV),
        (:lmul!, :ft_execute_spinsph2fourier, Complex{Float64}, 2, SPINSPHERE),
        (:ldiv!, :ft_execute_fourier2spinsph, Complex{Float64}, 2, SPINSPHERE),
        (:lmul!, :ft_execute_disk2cxf, Float64, 2, DISK),
        (:ldiv!, :ft_execute_cxf2disk, Float64, 2, DISK),
        (:lmul!, :ft_execute_ann2cxf, Float64, 2, ANNULUS),
        (:ldiv!, :ft_execute_cxf2ann, Float64, 2, ANNULUS),
        (:lmul!, :ft_execute_rectdisk2cheb, Float64, 2, RECTDISK),
        (:ldiv!, :ft_execute_cheb2rectdisk, Float64, 2, RECTDISK),
        (:lmul!, :ft_execute_tri2cheb, Float64, 2, TRIANGLE),
        (:ldiv!, :ft_execute_cheb2tri, Float64, 2, TRIANGLE),
    )
    @eval begin
        function $op(p::FTPlan{$elty, $rank, $kind}, x::Array{$elty, $rank})
            checksize(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{$elty}, Cint, Cint),
                'N', p, x, size(x, 1), size(x, 2)
            )
            return x
        end
        function $op(p::AdjointFTPlan{$elty, FTPlan{$elty, $rank, $kind}}, x::Array{$elty, $rank})
            checksize(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{$elty}, Cint, Cint),
                'T', p, x, size(x, 1), size(x, 2)
            )
            return x
        end
        function $op(p::TransposeFTPlan{$elty, FTPlan{$elty, $rank, $kind}}, x::Array{$elty, $rank})
            checksize(p, x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{$elty}, Cint, Cint),
                'T', p, x, size(x, 1), size(x, 2)
            )
            return x
        end
    end
end

# In-place executors for `TETRAHEDRON` plans on rank-3 `Float64` arrays: `lmul!` runs
# `ft_execute_tet2cheb`, `ldiv!` runs `ft_execute_cheb2tet`. The plan passes 'N'; its
# adjoint and transpose wrappers pass 'T'.
for (op, csym) in (
        (:lmul!, :ft_execute_tet2cheb),
        (:ldiv!, :ft_execute_cheb2tet),
    )
    @eval begin
        function $op(p::FTPlan{Float64, 3, TETRAHEDRON}, x::Array{Float64, 3})
            checksize(p, x)
            d1, d2, d3 = size(x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint, Cint),
                'N', p, x, d1, d2, d3
            )
            return x
        end
        function $op(p::AdjointFTPlan{Float64, FTPlan{Float64, 3, TETRAHEDRON}}, x::Array{Float64, 3})
            checksize(p, x)
            d1, d2, d3 = size(x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint, Cint),
                'T', p, x, d1, d2, d3
            )
            return x
        end
        function $op(p::TransposeFTPlan{Float64, FTPlan{Float64, 3, TETRAHEDRON}}, x::Array{Float64, 3})
            checksize(p, x)
            d1, d2, d3 = size(x)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint, Cint),
                'T', p, x, d1, d2, d3
            )
            return x
        end
    end
end

# Apply the C routine `ft_execute_sph_polar_rotation` to `x` in place.
# The angle `α` is passed to C as the pair (sin α, cos α).
function execute_sph_polar_rotation!(x::Matrix{Float64}, α)
    nrow, ncol = size(x)
    ccall(
        (:ft_execute_sph_polar_rotation, libfasttransforms), Cvoid,
        (Ptr{Float64}, Cint, Cint, Float64, Float64),
        x, nrow, ncol, sin(α), cos(α)
    )
    return x
end

# Apply the C routine `ft_execute_sph_polar_reflection` to `x` in place.
function execute_sph_polar_reflection!(x::Matrix{Float64})
    nrow, ncol = size(x)
    ccall(
        (:ft_execute_sph_polar_reflection, libfasttransforms), Cvoid,
        (Ptr{Float64}, Cint, Cint),
        x, nrow, ncol
    )
    return x
end

# By-value argument type for the orthogonal-transformation C routine: the nine
# entries of a 3×3 matrix stored in column-major order.
struct ft_orthogonal_transformation
    Q::NTuple{9, Float64}
end

# Convert a matrix by reading its leading 3×3 block column by column.
# Throws `DimensionMismatch` if `Q` has fewer than 3 rows or 3 columns; an explicit
# check is used because `@assert` may be disabled and is not meant for validating input.
function convert(::Type{ft_orthogonal_transformation}, Q::AbstractMatrix)
    if size(Q, 1) < 3 || size(Q, 2) < 3
        throw(DimensionMismatch("expected a matrix with at least 3 rows and 3 columns, got size $(size(Q))"))
    end
    return ft_orthogonal_transformation((Q[1, 1], Q[2, 1], Q[3, 1], Q[1, 2], Q[2, 2], Q[3, 2], Q[1, 3], Q[2, 3], Q[3, 3]))
end
# A 9-tuple is taken as already being in column-major order.
convert(::Type{ft_orthogonal_transformation}, Q::NTuple{9, Float64}) = ft_orthogonal_transformation(Q)

# Apply an orthogonal transformation to `x` in place using a `SPHERICALISOMETRY` plan.
# `Q` is passed by value as an `ft_orthogonal_transformation`; `ccall` converts it to
# that argument type.
function execute_sph_orthogonal_transformation!(p::FTPlan{Float64, 2, SPHERICALISOMETRY}, Q, x::Matrix{Float64})
    checksize(p, x)
    nrow, ncol = size(x)
    ccall(
        (:ft_execute_sph_orthogonal_transformation, libfasttransforms), Cvoid,
        (Ptr{ft_plan_struct}, ft_orthogonal_transformation, Ptr{Float64}, Cint, Cint),
        p, Q, x, nrow, ncol
    )
    return x
end

# Apply the C routine `ft_execute_sph_yz_axis_exchange` to `x` in place using a
# `SPHERICALISOMETRY` plan.
function execute_sph_yz_axis_exchange!(p::FTPlan{Float64, 2, SPHERICALISOMETRY}, x::Matrix{Float64})
    checksize(p, x)
    nrow, ncol = size(x)
    ccall(
        (:ft_execute_sph_yz_axis_exchange, libfasttransforms), Cvoid,
        (Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint),
        p, x, nrow, ncol
    )
    return x
end

# Apply the C routine `ft_execute_sph_rotation` to `x` in place using a
# `SPHERICALISOMETRY` plan. The three angles `α`, `β`, `γ` are passed as doubles.
function execute_sph_rotation!(p::FTPlan{Float64, 2, SPHERICALISOMETRY}, α, β, γ, x::Matrix{Float64})
    checksize(p, x)
    nrow, ncol = size(x)
    ccall(
        (:ft_execute_sph_rotation, libfasttransforms), Cvoid,
        (Ptr{ft_plan_struct}, Float64, Float64, Float64, Ptr{Float64}, Cint, Cint),
        p, α, β, γ, x, nrow, ncol
    )
    return x
end

# By-value argument type for the reflection C routine: three `Float64` components.
struct ft_reflection
    w::NTuple{3, Float64}
end

# Convert a vector by taking its first three entries.
# Throws `DimensionMismatch` if `w` has fewer than 3 entries; an explicit check is
# used because `@assert` may be disabled and is not meant for validating input.
function convert(::Type{ft_reflection}, w::AbstractVector)
    if length(w) < 3
        throw(DimensionMismatch("expected a vector with at least 3 entries, got length $(length(w))"))
    end
    return ft_reflection((w[1], w[2], w[3]))
end
convert(::Type{ft_reflection}, w::NTuple{3, Float64}) = ft_reflection(w)

# Apply a reflection to `x` in place using a `SPHERICALISOMETRY` plan.
# `w` is passed by value as an `ft_reflection`; `ccall` converts it to that
# argument type.
function execute_sph_reflection!(p::FTPlan{Float64, 2, SPHERICALISOMETRY}, w, x::Matrix{Float64})
    checksize(p, x)
    nrow, ncol = size(x)
    ccall(
        (:ft_execute_sph_reflection, libfasttransforms), Cvoid,
        (Ptr{ft_plan_struct}, ft_reflection, Ptr{Float64}, Cint, Cint),
        p, w, x, nrow, ncol
    )
    return x
end
# Variant taking the three components separately. `ft_reflection` has a single tuple
# field, so the components are packed into one tuple before construction; the
# constructor converts them to `Float64`.
execute_sph_reflection!(p::FTPlan{Float64, 2, SPHERICALISOMETRY}, w1, w2, w3, x::Matrix{Float64}) = execute_sph_reflection!(p, ft_reflection((w1, w2, w3)), x)

# Out-of-place application (`*`) and solve (`\`) of a real-typed plan on complex data:
# copy `x` into a dense `Array`, then run the in-place kernel on that copy. Defined
# identically for a plan and for its adjoint and transpose wrappers.
for P in (:FTPlan, :AdjointFTPlan, :TransposeFTPlan)
    @eval begin
        *(p::$P{T}, x::AbstractArray{Complex{T}}) where T = lmul!(p, Array(x))
        \(p::$P{T}, x::AbstractArray{Complex{T}}) where T = ldiv!(p, Array(x))
    end
end

# In-place kernels for a real-typed plan acting on complex data: the real and
# imaginary parts are extracted into separate arrays, transformed independently
# (real part first), and recombined into `x`.
for op in (:lmul!, :ldiv!), P in (:FTPlan, :AdjointFTPlan, :TransposeFTPlan)
    @eval begin
        function $op(p::$P{T}, x::AbstractArray{Complex{T}}) where T
            repart = $op(p, real(x))
            impart = $op(p, imag(x))
            x .= complex.(repart, impart)
            return x
        end
    end
end

# Compute the modified Jacobi matrix from a `ModifiedFTPlan` and the original
# `SymTridiagonal` Jacobi matrix `XP`, in single and double precision.
# With `n = min(P.n, size(XP, 1))`, the result has `n - 1` diagonal and `n - 2`
# off-diagonal entries, both filled in by the C routine.
for (csym, elty) in ((:execute_jacobi_similarityf, Float32), (:execute_jacobi_similarity, Float64))
    @eval begin
        function modified_jacobi_matrix(P::ModifiedFTPlan{$elty}, XP::SymTridiagonal{$elty, Vector{$elty}})
            n = min(P.n, size(XP, 1))
            diagq = Vector{$elty}(undef, n-1)
            offq = Vector{$elty}(undef, n-2)
            XQ = SymTridiagonal(diagq, offq)
            ccall(
                ($(string(csym)), libfasttransforms), Cvoid,
                (Ptr{ft_plan_struct}, Cint, Ptr{$elty}, Ptr{$elty}, Ptr{$elty}, Ptr{$elty}),
                P, n, XP.dv, XP.ev, XQ.dv, XQ.ev
            )
            return XQ
        end
    end
end

# Compute the modified Jacobi matrix `XQ` from the factor `R` and the original
# Jacobi matrix `XP` by explicit recurrences. The result is a `Float64`
# `SymTridiagonal` of size `n = size(R, 1) - 1`.
# The code reads the diagonal and first superdiagonal of `R` and the diagonal and
# first subdiagonal of `XP`, up to row `n + 1` of `XP`.
function modified_jacobi_matrix(R, XP)
    n = size(R, 1) - 1
    diagq = zeros(n)
    offq = zeros(n-1)
    diagq[1] = (R[1, 1]*XP[1, 1] + R[1, 2]*XP[2, 1])/R[1, 1]
    # Off-diagonal entries depend only on `R` and `XP`.
    for i in 1:n-1
        offq[i] = R[i+1, i+1]*XP[i+1, i]/R[i, i]
    end
    # Each remaining diagonal entry uses the off-diagonal entry just before it.
    for i in 2:n
        diagq[i] = (R[i, i]*XP[i,i] + R[i, i+1]*XP[i+1, i] - offq[i-1]*R[i-1, i])/R[i, i]
    end
    return SymTridiagonal(diagq, offq)
end


================================================
FILE: src/nufft.jl
================================================
"""
Pre-computes a nonuniform fast Fourier transform of type `N`.

The type parameter `N` records the NUFFT type (the planners in this file construct
plans with `N` equal to 1, 2 or 3), `T` is the element type of the stored matrices,
and `FFT` is the type of the wrapped plan `p`.

For best performance, choose the right number of threads by `FFTW.set_num_threads(4)`, for example.
"""
struct NUFFTPlan{N,T,FFT} <: Plan{T}
    # Factor matrices built by `constructU` / `constructV` in the planners.
    U::Matrix{T}
    V::Matrix{T}
    # Inner plan applied to the work buffer during `mul!`.
    p::FFT
    # Integer indices produced by `AssignClosestEquispacedFFTpoint`.
    t::Vector{Int}
    # Work buffers reused by every `mul!` call.
    temp::Matrix{T}
    temp2::Matrix{T}
    # Vector of ones used to sum the columns of `temp` via `mul!`.
    Ones::Vector{T}
end

"""
Pre-computes a nonuniform fast Fourier transform of type I.

The input `ω` is divided by its length before the factors are built, and `ϵ` is
passed to `FindK`, which selects the number of columns `K` of the factors and of
the work buffers stored in the plan.
"""
function plan_nufft1(ω::AbstractVector{T}, ϵ::T) where {T<:AbstractFloat}
    N = length(ω)
    scaled = ω/N
    t = AssignClosestEquispacedFFTpoint(scaled)
    s = AssignClosestEquispacedGridpoint(scaled)
    K = FindK(PerturbationParameter(scaled, s), ϵ)
    U = constructU(scaled, K)
    V = constructV(range(zero(T), stop=N-1, length=N), K)
    # In-place backward FFT along the first dimension.
    p = plan_bfft!(V, 1)
    work1 = zeros(Complex{T}, N, K)
    work2 = zeros(Complex{T}, N, K)
    onesK = ones(Complex{T}, K)

    return NUFFTPlan{1, eltype(U), typeof(p)}(U, V, p, t, work1, work2, onesK)
end

"""
Pre-computes a nonuniform fast Fourier transform of type II.

The factors are built directly from `x`, and `ϵ` is passed to `FindK`, which
selects the number of columns `K` of the factors and of the work buffers stored
in the plan.
"""
function plan_nufft2(x::AbstractVector{T}, ϵ::T) where T<:AbstractFloat
    N = length(x)
    t = AssignClosestEquispacedFFTpoint(x)
    s = AssignClosestEquispacedGridpoint(x)
    K = FindK(PerturbationParameter(x, s), ϵ)
    U = constructU(x, K)
    V = constructV(range(zero(T), stop=N-1, length=N), K)
    # In-place forward FFT along the first dimension.
    p = plan_fft!(U, 1)
    work1 = zeros(Complex{T}, N, K)
    work2 = zeros(Complex{T}, N, K)
    onesK = ones(Complex{T}, K)

    return NUFFTPlan{2, eltype(U), typeof(p)}(U, V, p, t, work1, work2, onesK)
end

"""
Pre-computes a nonuniform fast Fourier transform of type III.
"""
function plan_nufft3(x::AbstractVector{T}, ω::AbstractVector{T}, ϵ::T) where T<:AbstractFloat
    N = length(x)
    # Nearest-gridpoint assignment of the samples and the matching 1-based row indices.
    s = AssignClosestEquispacedGridpoint(x)
    t = AssignClosestEquispacedFFTpoint(x)
    γ = PerturbationParameter(x, s)
    K = FindK(γ, ϵ)
    u = constructU(x, K)
    v = constructV(ω, K)

    # The inner transform of a type-III plan is a type-I NUFFT plan in the frequencies ω.
    p = plan_nufft1(ω, ϵ)

    D1 = Diagonal(1 .- (s .- t .+ 1)./N)
    D2 = Diagonal((s .- t .+ 1)./N)
    D3 = Diagonal(exp.(-2 .* im .* T(π) .* ω ))
    # Stacking the scaled factors side by side doubles the number of columns to 2K.
    U = hcat(D1*u, D2*u)
    V = hcat(v, D3*v)

    # Work buffers and the summation vector are sized for the doubled column count.
    temp = zeros(Complex{T}, N, 2K)
    temp2 = zeros(Complex{T}, N, 2K)
    Ones = ones(Complex{T}, 2K)

    NUFFTPlan{3, eltype(U), typeof(p)}(U, V, p, t, temp, temp2, Ones)
end

"""
    p * c

Apply the NUFFT plan `p` to the coefficients `c` and return the result in a newly
allocated array with element type `promote_type(T, V)`.

The `mul!` methods for `NUFFTPlan` require the input to have the same element type as
the output, so `c` is converted to the promoted element type first (for example real
coefficients applied to a complex plan). No copy is made when `c` already has that
element type.
"""
function (*)(p::NUFFTPlan{N,T}, c::AbstractArray{V}) where {N,T,V}
    S = promote_type(T, V)
    mul!(zeros(S, size(c)), p, convert(AbstractArray{S}, c))
end

# Apply a type-I plan to the vector `c`, writing the result into `f` and returning `f`.
# Overwrites the plan's work buffers `temp` and `temp2`.
function mul!(f::AbstractVector{T}, P::NUFFTPlan{1,T}, c::AbstractVector{T}) where {T}
    U, V, p, t, temp, temp2, Ones = P.U, P.V, P.p, P.t, P.temp, P.temp2, P.Ones

    # temp[i,j] = c[i]*U[i,j], then conjugated in place.
    broadcast!(*, temp, c, U)
    conj!(temp)
    # Accumulate row i of temp into row t[i] of a zeroed temp2.
    fill!(temp2, zero(T))
    recombine_rows!(temp, t, temp2)
    # The value of p*temp2 is discarded, so this relies on p transforming temp2 in
    # place (the type-I planner builds p with plan_bfft!).
    p*temp2
    conj!(temp2)
    # Scale by V and sum across columns into f.
    broadcast!(*, temp, V, temp2)
    mul!(f, temp, Ones)

    f
end

"""
    mul!(F, P, C)

Apply the plan `P` to each column of `C` in turn, storing the results in the
corresponding columns of `F`. Returns `F`.
"""
function mul!(F::Matrix{T}, P::NUFFTPlan{N,T}, C::Matrix{T}) where {N,T}
    ncols = size(F, 2)
    col = 1
    while col <= ncols
        mul_col_J!(F, P, C, col)
        col += 1
    end
    return F
end

"""
    broadcast_col_J!(f, temp, C, U, J)

Fill `temp[i,j] = f(C[i,J], U[i,j])` for every row `i` of `C` and every column `j` of
`temp`. Column `J` of `C` is read through linear indexing. Returns `temp`.
"""
function broadcast_col_J!(f, temp::Matrix, C::Matrix, U::Matrix, J::Int)
    nrows = size(C, 1)
    # Linear-index offset of the first entry of column J of C.
    offset = nrows*(J-1)
    @inbounds for col in 1:size(temp, 2), row in 1:nrows
        temp[row, col] = f(C[offset + row], U[row, col])
    end
    return temp
end

# Apply a type-I plan to column J of C and store the result in column J of F.
# Same sequence of steps as the vector `mul!` for type-I plans, with the column
# selected through linear-index offsets.
function mul_col_J!(F::Matrix{T}, P::NUFFTPlan{1,T}, C::Matrix{T}, J::Int) where {T}
    U, V, p, t, temp, temp2, Ones = P.U, P.V, P.p, P.t, P.temp, P.temp2, P.Ones

    # temp[i,j] = C[i,J]*U[i,j], then conjugated in place.
    broadcast_col_J!(*, temp, C, U, J)
    conj!(temp)
    # Accumulate row i of temp into row t[i] of a zeroed temp2.
    fill!(temp2, zero(T))
    recombine_rows!(temp, t, temp2)
    # The value of p*temp2 is discarded; relies on p acting in place on temp2.
    p*temp2
    conj!(temp2)
    broadcast!(*, temp, V, temp2)
    # Sum the columns of temp into F, starting at the linear index of F[1,J]
    # (F and C are assumed to have the same number of rows).
    COLSHIFT = size(C, 1)*(J-1)
    mul_for_col_J!(F, temp, Ones, 1+COLSHIFT, 1)

    F
end

"""
    mul!(f, P, c)

Apply the type-II plan `P` to the vector `c`, writing the result into `f`.
Overwrites the plan's work buffers. Returns `f`.
"""
function mul!(f::AbstractVector{T}, P::NUFFTPlan{2,T}, c::AbstractVector{T}) where {T}
    work, scratch = P.temp, P.temp2

    # Scale the coefficients by every column of V.
    work .= c .* P.V
    # The value is discarded: the plan is expected to transform `work` in place.
    P.p*work
    # Gather rows through the index vector, scale by U, then sum the columns.
    reindex_temp!(work, P.t, scratch)
    work .= P.U .* scratch
    mul!(f, work, P.Ones)

    return f
end

# Apply a type-II plan to column J of C and store the result in column J of F.
function mul_col_J!(F::Matrix{T}, P::NUFFTPlan{2,T}, C::Matrix{T}, J::Int) where {T}
    U, V, p, t, temp, temp2, Ones = P.U, P.V, P.p, P.t, P.temp, P.temp2, P.Ones

    # temp[i,j] = C[i,J]*V[i,j]
    broadcast_col_J!(*, temp, C, V, J)
    # The value of p*temp is discarded; relies on p acting in place on temp
    # (the type-II planner builds p with plan_fft!).
    p*temp
    # temp2[i,j] = temp[t[i],j], then scale by U.
    reindex_temp!(temp, t, temp2)
    broadcast!(*, temp, U, temp2)
    # Sum the columns of temp into F, starting at the linear index of F[1,J]
    # (F and C are assumed to have the same number of rows).
    COLSHIFT = size(C, 1)*(J-1)
    mul_for_col_J!(F, temp, Ones, 1+COLSHIFT, 1)

    F
end

"""
    mul!(f, P, c)

Apply the type-III plan `P` to the vector `c`, writing the result into `f`.
Overwrites the plan's work buffers. Returns `f`.
"""
function mul!(f::AbstractVector{T}, P::NUFFTPlan{3,T}, c::AbstractVector{T}) where {T}
    work, scratch = P.temp, P.temp2

    # Scale the coefficients by every column of V.
    scratch .= c .* P.V
    # The inner plan is applied out of place, column by column, into `work`.
    mul!(work, P.p, scratch)
    # Gather rows through the index vector, scale by U, then sum the columns.
    reindex_temp!(work, P.t, scratch)
    work .= P.U .* scratch
    mul!(f, work, P.Ones)

    return f
end


# Apply a type-III plan to column J of C and store the result in column J of F.
function mul_col_J!(F::Matrix{T}, P::NUFFTPlan{3,T}, C::Matrix{T}, J::Int) where {T}
    U, V, p, t, temp, temp2, Ones = P.U, P.V, P.p, P.t, P.temp, P.temp2, P.Ones

    # temp2[i,j] = C[i,J]*V[i,j]
    broadcast_col_J!(*, temp2, C, V, J)
    # Out-of-place application of the inner plan to every column of temp2.
    mul!(temp, p, temp2)
    # temp2[i,j] = temp[t[i],j], then scale by U.
    reindex_temp!(temp, t, temp2)
    broadcast!(*, temp, U, temp2)
    # Sum the columns of temp into F, starting at the linear index of F[1,J]
    # (F and C are assumed to have the same number of rows).
    COLSHIFT = size(C, 1)*(J-1)
    mul_for_col_J!(F, temp, Ones, 1+COLSHIFT, 1)

    F
end

"""
    mul_for_col_J!(y, A, x, istart, jstart[, INCX, INCY])

Compute the matrix-vector product of `A` with a strided slice of `x` and store it in a
strided slice of `y`, both addressed by linear index. The slice of `y` starts at
`istart` with stride `INCY`; the slice of `x` starts at `jstart` with stride `INCX`.
Both strides default to 1. The target entries of `y` are overwritten. Returns `y`.
"""
function mul_for_col_J!(y::AbstractVecOrMat{T}, A::AbstractMatrix{T}, x::AbstractVecOrMat{T}, istart::Int, jstart::Int) where T
    return mul_for_col_J!(y, A, x, istart, jstart, 1, 1)
end

function mul_for_col_J!(y::AbstractVecOrMat{T}, A::AbstractMatrix{T}, x::AbstractVecOrMat{T}, istart::Int, jstart::Int, INCX::Int, INCY::Int) where T
    nrows, ncols = size(A)
    ybase = istart - INCY
    xbase = jstart - INCX
    # Clear the target entries before accumulating.
    @inbounds for r in 1:nrows
        y[ybase + r*INCY] = zero(T)
    end
    # Column-oriented accumulation: y += A[:,c]*x[c] for each column c.
    @inbounds for c in 1:ncols
        scale = x[xbase + c*INCX]
        for r in 1:nrows
            y[ybase + r*INCY] += A[r, c]*scale
        end
    end

    return y
end

"""
    reindex_temp!(temp, t, temp2)

Gather rows of `temp` into `temp2`: `temp2[i,j] = temp[t[i],j]` for every entry
position of `temp`. Returns `temp2`.
"""
function reindex_temp!(temp::Matrix{T}, t::Vector{Int}, temp2::Matrix{T}) where {T}
    nrows, ncols = size(temp)
    @inbounds for col in 1:ncols, row in 1:nrows
        temp2[row, col] = temp[t[row], col]
    end
    return temp2
end

"""
    recombine_rows!(temp, t, temp2)

Scatter-add rows of `temp` into `temp2`: `temp2[t[i],j] += temp[i,j]` for every entry
of `temp`. `temp2` is not cleared first. Returns `temp2`.
"""
function recombine_rows!(temp::Matrix{T}, t::Vector{Int}, temp2::Matrix{T}) where {T}
    nrows, ncols = size(temp)
    @inbounds for col in 1:ncols, row in 1:nrows
        temp2[t[row], col] += temp[row, col]
    end
    return temp2
end

"""
Computes a nonuniform fast Fourier transform of type I:

```math
f_j = \\sum_{k=0}^{N-1} c_k e^{-2\\pi{\\rm i} \\frac{j}{N} \\omega_k},\\quad{\\rm for}\\quad 0 \\le j \\le N-1.
```
"""
function nufft1(c::AbstractVector, ω::AbstractVector{T}, ϵ::T) where {T<:AbstractFloat}
    # Build a one-off type-I plan for the frequencies ω and apply it to c.
    plan = plan_nufft1(ω, ϵ)
    return plan*c
end

"""
Computes a nonuniform fast Fourier transform of type II:

```math
f_j = \\sum_{k=0}^{N-1} c_k e^{-2\\pi{\\rm i} x_j k},\\quad{\\rm for}\\quad 0 \\le j \\le N-1.
```
"""
function nufft2(c::AbstractVector, x::AbstractVector{T}, ϵ::T) where {T<:AbstractFloat}
    # Build a one-off type-II plan for the samples x and apply it to c.
    plan = plan_nufft2(x, ϵ)
    return plan*c
end

"""
Computes a nonuniform fast Fourier transform of type III:

```math
f_j = \\sum_{k=0}^{N-1} c_k e^{-2\\pi{\\rm i} x_j \\omega_k},\\quad{\\rm for}\\quad 0 \\le j \\le N-1.
```
"""
function nufft3(c::AbstractVector, x::AbstractVector{T}, ω::AbstractVector{T}, ϵ::T) where {T<:AbstractFloat}
    # Build a one-off type-III plan for the samples x and frequencies ω and apply it to c.
    plan = plan_nufft3(x, ω, ϵ)
    return plan*c
end

# The unqualified names refer to the type-III transform and its planner.
const nufft = nufft3
const plan_nufft = plan_nufft3


"""
Pre-computes a 2D nonuniform fast Fourier transform.

For best performance, choose the right number of threads by `FFTW.set_num_threads(4)`, for example.
"""
struct NUFFT2DPlan{T,P1,P2} <: Plan{T}
    # 1D plan that `mul!` applies to all columns.
    p1::P1
    # 1D plan that `mul!` applies to each row.
    p2::P2
    # Buffer holding one row at a time while `p2` is applied; overwritten by every `mul!`.
    temp::Vector{T}
end

"""
Pre-computes a 2D nonuniform fast Fourier transform of type I-I.
"""
function plan_nufft1(ω::AbstractVector{T}, π::AbstractVector{T}, ϵ::T) where T<:AbstractFloat
    # Note: the argument `π` shadows the constant π inside this method; it is the
    # second set of frequencies, not the number.
    colplan = plan_nufft1(ω, ϵ)
    rowplan = plan_nufft1(π, ϵ)
    # One row of the output is staged in this buffer while the row plan is applied.
    rowbuf = zeros(Complex{T}, length(π))

    return NUFFT2DPlan(colplan, rowplan, rowbuf)
end

"""
Pre-computes a 2D nonuniform fast Fourier transform of type II-II.
"""
function plan_nufft2(x::AbstractVector{T}, y::AbstractVector{T}, ϵ::T) where T<:AbstractFloat
    colplan = plan_nufft2(x, ϵ)
    rowplan = plan_nufft2(y, ϵ)
    # One row of the output is staged in this buffer while the row plan is applied.
    rowbuf = zeros(Complex{T}, length(y))

    return NUFFT2DPlan(colplan, rowplan, rowbuf)
end

"""
    p * C

Apply the 2D NUFFT plan `p` to the coefficient matrix `C` and return the result in a
newly allocated matrix with element type `promote_type(T, V)`.

`mul!` for `NUFFT2DPlan` requires input and output to share the plan's element type, so
`C` is converted to the promoted element type first (for example a real matrix applied
to a complex plan). No copy is made when `C` already has that element type.
"""
function (*)(p::NUFFT2DPlan{T}, C::Matrix{V}) where {T,V}
    S = promote_type(T, V)
    mul!(zeros(S, size(C)), p, convert(Matrix{S}, C))
end

"""
    mul!(F, P, C)

Apply the 2D plan `P` to `C`, storing the result in `F`: first `P.p1` on every column,
then `P.p2` on every row. Overwrites `P.temp`. Returns `F`.
"""
function mul!(F::Matrix{T}, P::NUFFT2DPlan{T}, C::Matrix{T}) where {T}
    rowbuf = P.temp

    # Column pass.
    mul!(F, P.p1, C)

    # Row pass: stage each row in the buffer, transform it, and write it back.
    ncols = size(F, 2)
    for row in 1:size(C, 1)
        @inbounds for col in 1:ncols
            rowbuf[col] = F[row, col]
        end
        mul!(rowbuf, P.p2, rowbuf)
        @inbounds for col in 1:ncols
            F[row, col] = rowbuf[col]
        end
    end

    return F
end

"""
Computes a 2D nonuniform fast Fourier transform of type I-I:

```math
F_{i,j} = \\sum_{k=0}^{M-1}\\sum_{\\ell=0}^{N-1} C_{k,\\ell} e^{-2\\pi{\\rm i} (\\frac{i}{M} \\omega_k + \\frac{j}{N} \\pi_{\\ell})},\\quad{\\rm for}\\quad 0 \\le i \\le M-1,\\quad 0 \\le j \\le N-1.
```
"""
function nufft1(C::Matrix, ω::AbstractVector{T}, π::AbstractVector{T}, ϵ::T) where {T<:AbstractFloat}
    # Build a one-off 2D type I-I plan and apply it to C. The argument `π` shadows the
    # constant π inside this method.
    plan = plan_nufft1(ω, π, ϵ)
    return plan*C
end

"""
Computes a 2D nonuniform fast Fourier transform of type II-II:

```math
F_{i,j} = \\sum_{k=0}^{M-1}\\sum_{\\ell=0}^{N-1} C_{k,\\ell} e^{-2\\pi{\\rm i} (x_i k + y_j \\ell)},\\quad{\\rm for}\\quad 0 \\le i \\le M-1,\\quad 0 \\le j \\le N-1.
```
"""
function nufft2(C::Matrix, x::AbstractVector{T}, y::AbstractVector{T}, ϵ::T) where {T<:AbstractFloat}
    # Build a one-off 2D type II-II plan and apply it to C.
    plan = plan_nufft2(x, y, ϵ)
    return plan*C
end


"""
    FindK(γ, ϵ)

Return the integer `K` that the NUFFT planners use as the number of columns of the
low-rank factors: `1` when `γ ≤ ϵ`, otherwise `ceil(5γ*exp(W(log(10/ϵ)/γ/7)))` with `W`
the Lambert-W function.
"""
function FindK(γ::T, ϵ::T) where {T<:AbstractFloat}
    γ ≤ ϵ && return 1
    return Int(ceil(5*γ*exp(lambertw(log(10/ϵ)/γ/7))))
end

"""
    AssignClosestEquispacedGridpoint(x)

Round `length(x)*x` elementwise to the nearest integer. The integers are returned
converted back to the floating-point element type of `x`.
"""
function AssignClosestEquispacedGridpoint(x::AbstractVector{T})::AbstractVector{T} where {T<:AbstractFloat}
    npts = size(x, 1)
    return round.(Int, npts*x)
end
"""
    AssignClosestEquispacedFFTpoint(x)

Round `length(x)*x` elementwise to the nearest integer, wrap it modulo `length(x)` and
shift by one, giving indices in `1:length(x)`.
"""
function AssignClosestEquispacedFFTpoint(x::AbstractVector{T})::Array{Int,1} where {T<:AbstractFloat}
    npts = size(x, 1)
    nearest = round.(Int, npts*x)
    return mod.(nearest, npts) .+ 1
end
"""
    PerturbationParameter(x, s_vec)

Return the infinity norm of `length(x)*x - s_vec`, i.e. the largest distance between a
scaled sample and its assigned grid point.
"""
function PerturbationParameter(x::AbstractVector{T}, s_vec::AbstractVector{T})::AbstractFloat where {T<:AbstractFloat}
    npts = size(x, 1)
    offsets = npts*x - s_vec
    return norm(offsets, Inf)
end

# Build the N×K factor `U` used by the NUFFT plans from the samples `x`.
function constructU(x::AbstractVector{T}, K::Int) where {T<:AbstractFloat}
    # Construct a low rank approximation, using Chebyshev expansions
    # for AK = exp(-2*pi*1im*(x[j]-j/N)*k):
    N = length(x)
    s = AssignClosestEquispacedGridpoint(x)
    # Signed offsets of the scaled samples from their nearest grid points, and their
    # largest magnitude.
    er = N*x - s
    γ = norm(er, Inf)
    # Phase factor, Chebyshev polynomials evaluated at the offsets rescaled by γ,
    # and the K×K coefficient matrix from Bessel_coeffs.
    # NOTE(review): er/γ is NaN when γ == 0; FindK returns K = 1 in that case, so only
    # the constant column of ChebyshevP is produced — confirm callers always pair K
    # with FindK.
    Diagonal(exp.(-im*(π*er)))*ChebyshevP(K-1, er/γ)*Bessel_coeffs(K, γ)
end

"""
    constructV(ω, K)

Evaluate the Chebyshev polynomials of degree `0:K-1` at `2ω/length(ω) - 1` and return
the values as a complex matrix with one column per degree.
"""
function constructV(ω::AbstractVector{T}, K::Int) where {T<:AbstractFloat}
    scale = two(T)/length(ω)
    nodes = ω.*scale .- 1
    return complex(ChebyshevP(K-1, nodes))
end

"""
    Bessel_coeffs(K, γ)

Return the `K×K` complex matrix of Chebyshev coefficients of `exp(-2πi*x*y)` on
`[-γ,γ]×[0,1]`, expressed through products of Bessel functions. Only entries whose
indices `p` and `q` have the same parity are non-zero.
"""
function Bessel_coeffs(K::Int, γ::T) where {T<:AbstractFloat}
    coeffs = zeros(Complex{T}, K, K)
    z = -γ*π/two(T)
    for p in 0:K-1, q in mod(p, 2):2:K-1
        coeffs[p+1, q+1] = 4*(1im)^q*besselj((p+q)/2, z)*besselj((q-p)/2, z)
    end
    # The first row and first column are halved (so the corner entry is quartered).
    coeffs[1, :] = coeffs[1, :]/two(T)
    coeffs[:, 1] = coeffs[:, 1]/two(T)
    return coeffs
end

"""
    ChebyshevP(n, x)

Evaluate the Chebyshev polynomials of the first kind of degree `0, …, n` at the points
`x`. Returns a `length(x) × (n+1)` matrix whose column `k+1` holds `T_k` at each point.
"""
function ChebyshevP(n::Int, x::AbstractVector{T}) where T<:AbstractFloat
    npts = size(x, 1)
    vals = Matrix{T}(undef, npts, n+1)

    # Degree 0: constant one.
    unit = convert(eltype(x), 1.0)
    @inbounds for row in 1:npts
        vals[row, 1] = unit
    end
    # Degree 1: the points themselves.
    if n > 0
        @inbounds for row in 1:npts
            vals[row, 2] = x[row]
        end
    end
    # Higher degrees from T_{k}(x) = 2x*T_{k-1}(x) - T_{k-2}(x).
    doubled = 2x
    @inbounds for deg in 2:n
        @simd for row in 1:npts
            vals[row, deg+1] = doubled[row]*vals[row, deg] - vals[row, deg-1]
        end
    end
    return vals
end


================================================
FILE: src/specialfunctions.jl
================================================
import Base.Math: @horner

# Boolean direction flags.
const FORWARD  =  true
const BACKWARD = false

# sqrt(π) and e/sqrt(2π) as Float64 literals.
const sqrtpi = 1.772453850905516027298
const edivsqrt2pi = 1.084437551419227546612

"""
Compute a typed 0.5.
"""
function half(x::Number)
    # 0.5 converted to the type of the argument.
    return convert(typeof(x), 0.5)
end
function half(x::Integer)
    # Integers cannot represent 0.5, so promote the argument to floating point first.
    return half(float(x))
end
function half(::Type{T}) where {T<:Number}
    return convert(T, 0.5)
end
function half(::Type{T}) where {T<:Integer}
    # Integer types fall back to the default floating-point representation.
    return half(AbstractFloat)
end

"""
Compute a typed 2.
"""
function two(x::Number)
    # 2 converted to the type of the argument.
    return convert(typeof(x), 2)
end
function two(::Type{T}) where {T<:Number}
    return convert(T, 2)
end

"""
The Kronecker ``\\delta`` function:

```math
\\delta_{k,j} = \\left\\{\\begin{array}{ccc} 1 & {\\rm for} & k = j,\\\\ 0 & {\\rm for} & k \\ne j.\\end{array}\\right.
```
"""
function δ(k::Integer,j::Integer)
    # Returns an Int: 1 when the indices coincide, 0 otherwise.
    if k == j
        return 1
    end
    return 0
end


"""
Pochhammer symbol ``(x)_n = \\frac{\\Gamma(x+n)}{\\Gamma(x)}`` for the rising factorial.
"""
function pochhammer(x::Number,n::Integer)
    ret = one(x)
    if n≥0
        # (x)_n = x(x+1)⋯(x+n-1); the empty product for n == 0 is one.
        for i=0:n-1
            ret *= x+i
        end
    else
        # Negative n: (x)_n = 1/(x+n)_{-n}.
        ret /= pochhammer(x+n,-n)
    end
    ret
end

# Integer-valued `n` of any numeric type uses the product form; otherwise the ratio
# Γ(x+n)/Γ(x) is formed from reciprocal gamma functions.
pochhammer(x::Number,n::Number) = isinteger(n) ? pochhammer(x,Int(n)) : ogamma(x)/ogamma(x+n)

# Vector of (x)_k for every k in the range `n`. The first entry is computed directly
# and the rest follow from (x)_k = (x+k-1)*(x)_{k-1}. An empty range gives an empty
# vector.
function pochhammer(x::Number,n::UnitRange{T}) where T<:Real
    # `undef` is required by the Array constructor; the bare-length form is not a
    # valid constructor call.
    ret = Vector{promote_type(typeof(x),T)}(undef, length(n))
    isempty(n) && return ret
    ret[1] = pochhammer(x,first(n))
    for i=2:length(n)
        ret[i] = (x+n[i]-1)*ret[i-1]
    end
    ret
end

# Logarithm of the absolute value of the gamma function: the first component returned
# by `logabsgamma`, with the sign component discarded.
lgamma(x) = logabsgamma(x)[1]

"""
    ogamma(x)

Reciprocal gamma function `1/Γ(x)`. Negative integers, where `Γ` has poles, return a
floating-point zero without calling `gamma`.
"""
function ogamma(x::Number)
    if isinteger(x) && x < 0
        return zero(float(x))
    end
    return inv(gamma(x))
end

"""
Stirling's asymptotic series for ``\\Gamma(z)``.
"""
function stirlingseries(z)
    # Generic fallback: Γ(z) divided by the leading Stirling factor
    # sqrt(2π/z)*(z/e)^z, evaluated directly from `gamma`.
    return gamma(z)*sqrt((z/π)/2)*exp(z)/z^z
end

# Float64 specialization. For z above the smallest threshold the value is a truncated
# polynomial in 1/z evaluated with Horner's rule; each branch keeps N coefficients
# (see the trailing comments) and larger z uses fewer terms. Below the last threshold
# the closed form with `gamma` is evaluated directly.
# NOTE(review): the thresholds look tuned so each truncation reaches double precision —
# confirm against the derivation.
function stirlingseries(z::Float64)
    if z ≥ 3274.12075200175       # N =  4
        @horner(inv(z),1.0,0.08333333333333333,0.003472222222222222,-0.0026813271604938273)
    elseif z ≥ 590.1021805526798  # N =  5
        @horner(inv(z),1.0,0.08333333333333333,0.003472222222222222,-0.0026813271604938273,-0.00022947209362139917)
    elseif z ≥ 195.81733962412835 # N =  6
        @horner(inv(z),1.0,0.08333333333333333,0.003472222222222222,-0.0026813271604938273,-0.00022947209362139917,0.0007840392217200666)
    elseif z ≥ 91.4692823071966   # N =  7
        @horner(inv(z),1.0,0.08333333333333333,0.003472222222222222,-0.0026813271604938273,-0.00022947209362139917,0.0007840392217200666,6.972813758365857e-5)
    elseif z ≥ 52.70218954633605  # N =  8
        @horner(inv(z),1.0,0.08333333333333333,0.003472222222222222,-0.0026813271604938273,-0.00022947209362139917,0.0007840392217200666,6.972813758365857e-5,-0.0005921664373536939)
    elseif z ≥ 34.84031591198865  # N =  9
        @horner(inv(z),1.0,0.08333333333333333,0.003472222222222222,-0.0026813271604938273,-0.00022947209362139917,0.0007840392217200666,6.972813758365857e-5,-0.0005921664373536939,-5.171790908260592e-5)
    elseif z ≥ 25.3173982783047   # N = 10
        @horner(inv(z),1.0,0.08333333333333333,0.003472222222222222,-0.0026813271604938273,-0.00022947209362139917,0.0007840392217200666,6.972813758365857e-5,-0.0005921664373536939,-5.171790908260592e-5,0.0008394987206720873)
    elseif z ≥ 19.685015283078513 # N = 11
        @horner(inv(z),1.0,0.08333333333333333,0.003472222222222222,-0.0026813271604938273,-0.00022947209362139917,0.0007840392217200666,6.972813758365857e-5,-0.0005921664373536939,-5.171790908260592e-5,0.0008394987206720873,7.204895416020011e-5)
    elseif z ≥ 16.088669099569266 # N = 12
        @horner(inv(z),1.0,0.08333333333333333,0.003472222222222222,-0.0026813271604938273,-0.00022947209362139917,0.0007840392217200666,6.972813758365857e-5,-0.0005921664373536939,-5.171790908260592e-5,0.0008394987206720873,7.204895416020011e-5,-0.0019144384985654776)
    elseif z ≥ 13.655055978888104 # N = 13
        @horner(inv(z),1.0,0.08333333333333333,0.003472222222222222,-0.0026813271604938273,-0.00022947209362139917,0.0007840392217200666,6.972813758365857e-5,-0.0005921664373536939,-5.171790908260592e-5,0.0008394987206720873,7.204895416020011e-5,-0.0019144384985654776,-0.00016251626278391583)
    elseif z ≥ 11.93238782087875  # N = 14
        @horner(inv(z),1.0,0.08333333333333333,0.003472222222222222,-0.0026813271604938273,-0.00022947209362139917,0.0007840392217200666,6.972813758365857e-5,-0.0005921664373536939,-5.171790908260592e-5,0.0008394987206720873,7.204895416020011e-5,-0.0019144384985654776,-0.00016251626278391583,0.00640336283380807)
    elseif z ≥ 10.668852439197263 # N = 15
        @horner(inv(z),1.0,0.08333333333333333,0.003472222222222222,-0.0026813271604938273,-0.00022947209362139917,0.0007840392217200666,6.972813758365857e-5,-0.0005921664373536939,-5.171790908260592e-5,0.0008394987206720873,7.204895416020011e-5,-0.0019144384985654776,-0.00016251626278391583,0.00640336283380807,0.0005401647678926045)
    elseif z ≥ 9.715358216638403  # N = 16
        @horner(inv(z),1.0,0.08333333333333333,0.003472222222222222,-0.0026813271604938273,-0.00022947209362139917,0.0007840392217200666,6.972813758365857e-5,-0.0005921664373536939,-5.171790908260592e-5,0.0008394987206720873,7.204895416020011e-5,-0.0019144384985654776,-0.00016251626278391583,0.00640336283380807,0.0005401647678926045,-0.02952788094569912)
    elseif z ≥ 8.979120323411497  # N = 17
        @horner(inv(z),1.0,0.08333333333333333,0.003472222222222222,-0.0026813271604938273,-0.00022947209362139917,0.0007840392217200666,6.972813758365857e-5,-0.0005921664373536939,-5.171790908260592e-5,0.0008394987206720873,7.204895416020011e-5,-0.0019144384985654776,-0.00016251626278391583,0.00640336283380807,0.0005401647678926045,-0.02952788094569912,-0.002481743600264998)
    else
        # Small z: same closed form as the generic method (z/2π parses as z/(2π)).
        gamma(z)*sqrt(z/2π)*exp(z)/z^z
    end
end


# Evaluates (1+ζ(N))*Γ(N)/((2π)^(N+1)*z^N), divided by stirlingseries(z).
# NOTE(review): presumably a bound on the relative remainder after N terms of the
# Stirling series — confirm against the reference.
stirlingremainder(z::Number,N::Int) = (1+zeta(N))*gamma(N)/((2π)^(N+1)*z^N)/stirlingseries(z)

# Ratio factor multiplied into the Stirling-series branch of
# Anαβ(::Integer, ::Float64, ::Float64).
# The Float64 method builds it as the exponential of a sum of log1p terms.
# NOTE(review): presumably the log1p form is there for accuracy at large n, and the
# two methods do not look algebraically identical — confirm before relying on the
# generic one.
Aratio(n::Int,α::Float64,β::Float64) = exp((n/2+α+1/4)*log1p(-β/(n+α+β+1))+(n/2+β+1/4)*log1p(-α/(n+α+β+1))+(n/2+1/4)*log1p(α/(n+1))+(n/2+1/4)*log1p(β/(n+1)))
Aratio(n::Number,α::Number,β::Number) = (1+(α+1)/n)^(n+α+1/2)*(1+(β+1)/n)^(n+β+1/2)/(1+(α+β+1)/n)^(n+α+β+1/2)/(1+(zero(α)+zero(β))/n)^(n+1/2)

# Ratio factor multiplied into the Stirling-series branch of
# Cnαβ(::Integer, ::Float64, ::Float64).
# The Float64 method builds it as the exponential of a sum of log1p terms divided by
# sqrt(n); note that (α+β+2)/2n parses as (α+β+2)/(2n).
# NOTE(review): presumably the log1p form is there for accuracy at large n — confirm.
Cratio(n::Int,α::Float64,β::Float64) = exp((n+α+1/2)*log1p((α-β)/(2n+α+β+2))+(n+β+1/2)*log1p((β-α)/(2n+α+β+2))-log1p((α+β+2)/2n)/2)/sqrt(n)
Cratio(n::Number,α::Number,β::Number) = n^(-1/2)*(1+(α+1)/n)^(n+α+1/2)*(1+(β+1)/n)^(n+β+1/2)/(1+(α+β+2)/2n)^(2n+α+β+3/2)


# Generic evaluation of
#   2^(α+β+1)/(2n+α+β+1) * Γ(n+α+1)Γ(n+β+1) / (Γ(n+α+β+1)Γ(n+1))
# through differences of log-gamma values. `lgamma` returns the log of |Γ|, so signs
# of the gamma factors are not tracked.
Anαβ(n::Number,α::Number,β::Number) = 2^(α+β+1)/(2n+α+β+1)*exp(lgamma(n+α+1)-lgamma(n+α+β+1)+lgamma(n+β+1)-lgamma(n+1))
"""
    Anαβ(n::Integer, α, β)

Integer-degree evaluation: starts from the closed form `2^(α+β+1)*B(α+1,β+1)` at degree
zero and multiplies by the ratio of consecutive values for each degree up to `n`.
"""
function Anαβ(n::Integer,α::Number,β::Number)
    n == 0 && return 2^(α+β+1)*beta(α+1,β+1)
    acc = Anαβ(0,α,β)
    for k in 1:n
        # Ratio of the degree-k value to the degree-(k-1) value.
        acc *= (k+α)*(k+β)/(k+α+β)/k*(2k+α+β-1)/(2k+α+β+1)
    end
    return acc
end

# Float64 specialization. Once n plus the smallest of α, β, α+β and 0 reaches the
# threshold, the gamma functions are replaced by `stirlingseries` factors combined
# with `Aratio`; below it, the value is obtained from the next degree through the
# ratio of consecutive values, recursing upward in n until the threshold is reached.
function Anαβ(n::Integer,α::Float64,β::Float64)
    if n+min(α,β,α+β,0) ≥ 7.979120323411497
        2 ^ (α+β+1)/(2n+α+β+1)*stirlingseries(n+α+1)*Aratio(n,α,β)/stirlingseries(n+α+β+1)*stirlingseries(n+β+1)/stirlingseries(n+one(Float64))
    else
        (n+1)*(n+α+β+1)/(n+α+1)/(n+β+1)*Anαβ(n+1,α,β)*((2n+α+β+3)/(2n+α+β+1))
    end
end


"""
The Lambda function ``\\Lambda(z) = \\frac{\\Gamma(z+\\frac{1}{2})}{\\Gamma(z+1)}`` for the ratio of gamma functions.
"""
function Λ(z::Number)
    # Special case λ₁ = 1/2, λ₂ = 1 of the three-argument method.
    return Λ(z, half(z), one(z))
end

"""
For 64-bit floating-point arithmetic, the Lambda function uses the asymptotic series for ``\\tau`` in Appendix B of

I. Bogaert and B. Michiels and J. Fostier, 𝒪(1) computation of Legendre polynomials and Gauss–Legendre nodes and weights for parallel computing, *SIAM J. Sci. Comput.*, **34**:C83–C101, 2012.
"""
function Λ(x::Float64)
    if x > 9.84475
        # Polynomial in 1/(x+1/4)^2 evaluated with Horner's rule, divided by sqrt(x+1/4).
        xp = x+0.25
        @horner(inv(xp^2),1.0,-1.5625e-02,2.5634765625e-03,-1.2798309326171875e-03,1.343511044979095458984375e-03,-2.432896639220416545867919921875e-03,6.7542375336415716446936130523681640625e-03)/sqrt(xp)
    else
        # Below the threshold, step the argument up by one with
        # Λ(x) = (x+1)/(x+1/2)*Λ(x+1) until the series applies.
        (x+1.0)*Λ(x+1.0)/(x+0.5)
    end
end

"""
The Lambda function ``\\Lambda(z,\\lambda_1,\\lambda_2) = \\frac{\\Gamma(z+\\lambda_1)}{\\Gamma(z+\\lambda_2)}`` for the ratio of gamma functions.
"""
function Λ(z::Real, λ₁::Real, λ₂::Real)
    num, den = z+λ₁, z+λ₂
    # Both arguments positive: form the ratio through log-gamma differences.
    if num > 0 && den > 0
        return exp(lgamma(num)-lgamma(den))
    end
    # Otherwise evaluate the two gamma functions directly.
    return gamma(num)/gamma(den)
end
# Float64 specialization of the gamma ratio Γ(x+λ₁)/Γ(x+λ₂). When both arguments reach
# the threshold the gamma functions are expressed through `stirlingseries`; otherwise
# the argument is stepped up by one with Λ(x) = (x+λ₂)/(x+λ₁)*Λ(x+1).
function Λ(x::Float64, λ₁::Float64, λ₂::Float64)
    if min(x+λ₁,x+λ₂) ≥ 8.979120323411497
        exp(λ₂-λ₁+(x-.5)*log1p((λ₁-λ₂)/(x+λ₂)))*(x+λ₁)^λ₁/(x+λ₂)^λ₂*stirlingseries(x+λ₁)/stirlingseries(x+λ₂)
    else
        (x+λ₂)/(x+λ₁)*Λ(x + 1.0, λ₁, λ₂)
    end
end

## TODO: deprecate when Lambert-W is supported in a mainstream repository such as SpecialFunctions.jl
"""
The principal branch of the Lambert-W function, defined by ``x = W_0(x) e^{W_0(x)}``, computed using Halley's method for ``x \\in [-e^{-1},\\infty)``.
"""
function lambertw(x::AbstractFloat)
    if x < -exp(-one(x))
        # DomainError must be given the offending value; constructing it with no
        # arguments is itself a MethodError.
        throw(DomainError(x, "lambertw is only defined on the principal branch for x ≥ -1/e."))
    elseif x == -exp(-one(x))
        # Branch point: W₀(-1/e) = -1 exactly.
        return -one(x)
    elseif x < 0
        # Initial guess on (-1/e, 0).
        w0 = ℯ*x/(1+inv(inv(sqrt(2*ℯ*x+2))+inv(ℯ-1)-inv(sqrt(2))))
    else
        # Initial guess on [0, ∞).
        log1px = log1p(x)
        w0 = log1px*(1-log1p(log1px)/(2+log1px))
    end
    # First Halley step for w*exp(w) - x = 0.
    expw0 = exp(w0)
    w1 = w0 - (w0*expw0 - x)/((w0 + 1)*expw0 -
        (w0 + 2) * (w0*expw0 - x)/(2w0 + 2))
    # Iterate until the relative change is within two units of machine precision.
    # For x == 0 both iterates are zero, the comparison involves NaN and is false,
    # and zero is returned.
    while abs(w1/w0 - 1) > 2eps(typeof(x))
        w0 = w1
        expw0 = exp(w0)
        w1 = w0 - (w0*expw0 - x)/((w0 + 1)*expw0 -
            (w0 + 2) * (w0*expw0 - x)/(2w0 + 2))
    end
    return w1
end
# Any other real argument is promoted to floating point first.
lambertw(x::Real) = lambertw(float(x))


# Evaluates 2^λ/sqrt(π) * Λ(n+λ). The Float64 method uses the precomputed constant
# `sqrtpi`; the generic method converts π to the type of λ before taking the root.
Cnλ(n::Integer,λ::Float64) = 2^λ/sqrtpi*Λ(n+λ)
Cnλ(n::Integer,λ::Number) = 2^λ/sqrt(oftype(λ,π))*Λ(n+λ)
"""
    Cnλ(n::UnitRange, λ)

Vector of `Cnλ(k, λ)` for every `k` in the range `n`. The first entry is computed
directly; each following entry is obtained from the previous one by the ratio
`(k+λ-1/2)/(k+λ)`. The range must be non-empty.
"""
function Cnλ(n::UnitRange{T},λ::Number) where T<:Integer
    out = Vector{typeof(λ)}(undef, length(n))
    out[1] = Cnλ(first(n),λ)
    for idx in 2:length(n)
        k = n[idx]
        out[idx] = (k+λ-half(λ))/(k+λ)*out[idx-1]
    end
    return out
end

# Recursive in m: m == 0 gives Cnλ(n,λ); each further level multiplies the previous
# value by (λ+m-1)/(2m) * (m-λ)/(n+λ+m). There is no guard for negative m, which would
# recurse without terminating.
function Cnmλ(n::Integer,m::Integer,λ::Number)
    if m == 0
        Cnλ(n,λ)
    else
        (λ+m-1)/2/m*(m-λ)/(n+λ+m)*Cnmλ(n,m-1,λ)
    end
end


"""
    Cnαβ(n::Integer, α, β)

Generic evaluation: starts from `2^(α+β+1)*B(α+1,β+1)/π` at `n == 0` and multiplies by
the ratio of consecutive values for each index up to `n`.
"""
function Cnαβ(n::Integer,α::Number,β::Number)
    n == 0 && return 2^(α+β+1)*beta(α+1,β+1)/π
    acc = Cnαβ(0,α,β)
    for k in 1:n
        # Ratio of the value at index k to the value at index k-1.
        acc *= (k+α)*(k+β)/(k+(α+β+1)/2)/(k+(α+β)/2)
    end
    return acc
end

# Float64 specialization. Once n + min(α,β) reaches the threshold the value is built
# from `stirlingseries` factors and `Cratio`; below it, the value is obtained from the
# next index through the inverse of the consecutive-value ratio, recursing upward in n.
function Cnαβ(n::Integer,α::Float64,β::Float64)
    if n+min(α,β) ≥ 7.979120323411497
        stirlingseries(n+α+1)/sqrtpi/stirlingseries(2n+α+β+2)*Cratio(n,α,β)*stirlingseries(n+β+1)
    else
        (n+(α+β+3)/2)/(n+β+1)*(n+(α+β+2)/2)/(n+α+1)*Cnαβ(n+1,α,β)
    end
end

# Recursive in m: m == 0 gives Cnαβ(n,α,β); each further level divides the previous
# value by 2*(2n+α+β+m+1) (the juxtaposition `2(...)` binds tighter than `/`).
# There is no guard for negative m, which would recurse without terminating.
function Cnmαβ(n::Integer,m::Integer,α::Number,β::Number)
    if m == 0
        Cnαβ(n,α,β)
    else
        Cnmαβ(n,m-1,α,β)/2(2n+α+β+m+1)
    end
end


"""
    Cnmαβ(n, m, α::AbstractArray, β::AbstractArray)

Elementwise evaluation of `Cnmαβ(n, m, α[i], β[i])` over two arrays of parameters. The
result is reshaped to the common shape of `α` and `β`; `promote_shape` throws when the
shapes are incompatible.
"""
function Cnmαβ(n::Integer,m::Integer,α::AbstractArray{T},β::AbstractArray{T}) where T<:Number
    shp = promote_shape(size(α),size(β))
    vals = map(i -> Cnmαβ(n,m,α[i],β[i]), eachindex(α,β))
    return reshape(vals, shp)
end


"""
Modified Chebyshev moments of the first kind:

```math
    \\int_{-1}^{+1} T_n(x) {\\rm\\,d}x.
```
"""
function chebyshevmoments1(::Type{T}, N::Int) where T
    μ = zeros(T, N)
    # Only even degrees are assigned (2/(1-n²)); odd entries stay zero.
    @inbounds for k in 1:2:N
        deg = k-1
        μ[k] = two(T)/T(1-deg^2)
    end
    return μ
end

"""
Modified Chebyshev moments of the first kind:

```math
    \\int^a T_n(x) {\\rm\\,d}x.
```
"""
function chebyshevmoments1(::Type{T}, N::Int, a::T) where T
    # Entry n+1 holds an antiderivative of T_n evaluated at `a`:
    #   n = 0: a,   n = 1: a²/2,   n ≥ 2: (T_{n+1}(a)/(n+1) - T_{n-1}(a)/(n-1))/2,
    # with T_k(a) = cos(k*acos(a)).
    μ = zeros(T, N)
    # Guard the two explicit entries so that N = 0 and N = 1 return a short vector
    # instead of indexing out of bounds.
    N > 0 && (μ[1] = a)
    N > 1 && (μ[2] = a^2/2)
    θ = acos(a)
    for i = 2:N-1
        @inbounds μ[i+1] = (cos((i+1)*θ)/(i+1) - cos((i-1)*θ)/(i-1))/2
    end
    μ
end

"""
    chebyshevmoments1(T, N, a::NTuple, w::NTuple)

Weighted sum of differences of the single-endpoint moments: for each `k`, the moments
at `a[k+1]` minus those at `a[k]`, scaled by `w[k]`. `a` must have exactly one more
entry than `w`, and `w` must be non-empty.
"""
function chebyshevmoments1(::Type{T}, N::Int, a::NTuple{L, T}, w::NTuple{M, T}) where {T, L, M}
    @assert L == M+1
    @assert M > 0
    μ = zeros(T, N)
    for (k, wk) in enumerate(w)
        upper = chebyshevmoments1(T, N, a[k+1])
        lower = chebyshevmoments1(T, N, a[k])
        μ .+= wk*(upper - lower)
    end
    return μ
end

"""
Modified Chebyshev moments of the first kind with respect to the Jacobi weight:

```math
    \\int_{-1}^{+1} T_n(x) (1-x)^\\alpha(1+x)^\\beta{\\rm\\,d}x.
```
"""
function chebyshevjacobimoments1(::Type{T}, N::Int, α, β) where T
    μ = zeros(T, N)
    # Degree 0: 2^(α+β+1)*B(α+1,β+1).
    N > 0 && (μ[1] = 2 .^ (α+β+1)*beta(α+1,β+1))
    if N > 1
        # Degree 1 from degree 0, then a three-term recurrence for the higher degrees.
        μ[2] = μ[1]*(β-α)/(α+β+2)
        for i=1:N-2
            @inbounds μ[i+2] = (2(β-α)*μ[i+1]-(α+β-i+2)*μ[i])/(α+β+i+2)
        end
    end
    μ
end

"""
Modified Chebyshev moments of the first kind with respect to the logarithmic weight:

```math
    \\int_{-1}^{+1} T_n(x) \\log\\left(\\frac{2}{1-x}\\right){\\rm\\,d}x.
```
"""
function chebyshevlogmoments1(::Type{T}, N::Int) where T
    μ = zeros(T, N)
    N > 0 || return μ
    μ[1] = two(T)
    N > 1 || return μ
    μ[2] = one(T)
    # Two-step recurrence; the inhomogeneous term depends on the parity of i.
    for i in 1:N-2
        denom = isodd(i) ? T(4-i^2) : T(1-i^2)
        @inbounds μ[i+2] = ((i-2)*μ[i]+T(4)/denom)/(i+2)
    end
    return μ
end

"""
Modified Chebyshev moments of the first kind with respect to the log-Chebyshev weight:

```math
    \\int_{-1}^{+1} T_n(x) \\log\\left(\\frac{2}{1-x}\\right)\\frac{{\\rm d}x}{\\sqrt{1-x^2}}.
```
"""
function chebyshevlogchebyshevmoments1(::Type{T}, N::Int) where T
    μ = zeros(T, N)
    N > 0 || return μ
    # Degree 0 is 2π*log(2); degree n ≥ 1 is π/n.
    μ[1] = 2*log(T(2))*π
    @inbounds for deg in 1:N-1
        μ[deg+1] = T(π)/deg
    end
    return μ
end

"""
Modified Chebyshev moments of the first kind with respect to the absolute value weight:

```math
    \\int_{-1}^{+1} T_n(x) |x|{\\rm\\,d}x.
```
"""
function chebyshevabsmoments1(::Type{T}, N::Int) where T
    μ = zeros(T, N)
    # Only degrees that are multiples of four are assigned; all other entries stay zero.
    @inbounds for deg in 0:4:N-1
        h = deg÷2
        μ[deg+1] = -T(1)/T(h^2-1)
    end
    return μ
end

"""
Modified Chebyshev moments of the second kind:

```math
    \\int_{-1}^{+1} U_n(x) {\\rm\\,d}x.
```
"""
function chebyshevmoments2(::Type{T}, N::Int) where T
    μ = zeros(T, N)
    # Only even degrees are assigned (2/(n+1)); odd entries stay zero.
    @inbounds for k in 1:2:N
        μ[k] = two(T)/T(k)
    end
    return μ
end

"""
Modified Chebyshev moments of the second kind with respect to the Jacobi weight:

```math
    \\int_{-1}^{+1} U_n(x) (1-x)^\\alpha(1+x)^\\beta{\\rm\\,d}x.
```
"""
function chebyshevjacobimoments2(::Type{T}, N::Int, α, β) where T
    μ = zeros(T, N)
    # Degree 0: 2^(α+β+1)*B(α+1,β+1).
    N > 0 && (μ[1] = 2 .^ (α+β+1)*beta(α+1,β+1))
    if N > 1
        # Degree 1 from degree 0 (note the factor 2 relative to the first-kind
        # moments), then a three-term recurrence for the higher degrees.
        μ[2] = 2μ[1]*(β-α)/(α+β+2)
        for i=1:N-2
            @inbounds μ[i+2] = (2(β-α)*μ[i+1]-(α+β-i)*μ[i])/(α+β+i+2)
        end
    end
    μ
end

"""
Modified Chebyshev moments of the second kind with respect to the logarithmic weight:

```math
    \\int_{-1}^{+1} U_n(x) \\log\\left(\\frac{2}{1-x}\\right){\\rm\\,d}x.
```
"""
function chebyshevlogmoments2(::Type{T}, N::Int) where T
    # Start from the first-kind moments and convert in place; entry k-2 has already
    # been converted when entry k is updated.
    μ = chebyshevlogmoments1(T, N)
    N > 1 || return μ
    μ[2] *= two(T)
    @inbounds for k in 3:N
        μ[k] = 2μ[k] + μ[k-2]
    end
    return μ
end

"""
Modified Chebyshev moments of the second kind with respect to the absolute value weight:

```math
    \\int_{-1}^{+1} U_n(x) |x|{\\rm\\,d}x.
```
"""
function chebyshevabsmoments2(::Type{T}, N::Int) where T
    # Start from the first-kind moments and convert in place; entry k-2 has already
    # been converted when entry k is updated.
    μ = chebyshevabsmoments1(T, N)
    N > 1 || return μ
    μ[2] *= two(T)
    @inbounds for k in 3:N
        μ[k] = 2μ[k] + μ[k-2]
    end
    return μ
end

"""
    sphrand(T, m, n)

Return an `m×n` array of zeros whose first column and, for each `j` in `1:n÷2`, the
first `m-j` rows of columns `2j` and `2j+1` are filled with `rand(T)` values.
"""
function sphrand(::Type{T}, m::Int, n::Int) where T
    A = zeros(T, m, n)
    for row in 1:m
        A[row, 1] = rand(T)
    end
    for j in 1:n÷2, row in 1:m-j
        A[row, 2j] = rand(T)
        A[row, 2j+1] = rand(T)
    end
    return A
end

"""
    sphrandn(T, m, n)

Return an `m×n` array of zeros whose first column and, for each `j` in `1:n÷2`, the
first `m-j` rows of columns `2j` and `2j+1` are filled with `randn(T)` values.
"""
function sphrandn(::Type{T}, m::Int, n::Int) where T
    A = zeros(T, m, n)
    for row in 1:m
        A[row, 1] = randn(T)
    end
    for j in 1:n÷2, row in 1:m-j
        A[row, 2j] = randn(T)
        A[row, 2j+1] = randn(T)
    end
    return A
end

"""
    sphones(T, m, n)

Return an `m×n` array of zeros whose first column and, for each `j` in `1:n÷2`, the
first `m-j` rows of columns `2j` and `2j+1` are set to one.
"""
function sphones(::Type{T}, m::Int, n::Int) where T
    A = zeros(T, m, n)
    for row in 1:m
        A[row, 1] = one(T)
    end
    for j in 1:n÷2, row in 1:m-j
        A[row, 2j] = one(T)
        A[row, 2j+1] = one(T)
    end
    return A
end

# All-zero m×n array; zero-filled counterpart of sphrand / sphrandn / sphones.
sphzeros(::Type{T}, m::Int, n::Int) where T = zeros(T, m, n)

"""
Pointwise evaluation of real orthonormal spherical harmonic:

```math
Y_\\ell^m(\\theta,\\varphi) = (-1)^{|m|}\\sqrt{(\\ell+\\frac{1}{2})\\frac{(\\ell-|m|)!}{(\\ell+|m|)!}} P_\\ell^{|m|}(\\cos\\theta) \\sqrt{\\frac{2-\\delta_{m,0}}{2\\pi}} \\left\\{\\begin{array}{ccc} \\cos m\\varphi & {\\rm for} & m \\ge 0,\\\\ \\sin(-m\\varphi) & {\\rm for} & m < 0.\\end{array}\\right.
```
"""
function sphevaluate(θ, φ, L, M)
    # Rescale both angles to units of π and defer to the `sphevaluatepi` kernels.
    return sphevaluatepi(θ/π, φ/π, L, M)
end

# Angles are given in units of π. The value is the product of the θ-dependent factor
# (three-argument method) and the φ-dependent factor (two-argument method).
sphevaluatepi(θ::Number, φ::Number, L::Integer, M::Integer) = sphevaluatepi(θ, L, M)*sphevaluatepi(φ, M)

# θ-dependent factor, with θ in units of π (cospi/sinpi are used throughout).
# Only |M| enters. L < |M| is not rejected.
function sphevaluatepi(θ::Number, L::Integer, M::Integer)
    # Start from 1/sqrt(2) and build the L == |M| value with one sine factor per order.
    ret = one(θ)/sqrt(two(θ))
    if M < 0 M = -M end
    c, s = cospi(θ), sinpi(θ)
    for m = 1:M
        ret *= sqrt((m+half(θ))/m)*s
    end
    tc = two(c)*c

    if L == M
        return ret
    elseif L == M+1
        return sqrt(two(θ)*M+3)*c*ret
    else
        # Three-term recurrence in the degree l, carrying the two most recent values.
        temp = ret
        ret *= sqrt(two(θ)*M+3)*c
        for l = M+1:L-1
            ret, temp = (sqrt(l+half(θ))*tc*ret - sqrt((l-M)*(l+M)/(l-half(θ)))*temp)/sqrt((l-M+1)*(l+M+1)/(l+3half(θ))), ret
        end
        return ret
    end
end

"""
    sphevaluatepi(φ, M)

φ-dependent factor with `φ` in units of π: the normalization
`sqrt((2-δ(M,0))/(2π))` times `cospi(M*φ)` for `M ≥ 0` or `sinpi(-M*φ)` for `M < 0`.
"""
function sphevaluatepi(φ::Number, M::Integer)
    scale = sqrt((two(φ)-δ(M, 0))/(two(φ)*π))
    angular = M ≥ 0 ? cospi(M*φ) : sinpi(-M*φ)
    return scale*angular
end

"""
    sphvrand(T, m, n)

Return an `m×n` array of zeros with `rand(T)` values in the first `m-1` rows of the
first column and, for each `j` in `1:n÷2`, in the first `m-j+1` rows of columns `2j`
and `2j+1`.
"""
function sphvrand(::Type{T}, m::Int, n::Int) where T
    A = zeros(T, m, n)
    for row in 1:m-1
        A[row, 1] = rand(T)
    end
    for j in 1:n÷2, row in 1:m-j+1
        A[row, 2j] = rand(T)
        A[row, 2j+1] = rand(T)
    end
    return A
end

"""
    sphvrandn(T, m, n)

Return an `m×n` array of zeros with `randn(T)` values in the first `m-1` rows of the
first column and, for each `j` in `1:n÷2`, in the first `m-j+1` rows of columns `2j`
and `2j+1`.
"""
function sphvrandn(::Type{T}, m::Int, n::Int) where T
    A = zeros(T, m, n)
    for row in 1:m-1
        A[row, 1] = randn(T)
    end
    for j in 1:n÷2, row in 1:m-j+1
        A[row, 2j] = randn(T)
        A[row, 2j+1] = randn(T)
    end
    return A
end

"""
    sphvones(T, m, n)

Return an `m×n` array of zeros with ones in the first `m-1` rows of the first column
and, for each `j` in `1:n÷2`, in the first `m-j+1` rows of columns `2j` and `2j+1`.
"""
function sphvones(::Type{T}, m::Int, n::Int) where T
    A = zeros(T, m, n)
    for row in 1:m-1
        A[row, 1] = one(T)
    end
    for j in 1:n÷2, row in 1:m-j+1
        A[row, 2j] = one(T)
        A[row, 2j+1] = one(T)
    end
    return A
end

# All-zero m×n array; forwards to `sphzeros`.
sphvzeros(::Type{T}, m::Int, n::Int) where T = sphzeros(T, m, n)

"""
    diskrand(T, m, n)

Return an `m×n` array of zeros with `rand(T)` values in the first column and, for each
`j` in `1:n÷2`, in the first `m-(j+1)÷2` rows of columns `2j` and `2j+1`.
"""
function diskrand(::Type{T}, m::Int, n::Int) where T
    A = zeros(T, m, n)
    for row in 1:m
        A[row, 1] = rand(T)
    end
    for j in 1:n÷2, row in 1:m-(j+1)÷2
        A[row, 2j] = rand(T)
        A[row, 2j+1] = rand(T)
    end
    return A
end

"""
    diskrandn(T, m, n)

Return an `m×n` array of zeros with `randn(T)` values in the first column and, for each
`j` in `1:n÷2`, in the first `m-(j+1)÷2` rows of columns `2j` and `2j+1`.
"""
function diskrandn(::Type{T}, m::Int, n::Int) where T
    A = zeros(T, m, n)
    for row in 1:m
        A[row, 1] = randn(T)
    end
    for j in 1:n÷2, row in 1:m-(j+1)÷2
        A[row, 2j] = randn(T)
        A[row, 2j+1] = randn(T)
    end
    return A
end

"""
    diskones(T, m, n)

Return an `m×n` array of zeros with ones in the first column and, for each `j` in
`1:n÷2`, in the first `m-(j+1)÷2` rows of columns `2j` and `2j+1`.
"""
function diskones(::Type{T}, m::Int, n::Int) where T
    A = zeros(T, m, n)
    for row in 1:m
        A[row, 1] = one(T)
    end
    for j in 1:n÷2, row in 1:m-(j+1)÷2
        A[row, 2j] = one(T)
        A[row, 2j+1] = one(T)
    end
    return A
end

# All-zero m×n array; zero-filled counterpart of diskrand / diskrandn / diskones.
diskzeros(::Type{T}, m::Int, n::Int) where T = zeros(T, m, n)

"""
    trirand(T, m, n)

Return an `m×n` array of zeros whose column `j` has `rand(T)` values in its first
`m+1-j` rows.
"""
function trirand(::Type{T}, m::Int, n::Int) where T
    A = zeros(T, m, n)
    for col in 1:n, row in 1:m+1-col
        A[row, col] = rand(T)
    end
    return A
end

"""
    trirandn(T, m, n)

Return an `m×n` array of zeros whose column `j` has `randn(T)` values in its first
`m+1-j` rows.
"""
function trirandn(::Type{T}, m::Int, n::Int) where T
    A = zeros(T, m, n)
    for col in 1:n, row in 1:m+1-col
        A[row, col] = randn(T)
    end
    return A
end

"""
    triones(T, m, n)

Return an `m×n` array of zeros whose column `j` has ones in its first `m+1-j` rows.
"""
function triones(::Type{T}, m::Int, n::Int) where T
    A = zeros(T, m, n)
    for col in 1:n, row in 1:m+1-col
        A[row, col] = one(T)
    end
    return A
end

# All-zero m×n array; zero-filled counterpart of trirand / trirandn / triones.
trizeros(::Type{T}, m::Int, n::Int) where T = zeros(T, m, n)

# The rectdisk* constructors are aliases of the tri* constructors and therefore
# produce the same sparsity pattern.
const rectdiskrand = trirand
const rectdiskrandn = trirandn
const rectdiskones = triones
const rectdiskzeros = trizeros

"""
Pointwise evaluation of triangular harmonic:

```math
\\tilde{P}_{\\ell,m}^{(\\alpha,\\beta,\\gamma)}(x,y).
```
"""
function trievaluate(x, y, L, M, α, β, γ)
    # Product of the two factor methods of `trievaluate`.
    xfactor = trievaluate(x, L, M, α, β, γ)
    xyfactor = trievaluate(x, y, M, β, γ)
    return xfactor*xyfactor
end

# NOTE(review): unimplemented stub — the body is empty, so this method returns
# `nothing`, and the seven-argument `trievaluate` that multiplies its result cannot
# produce a value.
function trievaluate(x::Number, L::Integer, M::Integer, α::Number, β::Number, γ::Number)

end

# NOTE(review): unimplemented stub — the body is empty, so this method returns
# `nothing`.
function trievaluate(x::Number, y::Number, M::Integer, β::Number, γ::Number)

end

"""
    tetrand(T, l, m, n)

Return an `l×m×n` array of zeros with `rand(T)` values at the entries `(i,j,k)` that
satisfy `k ≤ n`, `j ≤ m+1-k` and `i ≤ l+2-k-j`.
"""
function tetrand(::Type{T}, l::Int, m::Int, n::Int) where T
    A = zeros(T, l, m, n)
    for slab in 1:n, col in 1:m+1-slab, row in 1:l+2-slab-col
        A[row, col, slab] = rand(T)
    end
    return A
end

"""
    tetrandn(T, l, m, n)

Return an `l×m×n` array of zeros with `randn(T)` values at the entries `(i,j,k)` that
satisfy `k ≤ n`, `j ≤ m+1-k` and `i ≤ l+2-k-j`.
"""
function tetrandn(::Type{T}, l::Int, m::Int, n::Int) where T
    A = zeros(T, l, m, n)
    for slab in 1:n, col in 1:m+1-slab, row in 1:l+2-slab-col
        A[row, col, slab] = randn(T)
    end
    return A
end

"""
    tetones(T, l, m, n)

Return an `l×m×n` array of zeros with ones at the entries `(i,j,k)` that satisfy
`k ≤ n`, `j ≤ m+1-k` and `i ≤ l+2-k-j`.
"""
function tetones(::Type{T}, l::Int, m::Int, n::Int) where T
    A = zeros(T, l, m, n)
    for slab in 1:n, col in 1:m+1-slab, row in 1:l+2-slab-col
        A[row, col, slab] = one(T)
    end
    return A
end

# All-zero l×m×n array; zero-filled counterpart of tetrand / tetrandn / tetones.
tetzeros(::Type{T}, l::Int, m::Int, n::Int) where T = zeros(T, l, m, n)

"""
    spinsphrand(T, m, n, s)

Return an `m×n` array of zeros with `rand(T)` values in the first `m-|s|` rows of the
first column and, for each `j` in `1:n÷2`, in the first `m-max(j,|s|)` rows of columns
`2j` and `2j+1`.
"""
function spinsphrand(::Type{T}, m::Int, n::Int, s::Int) where T
    A = zeros(T, m, n)
    spin = abs(s)
    for row in 1:m-spin
        A[row, 1] = rand(T)
    end
    for j in 1:n÷2, row in 1:m-max(j, spin)
        A[row, 2j] = rand(T)
        A[row, 2j+1] = rand(T)
    end
    return A
end

"""
    spinsphrandn(T, m, n, s)

Return an `m×n` array of zeros with `randn(T)` values in the first `m-|s|` rows of the
first column and, for each `j` in `1:n÷2`, in the first `m-max(j,|s|)` rows of columns
`2j` and `2j+1`.
"""
function spinsphrandn(::Type{T}, m::Int, n::Int, s::Int) where T
    A = zeros(T, m, n)
    spin = abs(s)
    for row in 1:m-spin
        A[row, 1] = randn(T)
    end
    for j in 1:n÷2, row in 1:m-max(j, spin)
        A[row, 2j] = randn(T)
        A[row, 2j+1] = randn(T)
    end
    return A
end

"""
    spinsphones(T, m, n, s)

Return an `m×n` array of zeros with ones in the first `m-|s|` rows of the first column
and, for each `j` in `1:n÷2`, in the first `m-max(j,|s|)` rows of columns `2j` and
`2j+1`.
"""
function spinsphones(::Type{T}, m::Int, n::Int, s::Int) where T
    A = zeros(T, m, n)
    spin = abs(s)
    for row in 1:m-spin
        A[row, 1] = one(T)
    end
    for j in 1:n÷2, row in 1:m-max(j, spin)
        A[row, 2j] = one(T)
        A[row, 2j+1] = one(T)
    end
    return A
end

# All-zero m×n array. NOTE(review): unlike spinsphrand / spinsphrandn / spinsphones,
# this constructor takes no spin argument `s`.
spinsphzeros(::Type{T}, m::Int, n::Int) where T = zeros(T, m, n)


================================================
FILE: src/toeplitzhankel.jl
================================================
"""
Represent a scaled Toeplitz∘Hankel matrix:

    DL(T∘H)DR

where the Hankel matrix `H` is non-negative definite, via

    ∑_{k=1}^r Diagonal(L[:,k])*T*Diagonal(R[:,k])

where `L` and `R` are determined by doing a rank-r pivoted Cholesky decomposition of `H`, which in low rank form is

    H ≈ ∑_{k=1}^r C[:,k]C[:,k]'

so that `L[:,k] = DL*C[:,k]` and `R[:,k] = DR*C[:,k]`.

This allows a Cholesky decomposition in 𝒪(K²N) operations and 𝒪(KN) storage, K = log N log ɛ⁻¹.
The tuple storage allows plans applied to each dimension.
"""
struct ToeplitzHankelPlan{S, N, N1, LowR, TP, Dims} <: Plan{S}
    T::TP # A length M Vector or Tuple of ToeplitzPlan
    L::LowR  # A length M Vector or Tuple of Matrices storing low rank factors of L
    R::LowR # A length M Vector or Tuple of Matrices storing low rank factors of R
    tmp::Array{S,N1} # A larger dimensional array to transform each scaled array all-at-once
    dims::Dims # A length M Vector or Tuple of Int storing the dimensions acted on
    # Inner constructor: allocates the shared scratch array `tmp` itself, sized as
    # the elementwise maximum of the sizes of all plans in `T`, so callers only
    # pass the plans, the low-rank factors and the dimensions.
    function ToeplitzHankelPlan{S,N,N1,LowR,TP,Dims}(T::TP, L::LowR, R::LowR, dims) where {S,N,N1,LowR,TP,Dims}
        tmp = Array{S}(undef, max.(size.(T)...)...)
        new{S,N,N1,LowR,TP,Dims}(T, L, R, tmp, dims)
    end
end


# Outer constructors. Each one fills in type parameters the caller left out and
# forwards to the next, ending at the inner constructor:
#   {S,N,M}: infer the storage types `LowR`, `TP` and `Dims` from the arguments.
ToeplitzHankelPlan{S,N,M}(T::TP, L::LowR, R::LowR, dims::Dims) where {S,N,M,LowR,TP,Dims} = ToeplitzHankelPlan{S,N,M,LowR,TP,Dims}(T, L, R, dims)
#   {S,N}: the scratch array has one more dimension than the data (N + 1).
ToeplitzHankelPlan{S,N}(T, L, R, dims) where {S,N} = ToeplitzHankelPlan{S,N,N+1}(T, L, R, dims)
#   Single-dimension convenience: wrap one plan and one pair of factors in 1-tuples.
ToeplitzHankelPlan(T::ToeplitzPlan{S,M}, L::Matrix, R::Matrix, dims=1) where {S,M} = ToeplitzHankelPlan{S,M-1,M}((T,), (L,), (R,), dims)

# The size of the plan is the size of its first Toeplitz plan.
size(TH::ToeplitzHankelPlan) = size(first(TH.T))


# Reshape the `size(R,1) × M` matrix `R` to an (N+1)-dimensional array whose
# `d`-th dimension has length `size(R,1)`, whose last dimension has length `M`,
# and whose remaining dimensions are singletons, ready for broadcasting.
function _reshape_broadcast(d, R, ::Val{N}, M) where N
    leading = ntuple(k -> k == d ? size(R,1) : 1, Val(N))
    return reshape(R, leading..., M)
end

# Apply `∑_k Diagonal(L[:,k]) * T * Diagonal(R[:,k])` along dimension `d` of `v`,
# in place, using `tmp` as scratch. All rank-`M` terms are stacked along an
# extra trailing dimension of `tmp` and summed back into `v` at the end.
function _th_applymul!(d, v::AbstractArray{<:Any,N}, T, L, R, tmp) where N
    rank = size(R,2)
    window = view(tmp, axes(v)..., OneTo(rank))
    window .= _reshape_broadcast(d, R, Val(N), rank) .* v
    T * window
    window .*= _reshape_broadcast(d, L, Val(N), rank)
    sum!(v, window)
end


# Apply the plan in place to `v`, one acted-on dimension after another, and
# return `v`. Only arrays whose dimensionality matches the plan's `N` dispatch here.
function *(P::ToeplitzHankelPlan{<:Any,N}, v::AbstractArray{<:Any,N}) where N
    foreach(P.R, P.L, P.T, P.dims) do R, L, T, d
        _th_applymul!(d, v, T, L, R, P.tmp)
    end
    return v
end

# Fallback for arrays whose dimensionality does not match the plan.
function *(P::ToeplitzHankelPlan, v::AbstractArray)
    error("plan applied to wrong-sized array")
end


# partial cholesky for a Hankel matrix

# Pivoted partial Cholesky factorisation of the Hankel matrix `H[i,j] = v[i+j-1]`.
# `v` is expected to hold the `2n-1` defining entries. Returns an `n × r` matrix
# `C` with `H ≈ C*C'`; the iteration stops as soon as the largest remaining
# diagonal entry drops to the relative tolerance.
function hankel_partialchol(v::Vector{T}) where T
    # Assumes positive definite
    invpivots = T[]
    n = isempty(v) ? 0 : (length(v)+2) ÷ 2
    C = Matrix{T}(undef, n, n)
    diagH = v[1:2:end] # diag of H
    @assert length(v) ≥ 2n-1
    reltol = maximum(abs,diagH)*eps(T)*log(n)
    rank = 0
    for k in 1:n
        pivot, idx = findmax(diagH)
        pivot ≤ reltol && break
        push!(invpivots, inv(pivot))
        col = view(C, :, k)
        # Column `idx` of the Hankel matrix ...
        col .= view(v, idx:n+idx-1)
        # ... minus the contribution of the columns already factored.
        for j in 1:k-1
            LinearAlgebra.axpy!(-C[idx,j]*invpivots[j], view(C,:,j), col)
        end
        # Update the diagonal of the Schur complement.
        @inbounds for p in 1:n
            diagH[p] -= C[p,k]^2/pivot
        end
        rank += 1
    end
    # Columns were kept unscaled during the sweep; normalise them now.
    for (k, s) in enumerate(invpivots)
        rmul!(view(C,:,k), sqrt(s))
    end
    return C[:,1:rank]
end

# cholesky for D .* H .* D'
#
# Pivoted partial Cholesky factorisation of the diagonally scaled Hankel matrix
# `D .* H .* D'` with `H[i,j] = v[i+j-1]`. `v` is expected to hold the `2n-1`
# defining entries and `D` the `n` scaling factors. Returns an `n × r` matrix `C`
# with `D .* H .* D' ≈ C*C'`.
#
# At most 100 columns are supported. An `ErrorException` is thrown as soon as a
# 101st column would be needed (previously this surfaced as a `BoundsError`,
# while a factorisation of rank exactly 100 was rejected even though it fit).
function hankel_partialchol(v::Vector, D::AbstractVector)
    T = promote_type(eltype(v), eltype(D))
    maxrank = 100
    # Assumes positive definite
    σ = T[]
    n = isempty(v) ? 0 : (length(v)+2) ÷ 2
    # Never allocate more columns than can be filled.
    C = Matrix{T}(undef, n, min(n, maxrank))
    d = v[1:2:end] .* D.^2 # diag of D .* H .* D'
    @assert length(v) ≥ 2n-1
    reltol = maximum(abs,d)*eps(T)*log(n)
    r = 0
    for k = 1:n
        mx,idx = findmax(d)
        if mx ≤ reltol break end
        # Another column is required: fail cleanly before writing past `C`.
        k > maxrank && error("ranks more than 100 not yet supported")
        push!(σ, inv(mx))
        C[:,k] .= view(v,idx:n+idx-1) .*D.*D[idx]
        for j = 1:k-1
            nCjidxσj = -C[idx,j]*σ[j]
            LinearAlgebra.axpy!(nCjidxσj, view(C,:,j), view(C,:,k))
        end
        @inbounds for p=1:n
            d[p] -= C[p,k]^2/mx
        end
        r += 1
    end
    # Columns were kept unscaled during the sweep; normalise them now.
    for k=1:length(σ) rmul!(view(C,:,k), sqrt(σ[k])) end
    C[:,1:r]
end


# Diagonally-scaled Toeplitz∘Hankel polynomial transforms


# Plan wrapping a `ToeplitzHankelPlan`. Its `*` methods compute the first
# coefficient along each acted-on dimension directly and apply the wrapped plan
# to the remaining entries.
struct ChebyshevToLegendrePlanTH{S,TH<:ToeplitzHankelPlan{S}} <: Plan{S}
    toeplitzhankel::TH # plan applied to the entries after the first
end

# Apply the plan in place to the vector `v` and return it. The first output
# entry is the sum over odd `k` of `-v[k]/(k*(k-2))`; entries `2:n` are then
# transformed by the wrapped Toeplitz-Hankel plan.
function *(P::ChebyshevToLegendrePlanTH, v::AbstractVector{S}) where S
    n = length(v)
    iszero(n) && return v
    acc = zero(S)
    @inbounds for k in 1:2:n
        acc -= v[k] / (k * (k - 2))
    end
    v[1] = acc
    P.toeplitzhankel * view(v, 2:n)
    return v
end

# For every fibre of `V` along dimension `d` (indexed by the Cartesian ranges
# `Rpre` before and `Rpost` after `d`), overwrite the first entry with the sum
# over odd `k` of `-V[..., k, ...]/(k*(k-2))`. Returns `V`.
function _cheb2leg_rescale1!(V::AbstractArray{S}, Rpre, Rpost, d) where S
    len = size(V, d)
    for Ipost in Rpost
        for Ipre in Rpre
            acc = zero(S)
            @inbounds for k in 1:2:len
                acc -= V[Ipre, k, Ipost] / (k * (k - 2))
            end
            V[Ipre, 1, Ipost] = acc
        end
    end
    return V
end

# Build a tuple of index ranges covering an array of the given sizes, except
# that the range for dimension `d` starts at 2 (its first entry is dropped).
_dropfirstdim(d::Int) = ()
function _dropfirstdim(d::Int, m, szs...)
    start = d == 1 ? 2 : 1
    return (start:m, _dropfirstdim(d - 1, szs...)...)
end

# Apply the plan in place to the array `V`, one acted-on dimension at a time,
# and return `V`. For each dimension `d` the first entry of every fibre is
# computed by `_cheb2leg_rescale1!`, then the wrapped Toeplitz-Hankel plan is
# applied to the entries after the first along `d`.
function *(P::ChebyshevToLegendrePlanTH, V::AbstractArray)
    TH = P.toeplitzhankel
    tmp = TH.tmp
    for (d,R,L,T) in zip(TH.dims, TH.R, TH.L, TH.T)
        Rpre = CartesianIndices(axes(V)[1:d-1])
        Rpost = CartesianIndices(axes(V)[d+1:end])
        _cheb2leg_rescale1!(V, Rpre, Rpost, d)
        _th_applymul!(d, view(V, _dropfirstdim(d, size(V)...)...), T, L, R, tmp)
    end
    V
end

# Return the sizes `(a, b...)` with the `d`-th entry increased by one.
function _add1tod(d::Integer, a, b...)
    d == 1 && return (a + 1, b...)
    return (a, _add1tod(d - 1, b...)...)
end
# For a collection of dimensions only the first one is used.
_add1tod(d, a, b...) = _add1tod(first(d), a, b...)
# Size of the arrays the plan acts on: take the size of the first Toeplitz plan,
# add one back along the first acted-on dimension, and drop the trailing
# dimension with `Base.front`.
size(P::ChebyshevToLegendrePlanTH) = Base.front(_add1tod(P.toeplitzhankel.dims, size(first(P.toeplitzhankel.T))...))
# The inverse is a freshly built Legendre-to-Chebyshev plan of the same size and dims.
inv(P::ChebyshevToLegendrePlanTH{T}) where T = plan_th_leg2cheb!(T, size(P), P.toeplitzhankel.dims)


# Build the Toeplitz plan `T` and the low-rank factors `L` and `C` used by the
# Legendre-to-Chebyshev plan along dimension `d` of an array of size `mn`.
# `L` equals `C` with its first row halved.
function _leg2chebTH_TLC(::Type{S}, mn, d) where S
    n = mn[d]
    λ = Λ.(0:half(real(S)):n-1)
    C = hankel_partialchol(λ)
    # Only every second entry of the Toeplitz vector is non-zero.
    t = zeros(S, n)
    t[1:2:end] .= 2 .* view(λ, 1:2:n) ./ π
    T = plan_uppertoeplitz!(t, (mn..., size(C,2)), d)
    L = copy(C)
    n > 0 && (L[1,:] ./= 2)
    return T, L, C
end

# Build the Toeplitz plan and the low-rank factors used by the `leg2chebu` plan
# along dimension `d` of an array of size `mn`. The left factor is the Cholesky
# factor `C` with row `k` scaled by `k`; the right factor is `C` itself.
function _leg2chebuTH_TLC(::Type{S}, mn, d) where {S}
    n = mn[d]
    λ = Λ.(0:half(real(S)):n-1)
    C = hankel_partialchol(λ ./ ((1:2n-1) .+ 1))
    # Only every second entry of the Toeplitz vector is non-zero.
    t = zeros(S, n)
    t[1:2:end] = λ[1:2:n] ./ ((1:2:n) .- 2)
    T = plan_uppertoeplitz!(-2t/π, (mn..., size(C,2)), d)
    return (T, (1:n) .* C, C)
end

# Generate `plan_th_leg2cheb!` and `plan_th_leg2chebu!`. For each transform `f`
# the helper `_<f>TH_TLC` returns the (Toeplitz plan, left factor, right factor)
# triple for a single dimension.
for f in (:leg2cheb, :leg2chebu)
    plan = Symbol("plan_th_", f, "!")
    TLC = Symbol("_", f, "TH_TLC")
    @eval begin
        # Single dimension: splat the triple straight into the plan constructor.
        $plan(::Type{S}, mn::NTuple{N,Int}, dims::Int) where {S,N} = ToeplitzHankelPlan($TLC(S, mn, dims)..., dims)
        # Several dimensions: build one triple per dimension and regroup them
        # into collections of plans, left factors and right factors.
        function $plan(::Type{S}, mn::NTuple{N,Int}, dims) where {S,N}
            TLCs = $TLC.(S, Ref(mn), dims)
            ToeplitzHankelPlan{S,N}(map(first, TLCs), map(TLC -> TLC[2], TLCs), map(last, TLCs), dims)
        end
    end
end

###
# th_cheb2leg
###

# Return the sizes `(m, n...)` with the `d`-th entry decreased by one.
_sub_dim_by_one(d) = ()
function _sub_dim_by_one(d, m, n...)
    head = isone(d) ? m - 1 : m
    return (head, _sub_dim_by_one(d - 1, n...)...)
end

# Build the Toeplitz plan and the scaled low-rank factors used by the
# Chebyshev-to-Legendre plan along dimension `d` of an array of size `mn`.
# The factors act on `n - 1` entries (the first one is handled separately by
# the plan), hence the size reduction via `_sub_dim_by_one`.
function _cheb2legTH_TLC(::Type{S}, mn, d) where S
    n = mn[d]
    S̃ = real(S)
    # Only every second entry of the Toeplitz vector is non-zero.
    t = zeros(S, max(0, n-1))
    if n > 1
        t[1:2:end] = Λ.(0:one(S̃):div(n-2,2), -half(S̃), one(S̃))
    end
    D = 1:n-1
    C = hankel_partialchol(Λ.(1:half(S̃):n-1, zero(S̃), 3half(S̃)), D)
    DL = (3half(S̃):n-half(S̃)) ./ D
    DR = -(one(S̃):n-one(S̃)) ./ (4 .* D)
    T = plan_uppertoeplitz!(t, (_sub_dim_by_one(d, mn...)..., size(C,2)), d)
    return T, DL .* C, DR .* C
end

# Plan acting along the single dimension `dims`.
function plan_th_cheb2leg!(::Type{S}, mn::NTuple{N,Int}, dims::Int) where {S,N}
    T, L, R = _cheb2legTH_TLC(S, mn, dims)
    return ChebyshevToLegendrePlanTH(ToeplitzHankelPlan(T, L, R, dims))
end

# Plan acting along several dimensions: one (plan, left, right) triple per
# dimension, regrouped into collections of plans, left and right factors.
function plan_th_cheb2leg!(::Type{S}, mn::NTuple{N,Int}, dims) where {S,N}
    triples = _cheb2legTH_TLC.(S, Ref(mn), dims)
    Ts = map(first, triples)
    Ls = map(tlc -> tlc[2], triples)
    Rs = map(last, triples)
    return ChebyshevToLegendrePlanTH(ToeplitzHankelPlan{S,N}(Ts, Ls, Rs, dims))
end


###
# th_ultra2ultra
###

# Return `true` when `x` is an integer up to the default tolerance of `≈`.
isapproxinteger(::Integer) = true
function isapproxinteger(x)
    isinteger(x) && return true
    x ≈ round(Int, x) && return true
    # `≈` is purely relative, so values close to zero never match `0`;
    # shifting by one handles that case.
    return x + 1 ≈ round(Int, x + 1)
end

"""
    _nearest_jacobi_par(α, γ)

Return a parameter that differs from `γ` by an integer and is less than 1 away
from `α`. If `α - γ` is already approximately an integer, this is `α` itself;
otherwise it is `α` rounded down plus the fractional part of `γ`. A result
`≤ -1` is shifted up by one.
"""
function _nearest_jacobi_par(α::T, γ::T) where T
    candidate = if isapproxinteger(α - γ)
        α
    else
        round(Int, α, RoundDown) + mod(γ, 1)
    end
    return candidate ≤ -1 ? candidate + 1 : candidate
end
# Integer parameters always differ by an integer.
_nearest_jacobi_par(α::T, ::T) where T<:Integer = α
# Mixed argument types are promoted to a common type first.
_nearest_jacobi_par(α, γ) = _nearest_jacobi_par(promote(α, γ)...)


# Plan for the conversion between ultraspherical parameters `λ₁` and `λ₂`.
# Its `*` method applies the stored plans (skipped when `λ₂ - λ₁` is
# approximately an integer) followed by integer raising/lowering steps.
struct Ultra2UltraPlanTH{T, Plans, Dims} <: Plan{T}
    plans::Plans # collection of Toeplitz-Hankel plans, applied in order
    λ₁::T # source parameter
    λ₂::T # target parameter
    dims::Dims # dimensions acted on
end

# Apply the plan to `A` in place. When the parameters differ by (approximately)
# an integer only the integer raising/lowering steps are needed; otherwise the
# stored plans are applied first to reach the nearest parameter `c` that is an
# integer away from `λ₂`.
function *(P::Ultra2UltraPlanTH, A::AbstractArray)
    if isapproxinteger(P.λ₂ - P.λ₁)
        return _ultra2ultra_integerinc!(A, P.λ₁, P.λ₂, P.dims)
    end
    ret = A
    for p in P.plans
        ret = p * ret
    end
    c = _nearest_jacobi_par(P.λ₁, P.λ₂)
    return _ultra2ultra_integerinc!(ret, c, P.λ₂, P.dims)
end

# Build the Toeplitz plan and the low-rank factors for the ultraspherical
# conversion `λ₁ → λ₂` along dimension `d` of an array of size `mn`.
# Requires `|λ₁ - λ₂| < 1`.
function _ultra2ultraTH_TLC(::Type{S}, mn, λ₁, λ₂, d) where {S}
    n = mn[d]
    @assert abs(λ₁-λ₂) < 1
    S̃ = real(S)
    jk = 0:half(S̃):n-1
    # Hankel part and its partial Cholesky factor.
    h = Λ.(jk, λ₁, λ₂+one(S̃))
    lmul!(gamma(λ₂)/gamma(λ₁), h)
    C = hankel_partialchol(h)
    # Toeplitz part: only every second entry is non-zero.
    t = zeros(S, n)
    t[1:2:n] = Λ.(jk, λ₁-λ₂, one(S̃))[1:2:n]
    lmul!(inv(gamma(λ₁-λ₂)), t)
    T = plan_uppertoeplitz!(t, (mn..., size(C,2)), d)
    DL = (zero(S̃):n-one(S̃)) .+ λ₂
    return T, DL .* C, C
end

# Toeplitz-Hankel plan for `λ₁ → λ₂` (with `|λ₁ - λ₂| < 1`) along one dimension.
function _good_plan_th_ultra2ultra!(::Type{S}, mn, λ₁, λ₂, dims::Int) where S
    T, L, C = _ultra2ultraTH_TLC(S, mn, λ₁, λ₂, dims)
    return ToeplitzHankelPlan(T, L, C, dims)
end

# Two-dimensional variant: one (plan, left, right) triple per dimension,
# regrouped into tuples of plans, left factors and right factors.
function _good_plan_th_ultra2ultra!(::Type{S}, mn::NTuple{2,Int}, λ₁, λ₂, dims::NTuple{2,Int}) where S
    triples = map(d -> _ultra2ultraTH_TLC(S, mn, λ₁, λ₂, d), (1, 2))
    return ToeplitzHankelPlan{S,2}(map(first, triples), map(tlc -> tlc[2], triples), map(last, triples), dims)
end


# Build the plan for the ultraspherical conversion `λ₁ → λ₂`. A Toeplitz-Hankel
# plan is only stored when the parameters do not differ by an integer; it then
# maps `λ₁` to the nearest parameter `c` that is an integer away from `λ₂`.
function plan_th_ultra2ultra!(::Type{S}, mn, λ₁, λ₂, dims) where {S}
    c = _nearest_jacobi_par(λ₁, λ₂)

    plans = if isapproxinteger(λ₂ - λ₁)
        # TODO: don't make extra plan
        # The throw-away plan is only built to obtain the element type of the
        # empty vector.
        typeof(_good_plan_th_ultra2ultra!(S, mn, λ₂+0.1, λ₂, dims))[]
    else
        [_good_plan_th_ultra2ultra!(S, mn, λ₁, c, dims)]
    end

    return Ultra2UltraPlanTH(plans, λ₁, λ₂, dims)
end

# In place, replace every column `B[:,j]` by the image of the banded upper
# triangular operator with diagonal `λ/(i+λ-1)` and second superdiagonal
# `-λ/(i+λ+1)`. Arrays with a single row (or none) are returned unchanged.
function _ultra_raise!(B, λ)
    m, n = size(B, 1), size(B, 2)
    m > 1 || return B

    @inbounds for j in 1:n
        for i in 1:m-2
            B[i,j] = λ / (i+λ-1) * B[i,j] + -λ / (i+λ+1) * B[i+2,j]
        end
        # The last two rows have no entry two rows below them.
        B[m-1,j] = λ / (m+λ-2) * B[m-1,j]
        B[m,j] = λ / (m+λ-1) * B[m,j]
    end
    return B
end

# In place, solve column by column with the banded upper triangular operator
# that has diagonal `λ/(i+λ-1)` and second superdiagonal `-λ/(i+λ+1)`, by back
# substitution from the last row upwards. Arrays with a single row (or none)
# are returned unchanged.
function _ultra_lower!(B, λ)
    m, n = size(B, 1), size(B, 2)
    m > 1 || return B

    @inbounds for j in 1:n
        # The last two rows only involve the diagonal.
        B[m,j] = (m+λ-1)/λ * B[m,j]
        B[m-1,j] = (m+λ-2)/λ * B[m-1,j]
        for i in m-2:-1:1
            shifted = B[i,j] + λ / (i+λ+1) * B[i+2,j]
            B[i,j] = (i+λ-1)/λ * shifted
        end
    end
    return B
end


# Apply `_ultra_raise!(·, λ)` along every dimension listed in `dims`.
# Dimension 1 acts on `x` directly; any other dimension acts on `x'`.
function _ultra_raise!(x, λ, dims)
    for d in dims
        _ultra_raise!(d == 1 ? x : x', λ)
    end
    return x
end

# Apply `_ultra_lower!(·, λ - 1)` along every dimension listed in `dims`.
# Dimension 1 acts on `x` directly; any other dimension acts on `x'`.
function _ultra_lower!(x, λ, dims)
    for d in dims
        _ultra_lower!(d == 1 ? x : x', λ - 1)
    end
    return x
end

# Move the parameter from `λ₁` to `λ₂` in steps of one, raising or lowering `x`
# in place at each step, until the two are approximately equal. Returns `x`.
function _ultra2ultra_integerinc!(x, λ₁, λ₂, dims)
    while true
        λ₁ ≈ λ₂ && break
        if λ₂ > λ₁
            _ultra_raise!(x, λ₁, dims)
            λ₁ += 1
        else
            _ultra_lower!(x, λ₁, dims)
            λ₁ -= 1
        end
    end
    return x
end

###
# th_jac2jac
###


# In-place product `B ← A*B` for an upper `Bidiagonal` matrix `A`.
#
# Throws a `DimensionMismatch` when the first dimension of `B` differs from
# `size(A, 1)`. An empty first dimension is returned unchanged; without that
# guard the final-row update below would index row 0 under `@inbounds`.
function _lmul!(A::Bidiagonal, B::AbstractVecOrMat)
    @assert A.uplo == 'U'
    
    m, n = size(B, 1), size(B, 2)
    if m != size(A, 1)
        throw(DimensionMismatch("right hand side B needs first dimension of size $(size(A,1)), has size $m"))
    end
    m == 0 && return B
    @inbounds for j = 1:n
        for i = 1:m-1
            Bij = A.dv[i]*B[i,j]
            Bij += A.ev[i]*B[i+1,j]
            B[i,j] = Bij
        end
        # The last row only has a diagonal entry.
        B[m,j] = A.dv[m]*B[m,j]
    end
    B
end

# Plan for the Jacobi conversion from parameters `(α, β)` to `(γ, δ)`.
# Its `*` method applies the stored plans followed by integer raising/lowering
# steps.
struct Jac2JacPlanTH{T, Plans, Dims} <: Plan{T}
    plans::Plans # collection of Toeplitz-Hankel plans, applied in order
    α::T # source parameters
    β::T
    γ::T # target parameters
    δ::T
    dims::Dims # dimensions acted on
end

# Accept parameters of mixed types by promoting them to a common type.
Jac2JacPlanTH(plans, α, β, γ, δ, dims) = Jac2JacPlanTH(plans, promote(α, β, γ, δ)..., dims)

# Apply the plan to `A` in place. When `α + β ≤ -1` the first parameter is
# raised by one before the stored plans are applied. The plans bring the
# parameters to `(c, d)`, an integer distance from `(γ, δ)`; the remaining
# integer steps are done by `_jac2jac_integerinc!`.
function *(P::Jac2JacPlanTH, A::AbstractArray)
    shifted = P.α + P.β ≤ -1
    shifted && _jacobi_raise_a!(A, P.α, P.β, P.dims)
    c = _nearest_jacobi_par(shifted ? P.α + 1 : P.α, P.γ)
    d = _nearest_jacobi_par(P.β, P.δ)

    ret = A
    for p in P.plans
        ret = p * ret
    end

    return _jac2jac_integerinc!(ret, c, d, P.γ, P.δ, P.dims)
end

# Negate every second entry of `v` (positions 2, 4, …) in place and return `v`.
function alternatesign!(v)
    len = length(v)
    k = 2
    @inbounds while k ≤ len
        v[k] = -v[k]
        k += 2
    end
    return v
end

# Build the Toeplitz plan and the scaled low-rank factors for a Jacobi
# conversion `(α, β) → (γ, δ)` along dimension `d` of an array of size `mn`.
# Only one of the two parameters may change, and by less than one; any other
# combination throws an `ArgumentError`. Requires `α + β > -1`.
function _jac2jacTH_TLC(::Type{S}, mn, α, β, γ, δ, d) where {S}
    n = mn[d]
    @assert α+β > -1
    jk = 0:n-1
    if β == δ
        # Change of the first parameter only.
        @assert abs(α-γ) < 1
        DL = (2jk .+ γ .+ β .+ 1).*Λ.(jk,γ+β+1,β+1)
        t = convert(AbstractVector{S}, Λ.(jk, α-γ,1))
        h = Λ.(0:2n-2,α+β+1,γ+β+2)
        DR = Λ.(jk,β+1,α+β+1)./gamma(α-γ)
    elseif α == γ
        # Change of the second parameter only; the Toeplitz entries alternate in sign.
        @assert abs(β-δ) < 1
        DL = (2jk .+ δ .+ α .+ 1).*Λ.(jk,δ+α+1,α+1)
        h = Λ.(0:2n-2,α+β+1,δ+α+2)
        DR = Λ.(jk,α+1,α+β+1)./gamma(β-δ)
        t = alternatesign!(convert(AbstractVector{S}, Λ.(jk,β-δ,1)))
    else
        throw(ArgumentError("Cannot create Toeplitz dot Hankel, use a sequence of plans."))
    end

    C = hankel_partialchol(h)
    T = plan_uppertoeplitz!(t, (mn..., size(C,2)), d)
    return (T, DL .* C, DR .* C)
end

# Toeplitz-Hankel plan for a single-parameter Jacobi change along one dimension.
function _good_plan_th_jac2jac!(::Type{S}, mn, α, β, γ, δ, dims::Int) where S
    T, L, C = _jac2jacTH_TLC(S, mn, α, β, γ, δ, dims)
    return ToeplitzHankelPlan(T, L, C, dims)
end

# Two-dimensional variant: one (plan, left, right) triple per dimension,
# regrouped into tuples of plans, left factors and right factors.
function _good_plan_th_jac2jac!(::Type{S}, mn::NTuple{2,Int}, α, β, γ, δ, dims::NTuple{2,Int}) where S
    triples = map(d -> _jac2jacTH_TLC(S, mn, α, β, γ, δ, d), (1, 2))
    return ToeplitzHankelPlan{S,2}(map(first, triples), map(tlc -> tlc[2], triples), map(last, triples), dims)
end


# Build the plan for the Jacobi conversion `(α, β) → (γ, δ)`.
#
# The stored Toeplitz-Hankel plans bring the parameters to `(c, d)`, which are
# an integer distance from `(γ, δ)`:
#   * both differences integer: no plan is stored;
#   * one difference integer:   a single plan;
#   * neither:                  two plans, changing the second parameter first.
# When `α + β ≤ -1` the plans start from `α + 1`, matching the raise performed
# by `*` for `Jac2JacPlanTH` before it applies them.
function plan_th_jac2jac!(::Type{S}, mn, α, β, γ, δ, dims) where {S}
    # avoid degeneracies
    a = α + β ≤ -1 ? α + 1 : α
    c, d = _nearest_jacobi_par(a, γ), _nearest_jacobi_par(β, δ)

    if isapproxinteger(β - δ) && isapproxinteger(α-γ)
        # TODO: don't make extra plan
        # The throw-away plan only supplies the element type of the empty vector.
        # It is built from fixed parameters of the same types as `(α+0.1, β, α, β)`:
        # using the caller's values failed the `α+β > -1` assertion in
        # `_jac2jacTH_TLC` whenever `α + β ≤ -1.1`.
        plans = typeof(_good_plan_th_jac2jac!(S, mn, zero(α)+0.1, zero(β), zero(α), zero(β), dims))[]
    elseif isapproxinteger(α - γ) || isapproxinteger(β - δ)
        plans = [_good_plan_th_jac2jac!(S, mn, a, β, c, d, dims)]
    else
        plans = [_good_plan_th_jac2jac!(S, mn, a, β, a, d, dims), _good_plan_th_jac2jac!(S, mn, a, d, c, d, dims)]
    end

    Jac2JacPlanTH(plans, α, β, γ, δ, dims)
end


# In place, apply to every column of `B` the upper bidiagonal operator with
# diagonal `(i+a+b)/(a+b-1+2i)` and superdiagonal `-(i+b)/(a+b+2i+1)`. The first
# diagonal entry is taken as 1. Arrays with a single row (or none) are returned
# unchanged.
function _jacobi_raise_a!(B, a, b)
    m, n = size(B, 1), size(B, 2)
    m > 1 || return B
    @inbounds for j in 1:n
        B[1,j] -= (1+b) / (a+b+3) * B[2,j]
        for i in 2:m-1
            diagterm = (i+a+b)/(a+b-1+2i) * B[i,j]
            B[i,j] = diagterm - (i+b) / (a+b+2i+1) * B[i+1,j]
        end
        B[m,j] = (m+a+b)/(a+b-1+2m) * B[m,j]
    end
    return B
end

# In place, solve column by column with the upper bidiagonal operator that has
# diagonal `(i+a+b)/(a+b-1+2i)` (first entry 1) and superdiagonal
# `-(i+b)/(a+b+2i+1)`, by back substitution from the last row upwards. Arrays
# with a single row (or none) are returned unchanged.
function _jacobi_lower_a!(B, a, b)
    m, n = size(B, 1), size(B, 2)
    m > 1 || return B

    @inbounds for j in 1:n
        B[m,j] = (a+b-1+2m)/(m+a+b) * B[m,j]
        for i in m-1:-1:2
            shifted = B[i,j] + (i+b) / (a+b+2i+1) * B[i+1,j]
            B[i,j] = (a+b-1+2i)/(i+a+b) * shifted
        end
        B[1,j] += (1+b) / (a+b+3) * B[2,j]
    end
    return B
end


# In place, apply to every column of `B` the upper bidiagonal operator with
# diagonal `(i+a+b)/(a+b-1+2i)` and superdiagonal `(i+a)/(a+b+2i+1)`. The first
# diagonal entry is taken as 1. Arrays with a single row (or none) are returned
# unchanged.
function _jacobi_raise_b!(B, a, b)
    m, n = size(B, 1), size(B, 2)
    m > 1 || return B
    @inbounds for j in 1:n
        B[1,j] += (1+a) / (a+b+3) * B[2,j]
        for i in 2:m-1
            diagterm = (i+a+b)/(a+b-1+2i) * B[i,j]
            B[i,j] = diagterm + (i+a) / (a+b+2i+1) * B[i+1,j]
        end
        B[m,j] = (m+a+b)/(a+b-1+2m) * B[m,j]
    end
    return B
end

# In place, solve column by column with the upper bidiagonal operator that has
# diagonal `(i+a+b)/(a+b-1+2i)` (first entry 1) and superdiagonal
# `(i+a)/(a+b+2i+1)`, by back substitution from the last row upwards. Arrays
# with a single row (or none) are returned unchanged.
function _jacobi_lower_b!(B, a, b)
    m, n = size(B, 1), size(B, 2)
    m > 1 || return B

    @inbounds for j in 1:n
        B[m,j] = (a+b-1+2m)/(m+a+b) * B[m,j]
        for i in m-1:-1:2
            shifted = B[i,j] - (i+a) / (a+b+2i+1) * B[i+1,j]
            B[i,j] = (a+b-1+2i)/(i+a+b) * shifted
        end
        B[1,j] -= (1+a) / (a+b+3) * B[2,j]
    end
    return B
end


# Apply `_jacobi_raise_b!(·, α, β)` along every dimension listed in `dims`.
# Dimension 1 acts on `x` directly; any other dimension acts on `x'`.
function _jacobi_raise_b!(x, α, β, dims)
    for d in dims
        _jacobi_raise_b!(d == 1 ? x : x', α, β)
    end
    return x
end
# Apply `_jacobi_raise_a!(·, α, β)` along every dimension listed in `dims`.
# Dimension 1 acts on `x` directly; any other dimension acts on `x'`.
function _jacobi_raise_a!(x, α, β, dims)
    for d in dims
        _jacobi_raise_a!(d == 1 ? x : x', α, β)
    end
    return x
end

# Apply `_jacobi_lower_b!(·, α, β - 1)` along every dimension listed in `dims`.
# Dimension 1 acts on `x` directly; any other dimension acts on `x'`.
function _jacobi_lower_b!(x, α, β, dims)
    for d in dims
        _jacobi_lower_b!(d == 1 ? x : x', α, β - 1)
    end
    return x
end
# Apply `_jacobi_lower_a!(·, α - 1, β)` along every dimension listed in `dims`.
# Dimension 1 acts on `x` directly; any other dimension acts on `x'`.
function _jacobi_lower_a!(x, α, β, dims)
    for d in dims
        _jacobi_lower_a!(d == 1 ? x : x', α - 1, β)
    end
    return x
end


# Move the parameters from `(α, β)` to `(γ, δ)` in steps of one, transforming
# `x` in place at each step. The second parameter is brought to `δ` first, then
# the first parameter to `γ`. Returns `x`.
function _jac2jac_integerinc!(x, α, β, γ, δ, dims)
    while true
        (α ≈ γ && β ≈ δ) && break
        if !(δ ≈ β) && δ > β
            _jacobi_raise_b!(x, α, β, dims)
            β += 1
        elseif !(δ ≈ β) && δ < β
            _jacobi_lower_b!(x, α, β, dims)
            β -= 1
        elseif !(γ ≈ α) && γ > α
            _jacobi_raise_a!(x, α, β, dims)
            α += 1
        else
            # Only lowering the first parameter remains.
            @assert γ < α
            _jacobi_lower_a!(x, α, β, dims)
            α -= 1
        end
    end
    return x
end


###
# other routines
###

# Generate the convenience methods shared by `th_leg2cheb`, `th_cheb2leg` and
# `th_leg2chebu`.
for f in (:th_leg2cheb, :th_cheb2leg, :th_leg2chebu)
    plan = Symbol("plan_", f, "!")
    @eval begin
        # Plan from an example array: use its element type and size.
        $plan(arr::AbstractArray{T}, dims...) where T = $plan(T, size(arr), dims...)
        # No dimensions given: act along all `N` dimensions.
        $plan(::Type{S}, mn::NTuple{N,Int}) where {S,N} = $plan(S, mn, ntuple(identity,Val(N)))
        # Out-of-place transform: plan, then apply to a copy of the input.
        $f(v, dims...) = $plan(eltype(v), size(v), dims...)*copy(v)
    end
end

# Convenience methods for `plan_th_ultra2ultra!`.
# A unit range of dimensions is converted to a tuple.
plan_th_ultra2ultra!(::Type{S}, mn::NTuple{N,Int}, λ₁, λ₂, dims::UnitRange) where {N,S} = plan_th_ultra2ultra!(S, mn, λ₁, λ₂, tuple(dims...))
# One-dimensional data: the 1-tuple of dimensions (default `(1,)`) is unwrapped to an `Int`.
plan_th_ultra2ultra!(::Type{S}, mn::Tuple{Int}, λ₁, λ₂, dims::Tuple{Int}=(1,)) where {S} = plan_th_ultra2ultra!(S, mn, λ₁, λ₂, dims...)
# Two-dimensional data without explicit dimensions: act along both.
plan_th_ultra2ultra!(::Type{S}, (m,n)::NTuple{2,Int}, λ₁, λ₂) where {S} = plan_th_ultra2ultra!(S, (m,n), λ₁, λ₂, (1,2))
# Plan from an example array: use its element type and size.
plan_th_ultra2ultra!(arr::AbstractArray{T}, λ₁, λ₂, dims...) where T = plan_th_ultra2ultra!(T, size(arr), λ₁, λ₂, dims...)
# Out-of-place transform: plan, then apply to a copy of the input.
th_ultra2ultra(v, λ₁, λ₂, dims...) = plan_th_ultra2ultra!(eltype(v), size(v), λ₁, λ₂, dims...)*copy(v)

# Convenience methods for `plan_th_jac2jac!`.
# A unit range of dimensions is converted to a tuple.
plan_th_jac2jac!(::Type{S}, mn::NTuple{N,Int}, α, β, γ, δ, dims::UnitRange) where {N,S} = plan_th_jac2jac!(S, mn, α, β, γ, δ, tuple(dims...))
# One-dimensional data: the 1-tuple of dimensions (default `(1,)`) is unwrapped to an `Int`.
plan_th_jac2jac!(::Type{S}, mn::Tuple{Int}, α, β, γ, δ, dims::Tuple{Int}=(1,)) where {S} = plan_th_jac2jac!(S, mn, α, β, γ, δ, dims...)
# Two-dimensional data without explicit dimensions: act along both.
plan_th_jac2jac!(::Type{S}, (m,n)::NTuple{2,Int}, α, β, γ, δ) where {S} = plan_th_jac2jac!(S, (m,n), α, β, γ, δ, (1,2))
# Plan from an example array: use its element type and size.
plan_th_jac2jac!(arr::AbstractArray{T}, α, β, γ, δ, dims...) where T = plan_th_jac2jac!(T, size(arr), α, β, γ, δ, dims...)
# Out-of-place transform: plan, then apply to a copy of the input.
th_jac2jac(v, α, β, γ, δ, dims...) = plan_th_jac2jac!(eltype(v), size(v), α, β, γ, δ, dims...)*copy(v)


####
# cheb2jac
####

# Chebyshev-to-Jacobi plan: its `*` method rescales the coefficients and then
# applies the wrapped Jacobi-to-Jacobi plan.
struct Cheb2JacPlanTH{T, Pl<:Jac2JacPlanTH{T}} <: Plan{T}
    jac2jac::Pl # wrapped Jacobi-to-Jacobi plan
end


# Jacobi-to-Chebyshev plan: its `*` method applies the wrapped Jacobi-to-Jacobi
# plan and then rescales the coefficients.
struct Jac2ChebPlanTH{T, Pl<:Jac2JacPlanTH{T}} <: Plan{T}
    jac2jac::Pl # wrapped Jacobi-to-Jacobi plan
end


# Return the three-term recurrence coefficient sequences `(A, B, C)` for indices
# `0:N`, with `B` identically zero. They are passed to `forwardrecurrence` by
# the Chebyshev/Jacobi plans to obtain the per-coefficient scaling.
function jac_cheb_recurrencecoefficients(T, N)
    n = 0:N
    h = one(T)/2
    A = (2n .+ one(T)) ./ (n .+ one(T))
    A[1] /= 2
    B = Zeros(n)
    C = ((n .- h) .* (n .- h) .* (2n .+ one(T))) ./ ((n .+ one(T)) .* n .* (2n .- one(T)))
    return A, B, C
end


# Apply the Chebyshev-to-Jacobi plan to `X` in place. Along every acted-on
# dimension the coefficients are first divided by the values `p` produced by
# `forwardrecurrence` at `one(T)`; the wrapped Jacobi-to-Jacobi plan is applied
# afterwards. Only dimensions 1 and 2 are supported.
function *(P::Cheb2JacPlanTH{T}, X::AbstractArray) where T
    A,B,C = jac_cheb_recurrencecoefficients(T, max(size(X)...))

    for d in P.jac2jac.dims
        if d == 1
            p = forwardrecurrence(size(X,1), A,B,C, one(T))
            X .= p .\ X
        else
            @assert d == 2
            p = forwardrecurrence(size(X,2), A,B,C, one(T))
            X .= X ./ transpose(p)
        end
    end
    P.jac2jac*X
end

# Apply the Jacobi-to-Chebyshev plan to `X`. The wrapped Jacobi-to-Jacobi plan
# is applied first; along every acted-on dimension the coefficients are then
# multiplied by the values `p` produced by `forwardrecurrence` at `one(T)`.
# Only dimensions 1 and 2 are supported.
function *(P::Jac2ChebPlanTH{T}, X::AbstractArray) where T
    X = P.jac2jac*X
    A,B,C = jac_cheb_recurrencecoefficients(T, max(size(X)...))

    for d in P.jac2jac.dims
        if d == 1
            p = forwardrecurrence(size(X,1), A,B,C, one(T))
            X .= p .* X
        else
            @assert d == 2
            p = forwardrecurrence(size(X,2), A,B,C, one(T))
            X .= X .* transpose(p)
        end
    end
    X
end

# Chebyshev-to-Jacobi: a Jacobi-to-Jacobi plan starting from parameters (-1/2, -1/2).
plan_th_cheb2jac!(::Type{T}, mn, α, β, dims...) where T = Cheb2JacPlanTH(plan_th_jac2jac!(T, mn, -one(α)/2, -one(α)/2, α, β, dims...))
# Plan from an example array: use its element type and size.
plan_th_cheb2jac!(arr::AbstractArray{T}, α, β, dims...) where T = plan_th_cheb2jac!(T, size(arr), α, β, dims...)
# Out-of-place transform: plan, then apply to a copy of the input.
th_cheb2jac(v, α, β, dims...) = plan_th_cheb2jac!(eltype(v), size(v), α, β, dims...)*copy(v)

# Jacobi-to-Chebyshev: a Jacobi-to-Jacobi plan ending at parameters (-1/2, -1/2).
plan_th_jac2cheb!(::Type{T}, mn, α, β, dims...) where T = Jac2ChebPlanTH(plan_th_jac2jac!(T, mn, α, β, -one(α)/2, -one(α)/2, dims...))
# Plan from an example array: use its element type and size.
plan_th_jac2cheb!(arr::AbstractArray{T}, α, β, dims...) where T = plan_th_jac2cheb!(T, size(arr), α, β, dims...)
# Out-of-place transform: plan, then apply to a copy of the input.
th_jac2cheb(v, α, β, dims...) = plan_th_jac2cheb!(eltype(v), size(v), α, β, dims...)*copy(v)


================================================
FILE: src/toeplitzplans.jl
================================================
using FFTW
import FFTW: plan_r2r!


"""
    ToeplitzPlan

applies Toeplitz matrices fast along each dimension.

The plan stores, for every acted-on dimension, a vector that is multiplied
entrywise in Fourier space, together with a scratch array and the forward and
inverse FFT plans that act on it.
"""
struct ToeplitzPlan{T, N, Dims, S, VECS, P<:Plan{S}, Pi<:Plan{S}} <: Plan{T}
    vectors::VECS # Vector or Tuple of storage
    tmp::Array{S,N} # scratch array the data is copied into before transforming
    dft::P # forward FFT plan applied to `tmp`
    idft::Pi # inverse FFT plan applied to `tmp`
    dims::Dims # dimensions acted on
end

# Convenience constructor: only the element type `T` is given explicitly, the
# remaining type parameters are taken from the arguments.
ToeplitzPlan{T}(v, tmp::Array{S,N}, dft::Plan{S}, idft::Plan{S}, dims) where {T,S,N} = ToeplitzPlan{T,N,typeof(dims),S,typeof(v),typeof(dft), typeof(idft)}(v, tmp, dft, idft, dims)


# Return the sizes with the `d`-th entry `s` replaced by `(s + 1) ÷ 2`.
function divdimby2(d::Int, sz1, szs...)
    if isone(d)
        return ((sz1 + 1) ÷ 2, szs...)
    end
    return (sz1, divdimby2(d - 1, szs...)...)
end

# Return the sizes with the `d`-th entry `s` replaced by `max(0, 2s - 1)`.
function muldimby2(d::Int, sz1, szs...)
    if isone(d)
        return (max(0, 2sz1 - 1), szs...)
    end
    return (sz1, muldimby2(d - 1, szs...)...)
end

# Map the size of the padded scratch array back to the size of the data by
# applying `divdimby2` along every dimension in `dims`.
function toeplitzplan_size(dims, szs)
    return foldl((sz, d) -> divdimby2(d, sz...), dims; init=szs)
end

# Map the size of the data to the size of the padded scratch array by applying
# `muldimby2` along every dimension in `dims`.
function to_toeplitzplan_size(dims, szs)
    return foldl((sz, d) -> muldimby2(d, sz...), dims; init=szs)
end


# Size of the arrays the plan acts on, recovered from the padded scratch array.
function size(A::ToeplitzPlan)
    return toeplitzplan_size(A.dims, size(A.tmp))
end


# based on ToeplitzMatrices.jl
"""
    maybereal(::Type{T}, x)

Return `real(x)` when `T` is a subtype of `Real`, and `x` unchanged otherwise.
"""
function maybereal(::Type, x)
    return x
end
function maybereal(::Type{<:Real}, x)
    return real(x)
end

# Apply the Toeplitz plan to `X` in place and return `X`. The data is copied
# into the zero-filled scratch array, transformed, scaled along each acted-on
# dimension by the stored vectors, transformed back and copied out again.
function *(A::ToeplitzPlan{T,N}, X::AbstractArray{T,N}) where {T,N}
    isempty(X) && return X

    Y = A.tmp
    fill!(Y, zero(eltype(Y)))
    window = view(Y, axes(X)...)
    copyto!(window, X)

    # Fourier transform each dimension
    A.dft * Y

    # Multiply by a diagonal matrix along each dimension by permuting
    # to first dimension
    for (vc, d) in zip(A.vectors, A.dims)
        applydim!(v -> v .= vc .* v, Y, d, :)
    end

    # Transform back
    A.idft * Y

    # Real element types drop the imaginary part of the result.
    X .= maybereal.(T, window)
    return X
end


# Embed the length-`n` vector `v` into a complex vector of length
# `max(0, 2n - 1)`: `v[1]` comes first, followed by `n - 1` zeros, followed by
# `v[n], v[n-1], …, v[2]`.
function uppertoeplitz_padvec(v::AbstractVector{T}) where T
    n = length(v)
    S = complex(float(T))
    padded = zeros(S, max(0, 2n - 1))
    n == 0 && return padded
    padded[1] = v[1]
    for k in 1:n-1
        # k-th entry of `v` counted from the end
        padded[n + k] = v[end - k + 1]
    end
    return padded
end

# In-place FFT that leaves empty arrays untouched.
function safe_fft!(A)
    isempty(A) && return A
    return fft!(A)
end

# Fourier-space vector for dimension `d`: the first `szs[d]` entries of `v`,
# padded and transformed.
_uppertoeplitz_vec(v, d, szs) = safe_fft!(uppertoeplitz_padvec(v[1:szs[d]]))

# One Fourier-space vector per acted-on dimension, returned in a container that
# mirrors `dims` (a vector for a vector, a tuple for a tuple or a single `Int`).
uppertoeplitz_vecs(v, dims::AbstractVector, szs) = [_uppertoeplitz_vec(v, d, szs) for d in dims]
uppertoeplitz_vecs(v, dims::Tuple{}, szs) = ()
function uppertoeplitz_vecs(v, dims::Tuple, szs)
    return (_uppertoeplitz_vec(v, first(dims), szs), uppertoeplitz_vecs(v, tail(dims), szs)...)
end
uppertoeplitz_vecs(v, d::Int, szs) = (_uppertoeplitz_vec(v, d, szs),)


# allow FFT to work by making sure tmp is non-empty:
# an empty array is replaced by a similar array with one entry per dimension.
function safe_tmp(tmp::AbstractArray{<:Any,N}) where N
    isempty(tmp) || return tmp
    return similar(tmp, ntuple(_ -> 1, Val(N))...)
end

# Plan the application of the upper triangular Toeplitz matrix with first row
# `v` along the dimension(s) `dim` of arrays of size `szs`. By default all
# dimensions are acted on.
function plan_uppertoeplitz!(v::AbstractVector{T}, szs::NTuple{N,Int}, dim=ntuple(identity,Val(N))) where {T,N}
    S = complex(float(T))

    # Scratch array, padded along the acted-on dimensions.
    scratch = zeros(S, to_toeplitzplan_size(dim, szs)...)
    # The FFT plans are built on non-empty stand-ins when the scratch array is empty.
    forward = plan_fft!(safe_tmp(scratch), dim)
    backward = plan_ifft!(safe_tmp(similar(scratch)), dim)

    return ToeplitzPlan{float(T)}(uppertoeplitz_vecs(v, dim, szs), scratch, forward, backward, dim)
end

# Plan for vectors of the same length as `v`.
function plan_uppertoeplitz!(v::AbstractVector{T}) where T
    return plan_uppertoeplitz!(v, size(v))
end


================================================
FILE: test/arraystests.jl
================================================
using FastTransforms, Test
import FastTransforms: ArrayPlan, NDimsPlan

# Tests for `ArrayPlan` and `NDimsPlan`: each result is compared with the
# reference obtained by applying the underlying plan slice by slice.
@testset "Array transform"  begin
    @testset "ArrayPlan" begin
        c = randn(5,20,10)
        F = plan_cheb2leg(c)
        FT = ArrayPlan(F, c)

        @test size(FT) == size(c)

        # Reference: apply `F` to every slice along the third dimension.
        f = similar(c);
        for k in axes(c,3)
            f[:,:,k] = (F*c[:,:,k])
        end
        @test f ≈ FT*c
        @test c ≈ FT\f

        # Same check with the plan acting along the second dimension.
        F = plan_cheb2leg(Vector{Float64}(axes(c,2)))
        FT = ArrayPlan(F, c, (2,))
        for k in axes(c,3)
            f[:,:,k] = (F*c[:,:,k]')'
        end
        @test f ≈ FT*c
        @test c ≈ FT\f
    end

    @testset "NDimsPlan" begin
        # Acting along two dimensions of different length is expected to throw.
        c = randn(20,10,20)
        @test_throws ErrorException("Different size in dims axes not yet implemented in N-dimensional transform.") NDimsPlan(ArrayPlan(plan_cheb2leg(c), c), size(c), (1,2))        

        # Acting along a single dimension must agree with the plain plan and `ArrayPlan`.
        c = randn(5,20)
        F = plan_cheb2leg(c)
        FT = ArrayPlan(F, c)
        P = NDimsPlan(F, size(c), (1,))
        @test F*c ≈ FT*c ≈ P*c

        c = randn(20,20,5);
        F = plan_cheb2leg(c)
        FT = ArrayPlan(F, c)
        P = NDimsPlan(FT, size(c), (1,2))

        @test size(P) == size(c)

        # Reference: apply `F` along both leading dimensions of every slice.
        f = similar(c);
        for k in axes(f,3)
            f[:,:,k] = (F*(F*c[:,:,k])')'
        end
        @test f ≈ P*c
        @test c ≈ P\f

        # Acting along the two middle dimensions of a four-dimensional array.
        c = randn(5,10,10,60)
        F = plan_cheb2leg(randn(10))
        P = NDimsPlan(F, size(c), (2,3))
        f = similar(c)
        for i in axes(f,1), j in axes(f,4)
            f[i,:,:,j] = (F*(F*c[i,:,:,j])')'
        end
        @test f ≈ P*c
        @test c ≈ P\f
    end
end


================================================
FILE: test/chebyshevtests.jl
================================================
using FastTransforms, Test

@testset "Chebyshev transform"  begin
    @testset "Chebyshev points" begin
        # Omitting the element type is equivalent to asking for Float64.
        @test @inferred(chebyshevpoints(10)) == @inferred(chebyshevpoints(Float64, 10))
        @test @inferred(chebyshevpoints(10, Val(2))) == @inferred(chebyshevpoints(Float64, 10, Val(2)))

        npts = 20
        for T in (Float32, Float64, ComplexF32, ComplexF64)
            # Degenerate grids for the default kind.
            @test chebyshevpoints(T, 0) == T[]
            @test chebyshevpoints(T, 1) == T[0]

            # Closed-form expressions for both grids.
            @test @inferred(chebyshevpoints(T, npts)) == [sinpi(convert(T,npts-2k+1)/(2*npts)) for k=1:npts]
            @test @inferred(chebyshevpoints(T, npts, Val(2))) == [sinpi(convert(T,npts-2k+1)/(2*npts-2)) for k=1:npts]

            # Unsupported kinds and too-short Val(2) grids are rejected.
            @test_throws MethodError chebyshevpoints(npts, Val(-1))
            @test_throws ArgumentError chebyshevpoints(T, 0, Val(2))
            @test_throws ArgumentError chebyshevpoints(T, 1, Val(2))
        end
    end

    @testset "Chebyshev first kind points <-> first kind coefficients" begin
        for T in (Float32, Float64, ComplexF32, ComplexF64)
            n = 20
            p_1 = chebyshevpoints(T, n)
            f = exp.(p_1)
            g = @inferred(chebyshevtransform(f))
            @test g == chebyshevtransform!(copy(f))

            # Evaluate the expansion sum_k g[k+1]*cos(k*acos(x)) at a point.
            f̃ = x -> [cos(k*acos(x)) for k=0:n-1]' * g
            @test f̃(0.1) ≈ exp(T(0.1))
            @test @inferred(ichebyshevtransform(g)) ≈ ichebyshevtransform!(copy(g)) ≈ exp.(p_1)

            fcopy = copy(f)
            gcopy = copy(g)

            # Out-of-place forward plans leave their input untouched.
            P = @inferred(plan_chebyshevtransform(f))
            @test @inferred(P*f) == g
            @test f == fcopy
            @test_throws ArgumentError P * T[1,2]
            # Same plan with an explicit region; the size check must apply to it too.
            P2 = @inferred(plan_chebyshevtransform(f, Val(1), 1:1))
            @test @inferred(P2*f) == g
            @test_throws ArgumentError P2 * T[1,2]

            # In-place forward plans overwrite their input, so f is reset in between.
            P = @inferred(plan_chebyshevtransform!(f))
            @test @inferred(P*f) == g
            @test f == g
            @test_throws ArgumentError P * T[1,2]
            f .= fcopy
            P2 = @inferred(plan_chebyshevtransform!(f, 1:1))
            @test @inferred(P2*f) == g
            @test f == g
            @test_throws ArgumentError P2 * T[1,2]

            # Out-of-place inverse plans.
            Pi = @inferred(plan_ichebyshevtransform(g))
            @test @inferred(Pi*g) ≈ fcopy
            @test g == gcopy
            @test_throws ArgumentError Pi * T[1,2]
            Pi2 = @inferred(plan_ichebyshevtransform(g, 1:1))
            @test @inferred(Pi2*g) ≈ fcopy
            @test g == gcopy
            @test_throws ArgumentError Pi2 * T[1,2]

            # In-place inverse plans overwrite g, so g is reset in between.
            Pi = @inferred(plan_ichebyshevtransform!(g))
            @test @inferred(Pi*g) ≈ fcopy
            @test g ≈ fcopy
            g .= gcopy
            @test_throws ArgumentError Pi * T[1,2]
            Pi2 = @inferred(plan_ichebyshevtransform!(g, 1:1))
            @test @inferred(Pi2*g) ≈ fcopy
            @test g ≈ fcopy
            @test_throws ArgumentError Pi2 * T[1,2]

            # Length-one input is returned unchanged; the in-place variants return the same object.
            v = T[1]
            @test chebyshevtransform(v) == v
            @test ichebyshevtransform(v) == v
            @test chebyshevtransform!(v) === v
            @test ichebyshevtransform!(v) === v

            # Empty input behaves the same way.
            v = T[]
            @test chebyshevtransform(v) == v
            @test ichebyshevtransform(v) == v
            @test chebyshevtransform!(v) === v
            @test ichebyshevtransform!(v) === v
        end
    end
    # Round-trip and plan tests for the Val(2) variant of chebyshevtransform,
    # repeated for real and complex element types.
    @testset "Chebyshev second kind points <-> first kind coefficients" begin
        for T in (Float32, Float64, ComplexF32, ComplexF64)
            n = 20
            p_2 = chebyshevpoints(T, n, Val(2))
            f = exp.(p_2)
            g = @inferred(chebyshevtransform(f, Val(2)))
            @test g == chebyshevtransform!(copy(f), Val(2))

            # Evaluate the expansion sum_k g[k+1]*cos(k*acos(x)) at a point.
            f̃ = x -> [cos(k*acos(x)) for k=0:n-1]' * g
            @test f̃(0.1) ≈ exp(T(0.1))
            @test @inferred(ichebyshevtransform(g, Val(2))) ≈ ichebyshevtransform!(copy(g), Val(2)) ≈ exp.(p_2)

            # `\` on a forward plan must match the inverse plan element-for-element
            # (and vice versa); copies are used because the plans are in-place.
            P = @inferred(plan_chebyshevtransform!(f, Val(2)))
            Pi = @inferred(plan_ichebyshevtransform!(f, Val(2)))
            @test all(@inferred(P \ copy(f)) .=== Pi * copy(f))
            @test all(@inferred(Pi \ copy(g)) .=== P * copy(g))
            @test f ≈ P \ (P*copy(f)) ≈ P * (P\copy(f)) ≈ Pi \ (Pi*copy(f)) ≈ Pi * (Pi \ copy(f))

            fcopy = copy(f)
            gcopy = copy(g)

            # Out-of-place forward plans (with and without an explicit region) leave f untouched.
            P = @inferred(plan_chebyshevtransform(f, Val(2)))
            @test P*f == g
            @test f == fcopy
            @test_throws ArgumentError P * T[1,2]
            P = @inferred(plan_chebyshevtransform(f, Val(2), 1:1))
            @test P*f == g
            @test f == fcopy
            @test_throws ArgumentError P * T[1,2]

            # In-place forward plans overwrite f; it is restored from fcopy in between.
            P = @inferred(plan_chebyshevtransform!(f, Val(2)))
            @test P*f == g
            @test f == g
            @test_throws ArgumentError P * T[1,2]
            f .= fcopy
            P = @inferred(plan_chebyshevtransform!(f, Val(2), 1:1))
            @test P*f == g
            @test f == g
            @test_throws ArgumentError P * T[1,2]

            # Out-of-place inverse plans leave g untouched.
            Pi = @inferred(plan_ichebyshevtransform(g, Val(2)))
            @test Pi*g ≈ fcopy
            @test g == gcopy
            @test_throws ArgumentError Pi * T[1,2]
            Pi = @inferred(plan_ichebyshevtransform(g, Val(2), 1:1))
            @test Pi*g ≈ fcopy
            @test g == gcopy
            @test_throws ArgumentError Pi * T[1,2]

            # In-place inverse plans overwrite g; it is restored from gcopy in between.
            Pi = @inferred(plan_ichebyshevtransform!(g, Val(2)))
            @test Pi*g ≈ fcopy
            @test g ≈ fcopy
            @test_throws ArgumentError Pi * T[1,2]
            g .= gcopy
            Pi = @inferred(plan_ichebyshevtransform!(g, Val(2), 1:1))
            @test Pi*g ≈ fcopy
            @test g ≈ fcopy
            @test_throws ArgumentError Pi * T[1,2]

            # The Val(2) transforms reject inputs of length 0 and 1.
            @test_throws ArgumentError chebyshevtransform(T[1], Val(2))
            @test_throws ArgumentError ichebyshevtransform(T[1], Val(2))
            @test_throws ArgumentError chebyshevtransform(T[], Val(2))
            @test_throws ArgumentError ichebyshevtransform(T[], Val(2))
        end
    end

    # Round-trip and plan tests for chebyshevutransform with the default kind,
    # repeated for real and complex element types.
    @testset "Chebyshev first kind points <-> second kind coefficients" begin
        for T in (Float32, Float64, ComplexF32, ComplexF64)
            n = 20
            p_1 = chebyshevpoints(T, n)
            f = exp.(p_1)
            g = @inferred(chebyshevutransform(f))
            # The out-of-place transform must not have modified f.
            @test f ≈ exp.(p_1)

            # Evaluate the expansion sum_k g[k+1]*sin((k+1)θ)/sin(θ), θ = acos(x), at a point.
            f̃ = x -> [sin((k+1)*acos(x))/sin(acos(x)) for k=0:n-1]' * g
            @test f̃(0.1) ≈ exp(T(0.1))
            @test ichebyshevutransform(g) ≈ exp.(p_1)

            fcopy = copy(f)
            gcopy = copy(g)
            # Out-of-place forward plans (with and without an explicit region) leave f untouched.
            P = @inferred(plan_chebyshevutransform(f))
            @test P*f ≈ g
            @test f ≈ fcopy
            @test_throws ArgumentError P * T[1,2]
            P = @inferred(plan_chebyshevutransform(f, 1:1))
            @test P*f ≈ g
            @test f ≈ fcopy
            @test_throws ArgumentError P * T[1,2]

            # In-place forward plans overwrite f; it is restored from fcopy in between.
            P = @inferred(plan_chebyshevutransform!(f))
            @test P*f ≈ g
            @test f ≈ g
            @test_throws ArgumentError P * T[1,2]
            f .= fcopy
            # NOTE(review): this repeats the plan above verbatim; possibly a
            # region variant (`1:1`) was intended — confirm.
            P = @inferred(plan_chebyshevutransform!(f))
            @test P*f ≈ g
            @test f ≈ g
            @test_throws ArgumentError P * T[1,2]

            # Out-of-place inverse plans leave g untouched.
            Pi = @inferred(plan_ichebyshevutransform(g))
            @test Pi*g ≈ fcopy
            @test g == gcopy
            @test_throws ArgumentError Pi * T[1,2]
            Pi = @inferred(plan_ichebyshevutransform(g, 1:1))
            @test Pi*g ≈ fcopy
            @test g == gcopy
            @test_throws ArgumentError Pi * T[1,2]

            # In-place inverse plans overwrite g; it is restored from gcopy in between.
            Pi = @inferred(plan_ichebyshevutransform!(g))
            @test Pi*g ≈ fcopy
            @test g ≈ fcopy
            @test_throws ArgumentError Pi * T[1,2]
            g .= gcopy
            # NOTE(review): identical to the plan above; possibly a region
            # variant (`1:1`) was intended — confirm.
            Pi = @inferred(plan_ichebyshevutransform!(g))
            @test Pi*g ≈ fcopy
            @test g ≈ fcopy
            @test_throws ArgumentError Pi * T[1,2]

            # Length-one input is returned unchanged; the in-place variants return the same object.
            v = T[1]
            @test chebyshevutransform(v) == v
            @test ichebyshevutransform(v) == v
            @test chebyshevutransform!(v) === v
            @test ichebyshevutransform!(v) === v

            # Empty input behaves the same way.
            v = T[]
            @test chebyshevutransform(v) == v
            @test ichebyshevutransform(v) == v
            @test chebyshevutransform!(v) === v
            @test ichebyshevutransform!(v) === v
        end
    end
    # Round-trip and plan tests for the Val(2) variant of chebyshevutransform,
    # repeated for real and complex element types.
    @testset "Chebyshev second kind points <-> second kind coefficients" begin
        for T in (Float32, Float64, ComplexF32, ComplexF64)
            n = 20
            # Only the interior points of the Val(2) grid are used, so f has n-2 entries.
            p_2 = chebyshevpoints(T, n, Val(2))[2:end-1]
            f = exp.(p_2)
            g = @inferred(chebyshevutransform(f, Val(2)))

            # Evaluate the expansion sum_k g[k+1]*sin((k+1)θ)/sin(θ), θ = acos(x), at a point.
            f̃ = x -> [sin((k+1)*acos(x))/sin(acos(x)) for k=0:n-3]' * g
            @test f̃(0.1) ≈ exp(T(0.1))
            @test @inferred(ichebyshevutransform(g, Val(2))) ≈ f ≈ exp.(p_2)

            fcopy = copy(f)
            gcopy = copy(g)
            # Out-of-place forward plans (with and without an explicit region) leave f untouched.
            P = @inferred(plan_chebyshevutransform(f, Val(2)))
            @test @inferred(P*f) ≈ g
            @test f ≈ fcopy
            @test_throws ArgumentError P * T[1,2]
            P = @inferred(plan_chebyshevutransform(f, Val(2), 1:1))
            @test @inferred(P*f) ≈ g
            @test f ≈ fcopy
            @test_throws ArgumentError P * T[1,2]

            # In-place forward plans overwrite f; it is restored from fcopy in between.
            P = @inferred(plan_chebyshevutransform!(f, Val(2)))
            @test @inferred(P*f) ≈ g
            @test f ≈ g
            @test_throws ArgumentError P * T[1,2]
            f .= fcopy
            P = @inferred(plan_chebyshevutransform!(f, Val(2), 1:1))
            @test @inferred(P*f) ≈ g
            @test f ≈ g
            @test_throws ArgumentError P * T[1,2]

            # Out-of-place inverse plan leaves g untouched.
            Pi = @inferred(plan_ichebyshevutransform(g, Val(2)))
            @test @inferred(Pi*g) ≈ fcopy
            @test g ≈ gcopy
            @test_throws ArgumentError Pi * T[1,2]

            # In-place inverse plans overwrite g; it is restored from gcopy in between.
            Pi = @inferred(plan_ichebyshevutransform!(g, Val(2)))
            @test @inferred(Pi*g) ≈ fcopy
            @test g ≈ fcopy
            @test_throws ArgumentError Pi * T[1,2]
            g .= gcopy
            # NOTE(review): identical to the plan above; possibly a region
            # variant (`1:1`) was intended — confirm.
            Pi = @inferred(plan_ichebyshevutransform!(g, Val(2)))
            @test @inferred(Pi*g) ≈ fcopy
            @test g ≈ fcopy
            @test_throws ArgumentError Pi * T[1,2]

            # The Val(2) transforms reject inputs of length 0 and 1.
            @test_throws ArgumentError chebyshevutransform(T[1], Val(2))
            @test_throws ArgumentError ichebyshevutransform(T[1], Val(2))
            @test_throws ArgumentError chebyshevutransform(T[], Val(2))
            @test_throws ArgumentError ichebyshevutransform(T[], Val(2))
        end
    end

    # Matrix input: transforms along one dimension are compared against
    # column-by-column / row-by-row vector transforms, and the full transform
    # against the composition of the two single-dimension transforms.
    @testset "matrix" begin
        X = randn(4,5)
        @testset "chebyshevtransform" begin
            # Along dim 1 the reference stacks transformed columns; along dim 2
            # it stacks transformed rows and transposes.
            @test @inferred(chebyshevtransform(X,1)) ≈ @inferred(chebyshevtransform!(copy(X),1)) ≈ hcat(chebyshevtransform.([X[:,k] for k=axes(X,2)])...)
            @test chebyshevtransform(X,2) ≈ chebyshevtransform!(copy(X),2) ≈ hcat(chebyshevtransform.([X[k,:] for k=axes(X,1)])...)'
            @test @inferred(chebyshevtransform(X,Val(2),1)) ≈ @inferred(chebyshevtransform!(copy(X),Val(2),1)) ≈ hcat(chebyshevtransform.([X[:,k] for k=axes(X,2)],Val(2))...)
            @test chebyshevtransform(X,Val(2),2) ≈ chebyshevtransform!(copy(X),Val(2),2) ≈ hcat(chebyshevtransform.([X[k,:] for k=axes(X,1)],Val(2))...)'

            # Without dims, both dimensions are transformed.
            @test @inferred(chebyshevtransform(X)) ≈ @inferred(chebyshevtransform!(copy(X))) ≈ chebyshevtransform(chebyshevtransform(X,1),2)
            @test @inferred(chebyshevtransform(X,Val(2))) ≈ @inferred(chebyshevtransform!(copy(X),Val(2))) ≈ chebyshevtransform(chebyshevtransform(X,Val(2),1),Val(2),2)
        end

        @testset "ichebyshevtransform" begin
            @test @inferred(ichebyshevtransform(X,1)) ≈ @inferred(ichebyshevtransform!(copy(X),1)) ≈ hcat(ichebyshevtransform.([X[:,k] for k=axes(X,2)])...)
            @test ichebyshevtransform(X,2) ≈ ichebyshevtransform!(copy(X),2) ≈ hcat(ichebyshevtransform.([X[k,:] for k=axes(X,1)])...)'
            @test @inferred(ichebyshevtransform(X,Val(2),1)) ≈ @inferred(ichebyshevtransform!(copy(X),Val(2),1)) ≈ hcat(ichebyshevtransform.([X[:,k] for k=axes(X,2)],Val(2))...)
            @test ichebyshevtransform(X,Val(2),2) ≈ ichebyshevtransform!(copy(X),Val(2),2) ≈ hcat(ichebyshevtransform.([X[k,:] for k=axes(X,1)],Val(2))...)'

            @test @inferred(ichebyshevtransform(X)) ≈ @inferred(ichebyshevtransform!(copy(X))) ≈ ichebyshevtransform(ichebyshevtransform(X,1),2)
            @test @inferred(ichebyshevtransform(X,Val(2))) ≈ @inferred(ichebyshevtransform!(copy(X),Val(2))) ≈ ichebyshevtransform(ichebyshevtransform(X,Val(2),1),Val(2),2)

            # Forward and inverse transforms undo each other in either order.
            @test ichebyshevtransform(chebyshevtransform(X)) ≈ X
            @test chebyshevtransform(ichebyshevtransform(X)) ≈ X
        end

        @testset "chebyshevutransform" begin
            @test @inferred(chebyshevutransform(X,1)) ≈ @inferred(chebyshevutransform!(copy(X),1)) ≈ hcat(chebyshevutransform.([X[:,k] for k=axes(X,2)])...)
            @test chebyshevutransform(X,2) ≈ chebyshevutransform!(copy(X),2) ≈ hcat(chebyshevutransform.([X[k,:] for k=axes(X,1)])...)'
            @test @inferred(chebyshevutransform(X,Val(2),1)) ≈ @inferred(chebyshevutransform!(copy(X),Val(2),1)) ≈ hcat(chebyshevutransform.([X[:,k] for k=axes(X,2)],Val(2))...)
            @test chebyshevutransform(X,Val(2),2) ≈ chebyshevutransform!(copy(X),Val(2),2) ≈ hcat(chebyshevutransform.([X[k,:] for k=axes(X,1)],Val(2))...)'

            @test @inferred(chebyshevutransform(X)) ≈ @inferred(chebyshevutransform!(copy(X))) ≈ chebyshevutransform(chebyshevutransform(X,1),2)
            @test @inferred(chebyshevutransform(X,Val(2))) ≈ @inferred(chebyshevutransform!(copy(X),Val(2))) ≈ chebyshevutransform(chebyshevutransform(X,Val(2),1),Val(2),2)
        end

        @testset "ichebyshevutransform" begin
            @test @inferred(ichebyshevutransform(X,1)) ≈ @inferred(ichebyshevutransform!(copy(X),1)) ≈ hcat(ichebyshevutransform.([X[:,k] for k=axes(X,2)])...)
            @test ichebyshevutransform(X,2) ≈ ichebyshevutransform!(copy(X),2) ≈ hcat(ichebyshevutransform.([X[k,:] for k=axes(X,1)])...)'
            @test @inferred(ichebyshevutransform(X,Val(2),1)) ≈ @inferred(ichebyshevutransform!(copy(X),Val(2),1)) ≈ hcat(ichebyshevutransform.([X[:,k] for k=axes(X,2)],Val(2))...)
            @test ichebyshevutransform(X,Val(2),2) ≈ ichebyshevutransform!(copy(X),Val(2),2) ≈ hcat(ichebyshevutransform.([X[k,:] for k=axes(X,1)],Val(2))...)'

            @test @inferred(ichebyshevutransform(X)) ≈ @inferred(ichebyshevutransform!(copy(X))) ≈ ichebyshevutransform(ichebyshevutransform(X,1),2)
            @test @inferred(ichebyshevutransform(X,Val(2))) ≈ @inferred(ichebyshevutransform!(copy(X),Val(2))) ≈ ichebyshevutransform(ichebyshevutransform(X,Val(2),1),Val(2),2)

            # Forward and inverse transforms undo each other in either order.
            @test ichebyshevutransform(chebyshevutransform(X)) ≈ X
            @test chebyshevutransform(ichebyshevutransform(X)) ≈ X
        end

        # 1×1 input: Val(1) transforms return it unchanged, Val(2) transforms throw.
        X = randn(1,1)
        @test chebyshevtransform!(copy(X), Val(1)) == ichebyshevtransform!(copy(X), Val(1)) == X
        @test_throws ArgumentError chebyshevtransform!(copy(X), Val(2))
        @test_throws ArgumentError ichebyshevtransform!(copy(X), Val(2))
    end

    @testset "tensor" begin
        # 3-tensor input. Before each single-dimension assertion the reference
        # buffer X̃ is refilled by transforming every 1D fibre along that
        # dimension, so the fill loop and the assertion that follows belong together.
        @testset "3D" begin
            X = randn(4,5,6)
            X̃ = similar(X)
            @testset "chebyshevtransform" begin
                for k = axes(X,2), j = axes(X,3) X̃[:,k,j] = chebyshevtransform(X[:,k,j]) end
                @test @inferred(chebyshevtransform(X,1)) ≈ @inferred(chebyshevtransform!(copy(X),1)) ≈ X̃
                for k = axes(X,1), j = axes(X,3) X̃[k,:,j] = chebyshevtransform(X[k,:,j]) end
                @test chebyshevtransform(X,2) ≈ chebyshevtransform!(copy(X),2) ≈ X̃
                for k = axes(X,1), j = axes(X,2) X̃[k,j,:] = chebyshevtransform(X[k,j,:]) end
                @test chebyshevtransform(X,3) ≈ chebyshevtransform!(copy(X),3) ≈ X̃

                # Same three checks for the Val(2) variant.
                for k = axes(X,2), j = axes(X,3) X̃[:,k,j] = chebyshevtransform(X[:,k,j],Val(2)) end
                @test @inferred(chebyshevtransform(X,Val(2),1)) ≈ @inferred(chebyshevtransform!(copy(X),Val(2),1)) ≈ X̃
                for k = axes(X,1), j = axes(X,3) X̃[k,:,j] = chebyshevtransform(X[k,:,j],Val(2)) end
                @test chebyshevtransform(X,Val(2),2) ≈ chebyshevtransform!(copy(X),Val(2),2) ≈ X̃
                for k = axes(X,1), j = axes(X,2) X̃[k,j,:] = chebyshevtransform(X[k,j,:],Val(2)) end
                @test chebyshevtransform(X,Val(2),3) ≈ chebyshevtransform!(copy(X),Val(2),3) ≈ X̃

                # Without dims, all three dimensions are transformed.
                @test @inferred(chebyshevtransform(X)) ≈ @inferred(chebyshevtransform!(copy(X))) ≈ chebyshevtransform(chebyshevtransform(chebyshevtransform(X,1),2),3)
                @test @inferred(chebyshevtransform(X,Val(2))) ≈ @inferred(chebyshevtransform!(copy(X),Val(2))) ≈ chebyshevtransform(chebyshevtransform(chebyshevtransform(X,Val(2),1),Val(2),2),Val(2),3)
            end

            @testset "ichebyshevtransform" begin
                for k = axes(X,2), j = axes(X,3) X̃[:,k,j] = ichebyshevtransform(X[:,k,j]) end
                @test @inferred(ichebyshevtransform(X,1)) ≈ @inferred(ichebyshevtransform!(copy(X),1)) ≈ X̃
                for k = axes(X,1), j = axes(X,3) X̃[k,:,j] = ichebyshevtransform(X[k,:,j]) end
                @test ichebyshevtransform(X,2) ≈ ichebyshevtransform!(copy(X),2) ≈ X̃
                for k = axes(X,1), j = axes(X,2) X̃[k,j,:] = ichebyshevtransform(X[k,j,:]) end
                @test ichebyshevtransform(X,3) ≈ ichebyshevtransform!(copy(X),3) ≈ X̃

                for k = axes(X,2), j = axes(X,3) X̃[:,k,j] = ichebyshevtransform(X[:,k,j],Val(2)) end
                @test @inferred(ichebyshevtransform(X,Val(2),1)) ≈ @inferred(ichebyshevtransform!(copy(X),Val(2),1)) ≈ X̃
                for k = axes(X,1), j = axes(X,3) X̃[k,:,j] = ichebyshevtransform(X[k,:,j],Val(2)) end
                @test ichebyshevtransform(X,Val(2),2) ≈ ichebyshevtransform!(copy(X),Val(2),2) ≈ X̃
                for k = axes(X,1), j = axes(X,2) X̃[k,j,:] = ichebyshevtransform(X[k,j,:],Val(2)) end
                @test ichebyshevtransform(X,Val(2),3) ≈ ichebyshevtransform!(copy(X),Val(2),3) ≈ X̃

                @test @inferred(ichebyshevtransform(X)) ≈ @inferred(ichebyshevtransform!(copy(X))) ≈ ichebyshevtransform(ichebyshevtransform(ichebyshevtransform(X,1),2),3)
                @test @inferred(ichebyshevtransform(X,Val(2))) ≈ @inferred(ichebyshevtransform!(copy(X),Val(2))) ≈ ichebyshevtransform(ichebyshevtransform(ichebyshevtransform(X,Val(2),1),Val(2),2),Val(2),3)

                # Forward and inverse transforms undo each other in either order.
                @test ichebyshevtransform(chebyshevtransform(X)) ≈ X
                @test chebyshevtransform(ichebyshevtransform(X)) ≈ X
            end
        
            @testset "chebyshevutransform" begin
                for k = axes(X,2), j = axes(X,3) X̃[:,k,j] = chebyshevutransform(X[:,k,j]) end
                @test @inferred(chebyshevutransform(X,1)) ≈ @inferred(chebyshevutransform!(copy(X),1)) ≈ X̃
                for k = axes(X,1), j = axes(X,3) X̃[k,:,j] = chebyshevutransform(X[k,:,j]) end
                @test chebyshevutransform(X,2) ≈ chebyshevutransform!(copy(X),2) ≈ X̃
                for k = axes(X,1), j = axes(X,2) X̃[k,j,:] = chebyshevutransform(X[k,j,:]) end
                @test chebyshevutransform(X,3) ≈ chebyshevutransform!(copy(X),3) ≈ X̃

                for k = axes(X,2), j = axes(X,3) X̃[:,k,j] = chebyshevutransform(X[:,k,j],Val(2)) end
                @test @inferred(chebyshevutransform(X,Val(2),1)) ≈ @inferred(chebyshevutransform!(copy(X),Val(2),1)) ≈ X̃
                for k = axes(X,1), j = axes(X,3) X̃[k,:,j] = chebyshevutransform(X[k,:,j],Val(2)) end
                @test chebyshevutransform(X,Val(2),2) ≈ chebyshevutransform!(copy(X),Val(2),2) ≈ X̃
                for k = axes(X,1), j = axes(X,2) X̃[k,j,:] = chebyshevutransform(X[k,j,:],Val(2)) end
                @test chebyshevutransform(X,Val(2),3) ≈ chebyshevutransform!(copy(X),Val(2),3) ≈ X̃

                @test @inferred(chebyshevutransform(X)) ≈ @inferred(chebyshevutransform!(copy(X))) ≈ chebyshevutransform(chebyshevutransform(chebyshevutransform(X,1),2),3)
                @test @inferred(chebyshevutransform(X,Val(2))) ≈ @inferred(chebyshevutransform!(copy(X),Val(2))) ≈ chebyshevutransform(chebyshevutransform(chebyshevutransform(X,Val(2),1),Val(2),2),Val(2),3)
            end

            @testset "ichebyshevutransform" begin
                for k = axes(X,2), j = axes(X,3) X̃[:,k,j] = ichebyshevutransform(X[:,k,j]) end
                @test @inferred(ichebyshevutransform(X,1)) ≈ @inferred(ichebyshevutransform!(copy(X),1)) ≈ X̃
                for k = axes(X,1), j = axes(X,3) X̃[k,:,j] = ichebyshevutransform(X[k,:,j]) end
                @test ichebyshevutransform(X,2) ≈ ichebyshevutransform!(copy(X),2) ≈ X̃
                for k = axes(X,1), j = axes(X,2) X̃[k,j,:] = ichebyshevutransform(X[k,j,:]) end
                @test ichebyshevutransform(X,3) ≈ ichebyshevutransform!(copy(X),3) ≈ X̃

                for k = axes(X,2), j = axes(X,3) X̃[:,k,j] = ichebyshevutransform(X[:,k,j],Val(2)) end
                @test @inferred(ichebyshevutransform(X,Val(2),1)) ≈ @inferred(ichebyshevutransform!(copy(X),Val(2),1)) ≈ X̃
                for k = axes(X,1), j = axes(X,3) X̃[k,:,j] = ichebyshevutransform(X[k,:,j],Val(2)) end
                @test ichebyshevutransform(X,Val(2),2) ≈ ichebyshevutransform!(copy(X),Val(2),2) ≈ X̃
                for k = axes(X,1), j = axes(X,2) X̃[k,j,:] = ichebyshevutransform(X[k,j,:],Val(2)) end
                @test ichebyshevutransform(X,Val(2),3) ≈ ichebyshevutransform!(copy(X),Val(2),3) ≈ X̃

                @test @inferred(ichebyshevutransform(X)) ≈ @inferred(ichebyshevutransform!(copy(X))) ≈ ichebyshevutransform(ichebyshevutransform(ichebyshevutransform(X,1),2),3)
                @test @inferred(ichebyshevutransform(X,Val(2))) ≈ @inferred(ichebyshevutransform!(copy(X),Val(2))) ≈ ichebyshevutransform(ichebyshevutransform(ichebyshevutransform(X,Val(2),1),Val(2),2),Val(2),3)

                # Forward and inverse transforms undo each other in either order.
                @test ichebyshevutransform(chebyshevutransform(X)) ≈ X
                @test chebyshevutransform(ichebyshevutransform(X)) ≈ X
            end

            # 1×1×1 input: Val(1) transforms return it unchanged, Val(2) transforms throw.
            X = randn(1,1,1)
            @test chebyshevtransform!(copy(X), Val(1)) == ichebyshevtransform!(copy(X), Val(1)) == X
            @test_throws ArgumentError chebyshevtransform!(copy(X), Val(2))
            @test_throws ArgumentError ichebyshevtransform!(copy(X), Val(2))
        end

        @testset "4D" begin
            A = randn(2,3,4,5)
            expected = similar(A)
            for transform in (chebyshevtransform, ichebyshevtransform, chebyshevutransform, ichebyshevutransform)
                # Reference for dims = 1: transform every fibre along the first dimension.
                for j = axes(A,2), k = axes(A,3), l = axes(A,4)
                    expected[:,j,k,l] = transform(A[:,j,k,l])
                end
                @test @inferred(transform(A,1)) ≈ expected
                # Without dims, the transform equals the composition over all four dimensions.
                @test @inferred(transform(A)) ≈ transform(transform(transform(transform(A,1),2),3),4)
            end
        end
    end
    @testset "Integer" begin
        # Integer input must give the same result as the equivalent Float64
        # input, both with the default kind and with Val(2).
        for transform in (chebyshevtransform, ichebyshevtransform,
                          chebyshevutransform, ichebyshevutransform)
            @test transform([1,2,3]) == transform([1.,2,3])
            @test transform([1,2,3], Val(2)) == transform([1.,2,3], Val(2))
        end
    end

    @testset "BigFloat" begin
        v = BigFloat[1,2,3]
        @test ichebyshevtransform(chebyshevtransform(v)) ≈ v
        # Out-of-place plans agree with the direct transforms.
        @test plan_chebyshevtransform(v) * v ≈ chebyshevtransform(v)
        @test plan_ichebyshevtransform(v) * v ≈ ichebyshevtransform(v)
        # In-place plans are applied to a copy so that v is not overwritten.
        @test plan_chebyshevtransform!(v) * copy(v) ≈ chebyshevtransform(v)
        @test plan_ichebyshevtransform!(v) * copy(v) ≈ ichebyshevtransform(v)
    end
    @testset "BigInt" begin
        # Round trip on integers of magnitude 10^400.
        v = BigInt[1,2,3] .+ big(10)^400
        @test ichebyshevtransform(chebyshevtransform(v)) ≈ v
    end

    @testset "immutable vectors" begin
        # Ranges are accepted both by the direct transforms and by a plan.
        plan = plan_chebyshevtransform([1.,2,3])
        @test chebyshevtransform(1.0:3) == plan * (1:3)
        @test ichebyshevtransform(1.0:3) == ichebyshevtransform(collect(1.0:3))
    end

    @testset "inv" begin
        # `\` must invert `*` (in both orders) for every vector plan.
        x = randn(5)
        vectorplans = (
            plan_chebyshevtransform(x), plan_chebyshevtransform(x, Val(2)),
            plan_chebyshevutransform(x), plan_chebyshevutransform(x, Val(2)),
            plan_ichebyshevtransform(x), plan_ichebyshevtransform(x, Val(2)),
            plan_ichebyshevutransform(x), plan_ichebyshevutransform(x, Val(2)),
        )
        for F in vectorplans
            @test F \ (F*x) ≈ F * (F\x) ≈ x
        end

        # Matrix plans of the chebyshevtransform family along either dimension.
        X = randn(5,4)
        matrixplans = (
            plan_chebyshevtransform(X,Val(1),1), plan_chebyshevtransform(X, Val(2),1),
            plan_chebyshevtransform(X,Val(1),2), plan_chebyshevtransform(X, Val(2),2),
            plan_ichebyshevtransform(X,Val(1),1), plan_ichebyshevtransform(X, Val(2),1),
            plan_ichebyshevtransform(X,Val(1),2), plan_ichebyshevtransform(X, Val(2),2),
        )
        for F in matrixplans
            @test F \ (F*X) ≈ F * (F\X) ≈ X
        end

        # Matrix plans of the chebyshevutransform family along either dimension.
        matrixuplans = (
            plan_chebyshevutransform(X,Val(1),1), plan_chebyshevutransform(X, Val(2),1),
            plan_chebyshevutransform(X,Val(1),2), plan_chebyshevutransform(X, Val(2),2),
            plan_ichebyshevutransform(X,Val(1),1), plan_ichebyshevutransform(X, Val(2),1),
            plan_ichebyshevutransform(X,Val(1),2), plan_ichebyshevutransform(X, Val(2),2),
        )
        for F in matrixuplans
            @test F \ (F*X) ≈ F * (F\X) ≈ X
        end
    end

    @testset "incompatible shapes" begin
        # A plan created for a vector must refuse a matrix argument.
        for makeplan in (plan_chebyshevtransform, plan_ichebyshevtransform)
            @test_throws ErrorException makeplan(randn(5)) * randn(5,5)
        end
    end

    @testset "plan via size" begin
        # A plan built from an element type and a size tuple matches the direct transform.
        A = randn(3,4)
        plan = plan_chebyshevtransform(Float64, (3,4))
        @test plan * A == chebyshevtransform(A)
    end
end


================================================
FILE: test/gaunttests.jl
================================================
using FastTransforms, LinearAlgebra, Test

import FastTransforms: δ

@testset "Gaunt coefficients" begin
    # Table 2 of Y.-l. Xu, JCAM 85:53–65, 1997.
    table2 = (
        (0,2),(1,2),(1,8),(6,8),(3,18),
        (10,18),(5,25),(-23,25),(2,40),(-35,40),
        (28,62),(-42,62),(1,99),(90,99),(10,120),
        (80,120),(23,150),(88,150),
    )
    for (m,n) in table2
        # Relative deviation of the last coefficient from (-1)^m/(2n+1).
        @test norm(gaunt(m,n,-m,n)[end]./(big(-1.0)^m/(2n+1))-1, Inf) < 400eps()
    end

    # Table 3 of Y.-l. Xu, JCAM 85:53–65, 1997.
    table3 = (
        (0,1,0,5),(0,5,0,10),(0,9,0,10),(0,10,0,12),
        (0,11,0,15),(0,12,0,20),(0,20,0,45),(0,40,0,80),
        (0,45,0,100),(3,5,-3,6),(4,9,-4,15),(-8,18,8,23),
        (-10,20,10,30),(5,25,-5,45),(15,50,-15,60),(-28,68,28,75),
        (32,78,-32,88),(45,82,-45,100),
    )
    for (m,n,μ,ν) in table3
        # The coefficients must sum to δ(m,0).
        @test norm(sum(gaunt(m,n,μ,ν))-δ(m,0), Inf) < 15000eps()
    end
end


================================================
FILE: test/grammatrixtests.jl
================================================
using FastTransforms, BandedMatrices, LazyArrays, LinearAlgebra, Test

@testset "GramMatrix" begin
    n = 128
    for T in (Float32, Float64, BigFloat)
        # Off-diagonal of the normalized Legendre X with m entries.
        normlegoffdiag = m -> [sqrt(T(k)^2/(4*k^2-1)) for k in 1:m]

        # Gram matrix R'R with the (unnormalized) Legendre X: its Cholesky
        # factor must reproduce R.
        R = plan_leg2cheb(T, n; normcheb=true)*I
        X = Tridiagonal([T(k)/(2k-1) for k in 1:n-1], zeros(T, n), [T(k)/(2k+1) for k in 1:n-1]) # Legendre X
        W = GramMatrix(Symmetric(R'*R), X)
        @test issymmetric(W)
        @test isposdef(W)
        F = cholesky(W)
        @test F.L*F.L' ≈ Symmetric(R'*R)
        @test F.U ≈ R

        # Same check with both bases normalized and a SymTridiagonal X.
        R = plan_leg2cheb(T, n; normcheb=true, normleg=true)*I
        X = SymTridiagonal(zeros(T, n), normlegoffdiag(n-1)) # normalized Legendre X
        W = GramMatrix(Symmetric(R'*R), X)
        F = cholesky(W)
        @test F.L*F.L' ≈ Symmetric(R'*R)
        @test F.U ≈ R

        # Banded case: M is the leading n×n block of I+X^2+X^4 built from an
        # (n+b)×(n+b) X, so it has bandwidths (b, b).
        b = 4
        X = BandedMatrix(SymTridiagonal(zeros(T, n+b), normlegoffdiag(n+b-1))) # normalized Legendre X
        M = Symmetric((I+X^2+X^4)[1:n, 1:n])
        X = BandedMatrix(SymTridiagonal(zeros(T, n), normlegoffdiag(n-1))) # normalized Legendre X
        W = GramMatrix(M, X)
        @test bandwidths(W) == (b, b)
        F = cholesky(W)
        @test F.L*F.L' ≈ M

        X = BandedMatrix(SymTridiagonal(T[2k-1 for k in 1:n+b], T[-k for k in 1:n+b-1])) # Laguerre X, tests nonzero diagonal
        M = Symmetric((I+X^2+X^4)[1:n, 1:n])
        X = BandedMatrix(SymTridiagonal(T[2k-1 for k in 1:n], T[-k for k in 1:n-1])) # Laguerre X, tests nonzero diagonal
        W = GramMatrix(M, X)
        @test bandwidths(W) == (b, b)
        F = cholesky(W)
        @test F.L*F.L' ≈ M

        # Construction from moment vectors: the skew generators G must
        # satisfy X'W - W*X = G*J*G'.
        for μ in (PaddedVector([T(4)/3;0;-T(4)/15], 2n-1), # w(x) = 1-x^2
                  PaddedVector([T(26)/15;0;-T(4)/105;0;T(16)/315], 2n-1), # w(x) = 1-x^2+x^4
                  T(1) ./ (1:2n-1)) # Related to a log weight
            X = Tridiagonal([T(k)/(2k-1) for k in 1:2n-2], zeros(T, 2n-1), [T(k)/(2k+1) for k in 1:2n-2]) # Legendre X
            W = GramMatrix(μ, X)
            X = Tridiagonal(X[1:n, 1:n])
            G = FastTransforms.compute_skew_generators(W)
            J = T[0 1; -1 0]
            @test X'*W-W*X ≈ G*J*G'
        end
    end

    # Invalid constructor arguments are rejected with descriptive messages.
    W = reshape(collect(1.0:n^2), n, n)
    X = reshape(collect(1.0:4n^2), 2n, 2n)
    @test_throws "different sizes" GramMatrix(W, X)
    X = X[1:n, 1:n]
    @test_throws "nonsymmetric" GramMatrix(W, X)
    @test_throws "nontridiagonal" GramMatrix(Symmetric(W), X)
end

@testset "ChebyshevGramMatrix" begin
    n = 128
    for T in (Float32, Float64, BigFloat)
        # The upper Cholesky factor is compared against plan_cheb2leg(...)*I.
        W = ChebyshevGramMatrix(FastTransforms.chebyshevmoments1(T, 2n-1))
        F = cholesky(W)
        @test F.L*F.L' ≈ W
        R = plan_cheb2leg(T, n; normleg=true)*I
        @test F.U ≈ R

        # Same comparison for Jacobi parameters (α, β), against plan_cheb2jac(...)*I.
        α, β = T(0.123), T(0.456)
        W = ChebyshevGramMatrix(FastTransforms.chebyshevjacobimoments1(T, 2n-1, α, β))
        F = cholesky(W)
        @test F.L*F.L' ≈ W
        R = plan_cheb2jac(T, n, α, β; normjac=true)*I
        @test F.U ≈ R

        # For these moment vectors only the factorization identity is checked.
        for moments in (FastTransforms.chebyshevlogmoments1, FastTransforms.chebyshevabsmoments1)
            W = ChebyshevGramMatrix(moments(T, 2n-1))
            F = cholesky(W)
            @test F.L*F.L' ≈ W
        end

        # Five nonzero moments give bandwidths (4, 4); the padded and the
        # dense moment vectors must produce the same Cholesky factor.
        μ = PaddedVector(T(1) ./ [1,2,3,4,5], 2n-1)
        W = ChebyshevGramMatrix(μ)
        @test bandwidths(W) == (4, 4)
        F = cholesky(W)
        @test F.L*F.L' ≈ W
        Wd = ChebyshevGramMatrix(Vector{T}(μ))
        Fd = cholesky(Wd)
        @test F.L ≈ Fd.L

        # The skew generators G of the banded W must satisfy 2(X'W - W*X) = G*J*G'.
        X = Tridiagonal([T(1); ones(T, n-2)/2], zeros(T, n), ones(T, n-1)/2)
        G = FastTransforms.compute_skew_generators(W)
        J = T[0 1; -1 0]
        @test 2*(X'*W-W*X) ≈ G*J*G'
    end
end


================================================
FILE: test/hermitetests.jl
================================================
using FastTransforms, FastGaussQuadrature, Test

# First component of `FastGaussQuadrature.unweightedgausshermite(n)`.
function hermitepoints(n)
    return first(FastGaussQuadrature.unweightedgausshermite(n))
end

@testset "Hermite" begin
    @test hermitepoints(1) == [0.0]
    @test hermitepoints(100_000)[end] ≈ 446.9720305443094

    # exp(-x^2/2) sampled at the Hermite points transforms to the first unit vector.
    @test weightedhermitetransform([1.0]) == [1.0]
    for m in (2, 3, 1000, 3000)
        @test weightedhermitetransform(exp.(-hermitepoints(m).^2/2)) ≈ [1.0; zeros(m-1)]
    end

    # Forward and inverse transforms undo each other in either order.
    for m in (1, 5, 100)
        v = randn(m)
        @test iweightedhermitetransform(weightedhermitetransform(v)) ≈ v
        @test weightedhermitetransform(iweightedhermitetransform(v)) ≈ v
    end

    # The inverse transform of the k-th unit coefficient vector is compared
    # with a closed-form expression at the 100 Hermite points.
    x = hermitepoints(100)
    gauss = exp.(-x.^2 ./ 2)
    unitcoeff = k -> [zeros(k); 1.0; zeros(99-k)]
    @test iweightedhermitetransform(unitcoeff(1)) ≈ (gauss .* 2x/sqrt(2))
    @test iweightedhermitetransform(unitcoeff(2)) ≈ (gauss .* (4x.^2 .- 2)/(sqrt(2)*2^(2/2)))
    @test iweightedhermitetransform(unitcoeff(3)) ≈ (gauss .* (-12x + 8x.^3) / (sqrt(2*3)*2^(3/2)))
    @test iweightedhermitetransform(unitcoeff(4)) ≈ (gauss .* (12 .- 48x.^2 .+ 16x.^4) / (sqrt(2*3*4)*2^(4/2)))
end


================================================
FILE: test/libfasttransformstests.jl
================================================
using FastTransforms, Test

# Run the library with half of the available CPU threads, rounded up.
FastTransforms.ft_set_num_threads(cld(Base.Sys.CPU_THREADS, 2))

@testset "libfasttransforms" begin
    n = 64
    for T in (Float32, Float64)
        coeffs = one(T) ./ (1:n)
        pts = collect(-1 .+ 2*(0:n-1)/T(n))
        vals = similar(pts)

        # Monomial series: `horner!` must return its output buffer and agree with
        # the explicit power sum.
        @test FastTransforms.horner!(vals, coeffs, pts) == vals
        ref = T[sum(coeffs[k]*t^(k-1) for k in 1:length(coeffs)) for t in pts]
        @test vals ≈ ref

        # Chebyshev series: `clenshaw!` against the explicit cosine sum.
        @test FastTransforms.clenshaw!(vals, coeffs, pts) == vals
        ref = T[sum(coeffs[k]*cos((k-1)*acos(t)) for k in 1:length(coeffs)) for t in pts]
        @test vals ≈ ref

        # General three-term-recurrence Clenshaw: after converting the coefficients
        # with `lib_cheb2leg`, the same values must be reproduced.
        ncfs = length(coeffs)
        A = T[(2k+one(T))/(k+one(T)) for k in 0:ncfs-1]
        B = zeros(T, ncfs)
        C = T[k/(k+one(T)) for k in 0:ncfs]
        phi0 = ones(T, length(pts))
        legcoeffs = FastTransforms.lib_cheb2leg(coeffs)
        @test FastTransforms.clenshaw!(vals, legcoeffs, A, B, C, pts, phi0) == vals
        @test vals ≈ ref
    end

    α, β, γ, δ, λ, μ, ρ = 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7
    # Check that `p2` undoes `p1`, and that for each plan the chain
    # p, p', transpose(p), transpose(p)\, p'\, p\ composes to the identity.
    # The checks run on the vector `x`, on views of it, and on the identity `I`.
    function test_1d_plans(p1, p2, x)
        # Wrap an array in a full-range view so the plans are exercised on SubArrays.
        asview(v) = view(v, :)

        # Apply the three forward operators and then their inverses in reverse order;
        # `wrap` is applied to every operand right before it is used.
        function cycle(p, v, wrap)
            w = p*wrap(v)
            w = p'*wrap(w)
            w = transpose(p)*wrap(w)
            w = transpose(p)\wrap(w)
            w = p'\wrap(w)
            return p\wrap(w)
        end

        @test p2*(p1*x) ≈ x
        @test p2*asview(p1*asview(x)) ≈ x
        for p in (p1, p2), wrap in (identity, asview)
            @test cycle(p, x, wrap) ≈ x
        end

        @test p2*(p1*I) ≈ I
        for p in (p1, p2)
            @test cycle(p, I, identity) ≈ I
        end
    end

    # Each forward plan is paired with the plan for the reverse conversion and the
    # pair is handed to `test_1d_plans`.
    for T in (Float32, Float64, Complex{Float32}, Complex{Float64}, BigFloat, Complex{BigFloat})
        coeffs = T(1)./(1:n)
        eye = Matrix{T}(I, n, n)
        plan_pairs = (
            (plan_leg2cheb(eye), plan_cheb2leg(eye)),
            (plan_ultra2ultra(eye, λ, μ), plan_ultra2ultra(eye, μ, λ)),
            (plan_jac2jac(eye, α, β, γ, δ), plan_jac2jac(eye, γ, δ, α, β)),
            (plan_lag2lag(eye, α, β), plan_lag2lag(eye, β, α)),
            (plan_jac2ultra(eye, α, β, λ), plan_ultra2jac(eye, λ, α, β)),
            (plan_jac2cheb(eye, α, β), plan_cheb2jac(eye, α, β)),
            (plan_ultra2cheb(eye, λ), plan_cheb2ultra(eye, λ)),
        )
        for (forward, backward) in plan_pairs
            test_1d_plans(forward, backward, coeffs)
        end
    end

    # Associated Jacobi plan: its dense form (plan applied to `I`) must match the plan
    # applied to an explicit identity matrix, and solving with it must invert the plan.
    for T in (Float32, Float64, Complex{Float32}, Complex{Float64})
        coeffs = T(1)./(1:n)
        eye = Matrix{T}(I, n, n)
        plan = plan_associatedjac2jac(eye, 1, α, β, γ, δ)
        dense = plan*I
        @test dense ≈ plan*eye
        @test dense\(plan*coeffs) ≈ coeffs
    end

    @testset "Modified classical orthonormal polynomial transforms" begin
        # `local` keeps these parameters private to this testset. A nested testset
        # shares the local scope of the enclosing one, so without the declaration this
        # assignment rebinds the outer `n`, `α` and `β`, and every test that runs after
        # this testset would silently use n = 16 and α = β = 0 instead of the values
        # set at the top of the enclosing testset.
        local n, α, β
        (n, α, β) = (16, 0, 0)
        for T in (Float32, Float64)
            # Jacobi: the same modification expressed with and without an explicit
            # denominator (P1 vs P2), and a rational modification (P3) factored into
            # a polynomial one (P2) and a reciprocal one (P4).
            P1 = plan_modifiedjac2jac(T, n, α, β, T[0.9428090415820636, -0.32659863237109055, -0.42163702135578396, 0.2138089935299396]) # u1(x) = (1-x)^2*(1+x)
            P2 = plan_modifiedjac2jac(T, n, α, β, T[0.9428090415820636, -0.32659863237109055, -0.42163702135578396, 0.2138089935299396], T[1.4142135623730951]) # u2(x) = (1-x)^2*(1+x)
            P3 = plan_modifiedjac2jac(T, n, α, β, T[-0.9428090415820636, 0.32659863237109055, 0.42163702135578396, -0.2138089935299396], T[-5.185449728701348, 0.0, 0.42163702135578374]) # u3(x) = -(1-x)^2*(1+x), v3(x) = -(2-x)*(2+x)
            P4 = plan_modifiedjac2jac(T, n, α+2, β+1, T[1.1547005383792517], T[4.387862045841156, 0.1319657758147716, -0.20865621238292037]) # v4(x) = (2-x)*(2+x)

            @test P1*I ≈ P2*I
            @test P1\I ≈ P2\I
            @test P3*I ≈ P2*(P4*I)
            @test P3\I ≈ P4\(P2\I)

            # Laguerre: same structure as above; the factored comparison uses a
            # looser relative tolerance.
            P5 = plan_modifiedlag2lag(T, n, α, T[2.0, -4.0, 2.0]) # u5(x) = x^2
            P6 = plan_modifiedlag2lag(T, n, α, T[2.0, -4.0, 2.0], T[1.0]) # u6(x) = x^2
            P7 = plan_modifiedlag2lag(T, n, α, T[2.0, -4.0, 2.0], T[7.0, -7.0, 2.0]) # u7(x) = x^2, v7(x) = (1+x)*(2+x)
            P8 = plan_modifiedlag2lag(T, n, α+2, T[sqrt(2.0)], T[sqrt(1058.0), -sqrt(726.0), sqrt(48.0)]) # v8(x) = (1+x)*(2+x)

            @test P5*I ≈ P6*I
            @test P5\I ≈ P6\I
            @test isapprox(P7*I, P6*(P8*I); rtol = eps(T)^(1/4))
            @test isapprox(P7\I, P8\(P6\I); rtol = eps(T)^(1/4))

            # Hermite: numerator and denominator coincide, so applying the plan and
            # solving with it must give the same matrix.
            P9 = plan_modifiedherm2herm(T, n, T[2.995504568550877, 0.0, 3.7655850551068593, 0.0, 1.6305461589167827], T[2.995504568550877, 0.0, 3.7655850551068593, 0.0, 1.6305461589167827]) # u9(x) = 1+x^2+x^4, v9(x) = 1+x^2+x^4

            @test P9*I ≈ P9\I
        end
    end

    # Round-trip `A` through transform `p`, synthesis `ps`, analysis `pa` and the
    # inverse of `p`, first with the plans themselves and then with their adjoints and
    # transposes. The output of each round trip is the input of the next one.
    function test_nd_plans(p, ps, pa, A)
        reference = copy(A)
        for op in (identity, adjoint, transpose)
            values = op(ps)*(op(p)*A)
            A = op(p)\(op(pa)*values)
            @test A ≈ reference
        end
    end

    # Every multivariate transform family is round-tripped on a real coefficient array
    # and then on its complexification (real array + im * real array).

    # Spherical harmonics
    for A in (sphones(Float64, n, 2n-1),
              sphones(Float64, n, 2n-1) + im*sphones(Float64, n, 2n-1))
        test_nd_plans(plan_sph2fourier(A), plan_sph_synthesis(A), plan_sph_analysis(A), A)
    end

    # Vector spherical harmonics
    for A in (sphvones(Float64, n, 2n-1),
              sphvones(Float64, n, 2n-1) + im*sphvones(Float64, n, 2n-1))
        test_nd_plans(plan_sphv2fourier(A), plan_sphv_synthesis(A), plan_sphv_analysis(A), A)
    end

    # Disk
    for A in (diskones(Float64, n, 4n-3),
              diskones(Float64, n, 4n-3) + im*diskones(Float64, n, 4n-3))
        test_nd_plans(plan_disk2cxf(A, α, β), plan_disk_synthesis(A), plan_disk_analysis(A), A)
    end

    # Annulus (uses the disk coefficient layout)
    for A in (diskones(Float64, n, 4n-3),
              diskones(Float64, n, 4n-3) + im*diskones(Float64, n, 4n-3))
        test_nd_plans(plan_ann2cxf(A, α, β, 0, ρ), plan_annulus_synthesis(A, ρ), plan_annulus_analysis(A, ρ), A)
    end

    # Rectangularized disk
    for A in (rectdiskones(Float64, n, n),
              rectdiskones(Float64, n, n) + im*rectdiskones(Float64, n, n))
        test_nd_plans(plan_rectdisk2cheb(A, β), plan_rectdisk_synthesis(A), plan_rectdisk_analysis(A), A)
    end

    # Triangle
    for A in (triones(Float64, n, n),
              triones(Float64, n, n) + im*triones(Float64, n, n))
        test_nd_plans(plan_tri2cheb(A, α, β, γ), plan_tri_synthesis(A), plan_tri_analysis(A), A)
    end

    # Tetrahedron, with negative parameters
    α, β, γ, δ = -0.1, -0.2, -0.3, -0.4
    for A in (tetones(Float64, n, n, n),
              tetones(Float64, n, n, n) + im*tetones(Float64, n, n, n))
        test_nd_plans(plan_tet2cheb(A, α, β, γ, δ), plan_tet_synthesis(A), plan_tet_analysis(A), A)
    end

    # Spin-weighted spherical harmonics (complex only), spin 2
    let A = spinsphones(Complex{Float64}, n, 2n-1, 2) + im*spinsphones(Complex{Float64}, n, 2n-1, 2)
        test_nd_plans(plan_spinsph2fourier(A, 2), plan_spinsph_synthesis(A, 2), plan_spinsph_analysis(A, 2), A)
    end
end

@testset "ultra2ulta bug and cheb2leg normalisation (#202, #203)" begin
    # A fresh copy of the coefficient vector [0, 1] for every call.
    second_unit() = [0.0, 1.0]

    # Converting between identical ultraspherical parameters must be exact.
    @test ultra2ultra(second_unit(), 1, 1) == [0,1]
    # Normalisation keywords rescale the degree-1 coefficient.
    @test cheb2leg(second_unit(), normcheb=true) ≈ [0.,sqrt(2/π)]
    @test cheb2leg(second_unit(), normleg=true) ≈ [0.,sqrt(2/3)]
end

================================================
FILE: test/nuffttests.jl
================================================
using FFTW, FastTransforms, LinearAlgebra, Test

# Let FFTW use half of the available CPU threads, rounded up.
FFTW.set_num_threads(cld(Sys.CPU_THREADS, 2))

@testset "Nonuniform fast Fourier transforms" begin
    function nudft1(c::AbstractVector, ω::AbstractVector{T}) where {T<:AbstractFloat}
        # Nonuniform discrete Fourier transform of type I, evaluated directly as a
        # reference for the fast transform:
        #     output[j] = Σₖ exp(-2πi (j-1)/n ω[k]) c[k],   j = 1, …, n = length(ω).
        # `dot` conjugates its first argument, which supplies the minus sign.
        #
        # The length is deliberately not called `N`: this function is a closure inside
        # the testset, which assigns its own `N`, so an assignment `N = ...` here would
        # rebind the enclosing variable on every call.
        npts = size(ω, 1)
        output = zero(c)
        for j = 1:npts
            output[j] = dot(exp.(2*T(π)*im*(j-1)/npts*ω), c)
        end

        return output
    end

    function nudft2(c::AbstractVector, x::AbstractVector{T}) where {T<:AbstractFloat}
        # Nonuniform discrete Fourier transform of type II, evaluated directly as a
        # reference for the fast transform:
        #     output[j] = Σₖ exp(-2πi x[j] (k-1)) c[k],   j = 1, …, n = length(x).
        # `dot` conjugates its first argument, which supplies the minus sign.
        #
        # The length is deliberately not called `N`: this function is a closure inside
        # the testset, which assigns its own `N`, so an assignment `N = ...` here would
        # rebind the enclosing variable on every call.
        npts = size(x, 1)
        output = zero(c)
        freqs = collect(0:npts-1)
        for j = 1:npts
            output[j] = dot(exp.(2*T(π)*im*x[j]*freqs), c)
        end

        return output
    end

    function nudft3(c::AbstractVector, x::AbstractVector{T}, ω::AbstractVector{T}) where {T<:AbstractFloat}
        # Nonuniform discrete Fourier transform of type III, evaluated directly as a
        # reference for the fast transform:
        #     output[j] = Σₖ exp(-2πi x[j] ω[k]) c[k],   j = 1, …, n = length(x).
        # `dot` conjugates its first argument, which supplies the minus sign.
        #
        # The length is deliberately not called `N`: this function is a closure inside
        # the testset, which assigns its own `N`, so an assignment `N = ...` here would
        # rebind the enclosing variable on every call.
        npts = size(x, 1)
        output = zero(c)
        for j = 1:npts
            output[j] = dot(exp.(2*T(π)*im*x[j]*ω), c)
        end

        return output
    end

    N = round.(Int, 10 .^ range(1,stop=3,length=10))

    # For every problem size and requested tolerance, compare each fast transform with
    # the direct evaluation, and check that the inverse transforms recover the input.
    for n in N
        for ϵ in (1e-4, 1e-8, 1e-12, eps(Float64))
            c = complex(rand(n))
            tol = 500*ϵ*n*norm(c)

            # Type I: perturbed integer frequencies.
            ω = collect(0:n-1) + 0.25*rand(n)
            reference = nudft1(c, ω)
            approx = nufft1(c, ω, ϵ)
            @test norm(reference - approx, Inf) < tol
            @test norm(c - inufft1(approx, ω, ϵ), Inf) < tol

            # Type II: perturbed equispaced points.
            x = (collect(0:n-1) + 0.25*rand(n))/n
            reference = nudft2(c, x)
            approx = nufft2(c, x, ϵ)
            @test norm(reference - approx, Inf) < tol
            @test norm(c - inufft2(approx, x, ϵ), Inf) < tol

            # Type III: both perturbed.
            reference = nudft3(c, x, ω)
            approx = nufft3(c, x, ω, ϵ)
            @test norm(reference - approx, Inf) < tol
        end
    end

    # On uniform points and integer frequencies the nonuniform transforms must
    # reduce to the ordinary fft.
    for ϵ in (eps(Float64), 0.0)
        n = 1000
        signal = complex(rand(n))
        freqs = collect(0.0:n-1)
        nodes = freqs/n
        reference = fft(signal)
        tol = 500*eps(Float64)*norm(signal)

        # Exact agreement is only asserted on 64-bit systems.
        if Sys.WORD_SIZE == 64
            @test_skip norm(nufft1(signal, freqs, ϵ) - reference) == 0 # skip because fftw3 seems to change this
            @test norm(nufft2(signal, nodes, ϵ) - reference) == 0
            @test_skip norm(nufft3(signal, nodes, freqs, ϵ) - reference) == 0 # skip because fftw3 seems to change this
        end

        # Agreement up to rounding is required everywhere.
        @test norm(nufft1(signal, freqs, ϵ) - reference) < tol
        @test norm(nufft2(signal, nodes, ϵ) - reference) < tol
        @test norm(nufft3(signal, nodes, freqs, ϵ) - reference) < tol
    end

    function nudft1(C::Matrix{Complex{T}}, ω1::AbstractVector{T}, ω2::AbstractVector{T}) where {T<:AbstractFloat}
        # Nonuniform discrete Fourier transform of type I-I, evaluated directly:
        #     output[j1,j2] = Σ_{k1,k2} exp(-2πi ((j1-1)/m ω1[k1] + (j2-1)/n ω2[k2])) C[k1,k2]
        # with (m, n) = size(C).
        #
        # The dimensions are deliberately not called `M, N`: this function is a closure
        # inside the testset, which assigns its own `N`, so an assignment to `N` here
        # would rebind the enclosing variable on every call.
        nrows, ncols = size(C)
        # The loops below run under @inbounds, so the frequency vectors are validated first.
        length(ω1) == nrows || throw(DimensionMismatch("length(ω1) = $(length(ω1)) does not match size(C, 1) = $nrows"))
        length(ω2) == ncols || throw(DimensionMismatch("length(ω2) = $(length(ω2)) does not match size(C, 2) = $ncols"))
        output = zero(C)
        @inbounds for j1 = 1:nrows, j2 = 1:ncols
            for k1 = 1:nrows, k2 = 1:ncols
                output[j1,j2] += exp(-2*T(π)*im*((j1-1)/nrows*ω1[k1]+(j2-1)/ncols*ω2[k2]))*C[k1,k2]
            end
        end
        return output
    end

    function nudft2(C::Matrix{Complex{T}}, x::AbstractVector{T}, y::AbstractVector{T}) where {T<:AbstractFloat}
        # Nonuniform discrete Fourier transform of type II-II, evaluated directly:
        #     output[j1,j2] = Σ_{k1,k2} exp(-2πi (x[j1] (k1-1) + y[j2] (k2-1))) C[k1,k2]
        # with (m, n) = size(C).
        #
        # The dimensions are deliberately not called `M, N`: this function is a closure
        # inside the testset, which assigns its own `N`, so an assignment to `N` here
        # would rebind the enclosing variable on every call.
        nrows, ncols = size(C)
        # The loops below run under @inbounds, so the point vectors are validated first.
        length(x) == nrows || throw(DimensionMismatch("length(x) = $(length(x)) does not match size(C, 1) = $nrows"))
        length(y) == ncols || throw(DimensionMismatch("length(y) = $(length(y)) does not match size(C, 2) = $ncols"))
        output = zero(C)
        @inbounds for j1 = 1:nrows, j2 = 1:ncols
            for k1 = 1:nrows, k2 = 1:ncols
                output[j1,j2] += exp(-2*T(π)*im*(x[j1]*(k1-1)+y[j2]*(k2-1)))*C[k1,k2]
            end
        end
        return output
    end

    N = round.(Int, 10 .^ range(1,stop=1.7,length=5))

    # Two-dimensional transforms: compare the fast versions with direct evaluation
    # for every problem size and requested tolerance.
    for n in N
        for ϵ in (1e-4,1e-8,1e-12,eps(Float64))
            C = complex(rand(n,n))
            tol = 500*ϵ*n*norm(C)

            # Perturbed equispaced points and perturbed integer frequencies.
            x = (collect(0:n-1) + 0.25*rand(n))/n
            y = (collect(0:n-1) + 0.25*rand(n))/n
            ω1 = collect(0:n-1) + 0.25*rand(n)
            ω2 = collect(0:n-1) + 0.25*rand(n)

            reference = nudft1(C, ω1, ω2)
            approx = nufft1(C, ω1, ω2, ϵ)
            @test norm(reference - approx, Inf) < tol

            reference = nudft2(C, x, y)
            approx = nufft2(C, x, y, ϵ)
            @test norm(reference - approx, Inf) < tol
        end
    end
end


================================================
FILE: test/paduatests.jl
================================================
using FastTransforms, Test

@testset "Padua transform and its inverse" begin
    n = 200
    N = (n+1)*(n+2) ÷ 2
    v = rand(N)  # the length of v is the number of Padua points
    forward = plan_paduatransform!(v)
    inverse = plan_ipaduatransform!(v)

    # The plans invert each other and agree with the plan-free entry points.
    @test forward*(inverse*copy(v)) ≈ v
    @test inverse*(forward*copy(v)) ≈ v
    @test forward*copy(v) ≈ paduatransform(v)
    @test inverse*copy(v) ≈ ipaduatransform(v)

    # check that the return vector is NOT reused: two applications of the same plan
    # to different inputs must not yield equal outputs
    for makeplan in (plan_paduatransform!, plan_ipaduatransform!)
        fresh = makeplan(v)
        first_out = fresh*v
        second_out = fresh*rand(N)
        @test first_out ≠ second_out
    end

    # Accuracy of 2d function interpolation at a point

    """
    Interpolates a 2d function at a given point using 2d Chebyshev series.
    """
    function paduaeval(f::Function,x::AbstractFloat,y::AbstractFloat,m::Integer,lex)
        S = promote_type(typeof(x), typeof(y))
        npadua = div((m+1)*(m+2), 2)
        # Sample f at the points returned by `paduapoints` (columns 1 and 2 are
        # passed to f as its two arguments).
        pts = paduapoints(S, m)
        samples = Vector{S}(undef, npadua)
        map!(f, samples, pts[:,1], pts[:,2])
        # Coefficients of the samples, rearranged into a matrix by `trianglecfsmat`.
        cfs = paduatransform(samples, lex)
        iplan = plan_ipaduatransform!(samples, lex)
        cfsmat = FastTransforms.trianglecfsmat(iplan, cfs)
        # Evaluate the bivariate cosine series at (x, y); entry [k,j] multiplies
        # cos((j-1)*acos(x)) * cos((k-1)*acos(y)).
        θx, θy = acos(x), acos(y)
        return sum([cfsmat[k,j]*cos((j-1)*θx)*cos((k-1)*θy) for k=1:m+1, j=1:m+1])
    end
    f_xy = (x, y) -> x^2*y+x^3
    g_xy = (x, y) -> cos(exp(2*x+y))*sin(y)
    x = 0.1
    y = 0.2

    # Interpolate both functions at (x, y), for an even pair of degrees and then an
    # odd pair, with each of the two `lex` settings.
    for (m, l) in ((130, 80), (135, 85)), lex in (Val{true}, Val{false})
        f_m = paduaeval(f_xy, x, y, m, lex)
        g_l = paduaeval(g_xy, x, y, l, lex)
        @test f_xy(x,y) ≈ f_m
        @test g_xy(x,y) ≈ g_l
    end
end


================================================
FILE: test/quadraturetests.jl
================================================
using FastTransforms, LinearAlgebra, Test

import FastTransforms: chebyshevmoments1, chebyshevmoments2,
                       chebyshevjacobimoments1, chebyshevjacobimoments2,
                       chebyshevlogmoments1, chebyshevlogmoments2

@testset "Fejér and Clenshaw–Curtis quadrature" begin
    N = 20
    f = x -> exp(x)

    # Each rule is described by its node function, its weight function and the three
    # moment functions (plain, Jacobi-weighted, logarithmic) that feed the weights.
    rules = (
        (clenshawcurtisnodes, clenshawcurtisweights,
         chebyshevmoments1, chebyshevjacobimoments1, chebyshevlogmoments1),
        (fejernodes1, fejerweights1,
         chebyshevmoments1, chebyshevjacobimoments1, chebyshevlogmoments1),
        (fejernodes2, fejerweights2,
         chebyshevmoments2, chebyshevjacobimoments2, chebyshevlogmoments2),
    )

    for (nodes, weights, moments, jacobimoments, logmoments) in rules
        x = nodes(Float64, N)

        # Plain moments: the quadrature of exp is compared with 2sinh(1).
        w = weights(moments(Float64, N))
        @test norm(dot(f.(x), w)-2sinh(1)) ≤ 4eps()

        # Jacobi moments with parameters (0.25, 0.35), against a stored reference value.
        w = weights(jacobimoments(Float64, N, 0.25, 0.35))
        @test norm(dot(f.(x), w)-2.0351088204147243) ≤ 4eps()

        # Logarithmic moments applied to 1/(3 - x), against π^2/12.
        w = weights(logmoments(Float64, N))
        @test norm(sum(w./(3 .- x)) - π^2/12) ≤ 4eps()
    end
end


================================================
FILE: test/runtests.jl
================================================
using FastTransforms, LinearAlgebra, Test

# Run every test file, in this order.
for testfile in (
        "specialfunctionstests.jl",
        "chebyshevtests.jl",
        "quadraturetests.jl",
        "libfasttransformstests.jl",
        "nuffttests.jl",
        "paduatests.jl",
        "gaunttests.jl",
        "hermitetests.jl",
        "toeplitzplanstests.jl",
        "toeplitzhankeltests.jl",
        "toeplitzplushankeltests.jl",
        "grammatrixtests.jl",
        "arraystests.jl",
    )
    include(testfile)
end


================================================
FILE: test/specialfunctionstests.jl
================================================
using FastTransforms, LinearAlgebra, Test

import FastTransforms: pochhammer, sqrtpi, gamma, lgamma
import FastTransforms: Cnλ, Λ, lambertw, Cnαβ, Anαβ
import FastTransforms: chebyshevmoments1, chebyshevmoments2, chebyshevjacobimoments1, chebyshevjacobimoments2, chebyshevlogmoments1, chebyshevlogmoments2

@testset "Special functions" begin
    # Pochhammer symbol: integer, half-integer, zero and negative arguments.
    @test pochhammer(2,3) == 24
    @test pochhammer(0.5,3) == 0.5*1.5*2.5
    @test pochhammer(0.5,0.5) == 1/sqrtpi
    @test pochhammer(0,1) == 0
    @test pochhammer(-1,2) == 0
    @test pochhammer(-5,3) == -60
    @test pochhammer(-1,-0.5) == 0
    @test 1.0/pochhammer(-0.5,-0.5) == 0
    @test pochhammer(-1+0im,-1) == -0.5
    # Mixed integer/float arguments must give the same value.
    @test pochhammer(2,1) == pochhammer(2,1.0) == pochhammer(2.0,1) == 2
    @test pochhammer(1.1,2.2) ≈ gamma(3.3)/gamma(1.1)
    @test pochhammer(-2,1) == pochhammer(-2,1.0) == pochhammer(-2.0,1) == -2

    # Float64 evaluations are compared with BigFloat evaluations of the same function.
    n = 0:1000
    λ = 0.125
    cnλ_ratio = Cnλ.(n, λ) ./ Cnλ.(n, big(λ))
    @test norm(cnλ_ratio .- 1, Inf) < 3eps()

    x = range(0, stop=20, length=81)
    Λx = Λ.(x)
    @test norm((Λx .- Λ.(big.(x)))./Λx, Inf) < 2eps()
    wx = lambertw.(x)
    @test norm((wx .- lambertw.(big.(x)))./max.(wx, 1), Inf) < 2eps()

    # Three-argument Λ, with the BigFloat reference evaluated once per parameter pair.
    x = 0:0.5:1000
    λ₁, λ₂ = 0.125, 0.875
    Λref = Λ.(big.(x),big(λ₁),big(λ₂))
    @test norm((Λ.(x,λ₁,λ₂) .- Λref)./Λref, Inf) < 4eps()
    λ₁, λ₂ = 1//3, 2//3
    Λref = Λ.(big.(x),big(λ₁),big(λ₂))
    @test norm((Λ.(x,Float64(λ₁),Float64(λ₂)) .- Λref) ./ Λref, Inf) < 4eps()

    α, β = 0.125, 0.375

    cnαβ_ratio = Cnαβ.(n,α,β) ./ Cnαβ.(n,big(α),big(β))
    @test norm(cnαβ_ratio .- 1, Inf) < 3eps()
    anαβ_ratio = Anαβ.(n,α,β) ./ Anαβ.(n,big(α),big(β))
    @test norm(anαβ_ratio .- 1, Inf) < 4eps()

    @testset "BigFloat bug" begin
        # Λ with a negative second argument, against the log-gamma expression.
        @test Λ(0.0, -1/2, 1.0) ≈ -exp(lgamma(-1/2)-lgamma(1.0))
        @test Λ(1.0, -1/2, 1.0) ≈ exp(lgamma(1-1/2)-lgamma(2.0))
        # The BigFloat code path must agree with the Float64 one.
        @test Float64(Λ(big(0.0), -1/2, 1.0)) ≈ Λ(0.0, -1/2, 1.0)
    end
end


================================================
FILE: test/toeplitzhankeltests.jl
================================================
using FastTransforms, Test, Random
import FastTransforms: th_leg2cheb, th_cheb2leg, th_leg2chebu, th_ultra2ultra,th_jac2jac, th_leg2chebu,
                        lib_leg2cheb, lib_cheb2leg, lib_ultra2ultra, lib_jac2jac,
                        plan_th_cheb2leg!, plan_th_leg2chebu!, plan_th_leg2cheb!, plan_th_ultra2ultra!, plan_th_jac2jac!,
                        th_cheb2jac, th_jac2cheb

# Seed the global RNG with a fixed value before the `randn`-based tests below.
Random.seed!(0)

@testset "ToeplitzHankel" begin
    for x in ([1.0], [1.0,2,3,4,5], [1.0+im,2-3im,3+4im,4-5im,5+10im], collect(1.0:1000))
        # Toeplitz–Hankel conversions must agree with the library implementations.
        @test th_leg2cheb(x) ≈ lib_leg2cheb(x)
        @test th_cheb2leg(x) ≈ lib_cheb2leg(x)
        @test th_leg2chebu(x) ≈ lib_ultra2ultra(x, 0.5, 1.0)
        for (λ₁, λ₂) in ((0.1, 0.2), (1, 2), (0.1, 2.2), (2.2, 0.1), (1, 3))
            @test th_ultra2ultra(x, λ₁, λ₂) ≈ lib_ultra2ultra(x, λ₁, λ₂)
        end

        # Jacobi-to-Jacobi. The `@inferred` cases are written out with literal
        # arguments; the remaining parameter sets are looped over in between.
        @test @inferred(th_jac2jac(x,0.1, 0.2,0.1,0.4)) ≈ lib_jac2jac(x, 0.1, 0.2,0.1,0.4)
        for abcd in ((0.1, 0.2, 0.3, 0.2), (0.1, 0.2, 0.3, 0.4))
            @test th_jac2jac(x, abcd...) ≈ lib_jac2jac(x, abcd...)
        end
        @test @inferred(th_jac2jac(x,0.1, 0.2,1.3,0.4)) ≈ lib_jac2jac(x, 0.1, 0.2,1.3,0.4)
        for abcd in ((0.1, 0.2, 1.3, 2.4), (1.3, 2.4, 0.1, 0.2), (1.3, 1.2, -0.1, -0.2))
            @test th_jac2jac(x, abcd...) ≈ lib_jac2jac(x, abcd...)
        end
        @test @inferred(th_jac2jac(x,-0.5, -0.5, -0.5,-0.5)) ≈ lib_jac2jac(x, -0.5, -0.5, -0.5,-0.5)
        for abcd in ((-0.5, -0.5, 0.5, 0.5), (0.5, 0.5, -0.5, -0.5), (-0.5, -0.5, 0.5, -0.5),
                     (-1/2, -1/2, 1/2, 0), (-1/2, -1/2, 0, 1/2), (-3/4, -3/4, 0, 3/4))
            @test th_jac2jac(x, abcd...) ≈ lib_jac2jac(x, abcd...)
        end
        # Large parameter jumps are only checked on the short inputs.
        if length(x) < 10
            for abcd in ((0, 0, 5, 5), (5, 5, 0, 0))
                @test th_jac2jac(x, abcd...) ≈ lib_jac2jac(x, abcd...)
            end
        end

        # Chebyshev <-> Jacobi against the exported conversions.
        for (a, b) in ((0.2, 0.3), (1, 1))
            @test th_cheb2jac(x, a, b) ≈ cheb2jac(x, a, b)
            @test th_jac2cheb(x, a, b) ≈ jac2cheb(x, a, b)
        end

        # Round trips.
        @test th_cheb2leg(th_leg2cheb(x)) ≈ x
        @test th_leg2cheb(th_cheb2leg(x)) ≈ x
        @test th_ultra2ultra(th_ultra2ultra(x, 0.1, 0.6), 0.6, 0.1) ≈ x
        @test th_jac2jac(th_jac2jac(x, 0.1, 0.6, 0.1, 0.8), 0.1, 0.8, 0.1, 0.6) ≈ x
        @test th_jac2jac(th_jac2jac(x, 0.1, 0.6, 0.2, 0.8), 0.2, 0.8, 0.1, 0.6) ≈ x
    end

    for X in (randn(5,4), randn(5,4) + im*randn(5,4))
        # Reference results: apply a 1D transform `f` to every column (dimension 1)
        # or to every row (dimension 2) of X and reassemble the matrix.
        bycols(f) = hcat([f(X[:,j]) for j=1:size(X,2)]...)
        byrows(f) = vcat([permutedims(f(X[k,:])) for k=1:size(X,1)]...)

        # Legendre -> Chebyshev
        @test th_leg2cheb(X, 1) ≈ bycols(leg2cheb)
        @test_broken th_leg2cheb(X, 1) ≈ leg2cheb(X, 1) # matrices not supported in FastTransforms
        @test th_leg2cheb(X, 2) ≈ byrows(leg2cheb)
        @test_broken th_leg2cheb(X, 2) ≈ leg2cheb(X, 2)
        @test th_leg2cheb(X) ≈ th_leg2cheb(th_leg2cheb(X, 1), 2)
        @test_broken th_leg2cheb(X) ≈ leg2cheb(X)

        # Chebyshev -> Legendre
        @test th_cheb2leg(X, 1) ≈ bycols(cheb2leg)
        @test th_cheb2leg(X, 2) ≈ byrows(cheb2leg)
        @test th_cheb2leg(X) ≈ th_cheb2leg(th_cheb2leg(X, 1), 2)

        # The plan-free entry points must match the explicit in-place plans exactly.
        @test th_cheb2leg(X) == plan_th_cheb2leg!(X, 1:2)*copy(X)
        @test th_leg2cheb(X) == plan_th_leg2cheb!(X, 1:2)*copy(X)

        @test th_leg2cheb(th_cheb2leg(X)) ≈ X

        # Legendre -> Chebyshev U, referenced against ultra2ultra with (0.5, 1.0)
        leg2chebu = v -> ultra2ultra(v, 0.5, 1.0)
        @test th_leg2chebu(X, 1) ≈ bycols(leg2chebu)
        @test th_leg2chebu(X, 2) ≈ byrows(leg2chebu)
        @test th_leg2chebu(X) ≈ th_leg2chebu(th_leg2chebu(X, 1), 2)

        @test th_leg2chebu(X) == plan_th_leg2chebu!(X, 1:2)*copy(X)

        # Ultraspherical -> ultraspherical
        for (λ₁, λ₂) in ((0.1, 0.6), (0.1, 2.6), (2.6, 0.1))
            u2u = v -> ultra2ultra(v, λ₁, λ₂)
            @test th_ultra2ultra(X, λ₁, λ₂, 1) ≈ bycols(u2u)
            @test th_ultra2ultra(X, λ₁, λ₂, 2) ≈ byrows(u2u)
            @test th_ultra2ultra(X, λ₁, λ₂) ≈ th_ultra2ultra(th_ultra2ultra(X, λ₁, λ₂, 1), λ₁, λ₂, 2)
        end

        # This assertion used to appear twice, byte for byte; one copy is enough.
        @test th_ultra2ultra(X, 0.1, 0.6) == plan_th_ultra2ultra!(X, 0.1, 0.6, 1:2)*copy(X)

        @test th_ultra2ultra(th_ultra2ultra(X, 0.1, 0.6), 0.6, 0.1) ≈ X

        # Jacobi -> Jacobi
        j2j_a = v -> jac2jac(v, 0.1, 0.6, 0.1, 0.8)
        @test th_jac2jac(X, 0.1, 0.6, 0.1, 0.8, 1) ≈ bycols(j2j_a)
        @test th_jac2jac(X, 0.1, 0.6, 0.1, 0.8, 2) ≈ byrows(j2j_a)
        @test th_jac2jac(X, 0.1, 0.6, 0.1, 0.8) ≈ th_jac2jac(th_jac2jac(X, 0.1, 0.6, 0.1, 0.8, 1), 0.1, 0.6, 0.1, 0.8, 2)

        j2j_b = v -> jac2jac(v, 0.1, 0.6, 0.2, 0.8)
        @test th_jac2jac(X, 0.1, 0.6, 0.2, 0.8, 1) ≈ bycols(j2j_b)
        @test th_jac2jac(X, 0.1, 0.6, 0.2, 0.8, 2) ≈ byrows(j2j_b)

        # This assertion used to appear twice, byte for byte; one copy is enough.
        @test th_jac2jac(X, 0.1, 0.6, 0.1, 0.8) == plan_th_jac2jac!(X, 0.1, 0.6, 0.1, 0.8, 1:2)*copy(X)

        @test th_jac2jac(th_jac2jac(X, 0.1, 0.6, 0.1, 0.8), 0.1, 0.8, 0.1, 0.6) ≈ X

        for (a, b) in ((0.1, 0.6), (-0.5, -0.5))
            j2j = v -> jac2jac(v, a, b, 3.1, 2.8)
            @test th_jac2jac(X, a, b, 3.1, 2.8, 1) ≈ bycols(j2j)
            @test th_jac2jac(X, a, b, 3.1, 2.8, 2) ≈ byrows(j2j)
            @test th_jac2jac(X, a, b, 3.1, 2.8) ≈ th_jac2jac(th_jac2jac(X, a, b, 3.1, 2.8, 1), a, b, 3.1, 2.8, 2)
        end

        # Chebyshev <-> Jacobi
        c2j = v -> cheb2jac(v, 3.1, 2.8)
        @test th_cheb2jac(X, 3.1, 2.8, 1) ≈ bycols(c2j)
        @test th_cheb2jac(X, 3.1, 2.8, 2) ≈ byrows(c2j)
        @test th_cheb2jac(X, 3.1, 2.8) ≈ th_cheb2jac(th_cheb2jac(X, 3.1, 2.8, 1), 3.1, 2.8, 2)

        j2c = v -> jac2cheb(v, 3.1, 2.8)
        @test th_jac2cheb(X, 3.1, 2.8, 1) ≈ bycols(j2c)
        @test th_jac2cheb(X, 3.1, 2.8, 2) ≈ byrows(j2c)
        @test th_jac2cheb(X, 3.1, 2.8) ≈ th_jac2cheb(th_jac2cheb(X, 3.1, 2.8, 1), 3.1, 2.8, 2)
    end

    @testset "BigFloat" begin
        # The coefficients 1, 2, …, 10 as BigFloat.
        n = 10
        x = [big(float(k)) for k in 1:n]
        @test th_leg2cheb(x) ≈ lib_leg2cheb(x)
        @test th_cheb2leg(x) ≈ lib_cheb2leg(x)
    end

    @testset "jishnub example" begin
        # Legendre coefficients of cospi(1000x) sampled at 4096 Chebyshev points must
        # survive a leg2cheb/cheb2leg round trip to near machine precision.
        pts = chebyshevpoints(4096)
        samples = cospi.(1000 .* pts)
        v = th_cheb2leg(chebyshevtransform(samples))
        roundtrip() = th_cheb2leg(th_leg2cheb(v))
        @test norm(v - roundtrip(), Inf) ≤ 1E-13
        @test norm(v - roundtrip())/norm(v) ≤ 1E-14
    end

    @testset "tensor" begin
        X = randn(5,4,3)
        for trans in (th_leg2cheb, th_cheb2leg)
            # Transforming along one dimension of the 3-tensor must match transforming
            # each matrix slice along the corresponding dimension.
            along1 = trans(X, 1)
            for ℓ in axes(X,3)
                @test along1[:,:,ℓ] ≈ trans(X[:,:,ℓ],1)
            end
            along2 = trans(X, 2)
            for ℓ in axes(X,3)
                @test along2[:,:,ℓ] ≈ trans(X[:,:,ℓ],2)
            end
            # Dimension 3 of the tensor is dimension 2 of the slice X[:,j,:].
            along3 = trans(X, 3)
            for j in axes(X,2)
                @test along3[:,j,:] ≈ trans(X[:,j,:],2)
            end

            # Dimensions 1 and 3 together: a full 2D transform of each slice X[:,j,:].
            along13 = trans(X, (1,3))
            for j in axes(X,2)
                @test along13[:,j,:] ≈ trans(X[:,j,:])
            end

            # All dimensions: 2D transforms of the slices followed by 1D transforms
            # of the fibres along dimension 3.
            along123 = trans(X, 1:3)
            expected = copy(X)
            for j in axes(X,3)
                expected[:,:,j] = trans(expected[:,:,j])
            end
            for k in axes(X,1), j in axes(X,2)
                expected[k,j,:] = trans(expected[k,j,:])
            end
            @test expected ≈ along123
        end
    end

    @testset "inv" begin
        # Applying a plan and then solving with it must recover the input.
        # The plans are created by `!` constructors and may work in place, so every
        # round trip operates on a copy; otherwise both sides of `≈` could be the same
        # array and the comparison would hold trivially.
        x = randn(10)
        pl = plan_th_cheb2leg!(x)
        @test size(pl) == (10,)
        @test pl\(pl*copy(x)) ≈ x

        X = randn(10,3)
        for pl in (plan_th_cheb2leg!(X), plan_th_cheb2leg!(X, 1), plan_th_cheb2leg!(X, 2))
            @test size(pl) == (10,3)
            @test pl\(pl*copy(X)) ≈ X
        end

        X = randn(10,3,5)
        for pl in (plan_th_cheb2leg!(X), plan_th_cheb2leg!(X, 1), plan_th_cheb2leg!(X, 2),  plan_th_cheb2leg!(X, 3))
            @test size(pl) == (10,3,5)
            @test pl\(pl*copy(X)) ≈ X
        end
    end

    @testset "empty" begin
        # Transforms without parameters: empty input gives empty output.
        for trans in (FastTransforms.th_cheb2leg, FastTransforms.th_leg2cheb, FastTransforms.th_leg2chebu)
            @test isempty(trans(Float64[]))
        end
        # Transforms taking two parameters.
        for trans in (FastTransforms.th_cheb2jac, FastTransforms.th_jac2cheb, FastTransforms.th_ultra2ultra)
            @test isempty(trans(Float64[], 0.1, 0.2))
        end
    end
end

================================================
FILE: test/toeplitzplanstests.jl
================================================
using FastTransforms, Test
import FastTransforms: plan_uppertoeplitz!

@testset "ToeplitzPlan" begin
    @testset "Vector" begin
        # The plan built from [1,2,3] must act like the dense upper-triangular
        # Toeplitz matrix with that first row.
        dense = [1 2 3; 0 1 2; 0 0 1]
        plan = plan_uppertoeplitz!([1,2,3])
        data = randn(3)
        @test plan * copy(data) ≈ dense * data
    end

    @testset "Matrix" begin
        # Dense 3×3 and 4×4 upper-triangular Toeplitz references.
        T3 = [1 2 3; 0 1 2; 0 0 1]
        T4 = [1 2 3 4; 0 1 2 3; 0 0 1 2; 0 0 0 1]

        # Square input: dimension 1 multiplies from the left, dimension 2 from the
        # right (by the transpose), and the default applies both.
        X = randn(3,3)
        @test plan_uppertoeplitz!([1,2,3], size(X), 1) * copy(X) ≈ T3 * X
        @test plan_uppertoeplitz!([1,2,3], size(X), 2) * copy(X) ≈ X * T3'
        @test plan_uppertoeplitz!([1,2,3], size(X)) * copy(X) ≈ T3 * X * T3'

        # Rectangular input: four coefficients are supplied for the length-4 dimension.
        X = randn(3,4)
        @test plan_uppertoeplitz!([1,2,3], size(X), 1) * copy(X) ≈ T3 * X
        @test plan_uppertoeplitz!([1,2,3,4], size(X), 2) * copy(X) ≈ X * T4'
        @test plan_uppertoeplitz!([1,2,3,4], size(X)) * copy(X) ≈ T3 * X * T4'
    end

    @testset "Tensor" begin
        # Dense upper-triangular Toeplitz reference for the coefficients [1,2,3].
        T = [1 2 3; 0 1 2; 0 0 1]

        @testset "3D" begin
            X = randn(3,3,3)

            # One dimension at a time, checked slice by slice.
            PX = plan_uppertoeplitz!([1,2,3], size(X), 1) * copy(X)
            for ℓ in axes(X,3)
                @test PX[:,:,ℓ] ≈ T*X[:,:,ℓ]
            end

            PX = plan_uppertoeplitz!([1,2,3], size(X), 2) * copy(X)
            for ℓ in axes(X,3)
                @test PX[:,:,ℓ] ≈ X[:,:,ℓ]*T'
            end

            PX = plan_uppertoeplitz!([1,2,3], size(X), 3) * copy(X)
            for j in axes(X,2)
                @test PX[:,j,:] ≈ X[:,j,:]*T'
            end

            # Dimensions 1 and 3 together.
            PX = plan_uppertoeplitz!([1,2,3], size(X), (1,3)) * copy(X)
            for j in axes(X,2)
                @test PX[:,j,:] ≈ T*X[:,j,:]*T'
            end

            # All three dimensions: dimensions 1 and 2 on each slice, then dimension 3.
            PX = plan_uppertoeplitz!([1,2,3], size(X), 1:3) * copy(X)
            expected = copy(X)
            for j in axes(X,3)
                expected[:,:,j] = T*expected[:,:,j]*T'
            end
            for k in axes(X,1)
                expected[k,:,:] = expected[k,:,:]*T'
            end
            @test expected ≈ PX
        end

        @testset "4D" begin
            X = randn(3,3,3,3)

            # One dimension at a time, checked slice by slice.
            PX = plan_uppertoeplitz!([1,2,3], size(X), 1) * copy(X)
            for ℓ in axes(X,3), m in axes(X,4)
                @test PX[:,:,ℓ,m] ≈ T*X[:,:,ℓ,m]
            end

            PX = plan_uppertoeplitz!([1,2,3], size(X), 2) * copy(X)
            for ℓ in axes(X,3), m in axes(X,4)
                @test PX[:,:,ℓ,m] ≈ X[:,:,ℓ,m]*T'
            end

            PX = plan_uppertoeplitz!([1,2,3], size(X), 3) * copy(X)
            for j in axes(X,2), m in axes(X,4)
                @test PX[:,j,:,m] ≈ X[:,j,:,m]*T'
            end

            PX = plan_uppertoeplitz!([1,2,3], size(X), 4) * copy(X)
            for k in axes(X,1), j in axes(X,2)
                @test PX[k,j,:,:] ≈ X[k,j,:,:]*T'
            end

            # Dimensions 1 and 3 together.
            PX = plan_uppertoeplitz!([1,2,3], size(X), (1,3)) * copy(X)
            for j in axes(X,2), m in axes(X,4)
                @test PX[:,j,:,m] ≈ T*X[:,j,:,m]*T'
            end

            # All four dimensions: dimensions 1 and 2 first, then dimensions 3 and 4.
            PX = plan_uppertoeplitz!([1,2,3], size(X), 1:4) * copy(X)
            expected = copy(X)
            for ℓ in axes(X,3), m in axes(X,4)
                expected[:,:,ℓ,m] = T*expected[:,:,ℓ,m]*T'
            end
            for k in axes(X,1), j in axes(X,2)
                expected[k,j,:,:] = T*expected[k,j,:,:]*T'
            end
            @test expected ≈ PX
        end
    end

    @testset "BigFloat" begin
        # Same comparison as the vector case, with a BigFloat leading coefficient.
        dense = [big(π) 2 3; 0 big(π) 2; 0 0 big(π)]
        plan = plan_uppertoeplitz!([big(π),2,3])
        data = randn(3)
        @test plan * copy(data) ≈ dense * data
    end
end

================================================
FILE: test/toeplitzplushankeltests.jl
================================================
using FastTransforms, LinearAlgebra, Test

import FastTransforms: normest

@testset "ToeplitzPlusHankel" begin
    n = 128
    for T in (Float32, Float64)
        # Build a Chebyshev Gram matrix from 2n-1 moments and convert it.
        moments = FastTransforms.chebyshevmoments1(T, 2n-1)
        gram = ChebyshevGramMatrix(moments)
        converted = ToeplitzPlusHankel(gram)

        # The conversion preserves the matrix; `normest` bounds the norm from above
        # and gives the same value for both representations.
        @test converted ≈ gram
        @test norm(converted) ≤ normest(converted)
        @test normest(converted) == normest(gram)
    end
end