Repository: HIPS/autograd
Branch: master
Commit: 994362fdbcc8
Files: 120
Total size: 426.4 KB

Directory structure:
gitextract_4gygwh8h/

├── .github/
│   └── workflows/
│       ├── check.yml
│       ├── publish.yml
│       └── test.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CONTRIBUTING.md
├── README.md
├── autograd/
│   ├── __init__.py
│   ├── builtins.py
│   ├── core.py
│   ├── differential_operators.py
│   ├── extend.py
│   ├── misc/
│   │   ├── __init__.py
│   │   ├── fixed_points.py
│   │   ├── flatten.py
│   │   ├── optimizers.py
│   │   └── tracers.py
│   ├── numpy/
│   │   ├── __init__.py
│   │   ├── fft.py
│   │   ├── linalg.py
│   │   ├── numpy_boxes.py
│   │   ├── numpy_jvps.py
│   │   ├── numpy_vjps.py
│   │   ├── numpy_vspaces.py
│   │   ├── numpy_wrapper.py
│   │   └── random.py
│   ├── scipy/
│   │   ├── __init__.py
│   │   ├── integrate.py
│   │   ├── linalg.py
│   │   ├── signal.py
│   │   ├── special.py
│   │   └── stats/
│   │       ├── __init__.py
│   │       ├── beta.py
│   │       ├── chi2.py
│   │       ├── dirichlet.py
│   │       ├── gamma.py
│   │       ├── multivariate_normal.py
│   │       ├── norm.py
│   │       ├── poisson.py
│   │       └── t.py
│   ├── test_util.py
│   ├── tracer.py
│   ├── util.py
│   └── wrap_util.py
├── benchmarks/
│   ├── __init__.py
│   ├── asv.conf.json.sample
│   ├── bench_core.py
│   ├── bench_mem.py
│   ├── bench_numpy_vjps.py
│   ├── bench_rnn.py
│   └── bench_util.py
├── conda_recipe/
│   └── conda.yaml
├── docs/
│   ├── tutorial.md
│   └── updateguide.md
├── examples/
│   ├── README.md
│   ├── __init__.py
│   ├── bayesian_neural_net.py
│   ├── bayesian_optimization.py
│   ├── black_box_svi.py
│   ├── convnet.py
│   ├── data.py
│   ├── data_mnist.py
│   ├── deep_gaussian_process.py
│   ├── define_gradient.py
│   ├── dot_graph.py
│   ├── fixed_points.py
│   ├── fluidsim/
│   │   ├── fluidsim.py
│   │   └── wing.py
│   ├── gaussian_process.py
│   ├── generative_adversarial_net.py
│   ├── gmm.py
│   ├── gplvm.py
│   ├── hmm_em.py
│   ├── ica.py
│   ├── logistic_regression.py
│   ├── lstm.py
│   ├── mixture_variational_inference.py
│   ├── natural_gradient_black_box_svi.py
│   ├── negative_binomial_maxlike.py
│   ├── neural_net.py
│   ├── neural_net_regression.py
│   ├── ode_net.py
│   ├── print_trace.py
│   ├── rkhs.py
│   ├── rnn.py
│   ├── rosenbrock.py
│   ├── sinusoid.py
│   ├── tanh.py
│   └── variational_autoencoder.py
├── license.txt
├── noxfile.py
├── pyproject.toml
└── tests/
    ├── _test_complexity.py
    ├── check_examples_run.sh
    ├── conftest.py
    ├── numpy_utils.py
    ├── profiling.py
    ├── test_binary_ops.py
    ├── test_builtins.py
    ├── test_complex.py
    ├── test_core.py
    ├── test_dict.py
    ├── test_direct.py
    ├── test_fft.py
    ├── test_graphs.py
    ├── test_jacobian.py
    ├── test_linalg.py
    ├── test_list.py
    ├── test_logic.py
    ├── test_misc.py
    ├── test_numpy.py
    ├── test_performance.py
    ├── test_scalar_ops.py
    ├── test_scipy.py
    ├── test_systematic.py
    ├── test_tests.py
    ├── test_truediv.py
    ├── test_tuple.py
    ├── test_vspaces.py
    └── test_wrappers.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/workflows/check.yml
================================================
name: Style and package checks

on:
  pull_request:
    branches:
    - master
  push:
    branches:
    - master
  workflow_dispatch:

env:
  PIP_DISABLE_PIP_VERSION_CHECK: "1"
  FORCE_COLOR: "3"

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  check:
    # Each matrix entry is the name of a nox session. It is used as the job's
    # display name and as the argument to `nox -s` below, so all three
    # expressions must read the same matrix key (`session`).
    name: ${{ matrix.session }}
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        session:
        # - lint
        - validate-package
    steps:
    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
    - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0

    - uses: yezz123/setup-uv@ab6be5a42627f19dc36e57b548592a5e52cece4a # v4.1

    - name: Run ${{ matrix.session }}
      run: uvx nox -s ${{ matrix.session }}


================================================
FILE: .github/workflows/publish.yml
================================================
name: Publish

on:
  workflow_dispatch:
  release:
    types: [published]

env:
  PIP_DISABLE_PIP_VERSION_CHECK: '1'
  FORCE_COLOR: '3'

jobs:
  build:
    name: Build sdist and wheel
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      name: Checkout repository

    - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
      with:
        python-version: "3.12"

    # `pipx run build` runs pypa/build, which writes both the sdist and the
    # wheel into ./dist; nothing is merely "installed" in this step.
    - name: Build sdist and wheel
      run: |
        pipx run build --outdir dist

    # Hand the built distributions to the `publish` job; fail loudly if the
    # build produced nothing.
    - name: Upload wheel and sdist artifacts
      uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
      with:
        name: artifacts
        path: ./dist/*
        if-no-files-found: error

  # Uploads the distributions produced by the `build` job to PyPI.
  publish:
    needs: [build]
    name: Upload to PyPI
    runs-on: ubuntu-latest
    environment:
      name: release
      url: https://pypi.org/p/autograd
    permissions:
      id-token: write # mandatory for trusted publishing

    steps:
      # No artifact name is given, so every artifact of this run is fetched;
      # merge-multiple places their files directly under dist/.
      - name: Download artifacts
        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
        with:
          path: dist
          merge-multiple: true

      # Log what is about to be uploaded.
      - name: Sanity check artifacts
        run: ls -la dist/

      - name: Publish sdist and wheel to PyPI
        uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4
        with:
          packages-dir: dist/


================================================
FILE: .github/workflows/test.yml
================================================
name: CI

on:
  pull_request:
    branches:
      - master
  push:
    branches:
      - master
  workflow_dispatch:
  schedule:
    - cron: "0 4 * * *"

env:
  PIP_DISABLE_PIP_VERSION_CHECK: "1"
  FORCE_COLOR: "3"

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  # Runs the regular test session on every platform / Python combination.
  test:
    name: Regular tests / ${{ matrix.platform }} / Python ${{ matrix.python-version }}
    runs-on: ${{ matrix.platform }}
    strategy:
      # Keep the remaining combinations running when one of them fails.
      fail-fast: false
      matrix:
        platform: [ubuntu-latest, ubuntu-22.04-arm, macos-15-intel, macos-latest, windows-latest]
        # Versions are quoted so that e.g. "3.10" is not parsed as the float 3.1.
        python-version:
          ["3.10", "3.11", "3.12", "3.13", "3.14", "pypy-3.10"]
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
        with:
          python-version: ${{ matrix.python-version }}
          # Allows setup-python to fall back to a pre-release interpreter when
          # no final release of the requested version is available.
          allow-prereleases: true
      - uses: yezz123/setup-uv@ab6be5a42627f19dc36e57b548592a5e52cece4a # v4.1

      # On PyPy, we skip SciPy because we don't have wheels
      # available, see noxfile.py for more details.
      - name: Run tests
        run: uvx nox -s tests

  # In this job, we test against the NumPy nightly wheels hosted on
  # https://anaconda.org/scientific-python-nightly-wheels/numpy
  # on the latest Python version available across platforms, instead of
  # testing all Python versions and implementations on all platforms.
  # We do not test on PyPy.
  #
  # However, "nox -s nightly-tests" can be used locally anywhere, on
  # any Python version and implementation on any platform and we leave
  # it to the user to decide what Python version to test against, which
  # might or might not have a corresponding NumPy nightly wheel present.
  nightlies:
    name: Nightly tests / ${{ matrix.platform }} / Python ${{ matrix.python-version }}
    runs-on: ${{ matrix.platform }}
    strategy:
      # Keep the remaining platforms running when one of them fails.
      fail-fast: false
      matrix:
        platform: [ubuntu-latest, ubuntu-22.04-arm, macos-15-intel, macos-latest, windows-latest]
        # A single entry: the version spec "3.x" is resolved by setup-python.
        python-version: ["3.x"]

    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
        with:
          python-version: ${{ matrix.python-version }}
          allow-prereleases: true
      - uses: yezz123/setup-uv@ab6be5a42627f19dc36e57b548592a5e52cece4a # v4.1
      # The nightly dependency selection itself lives in the `nightly-tests`
      # nox session (noxfile.py), not in this workflow.
      - name: Run tests against nightly wheels for NumPy and SciPy
        run: uvx nox -s nightly-tests


================================================
FILE: .gitignore
================================================
__pycache__/
*.py[cod]
*$py.class

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
coverage.*
*.cover
.hypothesis/
nosetests.xml
.pytest_cache/
junit-report.xml

# pyenv
.python-version

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# mypy
.mypy_cache/

# OS and IDE config files
.DS_Store
.idea/

# project-specific
data/
*.so
*.c
scratch/
examples/data

.asv/
asv.conf.json
benchmarks/asv.conf.json


================================================
FILE: .pre-commit-config.yaml
================================================
ci:
  autoupdate_commit_msg: "chore: update pre-commit hooks"
  autofix_commit_msg: "style: pre-commit fixes"

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v6.0.0
    hooks:
      - id: check-added-large-files
      - id: check-case-conflict
      - id: check-merge-conflict
      - id: check-yaml
        exclude: conda_recipe/conda.yaml
      - id: debug-statements
      - id: end-of-file-fixer
      - id: mixed-line-ending
      - id: trailing-whitespace

  - repo: https://github.com/asottile/pyupgrade
    rev: v3.21.2
    hooks:
      - id: pyupgrade
        args: [--py310-plus]

  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: "v0.15.6"
    hooks:
      - id: ruff
        args: ["--fix", "--show-fixes"]
      - id: ruff-format

  # Regex-based checks. No per-hook excludes are needed: the previous
  # `exclude` pattern pointed at src/vector/backends/_numba_object.py, a path
  # that does not exist in this repository.
  - repo: https://github.com/pre-commit/pygrep-hooks
    rev: v1.10.0
    hooks:
      - id: python-check-blanket-type-ignore
      - id: rst-backticks
      - id: rst-directive-colons
      - id: rst-inline-touching-normal


================================================
FILE: CONTRIBUTING.md
================================================
# Contributing

Use [Nox](https://nox.thea.codes/en/stable/) to run tests and linting. Install it with:

```shell
pip install nox
```

`nox` will run all checks in an isolated virtual environment with Autograd and its dependencies, including its optional dependencies, installed.

## Run tests, linting, packaging checks

| Command                   | Description                                                                                                                                                     |
| ------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `nox --list`              | Lists all available Nox sessions, including selected ones                                                                                                       |
| `nox -s lint`             | Runs code style checks with pre-commit and pre-commit hooks as listed in `.pre-commit-config.yaml`. Accepts posargs to pass additional arguments to the linter. |
| `nox -s tests`            | Runs tests with your default Python interpreter. Accepts posargs to pass additional arguments and configuration to `pytest`.                                    |
| `nox -s nightly-tests`    | Similar to `nox -s tests`, except that it runs tests with nightly versions of dependencies (NumPy, SciPy, etc.).                                                |
| `nox -s validate-package` | Builds a source distribution and a wheel using `pypa/build` and checks the package with `twine` in strict mode.                                                 |
| `nox`                     | Runs all selected sessions, as listed in `nox.options.sessions` in `noxfile.py`.                                                                                |

Additionally, `nox` supports tags to run specific sessions, e.g., `nox --tags tests` runs all sessions tagged with `tests`.

Make sure all tests pass before you push your changes to GitHub.
GH Actions will run the tests across all supported Python versions.

## Using positional arguments (reformat, upload package, help)

You can use additional arguments for the tools (`pytest`, `pre-commit`, etc.) called by Nox by
separating them from the Nox arguments by a double-hyphen `--`, e.g.,

- `nox -s tests -- tests/test_tuple.py` runs just the tests listed in `tests/test_tuple.py`.
- `nox -s lint -- --fix` runs the linter with the `--fix` flag.
- and so on.


================================================
FILE: README.md
================================================
# Autograd  [![Checks status][checks-badge]][checks-url] [![Tests status][tests-badge]][tests-url] [![Publish status][publish-badge]][publish-url] [![asv][asv-badge]](#)

[publish-badge]: https://github.com/HIPS/autograd/actions/workflows/publish.yml/badge.svg
[checks-badge]: https://github.com/HIPS/autograd/actions/workflows/check.yml/badge.svg
[tests-badge]: https://github.com/HIPS/autograd/actions/workflows/test.yml/badge.svg
[asv-badge]: https://img.shields.io/badge/benchmarked%20by-asv-green.svg?style=flat
[publish-url]: https://github.com/HIPS/autograd/actions/workflows/publish.yml
[checks-url]: https://github.com/HIPS/autograd/actions/workflows/check.yml
[tests-url]: https://github.com/HIPS/autograd/actions/workflows/test.yml

Autograd can automatically differentiate native Python and Numpy code. It can
handle a large subset of Python's features, including loops, ifs, recursion and
closures, and it can even take derivatives of derivatives of derivatives. It
supports reverse-mode differentiation (a.k.a. backpropagation), which means it
can efficiently take gradients of scalar-valued functions with respect to
array-valued arguments, as well as forward-mode differentiation, and the two can
be composed arbitrarily. The main intended application of Autograd is
gradient-based optimization. For more information, check out the
[tutorial](docs/tutorial.md) and the [examples directory](examples/).

Example use:

```python
>>> import autograd.numpy as np  # Thinly-wrapped numpy
>>> from autograd import grad    # The only autograd function you may ever need
>>>
>>> def tanh(x):                 # Define a function
...     return (1.0 - np.exp((-2 * x))) / (1.0 + np.exp(-(2 * x)))
...
>>> grad_tanh = grad(tanh)       # Obtain its gradient function
>>> grad_tanh(1.0)               # Evaluate the gradient at x = 1.0
np.float64(0.419974341614026)
>>> (tanh(1.0001) - tanh(0.9999)) / 0.0002  # Compare to finite differences
np.float64(0.41997434264973155)
```

We can continue to differentiate as many times as we like, and use numpy's
vectorization of scalar-valued functions across many different input values:

```python
>>> from autograd import elementwise_grad as egrad  # for functions that vectorize over inputs
>>> import matplotlib.pyplot as plt
>>> x = np.linspace(-7, 7, 700)
>>> plt.plot(x, tanh(x),
...          x, egrad(tanh)(x),                                     # first  derivative
...          x, egrad(egrad(tanh))(x),                              # second derivative
...          x, egrad(egrad(egrad(tanh)))(x),                       # third  derivative
...          x, egrad(egrad(egrad(egrad(tanh))))(x),)               # fourth derivative
>>> plt.show()
```

<img src="examples/tanh.png" width="600">

See the [tanh example file](examples/tanh.py) for the code.

## Documentation

You can find a tutorial [here](docs/tutorial.md).

## End-to-end examples

* [Simple neural net](examples/neural_net.py)
* [Convolutional neural net](examples/convnet.py)
* [Recurrent neural net](examples/rnn.py)
* [LSTM](examples/lstm.py)
* [Neural Turing Machine](https://github.com/DoctorTeeth/diffmem/blob/512aadeefd6dbafc1bdd253a64b6be192a435dc3/ntm/ntm.py)
* [Backpropagating through a fluid simulation](examples/fluidsim/fluidsim.py)

<img src="examples/fluidsim/animated.gif" width="400">

* [Variational inference in Bayesian neural network](examples/bayesian_neural_net.py)
* [Gaussian process regression](examples/gaussian_process.py)
* [Sampyl, a pure Python MCMC package with HMC and NUTS](https://github.com/mcleonard/sampyl)

## How to install

Install Autograd using Pip:

```shell
pip install autograd
```

Some features require SciPy, which you can install separately or as an
optional dependency along with Autograd:

```shell
pip install "autograd[scipy]"
```

## Authors and maintainers

Autograd was written by [Dougal Maclaurin](https://dougalmaclaurin.com),
[David Duvenaud](https://www.cs.toronto.edu/~duvenaud/),
[Matt Johnson](http://people.csail.mit.edu/mattjj/),
[Jamie Townsend](https://github.com/j-towns)
and many other contributors. The package is currently being maintained by
[Agriya Khetarpal](https://github.com/agriyakhetarpal),
[Fabian Joswig](https://github.com/fjosw) and
[Jamie Townsend](https://github.com/j-towns).
Please feel free to submit any bugs or
feature requests. We'd also love to hear about your experiences with Autograd
in general. Drop us an email!

We want to thank Jasper Snoek and the rest of the HIPS group (led by Prof. Ryan
P. Adams) for helpful contributions and advice; Barak Pearlmutter for
foundational work on automatic differentiation and for guidance on our
implementation; and Analog Devices Inc. (Lyric Labs) and Samsung Advanced Institute
of Technology for their generous support.


================================================
FILE: autograd/__init__.py
================================================
from autograd.core import primitive_with_deprecation_warnings as primitive

from .builtins import dict, isinstance, list, tuple, type
from .differential_operators import (
    checkpoint,
    deriv,
    elementwise_grad,
    grad,
    grad_and_aux,
    grad_named,
    hessian,
    hessian_tensor_product,
    hessian_vector_product,
    holomorphic_grad,
    jacobian,
    make_ggnvp,
    make_hvp,
    make_jvp,
    make_vjp,
    multigrad_dict,
    tensor_jacobian_product,
    value_and_grad,
    vector_jacobian_product,
)


================================================
FILE: autograd/builtins.py
================================================
from .extend import (
    Box,
    SparseObject,
    VSpace,
    defjvp,
    defjvp_argnum,
    defvjp,
    defvjp_argnum,
    notrace_primitive,
    primitive,
    vspace,
)
from .util import subvals

# Keep references to the real builtins under underscore-suffixed names, then
# rebind the plain names to notrace_primitive-wrapped versions. Code further
# down in this module that calls `isinstance` / `type` therefore gets the
# wrapped versions.
# NOTE(review): notrace_primitive is defined outside this file; presumably it
# applies the function to unboxed values without recording it on the trace --
# confirm in tracer.py.
isinstance_ = isinstance
isinstance = notrace_primitive(isinstance)

type_ = type
type = notrace_primitive(type)

# Saved before the `tuple`, `list` and `dict` classes defined later in this
# module shadow the builtin names.
tuple_, list_, dict_ = tuple, list, dict


@primitive
def container_take(A, idx):
    """Return the entry (or slice) of container ``A`` selected by ``idx``."""
    selected = A[idx]
    return selected


def grad_container_take(ans, A, idx):
    """Build the VJP of ``container_take`` with respect to the container ``A``.

    The returned function scatters an incoming gradient ``g`` back to position
    ``idx`` of a container shaped like ``A`` by calling ``container_untake``.
    The vector space of ``A`` is looked up when the VJP is applied, not when
    it is built.
    """

    def vjp(g):
        return container_untake(g, idx, vspace(A))

    return vjp


# Register derivatives of container_take: a VJP for argnum 0 (the container),
# and a JVP marked "same".
# NOTE(review): assuming `defjvp` from .extend is core.defjvp, "same" means the
# tangent is pushed through container_take itself -- confirm.
defvjp(container_take, grad_container_take)
defjvp(container_take, "same")


class SequenceBox(Box):
    """Box used for tuple and list values (registered for both below).

    Indexing goes through the ``container_take`` primitive. Length, membership
    and ``index`` are answered directly from the wrapped ``_value``.
    """

    __slots__ = []
    __getitem__ = container_take

    def __len__(self):
        return len(self._value)

    def __add__(self, other):
        # `self + other`: each element of `other` is passed as a separate
        # argument to the sequence_extend_right primitive.
        return sequence_extend_right(self, *other)

    def __radd__(self, other):
        # `other + self`: the elements of `other` are prepended via
        # sequence_extend_left.
        return sequence_extend_left(self, *other)

    def __contains__(self, elt):
        return elt in self._value

    def index(self, elt):
        return self._value.index(elt)


# Use SequenceBox for both builtin sequence types.
SequenceBox.register(tuple_)
SequenceBox.register(list_)


class DictBox(Box):
    """Box used for dict values (registered for the builtin dict below).

    Key iteration, length and membership come from the wrapped ``_value``.
    Value access goes through ``self[k]``, i.e. the ``container_take``
    primitive.
    """

    __slots__ = []
    __getitem__ = container_take

    def __len__(self):
        return len(self._value)

    def __iter__(self):
        return self._value.__iter__()

    def __contains__(self, elt):
        return elt in self._value

    def items(self):
        """Return a sequence of ``(key, self[key])`` pairs (not a dict view)."""
        # NOTE: `list` here is this module's `list` class, not the builtin.
        return list(self.iteritems())

    def keys(self):
        """Return a sequence of the keys (not a dict view)."""
        return list(self.iterkeys())

    def values(self):
        """Return a sequence of ``self[key]`` for every key (not a dict view)."""
        return list(self.itervalues())

    def iteritems(self):
        return ((k, self[k]) for k in self)

    def iterkeys(self):
        return iter(self)

    def itervalues(self):
        return (self[k] for k in self)

    def get(self, k, d=None):
        """Return ``self[k]`` if ``k`` is a key of the wrapped dict, else ``d``."""
        return self[k] if k in self else d


# Use DictBox for the builtin dict type.
DictBox.register(dict_)


@primitive
def container_untake(x, idx, vs):
    """Wrap ``x`` as a sparse contribution to position ``idx`` of a container.

    ``vs`` is the container's vector space. The result is a ``SparseObject``
    whose ``mut_add`` callback, given a full container ``A``, accumulates ``x``
    into ``A[idx]`` (element-wise when ``idx`` is a slice) and returns the
    container produced by ``vs._subval``.
    """
    if isinstance(idx, slice):

        def accum(result):
            # One element vspace per position covered by the slice.
            return [elt_vs._mut_add(a, b) for elt_vs, a, b in zip(vs.shape[idx], result, x)]

    else:

        def accum(result):
            return vs.shape[idx]._mut_add(result, x)

    def mut_add(A):
        updated = accum(A[idx])
        return vs._subval(A, idx, updated)

    return SparseObject(vs, mut_add)


# The VJP of container_untake w.r.t. `x` (argnum 0) takes position `idx` back
# out of the incoming gradient.
defvjp(container_untake, lambda ans, x, idx, _: lambda g: container_take(g, idx))
defjvp(container_untake, "same")


@primitive
def sequence_extend_right(seq, *elts):
    """Return ``seq`` with ``elts`` appended, keeping the type of ``seq``."""
    suffix = type(seq)(elts)
    return seq + suffix


def grad_sequence_extend_right(argnum, ans, args, kwargs):
    """Build the VJP of ``sequence_extend_right`` for a single argument.

    ``args`` is ``(seq, *elts)``. For ``argnum == 0`` the gradient is the
    leading ``len(seq)`` entries of ``g``; for an appended element it is the
    single entry of ``g`` at that element's position in the result.
    """
    seq, _appended = args[0], args[1:]

    def vjp(g):
        prefix_len = len(seq)
        if argnum == 0:
            return g[:prefix_len]
        # argnum 1 is the first appended element, which sits right after seq.
        return g[prefix_len + argnum - 1]

    return vjp


# Register the per-argnum VJP maker above for sequence_extend_right.
defvjp_argnum(sequence_extend_right, grad_sequence_extend_right)


@primitive
def sequence_extend_left(seq, *elts):
    """Return ``seq`` with ``elts`` prepended, keeping the type of ``seq``."""
    prefix = type(seq)(elts)
    return prefix + seq


def grad_sequence_extend_left(argnum, ans, args, kwargs):
    """Build the VJP of ``sequence_extend_left`` for a single argument.

    ``args`` is ``(seq, *elts)``. For ``argnum == 0`` the gradient is ``g``
    with the first ``len(elts)`` entries dropped; for a prepended element it
    is the single entry of ``g`` at that element's position in the result.
    """
    _seq, prepended = args[0], args[1:]

    def vjp(g):
        if argnum == 0:
            return g[len(prepended) :]
        # argnum 1 is the first prepended element, i.e. position 0 of the result.
        return g[argnum - 1]

    return vjp


# Register the per-argnum VJP maker above for sequence_extend_left.
defvjp_argnum(sequence_extend_left, grad_sequence_extend_left)


@primitive
def make_sequence(seq_type, *args):
    """Build a sequence of type ``seq_type`` whose elements are ``args``."""
    return seq_type(args)


# Argnum 0 is `seq_type`, so element i of the result is argument i + 1; the
# gradient for argnum k is therefore entry k - 1 of the incoming gradient.
defvjp_argnum(make_sequence, lambda argnum, *args: lambda g: g[argnum - 1])


def fwd_grad_make_sequence(argnum, g, ans, seq_type, *args, **kwargs):
    """JVP of ``make_sequence`` for one argument.

    Places tangent ``g`` at position ``argnum - 1`` of a container shaped
    like ``ans``, via ``container_untake``.
    """
    return container_untake(g, argnum - 1, vspace(ans))


defjvp_argnum(make_sequence, fwd_grad_make_sequence)


class TupleMeta(type(tuple_)):
    """Metaclass making ``isinstance(x, tuple)`` behave like a check against the builtin tuple."""

    def __instancecheck__(self, instance):
        # `isinstance` here is this module's notrace_primitive-wrapped version.
        return isinstance(instance, tuple_)


class tuple(tuple_, metaclass=TupleMeta):
    """Replacement for the builtin ``tuple`` constructor.

    ``tuple(xs)`` returns the result of the ``make_sequence`` primitive called
    with the builtin tuple type and the elements of ``xs``. It does not return
    an instance of this class.
    """

    def __new__(cls, xs):
        return make_sequence(tuple_, *xs)


class ListMeta(type_):
    """Metaclass making ``isinstance(x, list)`` behave like a check against the builtin list."""

    def __instancecheck__(self, instance):
        # `isinstance` here is this module's notrace_primitive-wrapped version.
        return isinstance(instance, list_)


class list(list_, metaclass=ListMeta):
    """Replacement for the builtin ``list`` constructor.

    ``list(xs)`` returns the result of the ``make_sequence`` primitive called
    with the builtin list type and the elements of ``xs``. It does not return
    an instance of this class.
    """

    def __new__(cls, xs):
        return make_sequence(list_, *xs)


class DictMeta(type_):
    """Metaclass making ``isinstance(x, dict)`` behave like a check against the builtin dict."""

    def __instancecheck__(self, instance):
        # `isinstance` here is this module's notrace_primitive-wrapped version.
        return isinstance(instance, dict_)


class dict(dict_, metaclass=DictMeta):
    """Replacement for the builtin ``dict`` constructor.

    The arguments are first used to build a plain builtin dict. A non-empty
    result is rebuilt through the ``_make_dict`` primitive; an empty one is
    returned as the plain builtin dict.
    """

    def __new__(cls, *args, **kwargs):
        result = dict_(*args, **kwargs)
        if result:
            # `list` is this module's `list` class, so the values are gathered
            # via make_sequence before being handed to _make_dict.
            return _make_dict(result.keys(), list(result.values()))
        return result


@primitive
def _make_dict(keys, vals):
    """Build a builtin dict by pairing ``keys`` with ``vals`` in order."""
    return dict_(zip(keys, vals))


# Only `vals` (argnum 1) gets a VJP: the gradient is the sequence of g[key],
# taken in the order of `keys`.
defvjp(_make_dict, lambda ans, keys, vals: lambda g: list(g[key] for key in keys), argnums=(1,))


class ContainerVSpace(VSpace):
    """Vector space of a container, built from the vector spaces of its entries.

    Every operation is applied entry-wise with the entry's own vector space.
    ``_map``, ``_values``, ``_kv_pairs`` and ``_subval`` are not defined here;
    subclasses in this module (``SequenceVSpace``, ``DictVSpace``) supply them.
    """

    def __init__(self, value):
        # `_map` iterates over self.shape, so shape is first set to the raw
        # container and then replaced by the container of per-entry vspaces.
        self.shape = value
        self.shape = self._map(vspace)

    @property
    def size(self):
        """Sum of the ``size`` of every entry's vector space."""
        return sum(self._values(self._map(lambda vs: vs.size)))

    def zeros(self):
        return self._map(lambda vs: vs.zeros())

    def ones(self):
        return self._map(lambda vs: vs.ones())

    def randn(self):
        return self._map(lambda vs: vs.randn())

    def standard_basis(self):
        """Yield containers that are zero except for one entry's basis vector."""
        zero = self.zeros()
        for i, vs in self._kv_pairs(self.shape):
            for x in vs.standard_basis():
                yield self._subval(zero, i, x)

    def _add(self, xs, ys):
        return self._map(lambda vs, x, y: vs._add(x, y), xs, ys)

    def _mut_add(self, xs, ys):
        return self._map(lambda vs, x, y: vs._mut_add(x, y), xs, ys)

    def _scalar_mul(self, xs, a):
        return self._map(lambda vs, x: vs._scalar_mul(x, a), xs)

    def _inner_prod(self, xs, ys):
        # Sum of the entry-wise inner products.
        return sum(self._values(self._map(lambda vs, x, y: vs._inner_prod(x, y), xs, ys)))

    def _covector(self, xs):
        return self._map(lambda vs, x: vs._covector(x), xs)


class SequenceVSpace(ContainerVSpace):
    """Container vector space for positional sequences.

    ``seq_type`` is not defined here; subclasses set it to the concrete
    sequence constructor.
    """

    def _values(self, x):
        # A sequence is already an iterable of its values.
        return x

    def _kv_pairs(self, x):
        return enumerate(x)

    def _map(self, f, *args):
        # Apply f(vs, *entries) position by position and rebuild a seq_type.
        return self.seq_type(map(f, self.shape, *args))

    def _subval(self, xs, idx, x):
        # Rebuild `xs` with position `idx` set to `x` (via util.subvals).
        return self.seq_type(subvals(xs, [(idx, x)]))


class ListVSpace(SequenceVSpace):
    """Sequence vector space whose results are builtin lists."""

    seq_type = list_


class TupleVSpace(SequenceVSpace):
    """Sequence vector space whose results are builtin tuples."""

    seq_type = tuple_


class DictVSpace(ContainerVSpace):
    """Container vector space for dicts, keyed like the original dict."""

    def _values(self, x):
        return x.values()

    def _kv_pairs(self, x):
        return x.items()

    def _map(self, f, *args):
        # Apply f(vs, *entries) key by key; every arg must contain every key
        # of self.shape.
        return {k: f(vs, *[x[k] for x in args]) for k, vs in self.shape.items()}

    def _subval(self, xs, idx, x):
        # Copy `xs` and set key `idx` to `x`.
        # NOTE(review): `dict` resolves to this module's `dict` class, not the
        # builtin, so a non-empty copy is built through _make_dict -- confirm
        # that this is intended.
        d = dict(xs.items())
        d[idx] = x
        return d


# Map each builtin container type to its vector space class. No
# NamedTupleVSpace registration appears in this part of the file.
ListVSpace.register(list_)
TupleVSpace.register(tuple_)
DictVSpace.register(dict_)


class NamedTupleVSpace(SequenceVSpace):
    """Sequence vector space for types whose constructor takes one argument per field.

    It differs from ``SequenceVSpace`` only in unpacking the values into
    ``seq_type(*values)`` instead of passing one iterable. ``seq_type`` is not
    set here; a subclass must supply it.
    """

    def _map(self, f, *args):
        return self.seq_type(*map(f, self.shape, *args))

    def _subval(self, xs, idx, x):
        return self.seq_type(*subvals(xs, [(idx, x)]))


================================================
FILE: autograd/core.py
================================================
from functools import reduce
from itertools import count

from .tracer import Box, Node, getval, isbox, primitive, toposort, trace
from .util import func, subval

# -------------------- reverse mode --------------------


def make_vjp(fun, x):
    """Trace ``fun`` at ``x`` and return ``(vjp, fun(x))``.

    ``vjp(g)`` runs the backward pass from the traced output node. When the
    trace produces no end node, ``vjp`` ignores ``g`` and returns the zero
    element of the vector space of ``x``.
    """
    root = VJPNode.new_root()
    end_value, end_node = trace(root, fun, x)

    if end_node is not None:

        def vjp(g):
            return backward_pass(g, end_node)

        return vjp, end_value

    def vjp(g):
        return vspace(x).zeros()

    return vjp, end_value


def backward_pass(g, end_node):
    """Propagate gradient ``g`` from ``end_node`` back through the traced graph.

    Nodes are visited in the order given by ``toposort(end_node)``. Each
    node's accumulated gradient is passed to its ``vjp`` and the results are
    added into its parents' pending gradients. The gradient of the last node
    visited is returned.
    """
    # node -> (accumulated gradient, flag) pairs, as produced by add_outgrads.
    pending = {end_node: (g, False)}
    for node in toposort(end_node):
        outgrad = pending.pop(node)
        parent_grads = node.vjp(outgrad[0])
        for parent, parent_grad in zip(node.parents, parent_grads):
            accumulated = pending.get(parent)
            pending[parent] = add_outgrads(accumulated, parent_grad)
    return outgrad[0]


class VJPNode(Node):
    """Trace node for reverse mode: stores its parents and a VJP function."""

    __slots__ = ["parents", "vjp"]

    def __init__(self, value, fun, args, kwargs, parent_argnums, parents):
        """Look up the VJP maker registered for ``fun`` and build this node's VJP.

        Raises:
            NotImplementedError: if ``fun`` has no entry in ``primitive_vjps``.
        """
        self.parents = parents
        try:
            vjpmaker = primitive_vjps[fun]
        except KeyError:
            # Fall back to the object itself when it has no __name__.
            fun_name = getattr(fun, "__name__", fun)
            raise NotImplementedError(f"VJP of {fun_name} wrt argnums {parent_argnums} not defined")
        self.vjp = vjpmaker(parent_argnums, value, args, kwargs)

    def initialize_root(self):
        # A root has no parents, so its VJP produces no parent gradients.
        self.parents = []
        self.vjp = lambda g: ()


# Registry mapping each primitive to its VJP maker.
primitive_vjps = {}


def defvjp_argnums(fun, vjpmaker):
    """Register ``vjpmaker`` as the VJP maker of ``fun``, replacing any earlier one."""
    primitive_vjps.update({fun: vjpmaker})


def defvjp_argnum(fun, vjpmaker):
    """Register a VJP for ``fun`` from a maker that handles one argnum at a time.

    ``vjpmaker(argnum, *args)`` must return a function of the incoming
    gradient. The registered maker calls it once per requested argnum and
    returns a function yielding the per-argnum results lazily.
    """

    def vjp_argnums(argnums, *args):
        per_argnum = []
        for argnum in argnums:
            per_argnum.append(vjpmaker(argnum, *args))
        return lambda g: (vjp(g) for vjp in per_argnum)

    defvjp_argnums(fun, vjp_argnums)


def defvjp(fun, *vjpmakers, **kwargs):
    """Register per-argument VJP makers for the primitive ``fun``.

    Args:
        fun: the primitive whose VJPs are being defined.
        *vjpmakers: one maker per argument, each called as
            ``vjpmaker(ans, *args, **kwargs)`` and returning a function of the
            incoming gradient ``g``. ``None`` stands for a zero gradient
            (see ``translate_vjp``).
        argnums: optional keyword giving the argument positions that the
            makers correspond to. Defaults to 0, 1, 2, ...

    Raises (when the registered VJP is later requested):
        NotImplementedError: if a VJP is needed for an argnum that has no
            maker. The message names the argnum that is actually missing, and
            the same exception is raised however many argnums are requested.
    """
    argnums = kwargs.get("argnums", count())
    vjps_dict = {
        argnum: translate_vjp(vjpmaker, fun, argnum) for argnum, vjpmaker in zip(argnums, vjpmakers)
    }

    def lookup(argnum):
        # Single place that turns a missing entry into a descriptive error.
        try:
            return vjps_dict[argnum]
        except KeyError:
            fun_name = getattr(fun, "__name__", fun)
            raise NotImplementedError(f"VJP of {fun_name} wrt argnum {argnum} not defined")

    def vjp_argnums(argnums, ans, args, kwargs):
        L = len(argnums)
        # These first two cases are just optimizations
        if L == 1:
            vjp = lookup(argnums[0])(ans, *args, **kwargs)
            return lambda g: (vjp(g),)
        elif L == 2:
            argnum_0, argnum_1 = argnums
            # Resolve both makers before calling either one.
            vjp_0_fun = lookup(argnum_0)
            vjp_1_fun = lookup(argnum_1)
            vjp_0 = vjp_0_fun(ans, *args, **kwargs)
            vjp_1 = vjp_1_fun(ans, *args, **kwargs)
            return lambda g: (vjp_0(g), vjp_1(g))
        else:
            vjps = [lookup(argnum)(ans, *args, **kwargs) for argnum in argnums]
            return lambda g: (vjp(g) for vjp in vjps)

    defvjp_argnums(fun, vjp_argnums)


def translate_vjp(vjpfun, fun, argnum):
    """Normalise a user-supplied VJP specification into a VJP maker.

    A callable is returned unchanged. ``None`` becomes a maker whose VJP
    returns the zero element of the vector space of argument ``argnum``.
    Anything else raises ``Exception``.
    """
    if callable(vjpfun):
        return vjpfun
    if vjpfun is not None:
        raise Exception(f"Bad VJP '{vjpfun}' for '{fun.__name__}'")

    def zero_vjp_maker(ans, *args, **kwargs):
        return lambda g: vspace(args[argnum]).zeros()

    return zero_vjp_maker


# -------------------- forward mode --------------------


def make_jvp(fun, x):
    """Return ``jvp(g)``, which evaluates ``fun`` at ``x`` along tangent ``g``.

    Each call re-traces ``fun`` with a fresh root carrying ``g`` and returns
    ``(fun(x), tangent)``. When the trace produces no end node, the tangent is
    the zero element of the output's vector space.
    """

    def jvp(g):
        root = JVPNode.new_root(g)
        end_value, end_node = trace(root, fun, x)
        tangent = vspace(end_value).zeros() if end_node is None else end_node.g
        return end_value, tangent

    return jvp


class JVPNode(Node):
    """Trace node for forward mode: stores the tangent ``g`` of its value."""

    __slots__ = ["g"]

    def __init__(self, value, fun, args, kwargs, parent_argnums, parents):
        """Compute this node's tangent from its parents' tangents.

        Raises:
            NotImplementedError: if ``fun`` has no entry in ``primitive_jvps``.
        """
        parent_gs = [parent.g for parent in parents]
        try:
            jvpmaker = primitive_jvps[fun]
        except KeyError:
            # Fall back to the object itself when it has no __name__.
            name = getattr(fun, "__name__", fun)
            raise NotImplementedError(f"JVP of {name} wrt argnums {parent_argnums} not defined")
        self.g = jvpmaker(parent_argnums, parent_gs, value, args, kwargs)

    def initialize_root(self, g):
        # The root's tangent is the input tangent supplied by the caller.
        self.g = g


# Registry mapping each primitive to its JVP maker.
primitive_jvps = {}


def defjvp_argnums(fun, jvpmaker):
    """Register ``jvpmaker`` as the JVP maker of ``fun``, replacing any earlier one."""
    primitive_jvps.update({fun: jvpmaker})


def defjvp_argnum(fun, jvpmaker):
    """Register a JVP for ``fun`` from a maker that handles one argnum at a time.

    ``jvpmaker(argnum, g, ans, args, kwargs)`` returns the output tangent
    contributed by one argument. The contributions of all requested argnums
    are combined with ``sum_outgrads``.
    """

    def jvp_argnums(argnums, gs, ans, args, kwargs):
        contributions = (jvpmaker(argnum, g, ans, args, kwargs) for argnum, g in zip(argnums, gs))
        return sum_outgrads(contributions)

    defjvp_argnums(fun, jvp_argnums)


def defjvp(fun, *jvpfuns, **kwargs):
    """Register per-argument JVP functions for the primitive ``fun``.

    Each entry of ``jvpfuns`` is normalised by ``translate_jvp`` and paired
    with an argument position taken from the ``argnums`` keyword (default
    0, 1, 2, ...). The registered JVP sums the contributions of the requested
    argnums with ``sum_outgrads``.
    """
    argnums = kwargs.get("argnums", count())
    jvps_dict = {}
    for argnum, jvpfun in zip(argnums, jvpfuns):
        jvps_dict[argnum] = translate_jvp(jvpfun, fun, argnum)

    def jvp_argnums(argnums, gs, ans, args, kwargs):
        terms = (jvps_dict[argnum](g, ans, *args, **kwargs) for argnum, g in zip(argnums, gs))
        return sum_outgrads(terms)

    defjvp_argnums(fun, jvp_argnums)


def translate_jvp(jvpfun, fun, argnum):
    """Normalise a user-supplied JVP specification into a JVP function.

    ``None`` gives a JVP returning the zero element of the output's vector
    space. ``"same"`` gives a JVP that calls ``fun`` itself with argument
    ``argnum`` replaced by the tangent. A callable is returned unchanged.
    Anything else raises ``Exception``.
    """
    if jvpfun is None:

        def zero_jvp(g, ans, *a, **k):
            return vspace(ans).zeros()

        return zero_jvp

    if jvpfun == "same":

        def same_jvp(g, ans, *args, **kwargs):
            return fun(*subval(args, argnum, g), **kwargs)

        return same_jvp

    if callable(jvpfun):
        return jvpfun

    raise Exception(f"Bad JVP '{jvpfun}' for '{fun.__name__}'")


def def_linear(fun):
    """Declare ``fun`` linear in every argument.

    The JVP for any argnum is ``fun`` itself, called with that argument
    replaced by its tangent.
    """

    def linear_jvp(argnum, g, ans, args, kwargs):
        return fun(*subval(args, argnum, g), **kwargs)

    defjvp_argnum(fun, linear_jvp)


# -------------------- vector behavior --------------------


def add_outgrads(prev_g_flagged, g):
    """Accumulate gradient ``g`` into a previously accumulated gradient.

    ``prev_g_flagged`` is either falsy (nothing accumulated yet) or a pair
    ``(prev_g, mutable)``. Returns a new ``(gradient, mutable)`` pair. The
    flag is ``False`` only when ``g`` is the first, non-sparse contribution
    and is passed through untouched.
    """
    sparse = type(g) in sparse_object_types

    # First contribution: nothing to add to yet.
    if not prev_g_flagged:
        if sparse:
            return sparse_add(vspace(g), None, g), True
        return g, False

    vs = vspace(g)
    prev_g, mutable = prev_g_flagged

    if sparse:
        if not mutable:
            # Obtain a value that may be mutated before adding the sparse part.
            prev_g = vs.mut_add(None, prev_g)
        return sparse_add(vs, prev_g, g), True

    if mutable:
        return vs.mut_add(prev_g, g), True
    return vs.add(prev_g, g), True


def sum_outgrads(gs):
    """Fold the gradients in ``gs`` together with ``add_outgrads`` and return the sum."""
    accumulated = None
    for g in gs:
        accumulated = add_outgrads(accumulated, g)
    return accumulated[0]


@primitive
def sparse_add(vs, x_prev, x_new):
    """Add the sparse object ``x_new`` into ``x_prev``.

    ``x_prev`` may be ``None``, in which case ``vs.zeros()`` is used as the
    starting value. The addition is delegated to ``x_new.mut_add``.
    """
    if x_prev is None:
        x_prev = vs.zeros()
    return x_new.mut_add(x_prev)


class VSpace:
    """Base class describing the vector space a value belongs to.

    Concrete vector spaces are looked up through ``VSpace.mappings`` (filled
    by ``register``). The constructors ``zeros``/``ones``/``standard_basis``/
    ``randn`` and ``_inner_prod`` fail with an assertion in this base class.
    The public arithmetic methods are primitives that delegate to the
    underscore-prefixed implementations.
    """

    __slots__ = []
    # Maps a value type to a callable that builds the VSpace for a value.
    mappings = {}
    iscomplex = False

    def __init__(self, value):
        pass

    def zeros(self):
        """Not implemented in the base class (asserts)."""
        assert False, repr(self)

    def ones(self):
        """Not implemented in the base class (asserts)."""
        assert False, repr(self)

    def standard_basis(self):
        """Not implemented in the base class (asserts)."""
        assert False, repr(self)

    def randn(self):
        """Not implemented in the base class (asserts)."""
        assert False, repr(self)

    @primitive
    def mut_add(self, x_prev, x_new):
        """Add ``x_new`` onto ``x_prev`` via ``_mut_add``; None starts from zeros."""
        x_prev = x_prev if x_prev is not None else self.zeros()
        return self._mut_add(x_prev, x_new)

    @primitive
    def add(self, x_prev, x_new):
        """Return ``_add(x_prev, x_new)``."""
        return self._add(x_prev, x_new)

    @primitive
    def scalar_mul(self, x, a):
        """Return ``_scalar_mul(x, a)``."""
        return self._scalar_mul(x, a)

    @primitive
    def inner_prod(self, x, y):
        """Return ``_inner_prod(x, y)``."""
        return self._inner_prod(x, y)

    @primitive
    def covector(self, x):
        """Return ``_covector(x)``."""
        return self._covector(x)

    def _add(self, x, y):
        return x + y

    def _mut_add(self, x, y):
        # Uses the augmented assignment operator on x, then returns x.
        x += y
        return x

    def _scalar_mul(self, x, a):
        return x * a

    def _inner_prod(self, x, y):
        assert False

    def _covector(self, x):
        # Identity in the base class.
        return x

    def __eq__(self, other):
        # Equal when both the concrete type and the instance attributes match.
        return type(self) == type(other) and self.__dict__ == other.__dict__

    def __repr__(self):
        return f"{type(self).__name__}_{self.__dict__}"

    @classmethod
    def register(cls, value_type, vspace_maker=None):
        """Associate ``value_type`` with ``vspace_maker``, or with ``cls`` if none is given."""
        if vspace_maker:
            VSpace.mappings[value_type] = vspace_maker
        else:
            VSpace.mappings[value_type] = cls


def vspace(value):
    """Return the vector space registered for ``type(value)``.

    If no entry exists and ``value`` is a box, the lookup is retried on the
    boxed value; otherwise a ``TypeError`` listing the known types is raised.
    """
    try:
        maker = VSpace.mappings[type(value)]
        return maker(value)
    except KeyError:
        if not isbox(value):
            raise TypeError(
                f"Can't find vector space for value {value} of type {type(value)}. "
                f"Valid types are {VSpace.mappings.keys()}"
            )
        return vspace(getval(value))


class SparseBox(Box):
    """Box subclass registered below for ``SparseObject`` values; adds no state."""

    __slots__ = []


class SparseObject:
    """Pairs a vector space ``vs`` with a ``mut_add`` callable.

    ``sparse_add`` calls ``mut_add(x_prev)`` to accumulate this object onto an
    existing value.
    """

    __slots__ = ["vs", "mut_add"]

    def __init__(self, vs, mut_add):
        self.vs = vs
        self.mut_add = mut_add


# The vector space of a SparseObject is the one it carries in `.vs`.
VSpace.register(SparseObject, lambda x: x.vs)
SparseBox.register(SparseObject)
# Types that add_outgrads routes through sparse_add.
sparse_object_types = {SparseObject, SparseBox}

# -------------------- core reverse mode grads --------------------

# VJP maker that ignores whatever it is called with and returns a function
# passing the incoming gradient g through unchanged.
identity_vjp = lambda argnums, *args: lambda g: g
# In every registration below the first slot (the `vs` argument) is None.
defvjp(sparse_add, None, identity_vjp, identity_vjp)
defvjp(func(VSpace.add), None, identity_vjp, identity_vjp)
defvjp(func(VSpace.mut_add), None, identity_vjp, identity_vjp)
defvjp(
    func(VSpace.inner_prod),
    None,
    # w.r.t. x: the other operand y scaled by g, as a covector (and vice versa).
    lambda ans, vs, x, y: lambda g: vs.covector(vs.scalar_mul(y, g)),
    lambda ans, vs, x, y: lambda g: vs.covector(vs.scalar_mul(x, g)),
)
defvjp(func(VSpace.covector), None, lambda ans, vs, x: lambda g: vs.covector(g))
defvjp(
    func(VSpace.scalar_mul),
    None,
    lambda ans, vs, x, a: lambda g: vs.covector(vs.scalar_mul(vs.covector(g), a)),
    lambda ans, vs, x, a: lambda g: vs.inner_prod(g, vs.covector(x)),
)

# -------------------- core forward mode grads --------------------

# JVP that returns the incoming tangent g unchanged.
identity_jvp = lambda g, *args, **kwargs: g
# As handled by translate_jvp: None gives a zero JVP for that argument, and
# "same" applies the function itself with the argument replaced by the tangent.
defjvp(sparse_add, None, identity_jvp, identity_jvp)
defjvp(func(VSpace.mut_add), None, identity_jvp, identity_jvp)
defjvp(func(VSpace.add), None, identity_jvp, identity_jvp)
defjvp(func(VSpace.scalar_mul), None, "same", "same")
defjvp(func(VSpace.inner_prod), None, "same", "same")
defjvp(func(VSpace.covector), None, "same")

# -------------------- deprecation warnings -----------------------

import warnings

# Warning template shared by the deprecated registration helpers below; the
# single `{}` placeholder receives the name of the deprecated method.
deprecated_defvjp_message = """
The {} method is deprecated. See the update guide and tutorial:
https://github.com/HIPS/autograd/blob/master/docs/updateguide.md
https://github.com/HIPS/autograd/blob/master/docs/tutorial.md"""


def deprecated_defvjp(primitive_fun):
    """Build the legacy ``defvjp`` method attached to ``primitive_fun``.

    The returned function accepts an old-style ``vjpmaker(g, ans, vs, gvs,
    *args, **kwargs)`` for one argument, warns, and re-registers every VJP
    collected so far through the current ``defvjp``.
    """
    message = deprecated_defvjp_message.format("defvjp")
    registered = {}

    def defvjp_unstaged(vjpmaker, argnum=0):
        warnings.warn(message)

        def staged_vjpmaker(ans, *args, **kwargs):
            def vjp(g):
                arg_vs = vspace(args[argnum])
                g_vs = vspace(g)
                return vjpmaker(g, ans, arg_vs, g_vs, *args, **kwargs)

            return vjp

        registered[argnum] = staged_vjpmaker
        # Re-register everything seen so far, ordered by argument number.
        ordered_argnums = tuple(sorted(registered.keys()))
        makers = [registered[n] for n in ordered_argnums]
        defvjp(primitive_fun, *makers, argnums=ordered_argnums)

    return defvjp_unstaged


def deprecated_defvjp_is_zero(primitive_fun):
    """Build the legacy ``defvjp_is_zero`` method attached to ``primitive_fun``.

    Each call warns, adds ``argnums`` to the set collected so far, and
    registers ``None`` as the VJP for every collected argument number.
    """
    message = deprecated_defvjp_message.format("defvjp_is_zero")
    zero_argnums = set()

    def defvjp_is_zero(argnums=(0,)):
        warnings.warn(message)
        zero_argnums.update(set(argnums))
        ordered = sorted(zero_argnums)
        placeholders = [None] * len(ordered)
        defvjp(primitive_fun, *placeholders, argnums=ordered)

    return defvjp_is_zero


def deprecated_defgrad(primitive_fun):
    """Build the legacy ``defgrad`` method attached to ``primitive_fun``.

    Each call warns, records ``gradfun`` for ``argnum`` and re-registers all
    recorded functions through ``defvjp`` in argument-number order.
    """
    message = deprecated_defvjp_message.format("defgrad")
    recorded = {}

    def defgrad(gradfun, argnum=0):
        warnings.warn(message)
        recorded[argnum] = gradfun
        ordered_argnums = tuple(sorted(recorded.keys()))
        makers = [recorded[n] for n in ordered_argnums]
        defvjp(primitive_fun, *makers, argnums=ordered_argnums)

    return defgrad


# Keep a handle on the undecorated `primitive` before the name is rebound below.
primitive_ = primitive


def primitive_with_deprecation_warnings(f_raw):
    """Wrap ``f_raw`` as a primitive and attach the deprecated registration
    methods ``defvjp``, ``defvjp_is_zero`` and ``defgrad`` to the result."""
    f_wrapped = primitive_(f_raw)
    legacy_hooks = (
        ("defvjp", deprecated_defvjp),
        ("defvjp_is_zero", deprecated_defvjp_is_zero),
        ("defgrad", deprecated_defgrad),
    )
    for attr_name, make_hook in legacy_hooks:
        setattr(f_wrapped, attr_name, make_hook(f_wrapped))
    return f_wrapped


primitive = primitive_with_deprecation_warnings


================================================
FILE: autograd/differential_operators.py
================================================
"""Convenience functions built on top of `make_vjp`."""

from collections import OrderedDict

try:
    from inspect import getfullargspec as _getargspec  # Python 3
except ImportError:
    from inspect import getargspec as _getargspec  # Python 2
import warnings

import autograd.numpy as np

from .builtins import tuple as atuple
from .core import make_jvp as _make_jvp
from .core import make_vjp as _make_vjp
from .extend import defvjp_argnum, primitive, vspace
from .wrap_util import unary_to_nary

# Public versions of the core builders, passed through `unary_to_nary`.
# NOTE(review): presumably this adds the `argnum` selection used throughout this
# module (e.g. `make_vjp(fun, argnum)` in `checkpoint`) — confirm in wrap_util.
make_vjp = unary_to_nary(_make_vjp)
make_jvp = unary_to_nary(_make_jvp)


@unary_to_nary
def grad(fun, x):
    """
    Builds the gradient function of `fun` with respect to the positional
    argument selected by `argnum`. The result accepts the same arguments as
    `fun` and returns the gradient, which has the same type as that argument.
    `fun` must return a scalar; otherwise a TypeError is raised."""
    vjp, ans = _make_vjp(fun, x)
    ans_vspace = vspace(ans)
    if ans_vspace.size != 1:
        raise TypeError(
            "Grad only applies to real scalar-output functions. "
            "Try jacobian, elementwise_grad or holomorphic_grad."
        )
    return vjp(ans_vspace.ones())


@unary_to_nary
def elementwise_grad(fun, x):
    """
    Computes, in a single backward pass, the sum of each column of the
    Jacobian of `fun`. For a diagonal Jacobian this equals its diagonal.
    Raises TypeError when the output vector space is complex.
    """
    vjp, ans = _make_vjp(fun, x)
    ans_vspace = vspace(ans)
    if ans_vspace.iscomplex:
        raise TypeError("Elementwise_grad only applies to real-output functions.")
    return vjp(ans_vspace.ones())


@unary_to_nary
def deriv(fun, x):
    """Applies the JVP of `fun` at `x` to `vspace(x).ones()` and returns the
    second element of the resulting pair."""
    jvp = _make_jvp(fun, x)
    tangent_in = vspace(x).ones()
    return jvp(tangent_in)[1]


@unary_to_nary
def jacobian(fun, x):
    """
    Builds the Jacobian of `fun` with respect to the positional argument
    selected by `argnum`, which must be a scalar or array. Unlike `grad`, the
    output of `fun` need not be scalar, but some argument types (such as lists
    or dicts) cannot be differentiated against. For an input of shape
    (in1, in2, ...) and an output of shape (out1, out2, ...) the result has
    shape (out1, out2, ..., in1, in2, ...).
    """
    vjp, ans = _make_vjp(fun, x)
    out_vspace = vspace(ans)
    full_shape = out_vspace.shape + vspace(x).shape
    # One backward pass per standard-basis element of the output space.
    rows = (vjp(basis_vector) for basis_vector in out_vspace.standard_basis())
    return np.reshape(np.stack(rows), full_shape)


@unary_to_nary
def holomorphic_grad(fun, x):
    """Gradient of the real part of `fun`, evaluated at `x`.

    Emits a warning when the vector space of `x` is not complex."""
    if not vspace(x).iscomplex:
        warnings.warn("Input to holomorphic_grad is not complex")

    def real_part(z):
        return np.real(fun(z))

    return grad(real_part)(x)


def grad_named(fun, argname):
    """Gradient of `fun` with respect to the argument called `argname`.

    The name is resolved against the positional argument names of `fun`, so
    it doesn't work on *args or **kwargs."""
    positional_names = _getargspec(fun).args
    return grad(fun, positional_names.index(argname))


@unary_to_nary
def hessian(fun, x):
    "Exact Hessian of `fun` at `x`, computed as the Jacobian of the Jacobian."
    first_order = jacobian(fun)
    second_order = jacobian(first_order)
    return second_order(x)


@unary_to_nary
def make_hvp(fun, x):
    """Builds a Hessian-vector product evaluator at the point `x` by taking
    the VJP of `grad(fun)` there. Useful when many products are needed at one
    point, since the forward pass is shared."""
    gradient_fun = grad(fun)
    return _make_vjp(gradient_fun, x)


def hessian_tensor_product(fun, argnum=0):
    """Builds a function returning the exact Hessian-tensor product.

    The result is called as (*args, tensor, **kwargs): the last positional
    argument is the tensor, the rest are forwarded to `fun`. For vectors it
    takes roughly 4x as long to evaluate as the original function."""
    fun_grad = grad(fun, argnum)

    def vector_dot_grad(*args, **kwargs):
        fun_args = args[:-1]
        tensor = args[-1]
        gradient = fun_grad(*fun_args, **kwargs)
        return np.tensordot(gradient, tensor, np.ndim(tensor))

    # Differentiating <grad f, tensor> once more yields the product.
    return grad(vector_dot_grad, argnum)


hessian_vector_product = hessian_tensor_product


def tensor_jacobian_product(fun, argnum=0):
    """Builds a function returning the exact tensor-Jacobian product, i.e. the
    Jacobian left-multiplied by a tensor.

    The result is called as (*args, tensor, **kwargs): the last positional
    argument is the tensor, the rest are forwarded to `fun`."""

    def vector_dot_fun(*args, **kwargs):
        fun_args = args[:-1]
        tensor = args[-1]
        return np.tensordot(tensor, fun(*fun_args, **kwargs), axes=np.ndim(tensor))

    return jacobian(vector_dot_fun, argnum)


vector_jacobian_product = tensor_jacobian_product


@unary_to_nary
def make_jvp_reversemode(fun, x):
    """Builds a Jacobian-vector product evaluator at a point using two
    reverse-mode passes. Costs roughly 1.5x the FLOPs of forward mode, and
    memory grows with the number of primitives applied while evaluating f,
    plus other overheads. See j-towns.github.io/2017/06/12/A-new-trick.html."""
    vjp, y = _make_vjp(fun, x)
    zero_cotangent = vspace(y).zeros()
    # The VJP is linear in its argument, so its own VJP is the JVP of fun.
    vjp_vjp, _ = _make_vjp(vjp, zero_cotangent)
    return vjp_vjp


# TODO(mattjj): update this function using make_jvp and const_graph
def make_ggnvp(f, g=lambda x: 1.0 / 2 * np.sum(x**2, axis=-1), f_argnum=0):
    """Builds an evaluator for generalized-Gauss-Newton-vector products at a
    point: v -> J_f^T (H_g (J_f v)). Slightly more expensive than mixed-mode."""

    @unary_to_nary
    def _make_ggnvp(f, x):
        f_vjp, f_x = _make_vjp(f, x)
        g_hvp, grad_g_x = _make_vjp(grad(g), f_x)
        zero_cotangent = vspace(grad_g_x).zeros()
        # VJP of the (linear) VJP of f acts as the JVP of f.
        f_jvp, _ = _make_vjp(f_vjp, zero_cotangent)

        def ggnvp(v):
            pushed_forward = f_jvp(v)
            curved = g_hvp(pushed_forward)
            return f_vjp(curved)

        return ggnvp

    return _make_ggnvp(f, f_argnum)


@unary_to_nary
def value_and_grad(fun, x):
    """Builds a function returning the pair (value, gradient) of `fun`.
    Suitable for use in scipy.optimize. Raises TypeError for non-scalar
    output."""
    vjp, ans = _make_vjp(fun, x)
    ans_vspace = vspace(ans)
    if ans_vspace.size != 1:
        raise TypeError(
            "value_and_grad only applies to real scalar-output "
            "functions. Try jacobian, elementwise_grad or "
            "holomorphic_grad."
        )
    gradient = vjp(ans_vspace.ones())
    return ans, gradient


@unary_to_nary
def grad_and_aux(fun, x):
    """For a `fun` returning two outputs, builds a function returning the
    gradient of the first output together with the unmodified second one."""

    def as_tuple(arg):
        return atuple(fun(arg))

    vjp, (ans, aux) = _make_vjp(as_tuple, x)
    # Seed the first output with ones and the auxiliary output with zeros.
    seed = (vspace(ans).ones(), vspace(aux).zeros())
    return vjp(seed), aux


def multigrad_dict(fun):
    """Takes gradients wrt all arguments simultaneously,
    returns a dict mapping 'argname' to 'gradval'.

    The returned function accepts the same arguments as `fun`. Parameters
    that are not passed are filled in from their defaults before
    differentiating, and the result is an OrderedDict keyed by parameter
    name in signature order.
    """
    # `inspect.signature` is the standard-library original of the `funcsigs`
    # backport, so no third-party package is required.
    from inspect import signature

    sig = signature(fun)

    def select(preds, lst):
        # Partition `lst` into one bucket per predicate (first match wins),
        # plus a final bucket for items matching none of them.
        idx = lambda item: next((i for i, pred in enumerate(preds) if pred(item)), len(preds))
        results = [[] for _ in preds] + [[]]
        for item in lst:
            results[idx(item)].append(item)
        return results

    is_var_pos = lambda name: sig.parameters[name].kind == sig.parameters[name].VAR_POSITIONAL
    is_var_kwd = lambda name: sig.parameters[name].kind == sig.parameters[name].VAR_KEYWORD
    var_pos, var_kwd, argnames = select([is_var_pos, is_var_kwd], sig.parameters)

    todict = lambda dct: {key: dct[key] for key in dct}

    def apply_defaults(arguments):
        # Bound arguments take precedence; everything else falls back to the
        # parameter's declared default.
        defaults = {
            name: param.default for name, param in sig.parameters.items() if param.default is not param.empty
        }
        return OrderedDict(
            (name, arguments[name] if name in arguments else defaults[name]) for name in sig.parameters
        )

    def gradfun(*args, **kwargs):
        bindings = sig.bind(*args, **kwargs)

        # Accessors that rebuild the call to `fun` from a dict of arguments.
        star_args = lambda dct: tuple(dct[var_pos[0]]) if var_pos else ()
        star_kwargs = lambda dct: todict(dct[var_kwd[0]]) if var_kwd else {}
        others = lambda dct: tuple(dct[argname] for argname in argnames if argname not in var_kwd + var_pos)

        newfun = lambda dct: fun(*(others(dct) + star_args(dct)), **star_kwargs(dct))

        argdict = apply_defaults(bindings.arguments)
        grad_dict = grad(newfun)(dict(argdict))
        return OrderedDict((argname, grad_dict[argname]) for argname in argdict)

    return gradfun


def checkpoint(fun):
    """Returns a checkpointed version of `fun`: intermediate values from the
    forward pass are discarded and recomputed when the backward pass needs
    them, trading time for memory. See e.g. arxiv.org/abs/1604.06174.
    """

    def recompute_vjp(argnum, ans, args, kwargs):
        # Re-run `fun` under make_vjp; keep only the VJP function.
        vjp_and_value = make_vjp(fun, argnum)(*args, **kwargs)
        return vjp_and_value[0]

    checkpointed = primitive(fun)
    defvjp_argnum(checkpointed, recompute_vjp)
    return checkpointed


================================================
FILE: autograd/extend.py
================================================
# Exposes API for extending autograd
from .core import (
    JVPNode,
    SparseObject,
    VJPNode,
    VSpace,
    def_linear,
    defjvp,
    defjvp_argnum,
    defjvp_argnums,
    defvjp,
    defvjp_argnum,
    defvjp_argnums,
    vspace,
)
from .tracer import Box, notrace_primitive, primitive, register_notrace


================================================
FILE: autograd/misc/__init__.py
================================================
from .flatten import flatten
from .tracers import const_graph


================================================
FILE: autograd/misc/fixed_points.py
================================================
from autograd import make_vjp
from autograd.builtins import tuple
from autograd.extend import defvjp, primitive, vspace


@primitive
def fixed_point(f, a, x0, distance, tol):
    """Iterate ``f(a)`` starting from ``x0`` until two successive iterates are
    within ``tol`` of each other under ``distance``; return the last iterate.

    At least one application of ``f(a)`` is always performed.
    """
    step = f(a)
    previous = x0
    current = step(previous)
    while distance(current, previous) > tol:
        previous = current
        current = step(previous)
    return current


def fixed_point_vjp(ans, f, a, x0, distance, tol):
    """VJP of ``fixed_point`` with respect to ``a``.

    The incoming gradient is itself turned into a fixed-point problem
    (``rev_iter``), whose solution is then pulled back through ``f`` to ``a``.
    """

    def rev_iter(params):
        a, x_star, x_star_bar = params
        vjp_x, _ = make_vjp(f(a))(x_star)
        vs = vspace(x_star)

        def iterate(g):
            return vs.add(vjp_x(g), x_star_bar)

        return iterate

    def apply_f(x, y):
        return f(x)(y)

    vjp_a, _ = make_vjp(apply_f)(a, ans)

    def vjp(g):
        packed = tuple((a, ans, g))
        start = vspace(x0).zeros()
        return vjp_a(fixed_point(rev_iter, packed, start, distance, tol))

    return vjp


defvjp(fixed_point, None, fixed_point_vjp, None)


================================================
FILE: autograd/misc/flatten.py
================================================
"""
Handy functions for flattening nested containers containing numpy
arrays. The main purpose is to make examples and optimizers simpler.
"""

import autograd.numpy as np
from autograd import make_vjp
from autograd.builtins import type


def flatten(value):
    """Flatten a nesting of tuples, lists or dicts holding numpy arrays or
    scalars. Returns the 1D numpy array and a function that undoes the
    flattening. Mixed numeric types (e.g. floats and ints) are not preserved,
    and dict keys are assumed to be sortable."""
    # The VJP of _flatten maps a flat vector back onto the original structure.
    vjp_fun, flat = make_vjp(_flatten)(value)
    return flat, vjp_fun


def _flatten(value):
    """Recursively flatten ``value`` into a 1D array.

    Dicts are visited in sorted-key order, lists and tuples in sequence
    order; anything else is passed to ``np.ravel``.
    """
    kind = type(value)
    if kind is dict:
        return _concatenate(_flatten(value[key]) for key in sorted(value))
    if kind in (list, tuple):
        return _concatenate(map(_flatten, value))
    return np.ravel(value)


def _concatenate(lst):
    lst = list(lst)
    return np.concatenate(lst) if lst else np.array([])


def flatten_func(func, example):
    """Return a version of ``func`` that takes and returns flat arrays.

    ``example`` fixes the structure of the first argument. Returns the flat
    function, the unflatten function, and the flattened ``example``.
    """
    flat_example, unflatten = flatten(example)

    def flat_func(_x, *args):
        result = func(unflatten(_x), *args)
        return flatten(result)[0]

    return flat_func, unflatten, flat_example


================================================
FILE: autograd/misc/optimizers.py
================================================
"""Some standard gradient-based stochastic optimizers.

These are just standard routines that don't make any use of autograd,
though you could take gradients of these functions too if you want
to do meta-optimization.

These routines can optimize functions whose inputs are structured
objects, such as dicts of numpy arrays."""

import autograd.numpy as np
from autograd.misc import flatten
from autograd.wrap_util import wraps


def unflatten_optimizer(optimize):
    """Lift an optimizer working on flat 1D numpy arrays to one that handles
    trees of nested containers (lists/tuples/dicts) with arrays/scalars at
    the leaves."""

    @wraps(optimize)
    def _optimize(grad, x0, callback=None, *args, **kwargs):
        flat_x0, unflatten = flatten(x0)

        def flat_grad(x, i):
            return flatten(grad(unflatten(x), i))[0]

        flat_callback = None
        if callback:
            # The callback sees both the parameters and the gradient in
            # their original structure.
            def flat_callback(x, i, g):
                return callback(unflatten(x), i, unflatten(g))

        flat_result = optimize(flat_grad, flat_x0, flat_callback, *args, **kwargs)
        return unflatten(flat_result)

    return _optimize


@unflatten_optimizer
def sgd(grad, x, callback=None, num_iters=200, step_size=0.1, mass=0.9):
    """Stochastic gradient descent with momentum.
    grad() is called as grad(x, i), with i the iteration number; callback,
    if given, is called as callback(x, i, g) before each update."""
    momentum = np.zeros(len(x))
    for step in range(num_iters):
        gradient = grad(x, step)
        if callback:
            callback(x, step, gradient)
        momentum = mass * momentum - (1.0 - mass) * gradient
        x = x + step_size * momentum
    return x


@unflatten_optimizer
def rmsprop(grad, x, callback=None, num_iters=100, step_size=0.1, gamma=0.9, eps=10**-8):
    """Root mean squared prop: See Adagrad paper for details.
    The running average of squared gradients starts at ones."""
    running_sq = np.ones(len(x))
    for step in range(num_iters):
        gradient = grad(x, step)
        if callback:
            callback(x, step, gradient)
        running_sq = running_sq * gamma + gradient**2 * (1 - gamma)
        x = x - step_size * gradient / (np.sqrt(running_sq) + eps)
    return x


@unflatten_optimizer
def adam(grad, x, callback=None, num_iters=100, step_size=0.001, b1=0.9, b2=0.999, eps=10**-8):
    """Adam as described in http://arxiv.org/pdf/1412.6980.pdf.
    Essentially RMSprop with momentum plus bias-correction terms."""
    first_moment = np.zeros(len(x))
    second_moment = np.zeros(len(x))
    for step in range(num_iters):
        gradient = grad(x, step)
        if callback:
            callback(x, step, gradient)
        first_moment = (1 - b1) * gradient + b1 * first_moment
        second_moment = (1 - b2) * (gradient**2) + b2 * second_moment
        # Bias-corrected moment estimates.
        first_hat = first_moment / (1 - b1 ** (step + 1))
        second_hat = second_moment / (1 - b2 ** (step + 1))
        x = x - step_size * first_hat / (np.sqrt(second_hat) + eps)
    return x


================================================
FILE: autograd/misc/tracers.py
================================================
from functools import partial
from itertools import repeat

from autograd.tracer import Node, trace
from autograd.util import subvals, toposort
from autograd.wrap_util import wraps


class ConstGraphNode(Node):
    """Trace node that records how to recompute its value from its parents."""

    __slots__ = ["parents", "partial_fun"]

    def __init__(self, value, fun, args, kwargs, parent_argnums, parents):
        # Replace the parent positions with None; they are filled in again
        # each time partial_fun is called.
        placeholders = zip(parent_argnums, repeat(None))
        args = subvals(args, placeholders)

        def partial_fun(partial_args):
            filled_args = subvals(args, zip(parent_argnums, partial_args))
            return fun(*filled_args, **kwargs)

        self.parents = parents
        self.partial_fun = partial_fun

    def initialize_root(self):
        self.parents = []


def const_graph_unary(fun):
    """Wrap the unary ``fun`` so that its computation graph is traced once and
    replayed on every later call.

    The first call traces ``fun`` and stores the topologically ordered nodes;
    later calls re-evaluate each node's ``partial_fun`` on the new input
    without calling ``fun`` again.

    Raises ``Exception`` on the first call if the trace reports no end node,
    i.e. the output does not depend on the input.
    """
    graph = []
    _fun = [fun]  # Allow fun to be freed, since it may have bound args

    def maybe_cached_fun(x):
        if graph:
            _graph = graph[0]
            vals = {_graph[0]: x}
            for node in _graph[1:]:
                vals[node] = node.partial_fun([vals[p] for p in node.parents])
            # Look the output node up explicitly instead of relying on the
            # loop variable: when the graph holds only the root (the output
            # is the input itself) the loop body never runs.
            return vals[_graph[-1]]
        else:
            start_node = ConstGraphNode.new_root()
            end_value, end_node = trace(start_node, _fun.pop(), x)
            if end_node is None:
                raise Exception("Output is independent of input")
            # Reverse the toposort so the root comes first and the output last.
            graph.append(list(toposort(end_node))[::-1])
            return end_value

    return maybe_cached_fun


def const_graph(fun, *args, **kwargs):
    """Return a version of ``fun`` with ``args``/``kwargs`` pre-bound whose
    graph is traced on the first call and replayed afterwards. The remaining
    positional arguments are passed to the cached function as one tuple."""
    bound_fun = partial(fun, *args, **kwargs)

    def unary_fun(packed_args):
        return bound_fun(*packed_args)

    cached_unary_fun = const_graph_unary(unary_fun)

    @wraps(fun)
    def _fun(*args):
        return cached_unary_fun(args)

    return _fun


class FullGraphNode(Node):
    """Trace node that keeps its value and the full recipe that produced it.

    ``recipe`` is ``(fun, args, kwargs, argnum_parent_pairs)`` where the last
    element is a list of ``(argnum, parent_node)`` pairs.
    """

    __slots__ = ["value", "recipe"]

    def __init__(self, value, fun, args, kwargs, parent_argnums, parents):
        self.value = value
        # Materialise the pairs: a bare zip object is exhausted after one
        # pass, so a second walk over the graph would see no parents. A list
        # also matches what initialize_root stores.
        self.recipe = (fun, args, kwargs, list(zip(parent_argnums, parents)))

    def initialize_root(self):
        self.value = None
        self.recipe = (lambda x: x, (), {}, [])


def full_graph(fun, *args, **kwargs):
    """Trace ``fun(*args, **kwargs)`` with respect to the tuple of positional
    arguments and return the end node of the resulting graph."""

    def unary_fun(packed_args):
        return fun(*packed_args, **kwargs)

    root = FullGraphNode.new_root()
    _end_value, end_node = trace(root, unary_fun, args)
    return end_node


================================================
FILE: autograd/numpy/__init__.py
================================================
from . import fft, linalg, numpy_boxes, numpy_jvps, numpy_vjps, numpy_vspaces, random
from .numpy_wrapper import *
from .numpy_wrapper import numpy_version as __version__


================================================
FILE: autograd/numpy/fft.py
================================================
import numpy.fft as ffto

from autograd.extend import defvjp, primitive, vspace

from . import numpy_wrapper as anp
from .numpy_vjps import match_complex
from .numpy_wrapper import wrap_namespace

# NOTE(review): presumably exposes the numpy.fft functions in this module's
# globals as autograd-aware wrappers — confirm in numpy_wrapper.wrap_namespace.
wrap_namespace(ffto.__dict__, globals())


# TODO: make fft gradient work for a repeated axis,
# e.g. by replacing fftn with repeated calls to 1d fft along each axis
def fft_grad(get_args, fft_fun, ans, x, *args, **kwargs):
    """VJP maker shared by the complex FFT variants.

    ``get_args`` extracts the transformed axes from the call arguments;
    repeated axes are rejected. The returned VJP applies ``fft_fun`` to the
    incoming gradient and truncates/pads the result to the shape of ``x``.
    """
    axes, _sizes, _norm = get_args(x, *args, **kwargs)
    check_no_repeated_axes(axes)
    input_vs = vspace(x)

    def vjp(g):
        transformed = fft_fun(g, *args, **kwargs)
        return match_complex(x, truncate_pad(transformed, input_vs.shape))

    return vjp


# Each complex transform is registered with itself as the function applied to
# the incoming gradient. Note that every variant here, including the 2-D and
# N-D ones, extracts its arguments with get_fft_args.
defvjp(fft, lambda *args, **kwargs: fft_grad(get_fft_args, fft, *args, **kwargs))
defvjp(ifft, lambda *args, **kwargs: fft_grad(get_fft_args, ifft, *args, **kwargs))

defvjp(fft2, lambda *args, **kwargs: fft_grad(get_fft_args, fft2, *args, **kwargs))
defvjp(ifft2, lambda *args, **kwargs: fft_grad(get_fft_args, ifft2, *args, **kwargs))

defvjp(fftn, lambda *args, **kwargs: fft_grad(get_fft_args, fftn, *args, **kwargs))
defvjp(ifftn, lambda *args, **kwargs: fft_grad(get_fft_args, ifftn, *args, **kwargs))


def rfft_grad(get_args, irfft_fun, ans, x, *args, **kwargs):
    """VJP maker for the real-input FFT variants.

    Rejects repeated axes and an odd length along the last transformed axis.
    The returned VJP divides the incoming gradient by the factors from
    ``make_rfft_factors``, conjugates it, applies ``irfft_fun`` and
    truncates/pads the result to the shape of ``x``.
    """
    axes, full_shape, norm = get_args(x, *args, **kwargs)
    input_vs = vspace(x)
    output_vs = vspace(ans)
    check_no_repeated_axes(axes)
    if full_shape is None:
        full_shape = [input_vs.shape[axis] for axis in axes]
    check_even_shape(full_shape)

    # full_shape is the full fft shape, compressed_shape the shape after the
    # last axis has been reduced to n // 2 + 1 entries.
    compressed_shape = list(full_shape)
    compressed_shape[-1] = compressed_shape[-1] // 2 + 1
    fac = make_rfft_factors(axes, output_vs.shape, compressed_shape, full_shape, norm)

    def vjp(g):
        scaled = anp.conj(g / fac)
        transformed = irfft_fun(scaled, *args, **kwargs)
        return match_complex(x, truncate_pad(transformed, input_vs.shape))

    return vjp


def irfft_grad(get_args, rfft_fun, ans, x, *args, **kwargs):
    """VJP maker for the inverse real FFT variants.

    Rejects repeated axes and an odd length along the last transformed axis.
    The returned VJP applies ``rfft_fun`` to the incoming gradient,
    truncates/pads to the shape of ``x``, conjugates, and multiplies by the
    factors from ``make_rfft_factors``.
    """
    axes, full_shape, norm = get_args(x, *args, **kwargs)
    input_vs = vspace(x)
    output_vs = vspace(ans)
    check_no_repeated_axes(axes)
    if full_shape is None:
        full_shape = [output_vs.shape[axis] for axis in axes]
    check_even_shape(full_shape)

    # full_shape is the full fft shape, compressed_shape the shape after the
    # last axis has been reduced to n // 2 + 1 entries.
    compressed_shape = list(full_shape)
    compressed_shape[-1] = compressed_shape[-1] // 2 + 1

    def vjp(g):
        transformed = rfft_fun(g, *args, **kwargs)
        fitted = match_complex(x, truncate_pad(transformed, input_vs.shape))
        fac = make_rfft_factors(axes, input_vs.shape, compressed_shape, full_shape, norm)
        return anp.conj(fitted) * fac

    return vjp


# Real transforms: each forward transform is paired with its inverse as the
# function applied to the incoming gradient, and vice versa. The argument
# extractor matches the dimensionality of the transform.
defvjp(rfft, lambda *args, **kwargs: rfft_grad(get_fft_args, irfft, *args, **kwargs))

defvjp(irfft, lambda *args, **kwargs: irfft_grad(get_fft_args, rfft, *args, **kwargs))

defvjp(rfft2, lambda *args, **kwargs: rfft_grad(get_fft2_args, irfft2, *args, **kwargs))

defvjp(irfft2, lambda *args, **kwargs: irfft_grad(get_fft2_args, rfft2, *args, **kwargs))

defvjp(rfftn, lambda *args, **kwargs: rfft_grad(get_fftn_args, irfftn, *args, **kwargs))

defvjp(irfftn, lambda *args, **kwargs: irfft_grad(get_fftn_args, rfftn, *args, **kwargs))

# The shifts are undone by the opposite shift, applied to the conjugated
# gradient and conjugated back.
defvjp(
    fftshift, lambda ans, x, axes=None: lambda g: match_complex(x, anp.conj(ifftshift(anp.conj(g), axes)))
)
defvjp(
    ifftshift, lambda ans, x, axes=None: lambda g: match_complex(x, anp.conj(fftshift(anp.conj(g), axes)))
)


@primitive
def truncate_pad(x, shape):
    """Truncate and/or zero-pad ``x`` at the end of each axis so that the
    result has the given ``shape``."""
    keep = tuple(slice(n) for n in shape)
    pad_before = anp.zeros(len(shape), dtype=int)
    # Pad only where the target is larger than the current extent.
    pad_after = anp.maximum(0, anp.array(shape) - anp.array(x.shape))
    pads = tuple(zip(pad_before, pad_after))
    padded = anp.pad(x, pads, "constant")
    return padded[keep]


defvjp(truncate_pad, lambda ans, x, shape: lambda g: match_complex(x, truncate_pad(g, vspace(x).shape)))


## TODO: could be made less stringent, to fail only when repeated axis has different values of s
def check_no_repeated_axes(axes):
    """Raise NotImplementedError if any axis occurs more than once in ``axes``."""
    distinct_count = len(set(axes))
    if distinct_count != len(axes):
        raise NotImplementedError("FFT gradient for repeated axes not implemented.")


def check_even_shape(shape):
    """Raise NotImplementedError if the last entry of ``shape`` is odd."""
    last_extent = shape[-1]
    if last_extent % 2 != 0:
        raise NotImplementedError("Real FFT gradient for odd lengthed last axes is not implemented.")


def get_fft_args(a, d=None, axis=-1, norm=None, *args, **kwargs):
    """Normalise 1-D FFT call arguments to ``(axes, sizes, norm)``.

    ``axis`` is wrapped in a one-element list; ``d`` is wrapped likewise
    unless it is None.
    """
    sizes = None if d is None else [d]
    return [axis], sizes, norm


def get_fft2_args(a, s=None, axes=(-2, -1), norm=None, *args, **kwargs):
    """Return ``(axes, s, norm)`` for a 2-D FFT call, with the defaults
    ``axes=(-2, -1)``, ``s=None`` and ``norm=None``."""
    return axes, s, norm


def get_fftn_args(a, s=None, axes=None, norm=None, *args, **kwargs):
    """Return ``(axes, s, norm)`` for an N-D FFT call.

    When ``axes`` is None it defaults to every axis of ``a``, i.e.
    ``[0, ..., a.ndim - 1]``.
    """
    resolved_axes = list(range(a.ndim)) if axes is None else axes
    return resolved_axes, s, norm


def make_rfft_factors(axes, resshape, facshape, normshape, norm):
    """Build the array of compression factors, including the normalization,
    used by the rfft and irfft gradients.

    The result has shape ``resshape`` and is 2 everywhere except at selected
    indices along the last transformed axis, where it is 1. When ``norm`` is
    None the whole array is divided by the product of ``normshape``.
    """
    total = 1.0
    for extent in normshape:
        total = total * extent

    # In-place edits are fine here: the array is a constant that never enters
    # autograd (plain numpy would have worked equally well).
    fac = anp.zeros(resshape)
    fac[...] = 2
    last_axis = axes[-1]
    selector = [slice(None)] * len(resshape)
    if facshape[-1] > resshape[last_axis]:
        selector[last_axis] = (0,)
    else:
        # First and last entry of the compressed axis.
        selector[last_axis] = (0, facshape[-1] - 1)
    fac[tuple(selector)] = 1
    if norm is None:
        fac /= total
    return fac


================================================
FILE: autograd/numpy/linalg.py
================================================
from functools import partial

import numpy.linalg as npla

from autograd.extend import defjvp, defvjp

from . import numpy_wrapper as anp
from .numpy_wrapper import wrap_namespace

# NOTE(review): presumably exposes the numpy.linalg functions in this module's
# globals as autograd-aware wrappers — confirm in numpy_wrapper.wrap_namespace.
wrap_namespace(npla.__dict__, globals())

# Some formulas are from
# "An extended collection of matrix derivative results
#  for forward and reverse mode algorithmic differentiation"
# by Mike Giles
# https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf


# transpose by swapping last two dimensions
def T(x):
    """Return ``x`` with its last two axes swapped (a batched matrix transpose)."""
    return anp.swapaxes(x, -1, -2)


# Batched matrix product over the last two axes; leading axes are handled by
# the einsum ellipsis.
_dot = partial(anp.einsum, "...ij,...jk->...ik")

# batched diag
# Multiplies by an identity matrix, which zeroes everything off the diagonal
# of the last two axes.
_diag = lambda a: anp.eye(a.shape[-1]) * a


# batched diagonal, similar to matrix_diag in tensorflow
def _matrix_diag(a):
    """Batched diagonal: append an axis of the same length as the last axis of
    ``a`` and keep only the diagonal of the two trailing axes."""
    last_extent = a.shape[-1]
    # Repeat count per axis: 1 everywhere except the last axis.
    reps = anp.array(a.shape)
    reps[:-1] = 1
    reps[-1] = last_extent
    newshape = list(a.shape) + [last_extent]
    tiled = anp.tile(a, reps).reshape(newshape)
    return _diag(tiled)


# add two dimensions to the end of x
def add2d(x):
    """Reshape ``x`` so that two axes of length 1 are appended to its shape."""
    target_shape = anp.shape(x) + (1, 1)
    return anp.reshape(x, target_shape)


# det: gradient is g * det(x) * inv(x)^T, with g and ans given two trailing
# unit axes so they broadcast over the matrix axes.
defvjp(det, lambda ans, x: lambda g: add2d(g) * add2d(ans) * T(inv(x)))
# slogdet: only the second component of the incoming gradient (g[1]) is used.
defvjp(slogdet, lambda ans, x: lambda g: add2d(g[1]) * T(inv(x)))


def grad_inv(ans, x):
    """VJP maker for ``inv``: g -> -(ans^T g ans^T), where ``ans`` is the
    inverse already computed in the forward pass."""

    def vjp(g):
        left = _dot(T(ans), g)
        return -_dot(left, T(ans))

    return vjp


defvjp(inv, grad_inv)


def grad_pinv(ans, x):
    """VJP maker for ``pinv``; ``ans`` is the pseudo-inverse of ``x``.

    The returned function is the transpose of a sum of three terms built from
    ``ans``, ``x`` and the incoming gradient ``g`` (formula from the link
    below).
    """
    # https://mathoverflow.net/questions/25778/analytical-formula-for-numerical-derivative-of-the-matrix-pseudo-inverse
    return lambda g: T(
        -_dot(_dot(ans, T(g)), ans)
        + _dot(_dot(_dot(ans, T(ans)), g), anp.eye(x.shape[-2]) - _dot(x, ans))
        + _dot(_dot(_dot(anp.eye(ans.shape[-2]) - _dot(ans, x), g), T(ans)), ans)
    )


defvjp(pinv, grad_pinv)


def grad_solve(argnum, ans, a, b):
    """VJP maker for ``solve(a, b)`` with respect to argument ``argnum``
    (0 for ``a``, otherwise ``b``)."""

    def updim(x):
        # Append an axis when x has a different number of dimensions than a.
        return x if x.ndim == a.ndim else x[..., None]

    if argnum == 0:

        def vjp_a(g):
            return -_dot(updim(solve(T(a), g)), T(updim(ans)))

        return vjp_a

    def vjp_b(g):
        return solve(T(a), g)

    return vjp_b


defvjp(solve, partial(grad_solve, 0), partial(grad_solve, 1))


def norm_vjp(ans, x, ord=None, axis=None):
    """VJP maker for ``norm``.

    Supported cases: ``ord`` in (None, 2, "fro"), the nuclear norm ("nuc"),
    and other ``ord`` values accepted by ``check_implemented``. Raises
    NotImplementedError for the rest.
    """

    def check_implemented():
        # A matrix norm is taken when x is 2-D with no axis, or axis is a tuple.
        matrix_norm = (x.ndim == 2 and axis is None) or isinstance(axis, tuple)

        if matrix_norm:
            if not (ord is None or ord == "fro" or ord == "nuc"):
                raise NotImplementedError(f"Gradient of matrix norm not implemented for ord={ord}")
        elif not (ord is None or ord > 1):
            raise NotImplementedError(f"Gradient of norm not implemented for ord={ord}")

    # `expand` re-inserts length-1 axes at the positions given by `axis` so the
    # value broadcasts against x.
    if axis is None:
        expand = lambda a: a
    elif isinstance(axis, tuple):
        row_axis, col_axis = axis
        # NOTE(review): presumably compensates for the axis index shift when the
        # two axes are handled one after the other — confirm.
        if row_axis > col_axis:
            row_axis = row_axis - 1
        expand = lambda a: anp.expand_dims(anp.expand_dims(a, row_axis), col_axis)
    else:
        expand = lambda a: anp.expand_dims(a, axis=axis)

    if ord == "nuc":
        if axis is None:
            roll = lambda a: a
            unroll = lambda a: a
        else:
            row_axis, col_axis = axis
            if row_axis > col_axis:
                row_axis = row_axis - 1
            # Roll matrix axes to the back
            roll = lambda a: anp.rollaxis(anp.rollaxis(a, col_axis, a.ndim), row_axis, a.ndim - 1)
            # Roll matrix axes to their original position
            unroll = lambda a: anp.rollaxis(anp.rollaxis(a, a.ndim - 2, row_axis), a.ndim - 1, col_axis)

    check_implemented()

    def vjp(g):
        if ord in (None, 2, "fro"):
            return expand(g / ans) * anp.conj(x)
        elif ord == "nuc":
            # Uses the product u @ vt of the reduced SVD factors.
            x_rolled = roll(x)
            u, s, vt = svd(x_rolled, full_matrices=False)
            uvt_rolled = _dot(u, vt)
            # Roll the matrix axes back to their correct positions
            uvt = unroll(uvt_rolled)
            g = expand(g)
            return g * anp.conj(uvt)
        else:
            # see https://en.wikipedia.org/wiki/Norm_(mathematics)#p-norm
            return expand(g / ans ** (ord - 1)) * anp.conj(x) * anp.abs(x) ** (ord - 2)

    return vjp


defvjp(norm, norm_vjp)


def norm_jvp(g, ans, x, ord=None, axis=None):
    """Forward-mode (JVP) rule of ``norm`` with respect to ``x``.

    Args:
        g: Tangent of ``x``.
        ans: Value that ``norm`` returned for ``x``.
        x: Array whose norm was taken.
        ord: Norm order. ``None``, ``2`` and ``"fro"`` share one formula,
            ``"nuc"`` goes through an SVD of ``x``, anything else is handled
            with the p-norm formula.
        axis: ``None``, a single axis, or a ``(row_axis, col_axis)`` tuple.

    Returns:
        The tangent of ``ans``.

    Raises:
        NotImplementedError: For a matrix norm whose ``ord`` is not ``None``,
            ``"fro"`` or ``"nuc"``, and for any other norm whose ``ord`` is
            neither ``None`` nor greater than 1.
    """

    def check_implemented():
        is_matrix_norm = (x.ndim == 2 and axis is None) or isinstance(axis, tuple)

        if is_matrix_norm:
            if ord is None or ord == "fro" or ord == "nuc":
                return
            raise NotImplementedError(f"Gradient of matrix norm not implemented for ord={ord}")
        if ord is None or ord > 1:
            return
        raise NotImplementedError(f"Gradient of norm not implemented for ord={ord}")

    # ``contract`` sums over the axes that the norm reduced.
    if axis is None:

        def contract(a):
            return anp.sum(a)

    else:

        def contract(a):
            return anp.sum(a, axis=axis)

    if ord == "nuc":
        if axis is None:

            def roll(a):
                return a

            def unroll(a):
                return a

        else:
            row_axis, col_axis = axis
            if row_axis > col_axis:
                row_axis -= 1

            def roll(a):
                # Roll matrix axes to the back
                return anp.rollaxis(anp.rollaxis(a, col_axis, a.ndim), row_axis, a.ndim - 1)

            def unroll(a):
                # Roll matrix axes to their original position
                return anp.rollaxis(anp.rollaxis(a, a.ndim - 2, row_axis), a.ndim - 1, col_axis)

    check_implemented()

    if ord == "nuc":
        u, s, vt = svd(roll(x), full_matrices=False)
        # Roll the matrix axes back to their correct positions
        uvt = unroll(_dot(u, vt))
        return contract(g * anp.conj(uvt))
    if ord in (None, 2, "fro"):
        return contract(g * anp.conj(x)) / ans
    # see https://en.wikipedia.org/wiki/Norm_(mathematics)#p-norm
    weighted = g * anp.conj(x) * anp.abs(x) ** (ord - 2)
    return contract(weighted) / ans ** (ord - 1)


# Register norm_jvp as the forward-mode (JVP) rule for norm.
defjvp(norm, norm_jvp)


def grad_eigh(ans, x, UPLO="L"):
    """Gradient for eigenvalues and vectors of a symmetric matrix.

    Args:
        ans: Forward result ``(w, v)``: eigenvalues and eigenvectors.
        x: The matrix (or stack of matrices) that was decomposed.
        UPLO: Which triangle of ``x`` the forward pass read, ``"L"`` or
            ``"U"``. Lower-case spellings are accepted as well.

    Returns:
        A function mapping ``g = (wg, vg)``, the gradients w.r.t. eigenvalues
        and eigenvectors, to the gradient w.r.t. ``x``.

    Raises:
        ValueError: If ``UPLO`` is neither ``"L"`` nor ``"U"`` (any case).
    """
    N = x.shape[-1]
    w, v = ans  # Eigenvalues, eigenvectors.
    vc = anp.conj(v)

    # numpy.linalg.eigh upper-cases UPLO before using it, so "l" / "u" are
    # valid in the forward pass and must select a triangle here too. Without
    # this normalisation ``tri`` below would be left unbound for them.
    uplo = UPLO.upper()
    if uplo not in ("L", "U"):
        raise ValueError("UPLO argument must be 'L' or 'U'")

    def vjp(g):
        wg, vg = g  # Gradient w.r.t. eigenvalues, eigenvectors.
        w_repeated = anp.repeat(w[..., anp.newaxis], N, axis=-1)

        # Eigenvalue part
        vjp_temp = _dot(vc * wg[..., anp.newaxis, :], T(v))

        # Add eigenvector part only if non-zero backward signal is present.
        # This can avoid NaN results for degenerate cases if the function depends
        # on the eigenvalues only.
        if anp.any(vg):
            off_diag = anp.ones((N, N)) - anp.eye(N)
            F = off_diag / (T(w_repeated) - w_repeated + anp.eye(N))
            vjp_temp += _dot(_dot(vc, F * _dot(T(v), vg)), T(v))

        # eigh always uses only the lower or the upper part of the matrix
        # we also have to make sure broadcasting works
        reps = anp.array(x.shape)
        reps[-2:] = 1

        if uplo == "L":
            tri = anp.tile(anp.tril(anp.ones(N), -1), reps)
        else:
            tri = anp.tile(anp.triu(anp.ones(N), 1), reps)

        return anp.real(vjp_temp) * anp.eye(vjp_temp.shape[-1]) + (vjp_temp + anp.conj(T(vjp_temp))) * tri

    return vjp


defvjp(eigh, grad_eigh)


# https://arxiv.org/pdf/1701.00392.pdf Eq(4.77)
# Note the formula from Sec3.1 in https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf is incomplete
def grad_eig(ans, x):
    """Build the VJP of ``eig`` for a general square (complex valued) matrix.

    Args:
        ans: Forward result ``(e, u)``: eigenvalues as a 1-D array and
            eigenvectors stored in columns.
        x: The matrix that was decomposed; it is only inspected to decide
            whether the returned gradient must be real.

    Returns:
        A function taking ``g = (ge, gu)``, the gradients w.r.t. eigenvalues
        and eigenvectors, and returning the gradient w.r.t. ``x``.
    """
    e, u = ans  # eigenvalues as 1d array, eigenvectors in columns
    n = e.shape[-1]

    def vjp(g):
        ge, gu = g
        # NOTE(review): _matrix_diag is defined elsewhere in this file;
        # presumably it places ge on the diagonal of a matrix - confirm.
        ge = _matrix_diag(ge)
        # f[..., i, j] = 1 / (e_j - e_i); the tiny offset keeps the diagonal
        # (zero difference) from being an exact division by zero.
        f = 1 / (e[..., anp.newaxis, :] - e[..., :, anp.newaxis] + 1.0e-20)
        # NOTE(review): presumably this removes the diagonal entries of f -
        # confirm against the definition of _diag.
        f -= _diag(f)
        ut = anp.swapaxes(u, -1, -2)
        r1 = f * _dot(ut, gu)
        r2 = -f * (_dot(_dot(ut, anp.conj(u)), anp.real(_dot(ut, gu)) * anp.eye(n)))
        r = _dot(_dot(inv(ut), ge + r1 + r2), ut)
        if not anp.iscomplexobj(x):
            r = anp.real(r)
            # the derivative is still complex for real input (imaginary delta is allowed), real output
            # but the derivative should be real in real input case when imaginary delta is forbidden
        return r

    return vjp


# Register grad_eig as the reverse-mode (VJP) rule for eig.
defvjp(eig, grad_eig)


def grad_cholesky(L, A):
    """Build the VJP of ``cholesky``; ``L`` is the computed factor of ``A``.

    Returns a function mapping the gradient ``g`` w.r.t. ``L`` to a
    symmetrised gradient w.r.t. ``A``.
    """
    # Based on Iain Murray's note http://arxiv.org/abs/1602.07527
    # scipy's dtrtrs wrapper, solve_triangular, doesn't broadcast along leading
    # dimensions, so we just call a generic LU solve instead of directly using
    # backsubstitution (also, we factor twice...)

    def solve_trans(lhs, rhs):
        return solve(T(lhs), rhs)

    def phi(X):
        # Lower triangle of X with its diagonal halved.
        return anp.tril(X) / (1.0 + anp.eye(X.shape[-1]))

    def vjp(g):
        projected = phi(anp.einsum("...ki,...kj->...ij", L, g))
        # projected -> L^{-T} projected L^{-1}, done as two transposed solves
        half_solved = solve_trans(L, T(projected))
        S = solve_trans(L, T(half_solved))
        return (S + T(S)) / 2.0

    return vjp


defvjp(cholesky, grad_cholesky)


# https://j-towns.github.io/papers/svd-derivative.pdf
# https://arxiv.org/abs/1909.02659
def grad_svd(usv_, a, full_matrices=True, compute_uv=True):
    """Build the VJP of ``svd`` with respect to ``a``.

    Args:
        usv_: Forward result. With ``compute_uv=True`` it is indexed as
            ``(u, s, vh)``; with ``compute_uv=False`` it is the singular
            values only.
        a: The matrix (or stack of matrices) that was decomposed.
        full_matrices: Forward-pass flag; only ``False`` is supported when
            ``compute_uv`` is true.
        compute_uv: Forward-pass flag.

    Returns:
        A function mapping the gradient ``g`` w.r.t. the forward result to
        the gradient w.r.t. ``a``.

    Raises:
        NotImplementedError: When the returned function is called for
            ``compute_uv=True`` together with ``full_matrices=True``.
    """

    def vjp(g):
        usv = usv_

        if not compute_uv:
            s = usv

            # Need U and V so do the whole svd anyway...
            usv = svd(a, full_matrices=False)
            u = usv[0]
            v = anp.conj(T(usv[2]))

            return _dot(anp.conj(u) * g[..., anp.newaxis, :], T(v))

        elif full_matrices:
            raise NotImplementedError("Gradient of svd not implemented for full_matrices=True")

        else:
            u = usv[0]
            s = usv[1]
            # NOTE(review): T is defined elsewhere in this file; presumably it
            # transposes the last two axes, making v the conjugate transpose
            # of the third SVD factor - confirm.
            v = anp.conj(T(usv[2]))

            m, n = a.shape[-2:]

            k = anp.min((m, n))
            # broadcastable identity array with shape (1, 1, ..., 1, k, k)
            i = anp.reshape(anp.eye(k), anp.concatenate((anp.ones(a.ndim - 2, dtype=int), (k, k))))

            # f[..., r, c] = 1 / (s_c**2 - s_r**2); adding the identity turns
            # the zero diagonal of the denominator into ones.
            f = 1 / (s[..., anp.newaxis, :] ** 2 - s[..., :, anp.newaxis] ** 2 + i)

            gu = g[0]
            gs = g[1]
            gv = anp.conj(T(g[2]))

            utgu = _dot(T(u), gu)
            vtgv = _dot(T(v), gv)
            # t1 accumulates the U contribution, the singular-value
            # contribution (on the diagonal, via i) and the V contribution.
            t1 = (f * (utgu - anp.conj(T(utgu)))) * s[..., anp.newaxis, :]
            t1 = t1 + i * gs[..., :, anp.newaxis]
            t1 = t1 + s[..., :, anp.newaxis] * (f * (vtgv - anp.conj(T(vtgv))))

            # Extra imaginary diagonal term, only added for complex factors.
            if anp.iscomplexobj(u):
                t1 = t1 + 1j * anp.imag(_diag(utgu)) / s[..., anp.newaxis, :]

            t1 = _dot(_dot(anp.conj(u), t1), T(v))

            # Non-square inputs get one more term built from I - V V^H
            # (wide case) or I - U U^H (tall case).
            if m < n:
                i_minus_vvt = anp.reshape(
                    anp.eye(n), anp.concatenate((anp.ones(a.ndim - 2, dtype=int), (n, n)))
                ) - _dot(v, anp.conj(T(v)))
                t1 = t1 + anp.conj(_dot(_dot(u / s[..., anp.newaxis, :], T(gv)), i_minus_vvt))

                return t1

            elif m == n:
                return t1

            elif m > n:
                i_minus_uut = anp.reshape(
                    anp.eye(m), anp.concatenate((anp.ones(a.ndim - 2, dtype=int), (m, m)))
                ) - _dot(u, anp.conj(T(u)))
                t1 = t1 + T(_dot(_dot(v / s[..., anp.newaxis, :], T(gu)), i_minus_uut))

                return t1

    return vjp


# Register grad_svd as the reverse-mode (VJP) rule for svd.
defvjp(svd, grad_svd)


================================================
FILE: autograd/numpy/numpy_boxes.py
================================================
import numpy as np

from autograd.builtins import SequenceBox
from autograd.extend import Box, primitive

from . import numpy_wrapper as anp

# Priority given to every Box; ArrayBox below overrides it with 100.0.
Box.__array_priority__ = 90.0


class ArrayBox(Box):
    """Box subclass exposing a numpy-array-like surface.

    Array attributes are read from the wrapped ``_value``; indexing,
    arithmetic and comparisons are routed through the ``anp`` wrappers.
    """

    __slots__ = []
    __array_priority__ = 100.0

    @primitive
    def __getitem__(A, idx):
        return A[idx]

    # Constants w.r.t float data just pass though
    @property
    def shape(self):
        return self._value.shape

    @property
    def ndim(self):
        return self._value.ndim

    @property
    def size(self):
        return self._value.size

    @property
    def dtype(self):
        return self._value.dtype

    @property
    def T(self):
        return anp.transpose(self)

    def __array_namespace__(self, *, api_version: str | None = None):
        return anp

    def __len__(self):
        return len(self._value)

    def astype(self, *args, **kwargs):
        return anp._astype(self, *args, **kwargs)

    # ----- Unary operators -----
    def __neg__(self):
        return anp.negative(self)

    def __abs__(self):
        return anp.abs(self)

    # ----- Binary operators, box on the left -----
    def __add__(self, other):
        return anp.add(self, other)

    def __sub__(self, other):
        return anp.subtract(self, other)

    def __mul__(self, other):
        return anp.multiply(self, other)

    def __pow__(self, other):
        return anp.power(self, other)

    def __div__(self, other):
        return anp.divide(self, other)

    def __mod__(self, other):
        return anp.mod(self, other)

    def __truediv__(self, other):
        return anp.true_divide(self, other)

    def __matmul__(self, other):
        return anp.matmul(self, other)

    # ----- Binary operators, box on the right -----
    def __radd__(self, other):
        return anp.add(other, self)

    def __rsub__(self, other):
        return anp.subtract(other, self)

    def __rmul__(self, other):
        return anp.multiply(other, self)

    def __rpow__(self, other):
        return anp.power(other, self)

    def __rdiv__(self, other):
        return anp.divide(other, self)

    def __rmod__(self, other):
        return anp.mod(other, self)

    def __rtruediv__(self, other):
        return anp.true_divide(other, self)

    def __rmatmul__(self, other):
        return anp.matmul(other, self)

    # ----- Comparisons -----
    def __eq__(self, other):
        return anp.equal(self, other)

    def __ne__(self, other):
        return anp.not_equal(self, other)

    def __gt__(self, other):
        return anp.greater(self, other)

    def __ge__(self, other):
        return anp.greater_equal(self, other)

    def __lt__(self, other):
        return anp.less(self, other)

    def __le__(self, other):
        return anp.less_equal(self, other)

    # __eq__ is overridden above, so hashing is defined explicitly by identity.
    def __hash__(self):
        return id(self)


# Register ndarray first, then every real and complex floating scalar type,
# as value types handled by ArrayBox.
for type_ in (
    np.ndarray,
    float,
    np.longdouble,
    np.float64,
    np.float32,
    np.float16,
    complex,
    np.clongdouble,
    np.complex64,
    np.complex128,
):
    ArrayBox.register(type_)

# These numpy.ndarray methods are just refs to an equivalent numpy function
nondiff_methods = [
    "all", "any", "argmax", "argmin", "argpartition", "argsort",
    "nonzero", "searchsorted", "round",
]  # fmt: skip
diff_methods = [
    "clip", "compress", "cumprod", "cumsum", "diagonal", "max", "mean",
    "min", "prod", "ptp", "ravel", "repeat", "reshape", "squeeze", "std",
    "sum", "swapaxes", "take", "trace", "transpose", "var",
]  # fmt: skip

# Attach the wrapped function of the same name as a method on ArrayBox.
for method_name in (*nondiff_methods, *diff_methods):
    setattr(ArrayBox, method_name, vars(anp)[method_name])

# Flatten has no function, only a method.
ArrayBox.flatten = vars(anp)["ravel"]

# Register the numpy.linalg result classes with SequenceBox. The module that
# holds them is ``np.linalg._linalg`` from numpy 2.0 on and
# ``np.linalg.linalg`` for numpy 1.25 up to (not including) 2.0.
#
# Both bounds are compared as parsed versions. A raw string comparison is
# lexicographic, so e.g. "1.9.0" >= "1.25" would be true and the lookup of
# EigResult would then fail on a numpy that does not define it.
_numpy_version = np.lib.NumpyVersion(np.__version__)
if _numpy_version >= "2.0.0":
    SequenceBox.register(np.linalg._linalg.EigResult)
    SequenceBox.register(np.linalg._linalg.EighResult)
    SequenceBox.register(np.linalg._linalg.QRResult)
    SequenceBox.register(np.linalg._linalg.SlogdetResult)
    SequenceBox.register(np.linalg._linalg.SVDResult)
elif _numpy_version >= "1.25.0":
    SequenceBox.register(np.linalg.linalg.EigResult)
    SequenceBox.register(np.linalg.linalg.EighResult)
    SequenceBox.register(np.linalg.linalg.QRResult)
    SequenceBox.register(np.linalg.linalg.SlogdetResult)
    SequenceBox.register(np.linalg.linalg.SVDResult)


================================================
FILE: autograd/numpy/numpy_jvps.py
================================================
import numpy as onp

from autograd.extend import JVPNode, def_linear, defjvp, defjvp_argnum, register_notrace, vspace

from ..util import func
from . import numpy_wrapper as anp
from .numpy_boxes import ArrayBox
from .numpy_vjps import (
    balanced_eq,
    dot_adjoint_0,
    dot_adjoint_1,
    match_complex,
    nograd_functions,
    replace_zero,
    tensordot_adjoint_0,
    tensordot_adjoint_1,
    untake,
)

# Reuse the list of non-differentiable functions from numpy_vjps so that
# forward mode (JVPNode) does not trace them either.
for fun in nograd_functions:
    register_notrace(JVPNode, fun)

# NOTE(review): "same" is a defjvp shorthand defined outside this file;
# presumably it applies the primitive itself to the tangent - confirm.
defjvp(func(ArrayBox.__getitem__), "same")
defjvp(untake, "same")

# The tangent of one element of array_from_args is scattered back with untake;
# argnum is shifted by 2 before being used as the index.
defjvp_argnum(anp.array_from_args, lambda argnum, g, ans, args, kwargs: untake(g, argnum - 2, vspace(ans)))
# Only the third argument has a JVP rule; the first two are marked None.
defjvp(
    anp._array_from_scalar_or_array,
    None,
    None,
    lambda g, ans, args, kwargs, _: anp._array_from_scalar_or_array(args, kwargs, g),
)

# ----- Functions that are constant w.r.t. continuous inputs -----
defjvp(anp.nan_to_num, lambda g, ans, x: anp.where(anp.isfinite(x), g, 0.0))

# ----- Binary ufuncs (linear) -----
def_linear(anp.multiply)

# ----- Binary ufuncs -----
# add / subtract: the tangent only has to be broadcast to the output.
defjvp(anp.add, lambda g, ans, x, y: broadcast(g, ans), lambda g, ans, x, y: broadcast(g, ans))
defjvp(anp.subtract, lambda g, ans, x, y: broadcast(g, ans), lambda g, ans, x, y: broadcast(-g, ans))

# divide and true_divide use one and the same pair of rules.
for _quotient in (anp.divide, anp.true_divide):
    defjvp(_quotient, "same", lambda g, ans, x, y: -g * x / y**2)

# The element-wise choosers pass the tangent through wherever their argument
# equals the answer; balanced_eq decides how ties are weighted.
for _chooser in (anp.maximum, anp.minimum, anp.fmax, anp.fmin):
    defjvp(
        _chooser,
        lambda g, ans, x, y: g * balanced_eq(x, ans, y),
        lambda g, ans, x, y: g * balanced_eq(y, ans, x),
    )

defjvp(anp.logaddexp, lambda g, ans, x, y: g * anp.exp(x - ans), lambda g, ans, x, y: g * anp.exp(y - ans))
defjvp(anp.logaddexp2, lambda g, ans, x, y: g * 2 ** (x - ans), lambda g, ans, x, y: g * 2 ** (y - ans))

# mod and remainder use one and the same pair of rules.
for _modulo in (anp.mod, anp.remainder):
    defjvp(
        _modulo,
        lambda g, ans, x, y: broadcast(g, ans),
        lambda g, ans, x, y: -g * anp.floor(x / y),
    )

defjvp(
    anp.power,
    lambda g, ans, x, y: g * y * x ** anp.where(y, y - 1, 1.0),
    lambda g, ans, x, y: g * anp.log(replace_zero(x, 1.0)) * ans,
)
defjvp(anp.arctan2, lambda g, ans, x, y: g * y / (x**2 + y**2), lambda g, ans, x, y: g * -x / (x**2 + y**2))

# ----- Simple grads (linear) -----
# All of these take the "same" shorthand as the rule for their first argument.
for _linear_fun in (
    anp.negative,
    anp.rad2deg,
    anp.degrees,
    anp.deg2rad,
    anp.radians,
    anp.reshape,
    anp.roll,
    anp.array_split,
    anp.split,
    anp.vsplit,
    anp.hsplit,
    anp.dsplit,
    anp.ravel,
    anp.expand_dims,
    anp.squeeze,
    anp.diag,
    anp.diagonal,
    anp.make_diagonal,
    anp.flipud,
    anp.fliplr,
    anp.rot90,
    anp.trace,
    anp.triu,
    anp.tril,
    anp.swapaxes,
    anp.rollaxis,
    anp.moveaxis,
    anp.broadcast_to,
):
    defjvp(_linear_fun, "same")

# full: the rule is attached to argument 1 (the fill value), not the shape.
defjvp(anp.full, "same", argnums=(1,))
def_linear(anp.cross)

# ----- Simple grads -----
# Each rule below receives the tangent g, the primitive's output ans and its
# input x, and returns the tangent of ans.

# NOTE(review): replace_zero comes from numpy_vjps; presumably
# replace_zero(a, v) substitutes v where a is zero, which would keep the
# quotient finite at x == 0 - confirm.
np_abs_jvp = lambda g, ans, x: anp.real(g * replace_zero(anp.conj(x), 0.0)) / replace_zero(ans, 1.0)
defjvp(anp.abs, np_abs_jvp)
defjvp(anp.absolute, np_abs_jvp)
defjvp(anp.fabs, lambda g, ans, x: anp.sign(x) * g)  # fabs doesn't take complex numbers.
defjvp(anp.reciprocal, lambda g, ans, x: -g / x**2)
# exp-family rules reuse ans instead of recomputing the exponential.
defjvp(anp.exp, lambda g, ans, x: ans * g)
defjvp(anp.exp2, lambda g, ans, x: ans * anp.log(2) * g)
defjvp(anp.expm1, lambda g, ans, x: (ans + 1) * g)
defjvp(anp.log, lambda g, ans, x: g / x)
defjvp(anp.log2, lambda g, ans, x: g / x / anp.log(2))
defjvp(anp.log10, lambda g, ans, x: g / x / anp.log(10))
defjvp(anp.log1p, lambda g, ans, x: g / (x + 1))
defjvp(anp.sin, lambda g, ans, x: g * anp.cos(x))
defjvp(anp.cos, lambda g, ans, x: -g * anp.sin(x))
defjvp(anp.tan, lambda g, ans, x: g / anp.cos(x) ** 2)
defjvp(anp.arcsin, lambda g, ans, x: g / anp.sqrt(1 - x**2))
defjvp(anp.arccos, lambda g, ans, x: -g / anp.sqrt(1 - x**2))
defjvp(anp.arctan, lambda g, ans, x: g / (1 + x**2))
defjvp(anp.sinh, lambda g, ans, x: g * anp.cosh(x))
defjvp(anp.cosh, lambda g, ans, x: g * anp.sinh(x))
defjvp(anp.tanh, lambda g, ans, x: g / anp.cosh(x) ** 2)
defjvp(anp.arcsinh, lambda g, ans, x: g / anp.sqrt(x**2 + 1))
defjvp(anp.arccosh, lambda g, ans, x: g / anp.sqrt(x**2 - 1))
defjvp(anp.arctanh, lambda g, ans, x: g / (1 - x**2))
defjvp(anp.square, lambda g, ans, x: g * 2 * x)
defjvp(anp.sqrt, lambda g, ans, x: g * 0.5 * x**-0.5)
defjvp(
    anp.sinc,
    lambda g, ans, x: g * (anp.cos(anp.pi * x) * anp.pi * x - anp.sin(anp.pi * x)) / (anp.pi * x**2),
)
# clip: the tangent is zeroed wherever the output equals one of the bounds.
defjvp(anp.clip, lambda g, ans, x, a_min, a_max: g * anp.logical_and(ans != a_min, ans != a_max))
defjvp(anp.real_if_close, lambda g, ans, x: match_complex(ans, g))
defjvp(anp.real, lambda g, ans, x: anp.real(g))
defjvp(anp.imag, lambda g, ans, x: match_complex(ans, -1j * g))
np_conj_jvp = lambda g, ans, x: anp.conj(g)
defjvp(anp.conj, np_conj_jvp)
defjvp(anp.conjugate, np_conj_jvp)
defjvp(anp.angle, lambda g, ans, x: match_complex(ans, g * anp.conj(x * 1j) / anp.abs(x) ** 2))
# where: the condition has no JVP (None). The tangent of each branch is kept
# where that branch is selected and replaced by zeros elsewhere. Both rules
# use anp.shape(g) rather than g.shape so that a plain Python scalar tangent,
# which has no .shape attribute, is accepted for either branch.
defjvp(
    anp.where,
    None,
    lambda g, ans, c, x=None, y=None: anp.where(c, g, anp.zeros(anp.shape(g))),
    lambda g, ans, c, x=None, y=None: anp.where(c, anp.zeros(anp.shape(g)), g),
)

# ----- Trickier grads -----
defjvp(anp.kron, "same", "same")
for _same_rule_fun in (
    anp.diff,
    anp.gradient,
    anp.repeat,
    anp.tile,
    anp.transpose,
    anp.sum,
    anp.mean,
):
    defjvp(_same_rule_fun, "same")

# prod: ans times the reduced sum of g / x over the same axes.
defjvp(
    anp.prod, lambda g, ans, x, axis=None, keepdims=False: ans * anp.sum(g / x, axis=axis, keepdims=keepdims)
)
# linspace: the tangent of start (stop) is interpolated towards 0 at the
# other end point; the remaining arguments are forwarded unchanged.
defjvp(
    anp.linspace,
    lambda g, ans, start, stop, *args, **kwargs: anp.linspace(g, 0, *args, **kwargs),
    lambda g, ans, start, stop, *args, **kwargs: anp.linspace(0, g, *args, **kwargs),
)


def forward_grad_np_var(g, ans, x, axis=None, ddof=0, keepdims=False):
    """Forward-mode (JVP) rule for ``var``.

    Args:
        g: Tangent of ``x``.
        ans: Output of ``var`` (unused here).
        x: Input array.
        axis: ``None``, an int, or a tuple of ints, as passed to ``var``.
        ddof: Delta degrees of freedom, as passed to ``var``.
        keepdims: As passed to ``var``.

    Returns:
        The tangent of the variance.
    """
    # Number of elements reduced into each output entry.
    if axis is None:
        num_reps = anp.size(g)
    elif isinstance(axis, int):
        num_reps = anp.shape(g)[axis]
    elif isinstance(axis, tuple):
        # Must go through ``anp``: this module imports plain numpy only under
        # the name ``onp``, so a bare ``np`` is undefined here.
        num_reps = anp.prod(anp.array(anp.shape(g))[list(axis)])

    x_minus_mean = anp.conj(x - anp.mean(x, axis=axis, keepdims=True))
    return 2.0 * anp.sum(anp.real(g * x_minus_mean), axis=axis, keepdims=keepdims) / (num_reps - ddof)


# Register forward_grad_np_var as the forward-mode (JVP) rule for var.
defjvp(anp.var, forward_grad_np_var)


def forward_grad_np_std(g, ans, x, axis=None, ddof=0, keepdims=False):
    """Forward-mode (JVP) rule for ``std``.

    ``g`` is the tangent of ``x`` and ``ans`` the computed standard
    deviation; ``axis``, ``ddof`` and ``keepdims`` are the arguments that
    were given to ``std``. When at most one element is reduced per output
    entry the tangent is zero.
    """
    # How many elements are reduced into each output entry.
    if isinstance(axis, tuple):
        count = anp.prod(anp.array(anp.shape(g))[list(axis)])
    elif isinstance(axis, int):
        count = anp.shape(g)[axis]
    elif axis is None:
        count = anp.size(g)

    if count <= 1:
        return anp.zeros_like(ans)

    centered_conj = anp.conj(x - anp.mean(x, axis=axis, keepdims=True))
    weighted_sum = anp.sum(anp.real(g * centered_conj), axis=axis, keepdims=keepdims)
    return weighted_sum / ((count - ddof) * ans)


# Register forward_grad_np_std as the forward-mode (JVP) rule for std.
defjvp(anp.std, forward_grad_np_std)


def fwd_grad_chooser(g, ans, x, axis=None, keepdims=False):
    """Forward-mode (JVP) rule shared by the max/min style reductions.

    The tangent ``g`` is averaged over the positions of ``x`` that equal the
    chosen value ``ans``. A scalar ``x`` passes ``g`` through unchanged.
    """
    if anp.isscalar(x):
        return g

    # Axes that were reduced away and have to be put back as length-1 axes
    # so that ``ans`` lines up with ``x`` again.
    if keepdims:
        restored_axes = ()
    elif isinstance(axis, int):
        restored_axes = (axis,)
    elif isinstance(axis, tuple):
        restored_axes = sorted(axis)
    else:
        restored_axes = ()

    chosen_value = ans
    for ax in restored_axes:
        chosen_value = anp.expand_dims(chosen_value, ax)

    is_chosen = x == chosen_value
    tangent_total = anp.sum((g * is_chosen), axis=axis, keepdims=keepdims)
    n_chosen = anp.sum(is_chosen, axis=axis, keepdims=keepdims)
    return tangent_total / n_chosen


# max/min and their amax/amin spellings all use the chooser rule.
for _extremum in (anp.max, anp.min, anp.amax, anp.amin):
    defjvp(_extremum, fwd_grad_chooser)

defjvp(anp.cumsum, "same")

# Products, and the adjoint helpers imported from numpy_vjps, are declared
# linear.
for _product in (
    anp.inner,
    anp.matmul,
    anp.dot,
    anp.tensordot,
    anp.outer,
    dot_adjoint_0,
    dot_adjoint_1,
    tensordot_adjoint_0,
    tensordot_adjoint_1,
):
    def_linear(_product)


def fwd_grad_concatenate_args(argnum, g, ans, axis_args, kwargs):
    """Forward-mode (JVP) rule for ``concatenate_args``.

    ``axis_args`` is ``(axis, array_1, array_2, ...)``. The tangent ``g``
    takes the place of the array at position ``argnum``; every other array
    is replaced by zeros of the same shape before concatenating again.
    """
    pieces = [
        g if position == argnum else anp.zeros_like(axis_args[position])
        for position in range(1, len(axis_args))
    ]
    return anp.concatenate_args(axis_args[0], *pieces)


defjvp_argnum(anp.concatenate_args, fwd_grad_concatenate_args)


def fwd_grad_sort(g, ans, x, axis=-1, kind="quicksort", order=None):
    """Forward-mode (JVP) rule for ``sort`` on one-dimensional input.

    Args:
        g: Tangent of ``x``.
        ans: Sorted output (unused here).
        x: Array that was sorted.
        axis, kind, order: Forwarded to ``argsort``.

    Returns:
        ``g`` reordered by the permutation that sorts ``x``.

    Raises:
        NotImplementedError: If ``x`` has more than one dimension. Indexing
            ``g`` with the ``argsort`` result only permutes the leading axis,
            so it would not reorder a multi-dimensional tangent correctly.
    """
    if anp.ndim(x) > 1:
        raise NotImplementedError("JVP of sort not implemented for multi-dimensional arrays.")
    sort_perm = anp.argsort(x, axis, kind, order)
    return g[sort_perm]


# Register the JVP rule for sort. msort is only registered for numpy versions
# below 2.0.0, where it reuses the sort rule with axis=0.
defjvp(anp.sort, fwd_grad_sort)
if onp.lib.NumpyVersion(onp.__version__) < "2.0.0":
    defjvp(anp.msort, lambda g, ans, x: fwd_grad_sort(g, ans, x, axis=0))


def fwd_grad_partition(g, ans, x, kth, axis=-1, kind="introselect", order=None):
    """Forward-mode (JVP) rule for ``partition`` on one-dimensional input.

    Args:
        g: Tangent of ``x``.
        ans: Partitioned output (unused here).
        x: Array that was partitioned.
        kth, axis, kind, order: Forwarded to ``argpartition``.

    Returns:
        ``g`` reordered by the permutation that partitions ``x``.

    Raises:
        NotImplementedError: If ``x`` has more than one dimension. Indexing
            ``g`` with the ``argpartition`` result only permutes the leading
            axis, so it would not reorder a multi-dimensional tangent
            correctly.
    """
    if anp.ndim(x) > 1:
        raise NotImplementedError("JVP of partition not implemented for multi-dimensional arrays.")
    partition_perm = anp.argpartition(x, kth, axis, kind, order)
    return g[partition_perm]


# Register fwd_grad_partition as the forward-mode (JVP) rule for partition.
defjvp(anp.partition, fwd_grad_partition)


def atleast_jvpmaker(fun):
    """Return a JVP rule that applies ``fun`` to the tangent.

    The returned rule supports at most one array argument and raises
    ``NotImplementedError`` when it is given more.
    """

    def jvp(g, ans, *arys):
        if len(arys) <= 1:
            return fun(g)
        raise NotImplementedError("Can't handle multiple arguments yet.")

    return jvp


# Each atleast_Nd function is its own tangent rule, applied to g.
defjvp(anp.atleast_1d, atleast_jvpmaker(anp.atleast_1d))
defjvp(anp.atleast_2d, atleast_jvpmaker(anp.atleast_2d))
defjvp(anp.atleast_3d, atleast_jvpmaker(anp.atleast_3d))

def_linear(anp.einsum)


# TODO(mattjj): can we call np.broadcast_to or a related function instead?
def broadcast(x, target):
    """Expand ``x`` to the shape of ``target``.

    Leading axes are added until the ranks match, then every length-1 axis
    of ``x`` is repeated to the corresponding length of ``target``. If
    ``target`` is complex and ``x`` is not, ``x`` is made complex as well.
    """
    wanted_shape, wanted_ndim, _wanted_dtype, wants_complex = anp.metadata(target)

    # Prepend one axis per missing dimension.
    for _ in range(wanted_ndim - anp.ndim(x)):
        x = anp.expand_dims(x, 0)

    # The axes to stretch are decided from the shape before any repeat.
    padded_shape = anp.shape(x)
    for axis in range(len(padded_shape)):
        if padded_shape[axis] == 1:
            x = anp.repeat(x, wanted_shape[axis], axis=axis)

    if wants_complex and not anp.iscomplexobj(x):
        x = x + 0j  # TODO(mattjj): this might promote the dtype
    return x


# pad: the tangent is padded with the same width and mode as the input; any
# extra keyword arguments are accepted by the rule but not forwarded.
defjvp(anp.pad, lambda g, ans, array, width, mode, **kwargs: anp.pad(g, width, mode))


================================================
FILE: autograd/numpy/numpy_vjps.py
================================================
from functools import partial

import numpy as onp

from autograd.extend import SparseObject, VJPNode, defvjp, defvjp_argnum, primitive, register_notrace, vspace

from ..util import func
from . import numpy_wrapper as anp
from .numpy_boxes import ArrayBox

# ----- Non-differentiable functions -----

# Functions registered below as untraced for reverse mode (VJPNode).
# numpy_jvps imports this list and registers the same functions for forward
# mode.
nograd_functions = [
    anp.floor,
    anp.ceil,
    anp.round,
    anp.rint,
    anp.around,
    anp.fix,
    anp.trunc,
    anp.all,
    anp.any,
    anp.argmax,
    anp.argmin,
    anp.argpartition,
    anp.argsort,
    anp.argwhere,
    anp.nonzero,
    anp.flatnonzero,
    anp.count_nonzero,
    anp.searchsorted,
    anp.sign,
    anp.ndim,
    anp.shape,
    anp.floor_divide,
    anp.logical_and,
    anp.logical_or,
    anp.logical_not,
    anp.logical_xor,
    anp.isfinite,
    anp.isinf,
    anp.isnan,
    anp.isneginf,
    anp.isposinf,
    anp.allclose,
    anp.isclose,
    anp.array_equal,
    anp.array_equiv,
    anp.greater,
    anp.greater_equal,
    anp.less,
    anp.less_equal,
    anp.equal,
    anp.not_equal,
    anp.iscomplexobj,
    anp.iscomplex,
    anp.size,
    anp.isscalar,
    anp.isreal,
    anp.zeros_like,
    anp.ones_like,
    anp.empty_like,
    anp.full_like,
    anp.result_type,
]

# Register every entry as not traced by VJPNode.
for fun in nograd_functions:
    register_notrace(VJPNode, fun)

# ----- Functions that are constant w.r.t. continuous inputs -----

defvjp(anp.nan_to_num, lambda ans, x: lambda g: anp.where(anp.isfinite(x), g, 0.0))

# ----- Binary ufuncs -----
# Every rule is wrapped in unbroadcast_f together with the argument it belongs
# to; the first rule is for x, the second for y.

defvjp(
    anp.add,
    lambda ans, x, y: unbroadcast_f(x, lambda g: g),
    lambda ans, x, y: unbroadcast_f(y, lambda g: g),
)
defvjp(
    anp.multiply,
    lambda ans, x, y: unbroadcast_f(x, lambda g: y * g),
    lambda ans, x, y: unbroadcast_f(y, lambda g: x * g),
)
defvjp(
    anp.subtract,
    lambda ans, x, y: unbroadcast_f(x, lambda g: g),
    lambda ans, x, y: unbroadcast_f(y, lambda g: -g),
)

# divide and true_divide use one and the same pair of rules.
for _quotient in (anp.divide, anp.true_divide):
    defvjp(
        _quotient,
        lambda ans, x, y: unbroadcast_f(x, lambda g: g / y),
        lambda ans, x, y: unbroadcast_f(y, lambda g: -g * x / y**2),
    )

# The element-wise choosers pass the gradient to whichever argument equals
# the answer; balanced_eq decides how ties are weighted.
for _chooser in (anp.maximum, anp.minimum, anp.fmax, anp.fmin):
    defvjp(
        _chooser,
        lambda ans, x, y: unbroadcast_f(x, lambda g: g * balanced_eq(x, ans, y)),
        lambda ans, x, y: unbroadcast_f(y, lambda g: g * balanced_eq(y, ans, x)),
    )

defvjp(
    anp.logaddexp,
    lambda ans, x, y: unbroadcast_f(x, lambda g: g * anp.exp(x - ans)),
    lambda ans, x, y: unbroadcast_f(y, lambda g: g * anp.exp(y - ans)),
)
defvjp(
    anp.logaddexp2,
    lambda ans, x, y: unbroadcast_f(x, lambda g: g * 2 ** (x - ans)),
    lambda ans, x, y: unbroadcast_f(y, lambda g: g * 2 ** (y - ans)),
)

# mod and remainder use one and the same pair of rules.
for _modulo in (anp.mod, anp.remainder):
    defvjp(
        _modulo,
        lambda ans, x, y: unbroadcast_f(x, lambda g: g),
        lambda ans, x, y: unbroadcast_f(y, lambda g: -g * anp.floor(x / y)),
    )

defvjp(
    anp.power,
    lambda ans, x, y: unbroadcast_f(x, lambda g: g * y * x ** anp.where(y, y - 1, 1.0)),
    lambda ans, x, y: unbroadcast_f(y, lambda g: g * anp.log(replace_zero(x, 1.0)) * ans),
)
defvjp(
    anp.arctan2,
    lambda ans, x, y: unbroadcast_f(x, lambda g: g * y / (x**2 + y**2)),
    lambda ans, x, y: unbroadcast_f(y, lambda g: g * -x / (x**2 + y**2)),
)
defvjp(
    anp.hypot,
    lambda ans, x, y: unbroadcast_f(x, lambda g: g * x / ans),
    lambda ans, x, y: unbroadcast_f(y, lambda g: g * y / ans),
)

# ----- Simple grads -----
# Each rule takes the primitive's output ans and its arguments and returns a
# function that maps the incoming gradient g to the gradient for the first
# argument.

defvjp(anp.negative, lambda ans, x: lambda g: -g)
# NOTE(review): replace_zero is defined later in this file; presumably
# replace_zero(a, v) substitutes v where a is zero, which would keep the
# quotient finite at x == 0 - confirm.
np_abs_vjp = lambda ans, x: lambda g: g * replace_zero(anp.conj(x), 0.0) / replace_zero(ans, 1.0)
defvjp(anp.abs, np_abs_vjp)
defvjp(anp.absolute, np_abs_vjp)
defvjp(anp.fabs, lambda ans, x: lambda g: anp.sign(x) * g)  # fabs doesn't take complex numbers.
defvjp(anp.reciprocal, lambda ans, x: lambda g: -g / x**2)
# exp-family rules reuse ans instead of recomputing the exponential.
defvjp(anp.exp, lambda ans, x: lambda g: ans * g)
defvjp(anp.exp2, lambda ans, x: lambda g: ans * anp.log(2) * g)
defvjp(anp.expm1, lambda ans, x: lambda g: (ans + 1) * g)
defvjp(anp.log, lambda ans, x: lambda g: g / x)
defvjp(anp.log2, lambda ans, x: lambda g: g / x / anp.log(2))
defvjp(anp.log10, lambda ans, x: lambda g: g / x / anp.log(10))
defvjp(anp.log1p, lambda ans, x: lambda g: g / (x + 1))
defvjp(anp.sin, lambda ans, x: lambda g: g * anp.cos(x))
defvjp(anp.cos, lambda ans, x: lambda g: -g * anp.sin(x))
defvjp(anp.tan, lambda ans, x: lambda g: g / anp.cos(x) ** 2)
defvjp(anp.arcsin, lambda ans, x: lambda g: g / anp.sqrt(1 - x**2))
defvjp(anp.arccos, lambda ans, x: lambda g: -g / anp.sqrt(1 - x**2))
defvjp(anp.arctan, lambda ans, x: lambda g: g / (1 + x**2))
defvjp(anp.sinh, lambda ans, x: lambda g: g * anp.cosh(x))
defvjp(anp.cosh, lambda ans, x: lambda g: g * anp.sinh(x))
defvjp(anp.tanh, lambda ans, x: lambda g: g / anp.cosh(x) ** 2)
defvjp(anp.arcsinh, lambda ans, x: lambda g: g / anp.sqrt(x**2 + 1))
defvjp(anp.arccosh, lambda ans, x: lambda g: g / anp.sqrt(x**2 - 1))
defvjp(anp.arctanh, lambda ans, x: lambda g: g / (1 - x**2))
defvjp(anp.rad2deg, lambda ans, x: lambda g: g / anp.pi * 180.0)
defvjp(anp.degrees, lambda ans, x: lambda g: g / anp.pi * 180.0)
defvjp(anp.deg2rad, lambda ans, x: lambda g: g * anp.pi / 180.0)
defvjp(anp.radians, lambda ans, x: lambda g: g * anp.pi / 180.0)
defvjp(anp.square, lambda ans, x: lambda g: g * 2 * x)
defvjp(anp.sqrt, lambda ans, x: lambda g: g * 0.5 * x**-0.5)
defvjp(
    anp.sinc,
    lambda ans, x: lambda g: g * (anp.cos(anp.pi * x) * anp.pi * x - anp.sin(anp.pi * x)) / (anp.pi * x**2),
)
# Shape-changing functions: the gradient is mapped back to the shape of x.
defvjp(anp.reshape, lambda ans, x, shape, order=None: lambda g: anp.reshape(g, anp.shape(x), order=order))
defvjp(anp.roll, lambda ans, x, shift, axis=None: lambda g: anp.roll(g, -shift, axis=axis))
# Splits are undone by concatenating the gradient pieces along the split axis.
defvjp(anp.array_split, lambda ans, ary, idxs, axis=0: lambda g: anp.concatenate(g, axis=axis))
defvjp(anp.split, lambda ans, ary, idxs, axis=0: lambda g: anp.concatenate(g, axis=axis))
defvjp(anp.vsplit, lambda ans, ary, idxs: lambda g: anp.concatenate(g, axis=0))
defvjp(anp.hsplit, lambda ans, ary, idxs: lambda g: anp.concatenate(g, axis=1))
defvjp(anp.dsplit, lambda ans, ary, idxs: lambda g: anp.concatenate(g, axis=2))
defvjp(anp.ravel, lambda ans, x, order=None: lambda g: anp.reshape(g, anp.shape(x), order=order))
defvjp(anp.expand_dims, lambda ans, x, axis: lambda g: anp.reshape(g, anp.shape(x)))
defvjp(anp.squeeze, lambda ans, x, axis=None: lambda g: anp.reshape(g, anp.shape(x)))
defvjp(anp.diag, lambda ans, x, k=0: lambda g: anp.diag(g, k))
defvjp(anp.flipud, lambda ans, x,: lambda g: anp.flipud(g))
defvjp(anp.fliplr, lambda ans, x,: lambda g: anp.fliplr(g))
defvjp(anp.rot90, lambda ans, x, k=1: lambda g: anp.rot90(g, -k))
# trace: g is spread over the offset diagonal of the first two axes of x.
defvjp(
    anp.trace,
    lambda ans, x, offset=0: (
        lambda g: anp.einsum("ij,...->ij...", anp.eye(x.shape[0], x.shape[1], k=offset), g)
    ),
)
# full: the rule is attached to argument 1 (the fill value), not the shape.
defvjp(anp.full, lambda ans, shape, fill_value, dtype=None: lambda g: anp.sum(g), argnums=(1,))
defvjp(anp.triu, lambda ans, x, k=0: lambda g: anp.triu(g, k=k))
defvjp(anp.tril, lambda ans, x, k=0: lambda g: anp.tril(g, k=k))
# clip: the gradient is zeroed wherever the output equals one of the bounds.
defvjp(anp.clip, lambda ans, x, a_min, a_max: lambda g: g * anp.logical_and(ans != a_min, ans != a_max))
defvjp(anp.swapaxes, lambda ans, x, axis1, axis2: lambda g: anp.swapaxes(g, axis2, axis1))
defvjp(anp.moveaxis, lambda ans, a, source, destination: lambda g: anp.moveaxis(g, destination, source))
defvjp(anp.real_if_close, lambda ans, x: lambda g: match_complex(x, g))
defvjp(anp.real, lambda ans, x: lambda g: match_complex(x, g))
defvjp(anp.imag, lambda ans, x: lambda g: match_complex(x, -1j * g))
np_conj_vjp = lambda ans, x: lambda g: anp.conj(g)
defvjp(anp.conj, np_conj_vjp)
defvjp(anp.conjugate, np_conj_vjp)
defvjp(anp.angle, lambda ans, x: lambda g: match_complex(x, g * anp.conj(x * 1j) / anp.abs(x) ** 2))
# where: the condition has no gradient (None); each branch receives g where
# it was selected and zeros elsewhere.
defvjp(
    anp.where,
    None,
    lambda ans, c, x=None, y=None: lambda g: anp.where(c, g, anp.zeros(g.shape)),
    lambda ans, c, x=None, y=None: lambda g: anp.where(c, anp.zeros(g.shape), g),
)
defvjp(
    anp.cross,
    lambda ans, a, b, axisa=-1, axisb=-1, axisc=-1, axis=None: (
        lambda g: anp.cross(b, g, axisb, axisc, axisa, axis)
    ),
    lambda ans, a, b, axisa=-1, axisb=-1, axisc=-1, axis=None: (
        lambda g: anp.cross(g, a, axisc, axisa, axisb, axis)
    ),
)
# linspace: the output is a fixed linear combination of start and stop, so
# each gradient is the dot product of g with the matching interpolation
# weights. num defaults to 50 here because numpy.linspace does; without the
# default, building the rule for a call that omits num fails with a TypeError.
defvjp(
    anp.linspace,
    lambda ans, start, stop, num=50: lambda g: anp.dot(anp.linspace(1.0, 0.0, num), g),
    lambda ans, start, stop, num=50: lambda g: anp.dot(anp.linspace(0.0, 1.0, num), g),
)

# _astype: the gradient is cast back to the dtype of the original array A.
defvjp(
    anp._astype,
    lambda ans, A, dtype, order="K", casting="unsafe", subok=True, copy=True: (
        lambda g: anp._astype(g, A.dtype)
    ),
)


# ----- Trickier grads -----
def grad_rollaxis(ans, a, axis, start=0):
    """Build the VJP of ``rollaxis``: roll the moved axis back into place.

    Raises:
        NotImplementedError: If ``axis`` or ``start`` is negative.
    """
    if axis < 0:
        raise NotImplementedError(
            "Gradient of rollaxis not implemented for axis < 0. Please use moveaxis instead."
        )
    if start < 0:
        raise NotImplementedError(
            "Gradient of rollaxis not implemented for start < 0. Please use moveaxis instead."
        )

    # Where the rolled axis sits in the output, and where it has to return to.
    if start > axis:
        moved_to, return_to = start - 1, axis
    else:
        moved_to, return_to = start, axis + 1

    def vjp(g):
        return anp.rollaxis(g, moved_to, return_to)

    return vjp


# Register grad_rollaxis as the reverse-mode (VJP) rule for rollaxis.
defvjp(anp.rollaxis, grad_rollaxis)


def grad_diff(ans, a, n=1, axis=-1):
    """Build the VJP of ``diff`` by undoing one differencing step ``n`` times.

    Args:
        ans: Output of ``diff``.
        a: Input array.
        n: Number of times the difference was taken.
        axis: Axis along which the difference was taken.

    Returns:
        A function mapping the gradient w.r.t. ``ans`` to the gradient
        w.r.t. ``a``.
    """
    rank = anp.ndim(a)
    out_shape = anp.shape(ans)

    def edge_index(piece):
        # Full slices on every axis except ``axis``, which gets ``piece``.
        index = [slice(None)] * rank
        index[axis] = piece
        return tuple(index)

    first = edge_index(slice(None, 1))
    last = edge_index(slice(-1, None))

    def undiff(g):
        if g.shape[axis] > 0:
            return anp.concatenate((-g[first], -anp.diff(g, axis=axis), g[last]), axis=axis)
        # Nothing along ``axis``: the input had a single entry there.
        shape = list(out_shape)
        shape[axis] = 1
        return anp.zeros(shape)

    def vjp(g):
        for _ in range(n):
            g = undiff(g)
        return g

    return vjp


# Register grad_diff as the reverse-mode (VJP) rule for diff.
defvjp(anp.diff, grad_diff)


def grad_gradient(ans, x, *vargs, **kwargs):
    """Build the VJP of ``np.gradient(x, axis=...)``.

    ``axis`` is the only optional argument supported; any positional spacing
    argument or other keyword raises ``NotImplementedError``.
    """
    axis = kwargs.pop("axis", None)
    if vargs or kwargs:
        raise NotImplementedError("The only optional argument currently supported for np.gradient is axis.")

    # Normalise `axis` to an iterable of axes.
    if axis is None:
        axis = range(x.ndim)
    elif type(axis) is int:
        axis = [axis]
    else:
        axis = list(axis)

    x_dtype, x_shape, nd = x.dtype, x.shape, x.ndim

    def vjp(g):
        # A gradient along a single axis has no leading per-axis dimension;
        # add one so that stacked[i] is always the cotangent for axis[i].
        stacked = g[anp.newaxis] if anp.ndim(g) == nd else g

        total = anp.zeros(x_shape, dtype=x_dtype)
        for position, ax in enumerate(axis):
            # Move the differentiated axis to the front.
            lead = anp.swapaxes(stacked[position], 0, ax)[:, anp.newaxis]

            # The first two and last two rows are written out explicitly; the
            # rows in between come from the negated gradient of the cotangent.
            pieces = (
                -lead[0] - 0.5 * lead[1],
                lead[0] - 0.5 * lead[2],
                (-1.0) * anp.gradient(lead, axis=0)[2:-2, 0],
                0.5 * lead[-3] - lead[-1],
                0.5 * lead[-2] + lead[-1],
            )
            contribution = anp.concatenate(pieces, axis=0)
            total = total + anp.swapaxes(contribution, 0, ax)

        return total

    return vjp


defvjp(anp.gradient, grad_gradient)


def grad_repeat(ans, x, repeats, axis=None):
    """Build the VJP of ``np.repeat(x, repeats, axis)`` for an integer ``repeats``.

    Each input element appears ``repeats`` times in the output, so its
    gradient is the sum of the matching ``repeats`` entries of ``g``.

    Negative ``axis`` values are normalised before slicing ``shape``;
    otherwise ``shape[0 : axis + 1]`` would pick the wrong prefix and give a
    silently wrong reshape.
    """
    shape = anp.shape(x)
    if axis is not None and axis < 0:
        norm_axis = axis + len(shape)
    else:
        norm_axis = axis

    def vjp(g):
        if norm_axis is None:  # np.repeat() repeats the flattened array.
            # int(): the product of an empty shape is the float 1.0, which is
            # not a valid reshape dimension.
            size = int(anp.prod(shape))
            expanded = anp.reshape(g, (size, repeats))
            return anp.reshape(anp.sum(expanded, axis=1, keepdims=False), shape)
        if shape[norm_axis] == 1:  # For this common case, the logic is simple.
            return anp.sum(g, axis=norm_axis, keepdims=True)
        # Split the repeated axis into (original length, repeats) and sum out
        # the repeats.
        expanded = anp.reshape(g, shape[0 : norm_axis + 1] + (repeats,) + shape[norm_axis + 1 :])
        return anp.sum(expanded, axis=norm_axis + 1, keepdims=False)

    return vjp


defvjp(anp.repeat, grad_repeat)


def grad_tile(ans, x, reps):
    """Build the VJP of ``np.tile(x, reps)``.

    Along every tiled axis the output consists of ``rep`` copies of the
    input, so the gradient is the sum of the ``rep`` equal chunks of ``g``
    along that axis.

    ``np.tile`` right-aligns ``reps`` with the array's axes when ``reps`` is
    shorter than ``x.ndim``, so ``reps[i]`` applies to axis
    ``g.ndim - len(reps) + i`` rather than to axis ``i``.
    """
    reps = [reps] if anp.isscalar(reps) else reps
    x_shape = anp.shape(x)

    def vjp(g):
        # Zero when len(reps) >= x.ndim, since g then has len(reps) axes.
        offset = max(anp.ndim(g) - len(reps), 0)
        for i, rep in enumerate(reps):
            g = sum(anp.split(g, rep, offset + i))
        return anp.reshape(g, x_shape)

    return vjp


defvjp(anp.tile, grad_tile)


def grad_kron(argnum, ans, orig_A, orig_B):
    """Build the VJP of ``np.kron(orig_A, orig_B)`` w.r.t. argument ``argnum`` (0 or 1)."""
    # kron has different promotion rules than dot. the reshapes are necessary if
    # and only if (1) orig_B is 1D or (2) orig_A and/or orig_B are 0D
    orig_A_shape = anp.shape(orig_A)
    orig_B_shape = anp.shape(orig_B)

    def vjp(G):
        # Work with operands promoted to at least 2-D.
        A, B = anp.atleast_2d(orig_A), anp.atleast_2d(orig_B)
        # Reshape G to A.shape + B.shape with the two middle entries swapped,
        # then swap those axes so the leading axes index A and the trailing
        # axes index B.
        shape = list(A.shape + B.shape)
        n = anp.ndim(A)
        shape[n - 1], shape[n] = shape[n], shape[n - 1]
        reshaped_G = anp.swapaxes(anp.reshape(G, shape), n - 1, n)
        if argnum == 0:
            # Contract away all of B's axes; the result is reshaped back to
            # the original (unpromoted) shape of A.
            return match_complex(
                orig_A, anp.reshape(anp.tensordot(reshaped_G, B, axes=anp.ndim(B)), orig_A_shape)
            )
        else:
            # Contract away all of A's axes; the result is reshaped back to
            # the original (unpromoted) shape of B.
            return match_complex(
                orig_B, anp.reshape(anp.tensordot(A, reshaped_G, axes=anp.ndim(A)), orig_B_shape)
            )

    return vjp


# One VJP per argument, selected through `argnum`.
defvjp(anp.kron, partial(grad_kron, 0), partial(grad_kron, 1))


def grad_transpose(ans, x, axes=None):
    """Build the VJP of ``np.transpose(x, axes)``.

    The gradient is transposed with the inverse permutation (``argsort`` of
    ``axes``); with ``axes=None`` the axis order is simply reversed again.
    """
    inverse_axes = None if axes is None else anp.argsort(axes)

    def vjp(g):
        return anp.transpose(g, inverse_axes)

    return vjp


defvjp(anp.transpose, grad_transpose)


def repeat_to_match_shape(g, shape, dtype, axis, keepdims):
    """Broadcast ``g`` back to ``shape`` along the reduced ``axis``.

    Returns ``(repeated, num_reps)`` where ``repeated`` has shape ``shape``
    and ``num_reps`` is the number of elements that were reduced into each
    entry of ``g``. ``axis`` may be an int, a tuple of ints, or ``None`` for
    all axes. ``keepdims`` is accepted but not used: the reshape below works
    whether or not the reduced axes were kept.
    """
    if shape == ():
        return g, 1
    reduced = list(axis) if isinstance(axis, tuple) else axis
    full_shape = onp.array(shape)
    # Shape of g with the reduced axes present as length-1 axes.
    collapsed_shape = onp.array(shape)
    collapsed_shape[reduced] = 1
    num_reps = onp.prod(full_shape[reduced])
    # Can't use broadcast_to because of numpy bug: https://github.com/numpy/numpy/issues/9165
    # return anp.broadcast_to(anp.reshape(g, new_shape), shape), num_reps
    repeated = anp.reshape(g, collapsed_shape) + onp.zeros(shape, dtype=dtype)
    return repeated, num_reps


def grad_broadcast_to(ans, x, new_shape):
    """Build the VJP of ``np.broadcast_to(x, new_shape)``.

    The gradient is summed over every axis that broadcasting created: the
    extra leading axes, which are dropped, and the length-1 axes of ``x``
    that were stretched, which are kept as length 1.

    The target shape is read from ``ans`` rather than ``new_shape``, so a
    list or int ``new_shape`` is accepted as well as a tuple.
    """
    old_shape = anp.shape(x)
    ans_shape = anp.shape(ans)
    n_leading = len(ans_shape) - len(old_shape)
    leading_axes = tuple(range(n_leading))
    # Axes of x (after dropping the leading ones) that were stretched from 1.
    stretched_axes = tuple(
        i
        for i, (old, new) in enumerate(zip(old_shape, ans_shape[n_leading:]))
        if old == 1 and new != 1
    )

    def vjp(g):
        if leading_axes:
            g = anp.sum(g, axis=leading_axes)
        return anp.sum(g, axis=stretched_axes, keepdims=True)

    return vjp


defvjp(anp.broadcast_to, grad_broadcast_to)


def grad_np_sum(ans, x, axis=None, keepdims=False, dtype=None):
    """Build the VJP of ``np.sum``: the gradient is repeated over the summed axes.

    The ``dtype`` argument of the forward call is not used; the gradient
    takes the result type of ``x``.
    """
    x_shape = anp.shape(x)
    x_dtype = anp.result_type(x)

    def vjp(g):
        repeated, _ = repeat_to_match_shape(g, x_shape, x_dtype, axis, keepdims)
        return repeated

    return vjp


defvjp(anp.sum, grad_np_sum)


def grad_np_mean(ans, x, axis=None, keepdims=False):
    """Build the VJP of ``np.mean``: repeat ``g`` over the averaged axes and
    divide by the number of averaged elements."""
    x_shape = anp.shape(x)
    x_dtype = anp.result_type(x)

    def vjp(g):
        repeated, count = repeat_to_match_shape(g, x_shape, x_dtype, axis, keepdims)
        return repeated / count

    return vjp


defvjp(anp.mean, grad_np_mean)


def grad_np_prod(ans, x, axis=None, keepdims=False):  # TODO: Support tuples of axes.
    """Build the VJP of ``np.prod``.

    The derivative with respect to each element is the product divided by
    that element, computed here as ``repeat(g * ans) / x``.
    """
    x_shape = anp.shape(x)
    x_dtype = anp.result_type(x)

    def vjp(g):
        scaled = repeat_to_match_shape(g * ans, x_shape, x_dtype, axis, keepdims)[0]
        return scaled / x

    return vjp


defvjp(anp.prod, grad_np_prod)


def grad_np_var(ans, x, axis=None, ddof=0, keepdims=False):
    """Build the VJP of ``np.var``:
    ``2 * g * conj(x - mean(x)) / (N - ddof)`` over the reduced axes."""
    x_shape, _, x_dtype, x_is_complex = anp.metadata(x)

    def vjp(g):
        # Promote the cotangent to complex when the input is complex.
        cotangent = g + 0j if x_is_complex else g
        repeated, count = repeat_to_match_shape(cotangent, x_shape, x_dtype, axis, keepdims)
        centered = anp.conj(x - anp.mean(x, axis=axis, keepdims=True))
        return 2.0 * repeated * centered / (count - ddof)

    return vjp


defvjp(anp.var, grad_np_var)


def grad_np_std(ans, x, axis=None, ddof=0, keepdims=False):
    """Build the VJP of ``np.std``:
    ``(g / ans) * conj(x - mean(x)) / (N - ddof)`` over the reduced axes."""
    x_shape, _, x_dtype, x_is_complex = anp.metadata(x)

    def vjp(g):
        if x_is_complex:
            g = g + 0j
        repeated, count = repeat_to_match_shape(g, x_shape, x_dtype, axis, keepdims)
        # Avoid division by zero: with at most one element reduced, the
        # gradient is returned as zeros instead of dividing by `ans`.
        if count <= 1:
            return repeated * 0.0
        repeated, count = repeat_to_match_shape(g / ans, x_shape, x_dtype, axis, keepdims)
        centered = anp.conj(x - anp.mean(x, axis=axis, keepdims=True))
        return repeated * centered / (count - ddof)

    return vjp


defvjp(anp.std, grad_np_std)


def grad_chooser(ans, x, axis=None, keepdims=None):
    """Builds gradient of functions that choose a single item, such as min or max.

    The gradient goes to the positions of ``x`` equal to the chosen value;
    when several positions tie, it is split evenly among them.
    """
    x_shape = anp.shape(x)
    x_dtype = anp.result_type(x)

    def vjp(g):
        repeated_g = repeat_to_match_shape(g, x_shape, x_dtype, axis, keepdims)[0]
        repeated_ans = repeat_to_match_shape(ans, x_shape, x_dtype, axis, keepdims)[0]
        chosen = x == repeated_ans
        ties = onp.sum(chosen, axis=axis, keepdims=True)
        return repeated_g * chosen / ties

    return vjp


# max/min and their amax/amin aliases share the same gradient rule.
defvjp(anp.max, grad_chooser)
defvjp(anp.min, grad_chooser)
defvjp(anp.amax, grad_chooser)
defvjp(anp.amin, grad_chooser)


def reverse_axis(x, axis):
    """Return ``x`` with the order of its elements reversed along ``axis``."""
    # Bring the target axis to the front, flip it, then put it back.
    flipped = x.swapaxes(axis, 0)[::-1, ...]
    return flipped.swapaxes(0, axis)


def grad_np_cumsum(ans, x, axis=None):
    """Build the VJP of ``np.cumsum``: a cumulative sum of ``g`` taken in
    reverse order along the same axis."""

    def vjp(g):
        if not axis:
            # axis is None (flattened cumsum) or 0: plain [::-1] reverses the
            # axis in question; the reshape restores x's shape.
            return anp.reshape(anp.cumsum(g[::-1], axis)[::-1], x.shape)
        reversed_g = reverse_axis(g, axis)
        return reverse_axis(anp.cumsum(reversed_g, axis), axis)

    return vjp


defvjp(anp.cumsum, grad_np_cumsum)


def grad_inner(argnum, ans, A, B):
    """Build the VJP of ``np.inner(A, B)`` w.r.t. argument ``argnum``.

    ``inner`` is treated as a tensordot over the last axis of each operand
    (or over no axes when either operand is 0-d), and the tensordot adjoints
    are reused.
    """
    A_ndim, B_ndim = anp.ndim(A), anp.ndim(B)
    has_scalar_operand = A_ndim == 0 or B_ndim == 0
    axes = ([], []) if has_scalar_operand else ([A_ndim - 1], [B_ndim - 1])
    if argnum == 0:
        return lambda G: tensordot_adjoint_0(B, G, axes, A_ndim, B_ndim)
    if argnum == 1:
        return lambda G: tensordot_adjoint_1(A, G, axes, A_ndim, B_ndim)
    return None


defvjp(anp.inner, partial(grad_inner, 0), partial(grad_inner, 1))


def matmul_adjoint_0(B, G, A_meta, B_ndim):
    """Gradient of ``matmul(A, B)`` with respect to ``A``, given cotangent ``G``.

    ``A_meta`` is the ``anp.metadata`` tuple of A; the result is passed
    through ``unbroadcast`` against it.
    """
    if anp.ndim(G) == 0:  # A_ndim == B_ndim == 1
        return unbroadcast(G * B, A_meta)
    _, A_ndim, _, _ = A_meta
    if A_ndim == 1:
        # Insert an axis before the last axis of G.
        G = anp.expand_dims(G, anp.ndim(G) - 1)
    if B_ndim == 1:  # The result we need is an outer product
        B = anp.expand_dims(B, 0)
        G = anp.expand_dims(G, anp.ndim(G))
    else:  # We need to swap the last two axes of B
        B = anp.swapaxes(B, B_ndim - 2, B_ndim - 1)
    result = anp.matmul(G, B)
    return unbroadcast(result, A_meta)


def matmul_adjoint_1(A, G, A_ndim, B_meta):
    """Gradient of ``matmul(A, B)`` with respect to ``B``, given cotangent ``G``.

    ``B_meta`` is the ``anp.metadata`` tuple of B; the result is passed
    through ``unbroadcast`` against it.
    """
    if anp.ndim(G) == 0:  # A_ndim == B_ndim == 1
        return unbroadcast(G * A, B_meta)
    _, B_ndim, _, _ = B_meta
    B_is_vec = B_ndim == 1
    if B_is_vec:
        # Append a trailing axis to G; it is squeezed out again below.
        G = anp.expand_dims(G, anp.ndim(G))
    if A_ndim == 1:  # The result we need is an outer product
        A = anp.expand_dims(A, 1)
        G = anp.expand_dims(G, anp.ndim(G) - 1)
    else:  # We need to swap the last two axes of A
        A = anp.swapaxes(A, A_ndim - 2, A_ndim - 1)
    result = anp.matmul(A, G)
    if B_is_vec:
        # Drop the trailing axis that was appended for the vector case.
        result = anp.squeeze(result, anp.ndim(G) - 1)
    return unbroadcast(result, B_meta)


def matmul_vjp_0(ans, A, B):
    """VJP maker for ``np.matmul`` with respect to its first argument."""
    meta_of_A, ndim_of_B = anp.metadata(A), anp.ndim(B)

    def vjp(g):
        return matmul_adjoint_0(B, g, meta_of_A, ndim_of_B)

    return vjp


def matmul_vjp_1(ans, A, B):
    """VJP maker for ``np.matmul`` with respect to its second argument."""
    ndim_of_A, meta_of_B = anp.ndim(A), anp.metadata(B)

    def vjp(g):
        return matmul_adjoint_1(A, g, ndim_of_A, meta_of_B)

    return vjp


defvjp(anp.matmul, matmul_vjp_0, matmul_vjp_1)


@primitive
def dot_adjoint_0(B, G, A_meta, B_meta):
    # Used by dot_vjp_0 as the gradient of np.dot(A, B) with respect to A.
    # It is a primitive itself, so it can be differentiated again (its own
    # VJPs are registered further down in this file).
    _, A_ndim, A_dtype, _ = A_meta
    _, B_ndim, _, _ = B_meta
    if B_ndim == 0 or B_ndim == 1 or A_ndim == 0:
        # Scalar or vector operand: contract G with B over
        # max(0, B_ndim - (A_ndim != 0)) axes.
        contract_num = max(0, B_ndim - (A_ndim != 0))
        out = onp.tensordot(G, B, contract_num)
    else:
        # General case: swap B's last two axes, then contract B_ndim - 1 axes.
        out = onp.tensordot(G, onp.swapaxes(B, -1, -2), B_ndim - 1)
    # The result is cast to A's dtype.
    return onp.asarray(out, dtype=A_dtype)


@primitive
def dot_adjoint_1(A, G, A_meta, B_meta):
    # Used by dot_vjp_1 as the gradient of np.dot(A, B) with respect to B.
    # It is a primitive itself, so it can be differentiated again (its own
    # VJPs are registered further down in this file).
    _, A_ndim, _, _ = A_meta
    _, B_ndim, B_dtype, _ = B_meta
    # The last two axes of the contraction result are swapped only when B has
    # more than one axis and A is not a scalar.
    needs_transpose = B_ndim > 1 and A_ndim != 0
    swap = (lambda x: onp.swapaxes(x, -1, -2)) if needs_transpose else (lambda x: x)
    if A_ndim == 0 or A_ndim == 1 or B_ndim == 0:
        # Scalar or vector operand: contract G with A over
        # max(0, A_ndim - (B_ndim != 0)) axes.
        contract_num = max(0, A_ndim - (B_ndim != 0))
        out = swap(onp.tensordot(G, A, contract_num))
    else:
        # General case: contract A's first A_ndim - 1 axes against the
        # matching (negatively indexed) axes of G.
        out = swap(onp.tensordot(G, A, [range(-A_ndim - B_ndim + 2, -B_ndim + 1), range(A_ndim - 1)]))
    # The result is cast to B's dtype.
    return onp.asarray(out, dtype=B_dtype)


def dot_vjp_0(ans, A, B):
    """VJP maker for ``np.dot`` with respect to its first argument."""
    meta_of_A = anp.metadata(A)
    meta_of_B = anp.metadata(B)

    def vjp(g):
        return match_complex(A, dot_adjoint_0(B, g, meta_of_A, meta_of_B))

    return vjp


def dot_vjp_1(ans, A, B):
    """VJP maker for ``np.dot`` with respect to its second argument."""
    meta_of_A = anp.metadata(A)
    meta_of_B = anp.metadata(B)

    def vjp(g):
        return match_complex(B, dot_adjoint_1(A, g, meta_of_A, meta_of_B))

    return vjp


defvjp(anp.dot, dot_vjp_0, dot_vjp_1)

# The adjoints are primitives too; the registrations below give their VJPs
# with respect to their first two arguments.
defvjp(
    dot_adjoint_0,
    lambda ans, B, g, An, Bn: lambda A: match_complex(B, dot_adjoint_1(A, g, An, Bn)),
    lambda ans, B, g, An, Bn: lambda A: match_complex(g, anp.dot(A, B)),
)

defvjp(
    dot_adjoint_1,
    lambda ans, A, g, An, Bn: lambda B: match_complex(A, dot_adjoint_0(B, g, An, Bn)),
    lambda ans, A, g, An, Bn: lambda B: match_complex(g, anp.dot(A, B)),
)


@primitive
def tensordot_adjoint_0(B, G, axes, A_ndim, B_ndim):
    # The adjoint of the operator
    # A |--> np.tensordot(A, B, axes)
    if B_ndim == 0:
        # Scalar B: the forward op is a plain scaling.
        return G * B

    G_axes = onp.arange(onp.ndim(G))
    if type(axes) is int:
        # Integer form: contract the trailing axes of G with B's axes from
        # index `axes` onward (the ones not summed in the forward call).
        axes = max(axes, 0)
        B_axes = onp.arange(B_ndim)
        return onp.tensordot(G, B, [G_axes[A_ndim - axes :], B_axes[axes:]])
    else:
        # Pair-of-sequences form; a bare int on either side is wrapped in a list.
        axes0 = [axes[0]] if type(axes[0]) is int else axes[0]
        axes1 = [axes[1]] if type(axes[1]) is int else axes[1]
        axes = [axes0, axes1]
        A_axes = onp.arange(A_ndim)
        B_axes = onp.arange(B_ndim)
        # Normalise negative axis indices.
        summed_axes = [
            onp.asarray(axes[0], dtype="int64") % A_ndim,
            onp.asarray(axes[1], dtype="int64") % B_ndim,
        ]
        # Axes of A and B that were not contracted in the forward call.
        other_axes = [onp.delete(A_axes, summed_axes[0]), onp.delete(B_axes, summed_axes[1])]
        out = onp.tensordot(G, B, [G_axes[len(other_axes[0]) :], other_axes[1]])
        # `out` has A's free axes first, then B's summed axes; permute it back
        # into A's original axis order.
        perm = onp.argsort(onp.concatenate((other_axes[0], summed_axes[0][onp.argsort(summed_axes[1])])))
        return onp.transpose(out, perm)


@primitive
def tensordot_adjoint_1(A, G, axes, A_ndim, B_ndim):
    # The adjoint of the operator
    # B |--> np.tensordot(A, B, axes)
    if A_ndim == 0:
        # Scalar A: the forward op is a plain scaling.
        return G * A

    G_axes = onp.arange(onp.ndim(G))
    if type(axes) is int:
        # Integer form: contract A's leading A_ndim - axes axes (the ones not
        # summed in the forward call) with the leading axes of G.
        axes = max(axes, 0)
        A_axes = onp.arange(A_ndim)
        return onp.tensordot(A, G, [A_axes[: A_ndim - axes], G_axes[: A_ndim - axes]])
    else:
        # Pair-of-sequences form; a bare int on either side is wrapped in a list.
        axes0 = [axes[0]] if type(axes[0]) is int else axes[0]
        axes1 = [axes[1]] if type(axes[1]) is int else axes[1]
        axes = [axes0, axes1]
        A_axes = onp.arange(A_ndim)
        B_axes = onp.arange(B_ndim)
        # Normalise negative axis indices.
        summed_axes = [
            onp.asarray(axes[0], dtype="int64") % A_ndim,
            onp.asarray(axes[1], dtype="int64") % B_ndim,
        ]
        # Axes of A and B that were not contracted in the forward call.
        other_axes = [onp.delete(A_axes, summed_axes[0]), onp.delete(B_axes, summed_axes[1])]
        out = onp.tensordot(A, G, [other_axes[0], G_axes[: len(other_axes[0])]])
        # `out` has A's summed axes first, then B's free axes; permute it back
        # into B's original axis order.
        perm = onp.argsort(onp.concatenate((summed_axes[1][onp.argsort(summed_axes[0])], other_axes[1])))
        return onp.transpose(out, perm)


def tensordot_vjp_0(ans, A, B, axes=2):
    """VJP maker for ``np.tensordot`` with respect to its first argument."""
    ndim_of_A = anp.ndim(A)
    ndim_of_B = anp.ndim(B)

    def vjp(G):
        return match_complex(A, tensordot_adjoint_0(B, G, axes, ndim_of_A, ndim_of_B))

    return vjp


def tensordot_vjp_1(ans, A, B, axes=2):
    """VJP maker for ``np.tensordot`` with respect to its second argument."""
    ndim_of_A = anp.ndim(A)
    ndim_of_B = anp.ndim(B)

    def vjp(G):
        return match_complex(B, tensordot_adjoint_1(A, G, axes, ndim_of_A, ndim_of_B))

    return vjp


defvjp(anp.tensordot, tensordot_vjp_0, tensordot_vjp_1)
# The adjoints are primitives too; the registrations below give their VJPs
# with respect to their first two arguments.
defvjp(
    tensordot_adjoint_0,
    lambda ans, B, G, axes, An, Bn: lambda A: match_complex(B, tensordot_adjoint_1(A, G, axes, An, Bn)),
    lambda ans, B, G, axes, An, Bn: lambda A: match_complex(G, anp.tensordot(A, B, axes)),
)
defvjp(
    tensordot_adjoint_1,
    lambda ans, A, G, axes, An, Bn: lambda B: match_complex(A, tensordot_adjoint_0(B, G, axes, An, Bn)),
    lambda ans, A, G, axes, An, Bn: lambda B: match_complex(G, anp.tensordot(A, B, axes)),
)
# np.outer(a, b): g contracted with b gives the gradient for a, and a
# contracted with g gives the gradient for b.
defvjp(
    anp.outer,
    lambda ans, a, b: lambda g: match_complex(a, anp.dot(g, b.T)),
    lambda ans, a, b: lambda g: match_complex(b, anp.dot(a.T, g)),
)


def grad_concatenate_args(argnum, ans, axis_args, kwargs):
    """Build the VJP of ``concatenate_args(axis, *arrays)`` for argument ``argnum``.

    ``axis_args`` is ``(axis, *arrays)``, so ``argnum`` 1 refers to the first
    array. The gradient is the slice of ``g`` that the selected array
    occupies along ``axis``.
    """
    axis = axis_args[0]
    arrays = axis_args[1:]
    # Extents along `axis` of every array up to and including the selected one.
    extents = [anp.shape(arr)[axis] for arr in arrays[:argnum]]
    offset = sum(extents[:-1])
    selector = [slice(None)] * ans.ndim
    selector[axis] = slice(offset, offset + extents[-1])
    selector = tuple(selector)

    def vjp(g):
        return g[selector]

    return vjp


defvjp_argnum(anp.concatenate_args, grad_concatenate_args)


def wrapped_reshape(x, *args, **kwargs):
    """``reshape`` method for ArrayBox, accepting both calling conventions.

    The reshape method can be called like A.reshape((5,4)) or A.reshape(5,4).
    The reshape function doesn't support both ways, so we have to wrap it.

    Dimensions given as NumPy integers (e.g. ``A.reshape(np.int64(5), 4)``)
    are handled like Python ints, and a call with no positional arguments is
    forwarded as-is rather than failing on ``args[0]``.
    """
    if args and isinstance(args[0], (int, onp.integer)):
        # A.reshape(5, 4): the positional arguments are the dimensions.
        return anp.reshape(x, args, **kwargs)
    # A.reshape((5, 4)) or keyword-only: forward unchanged.
    return anp.reshape(x, *args, **kwargs)


setattr(ArrayBox, "reshape", wrapped_reshape)


def grad_sort(ans, x, axis=-1, kind="quicksort", order=None):
    """Build the VJP of ``np.sort`` for 1-D input.

    Raises ``NotImplementedError`` when ``x`` has more than one dimension.
    """
    # TODO: Cast input with np.asanyarray()
    if len(x.shape) > 1:
        raise NotImplementedError("Gradient of sort not implemented for multi-dimensional arrays.")
    sort_perm = anp.argsort(x, axis, kind, order)

    def vjp(g):
        # Undo the sorting permutation on the incoming gradient.
        return unpermuter(g, sort_perm)

    return vjp


defvjp(anp.sort, grad_sort)
if onp.lib.NumpyVersion(onp.__version__) < "2.0.0":
    defvjp(anp.msort, grad_sort)  # Until multi-D is allowed, these are the same.


def grad_partition(ans, x, kth, axis=-1, kind="introselect", order=None):
    """Build the VJP of ``np.partition`` for 1-D input.

    Raises ``NotImplementedError`` when ``x`` has more than one dimension.
    """
    # TODO: Cast input with np.asanyarray()
    if len(x.shape) > 1:
        raise NotImplementedError("Gradient of partition not implemented for multi-dimensional arrays.")
    partition_perm = anp.argpartition(x, kth, axis, kind, order)

    def vjp(g):
        # Undo the partitioning permutation on the incoming gradient.
        return unpermuter(g, partition_perm)

    return vjp


defvjp(anp.partition, grad_partition)


def unpermuter(g, permutation):
    """Index ``g`` with the inverse of ``permutation``."""
    count = len(permutation)
    inverse = anp.zeros(count, dtype=int)
    # inverse[permutation[i]] = i
    inverse[permutation] = list(range(count))
    return g[inverse]


def grad_reshape_list(ans, *arys):
    """Build the VJP of the ``atleast_*d`` functions for a single argument:
    the gradient is reshaped back to the argument's shape."""
    if len(arys) > 1:
        raise NotImplementedError("Can't handle multiple arguments yet.")

    def vjp(g):
        return anp.reshape(g, anp.shape(arys[0]))

    return vjp


defvjp(anp.atleast_1d, grad_reshape_list)
defvjp(anp.atleast_2d, grad_reshape_list)
defvjp(anp.atleast_3d, grad_reshape_list)


def grad_einsum(argnum, ans, operands_, kwargs):
    """Build the VJP of ``np.einsum`` with respect to positional argument ``argnum``.

    Both calling conventions are handled: the subscripts-string form
    ``einsum("ij,jk->ik", a, b)`` and the interleaved form
    ``einsum(op0, sublist0, op1, sublist1, ..., sublistout)``. In both cases
    the gradient is itself an einsum in which ``g`` takes the place of the
    differentiated operand and that operand's subscripts become the output.
    """
    result_meta = anp.metadata(operands_[argnum])

    def vjp(g):
        operands = operands_
        if isinstance(operands[0], str):  # using "ijk" convention.
            in_subs, out_subs, _ = anp.parse_einsum_input(*operands)
            string, operands = operands[0], operands[1:]

            in_subs_list = in_subs.split(",")
            # argnum counts the subscripts string as argument 0.
            op_num = argnum - 1
            subs_wrt = in_subs_list[op_num]
            rest_of_ops = operands[:op_num] + operands[op_num + 1 :]
            rest_of_subs = in_subs_list[:op_num] + in_subs_list[op_num + 1 :]

            # subscripts that only appear in subs_wrt (and not in other subscript lists
            # or in the output) are implicitly being summed out, as if contracted
            # against a tensor of ones. we make that tensor of ones explicit to handle
            # the necessary vjp broadcasting inside einsum.
            other_named_subs = set("".join([out_subs] + rest_of_subs))
            naked_summed = [(i, sub) for i, sub in enumerate(subs_wrt) if sub not in other_named_subs]
            if naked_summed:
                naked_summed_dims, ones_subs = zip(*naked_summed)
                ones_subs = "".join(ones_subs)
                ones = onp.ones(onp.array(operands[op_num].shape)[list(naked_summed_dims)])
                new_input_subs = ",".join([out_subs, ones_subs] + rest_of_subs)
                new_operands = (g, ones) + rest_of_ops
            else:
                new_input_subs = ",".join([out_subs] + rest_of_subs)
                new_operands = (g,) + rest_of_ops

            # g (indexed by the output subscripts) comes first; the requested
            # output is the subscripts of the differentiated operand.
            new_subscripts = new_input_subs + "->" + subs_wrt
            return unbroadcast(anp.einsum(new_subscripts, *new_operands), result_meta)
        else:  # using (op0, sublist0, op1, sublist1, ..., sublistout) convention
            if len(operands) % 2 == 0:
                raise NotImplementedError("Need sublistout argument")
            operands = list(operands)
            # g is paired with the original output sublist; the differentiated
            # operand's own sublist becomes the new output sublist.
            rest_of_ops = (
                [operands[-1]] + operands[:argnum] + operands[(argnum + 2) : -1] + [operands[argnum + 1]]
            )
            return unbroadcast_einsum(anp.einsum(g, *rest_of_ops), result_meta, operands[argnum + 1])

    return vjp


defvjp_argnum(anp.einsum, grad_einsum)

# diagonal and make_diagonal are each other's VJP.
defvjp(
    anp.diagonal,
    lambda ans, A, offset=0, axis1=0, axis2=1: lambda g: anp.make_diagonal(g, offset, axis1, axis2),
)
defvjp(
    anp.make_diagonal,
    lambda ans, D, offset=0, axis1=0, axis2=1: lambda g: anp.diagonal(g, offset, axis1, axis2),
)


def match_complex(target, x):
    """Make ``x`` real or complex so that it agrees with ``target``.

    A complex ``x`` for a real ``target`` is reduced to its real part; a real
    ``x`` for a complex ``target`` is promoted by adding ``0j``. Otherwise
    ``x`` is returned unchanged.
    """
    wants_complex = anp.iscomplexobj(target)
    is_complex = anp.iscomplexobj(x)
    if is_complex == wants_complex:
        return x
    if is_complex:
        return anp.real(x)
    return x + 0j


def unbroadcast(x, target_meta, broadcast_idx=0):
    """Sum ``x`` down to the shape described by ``target_meta``.

    ``target_meta`` is a ``(shape, ndim, dtype, iscomplex)`` tuple. Surplus
    axes are summed away at position ``broadcast_idx``, every axis whose
    target length is 1 is summed with ``keepdims=True``, and the imaginary
    part is dropped when the target is not complex.
    """
    target_shape, target_ndim, _dtype, target_iscomplex = target_meta

    # Remove the axes that broadcasting added.
    surplus = anp.ndim(x) - target_ndim
    for _ in range(max(surplus, 0)):
        x = anp.sum(x, axis=broadcast_idx)

    # Collapse the axes that were stretched from length 1.
    unit_axes = [position for position, size in enumerate(target_shape) if size == 1]
    for position in unit_axes:
        x = anp.sum(x, axis=position, keepdims=True)

    if not target_iscomplex and anp.iscomplexobj(x):
        x = anp.real(x)
    return x


def unbroadcast_f(target, f):
    """Wrap ``f`` so that its result is unbroadcast to ``target``'s metadata."""
    target_meta = anp.metadata(target)

    def unbroadcast_result(g):
        return unbroadcast(f(g), target_meta)

    return unbroadcast_result


def unbroadcast_einsum(x, target_meta, subscript):
    """Unbroadcast an einsum result whose operand sublist contains ``Ellipsis``.

    Without an ``Ellipsis`` in ``subscript`` the input is returned unchanged.
    Otherwise surplus axes are summed at the ellipsis position: 0 when it
    leads, -1 when it trails, its index in the sublist otherwise.
    """
    if Ellipsis not in subscript:
        return x
    if subscript[0] == Ellipsis:
        broadcast_idx = 0
    elif subscript[-1] == Ellipsis:
        broadcast_idx = -1
    else:
        broadcast_idx = subscript.index(Ellipsis)
    return unbroadcast(x, target_meta, broadcast_idx)


def balanced_eq(x, z, y):
    """Return ``(x == z)`` halved wherever ``x == y`` also holds."""
    matches_z = x == z
    share = 1.0 + (x == y)
    return matches_z / share


def replace_zero(x, val):
    """Return ``x`` with its zero (falsy) entries replaced by ``val``."""
    keep_mask = x
    return anp.where(keep_mask, x, val)


# ----- extra functions used internally  -----


def array_from_args_gradmaker(argnum, ans, args, kwargs):
    """Build the VJP of ``array_from_args`` for positional argument ``argnum``.

    The first two positional arguments are ``array_args`` and
    ``array_kwargs``, so element ``argnum`` lands at index ``argnum - 2`` of
    the resulting array.
    """
    element_index = argnum - 2

    def vjp(g):
        return g[element_index]

    return vjp


defvjp_argnum(anp.array_from_args, array_from_args_gradmaker)


def array_from_scalar_or_array_gradmaker(ans, array_args, array_kwargs, scarray):
    """Build the VJP of ``_array_from_scalar_or_array`` w.r.t. ``scarray``.

    When ``ndmin`` added leading axes to the input, they are squeezed out of
    the gradient; otherwise the gradient passes through unchanged.
    """
    ndmin = array_kwargs.get("ndmin", 0)
    added_axes = ndmin - anp.ndim(scarray)
    if added_axes <= 0:
        return lambda g: g

    leading = tuple(range(added_axes))
    return lambda g: anp.squeeze(g, axis=leading)


defvjp(anp._array_from_scalar_or_array, array_from_scalar_or_array_gradmaker, argnums=(2, 3))


@primitive
def untake(x, idx, vs):
    # Gradient counterpart of indexing: returns a SparseObject for vspace `vs`
    # whose mutator adds `x` into a given array at positions `idx`.
    # A list index that is empty or does not start with a slice is converted
    # to an int64 array first.
    if isinstance(idx, list) and (len(idx) == 0 or not isinstance(idx[0], slice)):
        idx = onp.array(idx, dtype="int64")

    def mut_add(A):
        # Unbuffered in-place add, so repeated indices accumulate.
        onp.add.at(A, idx, x)
        return A

    return SparseObject(vs, mut_add)


# Indexing an ArrayBox and untake are each other's VJP.
defvjp(func(ArrayBox.__getitem__), lambda ans, A, idx: lambda g: untake(g, idx, vspace(A)))
defvjp(untake, lambda ans, x, idx, _: lambda g: g[idx])


def _unpad(array, width):
    """Strip ``width`` padding from ``array``.

    ``width`` may be a scalar, a one-element sequence, a ``(before, after)``
    pair, or one such pair per axis. A single pair applies to every axis.
    """
    if anp.isscalar(width):
        pairs = [[width, width]]
    elif anp.shape(width) == (1,):
        pairs = [anp.concatenate((width, width))]
    elif anp.shape(width) == (2,):
        pairs = [width]
    else:
        pairs = width
    if anp.shape(pairs)[0] == 1:
        # One pair given: use it for every axis.
        pairs = anp.repeat(pairs, anp.ndim(array), 0)
    # `-after or None` keeps the axis open-ended when there is no trailing pad.
    crop = tuple(slice(before, -after or None) for before, after in pairs)
    return array[crop]


def pad_vjp(ans, array, pad_width, mode, **kwargs):
    """Build the VJP of ``np.pad`` in ``"constant"`` mode: crop the padding
    off the incoming gradient."""
    assert mode == "constant", "Only constant mode padding is supported."

    def vjp(g):
        return _unpad(g, pad_width)

    return vjp


defvjp(anp.pad, pad_vjp)


================================================
FILE: autograd/numpy/numpy_vspaces.py
================================================
import numpy as np

from autograd.builtins import NamedTupleVSpace
from autograd.extend import VSpace


class ArrayVSpace(VSpace):
    """Vector space of arrays with a fixed shape and dtype."""

    def __init__(self, value):
        # Only the shape and dtype of the example value are kept.
        value = np.asarray(value)
        self.shape = value.shape
        self.dtype = value.dtype

    @property
    def size(self):
        """Number of entries in an array of this shape."""
        return np.prod(self.shape)

    @property
    def ndim(self):
        """Number of array axes."""
        return len(self.shape)

    def zeros(self):
        """Return a new all-zeros array of this shape and dtype."""
        return np.zeros(self.shape, dtype=self.dtype)

    def ones(self):
        """Return a new all-ones array of this shape and dtype."""
        return np.ones(self.shape, dtype=self.dtype)

    def standard_basis(self):
        """Yield one array per index, holding a 1 there and 0 elsewhere."""
        for idxs in np.ndindex(*self.shape):
            vect = np.zeros(self.shape, dtype=self.dtype)
            vect[idxs] = 1
            yield vect

    def randn(self):
        """Return standard-normal samples of this shape, cast to this dtype."""
        return np.array(np.random.randn(*self.shape)).astype(self.dtype)

    def _inner_prod(self, x, y):
        # Dot product of the flattened arrays.
        return np.dot(np.ravel(x), np.ravel(y))


class ComplexArrayVSpace(ArrayVSpace):
    """Vector space of complex arrays with a fixed shape and dtype."""

    iscomplex = True

    @property
    def size(self):
        """Twice the number of entries, matching the two basis vectors
        (1.0 and 1.0j) that ``standard_basis`` yields per entry."""
        return np.prod(self.shape) * 2

    def ones(self):
        """Return an array filled with ``1 + 1j``."""
        return np.ones(self.shape, dtype=self.dtype) + 1.0j * np.ones(self.shape, dtype=self.dtype)

    def standard_basis(self):
        """Yield, for every index, one array holding 1.0 there and one
        holding 1.0j there, with zeros elsewhere."""
        for idxs in np.ndindex(*self.shape):
            for v in [1.0, 1.0j]:
                vect = np.zeros(self.shape, dtype=self.dtype)
                vect[idxs] = v
                yield vect

    def randn(self):
        """Return samples with independent standard-normal real and imaginary parts."""
        return np.array(np.random.randn(*self.shape)).astype(self.dtype) + 1.0j * np.array(
            np.random.randn(*self.shape)
        ).astype(self.dtype)

    def _inner_prod(self, x, y):
        # Real part of the dot product of conj(x) with y, both flattened.
        return np.real(np.dot(np.conj(np.ravel(x)), np.ravel(y)))

    def _covector(self, x):
        # The covector of x is its complex conjugate.
        return np.conj(x)


# ndarrays get a complex or real vspace depending on their contents.
VSpace.register(np.ndarray, lambda x: ComplexArrayVSpace(x) if np.iscomplexobj(x) else ArrayVSpace(x))

# Python and NumPy real floating-point scalars use ArrayVSpace.
for type_ in [float, np.longdouble, np.float64, np.float32, np.float16]:
    ArrayVSpace.register(type_)

# Python and NumPy complex scalars use ComplexArrayVSpace.
for type_ in [complex, np.clongdouble, np.complex64, np.complex128]:
    ComplexArrayVSpace.register(type_)


# Register a NamedTupleVSpace for each named-tuple result type of np.linalg.
# The result types are read from np.linalg._linalg on NumPy >= 2.0 and from
# np.linalg.linalg on NumPy 1.25 up to (but not including) 2.0.
#
# Both version checks go through NumpyVersion: comparing np.__version__ as a
# plain string is lexicographic, so e.g. "1.9.0" >= "1.25" would be true.
if np.lib.NumpyVersion(np.__version__) >= "2.0.0":

    class EigResultVSpace(NamedTupleVSpace):
        seq_type = np.linalg._linalg.EigResult

    class EighResultVSpace(NamedTupleVSpace):
        seq_type = np.linalg._linalg.EighResult

    class QRResultVSpace(NamedTupleVSpace):
        seq_type = np.linalg._linalg.QRResult

    class SlogdetResultVSpace(NamedTupleVSpace):
        seq_type = np.linalg._linalg.SlogdetResult

    class SVDResultVSpace(NamedTupleVSpace):
        seq_type = np.linalg._linalg.SVDResult

    EigResultVSpace.register(np.linalg._linalg.EigResult)
    EighResultVSpace.register(np.linalg._linalg.EighResult)
    QRResultVSpace.register(np.linalg._linalg.QRResult)
    SlogdetResultVSpace.register(np.linalg._linalg.SlogdetResult)
    SVDResultVSpace.register(np.linalg._linalg.SVDResult)
elif np.lib.NumpyVersion(np.__version__) >= "1.25.0":

    class EigResultVSpace(NamedTupleVSpace):
        seq_type = np.linalg.linalg.EigResult

    class EighResultVSpace(NamedTupleVSpace):
        seq_type = np.linalg.linalg.EighResult

    class QRResultVSpace(NamedTupleVSpace):
        seq_type = np.linalg.linalg.QRResult

    class SlogdetResultVSpace(NamedTupleVSpace):
        seq_type = np.linalg.linalg.SlogdetResult

    class SVDResultVSpace(NamedTupleVSpace):
        seq_type = np.linalg.linalg.SVDResult

    EigResultVSpace.register(np.linalg.linalg.EigResult)
    EighResultVSpace.register(np.linalg.linalg.EighResult)
    QRResultVSpace.register(np.linalg.linalg.QRResult)
    SlogdetResultVSpace.register(np.linalg.linalg.SlogdetResult)
    SVDResultVSpace.register(np.linalg.linalg.SVDResult)


================================================
FILE: autograd/numpy/numpy_wrapper.py
================================================
import warnings

import numpy as _np

import autograd.builtins as builtins
from autograd.extend import notrace_primitive, primitive

if _np.lib.NumpyVersion(_np.__version__) >= "2.0.0":
    from numpy._core.einsumfunc import _parse_einsum_input
else:
    from numpy.core.einsumfunc import _parse_einsum_input

numpy_version = _np.__version__

notrace_functions = [_np.ndim, _np.shape, _np.iscomplexobj, _np.result_type]


def wrap_intdtype(cls):
    """Return a subclass of ``cls`` whose ``__new__`` is wrapped with
    ``notrace_primitive``."""

    class IntdtypeSubclass(cls):
        __new__ = notrace_primitive(cls.__new__)

    return IntdtypeSubclass


def wrap_namespace(old, new):
    """Copy the entries of namespace dict ``old`` into ``new``, wrapping them for autograd.

    - functions listed in ``notrace_functions`` become ``notrace_primitive``s;
    - every other callable that is not a class becomes a ``primitive``;
    - the integer scalar classes in ``int_types`` are replaced by subclasses
      from ``wrap_intdtype``;
    - objects whose type is float, int, NoneType or type are copied unchanged;
    - anything else is not copied.
    """
    unchanged_types = {float, int, type(None), type}
    int_types = {_np.int8, _np.int16, _np.int32, _np.int64, _np.integer}
    for name, obj in old.items():
        if obj in notrace_functions:
            new[name] = notrace_primitive(obj)
        elif callable(obj) and type(obj) is not type:
            new[name] = primitive(obj)
        elif type(obj) is type and obj in int_types:
            # Must come before the unchanged_types branch, which would
            # otherwise copy these classes as-is.
            new[name] = wrap_intdtype(obj)
        elif type(obj) in unchanged_types:
            new[name] = obj


wrap_namespace(_np.__dict__, globals())

# ----- Special treatment of list-input functions -----


@primitive
def concatenate_args(axis, *args):
    # Same as np.concatenate, but takes the arrays as separate positional
    # arguments after `axis`. The result is viewed as a plain ndarray.
    return _np.concatenate(args, axis).view(ndarray)


# List-input functions expressed in terms of concatenate_args.
concatenate = lambda arr_list, axis=0: concatenate_args(axis, *arr_list)
vstack = row_stack = lambda tup: concatenate([atleast_2d(_m) for _m in tup], axis=0)


def hstack(tup):
    """Stack arrays horizontally, built on this module's ``concatenate``.

    Inputs are promoted to at least 1-D. They are joined along axis 0 when
    the first one is 1-D and along axis 1 otherwise.
    """
    arrs = [atleast_1d(_m) for _m in tup]
    join_axis = 0 if arrs[0].ndim == 1 else 1
    return concatenate(arrs, join_axis)


def column_stack(tup):
    """Stack inputs as columns of a 2-D array.

    Inputs with fewer than two dimensions are turned into single columns;
    the rest are used as-is.
    """

    def as_columns(v):
        arr = array(v)
        if arr.ndim < 2:
            arr = array(arr, ndmin=2).T
        return arr

    return concatenate([as_columns(v) for v in tup], 1)


def array(A, *args, **kwargs):
    """Replacement for ``np.array`` built on this module's primitives.

    A list or tuple (exact type only) is converted element by element,
    recursively, and assembled with ``array_from_args``. Any other input goes
    through ``_array_from_scalar_or_array``.
    """
    if builtins.type(A) not in (list, tuple):
        return _array_from_scalar_or_array(args, kwargs, A)
    elements = [array(item) for item in A]
    return array_from_args(args, kwargs, *elements)


def wrap_if_boxes_inside(raw_array, slow_op_name=None):
    """Rebuild an object-dtype array through ``array_from_args``.

    Arrays of any other dtype are returned unchanged. When ``slow_op_name``
    is given, a warning about the slow path is issued before rebuilding.
    """
    if raw_array.dtype is not _np.dtype("O"):
        return raw_array
    if slow_op_name:
        warnings.warn(f"{slow_op_name} is slow for array inputs. np.concatenate() is faster.")
    rebuilt = array_from_args((), {}, *raw_array.ravel())
    return rebuilt.reshape(raw_array.shape)


@primitive
def _array_from_scalar_or_array(array_args, array_kwargs, scalar):
    # np.array applied to a single value; the extra np.array arguments come
    # first so that the value is the last positional argument.
    return _np.array(scalar, *array_args, **array_kwargs)


@primitive
def array_from_args(array_args, array_kwargs, *args):
    # np.array applied to the tuple of trailing positional arguments, which
    # lets each element be a separate argument of the primitive.
    return _np.array(args, *array_args, **array_kwargs)


def select(condlist, choicelist, default=0):
    """``np.select`` whose result is rebuilt with this module's ``array``.

    NumPy performs the selection; the flattened result is then passed through
    ``array`` and reshaped to the original result shape.
    """
    picked = _np.select(list(condlist), list(choicelist), default=default)
    flat_elements = list(picked.ravel())
    return array(flat_elements).reshape(picked.shape)


def stack(arrays, axis=0):
    """Join equally-shaped arrays along a new axis.

    Raises ``ValueError`` for an empty input or mismatched shapes, and
    ``IndexError`` when ``axis`` is out of bounds for the result.
    """
    # this code is basically copied from numpy/core/shape_base.py's stack
    # we need it here because we want to re-implement stack in terms of the
    # primitives defined in this file

    arrays = [array(arr) for arr in arrays]
    if not arrays:
        raise ValueError("need at least one array to stack")

    if len({arr.shape for arr in arrays}) != 1:
        raise ValueError("all input arrays must have the same shape")

    result_ndim = arrays[0].ndim + 1
    if not -result_ndim <= axis < result_ndim:
        raise IndexError("axis {0} out of bounds [-{1}, {1})".format(axis, result_ndim))
    new_axis = axis + result_ndim if axis < 0 else axis

    # Index that inserts a length-1 axis at position `new_axis`.
    expand = (slice(None),) * new_axis + (None,)
    expanded = [arr[expand] for arr in arrays]
    return concatenate(expanded, axis=new_axis)


def append(arr, values, axis=None):
    """Append ``values`` to ``arr``.

    With ``axis=None`` both inputs are flattened and joined end to end;
    otherwise they are concatenated along ``axis``.
    """
    # this code is basically copied from numpy/lib/function_base.py's append
    arr = array(arr)
    if axis is not None:
        return concatenate((arr, values), axis=axis)
    if ndim(arr) != 1:
        arr = ravel(arr)
    values = ravel(array(values))
    return concatenate((arr, values), axis=ndim(arr) - 1)


# ----- Enable functions called using [] ----


class r_class:
    """Stand-in for ``np.r_``: NumPy builds the array, which is then passed
    through ``wrap_if_boxes_inside``."""

    def __getitem__(self, args):
        raw_array = _np.r_[args]
        return wrap_if_boxes_inside(raw_array, slow_op_name="r_")


r_ = r_class()


class c_class:
    """Stand-in for ``np.c_``: NumPy builds the array, which is then passed
    through ``wrap_if_boxes_inside``."""

    def __getitem__(self, args):
        raw_array = _np.c_[args]
        return wrap_if_boxes_inside(raw_array, slow_op_name="c_")


c_ = c_class()


# ----- misc -----
@primitive
def make_diagonal(D, offset=0, axis1=0, axis2=1):
    """Build an array whose last two axes carry ``D`` on their diagonal.

    The result has shape ``D.shape + (D.shape[-1],)`` and is zero off the
    diagonal. Only ``offset=0, axis1=-1, axis2=-2`` is supported; any other
    combination raises ``NotImplementedError``.
    """
    # Numpy doesn't offer a complement to np.diagonal: a function to create new
    # diagonal arrays with extra dimensions. We need such a function for the
    # gradient of np.diagonal and it's also quite handy to have. So here it is.
    if not (offset == 0 and axis1 == -1 and axis2 == -2):
        raise NotImplementedError("Currently make_diagonal only supports offset=0, axis1=-1, axis2=-2")

    # We use a trick: calling np.diagonal returns a view on the original array,
    # so we can modify it in-place. (only valid for numpy version >= 1.10.)
    # The buffer dtype is promoted from D's dtype and float: real input still
    # gives a float64 result, while complex input keeps its imaginary part
    # instead of losing it on assignment into a float buffer.
    new_array = _np.zeros(D.shape + (D.shape[-1],), dtype=_np.result_type(D, float))
    new_array_diag = _np.diagonal(new_array, offset=0, axis1=-1, axis2=-2)
    new_array_diag.flags.writeable = True
    new_array_diag[:] = D
    return new_array


@notrace_primitive
def metadata(A):
    # Returns the (shape, ndim, result dtype, iscomplex) tuple of A.
    return _np.shape(A), _np.ndim(A), _np.result_type(A), _np.iscomplexobj(A)


@notrace_primitive
def parse_einsum_input(*args):
    # Forwards the einsum operands, as a tuple, to NumPy's private parser.
    return _parse_einsum_input(args)


# On NumPy >= 2.0 the `astype` function wrapped by wrap_namespace is reused;
# on older versions a primitive around the ndarray.astype method is defined.
if _np.lib.NumpyVersion(_np.__version__) >= "2.0.0":
    # Wrapped above
    _astype = astype
else:

    @primitive
    def _astype(A, dtype, order="K", casting="unsafe", subok=True, copy=True):
        return A.astype(dtype, order, casting, subok, copy)


================================================
FILE: autograd/numpy/random.py
================================================
import numpy.random as npr

from .numpy_wrapper import wrap_namespace

wrap_namespace(npr.__dict__, globals())


================================================
FILE: autograd/scipy/__init__.py
================================================
from . import integrate, signal, special, stats


================================================
FILE: autograd/scipy/integrate.py
================================================
import scipy.integrate

import autograd.numpy as np
from autograd import make_vjp
from autograd.builtins import tuple
from autograd.extend import defvjp_argnums, primitive
from autograd.misc import flatten

# scipy's odeint wrapped as an autograd primitive; its VJPs are registered at
# the bottom of this module.
odeint = primitive(scipy.integrate.odeint)


def grad_odeint(yt, func, y0, t, func_args, **kwargs):
    """Build the function that computes every VJP of ``odeint``.

    ``yt`` is the already-computed ``odeint`` output (unpacked below as ``T``
    rows of ``D`` state entries); the remaining arguments are the ones
    ``odeint`` was called with.  The returned ``vjp_all(g)`` yields a 4-tuple
    lined up with ``(func, y0, t, func_args)``: ``None`` for ``func``,
    followed by the VJPs with respect to ``y0``, ``t`` and ``func_args``.
    """
    # Extended from "Scalable Inference of Ordinary Differential
    # Equation Models of Biochemical Processes", Sec. 2.4.2
    # Fabian Froehlich, Carolin Loos, Jan Hasenauer, 2017
    # https://arxiv.org/abs/1711.08079

    T, D = np.shape(yt)
    # Work on a flat vector of the extra arguments so that their VJP can be
    # carried inside the augmented ODE state; ``unflatten`` restores the
    # original structure.
    flat_args, unflatten = flatten(func_args)

    def flat_func(y, t, flat_args):
        return func(y, t, *unflatten(flat_args))

    def unpack(x):
        # Split an augmented state vector back into its four pieces.
        #      y,      vjp_y,      vjp_t,    vjp_args
        return x[0:D], x[D : 2 * D], x[2 * D], x[2 * D + 1 :]

    def augmented_dynamics(augmented_state, t, flat_args):
        # Original system augmented with vjp_y, vjp_t and vjp_args.
        y, vjp_y, _, _ = unpack(augmented_state)
        # Note: this local ``vjp_all`` is the VJP of flat_func, not the outer
        # function of the same name defined below.
        vjp_all, dy_dt = make_vjp(flat_func, argnum=(0, 1, 2))(y, t, flat_args)
        vjp_y, vjp_t, vjp_args = vjp_all(-vjp_y)
        return np.hstack((dy_dt, vjp_y, vjp_t, vjp_args))

    def vjp_all(g):
        # Start from the cotangent of the last output row and walk the
        # time points backwards.
        vjp_y = g[-1, :]
        vjp_t0 = 0
        time_vjp_list = []
        vjp_args = np.zeros(np.size(flat_args))

        for i in range(T - 1, 0, -1):
            # Compute effect of moving measurement time.
            vjp_cur_t = np.dot(func(yt[i, :], t[i], *func_args), g[i, :])
            time_vjp_list.append(vjp_cur_t)
            vjp_t0 = vjp_t0 - vjp_cur_t

            # Run augmented system backwards to the previous observation.
            aug_y0 = np.hstack((yt[i, :], vjp_y, vjp_t0, vjp_args))
            aug_ans = odeint(
                augmented_dynamics, aug_y0, np.array([t[i], t[i - 1]]), tuple((flat_args,)), **kwargs
            )
            _, vjp_y, vjp_t0, vjp_args = unpack(aug_ans[1])

            # Add gradient from current output.
            vjp_y = vjp_y + g[i - 1, :]

        # The list was filled from the last time point to the first, so append
        # the t[0] entry and reverse to get the VJPs in time order.
        time_vjp_list.append(vjp_t0)
        vjp_times = np.hstack(time_vjp_list)[::-1]

        return None, vjp_y, vjp_times, unflatten(vjp_args)

    return vjp_all


def argnums_unpack(all_vjp_builder):
    """Adapt an "all arguments" VJP builder to the argnums-style interface.

    ``all_vjp_builder(ans, *args, **kwargs)`` must return a function mapping
    ``g`` to an indexable collection holding one VJP per positional argument.
    The wrapper returned here has the signature
    ``(argnums, ans, combined_args, kwargs)`` and produces a function that
    returns, as a list, only the VJPs selected by ``argnums``.
    """

    def build_selected_vjps(argnums, ans, combined_args, kwargs):
        full_vjp = all_vjp_builder(ans, *combined_args, **kwargs)

        def chosen_vjps(g):
            # Evaluate every VJP once, then keep only the requested ones.
            every_vjp = full_vjp(g)
            picked = []
            for argnum in argnums:
                picked.append(every_vjp[argnum])
            return picked

        return chosen_vjps

    return build_selected_vjps


# Register grad_odeint (adapted by argnums_unpack) as the VJP rule of odeint.
defvjp_argnums(odeint, argnums_unpack(grad_odeint))


================================================
FILE: autograd/scipy/linalg.py
================================================
from functools import partial

import scipy.linalg

import autograd.numpy as anp
from autograd.extend import defjvp, defjvp_argnums, defvjp, defvjp_argnums
from autograd.numpy.numpy_wrapper import wrap_namespace

# The solver names used below (sqrtm, solve_triangular, solve_banded,
# solve_sylvester) are not imported explicitly in this module; they are
# expected to come from this call.
wrap_namespace(scipy.linalg.__dict__, globals())  # populates module namespace


def _vjp_sqrtm(ans, A, disp=True, blocksize=64):
    """Build the VJP of ``sqrtm``; only ``disp=True`` is supported."""
    assert disp, "sqrtm vjp not implemented for disp=False"
    root_t = anp.transpose(ans)
    # The cotangent is obtained from a Sylvester solve with the transposed
    # result on both sides; only the real part is returned.
    return lambda g: anp.real(solve_sylvester(root_t, root_t, g))


# Register the reverse-mode rule above for sqrtm.
defvjp(sqrtm, _vjp_sqrtm)


def _flip(a, trans):
    """Return the ``trans`` code that toggles the given one.

    ``"N"``/``0`` maps to ``"H"`` when ``a`` is a complex object and to
    ``"T"`` otherwise; every other value of ``trans`` maps to ``"N"``.
    """
    if trans not in ("N", 0):
        return "N"
    return "H" if anp.iscomplexobj(a) else "T"


def grad_solve_triangular(ans, a, b, trans=0, lower=False, **kwargs):
    """Build the VJP of ``solve_triangular`` with respect to the matrix ``a``."""
    flipped = _flip(a, trans)
    # True when the forward call already used a transposed system, i.e. the
    # adjoint solve below runs with trans="N".
    flips_to_plain = flipped == "N"
    tri = anp.tril if (lower ^ flips_to_plain) else anp.triu

    def as_2d(x):
        # Give 1-D operands a trailing axis so the outer product is a matrix.
        return x if x.ndim > 1 else x[..., None]

    def vjp(g):
        v = as_2d(solve_triangular(a, g, trans=flipped, lower=lower))
        masked = tri(anp.dot(v, as_2d(ans).T))
        return -(masked.T if flips_to_plain else masked)

    return vjp


# First rule: gradient w.r.t. ``a``; second rule: gradient w.r.t. ``b``,
# which is a triangular solve with the flipped ``trans`` code.
defvjp(
    solve_triangular,
    grad_solve_triangular,
    lambda ans, a, b, trans=0, lower=False, **kwargs: (
        lambda g: solve_triangular(a, g, trans=_flip(a, trans), lower=lower)
    ),
)


def grad_solve_banded(argnum, ans, l_and_u, a, b):
    """Build the VJP of ``solve_banded`` for one of its array arguments.

    ``argnum == 1`` gives the rule for the banded matrix ``a`` and
    ``argnum == 2`` the rule for the right-hand side ``b``; any other value
    falls through and returns ``None``.  ``l_and_u`` is indexed as
    ``(l_and_u[0], l_and_u[1])`` — presumably the (lower, upper) band counts
    that scipy's ``solve_banded`` takes.
    """
    # Append a trailing axis to operands that have fewer dimensions than ``a``.
    updim = lambda x: x if x.ndim == a.ndim else x[..., None]

    def transpose_banded(l_and_u, a):
        # Compute the transpose of a banded matrix.
        # The transpose is itself a banded matrix.

        num_rows = a.shape[0]

        shifts = anp.arange(-l_and_u[1], l_and_u[0] + 1)

        # Roll each stored row by its shift and stack the rows in reverse order.
        T_a = anp.roll(a[:1, :], shifts[0])
        for rr in range(1, num_rows):
            T_a = anp.vstack([T_a, anp.flipud(anp.roll(a[rr : rr + 1, :], shifts[rr]))])
        T_a = anp.flipud(T_a)

        # The band counts swap roles under transposition.
        T_l_and_u = anp.flip(l_and_u)

        return T_l_and_u, T_a

    def banded_dot(l_and_u, uu, vv):
        # Compute tensor product of vectors uu and vv.
        # Tensor product elements are restricted to the bands specified by l_and_u.

        # TODO: replace the brute-force ravel() by smarter dimension handling of uu and vv

        # main diagonal
        banded_uv = anp.ravel(uu) * anp.ravel(vv)

        # stack below the sub-diagonals
        for rr in range(1, l_and_u[0] + 1):
            banded_uv_rr = anp.hstack([anp.ravel(uu)[rr:] * anp.ravel(vv)[:-rr], anp.zeros(rr)])
            banded_uv = anp.vstack([banded_uv, banded_uv_rr])

        # stack above the sup-diagonals
        for rr in range(1, l_and_u[1] + 1):
            banded_uv_rr = anp.hstack([anp.zeros(rr), anp.ravel(uu)[:-rr] * anp.ravel(vv)[rr:]])
            banded_uv = anp.vstack([banded_uv_rr, banded_uv])

        return banded_uv

    # Both rules solve against the transposed banded system.
    T_l_and_u, T_a = transpose_banded(l_and_u, a)

    if argnum == 1:
        return lambda g: (
            -banded_dot(l_and_u, updim(solve_banded(T_l_and_u, T_a, g)), anp.transpose(updim(ans)))
        )
    elif argnum == 2:
        return lambda g: solve_banded(T_l_and_u, T_a, g)


# Only ``a`` (argument 1) and ``b`` (argument 2) get gradient rules.
defvjp(solve_banded, partial(grad_solve_banded, 1), partial(grad_solve_banded, 2), argnums=[1, 2])


def _jvp_sqrtm(dA, ans, A, disp=True, blocksize=64):
    # Forward-mode rule for sqrtm: the output tangent is the result of
    # solve_sylvester(ans, ans, dA).  Only disp=True is supported.
    assert disp, "sqrtm jvp not implemented for disp=False"
    return solve_sylvester(ans, ans, dA)


# Register the forward-mode rule above for sqrtm.
defjvp(sqrtm, _jvp_sqrtm)


def _jvp_sylvester(argnums, dms, ans, args, _):
    """Forward-mode rule for ``solve_sylvester(a, b, q)``.

    ``dms`` holds one tangent per entry of ``argnums``; a tangent that was
    not requested is taken to be 0.  The result is the Sylvester solve with
    right-hand side ``dq - da @ ans - ans @ db``.
    """
    a, b, q = args
    wrt_a, wrt_b, wrt_q = (0 in argnums), (1 in argnums), (2 in argnums)
    da = dms[0] if wrt_a else 0
    # b's tangent comes second when a's is present, otherwise first.
    db = dms[1 if wrt_a else 0] if wrt_b else 0
    # q's tangent, when present, is always the last one.
    dq = dms[-1] if wrt_q else 0
    rhs = dq - anp.dot(da, ans) - anp.dot(ans, db)
    return solve_sylvester(a, b, rhs)


# Register the forward-mode rule above for solve_sylvester.
defjvp_argnums(solve_sylvester, _jvp_sylvester)


def _vjp_sylvester(argnums, ans, args, _):
    """Reverse-mode rule for ``solve_sylvester(a, b, q)``.

    The returned function maps ``g`` to a tuple with one VJP per requested
    argument, in the order a (0), b (1), q (2).
    """
    a, b, q = args

    def vjp(g):
        # The VJP w.r.t. q is a Sylvester solve with transposed coefficients;
        # the other two are products of it with the transposed answer.
        q_vjp = solve_sylvester(anp.transpose(a), anp.transpose(b), g)
        rules = (
            lambda: -anp.dot(q_vjp, anp.transpose(ans)),
            lambda: -anp.dot(anp.transpose(ans), q_vjp),
            lambda: q_vjp,
        )
        return tuple(rule() for position, rule in enumerate(rules) if position in argnums)

    return vjp


# Register the reverse-mode rule above for solve_sylvester.
defvjp_argnums(solve_sylvester, _vjp_sylvester)


================================================
FILE: autograd/scipy/signal.py
================================================
from functools import partial

import numpy as npo  # original numpy
from numpy.lib.stride_tricks import as_strided

import autograd.numpy as np
from autograd.extend import defvjp, primitive


@primitive
def convolve(A, B, axes=None, dot_axes=[(), ()], mode="full"):
    """Convolve ``A`` with ``B`` along paired axes, contracting ``dot_axes``.

    ``axes`` is a pair ``[axes_of_A, axes_of_B]`` naming the axes to convolve
    (default: every axis of ``A``, used for both arrays); ``dot_axes`` is a
    pair naming axes that are summed over.  Only ``mode`` "valid" and "full"
    are accepted.  In "valid" mode an ``Exception`` is raised when neither
    array is at least as large as the other along all convolved axes.
    """
    assert mode in ["valid", "full"], f"Mode {mode} not yet implemented"
    if axes is None:
        axes = [list(range(A.ndim)), list(range(A.ndim))]
    # The sliding-window code below needs B to be the larger operand.
    wrong_order = any([B.shape[ax_B] < A.shape[ax_A] for ax_A, ax_B in zip(*axes)])
    if wrong_order:
        if mode == "valid" and not all([B.shape[ax_B] <= A.shape[ax_A] for ax_A, ax_B in zip(*axes)]):
            raise Exception("One array must be larger than the other along all convolved dimensions")
        elif mode != "full" or B.size <= A.size:  # Tie breaker
            # Swap the operands, then permute the result's axes back into
            # (ignore_A, ignore_B, conv) order.
            i1 = B.ndim - len(dot_axes[1]) - len(axes[1])  # B ignore
            i2 = i1 + A.ndim - len(dot_axes[0]) - len(axes[0])  # A ignore
            i3 = i2 + len(axes[0])
            ignore_B = list(range(i1))
            ignore_A = list(range(i1, i2))
            conv = list(range(i2, i3))
            return convolve(B, A, axes=axes[::-1], dot_axes=dot_axes[::-1], mode=mode).transpose(
                ignore_A + ignore_B + conv
            )

    if mode == "full":
        # "full" is computed as "valid" on a zero-padded B.
        B = pad_to_full(B, A, axes[::-1])
    # Build a strided view of B: each convolved axis is cut down to A's length
    # and gains a companion trailing axis (same stride) that enumerates the
    # window positions.
    B_view_shape = list(B.shape)
    B_view_strides = list(B.strides)
    flipped_idxs = [slice(None)] * A.ndim
    for ax_A, ax_B in zip(*axes):
        B_view_shape.append(abs(B.shape[ax_B] - A.shape[ax_A]) + 1)
        B_view_strides.append(B.strides[ax_B])
        B_view_shape[ax_B] = A.shape[ax_A]
        flipped_idxs[ax_A] = slice(None, None, -1)
    B_view = as_strided(B, B_view_shape, B_view_strides)
    # A is reversed along every convolved axis before the contraction.
    A_view = A[tuple(flipped_idxs)]
    # Contract over the convolved axes and the dot axes in one einsum.
    all_axes = [list(axes[i]) + list(dot_axes[i]) for i in [0, 1]]
    return einsum_tensordot(A_view, B_view, all_axes)


def einsum_tensordot(A, B, axes, reverse=False):
    """Tensor dot product of ``A`` and ``B`` expressed through ``einsum``.

    ``axes`` is a pair ``(axes_of_A, axes_of_B)``; the i-th axis of each list
    is summed against the other.  ``reverse`` is accepted but not used.
    Going through einsum shouldn't require a copy.
    """
    # Every axis starts with its own integer label; paired axes are then
    # relabelled with a shared label so einsum sums over them.
    labels_A = list(range(A.ndim))
    labels_B = [A.ndim + k for k in range(B.ndim)]
    first_shared = A.ndim + B.ndim
    for offset, (ax_A, ax_B) in enumerate(zip(*axes)):
        shared = first_shared + offset
        labels_A[ax_A] = shared
        labels_B[ax_B] = shared
    return npo.einsum(A, labels_A, B, labels_B)


def pad_to_full(A, B, axes):
    """Zero-pad ``A`` on both sides of each paired axis by ``len(B axis) - 1``.

    ``axes`` is a pair ``(axes_of_A, axes_of_B)``; axes of ``A`` that are not
    listed are left unpadded.
    """
    pad_widths = [(0, 0) for _ in range(A.ndim)]
    for ax_A, ax_B in zip(*axes):
        margin = B.shape[ax_B] - 1
        pad_widths[ax_A] = (margin, margin)
    return npo.pad(A, pad_widths, mode="constant")


def parse_axes(A_shape, B_shape, conv_axes, dot_axes, mode):
    """Classify the axes of a convolution's operands and output.

    Parameters:
        A_shape, B_shape: shapes of the two operands.
        conv_axes: pair ``(axes_of_A, axes_of_B)`` to convolve, or ``None``
            to use every axis of ``A`` for both operands.
        dot_axes: pair ``(axes_of_A, axes_of_B)`` that are summed over.
        mode: convolution mode, forwarded to ``compute_conv_size``.

    Returns:
        ``(axes, shapes)``: two dicts keyed by ``"A"``, ``"B"`` and ``"out"``.
        For the operands the inner keys are ``"conv"``, ``"dot"`` and
        ``"ignore"`` (every axis in neither list); for the output they are
        ``"ignore_A"``, ``"ignore_B"`` and ``"conv"``.  Every value is a
        tuple, of axis indices in ``axes`` and of lengths in ``shapes``.

    Raises:
        AssertionError: if the operands disagree on the number of conv or
            dot axes.
        ValueError: if ``mode`` is not recognized.
    """
    A_ndim, B_ndim = len(A_shape), len(B_shape)
    if conv_axes is None:
        conv_axes = (
            tuple(range(A_ndim)),
            tuple(range(A_ndim)),
        )
    axes = {
        "A": {
            "conv": tuple(conv_axes[0]),
            "dot": tuple(dot_axes[0]),
            "ignore": tuple(i for i in range(A_ndim) if i not in conv_axes[0] and i not in dot_axes[0]),
        },
        "B": {
            "conv": tuple(conv_axes[1]),
            "dot": tuple(dot_axes[1]),
            "ignore": tuple(i for i in range(B_ndim) if i not in conv_axes[1] and i not in dot_axes[1]),
        },
    }
    assert len(axes["A"]["dot"]) == len(axes["B"]["dot"])
    assert len(axes["A"]["conv"]) == len(axes["B"]["conv"])
    # Output axes are laid out as: A's ignored axes, B's ignored axes, conv axes.
    i1 = len(axes["A"]["ignore"])
    i2 = i1 + len(axes["B"]["ignore"])
    i3 = i2 + len(axes["A"]["conv"])
    axes["out"] = {
        "ignore_A": tuple(range(i1)),
        "ignore_B": tuple(range(i1, i2)),
        "conv": tuple(range(i2, i3)),
    }
    # Materialise as a tuple: a bare generator expression here would be
    # single-use, would not compare equal to a shape tuple, and would defer
    # the mode check until someone iterated it.
    conv_shape = tuple(
        compute_conv_size(A_shape[i], B_shape[j], mode) for i, j in zip(axes["A"]["conv"], axes["B"]["conv"])
    )
    shapes = {
        "A": {s: tuple(A_shape[i] for i in ax) for s, ax in axes["A"].items()},
        "B": {s: tuple(B_shape[i] for i in ax) for s, ax in axes["B"].items()},
    }
    shapes["out"] = {
        "ignore_A": shapes["A"]["ignore"],
        "ignore_B": shapes["B"]["ignore"],
        "conv": conv_shape,
    }
    return axes, shapes


def compute_conv_size(A_size, B_size, mode):
    """Return the output length of a 1-D convolution of the given lengths.

    ``mode`` is "full", "same" or "valid"; anything else raises
    ``ValueError`` (a subclass of ``Exception``).
    """
    if mode == "full":
        return A_size + B_size - 1
    elif mode == "same":
        return A_size
    elif mode == "valid":
        return abs(A_size - B_size) + 1
    else:
        raise ValueError(f"Mode {mode} not recognized")


def flipped_idxs(ndim, axes):
    """Return an ``ndim``-long index tuple that reverses the given ``axes``.

    Listed axes get ``slice(None, None, -1)``; all others get ``slice(None)``.
    """
    keep, reverse = slice(None), slice(None, None, -1)
    index = [keep for _ in range(ndim)]
    for axis in axes:
        index[axis] = reverse
    return tuple(index)


def grad_convolve(argnum, ans, A, B, axes=None, dot_axes=[(), ()], mode="full"):
    """Build the VJP of ``convolve`` with respect to ``A`` (0) or ``B`` (1).

    The gradient is itself a convolution of the incoming cotangent with the
    other operand, reversed along its convolved axes.  Only modes "valid"
    and "full" are supported; any other ``argnum`` raises
    ``NotImplementedError``.
    """
    assert mode in ["valid", "full"], f"Grad for mode {mode} not yet implemented"
    axes, shapes = parse_axes(A.shape, B.shape, axes, dot_axes, mode)
    # ``wrt`` names the operand being differentiated, ``other`` the one that
    # the cotangent gets convolved with.
    if argnum == 0:
        wrt, other, Y = "A", "B", B
    elif argnum == 1:
        wrt, other, Y = "B", "A", A
    else:
        raise NotImplementedError(f"Can't take grad of convolve w.r.t. arg {argnum}")
    ignore_other = "ignore_" + other

    # A "full" forward pass is undone by a "valid" one.  A "valid" forward
    # pass needs "full" only when the differentiated operand is the larger
    # one along some convolved axis.
    wrt_is_larger = mode != "full" and any(
        x_size > y_size for x_size, y_size in zip(shapes[wrt]["conv"], shapes[other]["conv"])
    )
    new_mode = "full" if wrt_is_larger else "valid"

    def vjp(g):
        flipped_other = Y[flipped_idxs(Y.ndim, axes[other]["conv"])]
        result = convolve(
            g,
            flipped_other,
            axes=[axes["out"]["conv"], axes[other]["conv"]],
            dot_axes=[axes["out"][ignore_other], axes[other]["ignore"]],
            mode=new_mode,
        )
        # The result's axes come out as (ignore, dot, conv); argsort yields
        # the permutation that puts them back in the operand's own order.
        wrt_axes = axes[wrt]
        new_order = npo.argsort(wrt_axes["ignore"] + wrt_axes["dot"] + wrt_axes["conv"])
        return np.transpose(result, new_order)

    return vjp


# One rule per array argument of convolve.
defvjp(convolve, partial(grad_convolve, 0), partial(grad_convolve, 1))


================================================
FILE: autograd/scipy/special.py
================================================
import scipy.special

import autograd.numpy as np
from autograd.extend import defjvp, defvjp, primitive
from autograd.numpy.numpy_vjps import repeat_to_match_shape, unbroadcast_f

### Beta function ###
beta = primitive(scipy.special.beta)
betainc = primitive(scipy.special.betainc)
betaln = primitive(scipy.special.betaln)

# d beta(a, b) / da = beta(a, b) * (psi(a) - psi(a + b)), and likewise for b.
# ``psi`` is bound further down in this module; that is fine because the
# lambdas only look it up when a gradient is evaluated.
defvjp(
    beta,
    lambda ans, a, b: unbroadcast_f(a, lambda g: g * ans * (psi(a) - psi(a + b))),
    lambda ans, a, b: unbroadcast_f(b, lambda g: g * ans * (psi(b) - psi(a + b))),
)
# betainc is only differentiated with respect to x (argument 2):
# x**(a - 1) * (1 - x)**(b - 1) / beta(a, b).
defvjp(
    betainc,
    lambda ans, a, b, x: unbroadcast_f(
        x, lambda g: g * np.power(x, a - 1) * np.power(1 - x, b - 1) / beta(a, b)
    ),
    argnums=[2],
)
# Same as the beta rule without the factor ``ans``.
defvjp(
    betaln,
    lambda ans, a, b: unbroadcast_f(a, lambda g: g * (psi(a) - psi(a + b))),
    lambda ans, a, b: unbroadcast_f(b, lambda g: g * (psi(b) - psi(a + b))),
)

### Gamma functions ###
polygamma = primitive(scipy.special.polygamma)
psi = primitive(scipy.special.psi)  # psi(x) is just polygamma(0, x)
digamma = primitive(scipy.special.digamma)  # digamma is another name for psi.
gamma = primitive(scipy.special.gamma)
gammaln = primitive(scipy.special.gammaln)
gammainc = primitive(scipy.special.gammainc)
gammaincc = primitive(scipy.special.gammaincc)
gammasgn = primitive(scipy.special.gammasgn)
rgamma = primitive(scipy.special.rgamma)
multigammaln = primitive(scipy.special.multigammaln)

# A ``None`` entry means no gradient rule is given for that argument.
defvjp(gammasgn, None)
# Differentiating polygamma(n, x) in x raises the order: polygamma(n + 1, x).
defvjp(polygamma, None, lambda ans, n, x: lambda g: g * polygamma(n + 1, x))
defvjp(psi, lambda ans, x: lambda g: g * polygamma(1, x))
defvjp(digamma, lambda ans, x: lambda g: g * polygamma(1, x))
# gamma'(x) = gamma(x) * psi(x); gammaln'(x) = psi(x).
defvjp(gamma, lambda ans, x: lambda g: g * ans * psi(x))
defvjp(gammaln, lambda ans, x: lambda g: g * psi(x))
# rgamma is 1 / gamma, hence the derivative -psi(x) / gamma(x).
defvjp(rgamma, lambda ans, x: lambda g: g * psi(x) / -gamma(x))
# multigammaln is differentiated in ``a`` only: a sum of digamma terms
# digamma(a - j / 2) for j = 0 .. d - 1, taken over a new trailing axis.
defvjp(
    multigammaln,
    lambda ans, a, d: lambda g: g * np.sum(digamma(np.expand_dims(a, -1) - np.arange(d) / 2.0), -1),
    None,
)


def make_gammainc_vjp_arg1(sign):
    # Factory for the x-gradient rule shared by gammainc and gammaincc.  The
    # two derivatives differ only in sign:
    #     sign * exp(-x) * x**(a - 1) / gamma(a)
    def gammainc_vjp_arg1(ans, a, x):
        coeffs = sign * np.exp(-x) * np.power(x, a - 1) / gamma(a)
        return unbroadcast_f(x, lambda g: g * coeffs)

    return gammainc_vjp_arg1


# Only the gradient with respect to x (argument 1) is defined.
defvjp(gammainc, make_gammainc_vjp_arg1(1), argnums=[1])
defvjp(gammaincc, make_gammainc_vjp_arg1(-1), argnums=[1])

### Bessel functions ###

j0 = primitive(scipy.special.j0)
y0 = primitive(scipy.special.y0)
j1 = primitive(scipy.special.j1)
y1 = primitive(scipy.special.y1)
jn = primitive(scipy.special.jn)
yn = primitive(scipy.special.yn)

# Order-0 derivatives are minus the order-1 function; higher orders use the
# half-difference of the neighbouring orders.  The order argument ``n`` of
# jn / yn gets no gradient rule (``None``).
defvjp(j0, lambda ans, x: lambda g: -g * j1(x))
defvjp(y0, lambda ans, x: lambda g: -g * y1(x))
defvjp(j1, lambda ans, x: lambda g: g * (j0(x) - jn(2, x)) / 2.0)
defvjp(y1, lambda ans, x: lambda g: g * (y0(x) - yn(2, x)) / 2.0)
defvjp(jn, None, lambda ans, n, x: lambda g: g * (jn(n - 1, x) - jn(n + 1, x)) / 2.0)
defvjp(yn, None, lambda ans, n, x: lambda g: g * (yn(n - 1, x) - yn(n + 1, x)) / 2.0)


### Faster versions of common Bessel functions ###
i0 = primitive(scipy.special.i0)
i1 = primitive(scipy.special.i1)
iv = primitive(scipy.special.iv)
ive = primitive(scipy.special.ive)

# Here the neighbouring orders are added rather than subtracted; the order
# argument of iv / ive again gets no gradient rule.
defvjp(i0, lambda ans, x: lambda g: g * i1(x))
defvjp(i1, lambda ans, x: lambda g: g * (i0(x) + iv(2, x)) / 2.0)
defvjp(iv, None, lambda ans, n, x: lambda g: g * (iv(n - 1, x) + iv(n + 1, x)) / 2.0)
defvjp(ive, None, lambda ans, n, x: lambda g: g * (ans * (n / x - np.sign(x)) + ive(n + 1, x)))

### Error Function ###
inv_root_pi = 0.56418958354775627928  # 1 / sqrt(pi)
erf = primitive(scipy.special.erf)
erfc = primitive(scipy.special.erfc)

# erf'(x) = 2 / sqrt(pi) * exp(-x**2); erfc has the opposite sign.
defvjp(erf, lambda ans, x: lambda g: 2.0 * g * inv_root_pi * np.exp(-(x**2)))
defvjp(erfc, lambda ans, x: lambda g: -2.0 * g * inv_root_pi * np.exp(-(x**2)))


### Inverse error function ###
root_pi = 1.7724538509055159  # sqrt(pi)
erfinv = primitive(scipy.special.erfinv)
erfcinv = primitive(scipy.special.erfcinv)

# erfinv'(x) = sqrt(pi) / 2 * exp(erfinv(x)**2); erfcinv has the opposite sign.
# Both rules call the function again on x rather than reusing ``ans``.
defvjp(erfinv, lambda ans, x: lambda g: g * root_pi / 2 * np.exp(erfinv(x) ** 2))
defvjp(erfcinv, lambda ans, x: lambda g: -g * root_pi / 2 * np.exp(erfcinv(x) ** 2))

### Logit and Expit ###
logit = primitive(scipy.special.logit)
expit = primitive(scipy.special.expit)

# logit'(x) = 1 / (x * (1 - x)); expit'(x) = expit(x) * (1 - expit(x)).
defvjp(logit, lambda ans, x: lambda g: g / (x * (1 - x)))
defvjp(expit, lambda ans, x: lambda g: g * ans * (1 - ans))

### logsumexp ###
logsumexp = primitive(scipy.special.logsumexp)


def make_grad_logsumexp(ans, x, axis=None, b=1.0, keepdims=False):
    """Build the VJP of ``logsumexp`` with respect to ``x``.

    Both the incoming cotangent and the forward result are expanded back to
    the shape of ``x`` before forming ``g * b * exp(x - ans)``.
    """
    x_shape = np.shape(x)
    x_dtype = np.result_type(x)

    def expand_like_x(value):
        # repeat_to_match_shape returns a pair; only its first item is needed.
        return repeat_to_match_shape(value, x_shape, x_dtype, axis, keepdims)[0]

    def vjp(g):
        return expand_like_x(g) * b * np.exp(x - expand_like_x(ans))

    return vjp


# Register the reverse-mode rule for logsumexp.
defvjp(logsumexp, make_grad_logsumexp)


def fwd_grad_logsumexp(g, ans, x, axis=None, b=1.0, keepdims=False):
    """Forward-mode rule for ``logsumexp``: ``sum(g * b * exp(x - ans))``.

    When ``keepdims`` is false the reduced axes are re-inserted into ``ans``
    first (for an int or tuple ``axis``) so that it broadcasts against ``x``.
    """
    if not keepdims:
        if isinstance(axis, int):
            reduced_axes = [axis]
        elif isinstance(axis, tuple):
            reduced_axes = sorted(axis)
        else:
            reduced_axes = []
        for ax in reduced_axes:
            ans = np.expand_dims(ans, ax)
    weighted = g * b * np.exp(x - ans)
    return np.sum(weighted, axis=axis, keepdims=keepdims)


# Register the forward-mode rule for logsumexp.
defjvp(logsumexp, fwd_grad_logsumexp)


================================================
FILE: autograd/scipy/stats/__init__.py
================================================
from . import beta, chi2, gamma, norm, poisson, t

# Try block needed in case the user has an
# old version of scipy without multivariate normal.
try:
    from . import multivariate_normal
except AttributeError:
    pass

try:
    from . import dirichlet
except AttributeError:
    pass


================================================
FILE: autograd/scipy/stats/beta.py
================================================
import scipy.stats

import autograd.numpy as np
from autograd.extend import defvjp, primitive
from autograd.numpy.numpy_vjps import unbroadcast_f
from autograd.scipy.special import beta, psi

# scipy.stats.beta functions wrapped as autograd primitives; their gradient
# rules are registered at the bottom of this module.
cdf = primitive(scipy.stats.beta.cdf)
logpdf = primitive(scipy.stats.beta.logpdf)
pdf = primitive(scipy.stats.beta.pdf)


def grad_beta_logpdf_arg0(x, a, b):
    """Derivative of the beta log-density with respect to ``x``."""
    numerator = 1 + a * (x - 1) + x * (b - 2)
    denominator = x * (x - 1)
    return numerator / denominator


def grad_beta_logpdf_arg1(x, a, b):
    """Derivative of the beta log-density with respect to ``a``."""
    log_x = np.log(x)
    return log_x - psi(a) + psi(a + b)


def grad_beta_logpdf_arg2(x, a, b):
    """Derivative of the beta log-density with respect to ``b``."""
    log_one_minus_x = np.log1p(-x)
    return log_one_minus_x - psi(b) + psi(a + b)


# The cdf is only differentiated with respect to x (argument 0):
# x**(a - 1) * (1 - x)**(b - 1) / beta(a, b).
defvjp(
    cdf,
    lambda ans, x, a, b: unbroadcast_f(
        x, lambda g: g * np.power(x, a - 1) * np.power(1 - x, b - 1) / beta(a, b)
    ),
    argnums=[0],
)
# One rule per argument (x, a, b), using the helpers above.
defvjp(
    logpdf,
    lambda ans, x, a, b: unbroadcast_f(x, lambda g: g * grad_beta_logpdf_arg0(x, a, b)),
    lambda ans, x, a, b: unbroadcast_f(a, lambda g: g * grad_beta_logpdf_arg1(x, a, b)),
    lambda ans, x, a, b: unbroadcast_f(b, lambda g: g * grad_beta_logpdf_arg2(x, a, b)),
)
# Same as the logpdf rules, multiplied by the pdf value (ans).
defvjp(
    pdf,
    lambda ans, x, a, b: unbroadcast_f(x, lambda g: g * ans * grad_beta_logpdf_arg0(x, a, b)),
    lambda ans, x, a, b: unbroadcast_f(a, lambda g: g * ans * grad_beta_logpdf_arg1(x, a, b)),
    lambda ans, x, a, b: unbroadcast_f(b, lambda g: g * ans * grad_beta_logpdf_arg2(x, a, b)),
)


================================================
FILE: autograd/scipy/stats/chi2.py
================================================
import scipy.stats

import autograd.numpy as np
from autograd.extend import defvjp, primitive
from autograd.numpy.numpy_vjps import unbroadcast_f
from autograd.scipy.special import gamma

# scipy.stats.chi2 functions wrapped as autograd primitives; their gradient
# rules are registered at the bottom of this module.
cdf = primitive(scipy.stats.chi2.cdf)
logpdf = primitive(scipy.stats.chi2.logpdf)
pdf = primitive(scipy.stats.chi2.pdf)


def grad_chi2_logpdf(x, df):
    """Derivative of the chi-square log-density with respect to ``x``.

    Evaluates to ``(df - x - 2) / (2 * x)`` where ``df`` is a whole number
    and to 0 elsewhere.
    """
    df_is_whole = df % 1 == 0
    slope = (df - x - 2) / (2 * x)
    return np.where(df_is_whole, slope, 0)


# All three functions are differentiated with respect to x (argument 0) only.
# The cdf rule is 2**(-df / 2) * exp(-x / 2) * x**(df / 2 - 1) / gamma(df / 2).
defvjp(
    cdf,
    lambda ans, x, df: unbroadcast_f(
        x, lambda g: g * np.power(2.0, -df / 2) * np.exp(-x / 2) * np.power(x, df / 2 - 1) / gamma(df / 2)
    ),
    argnums=[0],
)
# The pdf rule is the logpdf rule multiplied by the pdf value (ans).
defvjp(logpdf, lambda ans, x, df: unbroadcast_f(x, lambda g: g * grad_chi2_logpdf(x, df)), argnums=[0])
defvjp(pdf, lambda ans, x, df: unbroadcast_f(x, lambda g: g * ans * grad_chi2_logpdf(x, df)), argnums=[0])


================================================
FILE: autograd/scipy/stats/dirichlet.py
================================================
import scipy.stats

import autograd.numpy as np
from autograd.extend import defvjp, primitive
from autograd.scipy.special import digamma

# scipy.stats.dirichlet functions wrapped as autograd primitives.  No
# gradient rule is registered for ``rvs`` in this module.
rvs = primitive(scipy.stats.dirichlet.rvs)
pdf = primitive(scipy.stats.dirichlet.pdf)
logpdf = primitive(scipy.stats.dirichlet.logpdf)

# First rule: gradient w.r.t. x, (alpha - 1) / x.
# Second rule: gradient w.r.t. alpha,
# digamma(sum(alpha)) - digamma(alpha) + log(x).
defvjp(
    logpdf,
    lambda ans, x, alpha: lambda g: g * (alpha - 1) / x,
    lambda ans, x, alpha: lambda g: g * (digamma(np.sum(alpha)) - digamma(alpha) + np.log(x)),
)

# Same as log pdf, but multiplied by the pdf (ans).
defvjp(
    pdf,
    lambda ans, x, alpha: lambda g: g * ans * (alpha - 1) / x,
    lambda ans, x, alpha: lambda g: g * ans * (digamma(np.sum(alpha)) - digamma(alpha) + np.log(x)),
)


================================================
FILE: autograd/scipy/stats/gamma.py
================================================
import scipy.stats

import autograd.numpy as np
from autograd.extend import defvjp, primitive
from autograd.numpy.numpy_vjps import unbroadcast_f
from autograd.scipy.special import gamma, psi

# scipy.stats.gamma functions wrapped as autograd primitives; their gradient
# rules are registered at the bottom of this module.
cdf = primitive(scipy.stats.gamma.cdf)
logpdf = primitive(scipy.stats.gamma.logpdf)
pdf = primitive(scipy.stats.gamma.pdf)


def grad_gamma_logpdf_arg0(x, a):
    """Derivative of the gamma log-density with respect to ``x``."""
    numerator = a - x - 1
    return numerator / x


def grad_gamma_logpdf_arg1(x, a):
    """Derivative of the gamma log-density with respect to the shape ``a``."""
    log_x = np.log(x)
    return log_x - psi(a)


# The cdf is only differentiated with respect to x (argument 0):
# exp(-x) * x**(a - 1) / gamma(a).
defvjp(
    cdf,
    lambda ans, x, a: unbroadcast_f(x, lambda g: g * np.exp(-x) * np.power(x, a - 1) / gamma(a)),
    argnums=[0],
)
# One rule per argument (x, a), using the helpers above.
defvjp(
    logpdf,
    lambda ans, x, a: unbroadcast_f(x, lambda g: g * grad_gamma_logpdf_arg0(x, a)),
    lambda ans, x, a: unbroadcast_f(a, lambda g: g * grad_gamma_logpdf_arg1(x, a)),
)
# Same as the logpdf rules, multiplied by the pdf value (ans).
defvjp(
    pdf,
    lambda ans, x, a: unbroadcast_f(x, lambda g: g * ans * grad_gamma_logpdf_arg0(x, a)),
    lambda ans, x, a: unbroadcast_f(a, lambda g: g * ans * grad_gamma_logpdf_arg1(x, a)),
)


================================================
FILE: autograd/scipy/stats/multivariate_normal.py
================================================
import scipy.stats

import autograd.numpy as np
from autograd.extend import defvjp, primitive
from autograd.numpy.numpy_vjps import unbroadcast_f

# scipy.stats.multivariate_normal functions wrapped as autograd primitives;
# their gradient rules are registered at the bottom of this module.
pdf = primitive(scipy.stats.multivariate_normal.pdf)
logpdf = primitive(scipy.stats.multivariate_normal.logpdf)
entropy = primitive(scipy.stats.multivariate_normal.entropy)

# With thanks to Eric Bresch.
# Some formulas are from
# "An extended collection of matrix derivative results
#  for forward and reverse mode algorithmic differentiation"
# by Mike Giles
# https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf


def generalized_outer_product(x):
    """Outer product of ``x`` with itself.

    A 1-D input uses ``np.outer``; anything else is treated as a (stack of)
    matrices and multiplied by its own transpose over the last two axes.
    """
    if np.ndim(x) != 1:
        return np.matmul(x, np.swapaxes(x, -1, -2))
    return np.outer(x, x)


def covgrad(x, mean, cov, allow_singular=False):
    """Derivative of the multivariate normal log-density with respect to ``cov``.

    Raises ``NotImplementedError`` when ``allow_singular`` is set.
    """
    if allow_singular:
        raise NotImplementedError(
            "The multivariate normal pdf is not differentiable w.r.t. a singular covariance matix"
        )
    precision = np.linalg.inv(cov)
    # Column vector(s) precision @ (x - mean).
    residual = np.expand_dims(x - mean, -1)
    solved = np.matmul(precision, residual)
    return 0.5 * (generalized_outer_product(solved) - precision)


def solve(allow_singular):
    """Return the linear solver to use: pseudo-inverse based when ``allow_singular``, else ``np.linalg.solve``."""
    if not allow_singular:
        return np.linalg.solve

    def pinv_solve(A, x):
        return np.dot(np.linalg.pinv(A), x)

    return pinv_solve


# Rules for (x, mean, cov).  The x and mean rules are the same linear solve
# against ``cov`` with opposite signs; the cov rule uses covgrad, which
# raises NotImplementedError when allow_singular is set.
defvjp(
    logpdf,
    lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(
        x, lambda g: -np.expand_dims(np.atleast_1d(g), 1) * solve(allow_singular)(cov, (x - mean).T).T
    ),
    lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(
        mean, lambda g: np.expand_dims(np.atleast_1d(g), 1) * solve(allow_singular)(cov, (x - mean).T).T
    ),
    lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(
        cov, lambda g: np.reshape(g, np.shape(g) + (1, 1)) * covgrad(x, mean, cov, allow_singular)
    ),
)

# Same as log pdf, but multiplied by the pdf (ans).
defvjp(
    pdf,
    lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(
        x, lambda g: -np.expand_dims(np.atleast_1d(ans * g), 1) * solve(allow_singular)(cov, (x - mean).T).T
    ),
    lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(
        mean,
        lambda g: np.expand_dims(np.atleast_1d(ans * g), 1) * solve(allow_singular)(cov, (x - mean).T).T,
    ),
    lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(
        cov, lambda g: np.reshape(ans * g, np.shape(g) + (1, 1)) * covgrad(x, mean, cov, allow_singular)
    ),
)

# The entropy has no rule for ``mean`` (None); with respect to ``cov`` the
# rule is 0.5 * inv(cov).T.
defvjp(entropy, None, lambda ans, mean, cov: unbroadcast_f(cov, lambda g: 0.5 * g * np.linalg.inv(cov).T))


================================================
FILE: autograd/scipy/stats/norm.py
================================================
"""Gradients of the normal distribution."""

import scipy.stats

import autograd.numpy as anp
from autograd.extend import defvjp, primitive
from autograd.numpy.numpy_vjps import unbroadcast_f

# scipy.stats.norm functions wrapped as autograd primitives.
pdf = primitive(scipy.stats.norm.pdf)
cdf = primitive(scipy.stats.norm.cdf)
sf = primitive(scipy.stats.norm.sf)
logpdf = primitive(scipy.stats.norm.logpdf)
logcdf = primitive(scipy.stats.norm.logcdf)
logsf = primitive(scipy.stats.norm.logsf)

# Every registration below lists three rules, in the order (x, loc, scale).
# The x and loc rules always differ only in sign.

# pdf: the logpdf rules multiplied by the pdf value (ans).
defvjp(
    pdf,
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(x, lambda g: -g * ans * (x - loc) / scale**2),
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(loc, lambda g: g * ans * (x - loc) / scale**2),
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(
        scale, lambda g: g * ans * (((x - loc) / scale) ** 2 - 1.0) / scale
    ),
)

# cdf: the derivative in x is the pdf.
defvjp(
    cdf,
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(x, lambda g: g * pdf(x, loc, scale)),
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(loc, lambda g: -g * pdf(x, loc, scale)),
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(
        scale, lambda g: -g * pdf(x, loc, scale) * (x - loc) / scale
    ),
)

# logpdf: -(x - loc) / scale**2 in x; -1 / scale + (x - loc)**2 / scale**3 in scale.
defvjp(
    logpdf,
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(x, lambda g: -g * (x - loc) / scale**2),
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(loc, lambda g: g * (x - loc) / scale**2),
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(
        scale, lambda g: g * (-1.0 / scale + (x - loc) ** 2 / scale**3)
    ),
)

# logcdf: the ratio pdf / cdf is formed in log space as exp(logpdf - logcdf).
defvjp(
    logcdf,
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(
        x, lambda g: g * anp.exp(logpdf(x, loc, scale) - logcdf(x, loc, scale))
    ),
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(
        loc, lambda g: -g * anp.exp(logpdf(x, loc, scale) - logcdf(x, loc, scale))
    ),
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(
        scale, lambda g: -g * anp.exp(logpdf(x, loc, scale) - logcdf(x, loc, scale)) * (x - loc) / scale
    ),
)

# logsf: same shape as logcdf with logsf in the exponent and opposite signs.
defvjp(
    logsf,
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(
        x, lambda g: -g * anp.exp(logpdf(x, loc, scale) - logsf(x, loc, scale))
    ),
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(
        loc, lambda g: g * anp.exp(logpdf(x, loc, scale) - logsf(x, loc, scale))
    ),
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(
        scale, lambda g: g * anp.exp(logpdf(x, loc, scale) - logsf(x, loc, scale)) * (x - loc) / scale
    ),
)

# sf: the cdf rules with opposite signs.
defvjp(
    sf,
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(x, lambda g: -g * pdf(x, loc, scale)),
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(loc, lambda g: g * pdf(x, loc, scale)),
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(
        scale, lambda g: g * pdf(x, loc, scale) * (x - loc) / scale
    ),
)


================================================
FILE: autograd/scipy/stats/poisson.py
================================================
import scipy.stats

import autograd.numpy as np
from autograd.extend import defvjp, primitive
from autograd.numpy.numpy_vjps import unbroadcast_f

# scipy.stats.poisson functions wrapped as autograd primitives; their gradient
# rules are registered at the bottom of this module.
cdf = primitive(scipy.stats.poisson.cdf)
logpmf = primitive(scipy.stats.poisson.logpmf)
pmf = primitive(scipy.stats.poisson.pmf)


def grad_poisson_logpmf(k, mu):
    """Derivative of the Poisson log-pmf with respect to ``mu``.

    Evaluates to ``k / mu - 1`` where ``k`` is a whole number and to 0
    elsewhere.
    """
    k_is_whole = k % 1 == 0
    slope = k / mu - 1
    return np.where(k_is_whole, slope, 0)


# All three functions are differentiated with respect to mu (argument 1) only.
# The cdf rule is -pmf(floor(k), mu); the pmf rule is the logpmf rule
# multiplied by the pmf value (ans).
defvjp(cdf, lambda ans, k, mu: unbroadcast_f(mu, lambda g: g * -pmf(np.floor(k), mu)), argnums=[1])
defvjp(logpmf, lambda ans, k, mu: unbroadcast_f(mu, lambda g: g * grad_poisson_logpmf(k, mu)), argnums=[1])
defvjp(
    pmf, lambda ans, k, mu: unbroadcast_f(mu, lambda g: g * ans * grad_poisson_logpmf(k, mu)), argnums=[1]
)


================================================
FILE: autograd/scipy/stats/t.py
================================================
"""Gradients of the univariate t distribution."""

import scipy.stats

import autograd.numpy as np
from autograd.extend import defvjp, primitive
from autograd.numpy.numpy_vjps import unbroadcast_f
from autograd.scipy.special import psi

# scipy.stats.t functions wrapped as autograd primitives; their gradient
# rules are registered at the bottom of this module.
pdf = primitive(scipy.stats.t.pdf)
cdf = primitive(scipy.stats.t.cdf)
logpdf = primitive(scipy.stats.t.logpdf)
logcdf = primitive(scipy.stats.t.logcdf)


def grad_tlogpdf_diff(diff, df):
    """Derivative of the t log-density with respect to the standardized residual ``diff``."""
    numerator = -diff * (1.0 + df)
    return numerator / (diff**2 + df)


def grad_tlogpdf_x(x, df, loc, scale):
    """Derivative of the t log-density with respect to ``x``."""
    standardized = (x - loc) / scale
    return grad_tlogpdf_diff(standardized, df) / scale


def grad_tlogpdf_loc(x, df, loc, scale):
    """Derivative of the t log-density with respect to ``loc`` (minus the ``x`` derivative)."""
    standardized = (x - loc) / scale
    return -grad_tlogpdf_diff(standardized, df) / scale


def grad_tlogpdf_scale(x, df, loc, scale):
    """Derivative of the t log-density with respect to ``scale``."""
    diff = x - loc
    numerator = df * (scale**2 - diff**2)
    denominator = scale * (df * scale**2 + diff**2)
    return -numerator / denominator


def grad_tlogpdf_df(x, df, loc, scale):
    """Derivative of the t log-density with respect to the degrees of freedom ``df``."""
    y = (x - loc) / scale
    quadratic_term = (y**2 * (df + 1)) / (df * (y**2 + df))
    log_term = np.log(y**2 / df + 1)
    total = quadratic_term - log_term - 1.0 / df - psi(df / 2.0) + psi((df + 1) / 2.0)
    return 0.5 * total


# pdf: one rule per argument (x, df, loc, scale); each is the matching logpdf
# gradient multiplied by the pdf value (ans).
defvjp(
    pdf,
    lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast_f(
        x, lambda g: g * ans * grad_tlogpdf_x(x, df, loc, scale)
    ),
    lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast_f(
        df, lambda g: g * ans * grad_tlogpdf_df(x, df, loc, scale)
    ),
    lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast_f(
        loc, lambda g: g * ans * grad_tlogpdf_loc(x, df, loc, scale)
    ),
    lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast_f(
        scale, lambda g: g * ans * grad_tlogpdf_scale(x, df, loc, scale)
    ),
)

# cdf: only x (argument 0) and loc (argument 2) get rules; the derivative is
# the pdf, with opposite sign for loc.
defvjp(
    cdf,
    lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast_f(x, lambda g: g * pdf(x, df, loc, scale)),
    lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast_f(loc, lambda g: -g * pdf(x, df, loc, scale)),
    argnums=(0, 2),
)

# logpdf: one rule per argument (x, df, loc, scale), using the helpers above.
defvjp(
    logpdf,
    lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast_f(x, lambda g: g * grad_tlogpdf_x(x, df, loc, scale)),
    lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast_f(
        df, lambda g: g * grad_tlogpdf_df(x, df, loc, scale)
    ),
    lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast_f(
        loc, lambda g: g * grad_tlogpdf_loc(x, df, loc, scale)
    ),
    lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast_f(
        scale, lambda g: g * grad_tlogpdf_scale(x, df, loc, scale)
    ),
)

# logcdf: only x (argument 0) and loc (argument 2) get rules; the ratio
# pdf / cdf is formed in log space as exp(logpdf - logcdf).
defvjp(
    logcdf,
    lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast_f(
        x, lambda g: g * np.exp(logpdf(x, df, loc, scale) - logcdf(x, df, loc, scale))
    ),
    lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast_f(
        loc, lambda g: -g * np.exp(logpdf(x, df, loc, scale) - logcdf(x, df, loc, scale))
    ),
    argnums=(0, 2),
)


================================================
FILE: autograd/test_util.py
================================================
from itertools import product

from .core import make_jvp, make_vjp, vspace
from .wrap_util import get_name, unary_to_nary

# Absolute and relative tolerances used by scalar_close.
TOL = 1e-6
RTOL = 1e-6


def scalar_close(a, b):
    """Return whether scalars ``a`` and ``b`` are close.

    The values are considered close when their absolute difference is below
    ``TOL``, or when that difference is below ``RTOL`` times ``abs(a + b)``.
    """
    diff = abs(a - b)
    # Multiply rather than divide by abs(a + b): dividing raised
    # ZeroDivisionError for opposite values such as (1.0, -1.0), which are
    # simply "not close".
    return diff < TOL or diff < RTOL * abs(a + b)


EPS = 1e-6


def make_numerical_jvp(f, x):
    """Build a central-difference approximation of the JVP of ``f`` at ``x``.

    Returns a function ``jvp(v)`` computing
    ``(f(x + v * EPS / 2) - f(x - v * EPS / 2)) / EPS`` using the vector-space
    operations of the input and output of ``f``.
    """
    y = f(x)
    x_vs, y_vs = vspace(x), vspace(y)

    def jvp(v):
        # Evaluate f half a step forward, then half a step backward, along v.
        x_forward = x_vs.add(x, x_vs.scalar_mul(v, EPS / 2))
        f_forward = f(x_forward)
        x_backward = x_vs.add(x, x_vs.scalar_mul(v, -EPS / 2))
        f_backward = f(x_backward)
        difference = y_vs.add(f_forward, y_vs.scalar_mul(f_backward, -1.0))
        return y_vs.scalar_mul(difference, 1.0 / EPS)

    return jvp


def check_vjp(f, x):
    """Assert that the reverse-mode VJP of ``f`` at ``x`` matches finite differences.

    Random vectors are drawn from the input and output vector spaces, and the
    inner product of the analytic VJP with the input vector is compared (via
    ``scalar_close``) to the inner product of the output vector with the
    numerical JVP.
    """
    vjp, y = make_vjp(f, x)
    jvp = make_numerical_jvp(f, x)
    x_vs = vspace(x)
    y_vs = vspace(y)
    x_v = x_vs.randn()
    y_v = y_vs.randn()

    cotangent = y_vs.covector(y_v)
    vjp_y = x_vs.covector(vjp(cotangent))
    assert vspace(vjp_y) == x_vs
    vjv_exact = x_vs.inner_prod(x_v, vjp_y)
    vjv_numeric = y_vs.inner_prod(y_v, jvp(x_v))
    is_close = scalar_close(vjv_numeric, vjv_exact)
    assert is_close, (
        "Derivative (VJP) check of {} failed with arg {}:\nanalytic: {}\nnumeric:  {}".format(
            get_name(f), x, vjv_exact, vjv_numeric
        )
    )


def check_jvp(f, x):
    """Assert that the forward-mode JVP of ``f`` at ``x`` matches finite differences."""
    jvp = make_jvp(f, x)
    jvp_numeric = make_numerical_jvp(f, x)
    direction = vspace(x).randn()
    analytic = jvp(direction)[1]
    numeric = jvp_numeric(direction)
    check_equivalent(analytic, numeric)


def check_equivalent(x, y):
    """Assert that ``x`` and ``y`` share a vector space and agree along a random direction."""
    x_vs = vspace(x)
    y_vs = vspace(y)
    assert x_vs == y_vs, f"VSpace mismatch:\nx: {x_vs}\ny: {y_vs}"
    probe = x_vs.randn()
    x_proj = x_vs.inner_prod(x, probe)
    y_proj = x_vs.inner_prod(y, probe)
    assert scalar_close(x_proj, y_proj), f"Value mismatch:\nx: {x}\ny: {y}"


@unary_to_nary
def check_grads(f, x, modes=["fwd", "rev"], order=2):
    """Check derivatives of ``f`` at ``x`` against numerical differences.

    ``modes`` selects forward-mode ("fwd", via ``check_jvp``) and/or
    reverse-mode ("rev", via ``check_vjp``) checks. When ``order`` is greater
    than one, the derivative function itself is checked recursively with
    ``order - 1``, jointly with respect to ``x`` and the vector it is applied to.
    """
    assert all(m in ["fwd", "rev"] for m in modes)
    if "fwd" in modes:
        check_jvp(f, x)
        if order > 1:
            # Higher order: treat the JVP as a function of (x, v) and check it.
            grad_f = lambda x, v: make_jvp(f, x)(v)[1]
            grad_f.__name__ = f"jvp_{get_name(f)}"
            v = vspace(x).randn()
            check_grads(grad_f, (0, 1), modes, order=order - 1)(x, v)
    if "rev" in modes:
        check_vjp(f, x)
        if order > 1:
            # Higher order: treat the VJP as a function of (x, v) and check it.
            grad_f = lambda x, v: make_vjp(f, x)[0](v)
            grad_f.__name__ = f"vjp_{get_name(f)}"
            v = vspace(f(x)).randn()
            check_grads(grad_f, (0, 1), modes, order=order - 1)(x, v)


def combo_check(fun, *args, **kwargs):
    """Return a checker that runs ``check_grads`` on every argument combination.

    ``args``/``kwargs`` given here are forwarded to ``check_grads``. The
    returned function takes, for each positional and keyword argument of
    ``fun``, an iterable of candidate values and checks ``fun`` on their
    Cartesian product.
    """

    def _check_grads(f):
        return check_grads(f, *args, **kwargs)

    def _combo_check(*arg_choices, **kwarg_choices):
        keyed_choices = [[(key, val) for val in vals] for key, vals in kwarg_choices.items()]
        for chosen_args in product(*arg_choices):
            for chosen_kwargs in product(*keyed_choices):
                _check_grads(fun)(*chosen_args, **dict(chosen_kwargs))

    return _combo_check


================================================
FILE: autograd/tracer.py
================================================
import warnings
from collections import defaultdict
from contextlib import contextmanager

from .util import subvals, toposort
from .wrap_util import wraps


def trace(start_node, fun, x):
    """Run ``fun`` on a boxed ``x`` inside a fresh trace.

    Returns ``(value, node)`` of the output box when the output is a box
    belonging to this trace. Otherwise warns and returns ``(output, None)``.
    """
    with trace_stack.new_trace() as t:
        start_box = new_box(x, t, start_node)
        end_box = fun(start_box)
        if not (isbox(end_box) and end_box._trace == start_box._trace):
            warnings.warn("Output seems independent of input.")
            return end_box, None
        return end_box._value, end_box._node


class Node:
    """Base class for trace nodes; subclasses override the asserting stubs."""

    __slots__ = []

    def __init__(self, value, fun, args, kwargs, parent_argnums, parents):
        # Stub: must be overridden by subclasses.
        assert False

    def initialize_root(self, *args, **kwargs):
        # Stub: must be overridden by subclasses.
        assert False

    @classmethod
    def new_root(cls, *args, **kwargs):
        """Create an instance without calling ``__init__`` and initialise it as a root."""
        instance = cls.__new__(cls)
        instance.initialize_root(*args, **kwargs)
        return instance


def primitive(f_raw):
    """
    Wraps a function so that its gradient can be specified and its invocation
    can be recorded. For examples, see the docs."""

    @wraps(f_raw)
    def f_wrapped(*args, **kwargs):
        # Only the boxes belonging to the highest-numbered trace are handled here.
        boxed_args, trace, node_constructor = find_top_boxed_args(args)
        if boxed_args:
            # Replace those boxes by their underlying values.
            argvals = subvals(args, [(argnum, box._value) for argnum, box in boxed_args])
            if f_wrapped in notrace_primitives[node_constructor]:
                # Registered as not traced for this node type: evaluate without recording.
                return f_wrapped(*argvals, **kwargs)
            parents = tuple(box._node for _, box in boxed_args)
            argnums = tuple(argnum for argnum, _ in boxed_args)
            # Re-enter the wrapper so any remaining (lower-trace) boxes are handled too.
            ans = f_wrapped(*argvals, **kwargs)
            node = node_constructor(ans, f_wrapped, argvals, kwargs, argnums, parents)
            return new_box(ans, trace, node)
        else:
            # No boxed arguments: call the raw function directly.
            return f_raw(*args, **kwargs)

    f_wrapped.fun = f_raw
    f_wrapped._is_autograd_primitive = True
    return f_wrapped


# Maps a node type to the set of primitives that are not recorded for it.
notrace_primitives = defaultdict(set)


def register_notrace(trace_type, primitive_fun):
    """Mark ``primitive_fun`` as not to be recorded for nodes of ``trace_type``."""
    notrace_primitives[trace_type].add(primitive_fun)


def notrace_primitive(f_raw):
    """Wrap ``f_raw`` so it is always called on unboxed positional arguments."""

    @wraps(f_raw)
    def f_wrapped(*args, **kwargs):
        unboxed = [getval(arg) for arg in args]
        return f_raw(*unboxed, **kwargs)

    f_wrapped._is_primitive = True
    return f_wrapped


def find_top_boxed_args(args):
    """Find the boxed arguments that belong to the highest-numbered trace.

    Returns ``(boxes, trace, node_type)`` where ``boxes`` is a list of
    ``(argnum, box)`` pairs. With no boxed arguments the result is
    ``([], -1, None)``.
    """
    best_trace = -1
    best_boxes = []
    best_node_type = None
    for position, candidate in enumerate(args):
        if not isbox(candidate):
            continue
        candidate_trace = candidate._trace
        if candidate_trace > best_trace:
            # A newer trace supersedes everything collected so far.
            best_boxes = [(position, candidate)]
            best_trace = candidate_trace
            best_node_type = type(candidate._node)
        elif candidate_trace == best_trace:
            best_boxes.append((position, candidate))
    return best_boxes, best_trace, best_node_type


class TraceStack:
    """Counter that hands out identifiers for nested traces.

    ``top`` is the identifier of the innermost active trace, or -1 when no
    trace is active.
    """

    def __init__(self):
        self.top = -1

    @contextmanager
    def new_trace(self):
        """Context manager yielding a fresh trace identifier.

        The counter is restored on exit even when the body raises, so an
        exception inside a traced function does not leave ``top`` incremented.
        """
        self.top += 1
        try:
            yield self.top
        finally:
            self.top -= 1


# Module-wide trace counter shared by all traces.
trace_stack = TraceStack()


class Box:
    """Wrapper pairing a value with the trace and node it belongs to."""

    # Registry shared by all subclasses: value type (or box class) -> box class.
    type_mappings = {}
    # Set of all registered box classes.
    types = set()

    __slots__ = ["_value", "_trace", "_node"]

    def __init__(self, value, trace, node):
        self._value, self._node, self._trace = value, node, trace

    def __bool__(self):
        # Truthiness is delegated to the wrapped value.
        return bool(self._value)

    __nonzero__ = __bool__

    def __str__(self):
        return f"Autograd {type(self).__name__} with value {str(self._value)}"

    @classmethod
    def register(cls, value_type):
        """Register ``cls`` as the box class for ``value_type`` (and for itself)."""
        Box.types.add(cls)
        Box.type_mappings.update({value_type: cls, cls: cls})


# Module-level alias of the Box registry.
box_type_mappings = Box.type_mappings


def new_box(value, trace, node):
    """Wrap ``value`` in the box class registered for its type.

    Raises ``TypeError`` when a ``KeyError`` occurs, i.e. when no box class is
    registered for ``type(value)``.
    """
    value_type = type(value)
    try:
        return box_type_mappings[value_type](value, trace, node)
    except KeyError:
        raise TypeError(f"Can't differentiate w.r.t. type {type(value)}")


# Module-level alias of the set of registered box classes.
box_types = Box.types
# True when x's exact type is a registered box class (subclasses must be registered too).
isbox = lambda x: type(x) in box_types  # almost 3X faster than isinstance(x, Box)
# Recursively strip nested boxes and return the innermost plain value.
getval = lambda x: getval(x._value) if isbox(x) else x


================================================
FILE: autograd/util.py
================================================
import operator


def subvals(x, ivs):
    """Return a tuple copy of ``x`` with each ``(index, value)`` pair in ``ivs`` substituted."""
    replaced = list(x)
    for index, value in ivs:
        replaced[index] = value
    return tuple(replaced)


def subval(x, i, v):
    """Return a tuple copy of ``x`` with the element at index ``i`` replaced by ``v``."""
    items = list(x)
    items[i] = v
    return tuple(items)


def func(f):
    """Identity: return ``f`` unchanged."""
    return f


def toposort(end_node, parents=operator.attrgetter("parents")):
    """Yield the nodes reachable from ``end_node``, each before all of its parents.

    ``parents`` maps a node to an iterable of its parent nodes. The first pass
    counts, for every reachable node, how many times it is reached as a
    parent. The second pass yields a node only once all of those visits have
    been accounted for.
    """
    remaining_children = {}
    pending = [end_node]
    while pending:
        current = pending.pop()
        if current not in remaining_children:
            remaining_children[current] = 1
            pending.extend(parents(current))
        else:
            remaining_children[current] += 1

    ready = [end_node]
    while ready:
        current = ready.pop()
        yield current
        for parent in parents(current):
            if remaining_children[parent] != 1:
                remaining_children[parent] -= 1
            else:
                ready.append(parent)


# -------------------- deprecation warnings -----------------------

import warnings

# Warning text emitted by quick_grad_check.
deprecation_msg = """
The quick_grad_check function is deprecated. See the update guide:
https://github.com/HIPS/autograd/blob/master/docs/updateguide.md"""


def quick_grad_check(
    fun, arg0, extra_args=(), kwargs={}, verbose=True, eps=1e-4, rtol=1e-4, atol=1e-6, rs=None
):
    """Deprecated: check the first-order reverse-mode gradient of ``fun`` w.r.t. ``arg0``.

    Emits the deprecation warning and delegates to
    ``autograd.test_util.check_grads`` with ``modes=["rev"]`` and ``order=1``.
    ``extra_args`` and ``kwargs`` are forwarded to ``fun``. The ``verbose``,
    ``eps``, ``rtol``, ``atol`` and ``rs`` parameters are accepted but not used
    by this body.
    """
    warnings.warn(deprecation_msg)
    # Imported here rather than at module level.
    from autograd.test_util import check_grads

    fun_ = lambda arg0: fun(arg0, *extra_args, **kwargs)
    check_grads(fun_, modes=["rev"], order=1)(arg0)


================================================
FILE: autograd/wrap_util.py
================================================
from .util import subvals


def unary_to_nary(unary_operator):
    """Lift an operator on one-argument functions to n-ary functions.

    ``unary_operator(f, x, ...)`` acts on a function of a single argument. The
    returned ``nary_operator(fun, argnum=0, ...)`` produces a function taking
    the same arguments as ``fun``. When called, it freezes every argument
    except the one(s) selected by ``argnum`` and applies ``unary_operator`` to
    the resulting one-argument function. ``argnum`` may be an int, or a
    tuple/list of ints, in which case the selected arguments are passed
    together as a tuple.
    """

    @wraps(unary_operator)
    def nary_operator(fun, argnum=0, *nary_op_args, **nary_op_kwargs):
        assert type(argnum) in (int, tuple, list), argnum

        @wrap_nary_f(fun, unary_operator, argnum)
        def nary_f(*args, **kwargs):
            @wraps(fun)
            def unary_f(x):
                # Substitute x back into the selected position(s) of args.
                if isinstance(argnum, int):
                    subargs = subvals(args, [(argnum, x)])
                else:
                    subargs = subvals(args, zip(argnum, x))
                return fun(*subargs, **kwargs)

            # Extract the argument(s) the operator acts on.
            if isinstance(argnum, int):
                x = args[argnum]
            else:
                x = tuple(args[i] for i in argnum)
            return unary_operator(unary_f, x, *nary_op_args, **nary_op_kwargs)

        return nary_f

    return nary_operator


def wraps(fun, namestr="{fun}", docstr="{doc}", **kwargs):
    """Return a decorator that copies a formatted name and docstring onto a function.

    ``namestr`` and ``docstr`` are ``str.format`` templates. They receive
    ``fun`` (the name of ``fun``), ``doc`` (its docstring, only passed to
    ``docstr``) and any extra ``kwargs``. Setting the attributes is best
    effort: if formatting or assignment fails the function is returned
    unchanged.
    """

    def _wraps(f):
        try:
            f.__name__ = namestr.format(fun=get_name(fun), **kwargs)
            # get_doc returns None for functions without a docstring; fall back
            # to "" so the wrapped function does not get the literal doc "None".
            f.__doc__ = docstr.format(fun=get_name(fun), doc=get_doc(fun) or "", **kwargs)
        except Exception:
            # Deliberately best effort, but do not swallow KeyboardInterrupt
            # or SystemExit as the former ``except BaseException`` did.
            pass
        return f

    return _wraps


def wrap_nary_f(fun, op, argnum):
    """Return a ``wraps`` decorator naming the result "<op> of <fun> w.r.t. <argnum>"."""
    namestr = "{op}_of_{fun}_wrt_argnum_{argnum}"
    docstr = """\
    {op} of function {fun} with respect to argument number {argnum}. Takes the
    same arguments as {fun} but returns the {op}.
    """
    return wraps(fun, namestr, docstr, op=get_name(op), argnum=argnum)


# Name of f, or a placeholder when f has no __name__ attribute.
get_name = lambda f: getattr(f, "__name__", "[unknown name]")
# Docstring of f; "" when f has no __doc__ attribute (may be None when unset).
get_doc = lambda f: getattr(f, "__doc__", "")


================================================
FILE: benchmarks/__init__.py
================================================


================================================
FILE: benchmarks/asv.conf.json.sample
================================================
{
    "version": 1,
    "project": "autograd",
    "project_url": "http://github.com/hips/autograd",
    "branches": ["master"],
    "dvcs": "git",
    "environment_type": "virtualenv",
    "install_timeout": 600,
    "repo"          : "..",
    "benchmark_dir" : ".",
    "env_dir"       : "../.asv/env",
    "results_dir"   : "../.asv/results",
    "html_dir"      : "../.asv/html"
}


================================================
FILE: benchmarks/bench_core.py
================================================
import numpy as onp

import autograd.numpy as np
from autograd import grad

try:
    from autograd.core import VJPNode, backward_pass, vspace
    from autograd.tracer import new_box, trace

    MASTER_BRANCH = False
except ImportError:
    from autograd.core import backward_pass, forward_pass, new_progenitor, vspace

    MASTER_BRANCH = True


## SHORT FUNCTION
def f_short(x):
    """Benchmark target: a single squaring operation."""
    return x**2


def time_short_fun():
    """Time a plain call of ``f_short``."""
    f_short(2.0)


def time_short_forward_pass():
    """Time tracing ``f_short``, using whichever API was importable."""
    if MASTER_BRANCH:
        forward_pass(f_short, (2.0,), {})
    else:
        start_node = VJPNode.new_root()
        trace(start_node, f_short, x)


def time_short_backward_pass():
    """Time the backward pass over the pre-built ``f_short`` graph."""
    if MASTER_BRANCH:
        backward_pass(1.0, short_end_node, short_start_node)
    else:
        backward_pass(1.0, short_end_node)


def time_short_grad():
    """Time building and evaluating ``grad(f_short)``."""
    grad(f_short)(2.0)


## LONG FUNCTION
def f_long(x):
    """Benchmark target: apply ``np.sin`` fifty times in sequence."""
    for _ in range(50):
        x = np.sin(x)
    return x


def time_long_fun():
    """Time a plain call of ``f_long``."""
    f_long(2.0)


def time_long_forward_pass():
    """Time tracing ``f_long``, using whichever API was importable."""
    if MASTER_BRANCH:
        forward_pass(f_long, (2.0,), {})
    else:
        start_node = VJPNode.new_root()
        trace(start_node, f_long, x)


def time_long_backward_pass():
    """Time the backward pass over the pre-built ``f_long`` graph."""
    if MASTER_BRANCH:
        backward_pass(1.0, long_end_node, long_start_node)
    else:
        backward_pass(1.0, long_end_node)


def time_long_grad():
    """Time building and evaluating ``grad(f_long)``."""
    grad(f_long)(2.0)


## 'PEARLMUTTER TEST' FUNCTION
def fan_out_fan_in(x):
    """Benchmark target: 10**4 rounds of ``(x + x) / 2.0`` followed by ``np.sum``.

    Each round uses ``x`` twice (fan-out) and merges the two uses again (fan-in).
    """
    for _ in range(10**4):
        doubled = x + x
        x = doubled / 2.0
    return np.sum(x)


def time_fan_out_fan_in_fun():
    """Time a plain call of ``fan_out_fan_in``."""
    fan_out_fan_in(2.0)


def time_fan_out_fan_in_forward_pass():
    """Time tracing ``fan_out_fan_in``, using whichever API was importable."""
    if MASTER_BRANCH:
        forward_pass(fan_out_fan_in, (2.0,), {})
    else:
        start_node = VJPNode.new_root()
        trace(start_node, fan_out_fan_in, x)


def time_fan_out_fan_in_backward_pass():
    """Time the backward pass over the pre-built ``fan_out_fan_in`` graph."""
    if MASTER_BRANCH:
        backward_pass(1.0, fan_end_node, fan_start_node)
    else:
        backward_pass(1.0, fan_end_node)


def time_fan_out_fan_in_grad():
    """Time building and evaluating ``grad(fan_out_fan_in)``."""
    grad(fan_out_fan_in)(2.0)


## UNIT BENCHMARKS
def time_vspace_float():
    """Time ``vspace`` on a Python float."""
    vspace(1.0)


# Small array fixture shared by the array micro-benchmarks below.
A = np.array([[1.0, 2.0, 3.0]])


def time_vspace_array():
    """Time ``vspace`` on the array fixture ``A``."""
    vspace(A)


def time_new_box_float():
    """Time boxing a float; relies on ``new_box``/``start_node`` from the non-MASTER_BRANCH setup."""
    new_box(1.0, 0, start_node)


def time_new_box_array():
    """Time boxing the array fixture ``A``; relies on the non-MASTER_BRANCH setup."""
    new_box(A, 0, start_node)


def time_exp_call():
    """Time ``exp`` from plain NumPy as a baseline."""
    onp.exp(2.0)


def time_exp_primitive_call_unboxed():
    """Time the autograd-wrapped ``exp`` on an unboxed float."""
    np.exp(2.0)


def time_exp_primitive_call_boxed():
    """Time the autograd-wrapped ``exp`` on a boxed value."""
    if MASTER_BRANCH:
        np.exp(progenitor)
    else:
        np.exp(start_box)


def time_no_autograd_control():
    """Control benchmark: a 200x200 matrix product on unboxed random data."""
    # Test whether the benchmarking machine is running slowly independent of autograd
    A = np.random.randn(200, 200)
    np.dot(A, A)


# Module-level fixtures used by the benchmarks above. They are built once at
# import time so the backward-pass and boxed-call benchmarks do not include
# graph construction in their timings.
if MASTER_BRANCH:
    short_start_node, short_end_node = forward_pass(f_short, (2.0,), {})
    long_start_node, long_end_node = forward_pass(f_long, (2.0,), {})
    fan_start_node, fan_end_node = forward_pass(fan_out_fan_in, (2.0,), {})
    progenitor = new_progenitor(2.0)
else:
    x = 2.0
    start_node = VJPNode.new_root()
    start_box = new_box(x, 0, start_node)
    # Each traced graph gets its own fresh root node.
    _, short_end_node = trace(VJPNode.new_root(), f_short, x)
    _, long_end_node = trace(VJPNode.new_root(), f_long, x)
    _, fan_end_node = trace(VJPNode.new_root(), fan_out_fan_in, x)


================================================
FILE: benchmarks/bench_mem.py
================================================
import autograd.numpy as np
from autograd import grad


def peakmem_needless_nodes():
    """Peak-memory benchmark: gradient of M successive ``x + 1`` steps on an N x N array."""
    N, M = 1000, 100

    def fun(x):
        # Every iteration adds a constant to the whole array.
        for i in range(M):
            x = x + 1
        return np.sum(x)

    grad(fun)(np.zeros((N, N)))


================================================
FILE: benchmarks/bench_numpy_vjps.py
================================================
import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import make_vjp

# First order: VJP of np.dot w.r.t. argument 0 / 1, applied to cotangent g.
dot_0 = lambda a, b, g: make_vjp(np.dot, argnum=0)(a, b)[0](g)
dot_1 = lambda a, b, g: make_vjp(np.dot, argnum=1)(a, b)[0](g)

# Second order: VJP of dot_0 w.r.t. each of its arguments (a, b, g), applied to a.
dot_0_0 = lambda a, b, g: make_vjp(dot_0, argnum=0)(a, b, g)[0](a)
dot_0_1 = lambda a, b, g: make_vjp(dot_0, argnum=1)(a, b, g)[0](a)
dot_0_2 = lambda a, b, g: make_vjp(dot_0, argnum=2)(a, b, g)[0](a)

# Second order: VJP of dot_1 w.r.t. each of its arguments (a, b, g), applied to b.
dot_1_0 = lambda a, b, g: make_vjp(dot_1, argnum=0)(a, b, g)[0](b)
dot_1_1 = lambda a, b, g: make_vjp(dot_1, argnum=1)(a, b, g)[0](b)
dot_1_2 = lambda a, b, g: make_vjp(dot_1, argnum=2)(a, b, g)[0](b)

# Random operands and cotangent shared by the dot benchmarks.
a = npr.randn(2, 3, 4, 5)
b = npr.randn(2, 3, 5, 4)
g = npr.randn(2, 3, 4, 2, 3, 4)


# Timing wrappers: each one evaluates the same-named dot_* helper on (a, b, g).
def time_dot_0():
    dot_0(a, b, g)


def time_dot_1():
    dot_1(a, b, g)


def time_dot_0_0():
    dot_0_0(a, b, g)


def time_dot_0_1():
    dot_0_1(a, b, g)


def time_dot_0_2():
    dot_0_2(a, b, g)


def time_dot_1_0():
    dot_1_0(a, b, g)


def time_dot_1_1():
    dot_1_1(a, b, g)


def time_dot_1_2():
    dot_1_2(a, b, g)


# First order: VJP of np.tensordot (with third argument 2) w.r.t. argument 0 / 1,
# applied to cotangent G.
tensordot_0 = lambda A, B, G: make_vjp(np.tensordot, argnum=0)(A, B, 2)[0](G)
tensordot_1 = lambda A, B, G: make_vjp(np.tensordot, argnum=1)(A, B, 2)[0](G)

# Second order: VJP of tensordot_0 w.r.t. each of (A, B, G), applied to A.
tensordot_0_0 = lambda A, B, G: make_vjp(tensordot_0, argnum=0)(A, B, G)[0](A)
tensordot_0_1 = lambda A, B, G: make_vjp(tensordot_0, argnum=1)(A, B, G)[0](A)
tensordot_0_2 = lambda A, B, G: make_vjp(tensordot_0, argnum=2)(A, B, G)[0](A)

# Second order: VJP of tensordot_1 w.r.t. each of (A, B, G), applied to B.
tensordot_1_0 = lambda A, B, G: make_vjp(tensordot_1, argnum=0)(A, B, G)[0](B)
tensordot_1_1 = lambda A, B, G: make_vjp(tensordot_1, argnum=1)(A, B, G)[0](B)
tensordot_1_2 = lambda A, B, G: make_vjp(tensordot_1, argnum=2)(A, B, G)[0](B)

# Random operands and cotangent shared by the tensordot benchmarks.
A = npr.randn(2, 3, 5, 4)
B = npr.randn(5, 4, 2, 3)
G = npr.randn(2, 3, 2, 3)


# Timing wrappers: each one evaluates the same-named tensordot_* helper on (A, B, G).
def time_tensordot_0():
    tensordot_0(A, B, G)


def time_tensordot_1():
    tensordot_1(A, B, G)


def time_tensordot_0_0():
    tensordot_0_0(A, B, G)


def time_tensordot_0_1():
    tensordot_0_1(A, B, G)


def time_tensordot_0_2():
    tensordot_0_2(A, B, G)


def time_tensordot_1_0():
    tensordot_1_0(A, B, G)


def time_tensordot_1_1():
    tensordot_1_1(A, B, G)


def time_tensordot_1_2():
    tensordot_1_2(A, B, G)


================================================
FILE: benchmarks/bench_rnn.py
================================================
# Write the benchmarking functions here.
# See "Writing benchmarks" in the asv docs for more information.
# http://asv.readthedocs.io/en/latest/writing_benchmarks.html
import autograd.numpy as np
from autograd import grad


class RNNSuite:
    """
    Checking speed on a vanilla RNN.
    """

    # NOTE: this is run each time we run a benchmark.
    # Might want to switch to setup_cache, which has to return an object which is loaded and unpacked in setup().
    def setup(self):
        """Create random float32 inputs/parameters and the autograd gradient function."""
        self.batch_size = 16
        self.dtype = "float32"
        self.D = 2**10
        self.x = 0.01 * np.random.randn(self.batch_size, self.D).astype(self.dtype)
        self.W1 = 0.01 * np.random.randn(self.D, self.D).astype(self.dtype)
        self.b1 = 0.01 * np.random.randn(self.D).astype(self.dtype)
        self.Wout = 0.01 * np.random.randn(self.D, 1).astype(self.dtype)
        self.bout = 0.01 * np.random.randn(1).astype(self.dtype)
        # Random 0/1 labels.
        self.l = (np.random.rand(self.batch_size, 1) > 0.5).astype(self.dtype)
        # Number of recurrent steps.
        self.n = 50

        def autograd_rnn(params, x, label, n):
            # n tanh recurrences followed by a linear read-out and a scalar loss.
            W, b, Wout, bout = params
            h1 = x
            for i in range(n):
                h1 = np.tanh(np.dot(h1, W) + b)
            logit = np.dot(h1, Wout) + bout
            loss = -np.sum(label * logit - (logit + np.log(1 + np.exp(-logit))))
            return loss

        self.fn = autograd_rnn
        self.grad_fn = grad(self.fn)

    def rnn_grad(self):
        """Evaluate the autograd gradient w.r.t. the parameter tuple."""
        self.grad_fn((self.W1, self.b1, self.Wout, self.bout), self.x, self.l, self.n)

    def time_rnn_grad(self):
        """Timing benchmark for the autograd gradient."""
        self.rnn_grad()

    def peakmem_rnn_grad(self):
        """Peak-memory benchmark for the autograd gradient."""
        self.rnn_grad()

    def time_manual_rnn_grad(self):
        """Timing benchmark for the hand-written gradient."""
        self.manual_rnn_grad()

    def peakmem_manual_rnn_grad(self):
        """Peak-memory benchmark for the hand-written gradient."""
        self.manual_rnn_grad()

    def manual_rnn_grad(self):
        """Run a hand-written forward and backward pass of the same RNN as a baseline."""

        def repeat_to_match_shape(g, A, axis=None):
            # Expand g to the shape of A (``axis`` is accepted but unused).
            gout = np.empty_like(A)
            if np.ndim(gout) == 0:
                gout = g
            else:
                gout = np.ones_like(A) * g
            return gout

        def sum_to_match_shape(sum_this, to_match_this):
            # Sum away extra leading axes, then axes where the target has size 1.
            sum_this = np.sum(sum_this, axis=tuple(range(0, np.ndim(sum_this) - np.ndim(to_match_this))))
            for axis, size in enumerate(np.shape(to_match_this)):
                if size == 1:
                    sum_this = np.sum(sum_this, axis=axis, keepdims=True)
            return sum_this

        def grad_dot_A(g, A, B):
            # Gradient of np.dot(A, B) with respect to A.
            ga = np.dot(g, B.T)
            ga = np.reshape(ga, np.shape(A))
            return ga

        def grad_dot_B(g, A, B):
            # Gradient of np.dot(A, B) with respect to B.
            gb = np.dot(A.T, g)
            gb = np.reshape(gb, np.shape(B))
            return gb

        def _rnn_grad(x, W, b, Wout, bout, label, n):
            # Forward pass: intermediate values are pushed onto stacks so the
            # backward loop can pop them in reverse order.
            h1__1_stack, h1__1 = [], None
            h1__0_stack, h1__0 = [], None
            out_stack, out = [], None
            h1_stack = []
            h1 = x
            _for1 = list(range(n))

            for i in _for1:
                h1__1_stack.append(h1__1)
                h1__1 = np.dot(h1, W)
                h1__0_stack.append(h1__0)
                h1__0 = h1__1 + b
                h1_stack.append(h1)
                h1 = np.tanh(h1__0)
            out__0 = np.dot(h1, Wout)
            out = out__0 + bout
            loss__2 = label * out
            loss__7 = -out
            loss__6 = np.exp(loss__7)
            loss__5 = 1 + loss__6
            loss__4 = np.log(loss__5)
            loss__3 = out + loss__4
            loss__1 = loss__2 - loss__3

            # Begin Backward Pass
            g_loss = 1
            g_h1__0 = 0
            g_h1__1 = 0
            g_b = 0
            g_W = 0

            # Reverse of: loss = -loss__0
            g_loss__0 = -g_loss

            # Reverse of: loss__0 = np.sum(loss__1)
            g_loss__1 = repeat_to_match_shape(g_loss__0, loss__1)

            # Reverse of: loss__1 = loss__2 - loss__3
            g_loss__2 = sum_to_match_shape(g_loss__1, loss__2)
            g_loss__3 = sum_to_match_shape(-g_loss__1, loss__3)

            # Reverse of: loss__3 = out + loss__4
            g_out = sum_to_match_shape(g_loss__3, out)
            g_loss__4 = sum_to_match_shape(g_loss__3, loss__4)

            # Reverse of: loss__4 = np.log(loss__5)
            g_loss__5 = g_loss__4 / loss__5

            # Reverse of: loss__5 = 1 + loss__6
            g_loss__6 = sum_to_match_shape(g_loss__5, loss__6)

            # Reverse of: loss__6 = np.exp(loss__7)
            g_loss__7 = g_loss__6 * np.exp(loss__7)

            # Reverse of: loss__7 = -out
            g_out += -g_loss__7
            g_out += sum_to_match_shape(g_loss__2 * label, out)

            # Reverse of: out = out__0 + bout
            g_out__0 = sum_to_match_shape(g_out, out__0)
            g_bout = sum_to_match_shape(g_out, bout)

            # Reverse of: out__0 = np.dot(h1, Wout)
            g_h1 = grad_dot_A(g_out__0, h1, Wout)
            g_Wout = grad_dot_B(g_out__0, h1, Wout)
            # Walk the recurrence backwards, restoring saved values from the stacks.
            _for1 = reversed(_for1)
            for i in _for1:
                h1 = h1_stack.pop()
                tmp_g0 = g_h1 / np.cosh(h1__0) ** 2.0
                g_h1 = 0
                g_h1__0 += tmp_g0
                h1__0 = h1__0_stack.pop()
                tmp_g1 = sum_to_match_shape(g_h1__0, h1__1)
                tmp_g2 = sum_to_match_shape(g_h1__0, b)
                g_h1__0 = 0
                g_h1__1 += tmp_g1
                g_b += tmp_g2
                h1__1 = h1__1_stack.pop()
                tmp_g3 = grad_dot_A(g_h1__1, h1, W)
                tmp_g4 = grad_dot_B(g_h1__1, h1, W)
                g_h1__1 = 0
                g_h1 += tmp_g3
                g_W += tmp_g4
            return g_W, g_b, g_Wout, g_bout

        # The gradients are computed only for timing; the result is discarded.
        _rnn_grad(self.x, self.W1, self.b1, self.Wout, self.bout, self.l, self.n)
        pass


================================================
FILE: benchmarks/bench_util.py
================================================
import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad

try:
    from autograd.misc.flatten import flatten
except ImportError:
    from autograd.util import flatten


def time_flatten():
    """Time a flatten / unflatten / flatten round trip on a nested container."""
    # Mixed container: arrays, a float and a list of floats.
    val = {
        "k": npr.random((4, 4)),
        "k2": npr.random((3, 3)),
        "k3": 3.0,
        "k4": [1.0, 4.0, 7.0, 9.0],
        "k5": np.array([4.0, 5.0, 6.0]),
        "k6": np.array([[7.0, 8.0], [9.0, 10.0]]),
    }

    vect, unflatten = flatten(val)
    val_recovered = unflatten(vect)
    # The second flatten result is unused; it is only computed to be timed.
    vect_2, _ = flatten(val_recovered)


# def time_vspace_flatten():
#     val = {'k':  npr.random((4, 4)),
#            'k2': npr.random((3, 3)),
#            'k3': 3.0,
#            'k4': [1.0, 4.0, 7.0, 9.0],
#            'k5': np.array([4., 5., 6.]),
#            'k6': np.array([[7., 8.], [9., 10.]])}

#     vspace_flatten(val)


def time_grad_flatten():
    """Time differentiating through ``unflatten`` of a flattened nested container."""
    val = {
        "k": npr.random((4, 4)),
        "k2": npr.random((3, 3)),
        "k3": 3.0,
        "k4": [1.0, 4.0, 7.0, 9.0],
        "k5": np.array([4.0, 5.0, 6.0]),
        "k6": np.array([[7.0, 8.0], [9.0, 10.0]]),
    }

    vect, unflatten = flatten(val)

    def fun(vec):
        # Scalar objective that depends only on the "k5" and "k6" entries.
        v = unflatten(vec)
        return np.sum(v["k5"]) + np.sum(v["k6"])

    grad(fun)(vect)


================================================
FILE: conda_recipe/conda.yaml
================================================
# Conda build recipe for the autograd package (Jinja-templated YAML).
package:
  name: autograd
  # there are ways to derive version from other sources; for now, it's hard-coded
  version: 1.1.1

# Source location is chosen by whether BINSTAR_PLATFORM is set in the environment.
source:
  {% if not environ.get('BINSTAR_PLATFORM', None) %}
  git_url: ../
  {% else %}
  # we're building on binstar, we already have the repo; treat as local path
  path: ../
  {% endif %}

requirements:
  # Packages needed while building the package.
  build:
    - python
    - hatch
    - hatchling
    - future
    - numpy >=1.9

  # Packages needed by the installed package at run time.
  run:
    - python
    - future
    - numpy >=1.9

build:
  # --no-deps: dependencies come from the requirements above, not from pip.
  script: pip install . --no-deps

test:
  # Python imports
  imports:
    - autograd
    - autograd.numpy

about:
  home: https://github.com/HIPS/autograd
  license: MIT
  summary: 'Efficiently computes derivatives of numpy code.'


================================================
FILE: docs/tutorial.md
================================================
# Autograd tutorial

## Motivation

Imagine you want to test out a new machine learning model for your data. This
usually means coming up with some loss function to capture how well your model
fits the data and optimizing that loss with respect to the model parameters. If
there are many model parameters (neural nets can have millions) then you need
gradients. You then have two options: derive and code them up yourself, or
implement your model using the syntactic and semantic constraints of a system
like [Theano](http://deeplearning.net/software/theano/) or
[TensorFlow](https://github.com/tensorflow/tensorflow).

We want to provide a third way: just write down the loss function using a
standard numerical library like Numpy, and Autograd will give you its gradient.

## How to use Autograd

Autograd's `grad` function takes in a function, and gives you a function that computes its derivative.
Your function must have a scalar-valued output (i.e. a float).
This covers the common case when you want to use gradients to optimize something.

Autograd works on ordinary Python and Numpy code containing all the usual control structures, including `while` loops, `if` statements, and closures.  Here's a simple example of using an open-ended loop to compute the sine function:

```python
import autograd.numpy as np   # Thinly-wrapped version of Numpy
from autograd import grad

def taylor_sine(x):  # Taylor approximation to sine function
    ans = currterm = x
    i = 0
    while np.abs(currterm) > 0.001:
        currterm = -currterm * x**2 / ((2 * i + 3) * (2 * i + 2))
        ans = ans + currterm
        i += 1
    return ans

grad_sine = grad(taylor_sine)
print("Gradient of sin(pi) is", grad_sine(np.pi))
```

## Complete example: logistic regression

A common use case for automatic differentiation is to train a probabilistic model.
Here we present a very simple (but complete) example of specifying and training
a logistic regression model for binary classification:

```python
import autograd.numpy as np
from autograd import grad

def sigmoid(x):
    return 0.5 * (np.tanh(x / 2.) + 1)

def logistic_predictions(weights, inputs):
    # Outputs probability of a label being true according to logistic model.
    return sigmoid(np.dot(inputs, weights))

def training_loss(weights):
    # Training loss is the negative log-likelihood of the training labels.
    preds = logistic_predictions(weights, inputs)
    label_probabilities = preds * targets + (1 - preds) * (1 - targets)
    return -np.sum(np.log(label_probabilities))

# Build a toy dataset.
inputs = np.array([[0.52, 1.12,  0.77],
                   [0.88, -1.08, 0.15],
                   [0.52, 0.06, -1.30],
                   [0.74, -2.49, 1.39]])
targets = np.array([True, True, False, True])

# Define a function that returns gradients of training loss using Autograd.
training_gradient_fun = grad(training_loss)

# Optimize weights using gradient descent.
weights = np.array([0.0, 0.0, 0.0])
print("Initial loss:", training_loss(weights))
for i in range(100):
    weights -= training_gradient_fun(weights) * 0.01

print("Trained loss:", training_loss(weights))
```

Python syntax is pretty good for specifying probabilistic models.  The biggest
win is that it becomes a lot easier to modify a model and rapidly iterate.

For more complex examples, see our [examples directory](../examples/), which includes:
* [a simple neural net](../examples/neural_net.py)
* [a convolutional neural net](../examples/convnet.py)
* [a recurrent neural net](../examples/rnn.py)
* [a long short-term memory (LSTM)](../examples/lstm.py)
* [backpropagating through a fluid simulation](../examples/fluidsim/fluidsim.py)


## What's going on under the hood?

To compute the gradient, Autograd first has to record every transformation that was applied to the input as it was turned into the output of your function.
To do this, Autograd wraps functions (using the function `primitive`) so that when they're called, they add themselves to a list of operations performed.
Autograd's core has a table mapping these wrapped primitives to their corresponding gradient functions (or, more precisely, their vector-Jacobian product functions).
To flag the variables we're taking the gradient with respect to, we wrap them using the `Box` class.
You should never have to think about the `Box` class, but you might notice it when printing out debugging info.

After the function is evaluated, Autograd has a graph specifying all operations that were performed on the inputs with respect to which we want to differentiate.
This is the computational graph of the function evaluation.
To compute the derivative, we simply apply the rules of differentiation to each node in the graph.

### Reverse mode differentiation

Given a function made up of several nested function calls, there are several ways to compute its derivative.

For example, given L(x) = F(G(H(x))), the chain rule says that its gradient is dL/dx = dF/dG * dG/dH * dH/dx.  If we evaluate this product from right-to-left: (dF/dG * (dG/dH * dH/dx)), the same order as the computations themselves were performed, this is called forward-mode differentiation.
If we evaluate this product from left-to-right: ((dF/dG * dG/dH) * dH/dx), the reverse of the order in which the computations themselves were performed, this is called reverse-mode differentiation.

Compared to finite differences or forward-mode, reverse-mode differentiation is by far the more practical method for differentiating functions that take in a large vector and output a single number.
In the machine learning community, reverse-mode differentiation is known as 'backpropagation', since the gradients propagate backwards through the function.
It's particularly nice since you don't need to instantiate the intermediate Jacobian matrices explicitly, and instead only rely on applying a sequence of matrix-free vector-Jacobian product functions (VJPs).
Because Autograd supports higher derivatives as well, Hessian-vector products (a form of second-derivative) are also available and efficient to compute.

### How can you support ifs, while loops and recursion?

Some autodiff packages (such as [TensorFlow](https://github.com/tensorflow/tensorflow)) work by having you specify a graph of the computation that your function performs, including all the control flow (such as if and for loops), and then turn that graph into another one that computes gradients.
This has some benefits (such as allowing compile-time optimizations), but it requires you to express control flow in a limited mini-language that those packages know how to handle.  (For example, the `tf.while_loop` and `tf.cond` operations in TensorFlow.)

In contrast, Autograd doesn't have to know about any ifs, branches, loops or recursion that were used to decide which operations were called.  To compute the gradient of a particular input, one only needs to know which continuous transforms were applied to that particular input, not which other transforms might have been applied.
Since Autograd keeps track of the relevant operations on each function call separately, it's not a problem that all the Python control flow operations are invisible to Autograd.  In fact, it greatly simplifies the implementation.


## What can Autograd differentiate?

The main constraint is that any function that operates on a `Box` is marked as `primitive`, and has its gradient implemented.
This is taken care of for most functions in the Numpy library, and it's easy to write your own gradients.

The input can be a scalar, complex number, vector, tuple, a tuple of vectors, a tuple of tuples, etc.

When using the `grad` function, the output must be a scalar, but the functions `elementwise_grad` and `jacobian` allow gradients of vectors.


## Supported and unsupported parts of numpy/scipy

Numpy has [a lot of features](http://docs.scipy.org/doc/numpy/reference/). We've done our best to support most of them. So far, we've implemented gradients for:
* most of the [mathematical operations](../autograd/numpy/numpy_vjps.py)
* most of the [array and matrix manipulation routines](../autograd/numpy/numpy_vjps.py)
* some [linear algebra](../autograd/numpy/linalg.py) functions
* most of the [fast fourier transform](../autograd/numpy/fft.py) routines
* full support for complex numbers
* [N-dimensional convolutions](../autograd/scipy/signal.py)
* Some scipy routines, including [`scipy.stats.norm`](../autograd/scipy/stats/norm.py)

Some things remain to be implemented. For example, we support indexing (`x = A[i, j, :]`) but not assignment (`A[i,j] = x`) in arrays that are being differentiated with respect to.
Assignment is hard to support because it requires keeping copies of the overwritten data, and so even when you write code that looks like it's performing assignment, the system would have to be making copies behind the scenes, often defeating the purpose of in-place operations.

Similarly, we don't support the syntax `A.dot(B)`; use the equivalent `np.dot(A, B)` instead.
The reason we don't support the first way is that subclassing `ndarray` raises a host of issues.
As another consequence of not subclassing `ndarray`, some subclass checks can break; for example, `isinstance(x, np.ndarray)` can return `False`.
However, those `isinstance` checks will work if you instead use Autograd's provided one, writing `from autograd.builtins import isinstance`.

In-place modification of arrays not being differentiated with respect to (for example, `A[i] = x` or `A += B`) won't raise an error, but be careful.
It's easy to accidentally change something without Autograd knowing about it.
This can be a problem because Autograd keeps references to variables used in the forward pass if they will be needed on the reverse pass.
Making copies would be too slow.

Lists and dicts can be used freely - like control flow, Autograd usually doesn't even need to know about them.
The exception is passing in a list to a primitive function, such as `autograd.numpy.sum`.
This requires special care, since the list contents need to be examined for boxes.
We do support passing lists to `autograd.numpy.array` and `autograd.numpy.concatenate`, but in other cases, you may need to explicitly construct an array using `autograd.numpy.array` before passing a list or tuple argument into a primitive.
An alternative is to use the `list`, `dict`, and `tuple` classes in `autograd.builtins`, which should work just like the Python builtins while also ensuring boxes don't get hidden inside those containers.
Remember, these issues typically only come up when you're passing a `list` or `tuple` to a primitive function; when passing around lists or tuples in your own (non-primitive) functions, you can put boxed values inside lists, tuples, or dicts without having to worry about it.

#### TL;DR: Do use
* [Most](../autograd/numpy/numpy_vjps.py) of numpy's functions
* [Most](../autograd/numpy/numpy_boxes.py) numpy.ndarray methods
* [Some](../autograd/scipy/) scipy functions
* Indexing and slicing of arrays `x = A[3, :, 2:4]`
* Explicit array creation from lists `A = np.array([x, y])`

#### Don't use
* Assignment to arrays `A[0,0] = x`
* Implicit casting of lists to arrays `A = np.sum([x, y])`, use `A = np.sum(np.array([x, y]))` instead.
* `A.dot(B)` notation (use `np.dot(A, B)` instead)
* In-place operations (such as `a += b`, use `a = a + b` instead)
* Some isinstance checks, like `isinstance(x, np.ndarray)` or `isinstance(x, tuple)`, without first doing `from autograd.builtins import isinstance, tuple`.

Luckily, it's easy to check gradients numerically if you're worried that something's wrong.

## Extend Autograd by defining your own primitives

What if Autograd doesn't support a function you need to take the gradient of?
This can happen if your code depends on external library calls or C code.
It can sometimes even be a good idea to provide the gradient of a pure Python function for speed or numerical stability.

For example, let's add the gradient of a numerically stable version of `log(sum(exp(x)))`.
This function is included in `scipy.special` and already supported, but let's make our own version.

First, we define our function using standard Python, using `@primitive` as a decorator:

```python
import autograd.numpy as np
from autograd.extend import primitive, defvjp

@primitive
def logsumexp(x):
    """Numerically stable log(sum(exp(x)))"""
    max_x = np.max(x)
    return max_x + np.log(np.sum(np.exp(x - max_x)))
```

`@primitive` tells Autograd not to look inside the function, but instead to treat it as a black box whose gradient can be specified later.
Functions with this decorator can contain anything that Python knows how to execute, including calls to other languages.

Next, we write a function that specifies the gradient of the primitive `logsumexp`:

```python
def logsumexp_vjp(ans, x):
    x_shape = x.shape
    return lambda g: np.full(x_shape, g) * np.exp(x - np.full(x_shape, ans))
```

`logsumexp_vjp` returns a vector-Jacobian product (VJP) operator, which is a function that right-multiplies its argument `g` by the Jacobian matrix of `logsumexp` (without explicitly forming the matrix's coefficients).
`g` will be the gradient of the final objective with respect to `ans` (the output of `logsumexp`).
The calculation can depend on both the input (`x`) and the output (`ans`) of the original function.
If you want to be able to take higher-order derivatives, then the code inside the VJP function must be itself differentiable by Autograd, which usually just means you write it in terms of other primitives which themselves have VJPs (like Numpy functions).

The final step is to tell Autograd about `logsumexp`'s vector-Jacobian product function:
```python
defvjp(logsumexp, logsumexp_vjp)
```

Now we can use `logsumexp` anywhere, including inside of a larger function that we want to differentiate:

```python
from autograd import grad

def example_func(y):
    z = y**2
    lse = logsumexp(z)
    return np.sum(lse)

grad_of_example = grad(example_func)
print("Gradient: ", grad_of_example(np.array([1.5, 6.7, 1e-10])))
```

This example can be found as a Python script [here](../examples/define_gradient.py).

## Complex numbers

Autograd supports complex arrays and scalars using a convention described as follows.
Consider a complex-to-complex function, `f`,
expressed in terms of real-to-real components, `u` and `v`:

```python
def f(z):
    x, y = real(z), imag(z)
    return u(x, y) + v(x, y) * 1j
```

We define `grad` of `f` as

```python
def grad_f(z):
    x, y = real(z), imag(z)
    return grad(u, 0)(x, y) - i * grad(u, 1)(x, y)
```

(The second argument of `grad` specifies which argument we're differentiating with respect to.)
So we throw out v, the imaginary part of f, entirely.

Our convention covers three important cases:
  * If `f` is holomorphic, we get the usual complex derivative
    (since `grad(u, 0) == grad(v, 1)` and `grad(u, 1) == - grad(v, 0)`).
  * If `f` is a real-valued loss function of a complex parameter, `x`,
    we get a result that we can use in a gradient-based optimizer,
    by taking steps in the direction of the complex conjugate of `grad(f)(x)`.
  * If `f` is a real-to-real function that happens to use complex primitives internally,
    some of which must necessarily be non-holomorphic
    (maybe you use FFTs to implement convolutions for example)
    then we get the same result that a purely real implementation would have given.

Our convention doesn't handle the case where `f` is a non-holomorphic function
and you're interested in all of du/dx, du/dy, dv/dx and dv/dy.
But then the answer would have to contain four real values
and there would be no way to express it as a single complex number.

We define primitive vector-Jacobian products of complex functions like this

```python
def f_vjp(g, z):
    x, y = real(z), imag(z)
    g_x, g_y = real(g), imag(g)
    return (       g_x * grad(u, 0)(x, y)
             - i * g_x * grad(u, 1)(x, y)
             -     g_y * grad(v, 0)(x, y)
             + i * g_y * grad(v, 1)(x, y))
```

For holomorphic primitives, this is just the regular complex derivative multiplied by `g`,
so most simple math primitives don't need to be changed from their real implementations.
For non-holomorphic primitives, it preserves all four real partial derivatives as if we
were treating complex numbers as real 2-tuples
(though it throws a couple of negative signs in there).
Chapter 4 of [Dougal's PhD thesis](https://dougalmaclaurin.com/phd-thesis.pdf)
goes into a bit more detail about how we define the primitive vector-Jacobian products.

## Autograd Lecture
For more information on automatic differentiation, autograd's implementation, and advanced automatic differentiation techniques, see a [talk by Matt at the Deep Learning Summer School, Montreal 2017](https://videolectures.net/videos/deeplearning2017_johnson_automatic_differentiation/).

## Support

Autograd was written by
[Dougal Maclaurin](https://dougalmaclaurin.com),
[David Duvenaud](http://mlg.eng.cam.ac.uk/duvenaud/), and
[Matthew Johnson](http://www.mit.edu/~mattjj/)
and we're actively developing it. Please
feel free to submit any bugs or feature requests. We'd also love to hear about
your experiences with Autograd in general. Drop us an email!


================================================
FILE: docs/updateguide.md
================================================
# Autograd v1.2 update guide

Autograd v1.2 changed the interface for defining custom vector-Jacobian
products (VJPs). Luckily the change only affects users writing custom VJPs, and
should only require minor updates to the custom VJP code.

This guide is meant to explain why we made these changes (and others) in
Autograd v1.2, and to summarize everything you need to know to update your
custom VJP code.

- [Reasoning for the changes](#reasoning-for-the-changes)
- [New defvjp interface](#new-defvjp-interface)
- [Gradient checking](#gradient-checking)

## Reasoning for the changes

Here are some of the most important reasons for this update:
1. To allow us to make Autograd faster and more memory efficient, we staged the
   VJP functions to allow more garbage collection and eliminated almost all of
   the vspace metadata checks.
1. Forward-mode now comes built-in with `make_jvp`.
1. There's now a clear extension API in `autograd.extend`, so you can write
   custom VJPs or wrap your own numerical libraries.
1. Autograd is now backend-independent, making it easy to wrap other numerical
   libraries.
1. Autograd's tracing functionality is now parameterized and easily reusable,
   and we added some new tracers for
   [computation graph visualization](https://github.com/hips/autograd/blob/master/examples/dot_graph.py)
   and
   [pure-Python constant folding](https://github.com/hips/autograd/blob/master/autograd/misc/tracers.py).
1. More exhaustive, fast reverse- and forward-mode checking with `autograd.test_util.check_grads`.
1. Expensive VJPs can share work across arguments using `defvjp_argnums`.
1. These changes enabled some internal cleanups, and more features to come!

## New defvjp interface
First, here's an example of the old way to write custom primitives and VJPs:
```python
import autograd.numpy as np
from autograd import primitive

@primitive
def func(x, y, z):
    assert z != 0
    return x * y**2

func.defvjp(lambda g, ans, vs, gvs, x, y, z: g * y**2)
func.defvjp(lambda g, ans, vs, gvs, x, y, z: 2 * g * x * y, argnum=1)
func.defvjp_is_zero(argnums=[2])
```

Here's the new way to write custom VJPs for that same primitive:
```python
import autograd.numpy as np
from autograd.extend import primitive, defvjp  # defvjp is now a function

# primitives look the same as before
@primitive
def func(x, y, z):
    assert z != 0
    return x * y**2

# but we call defvjp differently
defvjp(func,
       lambda ans, x, y, z: lambda g: g * y**2,
       lambda ans, x, y, z: lambda g: 2 * g * x * y,
       None)
```

Here's a list of the `defvjp` changes illustrated in that example:
1. `defvjp` is a function, rather than a method on the `primitive` class. (Actually, `primitive` is now just a function, and no longer a class.) As a result, `func.defvjp(...)` became `defvjp(func, ...)`.
1. VJPs are staged, so that instead of writing `lambda g, ans, vs, gvs, *args: ...` we write `lambda ans, *args: lambda g: ...`. This change enables a lot of automatic garbage collection. In the above example, if we were differentiating only with respect to the `x` argument of `func`, because the VJP for `func` with respect to argument index 0 doesn't need the values of `x` or `z` from the forward pass, those values aren't stored and can instead be immediately garbage-collected.
1. There are no more `vs` and `gvs` arguments. These usually weren't used, and computing vspace metadata for every intermediate value proved to contribute significant overhead for some programs. Autograd now avoids computing vspace metadata unless necessary.
1. `defvjp` lets you define VJPs with respect to multiple arguments at once, and the argnum(s) involved are often implicit.

Here's another example, this time showing how to define VJPs with respect to
specific argnums, leaving the others undefined.
```python
# OLD way to leave some VJPs undefined
func.defvjp(lambda g, ans, vs, gvs, x, y, z, w: ..., argnum=2)
func.defvjp(lambda g, ans, vs, gvs, x, y, z, w: ..., argnum=3)

# NEW way to leave some VJPs undefined
defvjp(func,
       lambda ans, x, y, z, w: lambda g: ...,
       lambda ans, x, y, z, w: lambda g: ...,
       argnums=[2, 3])
```

## Gradient checking
Here's how to do gradient checking, whether on a composite function or on your
primitive with a custom VJP:

```python
from autograd.test_util import check_grads

# check reverse-mode to second order
check_grads(my_func, modes=['rev'], order=2)(*args_for_my_func)
```


================================================
FILE: examples/README.md
================================================
# Autograd examples

## Usage instructions

Some of the examples require additional dependencies beyond Autograd and its
core dependencies. These are set up under the `examples` dependency group. To
install them, navigate to the root directory of where you cloned Autograd and
run
```sh
pip install --group examples
```
from the command line. Note that dependency groups are a recent feature so you
may need to upgrade `pip` with
```sh
pip install --upgrade pip
```

Having installed the additional dependencies, you may navigate to the `examples`
subdirectory and run any of the Python scripts. For example:
```sh
python3 tanh.py
```
Some of the examples print to the terminal and others open pop-up windows for
plots.


================================================
FILE: examples/__init__.py
================================================


================================================
FILE: examples/bayesian_neural_net.py
================================================
import matplotlib.pyplot as plt
from black_box_svi import black_box_variational_inference

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd.misc.optimizers import adam


def make_nn_funs(layer_sizes, L2_reg, noise_variance, nonlinearity=np.tanh):
    """Build the functions of a fully-connected network, batched over both
    data points and weight samples.

    Returns the tuple ``(num_weights, predictions, logprob)``: the length of one
    flattened weight vector, the forward pass, and the unnormalized
    log-density of a batch of weight vectors given inputs and targets."""
    layer_dims = list(zip(layer_sizes[:-1], layer_sizes[1:]))
    # Each layer stores an (n_in x n_out) weight matrix followed by n_out biases.
    num_weights = 0
    for n_in, n_out in layer_dims:
        num_weights += (n_in + 1) * n_out

    def unpack_layers(weights):
        """Yield one (W, b) pair per layer, sliced out of the flat weight rows."""
        n_sets = len(weights)
        offset = 0
        for n_in, n_out in layer_dims:
            w_stop = offset + n_in * n_out
            b_stop = w_stop + n_out
            W = weights[:, offset:w_stop].reshape((n_sets, n_in, n_out))
            b = weights[:, w_stop:b_stop].reshape((n_sets, 1, n_out))
            yield (W, b)
            offset = b_stop

    def predictions(weights, inputs):
        """weights is shape (num_weight_samples x num_weights)
        inputs  is shape (num_datapoints x D)"""
        # A leading axis of size one lets the inputs broadcast over weight samples.
        activations = np.expand_dims(inputs, 0)
        for W, b in unpack_layers(weights):
            outputs = np.einsum("mnd,mdo->mno", activations, W) + b
            activations = nonlinearity(outputs)
        # The last layer's pre-nonlinearity values are the network output.
        return outputs

    def logprob(weights, inputs, targets):
        """Log-prior plus log-likelihood, one value per weight sample."""
        squared_error = (predictions(weights, inputs) - targets) ** 2
        log_lik = -np.sum(squared_error, axis=1)[:, 0] / noise_variance
        log_prior = -L2_reg * np.sum(weights**2, axis=1)
        return log_prior + log_lik

    return num_weights, predictions, logprob


def build_toy_dataset(n_data=40, noise_std=0.1):
    """Create a 1-D regression problem: noisy cosine values on two separated
    input intervals, returned as column arrays ``(inputs, targets)``."""
    D = 1
    half = n_data // 2
    rng = npr.RandomState(0)
    # Two clusters of inputs, on [0, 2] and on [6, 8].
    raw_inputs = np.concatenate([np.linspace(0, 2, num=half), np.linspace(6, 8, num=half)])
    noisy_cos = np.cos(raw_inputs) + rng.randn(n_data) * noise_std
    # Shift and scale the inputs so that they fall inside [-1, 1].
    scaled_inputs = (raw_inputs - 4.0) / 4.0
    return (
        scaled_inputs.reshape((len(scaled_inputs), D)),
        noisy_cos.reshape((len(noisy_cos), D)),
    )


if __name__ == "__main__":
    # Demo: fit a Bayesian neural net to the toy dataset with black-box
    # variational inference, redrawing posterior samples from the callback.

    # Specify inference problem by its unnormalized log-posterior.
    # rbf is the nonlinearity passed to make_nn_funs below; relu is defined
    # as an alternative but is not used in this script.
    rbf = lambda x: np.exp(-(x**2))
    relu = lambda x: np.maximum(x, 0.0)
    num_weights, predictions, logprob = make_nn_funs(
        layer_sizes=[1, 20, 20, 1], L2_reg=0.1, noise_variance=0.01, nonlinearity=rbf
    )

    inputs, targets = build_toy_dataset()
    # The second argument (t) is accepted but ignored: the whole dataset is
    # used on every evaluation.
    log_posterior = lambda weights, t: logprob(weights, inputs, targets)

    # Build variational objective.
    objective, gradient, unpack_params = black_box_variational_inference(
        log_posterior, num_weights, num_samples=20
    )

    # Set up figure.
    fig = plt.figure(figsize=(12, 8), facecolor="white")
    ax = fig.add_subplot(111, frameon=False)
    # Interactive mode plus a non-blocking show so the window can be redrawn
    # from the callback.
    plt.ion()
    plt.show(block=False)

    def callback(params, t, g):
        """Print the current lower bound and plot functions sampled from the
        variational posterior. `g` is accepted but not used."""
        print(f"Iteration {t} lower bound {-objective(params, t)}")

        # Sample functions from posterior.
        # A freshly seeded generator means the same base noise is used on
        # every call.
        rs = npr.RandomState(0)
        mean, log_std = unpack_params(params)
        sample_weights = rs.randn(10, num_weights) * np.exp(log_std) + mean
        plot_inputs = np.linspace(-8, 8, num=400)
        outputs = predictions(sample_weights, np.expand_dims(plot_inputs, 1))

        # Plot data and functions.
        plt.cla()
        ax.plot(inputs.ravel(), targets.ravel(), "bx")
        # One curve per sampled weight vector.
        ax.plot(plot_inputs, outputs[:, :, 0].T)
        ax.set_ylim([-2, 3])
        plt.draw()
        plt.pause(1.0 / 60.0)

    # Initialize variational parameters
    # Layout is [mean, log_std], each of length num_weights, matching how
    # unpack_params is used in the callback above.
    rs = npr.RandomState(0)
    init_mean = rs.randn(num_weights)
    init_log_std = -5 * np.ones(num_weights)
    init_var_params = np.concatenate([init_mean, init_log_std])

    print("Optimizing variational parameters...")
    # NOTE(review): callback is presumably invoked by adam once per iteration
    # with (params, iteration, gradient) -- confirm against autograd.misc.optimizers.
    variational_params = adam(gradient, init_var_params, step_size=0.1, num_iters=1000, callback=callback)


================================================
FILE: examples/bayesian_optimization.py
================================================
"""This Bayesian optimization demo uses gradient-based optimization
to find the next query point."""

import matplotlib.pyplot as plt
from gaussian_process import make_gp_funs, rbf_covariance
from scipy.optimize import minimize

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import value_and_grad
from autograd.scipy.stats import norm


def probability_of_improvement(mean, std, max_so_far):
    """Probability that a Gaussian prediction exceeds the best value so far.

    For f ~ N(mean, std**2) this is
    P(f > max_so_far) = Phi((mean - max_so_far) / std).

    Args:
        mean: predictive mean(s).
        std: predictive standard deviation(s).
        max_so_far: largest value observed so far.

    Returns:
        The probability (or array of probabilities) of improving on `max_so_far`.
    """
    # norm.cdf(x, loc, scale) evaluates Phi((x - loc) / scale). With x=mean and
    # loc=max_so_far this is the probability of improving; the reversed order
    # (x=max_so_far, loc=mean) gives the probability of *not* improving.
    return norm.cdf(mean, max_so_far, std)


def expected_new_max(mean, std, max_so_far):
    """Expected value of max(f, max_so_far) for f ~ N(mean, std**2).

    With z = (mean - max_so_far) / std the closed form is

        max_so_far + (mean - max_so_far) * Phi(z) + std * phi(z),

    where Phi and phi are the standard normal CDF and PDF. The result is never
    smaller than `max_so_far` or than `mean`.

    Args:
        mean: predictive mean(s).
        std: predictive standard deviation(s).
        max_so_far: largest value observed so far.

    Returns:
        The expected new maximum (same shape as the broadcast inputs).
    """
    improvement_prob = norm.cdf(mean, max_so_far, std)  # Phi(z)
    # norm.pdf(x, loc, scale) returns phi(z) / scale, so std * phi(z) needs
    # a factor of std**2 here.
    scaled_density = std**2 * norm.pdf(mean, max_so_far, std)
    return max_so_far + (mean - max_so_far) * improvement_prob + scaled_density


def init_covariance_params(num_params):
    """Return the starting covariance parameters: an all-zero array of size `num_params`."""
    initial_params = np.zeros(num_params)
    return initial_params


def defaultmax(x, default=-np.inf):
    """Largest entry of `x`, or `default` when `x` has no elements."""
    return np.max(x) if x.size != 0 else default


def bayesian_optimize(func, domain_min, domain_max, num_iters=20, callback=None):
    """Maximize `func` over a box using a Gaussian-process surrogate model.

    Each iteration refits the GP hyperparameters (from the third iteration on),
    maximizes the acquisition function from several random starting points to
    pick the next query, evaluates `func` there, and appends the result.

    Args:
        func: objective to maximize; called with a 2-D array holding a single
            query point as its only row.
        domain_min: 1-D array of lower bounds, one per input dimension.
        domain_max: 1-D array of upper bounds, same length as `domain_min`.
        num_iters: number of query points chosen after the initial evaluation.
        callback: optional; called at the end of every iteration as
            ``callback(X, y, predict_func, acquisition_function, next_point, new_value)``.

    Returns:
        The tuple ``(best_x, best_y)``: the evaluated point with the largest
        observed value, and that value.
    """
    D = len(domain_min)

    num_params, predict, log_marginal_likelihood = make_gp_funs(rbf_covariance, num_cov_params=D + 1)

    model_params = init_covariance_params(num_params)

    def optimize_gp_params(init_params, X, y):
        """Fit the GP parameters by minimizing the negative log marginal
        likelihood minus a broad Gaussian log-hyperprior."""
        log_hyperprior = lambda params: np.sum(norm.logpdf(params, 0.0, 100.0))
        objective = lambda params: -log_marginal_likelihood(params, X, y) - log_hyperprior(params)
        return minimize(value_and_grad(objective), init_params, jac=True, method="CG").x

    # The default RandomState is created once, when this nested def executes,
    # so successive iterations continue the same random stream and draw
    # different starting points.
    def choose_next_point(domain_min, domain_max, acquisition_function, num_tries=15, rs=npr.RandomState(0)):
        """Uses gradient-based optimization to find next query point."""
        init_points = rs.rand(num_tries, D) * (domain_max - domain_min) + domain_min

        # scipy minimizes, so negate the acquisition function to maximize it.
        grad_obj = value_and_grad(lambda x: -acquisition_function(x))

        def optimize_point(init_point):
            """Run a short bounded L-BFGS-B search from one starting point."""
            print(".", end="")
            result = minimize(
                grad_obj,
                x0=init_point,
                jac=True,
                method="L-BFGS-B",
                options={"maxiter": 10},
                bounds=list(zip(domain_min, domain_max)),
            )
            return result.x, acquisition_function(result.x)

        optimized_points, optimized_values = zip(*[optimize_point(point) for point in init_points])
        print()
        best_ix = np.argmax(optimized_values)
        return np.atleast_2d(optimized_points[best_ix])

    # Start by evaluating once in the middle of the domain.
    # The midpoint is (min + max) / 2, stored as a single row of length D so
    # that it stacks onto the (0, D) array for any number of dimensions.
    X = np.zeros((0, D))
    y = np.zeros(0)
    X = np.concatenate((X, np.reshape((domain_max + domain_min) / 2.0, (1, D))))
    y = np.concatenate((y, np.reshape(np.array(func(X)), (1,))))

    for i in range(num_iters):
        # The first two iterations keep the initial model parameters.
        if i > 1:
            print("Optimizing model parameters...")
            model_params = optimize_gp_params(model_params, X, y)

        print("Choosing where to look next", end="")

        def predict_func(xstar):
            """GP predictive mean and standard deviation at the rows of `xstar`."""
            mean, cov = predict(model_params, X, y, xstar)
            return mean, np.sqrt(np.diag(cov))

        def acquisition_function(xstar):
            """Expected new maximum at `xstar` under the current GP."""
            xstar = np.atleast_2d(xstar)  # To work around a bug in scipy.minimize
            mean, std = predict_func(xstar)
            return expected_new_max(mean, std, defaultmax(y))

        next_point = choose_next_point(domain_min, domain_max, acquisition_function)

        print("Evaluating expensive function...")
        new_value = func(next_point)

        X = np.concatenate((X, next_point))
        y = np.concatenate((y, np.reshape(np.array(new_value), (1,))))

        if callback:
            callback(X, y, predict_func, acquisition_function, next_point, new_value)

    best_ix = np.argmax(y)
    return X[best_ix, :], y[best_ix]


if __name__ == "__main__":
    # Demo: Bayesian optimization of a 1-D function, redrawing the GP
    # posterior and the acquisition function after every query.

    def example_function(x):
        """Toy objective to maximize."""
        return np.sum(x * np.sin(10.0 * x) + x) - 1

    # One-dimensional search interval.
    domain_min = np.array([0.0])
    domain_max = np.array([1.1])

    # Set up figure.
    fig = plt.figure(figsize=(12, 8), facecolor="white")
    ax = fig.add_subplot(111, frameon=False)
    plt.show(block=False)

    def callback(X, y, predict_func, acquisition_function, next_point, new_value):
        """Redraw the posterior band, the evaluated points, the newest query
        and the acquisition curve."""
        plt.cla()

        # Show posterior marginals.
        plot_xs = np.reshape(np.linspace(domain_min, domain_max, 300), (300, 1))
        pred_mean, pred_std = predict_func(plot_xs)
        ax.plot(plot_xs, pred_mean, "b")
        # Shaded band of +/- 1.96 standard deviations, traced forward along the
        # lower edge and back along the upper edge.
        ax.fill(
            np.concatenate([plot_xs, plot_xs[::-1]]),
            np.concatenate([pred_mean - 1.96 * pred_std, (pred_mean + 1.96 * pred_std)[::-1]]),
            alpha=0.15,
            fc="Blue",
            ec="None",
        )

        # Black crosses: all evaluations so far; red dot: the newest one.
        ax.plot(X, y, "kx")
        ax.plot(next_point, new_value, "ro")

        # Red curve: acquisition function over the plotting grid.
        alphas = acquisition_function(plot_xs)
        ax.plot(plot_xs, alphas, "r")
        ax.set_ylim([-1.5, 1.5])
        ax.set_xticks([])
        ax.set_yticks([])
        plt.draw()
        plt.pause(1)

    best_x, best_y = bayesian_optimize(example_function, domain_min, domain_max, callback=callback)


================================================
FILE: examples/black_box_svi.py
================================================
import matplotlib.pyplot as plt

import autograd.numpy as np
import autograd.numpy.random as npr
import autograd.scipy.stats.multivariate_normal as mvn
import autograd.scipy.stats.norm as norm
from autograd import grad
from autograd.misc.optimizers import adam


def black_box_variational_inference(logprob, D, num_samples):
    """Black-box variational inference as in http://arxiv.org/abs/1401.0118,
    using the local reparameterization trick from http://arxiv.org/abs/1506.02557.

    Returns ``(variational_objective, gradient, unpack_params)`` for a
    diagonal-Gaussian variational family over D dimensions."""
    rng = npr.RandomState(0)
    # The part of the diagonal-Gaussian entropy that does not depend on log_std.
    entropy_offset = 0.5 * D * (1.0 + np.log(2 * np.pi))

    def unpack_params(params):
        # First D entries are the mean, the remainder the log standard deviations.
        return params[:D], params[D:]

    def variational_objective(params, t):
        """Negated Monte Carlo estimate of the variational lower bound."""
        mean, log_std = unpack_params(params)
        noise = rng.randn(num_samples, D)
        samples = noise * np.exp(log_std) + mean
        entropy = entropy_offset + np.sum(log_std)
        expected_logprob = np.mean(logprob(samples, t))
        return -(entropy + expected_logprob)

    return variational_objective, grad(variational_objective), unpack_params


if __name__ == "__main__":
    # Demo: fit a diagonal Gaussian to a 2-D target density and animate the
    # contours of both while optimizing.

    # Specify an inference problem by its unnormalized log-density.
    D = 2

    def log_density(x, t):
        """Log-density of each row of `x`, read as (mu, log_sigma); `t` is unused."""
        mu, log_sigma = x[:, 0], x[:, 1]
        # log_sigma has a zero-mean normal density with scale 1.35, and mu a
        # zero-mean normal density whose scale is exp(log_sigma).
        sigma_density = norm.logpdf(log_sigma, 0, 1.35)
        mu_density = norm.logpdf(mu, 0, np.exp(log_sigma))
        return sigma_density + mu_density

    # Build variational objective.
    objective, gradient, unpack_params = black_box_variational_inference(log_density, D, num_samples=2000)

    # Set up plotting code
    def plot_isocontours(ax, func, xlimits=[-2, 2], ylimits=[-4, 2], numticks=101):
        """Draw contour lines of `func` evaluated on a numticks x numticks grid."""
        x = np.linspace(*xlimits, num=numticks)
        y = np.linspace(*ylimits, num=numticks)
        X, Y = np.meshgrid(x, y)
        # func receives the grid as an (N, 2) array with one (x, y) point per row.
        zs = func(np.concatenate([np.atleast_2d(X.ravel()), np.atleast_2d(Y.ravel())]).T)
        Z = zs.reshape(X.shape)
        plt.contour(X, Y, Z)
        ax.set_yticks([])
        ax.set_xticks([])

    # Set up figure.
    fig = plt.figure(figsize=(8, 8), facecolor="white")
    ax = fig.add_subplot(111, frameon=False)
    # Interactive mode plus a non-blocking show so the window can be redrawn
    # from the callback.
    plt.ion()
    plt.show(block=False)

    def callback(params, t, g):
        """Print the current lower bound and redraw the target and variational
        contours. `g` is accepted but not used."""
        print(f"Iteration {t} lower bound {-objective(params, t)}")

        plt.cla()
        target_distribution = lambda x: np.exp(log_density(x, t))
        plot_isocontours(ax, target_distribution)

        mean, log_std = unpack_params(params)
        # exp(2 * log_std) is the variance, placed on the covariance diagonal.
        variational_contour = lambda x: mvn.pdf(x, mean, np.diag(np.exp(2 * log_std)))
        plot_isocontours(ax, variational_contour)
        plt.draw()
        plt.pause(1.0 / 30.0)

    print("Optimizing variational parameters...")
    # Parameter layout is [mean, log_std], each of length D.
    init_mean = -1 * np.ones(D)
    init_log_std = -5 * np.ones(D)
    init_var_params = np.concatenate([init_mean, init_log_std])
    # NOTE(review): callback is presumably invoked by adam once per iteration
    # with (params, iteration, gradient) -- confirm against autograd.misc.optimizers.
    variational_params = adam(gradient, init_var_params, step_size=0.1, num_iters=2000, callback=callback)


================================================
FILE: examples/convnet.py
================================================
"""Convolutional neural net on MNIST, modeled on 'LeNet-5',
http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf"""

import data_mnist

import autograd.numpy as np
import autograd.numpy.random as npr
import autograd.scipy.signal
from autograd import grad

convolve = autograd.scipy.signal.convolve


class WeightsParser:
    """Maps named, shaped weight blocks onto consecutive slices of one flat vector."""

    def __init__(self):
        # name -> (slice into the flat vector, shape of the block)
        self.idxs_and_shapes = {}
        # Total number of entries reserved so far.
        self.N = 0

    def add_weights(self, name, shape):
        """Reserve the next `prod(shape)` entries of the vector for `name`."""
        first = self.N
        last = first + np.prod(shape)
        self.N = last
        self.idxs_and_shapes[name] = (slice(first, last), shape)

    def get(self, vect, name):
        """Return the entries of `vect` registered under `name`, reshaped to their shape."""
        where, shape = self.idxs_and_shapes[name]
        flat_block = vect[where]
        return np.reshape(flat_block, shape)


def make_batches(N_total, N_batch):
    """Return consecutive slices of width `N_batch` that together cover `range(N_total)`.

    The final slice may extend past `N_total`.
    """
    batches = []
    lo = 0
    while lo < N_total:
        hi = lo + N_batch
        batches.append(slice(lo, hi))
        lo = hi
    return batches


def logsumexp(X, axis, keepdims=False):
    """Numerically stable log(sum(exp(X))) taken along `axis`.

    Each slice along `axis` is shifted by its own maximum before exponentiating.
    Shifting by the maximum of the whole array is mathematically equivalent, but
    any slice far below that global maximum underflows to exp(...) == 0 and
    yields log(0) == -inf.

    Args:
        X: Array of log-domain values.
        axis: Axis (or axes) to reduce over, as accepted by `np.sum`.
        keepdims: If true, the reduced axes are kept with length one.

    Returns:
        Array of log-sum-exp values.
    """
    max_X = np.max(X, axis=axis, keepdims=True)
    # A slice with a non-finite maximum (e.g. all -inf) would produce
    # inf - inf = nan below; shift such slices by zero instead.
    max_X = np.where(np.isfinite(max_X), max_X, 0.0)
    out = max_X + np.log(np.sum(np.exp(X - max_X), axis=axis, keepdims=True))
    if keepdims:
        return out
    return np.squeeze(out, axis=axis)


def make_nn_funs(input_shape, layer_specs, L2_reg):
    """Build the prediction, loss and error-rate functions for a stack of layers.

    Args:
        input_shape: Shape of one input example, handed to the first layer's
            `build_weights_dict`.
        layer_specs: Sequence of layer objects exposing
            `build_weights_dict(shape) -> (n_weights, output_shape)` and
            `forward_pass(inputs, weights)`.
        L2_reg: Coefficient of the squared-norm penalty on the weight vector.

    Returns:
        Tuple `(N, predictions, loss, frac_err)` where `N` is the total number
        of weights and the three functions all take the flat weight vector first.
    """
    parser = WeightsParser()
    cur_shape = input_shape
    for layer in layer_specs:
        N_weights, cur_shape = layer.build_weights_dict(cur_shape)
        # The layer object itself is the key for its slice of the weight vector.
        parser.add_weights(layer, (N_weights,))

    def predictions(W_vect, inputs):
        """Outputs normalized log-probabilities.
        shape of inputs : [data, color, y, x]"""
        cur_units = inputs
        for layer in layer_specs:
            cur_weights = parser.get(W_vect, layer)
            cur_units = layer.forward_pass(cur_units, cur_weights)
        return cur_units

    def loss(W_vect, X, T):
        """Negative log-posterior: L2 penalty minus the log-likelihood of targets `T`."""
        log_prior = -L2_reg * np.dot(W_vect, W_vect)
        log_lik = np.sum(predictions(W_vect, X) * T)
        return -log_prior - log_lik

    def frac_err(W_vect, X, T):
        """Fraction of rows whose arg-max prediction differs from the arg-max of `T`."""
        # Use the local `predictions` closure; the previous code referred to
        # `pred_fun`, which only exists as a global of the script's __main__ block.
        return np.mean(np.argmax(T, axis=1) != np.argmax(predictions(W_vect, X), axis=1))

    return parser.N, predictions, loss, frac_err


class conv_layer:
    """Convolutional layer: `num_filters` kernels of `kernel_shape` plus one bias per filter."""

    def __init__(self, kernel_shape, num_filters):
        self.kernel_shape = kernel_shape
        self.num_filters = num_filters

    def forward_pass(self, inputs, param_vector):
        """Convolve `inputs` with this layer's kernels ("valid" mode) and add the biases.

        Input dimensions:  [data, color_in, y, x]
        Params dimensions: [color_in, color_out, y, x]
        Output dimensions: [data, color_out, y, x]
        """
        kernels = self.parser.get(param_vector, "params")
        offsets = self.parser.get(param_vector, "biases")
        responses = convolve(inputs, kernels, axes=([2, 3], [2, 3]), dot_axes=([1], [0]), mode="valid")
        return responses + offsets

    def build_weights_dict(self, input_shape):
        """Register this layer's weights and return `(n_weights, output_shape)`.

        Input shape : [color, y, x] (don't need to know number of data yet)
        """
        parser = self.parser = WeightsParser()
        parser.add_weights("params", (input_shape[0], self.num_filters) + self.kernel_shape)
        parser.add_weights("biases", (1, self.num_filters, 1, 1))
        spatial = self.conv_output_shape(input_shape[1:], self.kernel_shape)
        return parser.N, (self.num_filters,) + spatial

    def conv_output_shape(self, A, B):
        """Spatial size of a "valid" convolution of an image of size A with a kernel of size B."""
        height = A[0] - B[0] + 1
        width = A[1] - B[1] + 1
        return (height, width)


class maxpool_layer:
    """Non-overlapping max pooling over the two trailing axes of a 4-D input."""

    def __init__(self, pool_shape):
        # (pool height, pool width); must divide the input's y and x sizes exactly.
        self.pool_shape = pool_shape

    def build_weights_dict(self, input_shape):
        """Return `(0, output_shape)`; the layer has no trainable weights.

        input_shape dimensions: [color, y, x]

        Raises:
            AssertionError: if the pool shape does not tile the input exactly.
        """
        output_shape = list(input_shape)
        for i in [0, 1]:
            assert input_shape[i + 1] % self.pool_shape[i] == 0, "maxpool shape should tile input exactly"
            # Floor division keeps the dimension an integer. True division
            # would hand float sizes to the layers that follow.
            output_shape[i + 1] = input_shape[i + 1] // self.pool_shape[i]
        return 0, output_shape

    def forward_pass(self, inputs, param_vector):
        """Max-pool `inputs` ([data, color, y, x]); `param_vector` is unused."""
        new_shape = inputs.shape[:2]
        for i in [0, 1]:
            pool_width = self.pool_shape[i]
            img_width = inputs.shape[i + 2]
            new_shape += (img_width // pool_width, pool_width)
        # Shape is now [data, color, y_block, y_in_block, x_block, x_in_block].
        result = inputs.reshape(new_shape)
        # Reduce y_in_block (axis 3) first; x_in_block then sits at axis 4.
        return np.max(np.max(result, axis=3), axis=4)


class full_layer:
    """Fully connected layer; subclasses supply the `nonlinearity` method."""

    def __init__(self, size):
        self.size = size

    def build_weights_dict(self, input_shape):
        """Register a weight matrix and a bias vector; return `(n_weights, (size,))`.

        Input shape is anything (all flattened).
        """
        fan_in = np.prod(input_shape, dtype=int)
        parser = self.parser = WeightsParser()
        parser.add_weights("params", (fan_in, self.size))
        parser.add_weights("biases", (self.size,))
        return parser.N, (self.size,)

    def forward_pass(self, inputs, param_vector):
        """Apply the affine map followed by `self.nonlinearity`."""
        weights = self.parser.get(param_vector, "params")
        offsets = self.parser.get(param_vector, "biases")
        if inputs.ndim > 2:
            # Collapse every axis after the first into a single feature axis.
            n_rows = inputs.shape[0]
            inputs = inputs.reshape((n_rows, np.prod(inputs.shape[1:])))
        pre_activation = np.dot(inputs[:, :], weights) + offsets
        return self.nonlinearity(pre_activation)


class tanh_layer(full_layer):
    """Fully connected layer with an element-wise tanh activation."""

    def nonlinearity(self, x):
        activated = np.tanh(x)
        return activated


class softmax_layer(full_layer):
    """Fully connected layer whose output rows are log-probabilities (log-softmax)."""

    def nonlinearity(self, x):
        # Subtract each row's log-normalizer so that exp(row) sums to one.
        log_normalizer = logsumexp(x, axis=1, keepdims=True)
        return x - log_normalizer


if __name__ == "__main__":
    # Network parameters
    L2_reg = 1.0
    input_shape = (1, 28, 28)
    # conv -> pool -> conv -> pool -> tanh -> tanh -> softmax
    layer_specs = [
        conv_layer((5, 5), 6),
        maxpool_layer((2, 2)),
        conv_layer((5, 5), 16),
        maxpool_layer((2, 2)),
        tanh_layer(120),
        tanh_layer(84),
        softmax_layer(10),
    ]

    # Training parameters
    param_scale = 0.1
    learning_rate = 1e-3
    momentum = 0.9
    batch_size = 256
    num_epochs = 50

    # Load and process MNIST data
    print("Loading training data...")
    # Insert a length-one channel axis: [data, y, x] -> [data, 1, y, x].
    add_color_channel = lambda x: x.reshape((x.shape[0], 1, x.shape[1], x.shape[2]))
    # Integer labels -> rows of K indicator values.
    one_hot = lambda x, K: np.array(x[:, None] == np.arange(K)[None, :], dtype=int)
    train_images, train_labels, test_images, test_labels = data_mnist.mnist()
    train_images = add_color_channel(train_images) / 255.0
    test_images = add_color_channel(test_images) / 255.0
    train_labels = one_hot(train_labels, 10)
    test_labels = one_hot(test_labels, 10)
    N_data = train_images.shape[0]

    # Make neural net functions
    N_weights, pred_fun, loss_fun, frac_err = make_nn_funs(input_shape, layer_specs, L2_reg)
    loss_grad = grad(loss_fun)

    # Initialize weights
    # NOTE: RandomState() is created without a seed, so runs are not reproducible.
    rs = npr.RandomState()
    W = rs.randn(N_weights) * param_scale

    # Check the gradients numerically, just to be safe
    # quick_grad_check(loss_fun, W, (train_images[:50], train_labels[:50]))

    print("    Epoch      |    Train err  |   Test error  ")

    def print_perf(epoch, W):
        """Print one table row: epoch, training error rate, test error rate."""
        test_perf = frac_err(W, test_images, test_labels)
        train_perf = frac_err(W, train_images, train_labels)
        print(f"{epoch:15}|{train_perf:15}|{test_perf:15}")

    # Train with sgd
    batch_idxs = make_batches(N_data, batch_size)
    cur_dir = np.zeros(N_weights)

    for epoch in range(num_epochs):
        print_perf(epoch, W)
        for idxs in batch_idxs:
            grad_W = loss_grad(W, train_images[idxs], train_labels[idxs])
            # cur_dir is an exponential moving average of the gradients.
            cur_dir = momentum * cur_dir + (1.0 - momentum) * grad_W
            W -= learning_rate * cur_dir


================================================
FILE: examples/data.py
================================================
import data_mnist
import matplotlib.image
import matplotlib.pyplot as plt

import autograd.numpy as np
import autograd.numpy.random as npr


def load_mnist():
    """Load MNIST via `data_mnist.mnist()` with flattened, rescaled images and one-hot labels.

    Returns:
        Tuple `(N_data, train_images, train_labels, test_images, test_labels)`,
        where `N_data` is the number of training images.
    """

    def partial_flatten(x):
        # Keep the first axis, merge all remaining axes into one.
        return np.reshape(x, (x.shape[0], np.prod(x.shape[1:])))

    def one_hot(x, k):
        # Integer labels -> rows of k indicator values.
        return np.array(x[:, None] == np.arange(k)[None, :], dtype=int)

    raw_train_x, raw_train_y, raw_test_x, raw_test_y = data_mnist.mnist()
    train_images = partial_flatten(raw_train_x) / 255.0
    test_images = partial_flatten(raw_test_x) / 255.0
    train_labels = one_hot(raw_train_y, 10)
    test_labels = one_hot(raw_test_y, 10)

    return train_images.shape[0], train_images, train_labels, test_images, test_labels


def plot_images(
    images,
    ax,
    ims_per_row=5,
    padding=5,
    digit_dimensions=(28, 28),
    cmap=matplotlib.cm.binary,
    vmin=None,
    vmax=None,
):
    """Tile the rows of `images` into one padded grid and show it on `ax`.

    Images should be a (N_images x pixels) matrix. Each row is reshaped to
    `digit_dimensions`; gaps and borders are filled with the minimum pixel value.
    Returns the object produced by `ax.matshow`.
    """
    tile_h = digit_dimensions[0]
    tile_w = digit_dimensions[1]
    # Distance between the top-left corners of neighbouring tiles.
    cell_h = tile_h + padding
    cell_w = tile_w + padding

    N_images = images.shape[0]
    N_rows = (N_images - 1) // ims_per_row + 1
    pad_value = np.min(images.ravel())
    canvas = np.full((cell_h * N_rows + padding, cell_w * ims_per_row + padding), pad_value)

    for i in range(N_images):
        tile = np.reshape(images[i, :], digit_dimensions)
        row_ix, col_ix = divmod(i, ims_per_row)
        top = padding + cell_h * row_ix
        left = padding + cell_w * col_ix
        canvas[top : top + tile_h, left : left + tile_w] = tile

    cax = ax.matshow(canvas, cmap=cmap, vmin=vmin, vmax=vmax)
    plt.xticks(np.array([]))
    plt.yticks(np.array([]))
    return cax


def save_images(images, filename, **kwargs):
    """Render `images` with `plot_images` on figure 1 and save it to `filename`.

    Extra keyword arguments are forwarded to `plot_images`.
    """
    # Figure number 1 is reused and cleared on every call.
    fig = plt.figure(1)
    fig.clf()
    ax = fig.add_subplot(111)
    plot_images(images, ax, **kwargs)
    # Hide the figure and axes background patches before saving.
    fig.patch.set_visible(False)
    ax.patch.set_visible(False)
    plt.savefig(filename)


def make_pinwheel(radial_std, tangential_std, num_classes, num_per_class, rate, rs=None):
    """Sample 2-D points arranged in `num_classes` rotated arms.

    Based on code by Ryan P. Adams.

    Args:
        radial_std: Standard deviation of the first feature before rotation.
        tangential_std: Standard deviation of the second feature before rotation.
        num_classes: Number of arms.
        num_per_class: Number of points per arm.
        rate: Multiplier on exp(first feature) that is added to each point's
            rotation angle.
        rs: Random state with a `randn` method. Defaults to a fresh
            `RandomState(0)` on every call, so repeated default calls return
            the same data. A single generator stored as the default value would
            be shared between calls and advance each time.

    Returns:
        Array of shape (num_classes * num_per_class, 2).
    """
    if rs is None:
        rs = npr.RandomState(0)

    rads = np.linspace(0, 2 * np.pi, num_classes, endpoint=False)

    features = rs.randn(num_classes * num_per_class, 2) * np.array([radial_std, tangential_std])
    features[:, 0] += 1
    labels = np.repeat(np.arange(num_classes), num_per_class)

    angles = rads[labels] + rate * np.exp(features[:, 0])
    # One 2x2 rotation matrix per point, built from the stacked entries.
    rotations = np.stack([np.cos(angles), -np.sin(angles), np.sin(angles), np.cos(angles)])
    rotations = np.reshape(rotations.T, (-1, 2, 2))

    return np.einsum("ti,tij->tj", features, rotations)


================================================
FILE: examples/data_mnist.py
================================================
import array
import gzip
import os
import struct
from urllib.request import urlretrieve

import numpy as np


def download(url, filename):
    """Fetch `url` into data/<filename> unless that file already exists.

    The data/ directory is created relative to the current working directory.
    """
    if not os.path.exists("data"):
        os.makedirs("data")
    target = os.path.join("data", filename)
    if os.path.isfile(target):
        return
    urlretrieve(url, target)


def mnist():
    """Download the four MNIST archives into data/ if needed and parse them.

    Returns:
        Tuple `(train_images, train_labels, test_images, test_labels)` of uint8
        arrays; images are reshaped to (count, rows, cols) as read from the header.
    """
    base_url = "https://storage.googleapis.com/cvdf-datasets/mnist/"

    def parse_labels(filename):
        with gzip.open(filename, "rb") as fh:
            # 8-byte big-endian header; its two fields are not used further.
            _magic, _count = struct.unpack(">II", fh.read(8))
            payload = fh.read()
            return np.array(array.array("B", payload), dtype=np.uint8)

    def parse_images(filename):
        with gzip.open(filename, "rb") as fh:
            # 16-byte big-endian header: magic, image count, rows, cols.
            _magic, count, n_rows, n_cols = struct.unpack(">IIII", fh.read(16))
            payload = fh.read()
            return np.array(array.array("B", payload), dtype=np.uint8).reshape(count, n_rows, n_cols)

    archives = (
        "train-images-idx3-ubyte.gz",
        "train-labels-idx1-ubyte.gz",
        "t10k-images-idx3-ubyte.gz",
        "t10k-labels-idx1-ubyte.gz",
    )
    for archive in archives:
        download(base_url + archive, archive)

    train_images = parse_images("data/train-images-idx3-ubyte.gz")
    train_labels = parse_labels("data/train-labels-idx1-ubyte.gz")
    test_images = parse_images("data/t10k-images-idx3-ubyte.gz")
    test_labels = parse_labels("data/t10k-labels-idx1-ubyte.gz")

    return train_images, train_labels, test_images, test_labels


================================================
FILE: examples/deep_gaussian_process.py
================================================
import matplotlib.pyplot as plt
from gaussian_process import make_gp_funs, rbf_covariance
from scipy.optimize import minimize

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import value_and_grad


def build_step_function_dataset(D=1, n_data=40, noise_std=0.1):
    """Build a noisy step-function regression set on [-2, 2].

    Inputs are `n_data` evenly spaced points reshaped to (n_data, D); targets
    are sign(input) plus Gaussian noise scaled by `noise_std`, drawn from a
    generator seeded with 0.
    """
    rs = npr.RandomState(0)
    grid = np.linspace(-2, 2, num=n_data)
    noise = rs.randn(n_data) * noise_std
    targets = np.sign(grid) + noise
    return grid.reshape((len(grid), D)), targets


def build_deep_gp(input_dimension, hidden_dimension, covariance_function):
    """Assemble a two-layer GP (input -> hidden -> output) from two single-layer GPs.

    NOTE(review): the body reads the names `n_data`, `X` and `y` without
    receiving them as arguments. In this file they are only assigned inside the
    `__main__` block, so calling this function from another module presumably
    raises NameError. Confirm before reusing it.

    Returns:
        Tuple `(total_num_params, log_marginal_likelihood, combined_predict_fun,
        unpack_all_params, predict_layer_funcs)`.
    """
    # GP going from input to hidden
    n_layer1, predict_layer1, lml_layer1 = make_gp_funs(
        covariance_function, num_cov_params=input_dimension + 1
    )

    # GP going from hidden to output
    n_layer2, predict_layer2, lml_layer2 = make_gp_funs(
        covariance_function, num_cov_params=hidden_dimension + 1
    )

    # The hidden values themselves are free parameters, one block per data point.
    num_hidden_params = hidden_dimension * n_data
    total_num_params = n_layer1 + n_layer2 + num_hidden_params

    def unpack_all_params(all_params):
        """Split the flat vector into (layer-1 params, layer-2 params, hiddens)."""
        layer1_params = all_params[:n_layer1]
        layer2_params = all_params[n_layer1 : n_layer1 + n_layer2]
        hiddens = all_params[n_layer1 + n_layer2 :]
        return layer1_params, layer2_params, hiddens

    def combined_predict_fun(all_params, X, y, xs):
        """Predict at `xs` by feeding layer 1's predictive mean into layer 2."""
        layer1_params, layer2_params, hiddens = unpack_all_params(all_params)
        h_star_mean, h_star_cov = predict_layer1(layer1_params, X, hiddens, xs)
        hidden_inputs = np.atleast_2d(hiddens).T
        hidden_queries = np.atleast_2d(h_star_mean).T
        y_star_mean, y_star_cov = predict_layer2(layer2_params, hidden_inputs, y, hidden_queries)
        return y_star_mean, y_star_cov

    def log_marginal_likelihood(all_params):
        """Sum of the two layers' log marginal likelihoods."""
        layer1_params, layer2_params, h = unpack_all_params(all_params)
        first = lml_layer1(layer1_params, X, h)
        second = lml_layer2(layer2_params, np.atleast_2d(h).T, y)
        return first + second

    predict_layer_funcs = [predict_layer1, predict_layer2]

    return (
        total_num_params,
        log_marginal_likelihood,
        combined_predict_fun,
        unpack_all_params,
        predict_layer_funcs,
    )


if __name__ == "__main__":
    # NOTE: n_data, X and y are read as module globals inside build_deep_gp,
    # so they must be assigned before the returned functions are called.
    n_data = 20
    input_dimension = 1
    hidden_dimension = 1
    X, y = build_step_function_dataset(D=input_dimension, n_data=n_data)

    (
        total_num_params,
        log_marginal_likelihood,
        combined_predict_fun,
        unpack_all_params,
        predict_layer_funcs,
    ) = build_deep_gp(input_dimension, hidden_dimension, rbf_covariance)

    # Set up figure.
    fig = plt.figure(figsize=(12, 8), facecolor="white")
    ax_end_to_end = fig.add_subplot(311, frameon=False)
    ax_x_to_h = fig.add_subplot(312, frameon=False)
    ax_h_to_y = fig.add_subplot(313, frameon=False)
    plt.show(block=False)

    def plot_gp(ax, X, y, pred_mean, pred_cov, plot_xs):
        """Draw the predictive mean, a +/- 1.96 std band, posterior samples and the data."""
        ax.cla()
        marg_std = np.sqrt(np.diag(pred_cov))
        ax.plot(plot_xs, pred_mean, "b")
        # Closed polygon: lower edge left-to-right, then upper edge right-to-left.
        ax.fill(
            np.concatenate([plot_xs, plot_xs[::-1]]),
            np.concatenate([pred_mean - 1.96 * marg_std, (pred_mean + 1.96 * marg_std)[::-1]]),
            alpha=0.15,
            fc="Blue",
            ec="None",
        )

        # Show samples from posterior.
        rs = npr.RandomState(0)
        sampled_funcs = rs.multivariate_normal(pred_mean, pred_cov, size=10)
        ax.plot(plot_xs, sampled_funcs.T)
        ax.plot(X, y, "kx")
        ax.set_ylim([-1.5, 1.5])
        ax.set_xticks([])
        ax.set_yticks([])

    def callback(params):
        """Optimizer hook: print the objective and redraw the three panels."""
        print(f"Log marginal likelihood {log_marginal_likelihood(params)}")

        # Show posterior marginals.
        plot_xs = np.reshape(np.linspace(-5, 5, 300), (300, 1))
        pred_mean, pred_cov = combined_predict_fun(params, X, y, plot_xs)
        plot_gp(ax_end_to_end, X, y, pred_mean, pred_cov, plot_xs)
        ax_end_to_end.set_title("X to y")

        layer1_params, layer2_params, hiddens = unpack_all_params(params)
        h_star_mean, h_star_cov = predict_layer_funcs[0](layer1_params, X, hiddens, plot_xs)
        # Layer-2 parameters go with the layer-2 predictor (index 1), matching
        # what combined_predict_fun does; index 0 is the input-to-hidden GP.
        y_star_mean, y_star_cov = predict_layer_funcs[1](layer2_params, np.atleast_2d(hiddens).T, y, plot_xs)

        plot_gp(ax_x_to_h, X, hiddens, h_star_mean, h_star_cov, plot_xs)
        ax_x_to_h.set_title("X to hiddens")

        plot_gp(ax_h_to_y, np.atleast_2d(hiddens).T, y, y_star_mean, y_star_cov, plot_xs)
        ax_h_to_y.set_title("hiddens to y")

        plt.draw()
        plt.pause(1.0 / 60.0)

    # Initialize covariance parameters and hiddens.
    rs = npr.RandomState(0)
    init_params = 0.1 * rs.randn(total_num_params)

    print("Optimizing covariance parameters...")
    objective = lambda params: -log_marginal_likelihood(params)
    cov_params = minimize(value_and_grad(objective), init_params, jac=True, method="CG", callback=callback)
    plt.pause(10.0)


================================================
FILE: examples/define_gradient.py
================================================
"""This example shows how to define the gradient of your own functions.
This can be useful for speed, numerical stability, or in cases where
your code depends on external library calls."""

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad
from autograd.extend import defvjp, primitive
from autograd.test_util import check_grads


# @primitive tells Autograd not to look inside this function, but instead
# to treat it as a black box, whose gradient might be specified later.
# Functions with this decorator can contain anything that Python knows
# how to execute, and you can do things like in-place operations on arrays.
@primitive
def logsumexp(x):
    """Numerically stable log(sum(exp(x))), also defined in scipy.special"""
    # Shift by the largest entry so that the largest exponent is exp(0) = 1.
    shift = np.max(x)
    shifted_sum = np.sum(np.exp(x - shift))
    return shift + np.log(shifted_sum)


# Next, we write a function that specifies the gradient with a closure.
# The reason for the closure is so that the gradient can depend
# on both the input to the original function (x), and the output of the
# original function (ans).


def logsumexp_vjp(ans, x):
    # Builds the vector-Jacobian product of logsumexp for one forward call.
    # `x` is that call's input and `ans` its output. The returned function
    # receives g, the gradient of the objective w.r.t. `ans` (Autograd uses
    # reverse-mode differentiation), and multiplies it with d_ans/d_x, which
    # is exp(x - ans).
    # For higher-order derivatives to work, everything in here must itself be
    # differentiable by Autograd.
    # Note that the returned function closes over both `x` and `ans`, so they
    # stay alive for as long as the VJP function does.
    x_shape = x.shape

    def vjp(g):
        upstream = np.full(x_shape, g)
        return upstream * np.exp(x - np.full(x_shape, ans))

    return vjp


# Now we tell Autograd that logsumexp has a gradient-making function.
defvjp(logsumexp, logsumexp_vjp)

if __name__ == "__main__":
    # Now we can use logsumexp() inside a larger function that we want
    # to differentiate.
    def example_func(y):
        """Scalar test function: logsumexp of the element-wise square of `y`."""
        z = y**2
        lse = logsumexp(z)
        return np.sum(lse)

    grad_of_example = grad(example_func)
    # The input is drawn from the unseeded global generator, so output varies per run.
    print("Gradient: \n", grad_of_example(npr.randn(10)))

    # Check the gradients numerically, just to be safe.
    # Only reverse mode is checked: a VJP is the only derivative defined above.
    check_grads(example_func, modes=["rev"])(npr.randn(10))


================================================
FILE: examples/dot_graph.py
================================================
"""Generates a graphviz DOT file of an evaluation trace.
Usage (need the dot binary, from the graphviz package, www.graphviz.org):

python dot_graph.py | dot -Tpdf -o graph.pdf
"""

import autograd.numpy as np
from autograd.tracer import Node, trace


class GraphNode(Node):
    """Tracer node that records the traced function's name, its arguments and its parent nodes."""

    # Records the full graph (could have this in tracer.py)
    def __init__(self, value, fun, args, kwargs, parent_argnums, parents):
        # `value` and `kwargs` are accepted but not stored.
        self.fun_name = fun.__name__
        self.args = args
        # Map each argument position that came from a traced node to that node.
        self.parents = dict(zip(parent_argnums, parents))
        self.isroot = False

    def initialize_root(self, x):
        # Mark this node as a graph input; `x` is not stored.
        self.isroot = True

    def __repr__(self):
        # The repr doubles as the node's identifier when formatted into DOT text.
        return f"node_{id(self)}"


def trace_graph(f, x):
    """Trace `f` at `x` with `GraphNode` and return the node `trace` gives back."""
    root = GraphNode.new_root(x)
    _value, end_node = trace(root, f, x)
    return end_node


# DOT-fragment builders: each name is the bound `str.format` of a template, so
# e.g. dot_edge(a, b) yields one edge statement from a to b.
dot_edge = "{} -> {} [color=gray30];\n".format
dot_function_node = '{} [label="{}", shape=box, color=lightblue, style=filled];\n'.format
dot_variable_node = '{} [label="{}", color=orange, style=filled];\n'.format
# The doubled braces are literal { and }; the inner {} receives the graph body.
dot_graph = "digraph G {{{}}}".format


def graph_to_dotfile(graph):
    """Render the trace that ends at `graph` as the text of a DOT digraph.

    Nodes are walked depth-first from the output towards the inputs. Arguments
    that are not traced nodes get their own variable node labelled with the
    argument value.
    """
    seen = set()

    def emit(node):
        seen.add(node)
        if node.isroot:
            return dot_variable_node(node, "input")

        pieces = [dot_function_node(node, node.fun_name)]
        for argnum, arg in enumerate(node.args):
            if argnum not in node.parents:
                # Constant argument: give it a node named after its consumer.
                literal = f"{node}_arg_{argnum}"
                pieces.append(dot_edge(literal, node))
                pieces.append(dot_variable_node(literal, arg))
                continue
            parent = node.parents[argnum]
            pieces.append(dot_edge(parent, node))
            if parent not in seen:
                pieces.append(emit(parent))
        return "".join(pieces)

    body = emit(graph)
    body += dot_variable_node("output", "output")
    body += dot_edge(graph, "output")
    return dot_graph(body)


if __name__ == "__main__":

    def fun(x):
        """Small example function whose trace is printed as a DOT graph."""
        y = np.sin(x)
        return (y + np.exp(x) - 0.5) * y

    # The DOT text goes to stdout so it can be piped into the `dot` binary.
    print(graph_to_dotfile(trace_graph(fun, 1.0)))


================================================
FILE: examples/fixed_points.py
================================================
import autograd.numpy as np
from autograd import grad
from autograd.misc.fixed_points import fixed_point


def newton_sqrt_iter(a):
    """Return the update x -> 0.5 * (x + a / x), whose fixed points satisfy x**2 == a."""

    def step(x):
        return 0.5 * (x + a / x)

    return step


def grad_descent_sqrt_iter(a):
    """Return the update x -> x - 0.05 * (x**2 - a), which is stationary where x**2 == a."""

    def step(x):
        return x - 0.05 * (x**2 - a)

    return step


def sqrt(a, guess=10.0):
    """Square root of `a`, obtained as a fixed point of the gradient-descent iteration.

    `guess` is the starting value. `distance` and 1e-4 are passed through to
    `fixed_point`, presumably as the convergence metric and tolerance
    (confirm against autograd.misc.fixed_points).
    """
    # Alternative iteration, kept for experimentation:
    # return fixed_point(newton_sqrt_iter, a, guess, distance, 1e-4)
    return fixed_point(grad_descent_sqrt_iter, a, guess, distance, 1e-4)


def distance(x, y):
    """Absolute difference between `x` and `y`."""
    gap = x - y
    return np.abs(gap)


# Compare numpy's sqrt with the fixed-point version: value, first derivative
# and second derivative, each pair separated by a blank line.
print(np.sqrt(2.0))
print(sqrt(2.0))
print()
print(grad(np.sqrt)(2.0))
print(grad(sqrt)(2.0))
print()
print(grad(grad(np.sqrt))(2.0))
print(grad(grad(sqrt))(2.0))
print()


================================================
FILE: examples/fluidsim/fluidsim.py
================================================
import os

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.pyplot import imread
from scipy.optimize import minimize

import autograd.numpy as np
from autograd import value_and_grad

# Fluid simulation code based on
# "Real-Time Fluid Dynamics for Games" by Jos Stam
# https://www.josstam.com/_files/ugd/cf1fd6_9989229efbd34a26ba5ccd913721a2ac.pdf


def project(vx, vy):
    """Project the velocity field to be approximately mass-conserving.

    A scalar field `p` is relaxed for a fixed number (10) of sweeps against the
    divergence of the velocity, and its centred-difference gradient is then
    subtracted from the velocity. All neighbour accesses use `np.roll`, so the
    domain wraps around at the edges.

    Args:
        vx: 2-D array, velocity component differenced along axis 0.
        vy: 2-D array, velocity component differenced along axis 1.

    Returns:
        Tuple `(vx, vy)` of new arrays. The inputs are left unmodified.
    """
    p = np.zeros(vx.shape)
    # Grid spacing, taken from the size of axis 0.
    h = 1.0 / vx.shape[0]
    div = (
        -0.5
        * h
        * (
            np.roll(vx, -1, axis=0)
            - np.roll(vx, 1, axis=0)
            + np.roll(vy, -1, axis=1)
            - np.roll(vy, 1, axis=1)
        )
    )

    for k in range(10):
        p = (
            div
            + np.roll(p, 1, axis=0)
            + np.roll(p, -1, axis=0)
            + np.roll(p, 1, axis=1)
            + np.roll(p, -1, axis=1)
        ) / 4.0

    # Build new arrays instead of using `-=`: the in-place form would silently
    # overwrite the caller's arrays when they are plain numpy arrays.
    vx = vx - 0.5 * (np.roll(p, -1, axis=0) - np.roll(p, 1, axis=0)) / h
    vy = vy - 0.5 * (np.roll(p, -1, axis=1) - np.roll(p, 1, axis=1)) / h
    return vx, vy


def advect(f, vx, vy):
    """Move field f according to x and y velocities (vx and vy)
    using an implicit Euler integrator.

    For every cell, the field is sampled at the cell position minus the
    velocity, using a linearly weighted sum of the four surrounding cells with
    wrap-around at the edges. Returns an array of shape `f.shape`.
    """
    rows, cols = f.shape
    # NOTE(review): with numpy's default 'xy' indexing these grids have shape
    # (cols, rows); subtracting (rows, cols) velocity fields from them
    # presumably only works for square grids. Confirm before using other shapes.
    cell_ys, cell_xs = np.meshgrid(np.arange(rows), np.arange(cols))
    center_xs = (cell_xs - vx).ravel()
    center_ys = (cell_ys - vy).ravel()

    # Compute indices of source cells.
    # Despite the names, left_ix/right_ix index axis 0 of `f` and
    # top_ix/bot_ix index axis 1 (see the f[...] lookups below).
    left_ix = np.floor(center_xs).astype(int)
    top_ix = np.floor(center_ys).astype(int)
    rw = center_xs - left_ix  # Relative weight of right-hand cells.
    bw = center_ys - top_ix  # Relative weight of bottom cells.
    left_ix = np.mod(left_ix, rows)  # Wrap around edges of simulation.
    right_ix = np.mod(left_ix + 1, rows)
    top_ix = np.mod(top_ix, cols)
    bot_ix = np.mod(top_ix + 1, cols)

    # A linearly-weighted sum of the 4 surrounding cells.
    flat_f = (1 - rw) * ((1 - bw) * f[left_ix, top_ix] + bw * f[left_ix, bot_ix]) + rw * (
        (1 - bw) * f[right_ix, top_ix] + bw * f[right_ix, bot_ix]
    )
    return np.reshape(flat_f, (rows, cols))


def simulate(vx, vy, smoke, num_time_steps, ax=None, render=False):
    """Advance the velocity and smoke fields for `num_time_steps` steps.

    Each step advects both velocity components by the current velocity,
    projects the result, then advects the smoke by the projected velocity.
    When `ax` is truthy the smoke is drawn before every step and once after
    the last one; `render` is forwarded to `plot_matrix`.

    Returns the final smoke field.
    """
    print("Running simulation...")
    for step in range(num_time_steps):
        if ax:
            plot_matrix(ax, smoke, step, render)
        # Both advections read the velocity from before this step.
        vx, vy = project(advect(vx, vx, vy), advect(vy, vx, vy))
        smoke = advect(smoke, vx, vy)
    if ax:
        plot_matrix(ax, smoke, num_time_steps, render)
    return smoke


def plot_matrix(ax, mat, t, render=False):
    """Show `mat` on `ax`; if `render` is true, also save it as step<t>.png.

    `t` is only used to build the file name (zero-padded to three digits).
    """
    # Clears pyplot's current axes before drawing.
    plt.cla()
    ax.matshow(mat)
    ax.set_xticks([])
    ax.set_yticks([])
    plt.draw()
    if render:
        matplotlib.image.imsave(f"step{t:03d}.png", mat)
    # Short pause so the GUI event loop can refresh the window.
    plt.pause(0.001)


if __name__ == "__main__":
    simulation_timesteps = 100
    basepath = os.path.dirname(__file__)

    print("Loading initial and target states...")
    init_smoke = imread(os.path.join(basepath, "init_smoke.png"))[:, :, 0]
    # target = imread('peace.png')[::2,::2,3]
    target = imread(os.path.join(basepath, "skull.png"))[::2, ::2]
    rows, cols = target.shape

    # Flat parameter vector: the vx grid followed by the vy grid.
    init_dx_and_dy = np.zeros((2, rows, cols)).ravel()

    def distance_from_target_image(smoke):
        """Mean squared difference between `smoke` and the target image."""
        return np.mean((target - smoke) ** 2)

    def convert_param_vector_to_matrices(params):
        """Split the flat parameter vector into the (vx, vy) grids."""
        vx = np.reshape(params[: (rows * cols)], (rows, cols))
        vy = np.reshape(params[(rows * cols) :], (rows, cols))
        return vx, vy

    def objective(params):
        """Distance between the simulated final smoke and the target image."""
        init_vx, init_vy = convert_param_vector_to_matrices(params)
        final_smoke = simulate(init_vx, init_vy, init_smoke, simulation_timesteps)
        return distance_from_target_image(final_smoke)

    # Specify gradient of objective function using autograd.
    objective_with_grad = value_and_grad(objective)

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111, frameon=False)

    def callback(params):
        """Optimizer hook: re-run the simulation with plotting enabled."""
        init_vx, init_vy = convert_param_vector_to_matrices(params)
        simulate(init_vx, init_vy, init_smoke, simulation_timesteps, ax)

    print("Optimizing initial conditions...")
    result = minimize(
        objective_with_grad,
        init_dx_and_dy,
        jac=True,
        method="CG",
        options={"maxiter": 25, "disp": True},
        callback=callback,
    )

    print("Rendering optimized flow...")
    init_vx, init_vy = convert_param_vector_to_matrices(result.x)
    simulate(init_vx, init_vy, init_smoke, simulation_timesteps, ax, render=True)

    print("Converting frames to an animated GIF...")
    # Using imagemagick. The last frame (held longer) is named after the number
    # of timesteps, so the command stays correct if simulation_timesteps changes.
    os.system(
        f"convert -delay 5 -loop 0 step*.png -delay 250 step{simulation_timesteps:03d}.png surprise.gif"
    )
    os.system("rm step*.png")


================================================
FILE: examples/fluidsim/wing.py
================================================
import os

import matplotlib.pyplot as plt
from scipy.optimize import minimize

import autograd.numpy as np
from autograd import value_and_grad

# Simulation grid size; read as module-level globals inside simulate().
rows, cols = 40, 60

# Fluid simulation code based on
# "Real-Time Fluid Dynamics for Games" by Jos Stam
# https://www.josstam.com/_files/ugd/cf1fd6_9989229efbd34a26ba5ccd913721a2ac.pdf


def occlude(f, occlusion):
    """Scale `f` by (1 - occlusion), so cells with occlusion 1 become zero."""
    open_fraction = 1 - occlusion
    return f * open_fraction


def project(vx, vy, occlusion):
    """Project the velocity field to be approximately mass-conserving.

    A scalar field is relaxed for a fixed number (50) of sweeps against the
    divergence of the velocity; after every sweep it is passed through
    `make_continuous` with the occlusion mask. Its centred-difference gradient
    is then subtracted from the velocity, and the result is occluded.

    Returns the new `(vx, vy)`.
    """
    pressure = np.zeros(vx.shape)
    divergence = -0.5 * (
        np.roll(vx, -1, axis=1) - np.roll(vx, 1, axis=1) + np.roll(vy, -1, axis=0) - np.roll(vy, 1, axis=0)
    )
    divergence = make_continuous(divergence, occlusion)

    for _ in range(50):
        pressure = (
            divergence
            + np.roll(pressure, 1, axis=1)
            + np.roll(pressure, -1, axis=1)
            + np.roll(pressure, 1, axis=0)
            + np.roll(pressure, -1, axis=0)
        ) / 4.0
        pressure = make_continuous(pressure, occlusion)

    # Centred differences of the relaxed field along each axis.
    grad_axis1 = np.roll(pressure, -1, axis=1) - np.roll(pressure, 1, axis=1)
    grad_axis0 = np.roll(pressure, -1, axis=0) - np.roll(pressure, 1, axis=0)

    vx = occlude(vx - 0.5 * grad_axis1, occlusion)
    vy = occlude(vy - 0.5 * grad_axis0, occlusion)
    return vx, vy


def advect(f, vx, vy):
    """Move field f according to x and y velocities (vx and vy)
    using an implicit Euler integrator.

    For every cell, the field is sampled at the cell position minus the
    velocity, using a linearly weighted sum of the four surrounding cells with
    wrap-around at the edges. Returns an array of shape `f.shape`.
    """
    rows, cols = f.shape
    cell_xs, cell_ys = np.meshgrid(np.arange(cols), np.arange(rows))
    center_xs = (cell_xs - vx).ravel()
    center_ys = (cell_ys - vy).ravel()

    # Compute indices of source cells.
    # Despite the names, left_ix/right_ix come from the y coordinate and index
    # axis 0 of `f`; top_ix/bot_ix come from the x coordinate and index axis 1.
    left_ix = np.floor(center_ys).astype(int)
    top_ix = np.floor(center_xs).astype(int)
    rw = center_ys - left_ix  # Relative weight of right-hand cells.
    bw = center_xs - top_ix  # Relative weight of bottom cells.
    left_ix = np.mod(left_ix, rows)  # Wrap around edges of simulation.
    right_ix = np.mod(left_ix + 1, rows)
    top_ix = np.mod(top_ix, cols)
    bot_ix = np.mod(top_ix + 1, cols)

    # A linearly-weighted sum of the 4 surrounding cells.
    flat_f = (1 - rw) * ((1 - bw) * f[left_ix, top_ix] + bw * f[left_ix, bot_ix]) + rw * (
        (1 - bw) * f[right_ix, top_ix] + bw * f[right_ix, bot_ix]
    )
    return np.reshape(flat_f, (rows, cols))


def make_continuous(f, occlusion):
    """Blend `f` with a mask-weighted average of its four neighbours, weighted by `occlusion`.

    Where occlusion is 0 the value of `f` is kept. Where it is 1 the value is
    replaced by the sum of the neighbouring values weighted by their
    non-occlusion, divided by the summed weights plus 0.001 (which avoids a
    division by zero when every neighbour is occluded).
    """
    non_occluded = 1 - occlusion

    def shifted(field, step, axis):
        return np.roll(field, step, axis=axis)

    num = (
        shifted(f, 1, 0) * shifted(non_occluded, 1, 0)
        + shifted(f, -1, 0) * shifted(non_occluded, -1, 0)
        + shifted(f, 1, 1) * shifted(non_occluded, 1, 1)
        + shifted(f, -1, 1) * shifted(non_occluded, -1, 1)
    )
    den = (
        shifted(non_occluded, 1, 0)
        + shifted(non_occluded, -1, 0)
        + shifted(non_occluded, 1, 1)
        + shifted(non_occluded, -1, 1)
    )
    kept = f * non_occluded
    filled = (1 - non_occluded) * num / (den + 0.001)
    return kept + filled


def sigmoid(x):
    """Rescaled tanh: 0.5 * (tanh(x) + 1), so the output ranges from 0 to 1."""
    squashed = np.tanh(x)
    return 0.5 * (squashed + 1.0)


def simulate(vx, vy, num_time_steps, occlusion, ax=None, render=False):
    """Run the flow around an occluding shape for `num_time_steps` steps.

    `occlusion` is squashed with `sigmoid` and restricted to the block
    [10:30, 10:30] of the (rows, cols) grid. Two bands of smoke are advected
    along with the flow and drawn through `plot_matrix`.

    Returns the final `(vx, vy)`.
    """
    occlusion = sigmoid(occlusion)

    # Disallow occlusion outside a certain area.
    allowed = np.zeros((rows, cols))
    allowed[10:30, 10:30] = 1.0
    occlusion = occlusion * allowed

    # Initialize smoke bands.
    red_smoke = np.zeros((rows, cols))
    red_smoke[rows // 4 : rows // 2] = 1
    blue_smoke = np.zeros((rows, cols))
    blue_smoke[rows // 2 : 3 * rows // 4] = 1

    print("Running simulation...")
    vx, vy = project(vx, vy, occlusion)
    for step in range(num_time_steps):
        plot_matrix(ax, red_smoke, occlusion, blue_smoke, step, render)
        # Both advections read the velocity from before this step.
        vx, vy = project(advect(vx, vx, vy), advect(vy, vx, vy), occlusion)
        red_smoke = occlude(advect(red_smoke, vx, vy), occlusion)
        blue_smoke = occlude(advect(blue_smoke, vx, vy), occlusion)
    plot_matrix(ax, red_smoke, occlusion, blue_smoke, num_time_steps, render)
    return vx, vy


def plot_matrix(ax, r, g, b, t, render=False):
    """Draw three 2-D arrays as the red, green and blue channels of one image.

    Does nothing when ``ax`` is falsy. When ``render`` is true the current
    figure is additionally saved as ``step<t>.png`` (``t`` zero-padded to three
    digits).
    """
    if not ax:
        return

    plt.cla()
    # Give each channel a trailing axis so they can be joined into (..., 3).
    channels = tuple(channel[..., np.newaxis] for channel in (r, g, b))
    ax.imshow(np.concatenate(channels, axis=2))
    ax.set_xticks([])
    ax.set_yticks([])
    plt.draw()
    if render:
        plt.savefig(f"step{t:03d}.png", bbox_inches="tight")
    plt.pause(0.001)


if __name__ == "__main__":
    # Script entry point: optimise the occlusion field with conjugate gradient
    # so that the simulated flow minimises -lift/drag, drawing the flow before,
    # during and after optimisation and finally assembling an animated GIF.
    simulation_timesteps = 20

    print("Loading initial and target states...")
    # Initial velocity field: vx is 1 everywhere, vy is 0 everywhere.
    init_vx = np.ones((rows, cols))
    init_vy = np.zeros((rows, cols))

    # Initialize the occlusion to be a block.
    # These are pre-sigmoid values; simulate() applies sigmoid() to them.
    init_occlusion = -np.ones((rows, cols))
    init_occlusion[15:25, 15:25] = 0.0
    # Flattened because it is handed to minimize() as the parameter vector.
    init_occlusion = init_occlusion.ravel()

    def drag(vx):
        """Mean amount by which ``vx`` fell below the initial ``init_vx``."""
        return np.mean(init_vx - vx)

    def lift(vy):
        """Mean amount by which ``vy`` rose above the initial ``init_vy``."""
        return np.mean(vy - init_vy)

    def objective(params):
        """Negative lift-to-drag ratio of the flow produced by ``params``."""
        cur_occlusion = np.reshape(params, (rows, cols))
        # No axes are passed, so this simulation run does not draw anything.
        final_vx, final_vy = simulate(init_vx, init_vy, simulation_timesteps, cur_occlusion)
        return -lift(final_vy) / drag(final_vx)

    # Specify gradient of objective function using autograd.
    objective_with_grad = value_and_grad(objective)

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111, frameon=False)

    def callback(weights):
        """Re-run the simulation for ``weights`` with drawing enabled."""
        cur_occlusion = np.reshape(weights, (rows, cols))
        simulate(init_vx, init_vy, simulation_timesteps, cur_occlusion, ax)

    print("Rendering initial flow...")
    callback(init_occlusion)

    print("Optimizing initial conditions...")
    # jac=True: objective_with_grad returns the (value, gradient) pair.
    result = minimize(
        objective_with_grad,
        init_occlusion,
        jac=True,
        method="CG",
        options={"maxiter": 50, "disp": True},
        callback=callback,
    )

    print("Rendering optimized flow...")
    final_occlusion = np.reshape(result.x, (rows, cols))
    # render=True makes plot_matrix save one step###.png per frame.
    simulate(init_vx, init_vy, simulation_timesteps, final_occlusion, ax, render=True)

    print("Converting frames to an animated GIF...")  # Using imagemagick.
    # Shells out to the `convert` and `rm` commands, so both must be available
    # on the PATH; the last frame is held longer (-delay 250).
    os.system(f"convert -delay 5 -loop 0 step*.png -delay 250 step{simulation_timesteps:03d}.png wing.gif")
    os.system("rm step*.png")


================================================
FILE: examples/gaussian_process.py
================================================
import matplotlib.pyplot as plt
from scipy.optimize import minimize

import autograd.numpy as np
import autograd.numpy.random as npr
import autograd.scipy.stats.multivariate_normal as mvn
from autograd import value_and_grad
from autograd.numpy.linalg import solve


def make_gp_funs(cov_func, num_cov_params):
    """Functions that perform Gaussian process regression.

    cov_func has signature (cov_params, x, x') and returns the covariance
    matrix between the rows of ``x`` and the rows of ``x'``.

    The flat parameter vector consumed by the returned functions is laid out as
    ``[mean, log_noise, *cov_params]``.

    Args:
        cov_func: covariance (kernel) function as described above.
        num_cov_params: number of entries ``cov_func`` expects in ``cov_params``.

    Returns:
        ``(num_params, predict, log_marginal_likelihood)`` where ``num_params``
        is ``num_cov_params + 2`` (the total length of the parameter vector).
    """

    def unpack_kernel_params(params):
        """Split the flat vector into (mean, cov_params, noise_scale)."""
        mean = params[0]
        cov_params = params[2:]
        # exp keeps the noise term positive; the small constant keeps the
        # diagonal added to cov_y_y strictly above zero.
        noise_scale = np.exp(params[1]) + 0.0001
        return mean, cov_params, noise_scale

    def predict(params, x, y, xstar):
        """Returns the predictive mean and covariance at locations xstar,
        of the latent function value f (without observation noise)."""
        mean, cov_params, noise_scale = unpack_kernel_params(params)
        cov_f_f = cov_func(cov_params, xstar, xstar)
        cov_y_f = cov_func(cov_params, x, xstar)
        cov_y_y = cov_func(cov_params, x, x) + noise_scale * np.eye(len(y))
        # Solve the linear system once; both the mean and the covariance need
        # (cov_y_y^{-1} cov_y_f)^T.
        solved_t = solve(cov_y_y, cov_y_f).T
        pred_mean = mean + np.dot(solved_t, y - mean)
        pred_cov = cov_f_f - np.dot(solved_t, cov_y_f)
        return pred_mean, pred_cov

    def log_marginal_likelihood(params, x, y):
        """Log density of ``y`` under the GP prior plus observation noise."""
        mean, cov_params, noise_scale = unpack_kernel_params(params)
        cov_y_y = cov_func(cov_params, x, x) + noise_scale * np.eye(len(y))
        prior_mean = mean * np.ones(len(y))
        return mvn.logpdf(y, prior_mean, cov_y_y)

    return num_cov_params + 2, predict, log_marginal_likelihood


# Define an example covariance function.
def rbf_covariance(kernel_params, x, xp):
    """Squared-exponential (RBF) covariance between the rows of ``x`` and ``xp``.

    Args:
        kernel_params: ``[log_output_scale, log_lengthscale_1, ...]``; every
            entry is exponentiated before use.
        x: array of shape ``(n, D)``.
        xp: array of shape ``(m, D)``.

    Returns:
        An ``(n, m)`` covariance matrix.
    """
    output_scale = np.exp(kernel_params[0])
    lengthscales = np.exp(kernel_params[1:])

    # Insert singleton axes so the subtraction broadcasts to (n, m, D).
    scaled_x = np.expand_dims(x / lengthscales, 1)
    scaled_xp = np.expand_dims(xp / lengthscales, 0)
    squared_distances = np.sum((scaled_x - scaled_xp) ** 2, axis=2)

    return output_scale * np.exp(-0.5 * squared_distances)


def build_toy_dataset(D=1, n_data=20, noise_std=0.1):
    """Build a small 1-D regression dataset with a gap in the inputs.

    Inputs are drawn on two linspace segments (0..3 and 6..8), targets are a
    noisy cosine of the raw inputs divided by two, and the inputs are then
    shifted and rescaled via ``(inputs - 4) / 2``.

    Args:
        D: number of input columns used in the final reshape. The generator
            only produces ``n_data`` scalars, so the reshape succeeds only for
            ``D == 1``.
        n_data: total number of points. For odd values the second segment
            receives the extra point, so exactly ``n_data`` points are returned.
        noise_std: standard deviation of the Gaussian noise added to the cosine
            before the division by two.

    Returns:
        ``(inputs, targets)`` with shapes ``(n_data, D)`` and ``(n_data,)``.
    """
    rs = npr.RandomState(0)
    # Split so the two segments always add up to n_data; using n_data // 2 for
    # both halves drops a point for odd n_data and then fails to broadcast
    # against the n_data noise draws below.
    n_first = n_data // 2
    n_second = n_data - n_first
    inputs = np.concatenate([np.linspace(0, 3, num=n_first), np.linspace(6, 8, num=n_second)])
    targets = (np.cos(inputs) + rs.randn(n_data) * noise_std) / 2.0
    inputs = (inputs - 4.0) / 2.0
    inputs = inputs.reshape((len(inputs), D))
    return inputs, targets


if __name__ == "__main__":
    # Script entry point: fit GP hyperparameters to a toy dataset by minimising
    # the negative log marginal likelihood, redrawing the posterior after each
    # optimiser iteration.
    D = 1

    # Build model and objective function.
    # rbf_covariance reads one output scale plus one lengthscale per input
    # dimension, hence D + 1 covariance parameters.
    num_params, predict, log_marginal_likelihood = make_gp_funs(rbf_covariance, num_cov_params=D + 1)

    X, y = build_toy_dataset(D=D)
    objective = lambda params: -log_marginal_likelihood(params, X, y)

    # Set up figure.
    fig = plt.figure(figsize=(12, 8), facecolor="white")
    ax = fig.add_subplot(111, frameon=False)
    plt.show(block=False)

    def callback(params):
        """Print the current log likelihood and redraw the GP posterior."""
        print(f"Log likelihood {-objective(params)}")
        plt.cla()

        # Show posterior marginals.
        plot_xs = np.reshape(np.linspace(-7, 7, 300), (300, 1))
        pred_mean, pred_cov = predict(params, X, y, plot_xs)
        marg_std = np.sqrt(np.diag(pred_cov))
        ax.plot(plot_xs, pred_mean, "b")
        # Shaded band of +/- 1.96 marginal standard deviations around the mean,
        # drawn as one closed polygon (forward along the lower edge, back along
        # the upper edge).
        ax.fill(
            np.concatenate([plot_xs, plot_xs[::-1]]),
            np.concatenate([pred_mean - 1.96 * marg_std, (pred_mean + 1.96 * marg_std)[::-1]]),
            alpha=0.15,
            fc="Blue",
            ec="None",
        )

        # Show samples from posterior.
        # A fresh RandomState(0) on every call keeps the random stream
        # identical from one redraw to the next.
        rs = npr.RandomState(0)
        sampled_funcs = rs.multivariate_normal(pred_mean, pred_cov, size=10)
        ax.plot(plot_xs, sampled_funcs.T)

        ax.plot(X, y, "kx")
        ax.set_ylim([-1.5, 1.5])
        ax.set_xticks([])
        ax.set_yticks([])
        plt.draw()
        plt.pause(1.0 / 60.0)

    # Initialize covariance parameters
    rs = npr.RandomState(0)
    init_params = 0.1 * rs.randn(num_params)

    print("Optimizing covariance parameters...")
    # jac=True: value_and_grad(objective) returns the (value, gradient) pair.
    # The return value of minimize is stored but not used further in this block.
    cov_params = minimize(value_and_grad(objective), init_params, jac=True, method="CG", callback=callback)
    plt.pause(10.0)


================================================
FILE: examples/generative_adversarial_net.py
================================================
# Implements a Generative Adversarial Network, from
# arxiv.org/abs/1406.2661
# but, it always collapses to generating a single image.
# Let me know if you can get it to work! - David Duvenaud

from data import load_mnist, save_images

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad
from autograd.misc import flatten

### Define generator, discriminator, and objective ###


def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    floor = 0
    return np.maximum(floor, x)


def sigmoid(x):
    """Map ``x`` element-wise into the range 0..1 via ``0.5 * (tanh(x) + 1)``."""
    shifted = np.tanh(x) + 1.0
    return 0.5 * shifted


def logsigmoid(x):
    """Log of the logistic function, computed as ``x - log(1 + exp(x))``."""
    log_one_plus_exp = np.logaddexp(0, x)
    return x - log_one_plus_exp


def init_random_params(scale, layer_sizes, rs=npr.RandomState(0)):
    """Create Gaussian-initialised parameters for a fully connected net.

    Args:
        scale: multiplier applied to every standard-normal draw.
        layer_sizes: unit counts per layer; consecutive pairs ``(m, n)`` define
            one layer each.
        rs: random state to draw from. The default instance is created once at
            definition time, so calls relying on it share (and advance) the
            same stream.

    Returns:
        A list with one ``(weights, biases)`` tuple per layer, of shapes
        ``(m, n)`` and ``(n,)``.
    """
    layers = []
    for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:]):
        weights = scale * rs.randn(n_in, n_out)  # weight matrix
        biases = scale * rs.randn(n_out)  # bias vector
        layers.append((weights, biases))
    return layers


def batch_normalize(activations):
    """Centre each column on its batch mean and divide by ``std + 1``.

    Statistics are taken over axis 0. The ``+ 1`` in the denominator means the
    result is damped rather than scaled to unit variance.
    """
    centered = activations - np.mean(activations, axis=0, keepdims=True)
    spread = np.std(activations, axis=0, keepdims=True) + 1
    return centered / spread


def neural_net_predict(params, inputs):
    """Forward pass of a fully connected network.

    Params is a list of (weights, bias) tuples; inputs is an (N x D) matrix.
    The first layer is affine + relu, every middle layer is affine +
    batch_normalize + relu, and the last layer is affine only.
    """
    first_W, first_b = params[0]
    hidden = relu(np.dot(inputs, first_W) + first_b)

    for layer_W, layer_b in params[1:-1]:
        hidden = relu(batch_normalize(np.dot(hidden, layer_W) + layer_b))

    last_W, last_b = params[-1]
    return np.dot(hidden, last_W) + last_b


def generate_from_noise(gen_params, num_samples, noise_dim, rs):
    """Draw uniform noise from ``rs`` and push it through the generator net.

    Returns the generator outputs squashed by ``sigmoid``, one row per sample.
    """
    latent_noise = rs.rand(num_samples, noise_dim)
    return sigmoid(neural_net_predict(gen_params, latent_noise))


def gan_objective(gen_params, dsc_params, real_data, num_samples, noise_dim, rs):
    """Mean discriminator log-sigmoid on real data minus that on generated data.

    ``num_samples`` fake examples are generated from noise drawn from ``rs``
    and scored by the same discriminator as ``real_data``.
    """
    fake_data = generate_from_noise(gen_params, num_samples, noise_dim, rs)
    mean_logprob_fake = np.mean(logsigmoid(neural_net_predict(dsc_params, fake_data)))
    mean_logprob_real = np.mean(logsigmoid(neural_net_predict(dsc_params, real_data)))
    return mean_logprob_real - mean_logprob_fake


### Define minimax version of adam optimizer ###


def adam_minimax(
    grad_both,
    init_params_max,
    init_params_min,
    callback=None,
    num_iters=100,
    step_size_max=0.001,
    step_size_min=0.001,
    b1=0.9,
    b2=0.999,
    eps=10**-8,
):
    """Adam modified to do minimax optimization, for instance to help with
    training generative adversarial networks.

    Two parameter sets are updated simultaneously from the same gradient
    evaluation: the "max" set takes an ascent step and the "min" set takes a
    descent step. Each set has its own Adam moment estimates and step size.

    Args:
        grad_both: called as ``grad_both(params_max, params_min, i)``; must
            return the pair ``(grad_wrt_max, grad_wrt_min)``.
        init_params_max: initial value of the parameters that are moved along
            their gradient.
        init_params_min: initial value of the parameters that are moved
            against their gradient.
        callback: optional; called every iteration, before the update, as
            ``callback(params_max, params_min, i, grad_max, grad_min)``.
        num_iters: number of iterations.
        step_size_max, step_size_min: step sizes for the two parameter sets.
        b1, b2: decay rates of the first and second moment estimates.
        eps: constant added to the denominator of the update.

    Returns:
        ``(params_max, params_min)`` in the same structure as the inputs.
    """

    # Work on flat vectors; unflatten_* restore the original structure.
    x_max, unflatten_max = flatten(init_params_max)
    x_min, unflatten_min = flatten(init_params_min)

    m_max = np.zeros(len(x_max))
    v_max = np.zeros(len(x_max))
    m_min = np.zeros(len(x_min))
    v_min = np.zeros(len(x_min))
    for i in range(num_iters):
        g_max_uf, g_min_uf = grad_both(unflatten_max(x_max), unflatten_min(x_min), i)
        g_max, _ = flatten(g_max_uf)
        g_min, _ = flatten(g_min_uf)

        if callback:
            callback(
                unflatten_max(x_max), unflatten_min(x_min), i, unflatten_max(g_max), unflatten_min(g_min)
            )

        m_max = (1 - b1) * g_max + b1 * m_max  # First  moment estimate.
        v_max = (1 - b2) * (g_max**2) + b2 * v_max  # Second moment estimate.
        mhat_max = m_max / (1 - b1 ** (i + 1))  # Bias correction.
        vhat_max = v_max / (1 - b2 ** (i + 1))
        # Ascent step (note the plus sign) for the maximising parameters.
        x_max = x_max + step_size_max * mhat_max / (np.sqrt(vhat_max) + eps)

        m_min = (1 - b1) * g_min + b1 * m_min  # First  moment estimate.
        v_min = (1 - b2) * (g_min**2) + b2 * v_min  # Second moment estimate.
        mhat_min = m_min / (1 - b1 ** (i + 1))  # Bias correction.
        vhat_min = v_min / (1 - b2 ** (i + 1))
        # Descent step (note the minus sign) for the minimising parameters.
        x_min = x_min - step_size_min * mhat_min / (np.sqrt(vhat_min) + eps)
    return unflatten_max(x_max), unflatten_min(x_min)


### Setup and run on MNIST ###

if __name__ == "__main__":
    # Script entry point: train a generator/discriminator pair on MNIST with
    # the minimax Adam optimizer, printing progress and saving sample images.

    # Model hyper-parameters
    noise_dim = 10
    gen_layer_sizes = [noise_dim, 200, 784]
    dsc_layer_sizes = [784, 200, 1]

    # Training parameters
    param_scale = 0.001
    batch_size = 100
    num_epochs = 50
    step_size_max = 0.01
    step_size_min = 0.01

    print("Loading training data...")
    N, train_images, _, test_images, _ = load_mnist()

    # Both calls use init_random_params' default random state, which is a
    # single shared instance, so the second call continues the same stream.
    init_gen_params = init_random_params(param_scale, gen_layer_sizes)
    init_dsc_params = init_random_params(param_scale, dsc_layer_sizes)

    num_batches = int(np.ceil(len(train_images) / batch_size))

    def batch_indices(iter):
        """Slice selecting the minibatch for iteration ``iter`` (wraps around)."""
        idx = iter % num_batches
        return slice(idx * batch_size, (idx + 1) * batch_size)

    # Define training objective
    # Despite its name, `seed` is a RandomState instance shared by the
    # objective and by print_perf.
    seed = npr.RandomState(0)

    def objective(gen_params, dsc_params, iter):
        """GAN objective evaluated on the minibatch for iteration ``iter``."""
        idx = batch_indices(iter)
        return gan_objective(gen_params, dsc_params, train_images[idx], batch_size, noise_dim, seed)

    # Get gradients of objective using autograd.
    # argnum=(0, 1): differentiate with respect to both parameter sets.
    both_objective_grad = grad(objective, argnum=(0, 1))

    print("     Epoch     |    Objective  |       Fake probability | Real Probability  ")

    def print_perf(gen_params, dsc_params, iter, gen_gradient, dsc_gradient):
        """Every 10th iteration, print diagnostics and save generated samples."""
        if iter % 10 == 0:
            ability = np.mean(objective(gen_params, dsc_params, iter))
            fake_data = generate_from_noise(gen_params, 20, noise_dim, seed)
            real_data = train_images[batch_indices(iter)]
            probs_fake = np.mean(sigmoid(neural_net_predict(dsc_params, fake_data)))
            probs_real = np.mean(sigmoid(neural_net_predict(dsc_params, real_data)))
            print(f"{iter // num_batches:15}|{ability:20}|{probs_fake:20}|{probs_real:20}")
            save_images(fake_data, "gan_samples.png", vmin=0, vmax=1)

    # The optimizers provided can optimize lists, tuples, or dicts of parameters.
    # The generator parameters are passed as the maximising set and the
    # discriminator parameters as the minimising set.
    optimized_params = adam_minimax(
        both_objective_grad,
        init_gen_params,
        init_dsc_params,
        step_size_max=step_size_max,
        step_size_min=step_size_min,
        num_iters=num_epochs * num_batches,
        callback=print_perf,
    )


================================================
FILE: examples/gmm.py
================================================
"""Implements a Gaussian mixture model, in which parameters are fit using
gradient descent.  This example runs on 2-dimensional data, but the model
works on arbitrarily-high dimension."""

import matplotlib.pyplot as plt
from data import make_pinwheel
from scipy.optimize import minimize

import autograd.numpy as np
import autograd.numpy.random as npr
import autograd.scipy.stats.multivariate_normal as mvn
from autograd import grad, hessian_vector_product
from autograd.misc.flatten import flatten_func
from autograd.scipy.special import logsumexp


def init_gmm_params(num_components, D, scale, rs=npr.RandomState(0)):
    """Create initial parameters for a ``num_components``-component mixture.

    Returns a dict with keys ``"log proportions"`` (``scale``-sized Gaussian
    noise, shape ``(num_components,)``), ``"means"`` (same noise, shape
    ``(num_components, D)``) and ``"lower triangles"`` (one ``D x D`` identity
    matrix per component). The default ``rs`` is created once at definition
    time and is shared between calls that rely on it.
    """
    log_proportions = rs.randn(num_components) * scale
    means = rs.randn(num_components, D) * scale
    # Broadcasting the identity against zeros gives one copy per component.
    identity_stack = np.zeros((num_components, D, D)) + np.eye(D)
    return {
        "log proportions": log_proportions,
        "means": means,
        "lower triangles": identity_stack,
    }


def log_normalize(x):
    """Shift log-weights ``x`` so that their exponentials sum to one."""
    log_total = logsumexp(x)
    return x - log_total


def unpack_gmm_params(params):
    """Return ``(normalized log proportions, means, lower triangles)``.

    Only the log proportions are transformed (via ``log_normalize``); the
    other two entries are passed through unchanged.
    """
    means = params["means"]
    lower_triangles = params["lower triangles"]
    normalized_log_proportions = log_normalize(params["log proportions"])
    return normalized_log_proportions, means, lower_triangles


def gmm_log_likelihood(params, data):
    """Total log likelihood of ``data`` under the Gaussian mixture ``params``.

    Each component's covariance is built as ``cov_sqrt.T @ cov_sqrt``; the
    per-point log densities are combined across components with ``logsumexp``
    and then summed over the data points.
    """
    log_proportions, means, cov_sqrts = unpack_gmm_params(params)
    cluster_lls = [
        log_proportion + mvn.logpdf(data, mean, np.dot(cov_sqrt.T, cov_sqrt))
        for log_proportion, mean, cov_sqrt in zip(log_proportions, means, cov_sqrts)
    ]
    per_point_ll = logsumexp(np.vstack(cluster_lls), axis=0)
    return np.sum(per_point_ll)


def plot_ellipse(ax, mean, cov_sqrt, alpha, num_points=100):
    """Draw the image of a radius-2 circle under ``cov_sqrt``, shifted by ``mean``.

    Args:
        ax: object with a matplotlib-style ``plot`` method.
        mean: 2-vector giving the ellipse centre.
        cov_sqrt: 2x2 matrix applied to the circle points (from the right).
        alpha: line transparency forwarded to ``ax.plot``.
        num_points: number of points sampled along the circle.
    """
    angles = np.linspace(0, 2 * np.pi, num_points)
    unit_circle = np.vstack([np.cos(angles), np.sin(angles)])
    circle_pts = unit_circle.T * 2.0
    ellipse_pts = mean + np.dot(circle_pts, cov_sqrt)
    xs, ys = ellipse_pts[:, 0], ellipse_pts[:, 1]
    ax.plot(xs, ys, "-", alpha=alpha)


def plot_gaussian_mixture(params, ax):
    """Draw one ellipse per mixture component on ``ax``.

    The opacity of each ellipse is ten times the component's mixing
    proportion, capped at 1.
    """
    log_proportions, means, cov_sqrts = unpack_gmm_params(params)
    for log_proportion, mean, cov_sqrt in zip(log_proportions, means, cov_sqrts):
        opacity = np.minimum(1.0, np.exp(log_proportion) * 10)
        plot_ellipse(ax, mean, cov_sqrt, opacity)


if __name__ == "__main__":
    # Script entry point: fit a 10-component Gaussian mixture to 2-D pinwheel
    # data with Newton-CG, redrawing the mixture after every iteration.
    init_params = init_gmm_params(num_components=10, D=2, scale=0.1)

    data = make_pinwheel(radial_std=0.3, tangential_std=0.05, num_classes=3, num_per_class=100, rate=0.4)

    def objective(params):
        """Negative log likelihood of ``data`` under the mixture ``params``."""
        return -gmm_log_likelihood(params, data)

    # scipy's minimize works on flat vectors, so wrap the dict-based objective;
    # `unflatten` turns a flat vector back into the parameter dict.
    flattened_obj, unflatten, flattened_init_params = flatten_func(objective, init_params)

    fig = plt.figure(figsize=(12, 8), facecolor="white")
    ax = fig.add_subplot(111, frameon=False)
    plt.show(block=False)

    def callback(flattened_params):
        """Print the current log likelihood and redraw data plus mixture."""
        params = unflatten(flattened_params)
        print(f"Log likelihood {-objective(params)}")
        ax.cla()
        ax.plot(data[:, 0], data[:, 1], "k.")
        ax.set_xticks([])
        ax.set_yticks([])
        plot_gaussian_mixture(params, ax)
        plt.draw()
        plt.pause(1.0 / 60.0)

    # Newton-CG is given both the gradient and a Hessian-vector product, each
    # obtained from autograd.
    minimize(
        flattened_obj,
        flattened_init_params,
        jac=grad(flattened_obj),
        hessp=hessian_vector_product(flattened_obj),
        method="Newton-CG",
        callback=callback,
    )


================================================
FILE: examples/gplvm.py
================================================
# Implements a Gaussian process latent-variable model.
# The (high-dimensional) data, Y is explained by some low-dimensional latent
# data X, warped by a function drawn from a GP prior (f).  So Y = f(X), but
# we don't know X or f.
#
# In this example, we optimize X and the hyperparameters of the GP, but
# we integrate over all possible functions f.
#
# Normally the observed data would be high-dimensional.
#
# David Duvenaud (duvenaud@gmail.com)


import matplotlib.pyplot as plt
from data import make_pinwheel
from gaussian_process import make_gp_funs, rbf_covariance
from scipy.optimize import minimize

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import value_and_grad
from autograd.scipy.stats import norm

if __name__ == "__main__":
    # Script entry point: jointly optimise GP hyperparameters and latent
    # coordinates for pinwheel data, plotting latents and data each iteration.
    data_dimension = 2  # Normally the data dimension would be much higher.
    latent_dimension = 2

    # Build model and objective function.
    # rbf_covariance reads one output scale plus one lengthscale per latent
    # dimension, hence latent_dimension + 1 covariance parameters.
    params_per_gp, predict, log_marginal_likelihood = make_gp_funs(
        rbf_covariance, num_cov_params=latent_dimension + 1
    )
    # One independent GP (with its own hyperparameters) per observed dimension.
    total_gp_params = data_dimension * params_per_gp

    data = make_pinwheel(radial_std=0.3, tangential_std=0.05, num_classes=3, num_per_class=30, rate=0.4)
    datalen = data.shape[0]

    num_latent_params = datalen * latent_dimension

    def unpack_params(params):
        """Split the flat vector into per-GP hyperparameters and latents.

        The first ``total_gp_params`` entries become a
        ``(data_dimension, params_per_gp)`` array; the rest become a
        ``(datalen, latent_dimension)`` array of latent coordinates.
        """
        gp_params = np.reshape(params[:total_gp_params], (data_dimension, params_per_gp))
        latents = np.reshape(params[total_gp_params:], (datalen, latent_dimension))
        return gp_params, latents

    def objective(params):
        """Negative of (sum of GP marginal likelihoods + latent log prior)."""
        gp_params, latents = unpack_params(params)
        # The latents act as the GP inputs; each data column is one GP's targets.
        gp_likelihood = sum(
            [log_marginal_likelihood(gp_params[i], latents, data[:, i]) for i in range(data_dimension)]
        )
        # norm.logpdf is called with its default location and scale.
        latent_prior_likelihood = np.sum(norm.logpdf(latents))
        return -gp_likelihood - latent_prior_likelihood

    # Set up figure.
    fig = plt.figure(figsize=(12, 8), facecolor="white")
    latent_ax = fig.add_subplot(121, frameon=False)
    data_ax = fig.add_subplot(122, frameon=False)
    plt.show(block=False)

    def callback(params):
        """Print the current objective and redraw both panels."""
        print(f"Log likelihood {-objective(params)}")
        gp_params, latents = unpack_params(params)

        data_ax.cla()
        data_ax.plot(data[:, 0], data[:, 1], "bx")
        data_ax.set_xticks([])
        data_ax.set_yticks([])
        data_ax.set_title("Observed Data")

        latent_ax.cla()
        latent_ax.plot(latents[:, 0], latents[:, 1], "kx")
        latent_ax.set_xticks([])
        latent_ax.set_yticks([])
        latent_ax.set_xlim([-2, 2])
        latent_ax.set_ylim([-2, 2])
        latent_ax.set_title("Latent coordinates")

        plt.draw()
        plt.pause(1.0 / 60.0)

    # Initialize covariance parameters
    # (the same vector also holds the initial latent coordinates).
    rs = npr.RandomState(1)
    init_params = rs.randn(total_gp_params + num_latent_params) * 0.1

    print("Optimizing covariance parameters and latent variable locations...")
    # jac=True: value_and_grad(objective) returns the (value, gradient) pair.
    minimize(value_and_grad(objective), init_params, jac=True, method="CG", callback=callback)


================================================
FILE: examples/hmm_em.py
================================================
import string
from functools import partial
from os.path import dirname, join

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import value_and_grad as vgrad
from autograd.scipy.special import logsumexp


def EM(init_params, data, callback=None):
    """Run EM for the HMM until the parameters stop changing.

    Each update takes the log of the current parameters, differentiates
    ``log_partition_function`` with respect to them to obtain the expected
    statistics (E step) and normalizes those statistics (M step). Iteration
    stops once every parameter array is ``np.allclose`` (atol=rtol=1e-3) to its
    predecessor.

    Args:
        init_params: sequence of initial parameter arrays.
        data: observations passed through to ``log_partition_function``.
        callback: optional; called as ``callback(loglike, params)`` once per
            update with the log likelihood of the parameters being updated.

    Returns:
        The parameters produced by the last update.
    """
    nearly_equal = partial(np.allclose, atol=1e-3, rtol=1e-3)

    def different(params1, params2):
        return not all(map(nearly_equal, params1, params2))

    def EM_update(params):
        natural_params = [np.log(param) for param in params]
        loglike, E_stats = vgrad(log_partition_function)(natural_params, data)  # E step
        if callback:
            callback(loglike, params)
        return [normalize(stats) for stats in E_stats]  # M step

    # Fixed-point iteration of EM_update, starting from init_params.
    previous, current = init_params, EM_update(init_params)
    while different(previous, current):
        previous, current = current, EM_update(current)
    return current


def normalize(a):
    """Divide ``a`` by its sum along the last axis.

    Sums that are not strictly positive are replaced by 1 before dividing, so
    an all-zero row is returned unchanged instead of producing NaNs.
    """
    totals = a.sum(-1, keepdims=True)
    safe_totals = np.where(totals > 0.0, totals, 1.0)
    return a / safe_totals


def log_partition_function(natural_params, data):
    """Log likelihood of the observations under the HMM.

    Args:
        natural_params: ``(log_pi, log_A, log_B)`` -- logs of the initial
            state distribution, the transition matrix and the emission matrix.
        data: one sequence of observation indices, or a ``list`` of such
            sequences, in which case the per-sequence values are summed.

    Returns:
        The log of the summed forward messages after the last observation.
    """
    if isinstance(data, list):
        return sum(log_partition_function(natural_params, sequence) for sequence in data)

    log_pi, log_A, log_B = natural_params

    # Forward recursion in log space: a transition is applied before every
    # emission, including the first one.
    log_alpha = log_pi
    for y in data:
        propagated = logsumexp(log_alpha[:, None] + log_A, axis=0)
        log_alpha = propagated + log_B[:, y]

    return logsumexp(log_alpha)


def initialize_hmm_parameters(num_states, num_outputs):
    """Draw random, row-normalized HMM parameters.

    Returns ``(init_pi, init_A, init_B)`` with shapes ``(num_states,)``,
    ``(num_states, num_states)`` and ``(num_states, num_outputs)``. The draws
    come from the module-level ``npr`` generator, in that order.
    """
    shapes = ((num_states,), (num_states, num_states), (num_states, num_outputs))
    init_pi, init_A, init_B = [normalize(npr.rand(*shape)) for shape in shapes]
    return init_pi, init_A, init_B


def build_dataset(filename, max_lines=-1):
    """Loads a text file, and turns each line into an encoded sequence.

    Every character is mapped to its index in ``string.printable``; characters
    outside that set share one extra "unknown" code equal to
    ``len(string.printable)``. Lines of two characters or fewer (line ending
    included) are treated as blank and skipped.

    Args:
        filename: path of the text file to read.
        max_lines: maximum number of non-blank lines to keep. A negative value
            (the default) or ``None`` keeps all of them.

    Returns:
        ``(encoded_lines, num_outputs)``: a list with one integer array per
        kept line, and the number of distinct codes (printable characters plus
        the unknown code).
    """
    encodings = {char: index for index, char in enumerate(string.printable)}
    unknown_code = len(encodings)

    def encode_line(line):
        return np.array([encodings.get(char, unknown_code) for char in line])

    with open(filename) as f:
        nonblank_lines = [line for line in f if len(line) > 2]

    # Slicing with the -1 default would silently drop the last line, so only
    # apply a limit when it is non-negative.
    if max_lines is not None and max_lines >= 0:
        nonblank_lines = nonblank_lines[:max_lines]

    encoded_lines = [encode_line(line) for line in nonblank_lines]
    num_outputs = len(encodings) + 1

    return encoded_lines, num_outputs


if __name__ == "__main__":
    # Script entry point: fit a 20-state HMM with EM to the characters of
    # lstm.py, printing the log likelihood at every EM update.
    np.random.seed(0)
    # Suppress numpy's divide-by-zero warnings (presumably raised by np.log on
    # zero-valued parameters inside EM -- confirm).
    np.seterr(divide="ignore")

    # callback to print log likelihoods during training
    print_loglike = lambda loglike, params: print(loglike)

    # load training data
    # The training text is the lstm.py file that sits next to this script.
    lstm_filename = join(dirname(__file__), "lstm.py")
    train_inputs, num_outputs = build_dataset(lstm_filename, max_lines=60)

    # train with EM
    num_states = 20
    init_params = initialize_hmm_parameters(num_states, num_outputs)
    pi, A, B = EM(init_params, train_inputs, print_loglike)


================================================
FILE: examples/ica.py
================================================
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from scipy.optimize import minimize

import autograd.numpy as np
import autograd.numpy.random as npr
import autograd.scipy.stats.t as t
from autograd import value_and_grad


def make_ica_funs(observed_dimension, latent_dimension):
    """Build the functions used for independent component analysis.

    The model is ``data = (weights @ latents).T + noise`` where the weights,
    of shape ``(observed_dimension, latent_dimension)``, are shared by all data
    points and each data point has its own column of latents.

    Returns:
        ``(num_weights, sample, logprob, unpack_weights)``:

        * ``num_weights`` -- number of entries in the weight matrix.
        * ``sample(weights, n_samples, noise_std, rs)`` -- draws standard-normal
          latents (ordered by their first coordinate) and Gaussian-noised
          observations; returns ``(latents, observed)``.
        * ``logprob(weights, latents, noise_std, observed)`` -- summed
          Student-t log density (2.4 degrees of freedom, scale ``noise_std``)
          of the mismatch between predictions and ``observed``.
        * ``unpack_weights(weights)`` -- reshapes a flat vector into the
          weight matrix.
    """
    num_weights = observed_dimension * latent_dimension

    def predict(weights, latents):
        return np.dot(weights, latents).T

    def sample(weights, n_samples, noise_std, rs):
        raw_latents = rs.randn(latent_dimension, n_samples)
        # Order the samples by their first latent coordinate.
        ordered_samples = sorted(raw_latents.T, key=lambda one_sample: one_sample[0])
        latents = np.array(ordered_samples).T
        noise = rs.randn(n_samples, observed_dimension) * noise_std
        return latents, predict(weights, latents) + noise

    def logprob(weights, latents, noise_std, observed):
        predictions = predict(weights, latents)
        return np.sum(t.logpdf(predictions, 2.4, observed, noise_std))

    def unpack_weights(weights):
        return np.reshape(weights, (observed_dimension, latent_dimension))

    return num_weights, sample, logprob, unpack_weights


def color_scatter(ax, xs, ys):
    """Scatter the points ``(xs[i], ys[i])`` on ``ax``, one rainbow colour each.

    Colours are spread evenly over the rainbow colormap across ``len(ys)``
    points, so neighbouring indices receive similar colours.
    """
    palette = cm.rainbow(np.linspace(0, 1, len(ys)))
    for index, (x, y) in enumerate(zip(xs, ys)):
        ax.scatter(x, y, color=palette[index])


if __name__ == "__main__":
    # Script entry point: sample synthetic ICA data from known weights, then
    # recover weights, latents and noise level by minimising the negative log
    # probability, plotting the estimates after every optimiser iteration.
    observed_dimension = 100
    latent_dimension = 2
    true_noise_var = 1.0
    n_samples = 200

    num_weights, sample, logprob, unpack_weights = make_ica_funs(observed_dimension, latent_dimension)

    num_latent_params = latent_dimension * n_samples
    # Flat layout: [weights | latents | log noise std] (hence the + 1).
    total_num_params = num_weights + num_latent_params + 1

    def unpack_params(params):
        """Split the flat vector into (weights, latents, noise_std)."""
        weights = unpack_weights(params[:num_weights])
        latents = np.reshape(
            params[num_weights : num_weights + num_latent_params], (latent_dimension, n_samples)
        )
        # Stored in log space so the optimiser can move it freely.
        noise_std = np.exp(params[-1])
        return weights, latents, noise_std

    rs = npr.RandomState(0)
    # Ground-truth weights: one sine curve per latent dimension, each with a
    # different frequency.
    true_weights = np.zeros((observed_dimension, latent_dimension))
    for i in range(latent_dimension):
        true_weights[:, i] = np.sin(np.linspace(0, 4 + i * 3.2, observed_dimension))

    # NOTE(review): true_noise_var is passed where sample() expects noise_std;
    # with the value 1.0 the variance and the standard deviation coincide.
    true_latents, data = sample(true_weights, n_samples, true_noise_var, rs)

    # Set up figure.
    fig2 = plt.figure(figsize=(6, 6), facecolor="white")
    ax_data = fig2.add_subplot(111, frameon=False)
    ax_data.matshow(data)

    fig1 = plt.figure(figsize=(12, 16), facecolor="white")
    ax_true_latents = fig1.add_subplot(411, frameon=False)
    ax_est_latents = fig1.add_subplot(412, frameon=False)
    ax_true_weights = fig1.add_subplot(413, frameon=False)
    ax_est_weights = fig1.add_subplot(414, frameon=False)

    plt.show(block=False)
    # The ground-truth panels are drawn once; only the estimate panels are
    # refreshed in the callback.
    ax_true_weights.scatter(true_weights[:, 0], true_weights[:, 1])
    ax_true_weights.set_title("True weights")
    color_scatter(ax_true_latents, true_latents[0, :], true_latents[1, :])
    ax_true_latents.set_title("True latents")
    ax_true_latents.set_xticks([])
    ax_true_weights.set_xticks([])
    ax_true_latents.set_yticks([])
    ax_true_weights.set_yticks([])

    def objective(params):
        """Negative log probability of the data, divided by ``n_samples``."""
        weight_matrix, latents, noise_std = unpack_params(params)
        return -logprob(weight_matrix, latents, noise_std, data) / n_samples

    def callback(params):
        """Print progress and redraw the estimated weights and latents."""
        weights, latents, noise_std = unpack_params(params)
        print(f"Log likelihood {-objective(params)}, noise_std {noise_std}")
        ax_est_weights.cla()
        ax_est_weights.scatter(weights[:, 0], weights[:, 1])
        ax_est_weights.set_title("Estimated weights")
        ax_est_latents.cla()
        color_scatter(ax_est_latents, latents[0, :], latents[1, :])
        ax_est_latents.set_title("Estimated latents")
        ax_est_weights.set_yticks([])
        ax_est_latents.set_yticks([])
        ax_est_weights.set_xticks([])
        ax_est_latents.set_xticks([])
        plt.draw()
        plt.pause(1.0 / 60.0)

    # Initialize and optimize model.
    rs = npr.RandomState(0)
    init_params = rs.randn(total_num_params)
    # jac=True: value_and_grad(objective) returns the (value, gradient) pair.
    minimize(value_and_grad(objective), init_params, jac=True, method="CG", callback=callback)
    plt.pause(20)


================================================
FILE: examples/logistic_regression.py
================================================
import autograd.numpy as np
from autograd import grad
from autograd.test_util import check_grads


def sigmoid(x):
    """Map ``x`` element-wise into the range 0..1 via ``0.5 * (tanh(x) + 1)``."""
    shifted = np.tanh(x) + 1
    return 0.5 * shifted


def logistic_predictions(weights, inputs):
    """Probability of each label being true according to the logistic model.

    Computes ``sigmoid(inputs @ weights)``: one probability per row of
    ``inputs``.
    """
    scores = np.dot(inputs, weights)
    return sigmoid(scores)


def training_loss(weights):
    """Negative log-likelihood of the training labels under ``weights``.

    Reads the module-level ``inputs`` and ``targets`` arrays.
    """
    preds = logistic_predictions(weights, inputs)
    # Probability assigned to the label that was actually observed:
    # preds where the target is true, 1 - preds where it is false.
    prob_when_true = preds * targets
    prob_when_false = (1 - preds) * (1 - targets)
    log_label_probabilities = np.log(prob_when_true + prob_when_false)
    return -np.sum(log_label_probabilities)


# Build a toy dataset.
# Four examples with three features each, and one boolean label per example.
# training_loss() reads these two module-level arrays.
inputs = np.array([[0.52, 1.12, 0.77], [0.88, -1.08, 0.15], [0.52, 0.06, -1.30], [0.74, -2.49, 1.39]])
targets = np.array([True, True, False, True])

# Build a function that returns gradients of training loss using autograd.
training_gradient_fun = grad(training_loss)

# Check the gradients numerically, just to be safe.
# Only reverse mode ("rev") is checked, at the all-zero starting weights.
weights = np.array([0.0, 0.0, 0.0])
check_grads(training_loss, modes=["rev"])(weights)

# Optimize weights using gradient descent.
# 100 steps with a fixed step size of 0.01; `weights` is updated in place.
print("Initial loss:", training_loss(weights))
for i in range(100):
    weights -= training_gradient_fun(weights) * 0.01

print("Trained loss:", training_loss(weights))


================================================
FILE: examples/lstm.py
================================================
"""Implements the long-short term memory character model.
This version vectorizes over multiple examples, but each string
has a fixed length."""

from os.path import dirname, join

from rnn import build_dataset, concat_and_multiply, one_hot_to_string, sigmoid, string_to_one_hot

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad
from autograd.misc.optimizers import adam
from autograd.scipy.special import logsumexp


def init_lstm_params(input_size, state_size, output_size, param_scale=0.01, rs=npr.RandomState(0)):
    """Create Gaussian-initialised LSTM parameters.

    Every array is standard-normal noise from ``rs`` multiplied by
    ``param_scale``. The four gate matrices have
    ``input_size + state_size + 1`` rows and the ``"predict"`` matrix has
    ``state_size + 1`` rows.
    The default ``rs`` is created once at definition time and is shared between
    calls that rely on it.

    Returns:
        A dict with keys ``"init cells"``, ``"init hiddens"``, ``"change"``,
        ``"forget"``, ``"ingate"``, ``"outgate"`` and ``"predict"``.
    """

    def rp(*shape):
        return rs.randn(*shape) * param_scale

    gate_shape = (input_size + state_size + 1, state_size)

    # Insertion order matters: it fixes the order of the random draws.
    params = {}
    params["init cells"] = rp(1, state_size)
    params["init hiddens"] = rp(1, state_size)
    for gate_name in ("change", "forget", "ingate", "outgate"):
        params[gate_name] = rp(*gate_shape)
    params["predict"] = rp(state_size + 1, output_size)
    return params


def lstm_predict(params, inputs):
    """Run the LSTM over ``inputs`` and return normalized log-probabilities.

    Args:
        params: dict as produced by ``init_lstm_params``.
        inputs: array iterated over its first axis (time steps); its second
            axis is the number of sequences processed in parallel.

    Returns:
        A list with one array of log-probabilities per time step plus one
        leading entry computed from the initial hidden state, i.e.
        ``len(inputs) + 1`` entries.
    """

    def gate_preactivation(name, step_input, hiddens):
        return concat_and_multiply(params[name], step_input, hiddens)

    def update_lstm(step_input, hiddens, cells):
        change = np.tanh(gate_preactivation("change", step_input, hiddens))
        forget = sigmoid(gate_preactivation("forget", step_input, hiddens))
        ingate = sigmoid(gate_preactivation("ingate", step_input, hiddens))
        outgate = sigmoid(gate_preactivation("outgate", step_input, hiddens))
        new_cells = cells * forget + ingate * change
        new_hiddens = outgate * np.tanh(new_cells)
        return new_hiddens, new_cells

    def hiddens_to_output_probs(hiddens):
        scores = concat_and_multiply(params["predict"], hiddens)
        return scores - logsumexp(scores, axis=1, keepdims=True)  # Normalize log-probs.

    # Every sequence starts from the same learned initial state.
    num_sequences = inputs.shape[1]
    hiddens = np.repeat(params["init hiddens"], num_sequences, axis=0)
    cells = np.repeat(params["init cells"], num_sequences, axis=0)

    outputs = [hiddens_to_output_probs(hiddens)]
    for step_input in inputs:  # Iterate over time steps.
        hiddens, cells = update_lstm(step_input, hiddens, cells)
        outputs.append(hiddens_to_output_probs(hiddens))
    return outputs


def lstm_log_likelihood(params, inputs, targets):
    """Average log-likelihood of ``targets`` per time step and example.

    The prediction at index ``t`` (made before reading ``inputs[t]``) is scored
    against ``targets[t]``; the final prediction is not used.
    """
    num_time_steps, num_examples, _ = inputs.shape
    logprobs = lstm_predict(params, inputs)

    total = 0.0
    for step in range(num_time_steps):
        total += np.sum(logprobs[step] * targets[step])
    return total / (num_time_steps * num_examples)


if __name__ == "__main__":
    # Script entry point: train a character-level LSTM on this file's own
    # source text with Adam, then sample text from the trained model.
    num_chars = 128

    # Learn to predict our own source code.
    text_filename = join(dirname(__file__), "lstm.py")
    train_inputs = build_dataset(text_filename, sequence_length=30, alphabet_size=num_chars, max_lines=60)

    # NOTE(review): the literal 128 here matches num_chars above; the two must
    # be kept in sync by hand.
    init_params = init_lstm_params(input_size=128, output_size=128, state_size=40, param_scale=0.01)

    def print_training_prediction(weights):
        """Print each training sequence next to the model's predicted text."""
        print("Training text                         Predicted text")
        logprobs = np.asarray(lstm_predict(weights, train_inputs))
        for t in range(logprobs.shape[1]):
            training_text = one_hot_to_string(train_inputs[:, t, :])
            predicted_text = one_hot_to_string(logprobs[:, t, :])
            print(training_text.replace("\n", " ") + "|" + predicted_text.replace("\n", " "))

    def training_loss(params, iter):
        """Negative log-likelihood of the training text; ``iter`` is unused."""
        # The inputs double as the targets.
        return -lstm_log_likelihood(params, train_inputs, train_inputs)

    def callback(weights, iter, gradient):
        """Every 10th iteration, print the loss and the current predictions."""
        if iter % 10 == 0:
            print("Iteration", iter, "Train loss:", training_loss(weights, 0))
            print_training_prediction(weights)

    # Build gradient of loss function using autograd.
    training_loss_grad = grad(training_loss)

    print("Training LSTM...")
    trained_params = adam(training_loss_grad, init_params, step_size=0.1, num_iters=1000, callback=callback)

    print()
    print("Generating text from LSTM...")
    num_letters = 30
    # Generate 20 independent samples of num_letters characters each.
    for t in range(20):
        text = ""
        for i in range(num_letters):
            # Re-run the model on the text generated so far and sample the next
            # character from the distribution predicted at the last step.
            seqs = string_to_one_hot(text, num_chars)[:, np.newaxis, :]
            logprobs = lstm_predict(trained_params, seqs)[-1].ravel()
            text += chr(npr.choice(len(logprobs), p=np.exp(logprobs)))
        print(text)


================================================
FILE: examples/mixture_variational_inference.py
================================================
# Implements black-box variational inference, where the variational
# distribution is a mixture of Gaussians.
#
# This trick was written up by Alex Graves in this note:
# http://arxiv.org/abs/1607.05690

import matplotlib.pyplot as plt

import autograd.numpy as np
import autograd.numpy.random as npr
import autograd.scipy.stats.norm as norm
from autograd import grad
from autograd.misc.optimizers import adam
from autograd.scipy.special import logsumexp


def diag_gaussian_log_density(x, mu, log_std):
    """Log-density of a diagonal Gaussian, summed over the last axis.

    The standard deviation is passed in log space and exponentiated here.
    """
    std = np.exp(log_std)
    per_dimension = norm.logpdf(x, mu, std)
    return np.sum(per_dimension, axis=-1)


def unpack_gaussian_params(params):
    """Split a flat parameter vector into (mean, log_std) halves.

    The variational distribution is a diagonal Gaussian: the first half of
    ``params`` holds the means, the remainder the log standard deviations.
    """
    half = np.shape(params)[0] // 2
    return params[:half], params[half:]


def variational_log_density_gaussian(params, x):
    """Log-density at ``x`` of the diagonal Gaussian packed in ``params``."""
    return diag_gaussian_log_density(x, *unpack_gaussian_params(params))


def sample_diag_gaussian(params, num_samples, rs):
    """Draw ``num_samples`` rows from the diagonal Gaussian packed in ``params``.

    Standard normal noise from ``rs`` is scaled by exp(log_std) and shifted by
    the mean, so the result is a function of ``params`` given the noise.
    """
    mean, log_std = unpack_gaussian_params(params)
    noise = rs.randn(num_samples, np.shape(mean)[0])
    return noise * np.exp(log_std) + mean


def variational_lower_bound(params, t, logprob, sampler, log_density, num_samples, rs):
    """Stochastic estimate of the variational lower bound.

    Works for any variational family (given by ``sampler`` and
    ``log_density``) and any model density ``logprob``: the estimate is the
    mean of ``log p - log q`` over samples drawn from the variational family.
    """
    draws = sampler(params, num_samples, rs)
    log_q = np.reshape(log_density(params, draws), (num_samples, -1))
    log_p = np.reshape(logprob(draws, t), (num_samples, -1))
    return np.mean(log_p - log_q)


def init_gaussian_var_params(D, mean_mean=-1, log_std_mean=-5, scale=0.1, rs=npr.RandomState(0)):
    """Random initial parameters for a D-dimensional diagonal Gaussian.

    Returns a length-2*D vector: D means jittered around ``mean_mean``
    followed by D log standard deviations jittered around ``log_std_mean``.
    The jitter is Gaussian noise from ``rs`` multiplied by ``scale``.
    """
    # The mean noise is drawn before the log-std noise.
    mean_part = mean_mean * np.ones(D) + rs.randn(D) * scale
    log_std_part = log_std_mean * np.ones(D) + rs.randn(D) * scale
    pieces = [mean_part, log_std_part]
    return np.concatenate(pieces)


def log_normalize(x):
    """Shift log-values so that their exponentials sum to one."""
    log_total = logsumexp(x)
    return x - log_total


def build_mog_bbsvi(logprob, num_samples, k=10, rs=npr.RandomState(0)):
    """Build black-box variational inference with a mixture-of-Gaussians family.

    Args:
        logprob: callable ``logprob(samples, t)`` giving the (unnormalized)
            target log-density at each row of ``samples``.
        num_samples: number of samples drawn per component when estimating
            the lower bound.
        k: number of mixture components.
        rs: random state used for the lower-bound samples.

    Returns:
        ``(init_var_params, mixture_elbo, mixture_log_density, sample)``.

    The flat mixture parameter vector holds ``k`` unnormalized log weights
    followed by the parameters of the ``k`` components.
    """
    init_component_var_params = init_gaussian_var_params
    component_log_density = variational_log_density_gaussian
    component_sample = sample_diag_gaussian

    def unpack_mixture_params(mixture_params):
        # First k entries: log weights (normalized here); rest: one row per component.
        log_weights = log_normalize(mixture_params[:k])
        var_params = np.reshape(mixture_params[k:], (k, -1))
        return log_weights, var_params

    def init_var_params(D, rs=npr.RandomState(0), **kwargs):
        """Initial mixture parameters: equal log weights plus k random components."""
        log_weights = np.ones(k)
        component_weights = [init_component_var_params(D, rs=rs, **kwargs) for i in range(k)]
        return np.concatenate([log_weights] + component_weights)

    def sample(var_mixture_params, num_samples, rs):
        """Sample locations aren't a continuous function of parameters
        due to multinomial sampling."""
        log_weights, var_params = unpack_mixture_params(var_mixture_params)
        # One draw from every component for every sample: (num_samples, k, D).
        samples = np.concatenate(
            [component_sample(params_k, num_samples, rs)[:, np.newaxis, :] for params_k in var_params],
            axis=1,
        )
        # Pick the component of each sample with the caller's generator (not
        # the global numpy state) so that a seeded `rs` reproduces the draw.
        ixs = rs.choice(k, size=num_samples, p=np.exp(log_weights))
        return np.array([samples[i, ix, :] for i, ix in enumerate(ixs)])

    def mixture_log_density(var_mixture_params, x):
        """Returns a weighted average over component densities."""
        log_weights, var_params = unpack_mixture_params(var_mixture_params)
        component_log_densities = np.vstack(
            [component_log_density(params_k, x) for params_k in var_params]
        ).T
        return logsumexp(component_log_densities + log_weights, axis=1, keepdims=False)

    def mixture_elbo(var_mixture_params, t):
        # We need to only sample the continuous component parameters,
        # and integrate over the discrete component choice

        def mixture_lower_bound(params):
            """Provides a stochastic estimate of the variational lower bound."""
            samples = component_sample(params, num_samples, rs)
            log_qs = mixture_log_density(var_mixture_params, samples)
            log_ps = logprob(samples, t)
            log_ps = np.reshape(log_ps, (num_samples, -1))
            log_qs = np.reshape(log_qs, (num_samples, -1))
            return np.mean(log_ps - log_qs)

        log_weights, var_params = unpack_mixture_params(var_mixture_params)
        component_elbos = np.stack([mixture_lower_bound(params_k) for params_k in var_params])
        # Weight each component's bound by its mixture probability.
        return np.sum(component_elbos * np.exp(log_weights))

    return init_var_params, mixture_elbo, mixture_log_density, sample


if __name__ == "__main__":
    # Specify an inference problem by its unnormalized log-density.
    D = 2

    def log_density(x, t):
        # Each row of x is (mu, log_sigma). The target is the log of the sum
        # of two terms whose mu-densities are centred at -0.5 and 0.5; `t` is
        # accepted for the optimizer's calling convention but not used.
        mu, log_sigma = x[:, 0], x[:, 1]
        sigma_density = norm.logpdf(log_sigma, 0, 1.35)
        mu_density = norm.logpdf(mu, -0.5, np.exp(log_sigma))
        sigma_density2 = norm.logpdf(log_sigma, 0.1, 1.35)
        mu_density2 = norm.logpdf(mu, 0.5, np.exp(log_sigma))
        return np.logaddexp(sigma_density + mu_density, sigma_density2 + mu_density2)

    init_var_params, elbo, variational_log_density, variational_sampler = build_mog_bbsvi(
        log_density, num_samples=40, k=10
    )

    def objective(params, t):
        # adam minimizes, so hand it the negative lower bound.
        return -elbo(params, t)

    # Set up plotting code
    def plot_isocontours(ax, func, xlimits=[-2, 2], ylimits=[-4, 2], numticks=101, cmap=None):
        # Evaluate func on a numticks x numticks grid and draw its contours.
        # The contours go to pyplot's current axes; `ax` is only used to hide ticks.
        x = np.linspace(*xlimits, num=numticks)
        y = np.linspace(*ylimits, num=numticks)
        X, Y = np.meshgrid(x, y)
        zs = func(np.concatenate([np.atleast_2d(X.ravel()), np.atleast_2d(Y.ravel())]).T)
        Z = zs.reshape(X.shape)
        plt.contour(X, Y, Z, cmap=cmap)
        ax.set_yticks([])
        ax.set_xticks([])

    fig = plt.figure(figsize=(8, 8), facecolor="white")
    ax = fig.add_subplot(111, frameon=False)
    plt.ion()
    plt.show(block=False)

    num_plotting_samples = 51

    def callback(params, t, g):
        # Called by adam each iteration: report the bound and redraw the plot.
        print(f"Iteration {t} lower bound {-objective(params, t)}")

        plt.cla()
        target_distribution = lambda x: np.exp(log_density(x, t))
        var_distribution = lambda x: np.exp(variational_log_density(params, x))
        plot_isocontours(ax, target_distribution)
        plot_isocontours(ax, var_distribution, cmap=plt.cm.bone)
        ax.set_autoscale_on(False)

        # A freshly seeded generator is passed to the sampler on every call.
        rs = npr.RandomState(0)
        samples = variational_sampler(params, num_plotting_samples, rs)
        plt.plot(samples[:, 0], samples[:, 1], "x")

        plt.draw()
        plt.pause(1.0 / 30.0)

    print("Optimizing variational parameters...")
    variational_params = adam(
        grad(objective), init_var_params(D), step_size=0.1, num_iters=2000, callback=callback
    )


================================================
FILE: examples/natural_gradient_black_box_svi.py
================================================
import matplotlib.pyplot as plt

# same BBSVI function!
from black_box_svi import black_box_variational_inference

import autograd.numpy as np
import autograd.scipy.stats.norm as norm
from autograd.misc.optimizers import adam, sgd

if __name__ == "__main__":
    # Specify an inference problem by its unnormalized log-density.
    # it's difficult to see the benefit in low dimensions
    # model parameters are a mean and a log_sigma
    np.random.seed(42)
    obs_dim = 20
    Y = np.random.randn(obs_dim, obs_dim).dot(np.random.randn(obs_dim))

    def log_density(x, t):
        # Each row of x holds obs_dim means followed by obs_dim log-sigmas.
        mu, log_sigma = x[:, :obs_dim], x[:, obs_dim:]
        sigma_density = np.sum(norm.logpdf(log_sigma, 0, 1.35), axis=1)
        mu_density = np.sum(norm.logpdf(Y, mu, np.exp(log_sigma)), axis=1)
        return sigma_density + mu_density

    # Build variational objective.
    D = obs_dim * 2  # dimension of our posterior
    objective, gradient, unpack_params = black_box_variational_inference(log_density, D, num_samples=2000)

    # Define the natural gradient
    #   The natural gradient of the ELBO is the gradient of the elbo,
    #   preconditioned by the inverse Fisher Information Matrix.  The Fisher,
    #   in the case of a diagonal gaussian, is a diagonal matrix that is a
    #   simple function of the variance.  Intuitively, statistical distance
    #   created by perturbing the mean of an independent Gaussian is
    #   determined by how wide the distribution is along that dimension ---
    #   the wider the distribution, the less sensitive statistical distance is
    #   to perturbations of the mean; the narrower the distribution, the more
    #   the statistical distance changes when you perturb the mean (imagine
    #   an extremely narrow Gaussian --- basically a spike.  The KL between
    #   this Gaussian and a Gaussian $\epsilon$ away in location can be big ---
    #   moving the Gaussian could significantly reduce overlap in support
    #   which corresponds to a greater statistical distance).
    #
    #   When we want to move in directions of steepest ascent, we multiply by
    #   the inverse fisher --- that way we make quicker progress when the
    #   variance is wide, and we scale down our step size when the variance
    #   is small (which leads to more robust/less chaotic ascent).
    def fisher_diag(lam):
        # Diagonal Fisher entries: exp(-2 * log_sigma) for each mean
        # coordinate, then the constant 2 for each log_sigma coordinate.
        mu, log_sigma = unpack_params(lam)
        return np.concatenate([np.exp(-2.0 * log_sigma), np.ones(len(log_sigma)) * 2])

    # simple! basically free!
    natural_gradient = lambda lam, i: (1.0 / fisher_diag(lam)) * gradient(lam, i)

    # function for keeping track of callback ELBO values (for plotting below)
    def optimize_and_lls(optfun):
        # Runs optfun(num_iters, init_params, callback) from a fixed starting
        # point and returns the ELBO recorded at every callback as an array.
        num_iters = 200
        elbos = []

        def callback(params, t, g):
            elbo_val = -objective(params, t)
            elbos.append(elbo_val)
            if t % 50 == 0:
                print(f"Iteration {t} lower bound {elbo_val}")

        init_mean = -1 * np.ones(D)
        init_log_std = -5 * np.ones(D)
        init_var_params = np.concatenate([init_mean, init_log_std])
        variational_params = optfun(num_iters, init_var_params, callback)
        return np.array(elbos)

    # let's optimize this with a few different step sizes
    elbo_lists = []
    step_sizes = [0.1, 0.25, 0.5]
    for step_size in step_sizes:
        # The lambdas below read step_size from the enclosing scope; they are
        # called within this same iteration, so they see the current value.
        # optimize with standard gradient + adam
        optfun = lambda n, init, cb: adam(gradient, init, step_size=step_size, num_iters=n, callback=cb)
        standard_lls = optimize_and_lls(optfun)

        # optimize with natural gradient + sgd, no momentum
        optnat = lambda n, init, cb: sgd(
            natural_gradient, init, step_size=step_size, num_iters=n, callback=cb, mass=0.001
        )
        natural_lls = optimize_and_lls(optnat)
        elbo_lists.append((standard_lls, natural_lls))

    # visually compare the ELBO
    plt.figure(figsize=(12, 8))
    colors = ["b", "k", "g"]
    for col, ss, (stand_lls, nat_lls) in zip(colors, step_sizes, elbo_lists):
        plt.plot(
            np.arange(len(stand_lls)),
            stand_lls,
            "--",
            label="standard (adam, step-size = %2.2f)" % ss,
            alpha=0.5,
            c=col,
        )
        plt.plot(np.arange(len(nat_lls)), nat_lls, "-", label="natural (sgd, step-size = %2.2f)" % ss, c=col)

    # natural_lls here is the run from the last step size in the loop above;
    # the y-limits zoom in on the top of its range.
    llrange = natural_lls.max() - natural_lls.min()
    plt.ylim((natural_lls.max() - llrange * 0.1, natural_lls.max() + 10))
    plt.xlabel("optimization iteration")
    plt.ylabel("ELBO")
    plt.legend(loc="lower right")
    plt.title("%d dimensional posterior" % D)
    plt.show()


================================================
FILE: examples/negative_binomial_maxlike.py
================================================
import scipy.optimize

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad
from autograd.scipy.special import gammaln

# The code in this example implements a method for finding a stationary point of
# the negative binomial likelihood via Newton's method, described here:
# https://en.wikipedia.org/wiki/Negative_binomial_distribution#Maximum_likelihood_estimation


def newton(f, x0):
    """Find a root of ``f`` from ``x0`` with scipy.optimize.newton.

    The first and second derivatives scipy asks for are produced
    automatically by differentiating ``f`` with autograd.
    """
    first_derivative = grad(f)
    second_derivative = grad(first_derivative)
    return scipy.optimize.newton(f, x0, fprime=first_derivative, fprime2=second_derivative)


def negbin_loglike(r, p, x):
    """Negative binomial log-likelihood of counts ``x`` (the quantity to maximize)."""
    log_binomial_coeff = gammaln(r + x) - gammaln(r) - gammaln(x + 1)
    success_term = x * np.log(p)
    failure_term = r * np.log(1 - p)
    return log_binomial_coeff + success_term + failure_term


def negbin_sample(r, p, size):
    """Sample negative binomial counts as a gamma-compound-Poisson.

    Poisson rates are drawn from a gamma with shape ``r`` and scale
    ``p / (1 - p)``; the counts are Poisson draws at those rates.
    """
    gamma_scale = p / (1 - p)
    rates = npr.gamma(r, gamma_scale, size=size)
    return npr.poisson(rates)


def fit_maxlike(x, r_guess):
    """Fit negative binomial parameters ``(r, p)`` to counts ``x``.

    Follows Wikipedia's section on negative binomial maximum likelihood:
    ``p`` has a closed form given ``r``, so only a one-dimensional root of the
    derivative of the log-likelihood in ``r`` has to be found, starting from
    ``r_guess``. Requires the sample variance to exceed the sample mean.
    """
    assert np.var(x) > np.mean(x), "Likelihood-maximizing parameters don't exist!"

    def total_loglike(r, p):
        return np.sum(negbin_loglike(r, p, x))

    def p_given_r(r):
        return np.sum(x) / np.sum(r + x)

    def loglike_slope(r):
        # Derivative with respect to r, with p held at its value for this r.
        return grad(total_loglike)(r, p_given_r(r))

    r_hat = newton(loglike_slope, r_guess)
    return r_hat, p_given_r(r_hat)


if __name__ == "__main__":
    # generate data
    npr.seed(0)
    data = negbin_sample(r=5, p=0.5, size=1000)

    # fit likelihood-extremizing parameters
    r, p = fit_maxlike(data, r_guess=1)

    # report fit
    print("Fit parameters:")
    print(f"r={r}, p={p}")

    # Differentiate the summed log-likelihood with respect to both r and p
    # and print the gradient evaluated at the fitted values.
    print("Check that we are at a local stationary point:")
    loglike = lambda r, p: np.sum(negbin_loglike(r, p, data))
    grad_both = grad(loglike, argnum=(0, 1))
    print(grad_both(r, p))

    import matplotlib.pyplot as plt

    # Overlay the fitted probability mass function on a normalized histogram
    # of the data, using one unit-width bin centred on each integer count.
    xm = data.max()
    plt.figure()
    plt.hist(data, bins=np.arange(xm + 1) - 0.5, density=True, label="normed data counts")
    plt.xlim(0, xm)
    plt.plot(np.arange(xm), np.exp(negbin_loglike(r, p, np.arange(xm))), label="maxlike fit")
    plt.xlabel("k")
    plt.ylabel("p(k)")
    plt.legend(loc="best")
    plt.show()


================================================
FILE: examples/neural_net.py
================================================
"""A multi-layer perceptron for classification of MNIST handwritten digits."""

from data import load_mnist

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad
from autograd.misc.flatten import flatten
from autograd.misc.optimizers import adam
from autograd.scipy.special import logsumexp


def init_random_params(scale, layer_sizes, rs=npr.RandomState(0)):
    """Create random (weights, biases) tuples, one per layer of the net.

    Consecutive entries of ``layer_sizes`` give each layer's input and output
    width; all values are standard normal draws from ``rs`` times ``scale``.
    """
    layers = []
    for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
        weight_matrix = scale * rs.randn(fan_in, fan_out)
        bias_vector = scale * rs.randn(fan_out)
        layers.append((weight_matrix, bias_vector))
    return layers


def neural_net_predict(params, inputs):
    """Deep neural network classifier.

    params is a list of (weights, bias) tuples.
    inputs is an (N x D) matrix.
    Returns normalized class log-probabilities: the last layer's
    pre-activation outputs minus their row-wise logsumexp.
    """
    activations = inputs
    for weights, bias in params:
        logits = np.dot(activations, weights) + bias
        activations = np.tanh(logits)
    normalizer = logsumexp(logits, axis=1, keepdims=True)
    return logits - normalizer


def l2_norm(params):
    """Flatten params into one vector and return its dot product with itself.

    Note that this is the squared l2 norm (no square root is taken).
    """
    flat_vector = flatten(params)[0]
    return np.dot(flat_vector, flat_vector)


def log_posterior(params, inputs, targets, L2_reg):
    """Unnormalized log-posterior: L2 log-prior plus the data log-likelihood.

    The log-likelihood sums the predicted log-probabilities weighted by
    ``targets``.
    """
    penalty = -L2_reg * l2_norm(params)
    class_log_probs = neural_net_predict(params, inputs)
    return penalty + np.sum(class_log_probs * targets)


def accuracy(params, inputs, targets):
    """Fraction of rows whose highest-scoring predicted class matches the target's."""
    true_labels = np.argmax(targets, axis=1)
    log_probs = neural_net_predict(params, inputs)
    is_correct = np.argmax(log_probs, axis=1) == true_labels
    return np.mean(is_correct)


if __name__ == "__main__":
    # Model parameters
    layer_sizes = [784, 200, 100, 10]
    L2_reg = 1.0

    # Training parameters
    param_scale = 0.1
    batch_size = 256
    num_epochs = 5
    step_size = 0.001

    print("Loading training data...")
    N, train_images, train_labels, test_images, test_labels = load_mnist()

    init_params = init_random_params(param_scale, layer_sizes)

    # Round up so that a final, smaller batch is still counted.
    num_batches = int(np.ceil(len(train_images) / batch_size))

    def batch_indices(iter):
        # Map an iteration number to the slice of its minibatch, wrapping
        # around after every num_batches iterations.
        idx = iter % num_batches
        return slice(idx * batch_size, (idx + 1) * batch_size)

    # Define training objective
    def objective(params, iter):
        # Negative log-posterior on the minibatch selected by this iteration.
        idx = batch_indices(iter)
        return -log_posterior(params, train_images[idx], train_labels[idx], L2_reg)

    # Get gradient of objective using autograd.
    objective_grad = grad(objective)

    print("     Epoch     |    Train accuracy  |       Test accuracy  ")

    def print_perf(params, iter, gradient):
        # Report accuracy on the full train and test sets once per epoch.
        if iter % num_batches == 0:
            train_acc = accuracy(params, train_images, train_labels)
            test_acc = accuracy(params, test_images, test_labels)
            print(f"{iter // num_batches:15}|{train_acc:20}|{test_acc:20}")

    # The optimizers provided can optimize lists, tuples, or dicts of parameters.
    optimized_params = adam(
        objective_grad,
        init_params,
        step_size=step_size,
        num_iters=num_epochs * num_batches,
        callback=print_perf,
    )


================================================
FILE: examples/neural_net_regression.py
================================================
import matplotlib.pyplot as plt

import autograd.numpy as np
import autograd.numpy.random as npr
import autograd.scipy.stats.norm as norm
from autograd import grad
from autograd.misc import flatten
from autograd.misc.optimizers import adam


def init_random_params(scale, layer_sizes, rs=npr.RandomState(0)):
    """Create random (weights, biases) tuples, one for each layer.

    Values are standard normal draws from ``rs`` multiplied by ``scale``.
    """
    layers = []
    for insize, outsize in zip(layer_sizes[:-1], layer_sizes[1:]):
        weight_matrix = rs.randn(insize, outsize) * scale
        bias_vector = rs.randn(outsize) * scale
        layers.append((weight_matrix, bias_vector))
    return layers


def nn_predict(params, inputs, nonlinearity=np.tanh):
    """Run inputs through the layers in params.

    Every layer is an affine map followed by ``nonlinearity``; the value
    returned is the last layer's affine output, before the nonlinearity.
    """
    hidden = inputs
    for weights, bias in params:
        pre_activation = np.dot(hidden, weights) + bias
        hidden = nonlinearity(pre_activation)
    return pre_activation


def log_gaussian(params, scale):
    """Sum of zero-mean Gaussian log-densities over every flattened parameter."""
    flat_params = flatten(params)[0]
    per_param = norm.logpdf(flat_params, 0, scale)
    return np.sum(per_param)


def logprob(weights, inputs, targets, noise_scale=0.1):
    """Gaussian log-likelihood of targets around the network's predictions."""
    return np.sum(norm.logpdf(nn_predict(weights, inputs), targets, noise_scale))


def build_toy_dataset(n_data=80, noise_std=0.1):
    """Noisy cosine data on two separated intervals, as column vectors.

    Half of the inputs lie on [0, 3] and half on [6, 8]. Targets are the
    cosine of the raw inputs plus seeded Gaussian noise, then halved; the
    inputs are shifted and rescaled after the targets are computed.
    """
    half = n_data // 2
    rs = npr.RandomState(0)
    raw_inputs = np.concatenate([np.linspace(0, 3, num=half), np.linspace(6, 8, num=half)])
    noisy_cosine = np.cos(raw_inputs) + rs.randn(n_data) * noise_std
    scaled_inputs = (raw_inputs - 4.0) / 2.0
    return scaled_inputs[:, np.newaxis], noisy_cosine[:, np.newaxis] / 2.0


if __name__ == "__main__":
    init_scale = 0.1
    # NOTE(review): despite the name, this value is passed to log_gaussian as
    # `scale`, where it becomes the third argument of norm.logpdf (presumably
    # a standard deviation rather than a variance; confirm the intent).
    weight_prior_variance = 10.0
    init_params = init_random_params(init_scale, layer_sizes=[1, 4, 4, 1])

    inputs, targets = build_toy_dataset()

    def objective(weights, t):
        # Negative of (data log-likelihood + Gaussian log-prior on the weights).
        return -logprob(weights, inputs, targets) - log_gaussian(weights, weight_prior_variance)

    print(grad(objective)(init_params, 0))

    # Set up figure.
    fig = plt.figure(figsize=(12, 8), facecolor="white")
    ax = fig.add_subplot(111, frameon=False)
    plt.show(block=False)

    def callback(params, t, g):
        # The printed value is -objective, so it includes the prior term too.
        print(f"Iteration {t} log likelihood {-objective(params, t)}")

        # Plot data and functions.
        plt.cla()
        ax.plot(inputs.ravel(), targets.ravel(), "bx", ms=12)
        plot_inputs = np.reshape(np.linspace(-7, 7, num=300), (300, 1))
        outputs = nn_predict(params, plot_inputs)
        ax.plot(plot_inputs, outputs, "r", lw=3)
        ax.set_ylim([-1, 1])
        plt.draw()
        plt.pause(1.0 / 60.0)

    print("Optimizing network parameters...")
    optimized_params = adam(grad(objective), init_params, step_size=0.01, num_iters=1000, callback=callback)


================================================
FILE: examples/ode_net.py
================================================
# A demo of gradients through scipy.integrate.odeint,
# estimating the dynamics of a system given a trajectory.


import matplotlib.pyplot as plt
import numpy as npo

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad
from autograd.builtins import tuple
from autograd.misc.optimizers import adam
from autograd.scipy.integrate import odeint

N = 30  # Dataset size
D = 2  # Data dimension
max_T = 1.5  # End of the time grid used by the main script below


# Two-dimensional damped oscillator
def func(y, t0, A):
    """True dynamics: the element-wise cube of the state, multiplied by A.

    ``t0`` is part of odeint's calling convention and is not used.
    """
    cubed_state = y**3
    return np.dot(cubed_state, A)


def nn_predict(inputs, t, params):
    """Network used as the ODE right-hand side.

    Each layer is an affine map followed by a ReLU (max with zero); the last
    layer's affine output is returned without the ReLU. ``t`` is unused.
    """
    hidden = inputs
    for weights, bias in params:
        pre_activation = np.dot(hidden, weights) + bias
        hidden = np.maximum(0, pre_activation)
    return pre_activation


def init_nn_params(scale, layer_sizes, rs=npr.RandomState(0)):
    """Create random (weights, biases) tuples, one for each layer.

    Values are standard normal draws from ``rs`` multiplied by ``scale``.
    """
    layers = []
    for insize, outsize in zip(layer_sizes[:-1], layer_sizes[1:]):
        weight_matrix = rs.randn(insize, outsize) * scale
        bias_vector = rs.randn(outsize) * scale
        layers.append((weight_matrix, bias_vector))
    return layers


# Define neural ODE model.
def ode_pred(params, y0, t):
    """Integrate the network dynamics from ``y0`` over the time points ``t``."""
    # `tuple` is autograd.builtins.tuple (imported at the top of this file).
    extra_args = tuple((params,))
    return odeint(nn_predict, y0, t, extra_args, rtol=0.01)


def L1_loss(pred, targets):
    """Mean absolute difference between predictions and targets."""
    residual = pred - targets
    return np.mean(np.abs(residual))


if __name__ == "__main__":
    # Generate data from true dynamics.
    true_y0 = np.array([2.0, 0.0]).T
    t = np.linspace(0.0, max_T, N)
    true_A = np.array([[-0.1, 2.0], [-2.0, -0.1]])
    true_y = odeint(func, true_y0, t, args=(true_A,))

    def train_loss(params, iter):
        # L1 distance between the predicted and the true trajectory; `iter`
        # is part of the optimizer's calling convention and is not used.
        pred = ode_pred(params, true_y0, t)
        return L1_loss(pred, true_y)

    # Set up figure
    fig = plt.figure(figsize=(12, 4), facecolor="white")
    ax_traj = fig.add_subplot(131, frameon=False)
    ax_phase = fig.add_subplot(132, frameon=False)
    ax_vecfield = fig.add_subplot(133, frameon=False)
    plt.show(block=False)

    # Plots data and learned dynamics.
    def callback(params, iter, g):
        pred = ode_pred(params, true_y0, t)

        print(f"Iteration {iter:d} train loss {L1_loss(pred, true_y):.6f}")

        # Left panel: both state coordinates over time, true and predicted.
        ax_traj.cla()
        ax_traj.set_title("Trajectories")
        ax_traj.set_xlabel("t")
        ax_traj.set_ylabel("x,y")
        ax_traj.plot(t, true_y[:, 0], "-", t, true_y[:, 1], "g-")
        ax_traj.plot(t, pred[:, 0], "--", t, pred[:, 1], "b--")
        ax_traj.set_xlim(t.min(), t.max())
        ax_traj.set_ylim(-2, 2)
        ax_traj.xaxis.set_ticklabels([])
        ax_traj.yaxis.set_ticklabels([])
        # NOTE(review): none of the plot calls above pass a label.
        ax_traj.legend()

        # Middle panel: the two state coordinates plotted against each other.
        ax_phase.cla()
        ax_phase.set_title("Phase Portrait")
        ax_phase.set_xlabel("x")
        ax_phase.set_ylabel("y")
        ax_phase.plot(true_y[:, 0], true_y[:, 1], "g-")
        ax_phase.plot(pred[:, 0], pred[:, 1], "b--")
        ax_phase.set_xlim(-2, 2)
        ax_phase.set_ylim(-2, 2)
        ax_phase.xaxis.set_ticklabels([])
        ax_phase.yaxis.set_ticklabels([])

        # Right panel: direction of the learned dynamics on a 21 x 21 grid.
        ax_vecfield.cla()
        ax_vecfield.set_title("Learned Vector Field")
        ax_vecfield.set_xlabel("x")
        ax_vecfield.set_ylabel("y")
        ax_vecfield.xaxis.set_ticklabels([])
        ax_vecfield.yaxis.set_ticklabels([])

        # vector field plot
        y, x = npo.mgrid[-2:2:21j, -2:2:21j]
        dydt = nn_predict(np.stack([x, y], -1).reshape(21 * 21, 2), 0, params).reshape(-1, 2)
        # Normalize every vector to unit length so only its direction is shown.
        mag = np.sqrt(dydt[:, 0] ** 2 + dydt[:, 1] ** 2).reshape(-1, 1)
        dydt = dydt / mag
        dydt = dydt.reshape(21, 21, 2)

        ax_vecfield.streamplot(x, y, dydt[:, :, 0], dydt[:, :, 1], color="black")
        ax_vecfield.set_xlim(-2, 2)
        ax_vecfield.set_ylim(-2, 2)

        fig.tight_layout()
        plt.draw()
        plt.pause(0.001)

    # Train neural net dynamics to match data.
    init_params = init_nn_params(0.1, layer_sizes=[D, 150, D])
    optimized_params = adam(grad(train_loss), init_params, num_iters=1000, callback=callback)


================================================
FILE: examples/print_trace.py
================================================
"""Demonstrates how to use the tracer module, independent of autodiff, by
creating a trace that prints out functions and their arguments as they're being
evaluated"""

import autograd.numpy as np  # autograd has already wrapped numpy for us
from autograd.tracer import Node, trace


class PrintNode(Node):
    """Trace node that prints each traced call as a one-line assignment.

    Every node takes the next name from a generator shared along the trace,
    so traced inputs appear in the output under their variable names.
    """

    def __init__(self, value, fun, args, kwargs, parent_argnums, parents):
        # Reuse the first parent's generator so names keep advancing.
        self.varname_generator = parents[0].varname_generator
        self.varname = next(self.varname_generator)
        # Show traced arguments by name and everything else by value.
        shown_args = list(args)
        for position, parent_node in zip(parent_argnums, parents):
            shown_args[position] = parent_node.varname
        arg_text = ",".join(map(str, shown_args))
        print("{} = {}({}) = {}".format(self.varname, fun.__name__, arg_text, value))

    def initialize_root(self, x):
        # The root starts a fresh name sequence and prints the input value.
        self.varname_generator = make_varname_generator()
        self.varname = next(self.varname_generator)
        print(f"{self.varname} = {x}")


def make_varname_generator():
    """Yield the letters "A" through "Z" in order, then raise once exhausted."""
    yield from map(chr, range(65, 91))
    raise Exception("Ran out of alphabet!")


def print_trace(f, x):
    """Evaluate ``f(x)`` under a PrintNode trace, then print a blank line."""
    root = PrintNode.new_root(x)
    trace(root, f, x)
    print()


def avg(x, y):
    """Arithmetic mean of two values."""
    total = x + y
    return total / 2


def fun(x):
    """Example function to trace: the average of sin(x + x) with itself."""
    doubled = x + x
    y = np.sin(doubled)
    return avg(y, y)


# Print the trace of evaluating fun at 1.23.
print_trace(fun, 1.23)

# Traces can be nested, so we can also trace through grad(fun)
# (grad is imported here, directly before its only use in this script).
from autograd import grad

print_trace(grad(fun), 1.0)


================================================
FILE: examples/rkhs.py
================================================
"""
Inferring a function from a reproducing kernel Hilbert space (RKHS) by taking
gradients of eval with respect to the function-valued argument
"""

from itertools import chain

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad
from autograd.extend import Box, VSpace, defvjp, primitive
from autograd.util import func


class RKHSFun:
    """A function stored as a weighted sum of kernel evaluations.

    ``alphas`` maps each representer point ``x_repr`` to its coefficient, so
    calling the function at ``x`` gives ``sum(a * kernel(x, x_repr))``.
    Addition and scalar multiplication are delegated to the vector space.
    """

    def __init__(self, kernel, alphas=None):
        # Build a fresh dict per instance. A literal `{}` default would be a
        # single object shared by every RKHSFun created without coefficients,
        # so mutating one instance's alphas would silently change the others.
        self.alphas = {} if alphas is None else alphas
        self.kernel = kernel
        self.vs = RKHSFunVSpace(self)

    @primitive
    def __call__(self, x):
        # The 0.0 start value makes a function with no coefficients return 0.0.
        return sum([a * self.kernel(x, x_repr) for x_repr, a in self.alphas.items()], 0.0)

    def __add__(self, f):
        return self.vs.add(self, f)

    def __mul__(self, a):
        return self.vs.scalar_mul(self, a)


# TODO: add vjp of __call__ wrt x (and show it in action)
# Gradient of evaluating f at x, taken with respect to the function f itself:
# the incoming gradient g scales an RKHSFun whose only coefficient is 1 at x.
defvjp(func(RKHSFun.__call__), lambda ans, f, x: lambda g: RKHSFun(f.kernel, {x: 1}) * g)


class RKHSFunBox(Box, RKHSFun):
    """Box for RKHSFun values; exposes the kernel of the wrapped function."""

    @property
    def kernel(self):
        wrapped_function = self._value
        return wrapped_function.kernel


RKHSFunBox.register(RKHSFun)


class RKHSFunVSpace(VSpace):
    """Vector-space operations for RKHSFun values that share one kernel."""

    def __init__(self, value):
        self.kernel = value.kernel

    def zeros(self):
        # The zero function: no representer points at all.
        return RKHSFun(self.kernel)

    def randn(self):
        # These arbitrary vectors are not analogous to randn in any meaningful way
        num_points = npr.randint(1, 3)
        points = npr.randn(num_points)
        coefficients = npr.randn(num_points)
        return RKHSFun(self.kernel, dict(zip(points, coefficients)))

    def _add(self, f, g):
        assert f.kernel is g.kernel
        summed_alphas = add_dicts(f.alphas, g.alphas)
        return RKHSFun(f.kernel, summed_alphas)

    def _scalar_mul(self, f, a):
        scaled_alphas = {}
        for point, coefficient in f.alphas.items():
            scaled_alphas[point] = a * coefficient
        return RKHSFun(f.kernel, scaled_alphas)

    def _inner_prod(self, f, g):
        assert f.kernel is g.kernel
        # Sum over every pair of representer points, one taken from each function.
        pair_terms = [
            a1 * a2 * f.kernel(x1, x2)
            for x1, a1 in f.alphas.items()
            for x2, a2 in g.alphas.items()
        ]
        return sum(pair_terms, 0.0)


RKHSFunVSpace.register(RKHSFun)


def add_dicts(d1, d2):
    """Merge two dicts into a new one, adding the values of keys found in both."""
    merged = {}
    for source in (d1, d2):
        for key, value in source.items():
            if key in merged:
                merged[key] = merged[key] + value
            else:
                merged[key] = value
    return merged


if __name__ == "__main__":

    def sq_exp_kernel(x1, x2):
        # Squared-exponential kernel of two scalar inputs.
        return np.exp(-((x1 - x2) ** 2))

    # Training data: five input points and their target values.
    xs = range(5)
    ys = [1, 2, 3, 2, 1]

    def logprob(f, xs, ys):
        # Negative sum of squared errors of f on the data.
        return -sum((f(x) - y) ** 2 for x, y in zip(xs, ys))

    # Gradient ascent on logprob, starting from the zero function; the
    # gradient taken with respect to f is itself an RKHSFun.
    f = RKHSFun(sq_exp_kernel)
    for i in range(100):
        f = f + grad(logprob)(f, xs, ys) * 0.01

    # Print input, target and fitted value for every data point.
    for x, y in zip(xs, ys):
        print(f"{x}\t{y}\t{f(x)}")


================================================
FILE: examples/rnn.py
================================================
"""Implements the long-short term memory character model.
This version vectorizes over multiple examples, but each string
has a fixed length."""

from os.path import dirname, join

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad
from autograd.misc.optimizers import adam
from autograd.scipy.special import logsumexp

### Helper functions #################


def sigmoid(x):
    """Squash x to a value between 0 and 1 using a shifted, halved tanh.

    This equals the logistic function evaluated at 2 * x.
    """
    shifted = np.tanh(x) + 1.0
    return 0.5 * shifted


def concat_and_multiply(weights, *args):
    """Stack the inputs side by side, append a column of ones, and multiply by weights.

    The ones column lets the last row of ``weights`` act as a bias.
    """
    num_rows = args[0].shape[0]
    ones_column = np.ones((num_rows, 1))
    stacked = np.hstack(args + (ones_column,))
    return np.dot(stacked, weights)


### Define recurrent neural net #######


def create_rnn_params(input_size, state_size, output_size, param_scale=0.01, rs=npr.RandomState(0)):
    """Random initial RNN parameters, as a dict of three arrays.

    "change" and "predict" each have one extra input row, matching the ones
    column that concat_and_multiply appends.
    """
    # Draw in the same order as the dict keys: init hiddens, change, predict.
    initial_state = rs.randn(1, state_size) * param_scale
    transition = rs.randn(input_size + state_size + 1, state_size) * param_scale
    readout = rs.randn(state_size + 1, output_size) * param_scale
    return {"init hiddens": initial_state, "change": transition, "predict": readout}


def rnn_predict(params, inputs):
    """Run the RNN over ``inputs`` and return per-step log-probabilities.

    Iterating over ``inputs`` walks the time steps, and its second axis
    indexes the sequences. The returned list has one more entry than there
    are time steps: the first comes from the initial hidden state alone, and
    each later one follows one more input.
    """

    def advance(step_input, state):
        return np.tanh(concat_and_multiply(params["change"], step_input, state))

    def output_log_probs(state):
        scores = concat_and_multiply(params["predict"], state)
        return scores - logsumexp(scores, axis=1, keepdims=True)  # Normalize log-probs.

    batch_size = inputs.shape[1]
    state = np.repeat(params["init hiddens"], batch_size, axis=0)
    predictions = [output_log_probs(state)]

    for step_input in inputs:  # Iterate over time steps.
        state = advance(step_input, state)
        predictions.append(output_log_probs(state))
    return predictions


def rnn_log_likelihood(params, inputs, targets):
    """Average log-probability the RNN assigns to ``targets``.

    The prediction at step t is the one made before input t is consumed; the
    total is divided by the number of time steps times the number of examples.
    """
    predictions = rnn_predict(params, inputs)
    num_time_steps, num_examples, _ = inputs.shape
    total = 0.0
    for step in range(num_time_steps):
        total = total + np.sum(predictions[step] * targets[step])
    normalizer = num_time_steps * num_examples
    return total / normalizer


### Dataset setup ##################


def string_to_one_hot(string, maxchar):
    """Converts an ASCII string to a one-of-k encoding.

    Returns an integer array with one row per character and ``maxchar``
    columns; a character whose code is not below ``maxchar`` gets an all-zero
    row.
    """
    char_codes = np.array([ord(c) for c in string]).T
    return np.array(char_codes[:, None] == np.arange(maxchar)[None, :], dtype=int)


def one_hot_to_string(one_hot_matrix):
    """Decode rows of scores into a string by taking each row's argmax as a character code."""
    return "".join([chr(np.argmax(c)) for c in one_hot_matrix])


def build_dataset(filename, sequence_length, alphabet_size, max_lines=-1):
    """Loads a text file, and turns each line into an encoded sequence.

    Args:
        filename: path of the text file to read.
        sequence_length: every line is space-padded or truncated to this length.
        alphabet_size: width of the one-hot encoding.
        max_lines: read at most this many leading lines. A negative value
            (the default) or None means "no limit".

    Returns:
        Array of shape (sequence_length, number of kept lines, alphabet_size).
    """
    with open(filename) as f:
        content = f.readlines()
    # Only slice for a real limit: `content[:-1]` with the default of -1 would
    # silently drop the last line of the file instead of keeping all lines.
    if max_lines is not None and max_lines >= 0:
        content = content[:max_lines]
    content = [line for line in content if len(line) > 2]  # Remove blank lines
    seqs = np.zeros((sequence_length, len(content), alphabet_size))
    for ix, line in enumerate(content):
        padded_line = (line + " " * sequence_length)[:sequence_length]
        seqs[:, ix, :] = string_to_one_hot(padded_line, alphabet_size)
    return seqs


if __name__ == "__main__":
    num_chars = 128

    # Learn to predict our own source code.
    text_filename = join(dirname(__file__), "rnn.py")
    train_inputs = build_dataset(text_filename, sequence_length=30, alphabet_size=num_chars, max_lines=60)

    init_params = create_rnn_params(input_size=128, output_size=128, state_size=40, param_scale=0.01)

    def print_training_prediction(weights):
        # Print every training sequence next to the model's most likely
        # character at each step, with newlines shown as spaces.
        print("Training text                         Predicted text")
        logprobs = np.asarray(rnn_predict(weights, train_inputs))
        for t in range(logprobs.shape[1]):
            training_text = one_hot_to_string(train_inputs[:, t, :])
            predicted_text = one_hot_to_string(logprobs[:, t, :])
            print(training_text.replace("\n", " ") + "|" + predicted_text.replace("\n", " "))

    def training_loss(params, iter):
        # The inputs are also the targets: the model predicts each character
        # of a sequence from the characters before it.
        return -rnn_log_likelihood(params, train_inputs, train_inputs)

    def callback(weights, iter, gradient):
        # Report progress every tenth iteration.
        if iter % 10 == 0:
            print("Iteration", iter, "Train loss:", training_loss(weights, 0))
            print_training_prediction(weights)

    # Build gradient of loss function using autograd.
    training_loss_grad = grad(training_loss)

    print("Training RNN...")
    trained_params = adam(training_loss_grad, init_params, step_size=0.1, num_iters=1000, callback=callback)

    print()
    print("Generating text from RNN...")
    num_letters = 30
    for t in range(20):
        text = ""
        for i in range(num_letters):
            # Re-encode the text so far, run the whole prefix through the RNN,
            # and sample the next character from the last step's distribution.
            seqs = string_to_one_hot(text, num_chars)[:, np.newaxis, :]
            logprobs = rnn_predict(trained_params, seqs)[-1].ravel()
            text += chr(npr.choice(len(logprobs), p=np.exp(logprobs)))
        print(text)


================================================
FILE: examples/rosenbrock.py
================================================
from scipy.optimize import minimize

import autograd.numpy as np
from autograd import value_and_grad


def rosenbrock(x):
    """Evaluate the two-dimensional Rosenbrock function at ``x = (x[0], x[1])``."""
    first, second = x[0], x[1]
    valley_term = 100 * (second - first**2) ** 2
    offset_term = (1 - first) ** 2
    return valley_term + offset_term


# Build a function that also returns gradients using autograd.
rosenbrock_with_grad = value_and_grad(rosenbrock)

# Optimize using conjugate gradients.
result = minimize(rosenbrock_with_grad, x0=np.array([0.0, 0.0]), jac=True, method="CG")
print(f"Found minimum at {result.x}")


================================================
FILE: examples/sinusoid.py
================================================
import matplotlib.pyplot as plt

import autograd.numpy as np
from autograd import grad


def fun(x):
    """Return the sine of ``x``."""
    sine_of_x = np.sin(x)
    return sine_of_x


# Differentiate `fun` twice with autograd.
d_fun = grad(fun)  # First derivative
dd_fun = grad(d_fun)  # Second derivative

# Evaluate the function and both derivatives point by point and draw all
# three curves with a single plot call.
x = np.linspace(-10, 10, 100)
plt.plot(x, list(map(fun, x)), x, list(map(d_fun, x)), x, list(map(dd_fun, x)))

# Fix the visible range, hide the axes, save the figure, then clear it.
plt.xlim([-10, 10])
plt.ylim([-1.2, 1.2])
plt.axis("off")
plt.savefig("sinusoid.png")
plt.clf()


# Taylor approximation to sin function
def fun(x):
    """Approximate sin(x) by accumulating Taylor-series terms.

    Starts from the term ``x`` and keeps adding successive terms (at most
    1000) until one has magnitude below 0.2. The index of every processed
    term is printed on the way.
    """
    term = x
    total = term
    for k in range(1000):
        print(k, end=" ")
        denominator = (2 * k + 3) * (2 * k + 2)
        term = -term * x**2 / denominator
        total = total + term
        if np.abs(term) < 0.2:
            break  # (Very generous tolerance!)

    return total


# Differentiate the Taylor approximation twice; autograd differentiates
# through the Python loop and the early `break`.
d_fun = grad(fun)
dd_fun = grad(d_fun)

# Evaluate the approximation and both derivatives point by point and plot them.
x = np.linspace(-10, 10, 100)
plt.plot(x, list(map(fun, x)), x, list(map(d_fun, x)), x, list(map(dd_fun, x)))

# Same framing as the first figure; saved under a separate file name.
plt.xlim([-10, 10])
plt.ylim([-1.2, 1.2])
plt.axis("off")
plt.savefig("sinusoid_taylor.png")
plt.clf()


================================================
FILE: examples/tanh.py
================================================
import matplotlib.pyplot as plt

import autograd.numpy as np
from autograd import elementwise_grad as egrad

"""
Mathematically we can only take gradients of scalar-valued functions, but
autograd's elementwise_grad function also handles numpy's familiar vectorization
of scalar functions, which is used in this example.

To be precise, elementwise_grad(fun)(x) always returns the value of a
vector-Jacobian product, where the Jacobian of fun is evaluated at x and the
vector is an all-ones vector with the same size as the output of fun. When
vectorizing a scalar-valued function over many arguments, the Jacobian of the
overall vector-to-vector mapping is diagonal, and so this vector-Jacobian
product simply returns the diagonal elements of the Jacobian, which is the
(elementwise) gradient of the function at each input value over which the
function is vectorized.
"""


def tanh(x):
    """Hyperbolic tangent expressed through ``exp(-2x)``: (1 - e) / (1 + e)."""
    numerator = 1.0 - np.exp(-2 * x)
    denominator = 1.0 + np.exp(-(2 * x))
    return numerator / denominator


### Plotting
plt.figure(figsize=(12, 8))
x = np.linspace(-7, 7, 700)
# Each extra egrad(...) wrapper adds one more order of elementwise differentiation.
plt.plot(x, tanh(x), label="tanh(x)")
plt.plot(x, egrad(tanh)(x), label="1st derivative")
plt.plot(x, egrad(egrad(tanh))(x), label="2nd derivative")
plt.plot(x, egrad(egrad(egrad(tanh)))(x), label="3rd derivative")
plt.plot(x, egrad(egrad(egrad(egrad(tanh))))(x), label="4th derivative")
# Axis labels, limits, legend and grid, then save the figure and display it.
plt.xlabel("x")
plt.ylabel("y")
plt.ylim(-5, 5)
plt.yticks(np.arange(-5, 6, 1))
plt.legend()
plt.grid(True)
plt.title("tanh(x) and its derivatives")
plt.savefig("tanh.png")
plt.show()


================================================
FILE: examples/variational_autoencoder.py
================================================
# Implements auto-encoding variational Bayes.

from data import load_mnist, save_images

import autograd.numpy as np
import autograd.numpy.random as npr
import autograd.scipy.stats.norm as norm
from autograd import grad
from autograd.misc.optimizers import adam
from autograd.scipy.special import expit as sigmoid


def diag_gaussian_log_density(x, mu, log_std):
    """Log-density of ``x`` under a diagonal Gaussian, summed over the last axis."""
    std = np.exp(log_std)
    per_dimension_logpdf = norm.logpdf(x, mu, std)
    return np.sum(per_dimension_logpdf, axis=-1)


def unpack_gaussian_params(params):
    """Split a 2-D array column-wise into its (mean, log_std) halves."""
    # Params of a diagonal Gaussian: first half of the columns is the mean,
    # second half the log standard deviation.
    half_width = np.shape(params)[-1] // 2
    return params[:, :half_width], params[:, half_width:]


def sample_diag_gaussian(mean, log_std, rs):
    """Draw one sample per entry of ``mean`` by scaling and shifting standard-normal noise from ``rs``."""
    standard_noise = rs.randn(*mean.shape)
    scaled_noise = standard_noise * np.exp(log_std)
    return scaled_noise + mean


def bernoulli_log_density(targets, unnormalized_logprobs):
    """Bernoulli log-likelihood of ``targets`` given real-valued logits, summed over the last axis.

    ``unnormalized_logprobs`` may be any real numbers; ``targets`` must be
    encoded as -1 or 1.
    """
    negated_margin = -unnormalized_logprobs * targets
    per_pixel_logprob = -np.logaddexp(0, negated_margin)
    # Sum across pixels.
    return np.sum(per_pixel_logprob, axis=-1)


def relu(x):
    """Rectified linear unit: elementwise maximum of 0 and ``x``."""
    rectified = np.maximum(0, x)
    return rectified


def init_net_params(scale, layer_sizes, rs=npr.RandomState(0)):
    """Build a list of (weights, biases) tuples, one per pair of consecutive layer sizes.

    Every entry is drawn from ``rs.randn`` and multiplied by ``scale``.
    The default ``rs`` is created once at definition time, so calls that rely
    on it share (and advance) the same random stream.
    """
    layers = []
    for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
        weight_matrix = scale * rs.randn(fan_in, fan_out)
        bias_vector = scale * rs.randn(fan_out)
        layers.append((weight_matrix, bias_vector))
    return layers


def batch_normalize(activations):
    """Centre ``activations`` along axis 0 and divide by (standard deviation + 1)."""
    batch_mean = np.mean(activations, axis=0, keepdims=True)
    centred = activations - batch_mean
    spread = np.std(activations, axis=0, keepdims=True) + 1
    return centred / spread


def neural_net_predict(params, inputs):
    """Forward pass of a fully connected network.

    ``params`` is a list of (weights, bias) tuples and ``inputs`` an (N x D)
    matrix. Every layer except the last applies a linear map, batch
    normalization and a ReLU; the last layer is purely linear.
    """
    activations = inputs
    for weights, biases in params[:-1]:
        pre_activation = np.dot(activations, weights) + biases  # linear transformation
        activations = relu(batch_normalize(pre_activation))  # nonlinear transformation
    final_weights, final_biases = params[-1]
    return np.dot(activations, final_weights) + final_biases


def nn_predict_gaussian(params, inputs):
    """Run the network and split its output into (mean, log_std) halves."""
    network_output = neural_net_predict(params, inputs)
    return unpack_gaussian_params(network_output)


def generate_from_prior(gen_params, num_samples, noise_dim, rs):
    """Sample standard-normal latents from ``rs`` and pass them through the generator and a sigmoid."""
    prior_samples = rs.randn(num_samples, noise_dim)
    logits = neural_net_predict(gen_params, prior_samples)
    return sigmoid(logits)


def p_images_given_latents(gen_params, images, latents):
    """Bernoulli log-likelihood of ``images`` under the generator's output for ``latents``."""
    logits = neural_net_predict(gen_params, latents)
    return bernoulli_log_density(images, logits)


def vae_lower_bound(gen_params, rec_params, data, rs):
    """Monte Carlo estimate of the variational lower bound, averaged over ``data``.

    One latent sample is drawn per data row from the recognition network's
    Gaussian; the estimate is mean(log p(z) + log p(x|z) - log q(z|x)).
    """
    # We use a simple Monte Carlo estimate of the KL
    # divergence from the prior.
    posterior_means, posterior_log_stds = nn_predict_gaussian(rec_params, data)
    z = sample_diag_gaussian(posterior_means, posterior_log_stds, rs)
    log_q_z = diag_gaussian_log_density(z, posterior_means, posterior_log_stds)
    log_p_z = diag_gaussian_log_density(z, 0, 0)
    log_p_x_given_z = p_images_given_latents(gen_params, data, z)
    per_example_bound = log_p_z + log_p_x_given_z - log_q_z
    return np.mean(per_example_bound)


if __name__ == "__main__":
    # Model hyper-parameters
    latent_dim = 10
    data_dim = 784  # How many pixels in each image (28x28).
    # The recognition network outputs 2 * latent_dim values so that its output
    # can be split into a mean half and a log-std half.
    gen_layer_sizes = [latent_dim, 300, 200, data_dim]
    rec_layer_sizes = [data_dim, 200, 300, latent_dim * 2]

    # Training parameters
    param_scale = 0.01
    batch_size = 200
    num_epochs = 15
    step_size = 0.001

    print("Loading training data...")
    N, train_images, _, test_images, _ = load_mnist()

    def binarise(images):
        """Map pixels above 0.5 to 1 and all others to -1 (the target encoding the likelihood expects)."""
        on = images > 0.5
        images = images * 0 - 1
        images[on] = 1.0
        return images

    print("Binarising training data...")
    train_images = binarise(train_images)
    test_images = binarise(test_images)

    # Both calls rely on init_net_params' default random state, which is shared between calls.
    init_gen_params = init_net_params(param_scale, gen_layer_sizes)
    init_rec_params = init_net_params(param_scale, rec_layer_sizes)
    combined_init_params = (init_gen_params, init_rec_params)

    num_batches = int(np.ceil(len(train_images) / batch_size))

    def batch_indices(iter):
        """Return the slice of training rows for iteration `iter`, cycling through the minibatches."""
        idx = iter % num_batches
        return slice(idx * batch_size, (idx + 1) * batch_size)

    # Define training objective
    # One random state is shared by training, evaluation and sample generation.
    seed = npr.RandomState(0)

    def objective(combined_params, iter):
        """Negative lower bound on the current minibatch, divided by the number of pixels."""
        data_idx = batch_indices(iter)
        gen_params, rec_params = combined_params
        return -vae_lower_bound(gen_params, rec_params, train_images[data_idx], seed) / data_dim

    # Get gradients of objective using autograd.
    objective_grad = grad(objective)

    print("     Epoch     |    Objective       |    Test ELBO  ")

    def print_perf(combined_params, iter, grad):
        """Optimizer callback: log progress and save generated samples every 10 iterations.

        NOTE: the `grad` parameter shadows the imported autograd `grad` inside this function.
        """
        if iter % 10 == 0:
            gen_params, rec_params = combined_params
            bound = np.mean(objective(combined_params, iter))
            message = f"{iter // num_batches:15}|{bound:20}|"
            # The test-set bound is only computed every 100 iterations.
            if iter % 100 == 0:
                test_bound = -vae_lower_bound(gen_params, rec_params, test_images, seed) / data_dim
                message += f"{test_bound:20}"
            print(message)

            # Overwrite the sample image with 20 fresh draws from the prior.
            fake_data = generate_from_prior(gen_params, 20, latent_dim, seed)
            save_images(fake_data, "vae_samples.png", vmin=0, vmax=1)

    # The optimizers provided can optimize lists, tuples, or dicts of parameters.
    optimized_params = adam(
        objective_grad,
        combined_init_params,
        step_size=step_size,
        num_iters=num_epochs * num_batches,
        callback=print_perf,
    )


================================================
FILE: license.txt
================================================
The MIT License (MIT)

Copyright (c) 2025 by the President and Fellows of Harvard University

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: noxfile.py
================================================
import platform

import nox

NIGHTLY_INDEX_URL = "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple"
UV_NIGHTLY_ENV_VARS = {
    "UV_INDEX_URL": NIGHTLY_INDEX_URL,
    "UV_PRERELEASE": "allow",
    "UV_INDEX_STRATEGY": "first-index",
}

nox.needs_version = ">=2024.4.15"
nox.options.default_venv_backend = "uv|virtualenv"
nox.options.reuse_existing_virtualenvs = False
nox.options.error_on_external_run = True
# nox.options.sessions = ["lint", "validate-package", "tests"]
nox.options.sessions = ["tests"]


@nox.session(name="validate-package")
def check(session):
    """Build the sdist and wheel, then validate their metadata with twine."""
    packaging_tools = ("build", "twine")
    session.install(*packaging_tools, silent=False)
    # Build first so that dist/ contains the artifacts twine inspects.
    session.run("python", "-m", "build")
    session.run("twine", "check", "--strict", "dist/*")


@nox.session(name="tests", tags=["tests"])
def run_tests(session):
    """Install the test dependencies and the package, then run pytest with coverage."""
    project_config = nox.project.load_toml("pyproject.toml")
    test_requirements = nox.project.dependency_groups(project_config, "test")
    session.install(*test_requirements)
    # SciPy doesn't have wheels on PyPy, so only request the scipy extra elsewhere.
    on_pypy = platform.python_implementation() == "PyPy"
    editable_install_args = ("-e.",) if on_pypy else ("-e", ".[scipy]")
    session.install(*editable_install_args, silent=False)
    session.run("pytest", "--cov=autograd", "--cov-report=xml", "--cov-append", *session.posargs)


@nox.session(name="lint", reuse_venv=True)
def ruff(session):
    """Run every pre-commit hook over the whole repository, showing diffs on failure."""
    session.install("pre-commit", silent=False)
    hook_command = ("pre-commit", "run", "--all-files", "--show-diff-on-failure")
    session.run(*hook_command)


@nox.session(name="nightly-tests", tags=["tests"])
def run_nightly_tests(session):
    """Run the test suite against nightly builds of the numerical dependencies."""
    session.install("-e.", silent=False)
    project_config = nox.project.load_toml("pyproject.toml")
    session.install(*nox.project.dependency_groups(project_config, "test"))
    # SciPy doesn't have wheels on PyPy, so only NumPy is upgraded there.
    if platform.python_implementation() == "PyPy":
        nightly_packages = ("numpy",)
    else:
        nightly_packages = ("numpy", "scipy")
    # Upgrade to binary-only nightly wheels from the index set in UV_NIGHTLY_ENV_VARS.
    session.install(
        *nightly_packages, "--upgrade", "--only-binary", ":all:", silent=False, env=UV_NIGHTLY_ENV_VARS
    )
    session.run("pytest", "--cov=autograd", "--cov-report=xml", "--cov-append", *session.posargs)


================================================
FILE: pyproject.toml
================================================
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "autograd"
version = "1.8.0"
requires-python = ">=3.10"
description = "Efficiently computes derivatives of NumPy code."
readme = "README.md"
license = {file = "license.txt"}
authors = [
  {name = "Dougal Maclaurin", email = "maclaurin@physics.harvard.edu"},
  {name = "David Duvenaud", email = "duvenaud@cs.toronto.edu"},
  {name = "Matthew Johnson", email = "mattjj@csail.mit.edu"},
  {name = "Jamie Townsend", email = "j.h.n.townsend@uva.nl"},
]
maintainers = [
  {name = "Jamie Townsend", email = "j.h.n.townsend@uva.nl"},
  {name = "Fabian Joswig", email = "fabian.joswig@uni-muenster.de"},
  {name = "Agriya Khetarpal", email = "agriyakhetarpal@outlook.com"},
]
classifiers = [
  "Development Status :: 4 - Beta",
  "Intended Audience :: Information Technology",
  "Intended Audience :: Science/Research",
  "License :: OSI Approved :: MIT License",
  "Programming Language :: Python :: 3.10",
  "Programming Language :: Python :: 3.11",
  "Programming Language :: Python :: 3.12",
  "Programming Language :: Python :: 3.13",
  "Programming Language :: Python :: 3.14",
  "Topic :: Scientific/Engineering",
]
keywords = [
  "Automatic differentiation",
  "backpropagation",
  "gradients",
  "machine learning",
  "optimization",
  "neural networks",
  "Python",
  "NumPy",
  "SciPy",
]
dependencies = [
  "numpy<3",
]
# dynamic = ["version"]

[project.urls]
Source = "https://github.com/HIPS/autograd"

[project.optional-dependencies]
scipy = [
  "scipy",
]

[dependency-groups]
test = [
  "pytest",
  "pytest-cov",
  "pytest-xdist",
]
examples = ["matplotlib"]

[tool.coverage.run]
source = ["autograd"]

[tool.coverage.report]
show_missing = true

[tool.pytest.ini_options]
required_plugins = ["pytest-cov", "pytest-xdist"]
# TODO: generate HTML report, upload to CodeCov
addopts = "--color=yes -sra -n auto --cov=autograd --cov-report=xml --cov-report=term"

[tool.ruff]
extend-exclude = []
# TODO: stop ignoring these rules and fix the violations they flag
lint.extend-ignore = [
  "E731",
  "F401",
  "F403",
  "F841",
  "F821",
  "E721",
  "E722",
  "E741",
  "E402",
  "F811"
]
lint.extend-select = ["I", "W"]
line-length = 109


================================================
FILE: tests/_test_complexity.py
================================================
import time
import warnings

import autograd.numpy as np
from autograd import deriv, grad
from autograd.builtins import list as make_list


def timefunction(f):
    """Return the elapsed time, in seconds, of a single call to ``f()``.

    Uses ``time.perf_counter`` rather than ``time.time``: it is monotonic and
    has the highest available resolution, so short measurements are neither
    rounded to zero nor skewed by wall-clock adjustments.
    """
    start = time.perf_counter()
    f()
    return time.perf_counter() - start


def assert_linear_time(f):
    """Assert that ``f(10)`` takes roughly ten times as long as ``f(1)``.

    Fails when the ratio is outside (5, 20) and warns when it is outside (8, 12).
    """
    base_time = timefunction(lambda: f(1))
    scaled_time = timefunction(lambda: f(10))
    assert scaled_time > 5 * base_time, f"Too fast: f(1) takes {base_time}, f(10) takes {scaled_time}"
    assert scaled_time < 20 * base_time, f"Too slow: f(1) takes {base_time}, f(10) takes {scaled_time}"
    comfortably_linear = 8 * base_time < scaled_time < 12 * base_time
    if not comfortably_linear:
        warnings.warn("Borderline linearity. May fail on different hardware")


def test_array_creation():
    """Reverse-mode gradient through np.array built from N copies of x should scale linearly in N."""

    def fun(x, N):
        arr = [x for i in range(N)]
        return np.sum(np.array(arr))

    assert_linear_time(lambda N: grad(fun)(1.0, 200 * N))


def test_array_indexing():
    """Gradient of a sum over individually indexed array elements should scale linearly in length."""

    def fun(x):
        return sum([x[i] for i in range(len(x))])

    assert_linear_time(lambda N: grad(fun)(np.zeros(200 * N)))


def test_list_indexing():
    """Same as the array-indexing test, but the input is a plain Python list of floats."""

    def fun(x):
        return sum([x[i] for i in range(len(x))])

    assert_linear_time(lambda N: grad(fun)([0.0 for i in range(50 * N)]))


def test_list_creation():
    """`deriv` through autograd's list constructor with N copies of x should scale linearly in N."""

    def fun(x, N):
        return make_list(*[x for _ in range(N)])

    assert_linear_time(lambda N: deriv(fun)(0.0, 20 * N))


# This fails. Need to figure out why
def test_array_creation_fwd():
    """`deriv` counterpart of test_array_creation (array built from N copies of x)."""

    def fun(x, N):
        arr = [x for i in range(N)]
        return np.sum(np.array(arr))

    assert_linear_time(lambda N: deriv(fun)(1.0, 400 * N))


================================================
FILE: tests/check_examples_run.sh
================================================
#!/bin/bash
#
# Smoke-test every example script: run each one for at most 15 seconds and
# report which ones exited cleanly (or were still running at the timeout) and
# which ones failed.

# Export the variable so the child Python processes actually inherit it; a
# plain assignment is only visible to them if PYTHONPATH was already exported.
export PYTHONPATH=".:$PYTHONPATH"
# On Ctrl-C, forward the interrupt to the example that is running and abort.
trap 'kill -INT -$pid && exit 1' INT

working=()
failing=()

examples=$(find examples -name '*.py' -not -name '__init__.py')

echo 'Running all the examples...'
for f in $examples; do
    # The project requires Python >= 3.10, so the examples must run under
    # python3; with python2 every example is reported as broken.
    timeout 15s python3 "$f" > /dev/null 2>&1 & pid=$!
    wait $pid
    status=$?
    # 0: finished successfully; 124: still running when `timeout` stopped it.
    if [ "$status" -eq 0 ] || [ "$status" -eq 124 ]; then
        echo "$f" "seems to work"
        working+=("$f")
    # 137: the process was terminated with SIGKILL.
    elif [ "$status" -eq 137 ]; then
        echo "$f" "might be working, but had to be killed"
        working+=("$f")
    else
        echo "$f" "seems broken, try running manually"
        failing+=("$f")
    fi
done

# Summaries; the escape sequences colour the headings (cyan / red).
if [ "${#working[@]}" -ne 0 ]; then
    echo -e '\033[01;36m'
    echo "These seemed to WORK:"
    echo -en '\033[00m'
    printf '%s\n' "${working[@]}"
    echo
fi
if [ "${#failing[@]}" -ne 0 ]; then
    echo -e '\033[01;31m'
    echo "These seemed to FAIL:"
    echo -en '\033[00m'
    printf '%s\n' "${failing[@]}"
    echo
fi


================================================
FILE: tests/conftest.py
================================================
import numpy as np
import pytest


@pytest.fixture(autouse=True)
def random_seed():
    """Reseed NumPy's global random generator with 42 before every test (autouse fixture)."""
    np.random.seed(42)


================================================
FILE: tests/numpy_utils.py
================================================
import autograd.numpy.random as npr
from autograd.test_util import combo_check


def stat_check(fun, test_complex=True, **kwargs):
    """Run `combo_check` on a statistics-style function over several input shapes.

    Inputs cover a Python float, a random scalar and arrays of shape (3,),
    (2, 3) and (1, 3), combined with `axis` and `keepdims` options. When
    `test_complex` is true, complex inputs are checked as well. Extra
    `kwargs` are forwarded to `combo_check`.
    """
    # Tests functions that compute statistics, like sum, mean, etc
    # NOTE: the order of the npr draws below determines the test data for a given seed.
    x = 3.5
    A = npr.randn()
    B = npr.randn(3)
    C = npr.randn(2, 3)
    D = npr.randn(1, 3)
    # (0,) presumably selects argument 0 for gradient checking; verify against combo_check.
    check = combo_check(fun, (0,), **kwargs)
    check([x, A])
    check([B, C, D], axis=[None, 0], keepdims=[True, False])
    # axis=1 is only requested for the 2-D inputs.
    check([C, D], axis=[None, 0, 1], keepdims=[True, False])
    if test_complex:
        c = npr.randn() + 0.1j * npr.randn()
        E = npr.randn(2, 3) + 0.1j * npr.randn(2, 3)
        check([x, c, A])
        check([B, C, D, E], axis=[None, 0], keepdims=[True, False])


def unary_ufunc_check(fun, lims=[-2, 2], test_complex=True, **kwargs):
    """Run `combo_check` on a one-argument function with inputs rescaled into `lims`.

    Inputs are a scalar and arrays of shape (2,), (3, 2) and (1, 2); with
    `test_complex`, a complex scalar and a complex (3, 2) array are added.
    `lims` is only read, never mutated, so the list default is safe here.
    """
    # npr.rand draws lie in [0, 1); transform maps them linearly onto lims.
    scalar = transform(lims, 0.4)
    vector = transform(lims, npr.rand(2))
    mat = transform(lims, npr.rand(3, 2))
    mat2 = transform(lims, npr.rand(1, 2))
    check = combo_check(fun, (0,), **kwargs)
    check([scalar, vector, mat, mat2])
    if test_complex:
        comp = transform(lims, 0.4) + 0.1j * transform(lims, 0.3)
        matc = transform(lims, npr.rand(3, 2)) + 0.1j * npr.rand(3, 2)
        check([comp, matc])


def binary_ufunc_check(fun, lims_A=[-2, 2], lims_B=[-2, 2], test_complex=True, **kwargs):
    """Run `combo_check` on a two-argument function over scalar/vector/matrix inputs.

    The same base values are rescaled into `lims_A` for the first argument
    and into `lims_B` for the second, so both arguments can receive identical
    values when the limits match. Complex inputs are added when
    `test_complex` is true.
    """
    T_A = lambda x: transform(lims_A, x)
    T_B = lambda x: transform(lims_B, x)
    scalar = 0.6
    vector = npr.rand(2)
    mat = npr.rand(3, 2)
    mat2 = npr.rand(1, 2)
    check = combo_check(fun, (0, 1), **kwargs)
    check([T_A(scalar), T_A(vector), T_A(mat), T_A(mat2)], [T_B(scalar), T_B(vector), T_B(mat), T_B(mat2)])
    if test_complex:
        comp = 0.6 + 0.3j
        matc = npr.rand(3, 2) + 0.1j * npr.rand(3, 2)
        check(
            [T_A(scalar), T_A(comp), T_A(vector), T_A(matc), T_A(mat2)],
            [T_B(scalar), T_B(comp), T_B(vector), T_B(matc), T_B(mat2)],
        )


def binary_ufunc_check_no_same_args(fun, lims_A=[-2, 2], lims_B=[-2, 2], test_complex=True, **kwargs):
    """Variant of `binary_ufunc_check` that uses independent values for each argument.

    Separate scalars and separate random draws are generated for the first
    and second argument, so the two arguments are built from different data.
    """
    T_A = lambda x: transform(lims_A, x)
    T_B = lambda x: transform(lims_B, x)
    scalar1 = 0.6
    scalar2 = 0.7
    vector1 = npr.rand(2)
    vector2 = npr.rand(2)
    mat11 = npr.rand(3, 2)
    mat12 = npr.rand(3, 2)
    mat21 = npr.rand(1, 2)
    mat22 = npr.rand(1, 2)
    check = combo_check(fun, (0, 1), **kwargs)
    check(
        [T_A(scalar1), T_A(vector1), T_A(mat11), T_A(mat21)],
        [T_B(scalar2), T_B(vector2), T_B(mat12), T_B(mat22)],
    )
    if test_complex:
        comp1 = 0.6 + 0.3j
        comp2 = 0.1 + 0.2j
        matc1 = npr.rand(3, 2) + 0.1j * npr.rand(3, 2)
        matc2 = npr.rand(3, 2) + 0.1j * npr.rand(3, 2)
        check(
            [T_A(scalar1), T_A(comp1), T_A(vector1), T_A(matc1), T_A(mat21)],
            [T_B(scalar2), T_B(comp2), T_B(vector2), T_B(matc2), T_B(mat22)],
        )


def transform(lims, x):
    """Linearly map ``x`` so that 0 lands on ``lims[0]`` and 1 on ``lims[1]``."""
    lower, upper = lims[0], lims[1]
    width = upper - lower
    return x * width + lower


================================================
FILE: tests/profiling.py
================================================
from contextlib import contextmanager
from time import time

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad


@contextmanager
def tictoc(text=""):
    """Context manager that prints how long its body took to run.

    Prints a start marker on entry and, on exit, a stop marker containing
    `text` and the elapsed seconds. The stop marker is printed even when the
    body raises; the exception then propagates unchanged.
    """
    print("--- Start clock ---")
    t1 = time()
    try:
        yield
    finally:
        # Report the timing on every exit path, including exceptions.
        dt = time() - t1
        print(f"--- Stop clock {text}: {dt} seconds elapsed ---")


def fan_out_fan_in():
    """The 'Pearlmutter test'"""

    def fun(x):
        # Each step uses x twice (fan-out) and recombines it (fan-in), 10**4 times in a row.
        for i in range(10**4):
            x = (x + x) / 2.0
        return np.sum(x)

    # Time one gradient evaluation through the whole chain.
    with tictoc():
        grad(fun)(1.0)


def convolution():
    """Time a single forward call of autograd's `convolve` on random data."""
    # MNIST-scale convolution operation
    # Imported locally, so autograd.scipy.signal is only loaded when this benchmark runs.
    import autograd.scipy.signal

    convolve = autograd.scipy.signal.convolve
    dat = npr.randn(256, 3, 28, 28)
    kernel = npr.randn(3, 5, 5)
    with tictoc():
        convolve(dat, kernel, axes=([2, 3], [1, 2]), dot_axes=([1], [0]))


def dot_equivalent():
    """Time a single `np.tensordot` contraction of a 6-D array with a (3, 5, 5) kernel."""
    # Tensor contraction over axes (1, 3, 5) of the data and all axes of the kernel.
    # NOTE(review): presumably the dot-product equivalent of the convolution
    # timed in convolution() - confirm the intended correspondence.

    dat = npr.randn(256, 3, 24, 5, 24, 5)
    kernel = npr.randn(3, 5, 5)
    with tictoc():
        np.tensordot(dat, kernel, axes=[(1, 3, 5), (0, 1, 2)])


# fan_out_fan_in()
# convolution()
dot_equivalent()


================================================
FILE: tests/test_binary_ops.py
================================================
import itertools as it
import warnings

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad, value_and_grad
from autograd.test_util import check_grads

rs = npr.RandomState(0)


def arg_pairs():
    """Yield every ordered pair drawn from a scalar, a (4,) vector, a (3, 4) and a (1, 4) matrix.

    Because the pairs come from a product of the list with itself, some pairs
    contain the very same object twice. The random arrays are drawn from the
    module-level `rs`, so repeated calls produce different values.
    """
    scalar = 2.0
    vector = rs.randn(4)
    mat = rs.randn(3, 4)
    mat2 = rs.randn(1, 4)
    allargs = [scalar, vector, mat, mat2]
    yield from it.product(allargs, allargs)


def test_mul():
    """Check gradients of elementwise multiplication over all argument pairs."""
    fun = lambda x, y: x * y
    for arg1, arg2 in arg_pairs():
        check_grads(fun)(arg1, arg2)


def test_add():
    """Check gradients of elementwise addition over all argument pairs."""
    fun = lambda x, y: x + y
    for arg1, arg2 in arg_pairs():
        check_grads(fun)(arg1, arg2)


def test_sub():
    """Check gradients of elementwise subtraction over all argument pairs."""
    fun = lambda x, y: x - y
    for arg1, arg2 in arg_pairs():
        check_grads(fun)(arg1, arg2)


def test_div():
    """Check gradients of division, with both operands moved away from zero first."""
    fun = lambda x, y: x / y
    # sqrt(x**2 + 0.5) is always positive and at least sqrt(0.5).
    make_gap_from_zero = lambda x: np.sqrt(x**2 + 0.5)
    for arg1, arg2 in arg_pairs():
        arg1 = make_gap_from_zero(arg1)
        arg2 = make_gap_from_zero(arg2)
        check_grads(fun)(arg1, arg2)


def test_mod():
    """Check gradients of the modulo operator, skipping pairs made of the same object."""
    fun = lambda x, y: x % y
    # sqrt(x**2 + 0.5) is always positive and at least sqrt(0.5).
    make_gap_from_zero = lambda x: np.sqrt(x**2 + 0.5)
    for arg1, arg2 in arg_pairs():
        if arg1 is not arg2:  # Gradient undefined at x == y
            arg1 = make_gap_from_zero(arg1)
            arg2 = make_gap_from_zero(arg2)
            check_grads(fun)(arg1, arg2)


def test_pow():
    """Check gradients of exponentiation with a base forced to be at least 1.1."""
    fun = lambda x, y: x**y
    make_positive = lambda x: np.abs(x) + 1.1  # Numeric derivatives fail near zero
    for arg1, arg2 in arg_pairs():
        # Only the base is adjusted; the exponent is used as generated.
        arg1 = make_positive(arg1)
        check_grads(fun)(arg1, arg2)


def test_arctan2():
    """Check gradients of np.arctan2 over all argument pairs."""
    for arg1, arg2 in arg_pairs():
        check_grads(np.arctan2)(arg1, arg2)


def test_hypot():
    """Check gradients of np.hypot over all argument pairs, in reverse mode only."""
    for arg1, arg2 in arg_pairs():
        check_grads(np.hypot, modes=["rev"])(arg1, arg2)


def test_comparison_grads():
    """Gradients of sums of comparison results must be zero for both arguments.

    Covers all six comparison operators between the two arguments. The
    expected gradient is an all-zero array of the broadcast shape.
    """
    compare_funs = [
        # Fixed: this entry used to compare x with itself (x < x), so the
        # strict less-than comparison between x and y was never tested.
        lambda x, y: np.sum(x < y) + 0.0,
        lambda x, y: np.sum(x <= y) + 0.0,
        lambda x, y: np.sum(x > y) + 0.0,
        lambda x, y: np.sum(x >= y) + 0.0,
        lambda x, y: np.sum(x == y) + 0.0,
        lambda x, y: np.sum(x != y) + 0.0,
    ]

    # Record (and thereby silence) any warnings raised while differentiating.
    with warnings.catch_warnings(record=True):
        for arg1, arg2 in arg_pairs():
            zeros = (arg1 + arg2) * 0  # get correct shape
            for fun in compare_funs:
                assert np.all(grad(fun)(arg1, arg2) == zeros)
                assert np.all(grad(fun, argnum=1)(arg1, arg2) == zeros)


def test_comparison_values():
    """The value returned by `value_and_grad` must equal the plain function value.

    Covers all six comparison operators between the two arguments.
    """
    compare_funs = [
        # Fixed: this entry used to compare x with itself (x < x), so the
        # strict less-than comparison between x and y was never tested.
        lambda x, y: np.sum(x < y) + 0.0,
        lambda x, y: np.sum(x <= y) + 0.0,
        lambda x, y: np.sum(x > y) + 0.0,
        lambda x, y: np.sum(x >= y) + 0.0,
        lambda x, y: np.sum(x == y) + 0.0,
        lambda x, y: np.sum(x != y) + 0.0,
    ]

    for arg1, arg2 in arg_pairs():
        for fun in compare_funs:
            fun_val = fun(arg1, arg2)
            fun_val_from_grad, _ = value_and_grad(fun)(arg1, arg2)
            assert fun_val == fun_val_from_grad, (fun_val, fun_val_from_grad)


================================================
FILE: tests/test_builtins.py
================================================
import autograd.numpy as np
from autograd import grad
from autograd.builtins import isinstance


def test_isinstance():
    """autograd's `isinstance` must give the expected answer both outside and inside `grad`."""

    def checker(ex, type_, truthval):
        assert isinstance(ex, type_) == truthval
        # Return a float so the function can be passed to grad.
        return 1.0

    # Each entry: [type or tuple of types, positive examples, negative examples].
    examples = [
        [list, [[]], [()]],
        [np.ndarray, [np.zeros(1)], [[]]],
        [(tuple, list), [[], ()], [np.zeros(1)]],
    ]

    for type_, positive_examples, negative_examples in examples:
        for ex in positive_examples:
            checker(ex, type_, True)
            grad(checker)(ex, type_, True)

        for ex in negative_examples:
            checker(ex, type_, False)
            grad(checker)(ex, type_, False)


================================================
FILE: tests/test_complex.py
================================================
import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad
from autograd.test_util import check_grads

npr.seed(1)


def test_real_type():
    """The gradient of sum(real(x)) is real for a real input and complex for a complex input."""
    fun = lambda x: np.sum(np.real(x))
    df = grad(fun)
    assert np.isrealobj(df(2.0))
    assert np.iscomplexobj(df(1.0j))


def test_real_if_close_type():
    """Check the gradient's real/complex type for real and complex inputs.

    NOTE(review): despite the test name, the function under test calls
    np.real, not np.real_if_close, so this currently duplicates
    test_real_type. Confirm the intent before changing it: with
    np.real_if_close a complex input may yield a complex output, which
    `grad` might not accept.
    """
    fun = lambda x: np.sum(np.real(x))
    df = grad(fun)
    assert np.isrealobj(df(1.0))
    assert np.iscomplexobj(df(1.0j))


def test_angle_real():
    """Check first- and second-order gradients of np.angle at a random real input."""
    fun = lambda x: np.angle(x)
    d_fun = lambda x: grad(fun)(x)
    check_grads(fun)(npr.rand())
    check_grads(d_fun)(npr.rand())


def test_angle_complex():
    """Check first- and second-order gradients of np.angle at a random complex input."""
    fun = lambda x: np.angle(x)
    d_fun = lambda x: grad(fun)(x)
    check_grads(fun)(npr.rand() + 1j * npr.rand())
    check_grads(d_fun)(npr.rand() + 1j * npr.rand())


def test_abs_real():
    """Check first- and second-order gradients of np.abs at fixed positive real inputs."""
    fun = lambda x: np.abs(x)
    d_fun = lambda x: grad(fun)(x)
    check_grads(fun)(1.1)
    check_grads(d_fun)(2.1)


def test_abs_complex():
    """Check first- and second-order gradients of np.abs at fixed complex inputs."""
    fun = lambda x: np.abs(x)
    d_fun = lambda x: grad(fun)(x)
    check_grads(fun)(1.1 + 1.2j)
    check_grads(d_fun)(1.1 + 1.3j)


================================================
FILE: tests/test_core.py
================================================
"""This file doesn't import the numpy wrapper, to check if core works
on basic operations even without numpy."""

import warnings

from autograd.core import make_vjp
from autograd.wrap_util import unary_to_nary


@unary_to_nary
def grad(fun, x):
    """Minimal gradient operator built directly on `make_vjp`.

    Builds the vector-Jacobian product of `fun` at `x` and evaluates it at
    1.0. Defined here so this module does not need autograd's numpy wrapper.
    """
    vjp, _ = make_vjp(fun, x)
    return vjp(1.0)


# Non-numpy gradient checking functions.
def nd(f, x, eps=1e-4):
    """Central finite-difference estimate of the derivative of ``f`` at ``x`` with step ``eps``."""
    half_step = eps / 2
    value_ahead = f(x + half_step)
    value_behind = f(x - half_step)
    return (value_ahead - value_behind) / eps


def check_close(a, b, atol=1e-4, rtol=1e-4):
    """Assert that ``a`` and ``b`` differ by less than ``atol + rtol * abs(b)``."""
    difference = a - b
    allowed_error = atol + rtol * abs(b)
    assert abs(difference) < allowed_error, f"Diffs are: {difference}"


def check_binary_func(fun, independent=False):
    """Compare autograd's gradient of a binary function with a finite-difference estimate.

    Both partial derivatives are checked at the fixed point (0.7, 1.8).
    When `independent` is true, warnings raised inside the block are recorded
    instead of being shown.
    """
    with warnings.catch_warnings(record=independent) as w:
        x, y = 0.7, 1.8
        # Derivative with respect to the first argument.
        a = grad(fun)(x, y)
        b = nd(lambda x: fun(x, y), x)
        check_close(a, b)

        # Derivative with respect to the second argument (argnum 1).
        a = grad(fun, 1)(x, y)
        b = nd(lambda y: fun(x, y), y)
        check_close(a, b)


def test_add():
    """Gradient check for scalar addition."""
    check_binary_func(lambda x, y: x + y)


def test_sub():
    """Gradient check for scalar subtraction."""
    check_binary_func(lambda x, y: x - y)


def test_div():
    """Gradient check for scalar division."""
    check_binary_func(lambda x, y: x / y)


def test_mul():
    """Gradient check for scalar multiplication."""
    check_binary_func(lambda x, y: x * y)


def test_pow():
    """Gradient check for scalar exponentiation."""
    check_binary_func(lambda x, y: x**y)


def test_mod():
    """Gradient check for the scalar modulo operator."""
    check_binary_func(lambda x, y: x % y)


def test_eq():
    """Gradient check for `==`, with warnings recorded (independent=True)."""
    check_binary_func(lambda x, y: x == y, independent=True)


def test_neq():
    """Gradient check for `!=`, with warnings recorded (independent=True)."""
    check_binary_func(lambda x, y: x != y, independent=True)


def test_leq():
    """Gradient check for `<=`, with warnings recorded (independent=True)."""
    check_binary_func(lambda x, y: x <= y, independent=True)


def test_geq():
    """Gradient check for `>=`, with warnings recorded (independent=True)."""
    check_binary_func(lambda x, y: x >= y, independent=True)


def test_lt():
    """Gradient check for `<`, with warnings recorded (independent=True)."""
    check_binary_func(lambda x, y: x < y, independent=True)


def test_gt():
    """Gradient check for `>`, with warnings recorded (independent=True)."""
    check_binary_func(lambda x, y: x > y, independent=True)


================================================
FILE: tests/test_dict.py
================================================
import operator as op

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import dict as ag_dict
from autograd import grad
from autograd import isinstance as ag_isinstance
from autograd.test_util import check_grads

npr.seed(0)


def test_getter():
    """Gradients with respect to a dict input come back as a dict with the same keys.

    A key read twice gets twice the gradient; a key never read gets zeros.
    """

    def fun(input_dict):
        A = np.sum(input_dict["item_1"])
        # "item_2" is deliberately read twice so that its gradient accumulates.
        B = np.sum(input_dict["item_2"])
        C = np.sum(input_dict["item_2"])
        return A + B + C

    d_fun = grad(fun)
    input_dict = {"item_1": npr.randn(5, 6), "item_2": npr.randn(4, 3), "item_X": npr.randn(2, 4)}

    result = d_fun(input_dict)
    assert np.allclose(result["item_1"], np.ones((5, 6)))
    assert np.allclose(result["item_2"], 2 * np.ones((4, 3)))
    assert np.allclose(result["item_X"], np.zeros((2, 4)))


def test_grads():
    """Check first-order gradients, and gradients of a function of the gradient, for dict inputs."""

    def fun(input_dict):
        A = np.sum(np.sin(input_dict["item_1"]))
        B = np.sum(np.cos(input_dict["item_2"]))
        return A + B

    def d_fun(input_dict):
        # A scalar function of the gradient dict, used to exercise second-order differentiation.
        g = grad(fun)(input_dict)
        A = np.sum(g["item_1"])
        B = np.sum(np.sin(g["item_1"]))
        C = np.sum(np.sin(g["item_2"]))
        return A + B + C

    input_dict = {"item_1": npr.randn(5, 6), "item_2": npr.randn(4, 3), "item_X": npr.randn(2, 4)}

    check_grads(fun)(input_dict)
    check_grads(d_fun)(input_dict)


def test_iter():
    """Check gradients when the function iterates over the dict's keys."""

    def fun(input_dict):
        A = 0.0
        B = 0.0
        # Iterating over the sorted dict yields its keys in a fixed order.
        for i, k in enumerate(sorted(input_dict)):
            A = A + np.sum(np.sin(input_dict[k])) * (i + 1.0)
            B = B + np.sum(np.cos(input_dict[k]))
        return A + B

    def d_fun(input_dict):
        # A scalar function of the gradient dict, used to exercise second-order differentiation.
        g = grad(fun)(input_dict)
        A = np.sum(g["item_1"])
        B = np.sum(np.sin(g["item_1"]))
        C = np.sum(np.sin(g["item_2"]))
        return A + B + C

    input_dict = {"item_1": npr.randn(5, 6), "item_2": npr.randn(4, 3), "item_X": npr.randn(2, 4)}

    check_grads(fun)(input_dict)
    check_grads(d_fun)(input_dict)


def test_items_values_keys():
    """Check gradients when the function uses the dict's items(), values() and keys()."""

    def fun(input_dict):
        A = 0.0
        B = 0.0
        # items(), sorted by key so the per-entry weights (i + 1.0) are deterministic.
        for i, (k, v) in enumerate(sorted(input_dict.items(), key=op.itemgetter(0))):
            A = A + np.sum(np.sin(v)) * (i + 1.0)
            B = B + np.sum(np.cos(v))
        for v in input_dict.values():
            A = A + np.sum(np.sin(v))
        for k in sorted(input_dict.keys()):
            A = A + np.sum(np.cos(input_dict[k]))
        return A + B

    def d_fun(input_dict):
        # A scalar function of the gradient dict, used to exercise second-order differentiation.
        g = grad(fun)(input_dict)
        A = np.sum(g["item_1"])
        B = np.sum(np.sin(g["item_1"]))
        C = np.sum(np.sin(g["item_2"]))
        return A + B + C

    input_dict = {"item_1": npr.randn(5, 6), "item_2": npr.randn(4, 3), "item_X": npr.randn(2, 4)}

    check_grads(fun)(input_dict)
    check_grads(d_fun)(input_dict)


def test_get():
    """Check gradients through dict.get with a default, for both the dict and the default."""

    def fun(d, x):
        return d.get("item_1", x) ** 2

    # Key present, key absent from a non-empty dict, and empty dict.
    check_grads(fun, argnum=(0, 1))({"item_1": 3.0}, 2.0)
    check_grads(fun, argnum=(0, 1))({"item_2": 4.0}, 2.0)
    check_grads(fun, argnum=(0, 1))({}, 2.0)


def test_make_dict():
    """Reverse-mode gradient checks for several ``ag_dict`` constructor forms."""

    def from_pairs_and_kwargs(x):
        return ag_dict([("a", x)], b=x)

    def from_mapping(x):
        return ag_dict({"a": x})

    check_grads(from_pairs_and_kwargs, modes=["rev"])(1.0)
    check_grads(from_mapping, modes=["rev"])(1.0)

    # check some other forms of the constructor
    ag_dict()
    ag_dict(())
    ag_dict({})


def test_isinstance():
    """``ag_isinstance`` must recognise a dict argument both outside and inside ``grad``."""

    def fun(x):
        for expected_type in (dict, ag_dict):
            assert ag_isinstance(x, expected_type)
        return x["x"]

    sample = {"x": 1.0}
    fun(sample)
    grad(fun)({"x": 1.0})


================================================
FILE: tests/test_direct.py
================================================
"""
Set of tests that are as explicit as possible, in case the test helpers like
autograd.test_util break and start letting everything pass
"""

import numpy as onp
import pytest

import autograd.numpy as np
from autograd import deriv, grad, holomorphic_grad


def test_grad():
    """``grad`` of a scalar function at 1.3 must match a hard-coded value to 1e-6."""
    expected = 3.190948746871
    tol = 1e-6

    fun = lambda x: (x + np.sin(x**2)) * x

    assert expected - tol < grad(fun)(1.3) < expected + tol


def test_deriv():
    """``deriv`` of a scalar function at 1.3 must match a hard-coded value to 1e-6."""
    expected = 3.190948746871
    tol = 1e-6

    fun = lambda x: (x + np.sin(x**2)) * x

    assert expected - tol < deriv(fun)(1.3) < expected + tol


def test_grad_complex_output():
    """``grad`` of a function returning a complex value is expected to raise TypeError."""
    complex_scale = lambda x: x * (1.0 + 0.2j)

    with pytest.raises(TypeError):
        grad(complex_scale)(1.0)


def test_holomorphic_grad():
    """``holomorphic_grad`` of ``x * (1 + 0.2j)`` should be close to ``1 + 0.2j``."""
    complex_scale = lambda x: x * (1.0 + 0.2j)

    result = holomorphic_grad(complex_scale)(1.0 + 0.0j)
    real_part = onp.real(result)
    imag_part = onp.imag(result)
    assert 0.9999 < real_part < 1.0001
    assert 0.1999 < imag_part < 0.2001


================================================
FILE: tests/test_fft.py
================================================
from functools import partial

import pytest

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd.test_util import check_grads

# Seed the random generator used for the random test inputs in this module.
npr.seed(1)

### fwd mode not yet implemented
# Rebind check_grads so calls in this module default to modes=["rev"].
check_grads = partial(check_grads, modes=["rev"])


def test_fft():
    """Gradient check for ``np.fft.fft`` on a random 5x5 real matrix."""
    size = 5
    sample = npr.randn(size, size)
    check_grads(lambda x: np.fft.fft(x))(sample)


def test_fft_ortho():
    """Gradient check for ``np.fft.fft`` with ``norm="ortho"`` on a random 5x5 matrix."""
    size = 5
    sample = npr.randn(size, size)
    check_grads(lambda x: np.fft.fft(x, norm="ortho"))(sample)


def test_fft_axis():
    """Gradient check for ``np.fft.fft`` along ``axis=0`` of a random 5x5 matrix."""
    size = 5
    sample = npr.randn(size, size)
    check_grads(lambda x: np.fft.fft(x, axis=0))(sample)


def match_complex(fft_fun, mat):
    """Return an input suitable for ``fft_fun``.

    When the function's name starts with ``"ir"``, ``mat`` is passed through the
    ``np.fft`` function named by dropping the leading ``"i"``; otherwise ``mat``
    is returned unchanged.
    """
    name = fft_fun.__name__
    if not name.startswith("ir"):
        return mat
    # ensure hermitian by doing a fft
    forward = getattr(np.fft, name[1:])
    return forward(mat)


def check_fft_n(fft_fun, D, n):
    """Gradient-check ``fft_fun`` called with length argument ``D + n`` on a DxD input."""
    target_len = lambda: D + n

    def fun(x):
        return fft_fun(x, target_len())

    sample = match_complex(fft_fun, npr.randn(D, D))
    check_grads(fun)(sample)


def test_fft_n_smaller():
    """``fft`` with an output length 2 shorter than the 5-wide input."""
    check_fft_n(np.fft.fft, D=5, n=-2)


def test_fft_n_bigger():
    """``fft`` with an output length 2 longer than the 5-wide input."""
    check_fft_n(np.fft.fft, D=5, n=2)


def test_ifft_n_smaller():
    """``ifft`` with an output length 2 shorter than the 5-wide input."""
    check_fft_n(np.fft.ifft, D=5, n=-2)


def test_ifft_n_bigger():
    """``ifft`` with an output length 2 longer than the 5-wide input."""
    check_fft_n(np.fft.ifft, D=5, n=2)


def test_rfft_n_smaller():
    """``rfft`` called with ``n = 4 - 2`` on a 4x4 input."""
    check_fft_n(np.fft.rfft, D=4, n=-2)


def test_rfft_n_bigger():
    """``rfft`` called with ``n = 4 + 2`` on a 4x4 input."""
    check_fft_n(np.fft.rfft, D=4, n=2)


def test_irfft_n_smaller():
    """``irfft`` called with ``n = 4 - 2``; the input comes from ``match_complex``."""
    check_fft_n(np.fft.irfft, D=4, n=-2)


def test_irfft_n_bigger():
    """``irfft`` called with ``n = 4 + 2``; the input comes from ``match_complex``."""
    check_fft_n(np.fft.irfft, D=4, n=2)


def check_fft_s(fft_fun, D):
    """Gradient-check ``fft_fun`` with ``s=[D + 2, D - 2]`` over axes 0 and 2 of a DxDxD input."""
    shape_arg = [D + 2, D - 2]
    axes_arg = [0, 2]

    def fun(x):
        return fft_fun(x, s=shape_arg, axes=axes_arg)

    sample = match_complex(fft_fun, npr.randn(D, D, D) / 10.0)
    check_grads(fun)(sample)


def test_fft2_s():
    """``fft2`` with explicit ``s`` and ``axes`` on a 5x5x5 input."""
    check_fft_s(np.fft.fft2, D=5)


def test_ifft2_s():
    """``ifft2`` with explicit ``s`` and ``axes`` on a 5x5x5 input."""
    check_fft_s(np.fft.ifft2, D=5)


def test_fftn_s():
    """``fftn`` with explicit ``s`` and ``axes`` on a 5x5x5 input."""
    check_fft_s(np.fft.fftn, D=5)


def test_ifftn_s():
    """``ifftn`` with explicit ``s`` and ``axes`` on a 5x5x5 input."""
    check_fft_s(np.fft.ifftn, D=5)


def test_rfft2_s():
    """``rfft2`` with explicit ``s`` and ``axes`` on a 4x4x4 input."""
    check_fft_s(np.fft.rfft2, D=4)


def test_irfft2_s():
    """``irfft2`` with explicit ``s`` and ``axes``; the input comes from ``match_complex``."""
    check_fft_s(np.fft.irfft2, D=4)


def test_rfftn_s():
    """``rfftn`` with explicit ``s`` and ``axes`` on a 4x4x4 input."""
    check_fft_s(np.fft.rfftn, D=4)


def test_irfftn_s():
    """``irfftn`` with explicit ``s`` and ``axes``; the input comes from ``match_complex``."""
    check_fft_s(np.fft.irfftn, D=4)


## TODO: fft gradient not implemented for repeated axes
# def test_fft_repeated_axis():
#     D = 5
#     for fft_fun in (np.fft.fft2, np.fft.ifft2, np.fft.fftn, np.fft.ifftn):
#         def fun(x):
#             return fft_fun(x, s=s, axes=axes)
#
#         mat = npr.randn(D, D, D) / 10.0
#         s = [D + 2, D - 2]
#         axes = [0, 0]
#
#         check_grads(fun)(mat)


def test_ifft():
    """Gradient check for ``np.fft.ifft`` on a random 5x5 real matrix."""
    size = 5
    sample = npr.randn(size, size)
    check_grads(lambda x: np.fft.ifft(x))(sample)


def test_fft2():
    """Gradient check for ``np.fft.fft2`` on a scaled random 5x5 matrix."""
    size = 5
    sample = npr.randn(size, size) / 10.0
    check_grads(lambda x: np.fft.fft2(x))(sample)


def test_ifft2():
    """Gradient check for ``np.fft.ifft2`` on a random 5x5 matrix."""
    size = 5
    sample = npr.randn(size, size)
    check_grads(lambda x: np.fft.ifft2(x))(sample)


def test_fftn():
    """Gradient check for ``np.fft.fftn`` on a scaled random 5x5 matrix."""
    size = 5
    sample = npr.randn(size, size) / 10.0
    check_grads(lambda x: np.fft.fftn(x))(sample)


def test_ifftn():
    """Gradient check for ``np.fft.ifftn`` on a random 5x5 matrix."""
    size = 5
    sample = npr.randn(size, size)
    check_grads(lambda x: np.fft.ifftn(x))(sample)


def test_rfft():
    """Gradient check for ``np.fft.rfft`` on a scaled random 4x4 matrix."""
    size = 4
    sample = npr.randn(size, size) / 10.0
    check_grads(lambda x: np.fft.rfft(x))(sample)


def test_rfft_ortho():
    """Gradient check for ``np.fft.rfft`` with ``norm="ortho"`` on a 4x4 matrix."""
    size = 4
    sample = npr.randn(size, size) / 10.0
    check_grads(lambda x: np.fft.rfft(x, norm="ortho"))(sample)


def test_rfft_axes():
    """Gradient check for ``np.fft.rfft`` along ``axis=0`` of a 4x4 matrix."""
    size = 4
    sample = npr.randn(size, size) / 10.0
    check_grads(lambda x: np.fft.rfft(x, axis=0))(sample)


def test_irfft():
    """Gradient check for ``np.fft.irfft`` on the ``rfft`` of a random 4x4 matrix."""
    size = 4
    real_sample = npr.randn(size, size) / 10.0
    # ensure hermitian by doing a fft
    spectrum = np.fft.rfft(real_sample)
    check_grads(lambda x: np.fft.irfft(x))(spectrum)


def test_irfft_ortho():
    """Gradient check for ``np.fft.irfft`` with ``norm="ortho"`` on an ``rfft`` output."""
    size = 4
    real_sample = npr.randn(size, size) / 10.0
    # ensure hermitian by doing a fft
    spectrum = np.fft.rfft(real_sample)
    check_grads(lambda x: np.fft.irfft(x, norm="ortho"))(spectrum)


def test_rfft2():
    """Gradient check for ``np.fft.rfft2`` on a scaled random 4x4 matrix."""
    size = 4
    sample = npr.randn(size, size) / 10.0
    check_grads(lambda x: np.fft.rfft2(x))(sample)


def test_irfft2():
    """Gradient check for ``np.fft.irfft2`` on the ``rfft2`` of a random 4x4 matrix."""
    size = 4
    real_sample = npr.randn(size, size) / 10.0
    # ensure hermitian by doing a fft
    spectrum = np.fft.rfft2(real_sample)
    check_grads(lambda x: np.fft.irfft2(x))(spectrum)


def test_rfftn():
    """Gradient check for ``np.fft.rfftn`` on a scaled random 4x4x4 array."""
    size = 4
    sample = npr.randn(size, size, size) / 10.0
    check_grads(lambda x: np.fft.rfftn(x))(sample)


def test_rfftn_odd_not_implemented():
    """Gradient-checking ``np.fft.rfftn`` on a 5x5x5 input is expected to raise NotImplementedError."""
    size = 5
    sample = npr.randn(size, size, size) / 10.0
    rfftn_of = lambda x: np.fft.rfftn(x)
    with pytest.raises(NotImplementedError):
        check_grads(rfftn_of)(sample)


def test_rfftn_subset():
    """Gradient check for a fancy-indexed subset of the ``np.fft.rfftn`` output."""
    size = 4
    sample = npr.randn(size, size, size) / 10.0
    check_grads(lambda x: np.fft.rfftn(x)[(0, 1, 0), (3, 3, 2)])(sample)


def test_rfftn_axes():
    """Gradient check for ``np.fft.rfftn`` restricted to ``axes=(0, 2)``."""
    size = 4
    sample = npr.randn(size, size, size) / 10.0
    check_grads(lambda x: np.fft.rfftn(x, axes=(0, 2)))(sample)


def test_irfftn():
    """Gradient check for ``np.fft.irfftn`` on the ``rfftn`` of a random 4x4x4 array."""
    size = 4
    real_sample = npr.randn(size, size, size) / 10.0
    # ensure hermitian by doing a fft
    spectrum = np.fft.rfftn(real_sample)
    check_grads(lambda x: np.fft.irfftn(x))(spectrum)


def test_irfftn_subset():
    """Gradient check for a fancy-indexed subset of the ``np.fft.irfftn`` output."""
    size = 4
    real_sample = npr.randn(size, size, size) / 10.0
    # ensure hermitian by doing a fft
    spectrum = np.fft.rfftn(real_sample)
    check_grads(lambda x: np.fft.irfftn(x)[(0, 1, 0), (3, 3, 2)])(spectrum)


def test_fftshift():
    """Gradient check for ``np.fft.fftshift`` on an odd-sized (5x5) matrix."""
    size = 5
    sample = npr.randn(size, size) / 10.0
    check_grads(lambda x: np.fft.fftshift(x))(sample)


def test_fftshift_even():
    """Gradient check for ``np.fft.fftshift`` on an even-sized (4x4) matrix."""
    size = 4
    sample = npr.randn(size, size) / 10.0
    check_grads(lambda x: np.fft.fftshift(x))(sample)


def test_fftshift_axes():
    """Gradient check for ``np.fft.fftshift`` with ``axes=1`` on a 5x5 matrix."""
    size = 5
    sample = npr.randn(size, size) / 10.0
    check_grads(lambda x: np.fft.fftshift(x, axes=1))(sample)


def test_ifftshift():
    """Gradient check for ``np.fft.ifftshift`` on an odd-sized (5x5) matrix."""
    size = 5
    sample = npr.randn(size, size)
    check_grads(lambda x: np.fft.ifftshift(x))(sample)


def test_ifftshift_even():
    """Gradient check for ``np.fft.ifftshift`` on an even-sized (4x4) matrix."""
    size = 4
    sample = npr.randn(size, size)
    check_grads(lambda x: np.fft.ifftshift(x))(sample)


def test_ifftshift_axes():
    """Gradient check for ``np.fft.ifftshift`` with ``axes=1`` on a 5x5 matrix."""
    size = 5
    sample = npr.randn(size, size)
    check_grads(lambda x: np.fft.ifftshift(x, axes=1))(sample)


================================================
FILE: tests/test_graphs.py
================================================
import warnings

import pytest

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad
from autograd.test_util import check_grads

npr.seed(1)


def test_grad_fanout():
    """Gradient-check a function that uses its input twice, and its gradient."""

    def fun(x):
        return np.sin(np.sin(x) + np.sin(x))

    first_derivative = grad(fun)
    check_grads(fun)(npr.randn())
    check_grads(first_derivative)(npr.rand())


def test_grad_const():
    """The gradient of a constant function should be (close to) zero."""

    def constant(x):
        return 1.0

    # Warnings raised inside this block are ignored.
    with warnings.catch_warnings(record=True):
        warnings.simplefilter("ignore")
        d_constant = grad(constant)
        assert np.allclose(d_constant(2.0), 0.0)


def test_grad_identity():
    """First derivative of the identity is 1 and its second derivative is 0."""

    def identity(x):
        return x

    first = grad(identity)
    second = grad(first)
    assert np.allclose(first(2.0), 1.0)
    assert np.allclose(second(2.0), 0.0)


def test_hess_vector_prod():
    """Gradient-check a scalar function and a function built from its gradient.

    ``vector_product`` applies ``sin`` to the dot product of ``v`` with the
    gradient of ``fun`` at ``x``; checking its gradients exercises
    differentiation through an inner ``grad`` call.
    """
    npr.seed(1)
    randv = npr.randn(10)

    def fun(x):
        return np.sin(np.dot(x, randv))

    df = grad(fun)

    def vector_product(x, v):
        return np.sin(np.dot(v, df(x)))

    # The original also built ``grad(vector_product)`` but never used it; the
    # unused binding has been dropped.
    A = npr.randn(10)
    B = npr.randn(10)
    check_grads(fun)(A)
    check_grads(vector_product)(A, B)


def test_enclosing_scope_ref():
    """Inner ``grad`` over a closure that references the outer argument (``x * y``)."""

    def fun(x):
        def inner_fun(y):
            return x * y

        inner_grad = grad(inner_fun)(2.0)
        return x * inner_grad

    check_grads(fun)(1.0)


def test_enclosing_scope_ref_2():
    """Same as the closure test above in spirit, with the product written as ``y * x``."""

    def fun(x):
        def inner_fun(y):
            return y * x

        inner_grad = grad(inner_fun)(2.0)
        return x * inner_grad

    check_grads(fun)(1.0)


def test_mutating_outgrad():
    """Gradient-check a small graph in which intermediate values feed several sums."""

    def fun(a):
        shifted = a + 1.0
        shifted_more = shifted + 1.5
        combined = a + shifted
        return combined + shifted_more

    sample = npr.randn(5)
    check_grads(fun)(sample)


def test_mutating_outgrad_from_indexing():
    """Like ``test_mutating_outgrad``, but one branch goes through indexing (``b[0]``)."""

    def fun(a):
        shifted = a + 1.0
        first_plus = shifted[0] + 1.5
        combined = a + shifted
        return combined + first_plus

    sample = npr.randn(5)
    check_grads(fun)(sample)


def test_complex_mutating_outgrad_from_indexing():
    """Indexing-based fan-out with a complex intermediate, checked for ``fun`` and its gradient."""

    def fun(a):
        shifted = a + 1.0j
        first_plus = shifted[0] + 1.5
        combined = a + shifted
        total = combined + first_plus
        return np.sum(np.sin(np.real(total)))

    def d_fun(x):
        return grad(fun)(x)

    sample = npr.randn(5)
    check_grads(fun)(sample)
    check_grads(d_fun)(sample)


def test_complex_separate_real_and_imaginary():
    """Gradient-check a function that treats real and imaginary parts separately."""

    def fun(a):
        real_part, imag_part = np.real(a), np.imag(a)
        mixed = np.abs(real_part) ** 1.4 + np.abs(imag_part) ** 1.3
        return np.sum(np.sin(mixed))

    def d_fun(x):
        return grad(fun)(x)

    sample = npr.randn(5, 3) + 0.1j * npr.randn(5, 3)
    check_grads(fun)(sample)
    check_grads(d_fun)(sample)


def test_third_derivative():
    """Gradient-check a function and its first, second and third derivatives.

    Each derivative is built from the previous one (``grad(df)``, ``grad(ddf)``).
    The earlier version assigned ``grad(fun)`` to all three names, so only the
    first derivative was ever exercised.
    """
    fun = lambda x: np.sin(np.sin(x) + np.sin(x))
    df = grad(fun)
    ddf = grad(df)
    dddf = grad(ddf)
    check_grads(fun)(npr.randn())
    check_grads(df)(npr.rand())
    check_grads(ddf)(npr.rand())
    check_grads(dddf)(npr.rand())


def test_third_derivative_other_args():
    """Gradient-check mixed higher derivatives of a two-argument function.

    The chain differentiates with respect to argument 0, then 1, then 0 again.
    The earlier version applied ``grad`` to ``fun`` each time, so no derivative
    beyond the first was actually constructed.
    """
    fun = lambda x, y: np.sin(np.sin(x) + np.sin(y))
    df = grad(fun)
    ddf = grad(df, 1)
    dddf = grad(ddf)
    check_grads(fun)(npr.randn(), npr.randn())
    check_grads(df)(npr.randn(), npr.randn())
    check_grads(ddf)(npr.randn(), npr.randn())
    check_grads(dddf)(npr.randn(), npr.randn())


def test_third_derivative_other_args2():
    """Gradient-check mixed higher derivatives of a two-argument function.

    The chain differentiates with respect to argument 1, then 0, then 1 again.
    The earlier version applied ``grad`` to ``fun`` each time, so no derivative
    beyond the first was actually constructed.
    """
    fun = lambda x, y: np.sin(np.sin(x) + np.sin(y))
    df = grad(fun, 1)
    ddf = grad(df)
    dddf = grad(ddf, 1)
    check_grads(fun)(npr.randn(), npr.randn())
    check_grads(df)(npr.randn(), npr.randn())
    check_grads(ddf)(npr.randn(), npr.randn())
    check_grads(dddf)(npr.randn(), npr.randn())


def test_singleton_array_output():
    """Gradient checks for a full ``np.sum`` with ``keepdims=True``."""

    def fun(x):
        return np.sum(np.sin(x), keepdims=True)

    def summed_grad(x):
        return np.sum(grad(fun)(x))

    check_grads(fun)(npr.randn(3, 3))
    check_grads(summed_grad)(npr.randn(3, 3))


def test_singleton_array_output_axis0():
    """Gradient checks for ``np.sum`` over axis 0 of a (3, 1) input, ``keepdims=False``."""

    def fun(x):
        return np.sum(np.sin(x), axis=0, keepdims=False)

    def summed_grad(x):
        return np.sum(grad(fun)(x))

    check_grads(fun)(npr.randn(3, 1))
    check_grads(summed_grad)(npr.randn(3, 1))


def test_singleton_array_output_axis1():
    """Gradient checks for ``np.sum`` over axis 1 of a (1, 3) input, ``keepdims=False``."""

    def fun(x):
        return np.sum(np.sin(x), axis=1, keepdims=False)

    def summed_grad(x):
        return np.sum(grad(fun)(x))

    check_grads(fun)(npr.randn(1, 3))
    check_grads(summed_grad)(npr.randn(1, 3))


def test_singleton_array_output_axis0_keepdims():
    """Gradient checks for ``np.sum`` over axis 0 of a (3, 1) input, ``keepdims=True``."""

    def fun(x):
        return np.sum(np.sin(x), axis=0, keepdims=True)

    def summed_grad(x):
        return np.sum(grad(fun)(x))

    check_grads(fun)(npr.randn(3, 1))
    check_grads(summed_grad)(npr.randn(3, 1))


def test_singleton_array_output_axis1_keepdims():
    """Gradient checks for ``np.sum`` over axis 1 of a (1, 3) input, ``keepdims=True``."""

    def fun(x):
        return np.sum(np.sin(x), axis=1, keepdims=True)

    def summed_grad(x):
        return np.sum(grad(fun)(x))

    check_grads(fun)(npr.randn(1, 3))
    check_grads(summed_grad)(npr.randn(1, 3))


def test_assignment_raises_error():
    """Item assignment on a traced array is expected to raise TypeError."""

    def fun(A, b):
        A[1] = b
        return A

    target = npr.randn(5)
    checker = check_grads(fun)
    with pytest.raises(TypeError):
        checker(target, 3.0)


# def test_nonscalar_output_1():
#     with pytest.raises(TypeError):
#         grad(lambda x: x * 2)(np.zeros(2))

# def test_nonscalar_output_2():
#     with pytest.raises(TypeError):
#         grad(lambda x: x * 2)(np.zeros(2))

# TODO:
# Diamond patterns
# Taking grad again after returning const
# Empty functions
# 2nd derivatives with fanout, thinking about the outgrad adder


================================================
FILE: tests/test_jacobian.py
================================================
import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad, jacobian
from autograd.test_util import check_grads

npr.seed(1)


def test_jacobian_against_grad():
    """For a single-element output, ``jacobian`` and ``grad`` should agree."""

    def fun(x):
        return np.sum(np.sin(x), axis=1, keepdims=True)

    sample = npr.randn(1, 3)
    via_grad = grad(fun)(sample)
    via_jacobian = jacobian(fun)(sample)
    assert np.allclose(via_grad, via_jacobian)


def test_jacobian_scalar_to_vector():
    """Jacobian of ``x -> [x, x**2, x**3]`` should be ``[1, 2x, 3x**2]``."""

    def powers(x):
        return np.array([x, x**2, x**3])

    val = npr.randn()
    expected = np.array([1.0, 2 * val, 3 * val**2])
    assert np.allclose(jacobian(powers)(val), expected)


def test_jacobian_against_stacked_grads():
    """Jacobian of a stacked vector function should equal the stacked per-row gradients."""

    def cube_sum(x):
        return np.sum(x**3)

    def doubled_sin_prod(x):
        return np.prod(np.sin(x) + np.sin(x))

    def nested_grad(x):
        return grad(lambda y: np.exp(y) * np.tanh(x[0]))(x[1])

    scalar_funs = [cube_sum, doubled_sin_prod, nested_grad]

    def vector_fun(x):
        return np.array([f(x) for f in scalar_funs])

    point = npr.randn(5)
    jac = jacobian(vector_fun)(point)
    rows = [grad(f)(point) for f in scalar_funs]

    assert np.allclose(jac, np.vstack(rows))


def test_jacobian_higher_order():
    """Shape and gradient checks for first- and second-order Jacobians."""

    def fun(x):
        return np.sin(np.outer(x, x)) + np.cos(np.dot(x, x))

    first = jacobian(fun)
    second = jacobian(jacobian(fun))

    assert first(npr.randn(2)).shape == (2, 2, 2)
    assert second(npr.randn(2)).shape == (2, 2, 2, 2)
    # assert jacobian(jacobian(jacobian(fun)))(npr.randn(2)).shape == (2,2,2,2,2)

    def first_scalar(x):
        return np.sum(np.sin(jacobian(fun)(x)))

    def second_scalar(x):
        return np.sum(np.sin(jacobian(jacobian(fun))(x)))

    check_grads(first_scalar)(npr.randn(2))
    check_grads(second_scalar)(npr.randn(2))


================================================
FILE: tests/test_linalg.py
================================================
from functools import partial

import numpy as onp
import pytest

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import tuple
from autograd.test_util import check_grads

# Seed the random generator used for the random test inputs in this module.
npr.seed(1)

# Fwd mode not yet implemented
# Rebind check_grads so calls default to modes=["rev"]; tests that pass
# modes=["fwd", "rev"] explicitly override this default.
check_grads = partial(check_grads, modes=["rev"])


def check_symmetric_matrix_grads(fun, **grad_check_kwargs):
    """Build a gradient checker that symmetrizes the first argument before calling ``fun``.

    The returned callable accepts the same positional arguments as ``fun``. Its
    first argument is replaced by the average of its lower triangle and that
    triangle's transpose (via ``T``); remaining arguments pass through unchanged.
    Extra keyword arguments are forwarded to ``check_grads``.
    """

    def _symmetrize(matrix):
        lower = np.tril(matrix)
        return (lower + T(lower)) / 2.0

    def _wrapped(*call_args):
        return fun(_symmetrize(call_args[0]), *call_args[1:])

    def check(*args):
        check_grads(_wrapped, **grad_check_kwargs)(*args)

    return check


# Swap the last two axes of A (a transpose over the trailing two dimensions).
T = lambda A: np.swapaxes(A, -1, -2)


def rand_psd(D):
    """Return ``M @ M.T`` for a random DxD matrix ``M`` drawn with ``npr.randn``."""
    factor = npr.randn(D, D)
    return np.dot(factor, factor.T)


def test_inv():
    """Gradient check for ``np.linalg.inv`` on an 8x8 matrix."""
    dim = 8
    base = npr.randn(dim, dim)
    sample = np.dot(base, base) + 1.0 * np.eye(dim)
    check_grads(lambda x: np.linalg.inv(x))(sample)


def test_pinv():
    """Gradient checks for ``np.linalg.pinv`` on non-square and low-rank inputs."""
    N = 5
    D = 2
    column_counts = range(N // 2, N + N // 2 + 1)

    def fun(x):
        return np.linalg.pinv(x)

    ## Square, low (fixed) rank matrices
    def fun_low_rank(x):
        return np.linalg.pinv(np.linalg._dot(np.linalg.T(x), x))

    def _check_plain_and_stacked(target, cols):
        # Single matrix first, then a stack of D matrices.
        check_grads(target)(npr.randn(N, cols))
        check_grads(target)(npr.randn(D, N, cols))

    ## Non-square matrices:
    for M in column_counts:
        _check_plain_and_stacked(fun, M)

    for M in column_counts:
        _check_plain_and_stacked(fun_low_rank, M)


def test_inv_3d():
    """Gradient check for ``np.linalg.inv`` on stacked (3-D and 4-D) inputs."""

    def fun(x):
        return np.linalg.inv(x)

    dim = 4
    stack_3d = npr.randn(dim, dim, dim) + 5 * np.eye(dim)
    check_grads(fun)(stack_3d)

    stack_4d = npr.randn(dim, dim, dim, dim) + 5 * np.eye(dim)
    check_grads(fun)(stack_4d)


def test_solve_arg1():
    """Gradient check for ``np.linalg.solve`` with respect to its first argument."""
    dim = 8
    lhs = npr.randn(dim, dim) + 10.0 * np.eye(dim)
    rhs = npr.randn(dim, dim - 1)

    check_grads(lambda a: np.linalg.solve(a, rhs))(lhs)


def test_solve_arg1_1d():
    """Gradient check for ``np.linalg.solve`` w.r.t. the matrix, with a 1-D right-hand side."""
    dim = 8
    lhs = npr.randn(dim, dim) + 10.0 * np.eye(dim)
    rhs = npr.randn(dim)

    check_grads(lambda a: np.linalg.solve(a, rhs))(lhs)


def test_solve_arg2():
    """Gradient check for ``np.linalg.solve`` with respect to its second argument."""
    dim = 6
    lhs = npr.randn(dim, dim) + 1.0 * np.eye(dim)
    rhs = npr.randn(dim, dim - 1)

    check_grads(lambda b: np.linalg.solve(lhs, b))(rhs)


def test_solve_arg1_3d():
    """Gradient check for batched ``np.linalg.solve`` with a 2-D right-hand side.

    For NumPy 2.0.0 and later the right-hand side gets an explicit trailing
    axis, which is removed again from the result.
    """
    D = 4
    A = npr.randn(D + 1, D, D) + 5 * np.eye(D)
    B = npr.randn(D + 1, D)
    legacy_numpy = onp.lib.NumpyVersion(onp.__version__) < "2.0.0"

    if legacy_numpy:

        def fun(A):
            return np.linalg.solve(A, B)

    else:

        def fun(A):
            return np.linalg.solve(A, B[..., None])[..., 0]

    check_grads(fun)(A)


def test_solve_arg1_3d_3d():
    """Gradient check for batched ``np.linalg.solve`` with a 3-D right-hand side."""
    dim = 4
    lhs = npr.randn(dim + 1, dim, dim) + 5 * np.eye(dim)
    rhs = npr.randn(dim + 1, dim, dim + 2)

    def fun(A):
        return np.linalg.solve(A, rhs)

    check_grads(fun)(lhs)


def test_det():
    """Gradient check for ``np.linalg.det`` on a random 6x6 matrix."""
    dim = 6
    sample = npr.randn(dim, dim)
    check_grads(lambda x: np.linalg.det(x))(sample)


def test_det_3d():
    """Gradient check for ``np.linalg.det`` on a random 3x3x3 stack."""

    def fun(x):
        return np.linalg.det(x)

    dim = 3
    sample = npr.randn(dim, dim, dim)
    check_grads(fun)(sample)


def test_slogdet():
    """Gradient check for the log-determinant part of ``np.linalg.slogdet``, for ``mat`` and ``-mat``."""

    def fun(x):
        _sign, logdet = np.linalg.slogdet(x)
        return logdet

    dim = 6
    sample = npr.randn(dim, dim)
    check_grads(fun)(sample)
    check_grads(fun)(-sample)


def test_slogdet_3d():
    """Gradient check for summed log-determinants over a stack of three 5x5 matrices."""

    def fun(x):
        return np.sum(np.linalg.slogdet(x)[1])

    layers = [(rand_psd(5) + 5 * np.eye(5))[None, ...] for _ in range(3)]
    stacked = np.concatenate(layers)
    check_grads(fun)(stacked)


def test_vector_2norm():
    """Forward- and reverse-mode gradient check for ``np.linalg.norm`` on a real vector."""
    length = 6
    sample = npr.randn(length)
    check_grads(lambda x: np.linalg.norm(x), modes=["fwd", "rev"])(sample)


def test_vector_2norm_complex():
    """Gradient check for ``np.linalg.norm`` on a complex vector."""
    length = 6
    sample = npr.randn(length) + 1j * npr.randn(length)
    check_grads(lambda x: np.linalg.norm(x))(sample)


def test_frobenius_norm():
    """Forward- and reverse-mode gradient check for ``np.linalg.norm`` on a 6x5 real matrix."""
    rows = 6
    sample = npr.randn(rows, rows - 1)
    check_grads(lambda x: np.linalg.norm(x), modes=["fwd", "rev"])(sample)


def test_frobenius_norm_complex():
    """Gradient check for ``np.linalg.norm`` on a 6x5 complex matrix."""
    rows = 6
    sample = npr.randn(rows, rows - 1) + 1j * npr.randn(rows, rows - 1)
    check_grads(lambda x: np.linalg.norm(x))(sample)


def test_frobenius_norm_axis():
    """Forward- and reverse-mode check for ``np.linalg.norm`` over ``axis=(0, 1)`` of a 3-D array."""
    base = 6
    sample = npr.randn(base, base - 1, base - 2)
    check_grads(lambda x: np.linalg.norm(x, axis=(0, 1)), modes=["fwd", "rev"])(sample)


def test_frobenius_norm_axis_complex():
    """Gradient check for ``np.linalg.norm`` over ``axis=(0, 1)`` of a complex 3-D array."""
    base = 6
    shape = (base, base - 1, base - 2)
    sample = npr.randn(*shape) + 1j * npr.randn(*shape)
    check_grads(lambda x: np.linalg.norm(x, axis=(0, 1)))(sample)


@pytest.mark.parametrize("ord", range(2, 5))
@pytest.mark.parametrize("size", [6])
def test_vector_norm_ord(size, ord):
    """Forward- and reverse-mode check for ``np.linalg.norm`` with the parametrized ``ord``."""
    sample = npr.randn(size)
    check_grads(lambda x: np.linalg.norm(x, ord=ord), modes=["fwd", "rev"])(sample)


@pytest.mark.parametrize("ord", range(2, 5))
@pytest.mark.parametrize("size", [6])
def test_vector_norm_ord_complex(size, ord):
    """Gradient check for ``np.linalg.norm`` with the parametrized ``ord`` on a complex vector."""
    sample = npr.randn(size) + 1j * npr.randn(size)
    check_grads(lambda x: np.linalg.norm(x, ord=ord))(sample)


@pytest.mark.parametrize("axis", range(3))
@pytest.mark.parametrize("shape", [(6, 5, 4)])
def test_norm_axis(shape, axis):
    """Forward- and reverse-mode check for ``np.linalg.norm`` along the parametrized axis."""
    sample = npr.randn(*shape)
    check_grads(lambda x: np.linalg.norm(x, axis=axis), modes=["fwd", "rev"])(sample)


@pytest.mark.parametrize("axis", range(3))
@pytest.mark.parametrize("shape", [(6, 5, 4)])
def test_norm_axis_complex(shape, axis):
    """Gradient check for ``np.linalg.norm`` along the parametrized axis of a complex array."""
    sample = npr.randn(*shape) + 1j * npr.randn(*shape)
    check_grads(lambda x: np.linalg.norm(x, axis=axis))(sample)


def test_norm_nuclear():
    """First-order forward/reverse gradient check for the nuclear norm of a 6x5 matrix."""
    rows = 6
    sample = npr.randn(rows, rows - 1)
    nuclear = lambda x: np.linalg.norm(x, ord="nuc")
    # Order 1 because the jvp of the svd is not implemented
    check_grads(nuclear, modes=["fwd", "rev"], order=1)(sample)


def test_norm_nuclear_complex():
    """Gradient check for the nuclear norm of a 6x5 complex matrix."""
    rows = 6
    sample = npr.randn(rows, rows - 1) + 1j * npr.randn(rows, rows - 1)
    check_grads(lambda x: np.linalg.norm(x, ord="nuc"))(sample)


def test_norm_nuclear_axis():
    """First-order forward/reverse check for the nuclear norm over ``axis=(0, 1)``."""
    base = 6
    sample = npr.randn(base, base - 1, base - 2)
    nuclear = lambda x: np.linalg.norm(x, ord="nuc", axis=(0, 1))
    # Order 1 because the jvp of the svd is not implemented
    check_grads(nuclear, modes=["fwd", "rev"], order=1)(sample)


def test_norm_nuclear_axis_complex():
    """Gradient check for the nuclear norm over ``axis=(0, 1)`` of a complex 3-D array."""
    base = 6
    shape = (base, base - 1, base - 2)
    sample = npr.randn(*shape) + 1j * npr.randn(*shape)
    check_grads(lambda x: np.linalg.norm(x, ord="nuc", axis=(0, 1)))(sample)


def test_eigvalh_lower():
    """Gradient check for both outputs of ``np.linalg.eigh`` called without a UPLO argument."""

    def fun(x):
        eigvals, eigvecs = np.linalg.eigh(x)
        return tuple((eigvals, eigvecs))

    dim = 6
    sample = npr.randn(dim, dim)
    check_grads(fun)(sample)


def test_eigvalh_upper():
    """Gradient check for both outputs of ``np.linalg.eigh`` called with ``"U"``."""

    def fun(x):
        eigvals, eigvecs = np.linalg.eigh(x, "U")
        return tuple((eigvals, eigvecs))

    dim = 6
    sample = npr.randn(dim, dim)
    check_grads(fun)(sample)


# einsum "...ij,...kj->...ik": contracts the last axis of both operands while
# the leading ("...") axes are carried through.
broadcast_dot_transpose = partial(np.einsum, "...ij,...kj->...ik")


def test_eigvalh_lower_broadcasting():
    """Gradient check for ``np.linalg.eigh`` (no UPLO argument) on a (2, 3, 6, 6) batch."""

    def fun(x):
        eigvals, eigvecs = np.linalg.eigh(x)
        return tuple((eigvals, eigvecs))

    dim = 6
    base = npr.randn(2, 3, dim, dim) + 10 * np.eye(dim)[None, None, ...]
    batch = broadcast_dot_transpose(base, base)
    check_grads(fun)(batch)


def test_eigvalh_upper_broadcasting():
    """Gradient check for ``np.linalg.eigh`` with ``"U"`` on a (2, 3, 6, 6) batch."""

    def fun(x):
        eigvals, eigvecs = np.linalg.eigh(x, "U")
        return tuple((eigvals, eigvecs))

    dim = 6
    base = npr.randn(2, 3, dim, dim) + 10 * np.eye(dim)[None, None, ...]
    batch = broadcast_dot_transpose(base, base)
    check_grads(fun)(batch)


# For complex-valued matrices, the eigenvectors could have arbitrary phases (gauge)
# which makes it impossible to compare to numerical derivatives. So we take the
# absolute value to get rid of that phase.
def test_eigvalh_lower_complex():
    """Gradient check for ``np.linalg.eigh`` (no UPLO argument) on a complex matrix.

    The eigenvectors are compared through their absolute values.
    """

    def fun(x):
        eigvals, eigvecs = np.linalg.eigh(x)
        return tuple((eigvals, np.abs(eigvecs)))

    dim = 6
    sample = npr.randn(dim, dim) + 1j * npr.randn(dim, dim)
    check_grads(fun)(sample)


def test_eigvalh_upper_complex():
    """Gradient check for ``np.linalg.eigh`` with ``"U"`` on a complex matrix.

    The eigenvectors are compared through their absolute values.
    """

    def fun(x):
        eigvals, eigvecs = np.linalg.eigh(x, "U")
        return tuple((eigvals, np.abs(eigvecs)))

    dim = 6
    sample = npr.randn(dim, dim) + 1j * npr.randn(dim, dim)
    check_grads(fun)(sample)


# Note eigenvalues and eigenvectors for real matrix can still be complex
def test_eig_real():
    """Gradient check for ``np.linalg.eig`` on a real 8x8 matrix, via absolute values of both outputs."""

    def fun(x):
        eigvals, eigvecs = np.linalg.eig(x)
        return tuple((np.abs(eigvals), np.abs(eigvecs)))

    dim = 8
    sample = npr.randn(dim, dim)
    check_grads(fun)(sample)


def test_eig_complex():
    """Gradient check for ``np.linalg.eig`` on a complex 8x8 matrix (eigenvectors via ``np.abs``)."""

    def fun(x):
        eigvals, eigvecs = np.linalg.eig(x)
        return tuple((eigvals, np.abs(eigvecs)))

    dim = 8
    sample = npr.randn(dim, dim) + 1.0j * npr.randn(dim, dim)
    check_grads(fun)(sample)


def test_eig_batched():
    """Gradient check for ``np.linalg.eig`` on a batch of five complex 8x8 matrices."""

    def fun(x):
        eigvals, eigvecs = np.linalg.eig(x)
        return tuple((eigvals, np.abs(eigvecs)))

    dim = 8
    batch = 5
    sample = npr.randn(batch, dim, dim) + 1.0j * npr.randn(batch, dim, dim)
    check_grads(fun)(sample)


def test_cholesky():
    """Symmetrized gradient check for ``np.linalg.cholesky`` on a 6x6 ``rand_psd`` matrix."""

    def fun(A):
        return np.linalg.cholesky(A)

    sample = rand_psd(6)
    check_symmetric_matrix_grads(fun)(sample)


def test_cholesky_broadcast():
    """Symmetrized gradient check for ``np.linalg.cholesky`` on a stack of three matrices."""

    def fun(A):
        return np.linalg.cholesky(A)

    layers = [rand_psd(6)[None, :, :] for _ in range(3)]
    stacked = np.concatenate(layers, axis=0)
    check_symmetric_matrix_grads(fun)(stacked)


def test_cholesky_reparameterization_trick():
    """Gradient check through a Cholesky factor applied to fixed-seed random noise."""

    def fun(A):
        # A fresh RandomState(0) per call keeps the noise identical across evaluations.
        noise = np.random.RandomState(0).randn(A.shape[0])
        sample = np.dot(np.linalg.cholesky(A), noise)
        return np.linalg.norm(sample)

    check_symmetric_matrix_grads(fun)(rand_psd(6))


def test_svd_wide_2d():
    """Gradient check for the reduced SVD of a real 3x5 (wide) matrix."""

    def fun(x):
        left, singular, right = np.linalg.svd(x, full_matrices=False)
        return tuple((left, singular, right))

    rows, cols = 3, 5
    sample = npr.randn(rows, cols)
    check_grads(fun)(sample)


def test_svd_wide_2d_complex():
    """Gradient check for the reduced SVD of a complex 3x5 matrix (factors via ``np.abs``)."""

    def fun(x):
        left, singular, right = np.linalg.svd(x, full_matrices=False)
        return tuple((np.abs(left), singular, np.abs(right)))

    rows, cols = 3, 5
    sample = npr.randn(rows, cols) + 1j * npr.randn(rows, cols)
    check_grads(fun)(sample)


def test_svd_wide_3d():
    """Gradient check for the reduced SVD of a real (4, 3, 5) stack of wide matrices."""

    def fun(x):
        left, singular, right = np.linalg.svd(x, full_matrices=False)
        return tuple((left, singular, right))

    batch, rows, cols = 4, 3, 5
    sample = npr.randn(batch, rows, cols)
    check_grads(fun)(sample)


def test_svd_wide_3d_complex():
    """Gradient check for the reduced SVD of a complex (4, 3, 5) stack (factors via ``np.abs``)."""

    def fun(x):
        left, singular, right = np.linalg.svd(x, full_matrices=False)
        return tuple((np.abs(left), singular, np.abs(right)))

    batch, rows, cols = 4, 3, 5
    sample = npr.randn(batch, rows, cols) + 1j * npr.randn(batch, rows, cols)
    check_grads(fun)(sample)


def test_svd_square_2d():
    """Gradient check for the reduced SVD of a real 4x4 matrix."""

    def fun(x):
        left, singular, right = np.linalg.svd(x, full_matrices=False)
        return tuple((left, singular, right))

    rows, cols = 4, 4
    sample = npr.randn(rows, cols)
    check_grads(fun)(sample)


def test_svd_square_2d_complex():
    """Gradient check for the reduced SVD of a complex 4x4 matrix (factors via ``np.abs``)."""

    def fun(x):
        left, singular, right = np.linalg.svd(x, full_matrices=False)
        return tuple((np.abs(left), singular, np.abs(right)))

    rows, cols = 4, 4
    sample = npr.randn(rows, cols) + 1j * npr.randn(rows, cols)
    check_grads(fun)(sample)


def test_svd_square_3d():
    """Gradient check for the reduced SVD of a real (3, 4, 4) stack of square matrices."""

    def fun(x):
        left, singular, right = np.linalg.svd(x, full_matrices=False)
        return tuple((left, singular, right))

    batch, rows, cols = 3, 4, 4
    sample = npr.randn(batch, rows, cols)
    check_grads(fun)(sample)


def test_svd_square_3d_complex():
    """Gradient check for the reduced SVD of a complex (3, 4, 4) stack (factors via ``np.abs``)."""

    def fun(x):
        left, singular, right = np.linalg.svd(x, full_matrices=False)
        return tuple((np.abs(left), singular, np.abs(right)))

    batch, rows, cols = 3, 4, 4
    sample = npr.randn(batch, rows, cols) + 1j * npr.randn(batch, rows, cols)
    check_grads(fun)(sample)


def test_svd_tall_2d():
    """Gradient check for the reduced SVD of a real 5x3 (tall) matrix."""

    def fun(x):
        left, singular, right = np.linalg.svd(x, full_matrices=False)
        return tuple((left, singular, right))

    rows, cols = 5, 3
    sample = npr.randn(rows, cols)
    check_grads(fun)(sample)


def test_svd_tall_2d_complex():
    """Gradient check for the reduced SVD of a complex 5x3 matrix (factors via ``np.abs``)."""

    def fun(x):
        left, singular, right = np.linalg.svd(x, full_matrices=False)
        return tuple((np.abs(left), singular, np.abs(right)))

    rows, cols = 5, 3
    sample = npr.randn(rows, cols) + 1j * npr.randn(rows, cols)
    check_grads(fun)(sample)


def test_svd_tall_3d():
    """Gradient check for the reduced SVD of a real (4, 5, 3) stack of tall matrices."""

    def fun(x):
        left, singular, right = np.linalg.svd(x, full_matrices=False)
        return tuple((left, singular, right))

    batch, rows, cols = 4, 5, 3
    sample = npr.randn(batch, rows, cols)
    check_grads(fun)(sample)


def test_svd_tall_3d_complex():
    """Gradient check for the reduced SVD of a complex (4, 5, 3) stack (factors via ``np.abs``)."""

    def fun(x):
        left, singular, right = np.linalg.svd(x, full_matrices=False)
        return tuple((np.abs(left), singular, np.abs(right)))

    batch, rows, cols = 4, 5, 3
    sample = npr.randn(batch, rows, cols) + 1j * npr.randn(batch, rows, cols)
    check_grads(fun)(sample)


def test_svd_only_s_2d():
    """Gradient check for ``np.linalg.svd`` with ``compute_uv=False`` on a real 5x3 matrix."""
    rows, cols = 5, 3
    sample = npr.randn(rows, cols)
    singular_values = lambda x: np.linalg.svd(x, full_matrices=False, compute_uv=False)
    check_grads(singular_values)(sample)


def test_svd_only_s_2d_complex():
    """Gradient check for ``np.linalg.svd`` with ``compute_uv=False`` on a complex 5x3 matrix."""
    rows, cols = 5, 3
    sample = npr.randn(rows, cols) + 1j * npr.randn(rows, cols)
    singular_values = lambda x: np.linalg.svd(x, full_matrices=False, compute_uv=False)
    check_grads(singular_values)(sample)


def test_svd_only_s_3d():
    """Gradient check for ``np.linalg.svd`` with ``compute_uv=False`` on a real (4, 5, 3) stack."""
    batch, rows, cols = 4, 5, 3
    sample = npr.randn(batch, rows, cols)
    singular_values = lambda x: np.linalg.svd(x, full_matrices=False, compute_uv=False)
    check_grads(singular_values)(sample)


def test_svd_only_s_3d_complex():
    """Gradient check for ``np.linalg.svd`` with ``compute_uv=False`` on a complex (4, 5, 3) stack."""
    batch, rows, cols = 4, 5, 3
    sample = npr.randn(batch, rows, cols) + 1j * npr.randn(batch, rows, cols)
    singular_values = lambda x: np.linalg.svd(x, full_matrices=False, compute_uv=False)
    check_grads(singular_values)(sample)


================================================
FILE: tests/test_list.py
================================================
import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad
from autograd import isinstance as ag_isinstance
from autograd import list as ag_list
from autograd.test_util import check_grads

npr.seed(1)


def test_getter():
    """Gradient through list indexing: element 0 used once, element 1 twice, element 2 never."""

    def fun(input_list):
        first = np.sum(input_list[0])
        second = np.sum(input_list[1])
        second_again = np.sum(input_list[1])
        return first + second + second_again

    input_list = [npr.randn(5, 6), npr.randn(4, 3), npr.randn(2, 4)]
    grads = grad(fun)(input_list)

    expectations = (np.ones((5, 6)), 2 * np.ones((4, 3)), np.zeros((2, 4)))
    for position, expected in enumerate(expectations):
        assert np.allclose(grads[position], expected)


def test_grads():
    """Gradient-check a list-consuming function and a function of its gradient."""

    def fun(input_list):
        sin_part = np.sum(np.sin(input_list[0]))
        cos_part = np.sum(np.cos(input_list[1]))
        return sin_part + cos_part

    def d_fun(input_list):
        grads = grad(fun)(input_list)
        plain = np.sum(grads[0])
        sin_first = np.sum(np.sin(grads[0]))
        sin_second = np.sum(np.sin(grads[1]))
        return plain + sin_first + sin_second

    input_list = [npr.randn(5, 6), npr.randn(4, 3), npr.randn(2, 4)]

    for candidate in (fun, d_fun):
        check_grads(candidate)(input_list)


def test_slices():
    """``grad`` must run through list slicing, via a ``slice`` object and via ``1:3`` syntax."""

    def via_slice_object(x):
        selector = slice(None, -1, None)
        return x[selector][0]

    def via_slice_syntax(x):
        return x[1:3][0]

    grad(via_slice_object)([1.0, 2.0, 3.0])
    grad(via_slice_syntax)([1.0, 2.0, 3.0])


def test_nested_list():
    """Gradient-check slicing then indexing on a list that contains a nested list."""
    nested = [[1.0], 2.0, 1.5]

    check_grads(lambda x: x[1:][0])(nested)


def test_make_list():
    """Gradient-check constructing an ``ag_list`` from a tuple of traced values."""
    duplicate = lambda x: ag_list((x, x))

    check_grads(duplicate)(1.0)


def test_isinstance():
    """``ag_isinstance`` must recognise a list argument both outside and inside ``grad``."""

    def fun(x):
        for expected_type in (list, ag_list):
            assert ag_isinstance(x, expected_type)
        return x[0]

    fun([1.0, 2.0, 3.0])
    grad(fun)([1.0, 2.0, 3.0])


================================================
FILE: tests/test_logic.py
================================================
import warnings
from contextlib import contextmanager

import pytest

import autograd.numpy as np
from autograd import deriv, grad
from autograd.core import primitive_vjps
from autograd.extend import primitive
from autograd.test_util import check_grads


def test_assert():
    """An ``assert`` on a traced value inside the function must not break gradient checks."""

    # from https://github.com/HIPS/autograd/issues/43
    def fun(x):
        round_trip = (x * 3.0) / 3.0
        assert np.allclose(x, round_trip)
        return np.sum(x)

    sample = np.array([1.0, 2.0, 3.0])
    check_grads(fun)(sample)


def test_nograd():
    """``grad`` of a function returning ``np.allclose(...)`` is expected to raise TypeError."""

    # we want this to raise non-differentiability error
    def fun(x):
        return np.allclose(x, (x * 3.0) / 3.0)

    with pytest.raises(TypeError), warnings.catch_warnings(record=True):
        grad(fun)(np.array([1.0, 2.0, 3.0]))


def test_no_vjp_def():
    """``grad`` through a primitive with no VJP defined should raise NotImplementedError."""

    def double(x):
        return 2.0 * x

    bare_primitive = primitive(double)
    with pytest.raises(NotImplementedError):
        grad(bare_primitive)(1.0)


def test_no_jvp_def():
    """``deriv`` through a primitive with no JVP defined should raise NotImplementedError."""

    def double(x):
        return 2.0 * x

    bare_primitive = primitive(double)
    with pytest.raises(NotImplementedError):
        deriv(bare_primitive)(1.0)


def test_falseyness():
    """Branching on ``np.iscomplex`` of a traced value, checked for a real and a complex input."""

    def fun(x):
        chosen = x**2 if np.iscomplex(x) else np.sum(x)
        return np.real(chosen)

    for point in (2.0, 2.0 + 1j):
        check_grads(fun)(point)


def test_unimplemented_falseyness():
    """Branching on ``np.iscomplex`` must still work when its VJP entry is removed.

    The entry for ``np.iscomplex`` is popped from ``primitive_vjps`` for the
    duration of the checks and put back afterwards.
    """

    @contextmanager
    def remove_grad_definitions(fun):
        """Temporarily remove ``fun``'s entry from ``primitive_vjps``."""
        vjpmaker = primitive_vjps.pop(fun, None)
        try:
            yield
        finally:
            # Restore even when the body raises, so a failing check does not
            # leave the global registry modified for later tests.
            if vjpmaker is not None:
                primitive_vjps[fun] = vjpmaker

    with remove_grad_definitions(np.iscomplex):
        fun = lambda x: np.real(x**2 if np.iscomplex(x) else np.sum(x))
        check_grads(fun)(5.0)
        check_grads(fun)(2.0 + 1j)


================================================
FILE: tests/test_misc.py
================================================
import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad, make_vjp
from autograd.misc import const_graph, flatten
from autograd.test_util import scalar_close
from autograd.tracer import primitive


def test_const_graph():
    """``const_graph`` should reproduce ``foo``'s values while calling ``foo`` itself only once.

    ``calls`` grows by one per direct ``foo`` call; the wrapped version adds to
    it only on the first invocation.
    """
    calls = []

    def foo(x, y):
        calls.append(None)
        inner = lambda x: np.sin(x) + x * 2
        return grad(inner)(x * y)

    foo_wrapped = const_graph(foo)

    assert len(calls) == 0
    # (arguments, expected number of recorded calls after the comparison)
    for args, expected_calls in (((0.0, 0.0), 2), ((1.0, 0.5), 3), ((1.0, 0.5), 4)):
        assert scalar_close(foo(*args), foo_wrapped(*args))
        assert len(calls) == expected_calls


def test_const_graph_args():
    """``const_graph`` with bound arguments: check values and which ``process`` calls are replayed."""
    seen = []

    @primitive
    def process(var, varname):
        seen.append(varname)
        return var

    def foo(x, y, z):
        # Tuple elements are evaluated left to right, so the recording order is x, y, z.
        x, y, z = process(x, "x"), process(y, "y"), process(z, "z")
        return x + 2 * y + 3 * z

    foo_wrapped = const_graph(foo, 1.0, z=3.0)

    assert seen == []
    assert scalar_close(foo(1.0, 2.0, 3.0), foo_wrapped(2.0))
    assert seen == ["x", "y", "z", "x", "y", "z"]

    # The next two rounds are expected to record only "y" for the wrapped call.
    for _ in range(2):
        seen = []
        assert scalar_close(foo(1.0, 2.0, 3.0), foo_wrapped(2.0))
        assert seen == ["x", "y", "z", "y"]


def test_flatten():
    """``flatten`` must produce a 1-D vector and an inverse that round-trips nested containers."""
    randn = np.random.randn
    nested = (1.0, randn(2, 3), [randn(1, 4), {"x": 2.0, "y": randn(4, 2)}])
    nested_flat, restore = flatten(nested)
    assert nested_flat.shape == (20,)
    assert nested_flat[0] == 1.0
    reflattened = flatten(restore(nested_flat))[0]
    assert np.all(nested_flat == reflattened)

    # A container of plain scalars must come back equal to the original.
    scalars = (1.0, 2.0, [3.0, {"x": 2.0, "y": 4.0}])
    scalars_flat, restore = flatten(scalars)
    assert scalars_flat.shape == (5,)
    assert scalars == restore(scalars_flat)


def test_flatten_empty():
    """Flatten/unflatten round trip for a structure that contains an empty tuple."""
    original = (npr.randn(4), [npr.randn(3, 4), 2.5], (), (2.0, [1.0, npr.randn(2)]))
    flat, restore = flatten(original)
    flat_again = flatten(restore(flat))[0]
    assert np.all(flat == flat_again)


def test_flatten_dict():
    """Flatten/unflatten round trip for a dict mixing arrays, a scalar and a list."""
    original = {
        "k": npr.random((4, 4)),
        "k2": npr.random((3, 3)),
        "k3": 3.0,
        "k4": [1.0, 4.0, 7.0, 9.0],
    }

    flat, restore = flatten(original)
    flat_again = flatten(restore(flat))[0]
    assert np.all(flat == flat_again)


def unflatten_tracing():
    """Check that the VJP of ``unflatten`` applied to ``val`` reproduces the flat vector.

    NOTE(review): unlike every other test in this file the name has no
    ``test_`` prefix, so default pytest discovery presumably never runs this
    function -- confirm whether that is intentional.
    """
    val = [npr.randn(4), [npr.randn(3, 4), 2.5], (), (2.0, [1.0, npr.randn(2)])]
    vect, unflatten = flatten(val)

    def f(vect):
        return unflatten(vect)

    # First element of the make_vjp result is the VJP function; the second is discarded.
    flatten2, _ = make_vjp(f)(vect)
    assert np.all(vect == flatten2(val))


def test_flatten_nodes_in_containers():
    """``flatten`` must accept a list holding traced values (see issue #232)."""

    def summed_flat(x, y):
        flat = flatten([x, y])[0]
        return np.sum(flat)

    # Only needs to run without raising; the gradient value is not inspected.
    grad(summed_flat)(1.0, 2.0)


def test_flatten_complex():
    """A complex scalar must survive a flatten/unflatten round trip."""
    original = 1 + 1j
    flat, restore = flatten(original)
    recovered = restore(flat)
    assert np.all(original == recovered)


================================================
FILE: tests/test_numpy.py
================================================
import warnings

from numpy_utils import combo_check

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad
from autograd.test_util import check_grads

# Seed the random generator once at module level so the random inputs drawn below are repeatable.
npr.seed(1)


def test_numpy_version():
    """The ``np`` module used in these tests must report the same version as plain ``numpy``."""
    import numpy as plain_numpy

    wrapped_version = np.__version__
    assert wrapped_version == plain_numpy.__version__


def test_dot():
    """Gradient checks for ``np.dot`` over matrix/vector operand combinations."""
    # Operands are drawn in a fixed order so the random stream is consumed identically.
    mat_a = npr.randn(10, 11)
    mat_b = npr.randn(10, 11)
    vec10 = npr.randn(10)
    vec11_a = npr.randn(11)
    vec11_b = npr.randn(11)

    dot = lambda x, y: np.dot(x, y)
    operand_pairs = (
        (mat_a, vec11_a),
        (mat_a, mat_b.T),
        (vec10, mat_a),
        (vec11_a, vec11_b),
    )
    for left, right in operand_pairs:
        check_grads(dot)(left, right)


def test_dot_with_floats():
    """Gradient checks for ``np.dot`` when one operand is a Python float."""
    matrix = npr.randn(10, 11)
    vector = npr.randn(10)
    scalar = npr.randn()

    dot = lambda x, y: np.dot(x, y)
    for left, right in ((matrix, scalar), (scalar, matrix), (vector, scalar), (scalar, vector)):
        check_grads(dot)(left, right)


# No longer supporting this
# def test_dot_method():
#     def fun(x, y): return x.dot(y)

#     mat1 = npr.randn(10, 11)
#     mat2 = npr.randn(10, 11)
#     vect1 = npr.randn(10)
#     vect2 = npr.randn(11)
#     vect3 = npr.randn(11)

#     check_grads(fun)(mat1, vect2)
#     check_grads(fun)(mat1, mat2.T)
#     check_grads(fun)(vect1, mat1)
#     check_grads(fun)(vect2, vect3)


def test_outer():
    """Gradient checks for ``np.outer`` on plain and ``.T``-ed vectors."""
    left = npr.randn(11)
    right = npr.randn(11)

    outer = lambda x, y: np.outer(x, y)
    check_grads(outer)(left, right)
    check_grads(outer)(left.T, right)
    check_grads(outer)(left.T, right.T)


def test_max():
    """Gradient check for ``np.max`` over a whole matrix."""
    sample = npr.randn(10, 11)
    check_grads(lambda x: np.max(x))(sample)


def test_max_axis():
    """Gradient check for ``np.max`` along axis 1 of a 3-D array."""
    sample = npr.randn(3, 4, 5)
    check_grads(lambda x: np.max(x, axis=1))(sample)


def test_max_axis_keepdims():
    """Gradient check for ``np.max`` along axis 1 with ``keepdims=True``."""
    sample = npr.randn(3, 4, 5)
    check_grads(lambda x: np.max(x, axis=1, keepdims=True))(sample)


def test_min():
    """Gradient check for ``np.min`` over a whole matrix."""
    sample = npr.randn(10, 11)
    check_grads(lambda x: np.min(x))(sample)


def test_min_axis():
    """Gradient check for ``np.min`` along axis 1 of a 3-D array."""
    sample = npr.randn(3, 4, 5)
    check_grads(lambda x: np.min(x, axis=1))(sample)


def test_min_axis_keepdims():
    """Gradient check for ``np.min`` along axis 1 with ``keepdims=True``."""
    sample = npr.randn(3, 4, 5)
    check_grads(lambda x: np.min(x, axis=1, keepdims=True))(sample)


def test_sum_1():
    """Gradient check for ``np.sum`` over a whole matrix."""
    sample = npr.randn(10, 11)
    check_grads(lambda x: np.sum(x))(sample)


def test_sum_2():
    """Gradient check for ``np.sum`` along axis 0."""
    sample = npr.randn(10, 11)
    check_grads(lambda x: np.sum(x, axis=0))(sample)


def test_sum_3():
    """Gradient check for ``np.sum`` along axis 0 with ``keepdims=True``."""
    sample = npr.randn(10, 11)
    check_grads(lambda x: np.sum(x, axis=0, keepdims=True))(sample)


def test_sum_with_axis_tuple():
    """Gradient check for ``np.sum`` with a tuple of axes."""
    sample = npr.randn(10, 11, 7)
    check_grads(lambda x: np.sum(x, axis=(1, 2)))(sample)


def test_flipud():
    """Gradient check for ``np.flipud``."""
    sample = npr.randn(10, 11)
    check_grads(lambda x: np.flipud(x))(sample)


def test_fliplr():
    """Gradient check for ``np.fliplr``."""
    sample = npr.randn(10, 11)
    check_grads(lambda x: np.fliplr(x))(sample)


def test_rot90():
    """Gradient check for ``np.rot90``."""
    sample = npr.randn(10, 11)
    check_grads(lambda x: np.rot90(x))(sample)


def test_cumsum_axis0():
    """Gradient check for ``np.cumsum`` along axis 0."""
    sample = npr.randn(10, 11)
    check_grads(lambda x: np.cumsum(x, axis=0))(sample)


def test_cumsum_axis1():
    """Gradient check for ``np.cumsum`` along axis 1."""
    sample = npr.randn(10, 11)
    check_grads(lambda x: np.cumsum(x, axis=1))(sample)


def test_cumsum_1d():
    """Gradient check for ``np.cumsum`` on a 1-D array."""
    sample = npr.randn(10)
    check_grads(lambda x: np.cumsum(x))(sample)


def test_cumsum_no_axis():
    """Gradient check for ``np.cumsum`` on a 2-D array without an ``axis`` argument."""
    sample = npr.randn(10, 11)
    check_grads(lambda x: np.cumsum(x))(sample)


def test_non_numpy_sum():
    """Gradient check for the builtin ``sum`` applied to a list of two arrays."""
    first = npr.randn(10, 11)
    second = npr.randn(10, 11)
    check_grads(lambda x, y: sum([x, y]))(first, second)


def test_mean_1():
    """Gradient check for ``np.mean`` over a whole matrix."""
    sample = npr.randn(10, 11)
    check_grads(lambda x: np.mean(x))(sample)


def test_mean_2():
    """Gradient check for ``np.mean`` along axis 0."""
    sample = npr.randn(10, 11)
    check_grads(lambda x: np.mean(x, axis=0))(sample)


def test_mean_3():
    """Gradient check for ``np.mean`` along axis 0 with ``keepdims=True``."""
    sample = npr.randn(10, 11)
    check_grads(lambda x: np.mean(x, axis=0, keepdims=True))(sample)


def test_index_ints():
    """Gradient check for indexing a 3-D array with three integers."""
    sample = npr.randn(5, 6, 4)
    check_grads(lambda x: x[3, 0, 1])(sample)


def test_index_slice():
    """Gradient check for slicing (reversed, ranged and full slices together)."""
    sample = npr.randn(5, 6, 4)
    check_grads(lambda x: x[::-1, 2:4, :])(sample)


def test_index_lists():
    """Gradient check for indexing the first axis with a list of integers."""
    sample = npr.randn(5, 6, 4)
    check_grads(lambda x: x[[0, 1, 2], :, :])(sample)


def test_index_mixed():
    """Gradient check for an index mixing an integer, a slice and a list."""
    sample = npr.randn(5, 6, 4)
    check_grads(lambda x: x[3, 2:, [1, 3]])(sample)


def test_vector_slice():
    """Gradient check for slicing a 1-D array."""
    sample = npr.randn(5)
    check_grads(lambda x: x[2:4])(sample)


def test_index_slice_fanout():
    """Gradient check when two overlapping slices of the same input are added."""
    sample = npr.randn(5, 6, 4)

    def fun(x):
        return x[::-1, 2:4, :] + x[::-1, 3:5, :]

    check_grads(fun)(sample)


def test_index_multiple_slices():
    """Gradient check for a slice taken from another slice."""
    sample = npr.randn(7)
    check_grads(lambda x: x[2:6][1:3])(sample)


def test_reshape_method():
    """Gradient check for the ``reshape`` method called with a shape tuple."""
    sample = npr.randn(5, 6, 4)
    check_grads(lambda x: x.reshape((5 * 4, 6)))(sample)


def test_reshape_call():
    """Gradient check for the ``np.reshape`` function."""
    sample = npr.randn(5, 6, 4)
    check_grads(lambda x: np.reshape(x, (5 * 4, 6)))(sample)


def test_reshape_method_nolist():
    """Gradient check for ``reshape`` called with separate integer arguments.

    ``A.reshape((5, 4))`` and ``A.reshape(5, 4)`` are both valid spellings;
    this test covers the second one.
    """
    sample = npr.randn(5, 6, 4)
    check_grads(lambda x: x.reshape(5 * 4, 6))(sample)


def test_ravel_method():
    """Gradient check for the ``ravel`` method."""
    sample = npr.randn(5, 6, 4)
    check_grads(lambda x: x.ravel())(sample)


def test_ravel_call():
    """Gradient check for the ``np.ravel`` function."""
    sample = npr.randn(5, 6, 4)
    check_grads(lambda x: np.ravel(x))(sample)


def test_flatten_method():
    """Gradient check for the ``flatten`` method."""
    sample = npr.randn(5, 6, 4)
    check_grads(lambda x: x.flatten())(sample)


def test_simple_append_list():
    """Gradient check for ``np.append`` of a scalar to a list, w.r.t. both arguments."""
    base, extra = [1.0, 2.0, 3.0], 4.0
    check_grads(np.append, argnum=(0, 1))(base, extra)


def test_simple_append_arr():
    """Gradient check for ``np.append`` of a scalar to an array, w.r.t. both arguments."""
    base, extra = np.array([1.0, 2.0, 3.0]), 4.0
    check_grads(np.append, argnum=(0, 1))(base, extra)


def test_simple_append_list_2D():
    """Gradient check for ``np.append`` of nested lists along axis 0, w.r.t. both arguments."""
    rows = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
    extra_row = [[7.0, 8.0, 9.0]]
    check_grads(np.append, argnum=(0, 1))(rows, extra_row, axis=0)


def test_simple_concatenate():
    """Gradient check for ``np.concatenate`` of a constant array with the traced input."""
    constant = npr.randn(5, 6, 4)
    sample = npr.randn(4, 6, 4)
    check_grads(lambda x: np.concatenate((constant, x)))(sample)


def test_concatenate_axis_0():
    """Gradient check for ``np.concatenate`` with the input between two constants (default axis)."""
    sample = npr.randn(5, 6, 4)
    constant = npr.randn(5, 6, 4)
    check_grads(lambda x: np.concatenate((constant, x, constant)))(sample)


def test_concatenate_axis_1():
    """Gradient check for ``np.concatenate`` with ``axis=1`` given by keyword."""
    sample = npr.randn(5, 6, 4)
    constant = npr.randn(5, 6, 4)
    check_grads(lambda x: np.concatenate((constant, x, constant), axis=1))(sample)


def test_concatenate_axis_1_unnamed():
    """Gradient check for ``np.concatenate`` with the axis passed positionally instead of as ``axis=1``."""
    sample = npr.randn(5, 6, 4)
    constant = npr.randn(5, 6, 4)
    check_grads(lambda x: np.concatenate((constant, x, constant), 1))(sample)


def test_trace():
    """Gradient check for ``np.trace`` with a random offset on a 10x11 matrix."""
    sample = npr.randn(10, 11)
    offset = npr.randint(-9, 11)
    check_grads(lambda x: np.trace(x, offset=offset))(sample)


def test_trace2():
    """Gradient check for ``np.trace`` with a random offset on an 11x10 matrix."""
    sample = npr.randn(11, 10)
    offset = npr.randint(-9, 11)
    check_grads(lambda x: np.trace(x, offset=offset))(sample)


def test_trace_extradims():
    """Gradient check for ``np.trace`` with a random offset on a 4-D array."""
    sample = npr.randn(5, 6, 4, 3)
    offset = npr.randint(-5, 6)
    check_grads(lambda x: np.trace(x, offset=offset))(sample)


# TODO: Allow axis1, axis2 args.
# def test_trace_extradims2():
#     def fun(x): return np.trace(x, offset=offset, axis1=3,axis2=2)
#     mat = npr.randn(5,6,4,3)
#     offset = npr.randint(-5,6)
#     check_grads(fun)(mat)


def test_diag():
    """Gradient check for ``np.diag`` on a square matrix."""
    sample = npr.randn(10, 10)
    check_grads(lambda x: np.diag(x))(sample)


def test_transpose():
    """Gradient check for the ``.T`` attribute."""
    sample = npr.randn(8, 8)
    check_grads(lambda x: x.T)(sample)


def test_roll():
    """Gradient check for ``np.roll`` by two positions along axis 1."""
    sample = npr.randn(4, 5)
    check_grads(lambda x: np.roll(x, 2, axis=1))(sample)


def test_roll_no_axis():
    """Gradient check for ``np.roll`` called without an ``axis`` argument.

    The call deliberately omits ``axis`` so that this test differs from
    ``test_roll`` and covers the default-axis code path.
    """

    def fun(x):
        return np.roll(x, 2)

    mat = npr.randn(4, 5)
    check_grads(fun)(mat)


def test_triu():
    """Gradient check for ``np.triu`` with ``k=2``."""
    sample = npr.randn(5, 5)
    check_grads(lambda x: np.triu(x, k=2))(sample)


def test_tril():
    """Gradient check for ``np.tril`` with ``k=2``."""
    sample = npr.randn(5, 5)
    check_grads(lambda x: np.tril(x, k=2))(sample)


def test_clip():
    """Gradient check for ``np.clip`` with bounds 0.1 and 1.1."""
    sample = npr.randn(5, 5)
    check_grads(lambda x: np.clip(x, a_min=0.1, a_max=1.1))(sample)


def test_prod_1():
    """Gradient check for ``np.prod`` over a whole matrix of strictly positive entries."""
    # Gradient unstable when zeros are present, hence the squared-plus-offset input.
    sample = npr.randn(2, 3) ** 2 / 10.0 + 0.1
    check_grads(lambda x: np.prod(x))(sample)


def test_prod_2():
    """Gradient check for ``np.prod`` along axis 0."""
    sample = npr.randn(2, 3) ** 2 + 0.1
    check_grads(lambda x: np.prod(x, axis=0))(sample)


def test_prod_3():
    """Gradient check for ``np.prod`` along axis 0 with ``keepdims=True``."""
    sample = npr.randn(2, 3) ** 2 + 0.1
    check_grads(lambda x: np.prod(x, axis=0, keepdims=True))(sample)


def test_prod_4():
    """Gradient check for ``np.prod`` on a 1-D array."""
    sample = npr.randn(7) ** 2 + 0.1
    check_grads(lambda x: np.prod(x))(sample)


def test_1d_array():
    """Gradient check for ``np.array`` built from a list of expressions in a scalar."""
    build = lambda x: np.array([x, x * 1.0, x + 2.5])
    check_grads(build)(3.0)


def test_2d_array():
    """Gradient check for ``np.array`` built from nested lists of expressions in a scalar."""
    build = lambda x: np.array([[x, x * 1.0, x + 2.5], [x**2, x, x / 2.0]])
    check_grads(build)(3.0)


def test_1d_array_fanout():
    """Gradient check when a 1-D array built from a scalar is used twice."""

    def fun(x):
        built = np.array([x, x * 1.0, x + 2.5])
        return built + built

    check_grads(fun)(3.0)


def test_2d_array_fanout():
    """Gradient check when a 2-D array built from a scalar is used twice."""

    def fun(x):
        built = np.array([[x, x * 1.0, x + 2.5], [x**2, x, x / 2.0]])
        return built + built

    check_grads(fun)(3.0)


def test_array_from_scalar():
    """Gradient check for ``np.array`` applied directly to a scalar."""
    check_grads(lambda x: np.array(x))(3.0)


def test_array_from_arrays():
    """Gradient check for ``np.array`` stacking the same array twice."""
    sample = npr.randn(3, 2)
    check_grads(lambda x: np.array([x, x]))(sample)


def test_array_from_arrays_2():
    """Gradient check for ``np.array`` built from a nested list of array expressions."""
    sample = npr.randn(3, 2)
    check_grads(lambda x: np.array([[2 * x, x + 1], [x, x]]))(sample)


def test_len():
    """``len`` of the traced (3, 2) input must be 3 inside the differentiated function."""

    def identity_checking_len(x):
        n_rows = len(x)
        assert n_rows == 3
        return x

    sample = npr.randn(3, 2)
    check_grads(identity_checking_len)(sample)


def test_r_basic():
    """Gradient check for ``np.r_`` applied to the traced input alone."""

    def fun(x):
        # The drawn values are unused; the call stays so the random stream is consumed as before.
        npr.randn(3, 2)
        return np.r_[x]

    with warnings.catch_warnings(record=True):
        sample = npr.randn(3, 2)
        check_grads(fun)(sample)


def test_r_double():
    """Gradient check for ``np.r_`` joining the traced input with itself."""

    def fun(x):
        # The drawn values are unused; the call stays so the random stream is consumed as before.
        npr.randn(3, 2)
        return np.r_[x, x]

    with warnings.catch_warnings(record=True):
        sample = npr.randn(3, 2)
        check_grads(fun)(sample)


def test_no_relation():
    """Gradient check for a function whose output ignores its input."""
    with warnings.catch_warnings(record=True):
        constant = npr.randn(3, 2)
        sample = npr.randn(3, 2)
        check_grads(lambda x: constant)(sample)


def test_r_no_relation():
    """Gradient check for ``np.r_`` applied only to a constant, ignoring the input."""
    with warnings.catch_warnings(record=True):
        constant = npr.randn(3, 2)
        sample = npr.randn(3, 2)
        check_grads(lambda x: np.r_[constant])(sample)


def test_r_node_and_const():
    """Gradient check for ``np.r_`` joining the traced input with a constant array."""
    with warnings.catch_warnings(record=True):
        constant = npr.randn(3, 2)
        sample = npr.randn(3, 2)
        check_grads(lambda x: np.r_[x, constant])(sample)


def test_r_mixed():
    """Gradient check for ``np.r_`` with a constant placed between two copies of the input."""
    with warnings.catch_warnings(record=True):
        constant = npr.randn(3, 2)
        sample = npr.randn(3, 2)
        check_grads(lambda x: np.r_[x, constant, x])(sample)


def test_r_slicing():
    """Gradient check for ``np.r_`` mixing the input, a constant and a ``1:10`` slice."""
    with warnings.catch_warnings(record=True):
        constant = npr.randn(10)
        sample = npr.randn(10)
        check_grads(lambda x: np.r_[x, constant, 1:10])(sample)


def test_c_():
    """Gradient check for ``np.c_`` with a constant placed between two copies of the input."""
    with warnings.catch_warnings(record=True):
        constant = npr.randn(3, 2)
        sample = npr.randn(3, 2)
        check_grads(lambda x: np.c_[x, constant, x])(sample)


def test_c_mixed():
    """Gradient check for ``np.c_`` mixing the traced input and a constant array."""
    with warnings.catch_warnings(record=True):
        constant = npr.randn(3, 2)
        sample = npr.randn(3, 2)

        def fun(x):
            return np.c_[x, constant, x]

        check_grads(fun)(sample)


def test_var_ddof():
    """Combination check of ``np.var`` over inputs, axes, ``keepdims`` and ``ddof`` values."""
    vec = npr.randn(3)
    mat = npr.randn(3, 4)
    row = npr.randn(1, 3)
    check = combo_check(np.var, (0,))
    check([vec, mat, row], axis=[None], keepdims=[True, False], ddof=[0, 1])
    # ddof=2 is only combined with the two 2-D inputs.
    combo_check(np.var, (0,))([mat, row], axis=[None, 1], keepdims=[True, False], ddof=[2])


def test_std_ddof():
    """Combination check of ``np.std`` over inputs, axes, ``keepdims`` and ``ddof`` values."""
    vec = npr.randn(3)
    mat = npr.randn(3, 4)
    row = npr.randn(1, 3)
    check = combo_check(np.std, (0,))
    check([vec, mat, row], axis=[None], keepdims=[True, False], ddof=[0, 1])
    # ddof=2 is only combined with the two 2-D inputs.
    combo_check(np.std, (0,))([mat, row], axis=[None, 1], keepdims=[True, False], ddof=[2])


def test_where():
    """Gradient check for ``np.where`` with a fixed boolean mask."""
    mask = npr.randn(4, 5) > 0
    if_true = npr.randn(4, 5)
    if_false = npr.randn(4, 5)
    check_grads(lambda x, y: np.where(mask, x, y))(if_true, if_false)


def test_squeeze_func():
    """Gradient check for the ``np.squeeze`` function."""
    sample = npr.randn(5, 1, 4)
    check_grads(lambda x: np.squeeze(x))(sample)


def test_squeeze_method():
    """Gradient check for the ``squeeze`` method."""
    sample = npr.randn(5, 1, 4)
    check_grads(lambda x: x.squeeze())(sample)


def test_repeat():
    """Gradient check for ``np.repeat`` with two repeats along axis 1."""
    sample = npr.randn(5, 3, 4)
    check_grads(lambda x: np.repeat(x, 2, axis=1))(sample)


def test_repeat_axis1_rep1():
    """Gradient check for ``np.repeat`` with a single repeat along axis 1."""
    sample = npr.randn(5, 3, 4)
    check_grads(lambda x: np.repeat(x, 1, axis=1))(sample)


def test_repeat_axis0():
    """Gradient check for ``np.repeat`` with two repeats along axis 0 of a matrix."""
    sample = npr.randn(5, 3)
    check_grads(lambda x: np.repeat(x, 2, axis=0))(sample)


def test_repeat_1d_axis0():
    """Gradient check for ``np.repeat`` with two repeats along axis 0 of a vector."""
    sample = npr.randn(5)
    check_grads(lambda x: np.repeat(x, 2, axis=0))(sample)


def test_repeat_axis0_rep1():
    """Gradient check for ``np.repeat`` with a single repeat along axis 0."""
    sample = npr.randn(5, 1)
    check_grads(lambda x: np.repeat(x, 1, axis=0))(sample)


def test_expand_dims():
    """Gradient check for ``np.expand_dims`` inserting an axis at position 2."""
    sample = npr.randn(5, 1, 4)
    check_grads(lambda x: np.expand_dims(x, 2))(sample)


def test_tensordot_kwargs_by_position():
    """``grad`` must work when the third ``np.tensordot`` argument is passed positionally."""

    def fun(x):
        left = x * np.ones((2, 2))
        right = x * np.ones((2, 2))
        return np.tensordot(left, right, 2)

    # Only needs to run without raising; the gradient value is not inspected.
    grad(fun)(1.0)


def test_multi_index():
    """Gradient check for list indexing that selects element 0 twice."""
    sample = npr.randn(3)

    def fun(x):
        return np.sum(x[[0, 0]])

    check_grads(fun)(sample)


def test_multi_index2():
    """Gradient check for list indexing with a repeated index among distinct ones."""
    sample = npr.randn(3)

    def fun(x):
        return np.sum(x[[0, 1, 0]])

    check_grads(fun)(sample)


def test_index_dot_slices():
    """Gradient check for ``np.dot`` of the two halves of the same vector."""
    sample = npr.randn(4)
    check_grads(lambda x: np.dot(x[:2], x[2:]))(sample)


# def test_index_exp_slicing():
#    def fun(x):
#        b = np.index_exp[x, x]
#        return b
#    A = npr.randn(10, 1)
#    check_grads(fun)(A)

# def test_s_slicing():
#    def fun(x):
#        b = np.s_[x, x]
#        return b
#    A = npr.randn(10, 1)
#    check_grads(fun)(A)

# TODO:
# getitem


def test_cast_to_int():
    """Gradient check through an ``np.int64`` cast whose result is used as an index.

    A column of ones is appended to ``W``; the array is then split back into
    its weight part and its index column, and the recovered integer indices
    select rows of the recovered weights.
    """
    inds = np.ones(5)[:, None]

    def fun(W):
        # glue W and inds together
        glued_together = np.concatenate((W, inds), axis=1)

        # separate W and inds back out -- from the glued array rather than
        # from W itself, so the concatenation above is actually exercised and
        # the indices are the known ones instead of truncated random floats
        new_W = glued_together[:, :-1]
        new_inds = np.int64(glued_together[:, -1])

        assert new_inds.dtype == np.int64
        return new_W[new_inds].sum()

    W = np.random.randn(5, 10)
    check_grads(fun)(W)


def test_make_diagonal():
    """Value and gradient checks for ``np.make_diagonal`` on a vector and on a batch of vectors."""

    def fun(D):
        return np.make_diagonal(D, axis1=-1, axis2=-2)

    # Single vector: the diagonal of the result must equal the input.
    vec = np.random.randn(4)
    mat = np.make_diagonal(vec, axis1=-1, axis2=-2)
    assert np.allclose(np.diag(mat), vec)
    check_grads(fun)(vec)

    # Batch of three vectors: each result slice must carry its row on the diagonal.
    batch = np.random.randn(3, 4)
    mats = np.make_diagonal(batch, axis1=-1, axis2=-2)
    assert all(np.allclose(np.diag(mats[i]), batch[i]) for i in range(3))
    check_grads(fun)(batch)


def test_diagonal():
    """Gradient check for ``np.diagonal`` over the last two axes, for a matrix and a batch."""

    def fun(D):
        return np.diagonal(D, axis1=-1, axis2=-2)

    # The unused ``np.make_diagonal`` results previously computed here were
    # dropped: they were never inspected and are unrelated to ``np.diagonal``.
    D = np.random.randn(4, 4)
    check_grads(fun)(D)

    D = np.random.randn(3, 4, 4)
    check_grads(fun)(D)


def test_nan_to_num():
    """Gradient check through ``np.nan_to_num`` when the offset holds nan and +/-inf entries."""
    offsets = np.array([0.0, np.nan, np.inf, -np.inf])

    def fun(x):
        cleaned = np.nan_to_num(x + offsets)
        return np.sum(np.sin(cleaned))

    sample = np.random.randn(4)
    check_grads(fun)(sample)


# TODO(mattjj): np.frexp returns a pair of ndarrays and the second is an int
# type, for which there is currently no vspace registered
# def test_frexp():
#    fun = lambda x: np.frexp(x)[0]
#    A = 1.2 #np.random.rand(4,3) * 0.8 + 2.1
#    check_grads(fun)(A)


def test_max_equal_values():
    """Gradient check for ``np.max`` when both candidates are the same value."""
    check_grads(lambda x: np.max(np.array([x, x])))(1.0)


def test_max_equal_values_2d():
    """Gradient check for row-wise ``np.max`` with tied entries, at a point above and below 0.5."""
    row_max = lambda x: np.max(np.array([[x, x], [x, 0.5]]), axis=1)
    for point in (1.0, -1.0):
        check_grads(row_max)(point)


def test_min_3_way_equality():
    """Gradient check for column-wise ``np.min`` where up to three entries tie."""

    def fun(x):
        table = np.array([[x, x, x], [x, 0.5, 0.5], [0.5, 0.5, 0.5], [x, x, 0.5]])
        return np.min(table, axis=0)

    for point in (1.0, -1.0):
        check_grads(fun)(point)


def test_maximum_equal_values():
    """Gradient check for ``np.maximum`` when both operands are the same value."""
    check_grads(lambda x: np.maximum(x, x))(1.0)


def test_maximum_equal_values_2d():
    """Gradient check for broadcast ``np.maximum`` with tied entries at three points."""

    def fun(x):
        row = np.array([x, x, 0.5])
        grid = np.array([[x, 0.5, x], [x, x, 0.5]])
        return np.maximum(row, grid)

    for point in (1.0, -1.0, 2.0):
        check_grads(fun)(point)


def test_linspace():
    """Gradient checks for ``np.linspace`` endpoints with 0, 1 and 5 samples."""
    endpoint_pairs = ((1.2, 3.4), (1.2, -3.4), (1.2, 1.2))
    for num in [0, 1, 5]:
        # The lambda reads ``num`` while this iteration is still running.
        fun = lambda x, y: np.linspace(x, y, num)
        for start, stop in endpoint_pairs:
            check_grads(fun)(start, stop)


def test_astype():
    """The gradient must keep the float32 dtype of the input despite a float64 ``astype`` inside."""
    inputs = np.arange(3, dtype="float32")

    def f(x):
        widened = x.astype("float64")
        return np.sum(np.sin(widened))

    gradient = grad(f)(inputs)
    assert gradient.dtype == np.dtype("float32")


def test_gradient():
    """Gradient checks for ``np.gradient`` on 1-D to 3-D inputs and for several ``axis`` values."""
    for shape in ((10,), (10, 10), (10, 10, 10)):
        check_grads(np.gradient, 0)(npr.randn(*shape))

    axis_choices = [None, 0, 1, -1, (0, 1), (0, -1)]
    for axis in axis_choices:
        check_grads(np.gradient, 0)(npr.randn(10, 10, 10), axis=axis)


================================================
FILE: tests/test_performance.py
================================================
# TODO:
# Do a huge calculation with trivial primitive computations
# and lots of diamonds and get a benchmark per-node time and
# memory cost.


================================================
FILE: tests/test_scalar_ops.py
================================================
import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad
from autograd.test_util import check_grads

# Seed the random generator once at module level so the random inputs drawn below are repeatable.
npr.seed(1)


def test_abs():
    """Gradient checks for scaled ``np.abs`` on both sides of zero, and first order only at zero."""

    def scaled(x):
        return 3.0 * np.abs(x)

    for point in (1.1, -1.1):
        check_grads(scaled)(point)
    check_grads(scaled, order=1)(0.0)


def test_absolute():
    """Gradient checks for scaled ``np.absolute`` on both sides of zero, and first order only at zero."""

    def scaled(x):
        return 3.0 * np.absolute(x)

    for point in (1.1, -1.1):
        check_grads(scaled)(point)
    check_grads(scaled, order=1)(0.0)


def test_sin():
    """Gradient check for scaled ``np.sin`` at a random point."""

    def scaled(x):
        return 3.0 * np.sin(x)

    check_grads(scaled)(npr.randn())


def test_sign():
    """Gradient checks for scaled ``np.sign`` at one positive and one negative point."""

    def scaled(x):
        return 3.0 * np.sign(x)

    for point in (1.1, -1.1):
        check_grads(scaled)(point)


def test_exp():
    """Gradient check for scaled ``np.exp`` at a random point."""

    def scaled(x):
        return 3.0 * np.exp(x)

    check_grads(scaled)(npr.randn())


def test_log():
    """Gradient check for scaled ``np.log`` at a random non-negative point."""

    def scaled(x):
        return 3.0 * np.log(x)

    point = abs(npr.randn())
    check_grads(scaled)(point)


def test_log2():
    """Gradient check for scaled ``np.log2`` at a random non-negative point."""

    def scaled(x):
        return 3.0 * np.log2(x)

    point = abs(npr.randn())
    check_grads(scaled)(point)


def test_log10():
    """Gradient check for scaled ``np.log10`` at a random non-negative point."""

    def scaled(x):
        return 3.0 * np.log10(x)

    point = abs(npr.randn())
    check_grads(scaled)(point)


def test_log1p():
    """Gradient check for scaled ``np.log1p`` at a random non-negative point."""

    def scaled(x):
        return 3.0 * np.log1p(x)

    point = abs(npr.randn())
    check_grads(scaled)(point)


def test_expm1():
    """Gradient check for scaled ``np.expm1`` at a random non-negative point."""

    def scaled(x):
        return 3.0 * np.expm1(x)

    point = abs(npr.randn())
    check_grads(scaled)(point)


def test_exp2():
    """Gradient check for scaled ``np.exp2`` at a random non-negative point."""

    def scaled(x):
        return 3.0 * np.exp2(x)

    point = abs(npr.randn())
    check_grads(scaled)(point)


def test_neg():
    """Gradient check for scaled unary negation at a random point."""

    def scaled(x):
        return 3.0 * -x

    check_grads(scaled)(npr.randn())


def test_cos():
    """Gradient check for scaled ``np.cos`` at a random point."""

    def scaled(x):
        return 3.0 * np.cos(x)

    check_grads(scaled)(npr.randn())


def test_tan():
    """Gradient check for scaled ``np.tan`` at a random point."""

    def scaled(x):
        return 3.0 * np.tan(x)

    check_grads(scaled)(npr.randn())


def test_cosh():
    """Gradient check for scaled ``np.cosh`` at a random point."""

    def scaled(x):
        return 3.0 * np.cosh(x)

    check_grads(scaled)(npr.randn())


def test_sinh():
    """Gradient check for scaled ``np.sinh`` at a random point."""

    def scaled(x):
        return 3.0 * np.sinh(x)

    check_grads(scaled)(npr.randn())


def test_tanh():
    """Gradient check for scaled ``np.tanh`` at a random point."""

    def scaled(x):
        return 3.0 * np.tanh(x)

    check_grads(scaled)(npr.randn())


def test_arccos():
    """Gradient check for scaled ``np.arccos`` at 0.1."""

    def scaled(x):
        return 3.0 * np.arccos(x)

    check_grads(scaled)(0.1)


def test_arcsin():
    """Gradient check for scaled ``np.arcsin`` at 0.1."""

    def scaled(x):
        return 3.0 * np.arcsin(x)

    check_grads(scaled)(0.1)


def test_arctan():
    """Gradient check for scaled ``np.arctan`` at 0.2."""

    def scaled(x):
        return 3.0 * np.arctan(x)

    check_grads(scaled)(0.2)


def test_arccosh():
    """Gradient check for scaled ``np.arccosh`` at a random point of at least 1.2."""

    def scaled(x):
        return 3.0 * np.arccosh(x)

    point = npr.randn() ** 2 + 1.2
    check_grads(scaled)(point)


def test_arcsinh():
    """Gradient check for scaled ``np.arcsinh`` at a random point."""

    def scaled(x):
        return 3.0 * np.arcsinh(x)

    check_grads(scaled)(npr.randn())


def test_arctanh():
    """Gradient check for scaled ``np.arctanh`` at 0.2."""

    def scaled(x):
        return 3.0 * np.arctanh(x)

    check_grads(scaled)(0.2)


def test_sqrt():
    """Gradient check for scaled ``np.sqrt`` at a random point in [0, 10)."""

    def scaled(x):
        return 3.0 * np.sqrt(x)

    point = 10.0 * npr.rand()
    check_grads(scaled)(point)


def test_power_arg0():
    """Gradient checks for ``np.power`` w.r.t. the base, including the zero-exponent case."""

    def make_fun(y):
        return lambda x: np.power(x, y)

    # the +1.'s here are to avoid regimes where numerical diffs fail
    exponent = npr.randn() ** 2 + 1.0
    base = npr.rand() ** 2 + 1.0
    check_grads(make_fun(exponent))(base)

    # test y == 0. as a special case, c.f. #116
    zero_power = make_fun(0.0)
    first = grad(zero_power)(0.0)
    assert first == 0.0
    second = grad(grad(zero_power))(0.0)
    assert second == 0.0


def test_power_arg1():
    """Gradient check for ``np.power`` w.r.t. the exponent, with a random non-negative base."""
    base = npr.randn() ** 2
    exponent = npr.rand() ** 2
    check_grads(lambda y: np.power(base, y))(exponent)


def test_power_arg1_zero():
    """Gradient check for ``np.power`` w.r.t. the exponent when the base is exactly zero."""
    exponent = npr.rand() ** 2
    check_grads(lambda y: np.power(0.0, y))(exponent)


def test_mod_arg0():
    """Gradient check for ``np.mod`` at a random pair of operands (default ``argnum``)."""

    def remainder(x, y):
        return np.mod(x, y)

    left = npr.rand()
    right = npr.rand()
    check_grads(remainder)(left, right)


def test_mod_arg1():
    """Gradient check for ``np.mod`` with respect to its second argument.

    ``argnum=1`` is passed explicitly; without it this test was a byte-for-byte
    copy of ``test_mod_arg0`` and never selected the second argument.
    """
    fun = lambda x, y: np.mod(x, y)
    # The 0.5 offset keeps the divisor away from zero, where the function
    # changes too quickly for finite differences to be reliable.
    check_grads(fun, argnum=1)(npr.rand(), npr.rand() + 0.5)


def test_divide_arg0():
    """Gradient check for ``np.divide`` at a random pair of operands (default ``argnum``)."""

    def quotient(x, y):
        return np.divide(x, y)

    left = npr.rand()
    right = npr.rand()
    check_grads(quotient)(left, right)


def test_divide_arg1():
    """Gradient check for ``np.divide`` with respect to its second argument.

    ``argnum=1`` is passed explicitly; without it this test was a byte-for-byte
    copy of ``test_divide_arg0`` and never selected the denominator.
    """
    fun = lambda x, y: np.divide(x, y)
    # The 0.5 offset keeps the denominator away from zero, where finite
    # differences of x / y become unreliable.
    check_grads(fun, argnum=1)(npr.rand(), npr.rand() + 0.5)


def test_multiply_arg0():
    """Gradient check for ``np.multiply`` at a random pair of operands (default ``argnum``)."""

    def product(x, y):
        return np.multiply(x, y)

    left = npr.rand()
    right = npr.rand()
    check_grads(product)(left, right)


def test_multiply_arg1():
    """Gradient check for ``np.multiply`` with respect to its second argument.

    ``argnum=1`` is passed explicitly; without it this test was a byte-for-byte
    copy of ``test_multiply_arg0`` and never selected the second factor.
    """
    fun = lambda x, y: np.multiply(x, y)
    check_grads(fun, argnum=1)(npr.rand(), npr.rand())


def test_true_divide_arg0():
    """Gradient check for ``np.true_divide`` at a random pair of operands (default ``argnum``)."""

    def quotient(x, y):
        return np.true_divide(x, y)

    left = npr.rand()
    right = npr.rand()
    check_grads(quotient)(left, right)


def test_true_divide_arg1():
    """Gradient check for ``np.true_divide`` with respect to its second argument.

    ``argnum=1`` is passed explicitly; without it this test was a byte-for-byte
    copy of ``test_true_divide_arg0`` and never selected the denominator.
    """
    fun = lambda x, y: np.true_divide(x, y)
    # The 0.5 offset keeps the denominator away from zero, where finite
    # differences of x / y become unreliable.
    check_grads(fun, argnum=1)(npr.rand(), npr.rand() + 0.5)


def test_reciprocal():
    """Gradient check for ``np.reciprocal`` at a random point."""

    def inverse(x):
        return np.reciprocal(x)

    check_grads(inverse)(npr.rand())


def test_negative():
    """Gradient check for ``np.negative`` at a random point."""

    def negated(x):
        return np.negative(x)

    check_grads(negated)(npr.rand())


def test_rad2deg():
    """Gradient check for scaled ``np.rad2deg`` at a random point in [0, 10)."""

    def scaled(x):
        return 3.0 * np.rad2deg(x)

    point = 10.0 * npr.rand()
    check_grads(scaled)(point)


def test_deg2rad():
    """Gradient check for scaled ``np.deg2rad`` at a random point in [0, 10)."""

    def scaled(x):
        return 3.0 * np.deg2rad(x)

    point = 10.0 * npr.rand()
    check_grads(scaled)(point)


def test_radians():
    """Gradient check for scaled ``np.radians`` at a random point in [0, 10)."""

    def scaled(x):
        return 3.0 * np.radians(x)

    point = 10.0 * npr.rand()
    check_grads(scaled)(point)


def test_degrees():
    """Gradient check for scaled ``np.degrees`` at a random point in [0, 10)."""

    def scaled(x):
        return 3.0 * np.degrees(x)

    point = 10.0 * npr.rand()
    check_grads(scaled)(point)


def test_sinc():
    """Gradient check for scaled ``np.sinc`` at a random point in [0, 10)."""

    def scaled(x):
        return 3.0 * np.sinc(x)

    point = 10.0 * npr.rand()
    check_grads(scaled)(point)


================================================
FILE: tests/test_scipy.py
================================================
from functools import partial

import numpy as npo

try:
    import scipy
except ImportError:
    # Only a missing scipy should downgrade this module to a warning. A bare
    # ``except:`` would also swallow KeyboardInterrupt / SystemExit and hide
    # unrelated failures raised while importing scipy.
    from warnings import warn

    warn("Skipping scipy tests.")
else:
    from numpy_utils import unary_ufunc_check
    from scipy.signal import convolve as sp_convolve

    import autograd.numpy as np
    import autograd.numpy.random as npr
    import autograd.scipy.integrate as integrate
    import autograd.scipy.linalg as spla
    import autograd.scipy.signal
    import autograd.scipy.special as special
    import autograd.scipy.stats as stats
    import autograd.scipy.stats.multivariate_normal as mvn
    from autograd import grad
    from autograd.test_util import check_grads, combo_check

    # Fixed seed so the random inputs drawn below are reproducible.
    npr.seed(1)
    # Short aliases used throughout the tests: R draws standard-normal samples,
    # U draws uniform samples.
    R = npr.randn
    U = npr.uniform

    # Fwd mode not yet implemented for scipy functions
    # The helpers are rebound with modes=["rev"] as the default; tests that
    # pass modes=... explicitly override this keyword.
    combo_check = partial(combo_check, modes=["rev"])
    unary_ufunc_check = partial(unary_ufunc_check, modes=["rev"])
    check_grads = partial(check_grads, modes=["rev"])

    def symmetrize_matrix_arg(fun, argnum):
        """Wrap ``fun`` so that positional argument ``argnum`` is symmetrized first.

        The wrapper replaces ``args[argnum]`` with ``0.5 * (X + X^T)``, where the
        transpose swaps the last two axes. Values with fewer than two dimensions
        are added to themselves and halved, i.e. passed through unchanged in
        value. All other positional and keyword arguments are forwarded as-is.
        """

        def symmetrized_fun(*args, **kwargs):
            positional = list(args)
            target = positional[argnum]
            # Only the trailing two axes are swapped; 0-D / 1-D inputs have
            # nothing to transpose.
            mirrored = np.swapaxes(target, -1, -2) if np.ndim(target) > 1 else target
            positional[argnum] = 0.5 * (target + mirrored)
            return fun(*positional, **kwargs)

        return symmetrized_fun

    ### Stats ###
    # chi2: gradient w.r.t. the first argument only (argnum [0]); the first
    # argument is R(4) ** 2 + 1.1 and the second is tried at 1, 2 and 3.
    def test_chi2_pdf():
        combo_check(stats.chi2.pdf, [0])([R(4) ** 2 + 1.1], [1, 2, 3])

    def test_chi2_cdf():
        combo_check(stats.chi2.cdf, [0])([R(4) ** 2 + 1.1], [1, 2, 3])

    def test_chi2_logpdf():
        combo_check(stats.chi2.logpdf, [0])([R(4) ** 2 + 1.1], [1, 2, 3])

    # beta: the first argument is drawn uniformly from [0, 1); the other two
    # are R(4) ** 2 + 1.1. cdf is checked for argnum 0 only, pdf/logpdf for all
    # three arguments.
    def test_beta_cdf():
        combo_check(stats.beta.cdf, [0])([U(0.0, 1.0, 4)], [R(4) ** 2 + 1.1], [R(4) ** 2 + 1.1])

    def test_beta_pdf():
        combo_check(stats.beta.pdf, [0, 1, 2])([U(0.0, 1.0, 4)], [R(4) ** 2 + 1.1], [R(4) ** 2 + 1.1])

    def test_beta_logpdf():
        combo_check(stats.beta.logpdf, [0, 1, 2])([U(0.0, 1.0, 4)], [R(4) ** 2 + 1.1], [R(4) ** 2 + 1.1])

    # gamma: both arguments are R(4) ** 2 + 1.1. cdf is checked for argnum 0
    # only, pdf/logpdf for both arguments.
    def test_gamma_cdf():
        combo_check(stats.gamma.cdf, [0])([R(4) ** 2 + 1.1], [R(4) ** 2 + 1.1])

    def test_gamma_pdf():
        combo_check(stats.gamma.pdf, [0, 1])([R(4) ** 2 + 1.1], [R(4) ** 2 + 1.1])

    def test_gamma_logpdf():
        combo_check(stats.gamma.logpdf, [0, 1])([R(4) ** 2 + 1.1], [R(4) ** 2 + 1.1])

    # norm: all three positional arguments are differentiated (argnums
    # [0, 1, 2]). The first two are R(4); the third is R(4) ** 2 + 1.1, which
    # keeps it at or above 1.1.
    def test_norm_pdf():
        combo_check(stats.norm.pdf, [0, 1, 2])([R(4)], [R(4)], [R(4) ** 2 + 1.1])

    def test_norm_cdf():
        combo_check(stats.norm.cdf, [0, 1, 2])([R(4)], [R(4)], [R(4) ** 2 + 1.1])

    def test_norm_sf():
        combo_check(stats.norm.sf, [0, 1, 2])([R(4)], [R(4)], [R(4) ** 2 + 1.1])

    def test_norm_logpdf():
        combo_check(stats.norm.logpdf, [0, 1, 2])([R(4)], [R(4)], [R(4) ** 2 + 1.1])

    def test_norm_logcdf():
        combo_check(stats.norm.logcdf, [0, 1, 2])([R(4)], [R(4)], [R(4) ** 2 + 1.1])

    def test_norm_logsf():
        combo_check(stats.norm.logsf, [0, 1, 2])([R(4)], [R(4)], [R(4) ** 2 + 1.1])

    # Broadcasting variants: the three arguments have shapes (4, 3), (1, 3)
    # and (4, 1) respectively.
    def test_norm_pdf_broadcast():
        combo_check(stats.norm.pdf, [0, 1, 2])([R(4, 3)], [R(1, 3)], [R(4, 1) ** 2 + 1.1])

    def test_norm_cdf_broadcast():
        combo_check(stats.norm.cdf, [0, 1, 2])([R(4, 3)], [R(1, 3)], [R(4, 1) ** 2 + 1.1])

    def test_norm_sf_broadcast():
        # Broadcasting check for the survival function, with argument shapes
        # (4, 3), (1, 3) and (4, 1). This previously exercised stats.norm.cdf,
        # duplicating test_norm_cdf_broadcast and leaving sf untested here.
        combo_check(stats.norm.sf, [0, 1, 2])([R(4, 3)], [R(1, 3)], [R(4, 1) ** 2 + 1.1])

    # Broadcasting variants of the log-density / log-cdf checks, with argument
    # shapes (4, 3), (1, 3) and (4, 1).
    def test_norm_logpdf_broadcast():
        combo_check(stats.norm.logpdf, [0, 1, 2])([R(4, 3)], [R(1, 3)], [R(4, 1) ** 2 + 1.1])

    def test_norm_logcdf_broadcast():
        combo_check(stats.norm.logcdf, [0, 1, 2])([R(4, 3)], [R(1, 3)], [R(4, 1) ** 2 + 1.1])

    def test_norm_logsf_broadcast():
        # Broadcasting check for the log survival function, with argument
        # shapes (4, 3), (1, 3) and (4, 1). This previously exercised
        # stats.norm.logcdf, duplicating test_norm_logcdf_broadcast.
        combo_check(stats.norm.logsf, [0, 1, 2])([R(4, 3)], [R(1, 3)], [R(4, 1) ** 2 + 1.1])

    # poisson: only the second argument is differentiated (argnum [1]). The
    # first argument is rounded to whole numbers with np.round(R(...) ** 2).
    def test_poisson_cdf():
        combo_check(stats.poisson.cdf, [1])([np.round(R(4) ** 2)], [R(4) ** 2 + 1.1])

    def test_poisson_logpmf():
        combo_check(stats.poisson.logpmf, [1])([np.round(R(4) ** 2)], [R(4) ** 2 + 1.1])

    def test_poisson_pmf():
        combo_check(stats.poisson.pmf, [1])([np.round(R(4) ** 2)], [R(4) ** 2 + 1.1])

    # Broadcasting variants: shapes (4, 3) against (4, 1).
    def test_poisson_cdf_broadcast():
        combo_check(stats.poisson.cdf, [1])([np.round(R(4, 3) ** 2)], [R(4, 1) ** 2 + 1.1])

    def test_poisson_logpmf_broadcast():
        combo_check(stats.poisson.logpmf, [1])([np.round(R(4, 3) ** 2)], [R(4, 1) ** 2 + 1.1])

    def test_poisson_pmf_broadcast():
        combo_check(stats.poisson.pmf, [1])([np.round(R(4, 3) ** 2)], [R(4, 1) ** 2 + 1.1])

    # Student t: four positional arguments; the second and fourth are
    # R(...) ** 2 + 2.1. pdf/logpdf are differentiated w.r.t. all four
    # arguments, cdf/logcdf only w.r.t. arguments 0 and 2.
    def test_t_pdf():
        combo_check(stats.t.pdf, [0, 1, 2, 3])([R(4)], [R(4) ** 2 + 2.1], [R(4)], [R(4) ** 2 + 2.1])

    def test_t_cdf():
        combo_check(stats.t.cdf, [0, 2])([R(4)], [R(4) ** 2 + 2.1], [R(4)], [R(4) ** 2 + 2.1])

    def test_t_logpdf():
        combo_check(stats.t.logpdf, [0, 1, 2, 3])([R(4)], [R(4) ** 2 + 2.1], [R(4)], [R(4) ** 2 + 2.1])

    def test_t_logcdf():
        combo_check(stats.t.logcdf, [0, 2])([R(4)], [R(4) ** 2 + 2.1], [R(4)], [R(4) ** 2 + 2.1])

    # Broadcasting variants: argument shapes (4, 3), (1, 3), (4, 3), (4, 1).
    def test_t_pdf_broadcast():
        combo_check(stats.t.pdf, [0, 1, 2, 3])(
            [R(4, 3)], [R(1, 3) ** 2 + 2.1], [R(4, 3)], [R(4, 1) ** 2 + 2.1]
        )

    def test_t_cdf_broadcast():
        combo_check(stats.t.cdf, [0, 2])([R(4, 3)], [R(1, 3) ** 2 + 2.1], [R(4, 3)], [R(4, 1) ** 2 + 2.1])

    def test_t_logpdf_broadcast():
        combo_check(stats.t.logpdf, [0, 1, 2, 3])(
            [R(4, 3)], [R(1, 3) ** 2 + 2.1], [R(4, 3)], [R(4, 1) ** 2 + 2.1]
        )

    def test_t_logcdf_broadcast():
        combo_check(stats.t.logcdf, [0, 2])([R(4, 3)], [R(1, 3) ** 2 + 2.1], [R(4, 3)], [R(4, 1) ** 2 + 2.1])

    def make_psd(mat):
        """Return ``mat.T @ mat + I``, a symmetric positive-definite matrix.

        ``mat.T @ mat`` is square with side ``mat.shape[1]``, so the identity is
        sized from the column count. Sizing it from ``mat.shape[0]`` only worked
        for square inputs and raised a broadcasting error otherwise.
        """
        return np.dot(mat.T, mat) + np.eye(mat.shape[1])

    # Multivariate normal: the covariance argument is built with make_psd and
    # additionally routed through symmetrize_matrix_arg, so the function being
    # differentiated always receives a symmetrized matrix in that position.
    def test_mvn_pdf():
        combo_check(symmetrize_matrix_arg(mvn.pdf, 2), [0, 1, 2])(
            [R(4)], [R(4)], [make_psd(R(4, 4))], allow_singular=[False]
        )

    def test_mvn_logpdf():
        combo_check(symmetrize_matrix_arg(mvn.logpdf, 2), [0, 1, 2])(
            [R(4)], [R(4)], [make_psd(R(4, 4))], allow_singular=[False]
        )

    # entropy takes (mean, cov); the covariance is positional argument 1.
    def test_mvn_entropy():
        combo_check(symmetrize_matrix_arg(mvn.entropy, 1), [0, 1])([10 * R(4)], [make_psd(R(4, 4))])

    def test_mvn_sing_cov():
        """Check mvn.pdf / mvn.logpdf gradients with a singular covariance.

        The covariance is zero everywhere except for ones at [0, 0] and [1, 1].
        """
        singular_cov = np.zeros((4, 4))
        singular_cov[0, 0] = 1
        singular_cov[1, 1] = 1

        def restrict_to_support(density):
            # Only allow variations in x along the first two dimensions, because
            # variance is zero in the last two.
            symmetrized = symmetrize_matrix_arg(partial(density, allow_singular=True), 2)

            def restricted(x, mean, cov):
                x = np.concatenate([x[:2], mean[2:]])
                return symmetrized(x, mean, cov)

            return restricted

        def random_point():
            # Two random leading coordinates followed by two zeros.
            return np.concatenate((R(2), np.zeros(2)))

        for density in (mvn.pdf, mvn.logpdf):
            combo_check(restrict_to_support(density), [0, 1])(
                [random_point()], [random_point()], [singular_cov]
            )

    # Broadcasting variants: x has shape (5, 4) while mean has shape (4,) and
    # the covariance is 4 x 4.
    def test_mvn_pdf_broadcast():
        combo_check(symmetrize_matrix_arg(mvn.pdf, 2), [0, 1, 2])([R(5, 4)], [R(4)], [make_psd(R(4, 4))])

    def test_mvn_logpdf_broadcast():
        combo_check(symmetrize_matrix_arg(mvn.logpdf, 2), [0, 1, 2])([R(5, 4)], [R(4)], [make_psd(R(4, 4))])

    # Shared fixtures for the Dirichlet tests below: alpha has four entries, each
    # at least 1.2, and x is the first row of a size-1 draw from
    # stats.dirichlet.rvs(alpha).
    alpha = npr.random(4) ** 2 + 1.2
    x = stats.dirichlet.rvs(alpha, size=1)[0, :]

    # Need to normalize input so that x's sum to one even when we perturb them to compute numeric gradient.
    def normalize(x):
        """Divide ``x`` by the sum of its elements (computed with builtin ``sum``)."""
        total = sum(x)
        return x / total

    # Wrappers that pass x through normalize() before evaluating the density,
    # used for the checks that differentiate with respect to x.
    def normalized_dirichlet_pdf(x, alpha):
        return stats.dirichlet.pdf(normalize(x), alpha)

    def normalized_dirichlet_logpdf(x, alpha):
        return stats.dirichlet.logpdf(normalize(x), alpha)

    # Gradient w.r.t. x goes through the normalized wrappers; gradient w.r.t.
    # alpha (argnum [1]) calls the densities directly.
    def test_dirichlet_pdf_x():
        combo_check(normalized_dirichlet_pdf, [0])([x], [alpha])

    def test_dirichlet_pdf_alpha():
        combo_check(stats.dirichlet.pdf, [1])([x], [alpha])

    def test_dirichlet_logpdf_x():
        combo_check(normalized_dirichlet_logpdf, [0])([x], [alpha])

    def test_dirichlet_logpdf_alpha():
        combo_check(stats.dirichlet.logpdf, [1])([x], [alpha])

    ### Misc ###
    # logsumexp tests pass modes=["fwd", "rev"] explicitly, overriding the
    # rev-only default bound to combo_check / check_grads in this module.
    # Tests 1-2 vary input shape, axis and keepdims without weights.
    def test_logsumexp1():
        combo_check(special.logsumexp, [0], modes=["fwd", "rev"])(
            [np.array([1.1]), R(4), R(3, 4)], axis=[None, 0], keepdims=[True, False]
        )

    def test_logsumexp2():
        combo_check(special.logsumexp, [0], modes=["fwd", "rev"])(
            [R(3, 4), R(4, 5, 6), R(1, 5)], axis=[None, 0, 1], keepdims=[True, False]
        )

    # Tests 3-5 additionally pass the ``b`` keyword, built as np.exp(R(...)) with
    # the same shape as the input.
    def test_logsumexp3():
        combo_check(special.logsumexp, [0], modes=["fwd", "rev"])(
            [R(4)], b=[np.exp(R(4))], axis=[None, 0], keepdims=[True, False]
        )

    def test_logsumexp4():
        combo_check(special.logsumexp, [0], modes=["fwd", "rev"])(
            [
                R(3, 4),
            ],
            b=[np.exp(R(3, 4))],
            axis=[None, 0, 1],
            keepdims=[True, False],
        )

    def test_logsumexp5():
        combo_check(special.logsumexp, [0], modes=["fwd", "rev"])(
            [R(2, 3, 4)], b=[np.exp(R(2, 3, 4))], axis=[None, 0, 1], keepdims=[True, False]
        )

    # Test 6 checks a (1, 5) input reduced along axis=1 with keepdims=True, for
    # both f itself and grad(f).
    def test_logsumexp6():
        x = npr.randn(1, 5)

        def f(a):
            return special.logsumexp(a, axis=1, keepdims=True)

        check_grads(f, modes=["fwd", "rev"])(x)
        check_grads(lambda a: grad(f)(a), modes=["fwd", "rev"])(x)

    ### Signal ###
    def test_convolve_generalization():
        # Compares autograd's generalized convolve (with its ``axes`` and
        # ``dot_axes`` keywords) against reference values assembled from
        # scipy.signal.convolve and numpy.tensordot, for both "valid" and "full"
        # modes. The A_<digits> names record each array's shape.
        ag_convolve = autograd.scipy.signal.convolve
        A_35 = R(3, 5)
        A_34 = R(3, 4)
        A_342 = R(3, 4, 2)
        A_2543 = R(2, 5, 4, 3)
        A_24232 = R(2, 4, 2, 3, 2)

        for mode in ["valid", "full"]:
            # Convolving one axis pair: a slice of the result equals the plain 1-D
            # convolution of the matching slices of the inputs.
            assert npo.allclose(
                ag_convolve(A_35, A_34, axes=([1], [0]), mode=mode)[1, 2],
                sp_convolve(A_35[1, :], A_34[:, 2], mode),
            )
            # No convolved axes and only dot_axes: expected to match tensordot.
            assert npo.allclose(
                ag_convolve(A_35, A_34, axes=([], []), dot_axes=([0], [0]), mode=mode),
                npo.tensordot(A_35, A_34, axes=([0], [0])),
            )
            # One convolved axis plus one dot axis: the reference sums the 1-D
            # convolutions over the dot index.
            assert npo.allclose(
                ag_convolve(A_35, A_342, axes=([1], [2]), dot_axes=([0], [0]), mode=mode)[2],
                sum([sp_convolve(A_35[i, :], A_342[i, 2, :], mode) for i in range(3)]),
            )
            # Two convolved axes plus two dot axes: the reference sums 2-D
            # convolutions over both dot indices.
            assert npo.allclose(
                ag_convolve(A_2543, A_24232, axes=([1, 2], [2, 4]), dot_axes=([0, 3], [0, 3]), mode=mode)[2],
                sum(
                    [
                        sum(
                            [sp_convolve(A_2543[i, :, :, j], A_24232[i, 2, :, j, :], mode) for i in range(2)]
                        )
                        for j in range(3)
                    ]
                ),
            )

    # Gradient checks for autograd.scipy.signal.convolve w.r.t. both inputs
    # (argnums [0, 1]) in "full" and "valid" modes.
    def test_convolve():
        combo_check(autograd.scipy.signal.convolve, [0, 1])(
            [R(4), R(5), R(6)], [R(2), R(3), R(4)], mode=["full", "valid"]
        )

    def test_convolve_2d():
        combo_check(autograd.scipy.signal.convolve, [0, 1])(
            [R(4, 3), R(5, 4), R(6, 7)], [R(2, 2), R(3, 2), R(4, 2), R(4, 1)], mode=["full", "valid"]
        )

    # Varies which axes are convolved via the ``axes`` keyword.
    def test_convolve_ignore():
        combo_check(autograd.scipy.signal.convolve, [0, 1])(
            [R(4, 3)],
            [R(3, 2)],
            axes=[([0], [0]), ([1], [1]), ([0], [1]), ([1], [0]), ([0, 1], [0, 1]), ([1, 0], [1, 0])],
            mode=["full", "valid"],
        )

    # Combines a fixed convolved axis pair with two choices of ``dot_axes``.
    def test_convolve_ignore_dot():
        combo_check(autograd.scipy.signal.convolve, [0, 1])(
            [R(3, 3, 2)],
            [R(3, 2, 3)],
            axes=[([1], [1])],
            dot_axes=[([0], [2]), ([0], [0])],
            mode=["full", "valid"],
        )

    ### Special ###
    # Two- and three-argument special functions checked through combo_check.
    def test_beta():
        combo_check(special.beta, [0, 1])([R(4) ** 2 + 1.1], [R(4) ** 2 + 1.1])

    # Only the third argument (drawn uniformly from [0, 1)) is differentiated.
    def test_betainc():
        combo_check(special.betainc, [2])([R(4) ** 2 + 1.1], [R(4) ** 2 + 1.1], [U(0.0, 1.0, 4)])

    def test_betaln():
        combo_check(special.betaln, [0, 1])([R(4) ** 2 + 1.1], [R(4) ** 2 + 1.1])

    # NOTE(review): in the five tests below the second argument is a bare array
    # rather than a one-element list, so combo_check presumably iterates over
    # its individual entries as separate scalar cases — confirm this is intended.
    def test_gammainc():
        combo_check(special.gammainc, [1])([1], R(4) ** 2 + 1.3)

    def test_gammaincc():
        combo_check(special.gammaincc, [1])([1], R(4) ** 2 + 1.3)

    def test_polygamma():
        combo_check(special.polygamma, [1])([0], R(4) ** 2 + 1.3)

    def test_jn():
        combo_check(special.jn, [1])([2], R(4) ** 2 + 1.3)

    def test_yn():
        combo_check(special.yn, [1])([2], R(4) ** 2 + 1.3)

    # Gamma-family unary functions, checked with lims=[0.3, 2.0] and complex
    # inputs disabled.
    def test_psi():
        unary_ufunc_check(special.psi, lims=[0.3, 2.0], test_complex=False)

    def test_digamma():
        unary_ufunc_check(special.digamma, lims=[0.3, 2.0], test_complex=False)

    def test_gamma():
        unary_ufunc_check(special.gamma, lims=[0.3, 2.0], test_complex=False)

    def test_gammaln():
        unary_ufunc_check(special.gammaln, lims=[0.3, 2.0], test_complex=False)

    def test_gammasgn():
        unary_ufunc_check(special.gammasgn, lims=[0.3, 2.0], test_complex=False)

    def test_rgamma():
        unary_ufunc_check(special.rgamma, lims=[0.3, 2.0], test_complex=False)

    # First argument is drawn from [4, 5) (scalar and (2, 3)-shaped); the second
    # is tried at 1, 2 and 3. Only the first argument is differentiated.
    def test_multigammaln():
        combo_check(special.multigammaln, [0])([U(4.0, 5.0), U(4.0, 5.0, (2, 3))], [1, 2, 3])

    # Bessel-function unary checks with lims=[0.2, 20.0], complex disabled.
    def test_j0():
        unary_ufunc_check(special.j0, lims=[0.2, 20.0], test_complex=False)

    def test_j1():
        unary_ufunc_check(special.j1, lims=[0.2, 20.0], test_complex=False)

    def test_y0():
        unary_ufunc_check(special.y0, lims=[0.2, 20.0], test_complex=False)

    def test_y1():
        unary_ufunc_check(special.y1, lims=[0.2, 20.0], test_complex=False)

    def test_i0():
        unary_ufunc_check(special.i0, lims=[0.2, 20.0], test_complex=False)

    def test_i1():
        unary_ufunc_check(special.i1, lims=[0.2, 20.0], test_complex=False)

    # NOTE(review): both arguments here are bare arrays rather than lists of
    # candidate values, so combo_check presumably pairs individual scalar
    # entries — confirm this is intended. Only argument 1 is differentiated.
    def test_iv():
        combo_check(special.iv, [1])(U(1.0, 50.0, 4), R(4) ** 2 + 1.3)

    def test_ive():
        combo_check(special.ive, [1])(U(1.0, 50.0, 4), R(4) ** 2 + 1.3)

    # erf / erfc are the only checks in this group with test_complex=True.
    def test_erf():
        unary_ufunc_check(special.erf, lims=[-3.0, 3.0], test_complex=True)

    def test_erfc():
        unary_ufunc_check(special.erfc, lims=[-3.0, 3.0], test_complex=True)

    # The inverse functions and logit use lims strictly inside (-1, 1), (0, 2)
    # and (0, 1) respectively.
    def test_erfinv():
        unary_ufunc_check(special.erfinv, lims=[-0.95, 0.95], test_complex=False)

    def test_erfcinv():
        unary_ufunc_check(special.erfcinv, lims=[0.05, 1.95], test_complex=False)

    def test_logit():
        unary_ufunc_check(special.logit, lims=[0.10, 0.90], test_complex=False)

    def test_expit():
        unary_ufunc_check(special.expit, lims=[-4.05, 4.95], test_complex=False)

    ### ODE integrator ###
    # Right-hand side handed to integrate.odeint in the test below; takes the
    # state y, the time t and two extra arguments.
    def func(y, t, arg1, arg2):
        return -np.sqrt(t) - y + arg1 - np.mean((y + arg2) ** 2)

    # Differentiates odeint w.r.t. arguments 1, 2 and 3: the R(3) initial value,
    # the four-point time grid on [0.1, 0.2], and the (arg1, arg2) tuple.
    def test_odeint():
        combo_check(integrate.odeint, [1, 2, 3])([func], [R(3)], [np.linspace(0.1, 0.2, 4)], [(R(3), R(3))])

    ## Linalg
    # NOTE(review): two functions named test_sqrtm are defined back to back. The
    # second definition rebinds the name, so only the symmetrized fwd+rev check
    # is reachable as test_sqrtm; the fwd-only check on an unsymmetrized R(3, 3)
    # input is shadowed. Confirm whether the first variant should be renamed
    # or dropped.
    def test_sqrtm():
        combo_check(spla.sqrtm, modes=["fwd"], order=2)([R(3, 3)])

    def test_sqrtm():
        combo_check(symmetrize_matrix_arg(spla.sqrtm, 0), modes=["fwd", "rev"], order=2)([R(3, 3)])

    # solve_sylvester: all three 3 x 3 arguments differentiated, both modes,
    # checked to order 2.
    def test_solve_sylvester():
        combo_check(spla.solve_sylvester, [0, 1, 2], modes=["rev", "fwd"], order=2)(
            [R(3, 3)], [R(3, 3)], [R(3, 3)]
        )

    # solve_banded: the (1, 1) first argument is held fixed; arguments 1 and 2
    # (shapes (3, 5) and (5,)) are differentiated in reverse mode, order 1.
    def test_solve_banded():
        combo_check(spla.solve_banded, [1, 2], modes=["rev"], order=1)([(1, 1)], [R(3, 5)], [R(5)])


================================================
FILE: tests/test_systematic.py
================================================
import operator as op

import numpy as onp
from numpy_utils import binary_ufunc_check, binary_ufunc_check_no_same_args, stat_check, unary_ufunc_check

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd.test_util import combo_check

npr.seed(0)


# Array statistics functions
# def test_all():  stat_check(np.all)
# def test_any():  stat_check(np.any)
def test_max():
    """Run the shared stat_check harness on np.max.

    This function used to be defined twice in a row with identical bodies; the
    redundant first copy was dropped, since the second one silently replaced it.
    """
    stat_check(np.max)


# Each reduction below is passed to the shared stat_check helper imported from
# numpy_utils.
def test_mean():
    stat_check(np.mean)


def test_min():
    stat_check(np.min)


def test_sum():
    stat_check(np.sum)


def test_prod():
    stat_check(np.prod)


def test_var():
    stat_check(np.var)


def test_std():
    stat_check(np.std)


# Unary ufunc tests


# NOTE(review): an identical test_sin is defined again later in this module
# (between test_sign and test_sinh); that later definition rebinds the name,
# so this copy is redundant.
def test_sin():
    unary_ufunc_check(np.sin)


# Unary ufuncs are passed to unary_ufunc_check. Where given, ``lims`` sets the
# input range the helper is asked to use, and test_complex=False disables the
# complex-input case.
def test_abs():
    unary_ufunc_check(np.abs, lims=[0.1, 4.0])


def test_absolute():
    unary_ufunc_check(np.absolute, lims=[0.1, 4.0])


def test_arccosh():
    unary_ufunc_check(np.arccosh, lims=[1.1, 4.0])


def test_arcsinh():
    unary_ufunc_check(np.arcsinh, lims=[-0.9, 0.9])


def test_arctanh():
    unary_ufunc_check(np.arctanh, lims=[-0.9, 0.9])


def test_ceil():
    unary_ufunc_check(np.ceil, lims=[-1.5, 1.5], test_complex=False)


def test_cos():
    unary_ufunc_check(np.cos)


def test_cosh():
    unary_ufunc_check(np.cosh)


def test_deg2rad():
    unary_ufunc_check(np.deg2rad, test_complex=False)


# The output is divided by 50.0 before checking.
def test_degrees():
    unary_ufunc_check(lambda x: np.degrees(x) / 50.0, test_complex=False)


def test_exp():
    unary_ufunc_check(np.exp)


def test_exp2():
    unary_ufunc_check(np.exp2)


def test_expm1():
    unary_ufunc_check(np.expm1)


def test_fabs():
    unary_ufunc_check(np.fabs, test_complex=False)

# Further unary ufunc checks; the logarithms all use lims=[0.2, 2.0].
def test_floor():
    unary_ufunc_check(np.floor, lims=[-1.5, 1.5], test_complex=False)


def test_log():
    unary_ufunc_check(np.log, lims=[0.2, 2.0])


def test_log10():
    unary_ufunc_check(np.log10, lims=[0.2, 2.0])


def test_log1p():
    unary_ufunc_check(np.log1p, lims=[0.2, 2.0])


def test_log2():
    unary_ufunc_check(np.log2, lims=[0.2, 2.0])


# The output is divided by 50.0 before checking.
def test_rad2deg():
    unary_ufunc_check(lambda x: np.rad2deg(x) / 50.0, test_complex=False)


def test_radians():
    unary_ufunc_check(np.radians, test_complex=False)


def test_sign():
    unary_ufunc_check(np.sign, test_complex=False)


# NOTE(review): test_sin is also defined earlier in this module with the same
# body; this later definition is the one bound to the name.
def test_sin():
    unary_ufunc_check(np.sin)


def test_sinh():
    unary_ufunc_check(np.sinh)


def test_sqrt():
    unary_ufunc_check(np.sqrt, lims=[1.0, 3.0])


def test_square():
    unary_ufunc_check(np.square, test_complex=False)


def test_tan():
    unary_ufunc_check(np.tan, lims=[-1.1, 1.1])


def test_tanh():
    unary_ufunc_check(np.tanh)


# Complex-number helpers, checked with unary_ufunc_check's default settings.
def test_real():
    unary_ufunc_check(np.real)


# "ic" abbreviates "if close": the function under test is np.real_if_close.
def test_real_ic():
    unary_ufunc_check(np.real_if_close)


def test_imag():
    unary_ufunc_check(np.imag)


def test_conj():
    unary_ufunc_check(np.conj)


def test_conjugate():
    unary_ufunc_check(np.conjugate)


def test_angle():
    unary_ufunc_check(np.angle)


# Binary ufunc tests


# Binary ufuncs are passed to binary_ufunc_check. The remainder / mod family
# uses binary_ufunc_check_no_same_args instead, and ``lims_A`` / ``lims_B`` set
# the requested ranges for the first / second operand.
def test_add():
    binary_ufunc_check(np.add)


def test_logaddexp():
    binary_ufunc_check(np.logaddexp, test_complex=False)


def test_logaddexp2():
    binary_ufunc_check(np.logaddexp2, test_complex=False)


def test_remainder():
    binary_ufunc_check_no_same_args(np.remainder, lims_A=[-0.9, 0.9], lims_B=[0.7, 1.9], test_complex=False)


def test_true_divide():
    binary_ufunc_check(np.true_divide, lims_B=[0.8, 1.2], test_complex=False)


def test_mod():
    binary_ufunc_check_no_same_args(np.mod, lims_B=[0.8, 2.1], test_complex=False)


# The *_neg variants use a strictly negative range for the second operand.
def test_true_divide_neg():
    binary_ufunc_check(np.true_divide, lims_B=[-0.3, -2.0], test_complex=False)


def test_mod_neg():
    binary_ufunc_check_no_same_args(np.mod, lims_B=[-0.3, -2.0], test_complex=False)


# The op_* tests pass functions from the stdlib ``operator`` module instead of
# numpy ufuncs.
def test_op_mul():
    binary_ufunc_check(op.mul)


def test_op_add():
    binary_ufunc_check(op.add)


def test_op_sub():
    binary_ufunc_check(op.sub)


def test_op_mod():
    binary_ufunc_check_no_same_args(op.mod, lims_B=[0.3, 2.0], test_complex=False)


def test_op_mod_neg():
    binary_ufunc_check_no_same_args(op.mod, lims_B=[-0.3, -2.0], test_complex=False)


# Misc tests

# R(*shape) draws a real standard-normal array of the given shape.
R = npr.randn
# C(*shape) draws a complex array whose real and imaginary parts are
# independent standard-normal draws of the given shape.
C = lambda *shape: npr.randn(*shape) + 1j * npr.randn(*shape)


# Shape-manipulation checks. combo_check is given one list of candidate values
# per positional / keyword argument.
def test_transpose():
    combo_check(np.transpose, [0])(
        [R(2, 3, 4)], axes=[None, [0, 1, 2], [0, 2, 1], [2, 0, 1], [2, 1, 0], [1, 0, 2], [1, 2, 0]]
    )


def test_moveaxis():
    combo_check(np.moveaxis, [0])([R(2, 3, 4)], source=[0, 1, 2], destination=[0, 1, 2])


def test_repeat():
    combo_check(np.repeat, [0])([R(2, 3, 4), R(3, 1)], repeats=[0, 1, 2], axis=[None, 0, 1])


# Includes length-1 axes in the second and third calls.
def test_diff():
    combo_check(np.diff, [0])([R(5, 5), R(5, 5, 5)], n=[1, 2], axis=[0, 1])
    combo_check(np.diff, [0])([R(1), R(1, 1)], axis=[0])
    combo_check(np.diff, [0])([R(1, 1), R(3, 1)], axis=[1])


# The second call passes tuples of axes.
def test_gradient():
    combo_check(np.gradient, [0])([R(5, 5), R(5, 5, 5)], axis=[None, 0, 1, -1])
    combo_check(np.gradient, [0])([R(5, 5, 5)], axis=[(0, 1), (0, -1)])


def test_tile():
    combo_check(np.tile, [0])([R(2, 1, 3, 1)], reps=[(1, 4, 1, 2)])
    combo_check(np.tile, [0])([R(1, 2)], reps=[(1, 2), (2, 3), (3, 2, 1)])
    combo_check(np.tile, [0])([R(1)], reps=[(2,), 2])


# Mixes 2-D, 1-D, 0-D (R()) and complex operands.
def test_kron():
    combo_check(np.kron, [0, 1])(
        [R(5, 5), R(4, 4), R(5), R(5, 1), R(1, 5), R(), C(5, 5)],
        [R(3, 3), R(2, 2), R(3), R(1, 3), R(3, 1), R(), C(3, 3)],
    )


# Product-style functions, differentiated w.r.t. both operands.
def test_inner():
    combo_check(np.inner, [0, 1])([1.5, R(3), R(2, 3)], [0.3, R(3), R(4, 3)])


# dot and outer are checked to order 3.
def test_dot():
    combo_check(np.dot, [0, 1], order=3)(
        [1.5, R(3), R(2, 3), R(2, 2, 3), C(3), C(2, 3)], [0.3, R(3), R(3, 4), R(2, 3, 4), C(3)]
    )


def test_outer():
    combo_check(np.outer, [0, 1], order=3)([R(3), C(3)], [R(3), C(3)])


def test_matmul():
    combo_check(np.matmul, [0, 1])(
        [R(3), R(2, 3), R(2, 2, 3), C(3), C(2, 3)], [R(3), R(3, 4), R(2, 3, 4), C(3), C(3, 4)]
    )


# Leading dimensions 1 and 3 must broadcast against each other.
def test_matmul_broadcast():
    combo_check(np.matmul, [0, 1])([R(1, 2, 2)], [R(3, 2, 1)])


# tensordot checks, all to order 3, covering the different ways ``axes`` can be
# spelled: pairs of tuples (1-3), plain integers (4-5), nested lists with a
# negative index (6), and flat two-element lists (7-8).
def test_tensordot_1():
    combo_check(np.tensordot, [0, 1], order=3)(
        [R(1, 3), R(2, 3, 2), C(1, 3)], [R(3), R(3, 1), R(3, 4, 2), C(3)], axes=[[(1,), (0,)]]
    )


def test_tensordot_2():
    combo_check(np.tensordot, [0, 1], order=3)(
        [R(3), R(3, 1), R(3, 4, 2)], [R(1, 3), R(2, 3, 2)], axes=[[(0,), (1,)]]
    )


def test_tensordot_3():
    combo_check(np.tensordot, [0, 1], order=3)(
        [R(2, 3), R(2, 3, 4)], [R(1, 2, 3), R(2, 2, 3, 4)], axes=[[(0, 1), (1, 2)], [(1, 0), (2, 1)]]
    )


def test_tensordot_4():
    combo_check(np.tensordot, [0, 1], order=3)([R(2, 2), R(4, 2, 2)], [R(2, 2), R(2, 2, 4)], axes=[1, 2])


def test_tensordot_5():
    combo_check(np.tensordot, [0, 1], order=3)([R(4)], [R()], axes=[0])


def test_tensordot_6():
    combo_check(np.tensordot, [0, 1], order=3)([R(2, 6)], [R(6, 3)], axes=[[[-1], [0]]])


def test_tensordot_7():
    combo_check(np.tensordot, [0, 1], order=3)([R(2, 6)], [R(6, 3)], axes=[[-1, 0]])


def test_tensordot_8():
    combo_check(np.tensordot, [0, 1], order=3)([R(2)], [R(2, 2)], axes=[[0, 1]])


# Need custom tests because gradient is undefined when arguments are identical.
# Each operand list is built from separate R(...) draws, so the two operands
# in any pairing are independent random arrays.
def test_maximum():
    combo_check(np.maximum, [0, 1])([R(1), R(1, 4), R(3, 4)], [R(1), R(1, 4), R(3, 4)])


def test_fmax():
    combo_check(np.fmax, [0, 1])([R(1), R(1, 4), R(3, 4)], [R(1), R(1, 4), R(3, 4)])


def test_minimum():
    combo_check(np.minimum, [0, 1])([R(1), R(1, 4), R(3, 4)], [R(1), R(1, 4), R(3, 4)])


def test_fmin():
    combo_check(np.fmin, [0, 1])([R(1), R(1, 4), R(3, 4)], [R(1), R(1, 4), R(3, 4)])


def test_sort():
    combo_check(np.sort, [0])([R(1), R(7)])


if onp.lib.NumpyVersion(onp.__version__) < "2.0.0":

    def test_msort():
        combo_check(np.msort, [0])([R(1), R(7)])


def test_partition():
    combo_check(np.partition, [0])([R(7), R(14)], kth=[0, 3, 6])


def test_atleast_1d():
    combo_check(np.atleast_1d, [0])([1.2, R(1), R(7), R(1, 4), R(2, 4), R(2, 4, 5)])


def test_atleast_2d():
    combo_check(np.atleast_2d, [0])([1.2, R(1), R(7), R(1, 4), R(2, 4), R(2, 4, 5)])


def test_atleast_3d():
    combo_check(np.atleast_3d, [0])([1.2, R(1), R(7), R(1, 4), R(2, 4), R(2, 4, 5), R(2, 4, 3, 5)])


# einsum with a subscripts string: the string is positional argument 0, so the
# operands start at argnum 1.
def test_einsum_transpose():
    combo_check(np.einsum, [1])(["ij->ji"], [R(1, 1), R(4, 4), R(3, 4)])


def test_einsum_matmult():
    combo_check(np.einsum, [1, 2])(["ij,jk->ik"], [R(2, 3), C(2, 3)], [R(3, 4), C(3, 4)])


def test_einsum_matmult_broadcast():
    combo_check(np.einsum, [1, 2])(["...ij,...jk->...ik"], [R(2, 3), R(2, 2, 3)], [R(3, 4), R(2, 3, 4)])


# Operands have a leading dimension of size 0.
def test_einsum_matmult_broadcast_leadzero():
    combo_check(np.einsum, [1, 2])(["...ij,...jk->...ik"], [R(0, 2, 3)], [R(0, 3, 4)])


def test_einsum_covsum():
    combo_check(np.einsum, [1, 2])(["ijk,lji->lki"], [R(3, 4, 4)], [R(4, 4, 3)])


# Ellipsis at the head, tail and centre of the subscripts.
def test_einsum_ellipses():
    combo_check(np.einsum, [1, 2])(
        ["...jk,...lj->...lk", "...,...->..."], [R(4, 4), R(3, 4, 4)], [R(4, 4), R(3, 4, 4)]
    )


def test_einsum_ellipses_tail():
    combo_check(np.einsum, [1, 2])(["jk...,lj...->lk..."], [R(3, 2), R(3, 2, 4)], [R(2, 3), R(2, 3, 4)])


def test_einsum_ellipses_center():
    combo_check(np.einsum, [1, 2])(["j...k,lj...->lk..."], [R(2, 2), R(2, 2, 2)], [R(2, 2), R(2, 2, 2)])


# NOTE(review): three operands are supplied but argnums is [1, 2], so the
# third operand (argnum 3) is not differentiated — confirm this is intended.
def test_einsum_three_args():
    combo_check(np.einsum, [1, 2])(["ijk,lji,lli->lki"], [R(3, 4, 4)], [R(4, 4, 3)], [R(4, 4, 3)])


# The einsum2_* tests use the alternative calling form in which each operand is
# followed by a tuple of integer axis labels, so the operands sit at argnums
# 0, 2, ...
def test_einsum2_transpose():
    combo_check(np.einsum, [0])([R(1, 1), R(4, 4), R(3, 4)], [(0, 1)], [(1, 0)])


def test_einsum2_matmult():
    combo_check(np.einsum, [0, 2])([R(2, 3)], [(0, 1)], [R(3, 4)], [(1, 2)], [(0, 2)])


def test_einsum2_matmult_broadcast():
    combo_check(np.einsum, [0, 2])(
        [R(2, 3), R(2, 2, 3)],
        [(Ellipsis, 0, 1)],
        [R(3, 4), R(2, 3, 4)],
        [(Ellipsis, 1, 2)],
        [(Ellipsis, 0, 2)],
    )


def test_einsum2_covsum():
    combo_check(np.einsum, [0, 2])([R(3, 4, 4)], [(0, 1, 2)], [R(4, 4, 3)], [(3, 1, 0)], [(3, 2, 0)])


# NOTE(review): the third operand (argnum 4) is not in argnums [0, 2] — confirm
# this is intended.
def test_einsum2_three_args():
    combo_check(np.einsum, [0, 2])(
        [R(3, 4, 4)], [(0, 1, 2)], [R(4, 4, 3)], [(3, 1, 0)], [R(4, 4, 3)], [(3, 3, 0)], [(3, 2, 0)]
    )


# Subscripts whose output side is empty or omitted.
def test_einsum_naked_sum():
    combo_check(np.einsum, [1, 2])(["k,nk->"], [R(5)], [R(10, 5)])


def test_einsum_naked_sum2():
    combo_check(np.einsum, [1])(["abcd->bd"], [R(3, 2, 3, 2)])


def test_einsum_naked_sum_ellipsis():
    combo_check(np.einsum, [1, 2])(["...k,...nk->..."], [R(3, 5)], [R(3, 10, 5)])


def test_einsum_no_output_indices():
    combo_check(np.einsum, [1, 2])(["ij,k"], [R(3, 4)], [R(3)])


# Diagonal / triangular extraction, each tried with offsets -1, 0 and 1.
def test_trace():
    combo_check(np.trace, [0])([R(5, 5), R(4, 5), R(5, 4), R(3, 4, 5)], offset=[-1, 0, 1])


def test_diag():
    combo_check(np.diag, [0])([R(5, 5)], k=[-1, 0, 1])


# np.diag applied to a 1-D input.
def test_diag_flat():
    combo_check(np.diag, [0])([R(5)], k=[-1, 0, 1])


def test_tril():
    combo_check(np.tril, [0])([R(5, 5)], k=[-1, 0, 1])


def test_triu():
    combo_check(np.triu, [0])([R(5, 5)], k=[-1, 0, 1])


def test_tril_3d():
    combo_check(np.tril, [0])([R(5, 5, 4)], k=[-1, 0, 1])


def test_triu_3d():
    combo_check(np.triu, [0])([R(5, 5, 4)], k=[-1, 0, 1])


# Axis-permutation functions, tried over every listed axis combination.
def test_swapaxes():
    combo_check(np.swapaxes, [0])([R(3, 4, 5)], axis1=[0, 1, 2], axis2=[0, 1, 2])


def test_rollaxis():
    combo_check(np.rollaxis, [0])([R(2, 3, 4)], axis=[0, 1, 2], start=[0, 1, 2])


def test_cross():
    combo_check(np.cross, [0, 1])(
        [R(3, 3)], [R(3, 3)], axisa=[-1, 0, 1], axisb=[-1, 0, 1], axisc=[-1, 0, 1], axis=[None, -1, 0, 1]
    )


# Splitting functions. The second argument is tried both as an integer section
# count and as a list of split indices.
def test_vsplit_2d():
    combo_check(np.vsplit, [0])([R(4, 8)], [4, [1, 2]])


def test_vsplit_3d():
    combo_check(np.vsplit, [0])([R(4, 4, 4)], [2, [1, 2]])


def test_hsplit_2d():
    combo_check(np.hsplit, [0])([R(4, 8)], [4, [1, 2]])


def test_hsplit_3d():
    combo_check(np.hsplit, [0])([R(4, 4, 4)], [2, [1, 2]])


def test_dsplit_3d():
    combo_check(np.dsplit, [0])([R(4, 4, 4)], [2, [1, 2]])


def test_split_1d():
    combo_check(np.split, [0])([R(1), R(7)], [1], axis=[0])


def test_split_2d():
    combo_check(np.split, [0])([R(4, 8)], [4, [1, 2]], axis=[0, 1])


def test_split_3d():
    combo_check(np.split, [0])([R(4, 4, 4)], [2, [1, 2]], axis=[0, 1, 2])


# array_split is exercised with lengths (1, 7) that the section counts do not
# always divide evenly.
def test_array_split_1d():
    combo_check(np.array_split, [0])([R(1), R(7)], [1, 3], axis=[0])


def test_array_split_2d():
    combo_check(np.array_split, [0])([R(7, 7)], [4, [3, 5]], axis=[0, 1])


def test_array_split_3d():
    combo_check(np.array_split, [0])([R(7, 7, 7)], [4, [3, 5]], axis=[0, 1, 2])


# The two tests below previously had their inputs swapped relative to their
# names (the "list" test passed a tuple and the "tuple" test passed a list).
# The same two container types are still covered; each now sits under the
# matching name.
def test_concatenate_1ist():
    # "1ist" in the name is a long-standing typo for "list"; the name is kept
    # so existing test IDs stay stable.
    combo_check(np.concatenate, [0])([[R(1), R(3)]], axis=[0])


def test_concatenate_tuple():
    combo_check(np.concatenate, [0])([(R(1), R(3))], axis=[0])


# Concatenation of equally shaped arrays along every available axis.
def test_concatenate_2d():
    combo_check(np.concatenate, [0])([(R(2, 2), R(2, 2))], axis=[0, 1])


def test_concatenate_3d():
    combo_check(np.concatenate, [0])([(R(2, 2, 2), R(2, 2, 2))], axis=[0, 1, 2])


# Stacking functions. Each candidate list holds both a single array and a
# tuple of arrays as the value for the first argument.
def test_vstack_1d():
    combo_check(np.vstack, [0])([R(2), (R(2), R(2))])


def test_vstack_2d():
    combo_check(np.vstack, [0])([R(2, 3), (R(2, 4), R(1, 4))])


def test_vstack_3d():
    combo_check(np.vstack, [0])([R(2, 3, 4), (R(2, 3, 4), R(5, 3, 4))])


def test_hstack_1d():
    combo_check(np.hstack, [0])([R(2), (R(2), R(2))])


def test_hstack_2d():
    combo_check(np.hstack, [0])([R(3, 2), (R(3, 4), R(3, 5))])


def test_hstack_3d():
    combo_check(np.hstack, [0])([R(2, 3, 4), (R(2, 1, 4), R(2, 5, 4))])


# np.stack is given tuples only, and is tried along axis 0 and 1.
def test_stack_1d():
    combo_check(np.stack, [0])([(R(2),), (R(2), R(2))], axis=[0, 1])


def test_row_stack_1d():
    combo_check(np.row_stack, [0])([R(2), (R(2), R(2))])


def test_row_stack_2d():
    combo_check(np.row_stack, [0])([R(2, 3), (R(2, 4), R(1, 4))])


def test_column_stack_1d():
    combo_check(np.column_stack, [0])([R(2), (R(2), R(2))])


def test_column_stack_2d():
    combo_check(np.column_stack, [0])([R(2, 2), (R(2, 2), R(2, 2))])


# np.select: the boolean condition list (argument 0) is held fixed and only
# the choice list (argnum 1) is differentiated; two default values are tried.
def test_select():
    combo_check(np.select, [1])(
        [[R(3, 4, 5) > 0, R(3, 4, 5) > 0, R(3, 4, 5) > 0]],
        [[R(3, 4, 5), R(3, 4, 5), R(3, 4, 5)]],
        default=[0.0, 1.1],
    )


# np.pad in "constant" mode with the pad width spelled as an int, a 1-tuple, a
# pair, and nested per-axis pairs (including all-zero padding).
def test_pad():
    combo_check(np.pad, [0])(
        [R(2, 2)], [0, 3, (3,), (3, 2), ((3, 2),), ((1, 2), (3, 4)), ((0, 0), (0, 0))], ["constant"]
    )


================================================
FILE: tests/test_tests.py
================================================
from pytest import raises

from autograd.extend import defvjp
from autograd.test_util import check_grads
from autograd.tracer import primitive


def test_check_vjp_1st_order_fail():
    """check_grads must raise when a primitive's VJP is slightly wrong.

    ``foo`` doubles its input, but its registered VJP scales by 2.001.
    """

    @primitive
    def foo(x):
        return x * 2.0

    def slightly_wrong_vjp(ans, x):
        return lambda g: g * 2.001

    defvjp(foo, slightly_wrong_vjp)

    expected_message = r"\(VJP\) check of foo failed"
    with raises(AssertionError, match=expected_message):
        check_grads(foo, modes=["rev"])(1.0)


def test_check_vjp_2nd_order_fail():
    """check_grads must raise when only the derivative of the VJP is wrong.

    ``foo``'s VJP has the correct value (it multiplies by 2) but routes the
    cotangent through ``bar``, an identity primitive whose own VJP scales by
    1.001. The expected failure message names ``vjp_foo``.
    """

    @primitive
    def foo(x):
        return x * 2.0

    @primitive
    def bar(x):
        return x

    def foo_vjp(ans, x):
        return lambda g: bar(g) * 2

    def bar_vjp(ans, x):
        return lambda g: g * 1.001

    defvjp(foo, foo_vjp)
    defvjp(bar, bar_vjp)

    expected_message = r"\(VJP\) check of vjp_foo failed"
    with raises(AssertionError, match=expected_message):
        check_grads(foo, modes=["rev"])(1.0)


================================================
FILE: tests/test_truediv.py
================================================
# This file is to check that future division works.

from test_binary_ops import arg_pairs

import autograd.numpy as np
from autograd.test_util import check_grads


def test_div():
    """Gradient-check ``x / y`` over every operand pair from ``arg_pairs()``.

    Both operands are first mapped through sqrt(v**2 + 0.5), which keeps them
    at least sqrt(0.5) away from zero.
    """

    def divide(x, y):
        return x / y

    def make_gap_from_zero(value):
        return np.sqrt(value**2 + 0.5)

    for raw_numerator, raw_denominator in arg_pairs():
        numerator = make_gap_from_zero(raw_numerator)
        denominator = make_gap_from_zero(raw_denominator)
        check_grads(divide)(numerator, denominator)


================================================
FILE: tests/test_tuple.py
================================================
import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad
from autograd import isinstance as ag_isinstance
from autograd import tuple as ag_tuple
from autograd.test_util import check_grads

npr.seed(1)


def test_getter():
    """Gradient through tuple indexing: used once, used twice, and unused."""

    def fun(input_tuple):
        total_0 = np.sum(input_tuple[0])
        total_1 = np.sum(input_tuple[1])
        # Element 1 is summed a second time and element 2 is never read.
        total_1_again = np.sum(input_tuple[1])
        return total_0 + total_1 + total_1_again

    inputs = (npr.randn(5, 6), npr.randn(4, 3), npr.randn(2, 4))
    gradients = grad(fun)(inputs)

    # Expected: ones for element 0, twos for element 1, zeros for element 2.
    expected = (np.ones((5, 6)), 2 * np.ones((4, 3)), np.zeros((2, 4)))
    for index, want in enumerate(expected):
        assert np.allclose(gradients[index], want)


def test_grads():
    """Gradient-check a function of a tuple, and a function built from its gradient."""

    def fun(input_tuple):
        sin_part = np.sum(np.sin(input_tuple[0]))
        cos_part = np.sum(np.cos(input_tuple[1]))
        return sin_part + cos_part

    def d_fun(input_tuple):
        g = grad(fun)(input_tuple)
        return np.sum(g[0]) + np.sum(np.sin(g[0])) + np.sum(np.sin(g[1]))

    input_tuple = (npr.randn(5, 6), npr.randn(4, 3), npr.randn(2, 4))

    for candidate in (fun, d_fun):
        check_grads(candidate)(input_tuple)


def test_nested_higher_order():
    """Gradient-check a function that differentiates through an ag_tuple internally, up to its second derivative."""

    def outer_fun(x):
        def inner_fun(y):
            return y[0] * y[1]

        pair = ag_tuple((x, x))
        inner_gradient = grad(inner_fun)(pair)
        return np.sum(np.sin(np.array(inner_gradient)))

    first_derivative = grad(outer_fun)
    second_derivative = grad(first_derivative)

    check_grads(outer_fun)(5.0)
    check_grads(first_derivative)(10.0)
    check_grads(second_derivative)(10.0)


def test_isinstance():
    """ag_isinstance must accept both ``tuple`` and ``ag_tuple``, with and without tracing."""

    def fun(x):
        for tuple_type in (tuple, ag_tuple):
            assert ag_isinstance(x, tuple_type)
        return x[0]

    triple = (1.0, 2.0, 3.0)
    # Plain call first, then the same call under grad.
    fun(triple)
    grad(fun)(triple)


================================================
FILE: tests/test_vspaces.py
================================================
import itertools as it
from functools import reduce

import numpy as np

from autograd.core import vspace
from autograd.test_util import check_grads, scalar_close


def check_vspace(value):
    """Assert that ``vspace(value)`` behaves like an inner-product vector space.

    The vspace built from ``value`` is exercised on randomly drawn vectors and
    scalars: the vector-space axioms, closure under ``add`` / ``scalar_mul``,
    the inner-product axioms, and consistency of ``standard_basis``, ``zeros``,
    ``ones`` and ``randn``. Finally ``add``, ``scalar_mul`` and ``inner_prod``
    are passed through ``check_grads``.

    Args:
        value: any object accepted by ``vspace``; only its vspace is used.

    Raises:
        AssertionError: if any of the checked properties does not hold.
    """
    vs = vspace(value)
    # --- required attributes ---
    size = vs.size
    add = vs.add
    scalar_mul = vs.scalar_mul
    inner_prod = vs.inner_prod
    randn = vs.randn
    zeros = vs.zeros
    ones = vs.ones
    standard_basis = vs.standard_basis

    # --- util ---
    def randns(N=2):
        return [randn() for i in range(N)]

    def rand_scalar():
        return float(np.random.randn())

    def rand_scalars(N=2):
        return [rand_scalar() for i in range(N)]

    def vector_close(x, y):
        # Compare two vectors through their inner products with one random
        # probe vector, so the check works for any vspace.
        z = randn()
        return scalar_close(inner_prod(z, x), inner_prod(z, y))

    # --- vector space axioms ---
    def associativity_of_add(x, y, z):
        return vector_close(add(x, add(y, z)), add(add(x, y), z))

    def commutativity_of_add(x, y):
        return vector_close(add(x, y), add(y, x))

    def identity_element_of_add(x):
        return vector_close(add(zeros(), x), x)

    def inverse_elements_of_add(x):
        return vector_close(zeros(), add(x, scalar_mul(x, -1.0)))

    def compatibility_of_scalar_mul_with_field_mul(x, a, b):
        return vector_close(scalar_mul(x, a * b), scalar_mul(scalar_mul(x, a), b))

    def identity_element_of_scalar_mul(x):
        return vector_close(scalar_mul(x, 1.0), x)

    def distributivity_of_scalar_mul_wrt_vector_add(x, y, a):
        return vector_close(scalar_mul(add(x, y), a), add(scalar_mul(x, a), scalar_mul(y, a)))

    def distributivity_of_scalar_mul_wrt_scalar_add(x, a, b):
        return vector_close(scalar_mul(x, a + b), add(scalar_mul(x, a), scalar_mul(x, b)))

    # --- closure ---
    def add_preserves_vspace(x, y):
        return vs == vspace(add(x, y))

    def scalar_mul_preserves_vspace(x, a):
        return vs == vspace(scalar_mul(x, a))

    # --- inner product axioms ---
    def symmetry(x, y):
        return scalar_close(inner_prod(x, y), inner_prod(y, x))

    def linearity(x, y, a):
        return scalar_close(inner_prod(scalar_mul(x, a), y), a * inner_prod(x, y))

    def positive_definitive(x):
        return 0 < inner_prod(x, x)

    def inner_zeros():
        return scalar_close(0, inner_prod(zeros(), zeros()))

    # --- basis vectors and special vectors---
    def basis_orthonormality():
        return all(
            [
                scalar_close(inner_prod(x, y), 1.0 * (ix == iy))
                for (ix, x), (iy, y) in it.product(enumerate(standard_basis()), enumerate(standard_basis()))
            ]
        )

    def ones_sum_of_basis_vects():
        return vector_close(reduce(add, standard_basis()), ones())

    def basis_correct_size():
        return len(list(standard_basis())) == size

    def basis_correct_vspace():
        # all() is essential here: a bare generator expression is always
        # truthy, which would make the corresponding assert below vacuous.
        return all(vs == vspace(x) for x in standard_basis())

    def zeros_correct_vspace():
        return vs == vspace(zeros())

    def ones_correct_vspace():
        return vs == vspace(ones())

    def randn_correct_vspace():
        return vs == vspace(randn())

    assert associativity_of_add(*randns(3))
    assert commutativity_of_add(*randns())
    assert identity_element_of_add(randn())
    assert inverse_elements_of_add(randn())
    assert compatibility_of_scalar_mul_with_field_mul(randn(), *rand_scalars())
    assert identity_element_of_scalar_mul(randn())
    assert distributivity_of_scalar_mul_wrt_vector_add(randn(), randn(), rand_scalar())
    assert distributivity_of_scalar_mul_wrt_scalar_add(randn(), *rand_scalars())
    assert add_preserves_vspace(*randns())
    assert scalar_mul_preserves_vspace(randn(), rand_scalar())
    assert symmetry(*randns())
    assert linearity(randn(), randn(), rand_scalar())
    assert positive_definitive(randn())
    assert inner_zeros()
    assert basis_orthonormality()
    assert ones_sum_of_basis_vects()
    assert basis_correct_size()
    assert basis_correct_vspace()
    assert zeros_correct_vspace()
    assert ones_correct_vspace()
    assert randn_correct_vspace()

    # --- grads of basic operations ---
    check_grads(add)(*randns())
    check_grads(scalar_mul)(randn(), rand_scalar())
    check_grads(inner_prod)(*randns())


def test_array_vspace():
    """Run the vspace checks on a real 3x2 array."""
    sample = np.zeros((3, 2))
    check_vspace(sample)


def test_array_vspace_0_dim():
    """Run the vspace checks on a plain Python float."""
    sample = 0.0
    check_vspace(sample)


def test_array_vspace_complex():
    """Run the vspace checks on a complex 2x1 array."""
    sample = 1.0j * np.zeros((2, 1))
    check_vspace(sample)


def test_list_vspace():
    """Run the vspace checks on a list holding a float and an array."""
    sample = [1.0, np.zeros((2, 1))]
    check_vspace(sample)


def test_tuple_vspace():
    """Run the vspace checks on a tuple holding a float and an array."""
    sample = (1.0, np.zeros((2, 1)))
    check_vspace(sample)


def test_dict_vspace():
    """Run the vspace checks on a dict holding a float and an array."""
    sample = {"a": 1.0, "b": np.zeros((2, 1))}
    check_vspace(sample)


def test_mixed_vspace():
    """Run the vspace checks on nested dict / list / tuple containers."""
    sample = {
        "x": [0.0, np.zeros((3, 1))],
        "y": ({"a": 0.0}, [0.0]),
    }
    check_vspace(sample)


================================================
FILE: tests/test_wrappers.py
================================================
import warnings
from functools import partial

import pytest

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import (
    checkpoint,
    elementwise_grad,
    grad,
    grad_and_aux,
    hessian,
    hessian_tensor_product,
    jacobian,
    make_ggnvp,
    make_hvp,
    make_jvp,
    tensor_jacobian_product,
    value_and_grad,
)
from autograd.test_util import check_equivalent, check_grads  # , nd
from autograd.tracer import isbox

npr.seed(1)


def test_return_both():
    """value_and_grad must return exactly the function value and the grad() result."""

    def fun(x):
        return 3.0 * x**3.2

    d_fun = grad(fun)
    f_and_d_fun = value_and_grad(fun)

    test_x = 1.7
    both = f_and_d_fun(test_x)
    value, derivative = both
    assert value == fun(test_x)
    assert derivative == d_fun(test_x)


def test_value_and_grad():
    """value_and_grad returns an unboxed value, agrees with fun / grad(fun), and is itself differentiable."""

    def fun(x):
        return np.sum(np.sin(x) ** 2)

    dfun = grad(fun)
    dfun_both = value_and_grad(fun)
    x = npr.randn(5)

    # The returned value must not leak a tracer box.
    assert not isbox(dfun_both(x)[0])
    check_equivalent(fun(x), dfun_both(x)[0])
    check_equivalent(dfun(x), dfun_both(x)[1])

    def value_only(point):
        return dfun_both(point)[0]

    check_grads(value_only)(x)


def test_hessian():
    """The Hessian of the quadratic form x . H . x must equal H + H.T."""
    dim = 5
    H = npr.randn(dim, dim)

    def quadratic(x):
        return np.dot(np.dot(x, H), x)

    hess = hessian(quadratic)
    point = npr.randn(dim)
    expected = H + H.T
    check_equivalent(hess(point), expected)


def test_multigrad():
    """grad with argnum=[3, 1] must match the gradient of an explicitly packed (d, b) version."""
    A, B, C, D, E = 0.5, -0.3, 0.2, -1.1, 0.7
    F, G = 0.6, -0.1

    def complicated_fun(a, b, c, d, e, f=1.1, g=9.0):
        return a + np.sin(b) + np.cosh(c) + np.cos(d) + np.tan(e) + f + g

    def complicated_fun_3_1(d_b):
        # Same function, with arguments 3 and 1 supplied as one tuple.
        d_value, b_value = d_b
        return complicated_fun(A, b_value, C, d_value, E, f=F, g=G)

    multi_grad = grad(complicated_fun, argnum=[3, 1])
    wrapped = multi_grad(A, B, C, D, E, f=F, g=G)
    explicit = grad(complicated_fun_3_1)((D, B))
    check_equivalent(wrapped, explicit)


def test_value_and_multigrad():
    """value_and_grad with argnum=[3, 1] must agree with the plain call and with grad."""

    def complicated_fun(a, b, c, d, e, f=1.1, g=9.0):
        return a + np.sin(b) + np.cosh(c) + np.cos(d) + np.tan(e) + f + g

    A, B, C, D, E = 0.5, -0.3, 0.2, -1.1, 0.7
    F, G = 0.6, -0.1
    argnums = [3, 1]

    dfun = grad(complicated_fun, argnum=argnums)
    dfun_both = value_and_grad(complicated_fun, argnum=argnums)

    # Element 0 of the pair is the function value ...
    check_equivalent(
        complicated_fun(A, B, C, D, E, f=F, g=G),
        dfun_both(A, B, C, D, E, f=F, g=G)[0],
    )
    # ... and element 1 is the gradient.
    check_equivalent(
        dfun(A, B, C, D, E, f=F, g=G),
        dfun_both(A, B, C, D, E, f=F, g=G)[1],
    )


def test_multigrad_onearg():
    """grad with a one-element argnum list must return a one-element tuple."""

    def fun(x, y):
        return np.sum(x + np.sin(y))

    def packed_fun(xy):
        return np.sum(xy[0] + np.sin(xy[1]))

    A = npr.randn(3)
    B = npr.randn(3)
    wrapped = grad(fun, argnum=[0])(A, B)
    explicit = (grad(packed_fun)((A, B))[0],)
    check_equivalent(wrapped, explicit)


def test_elementwise_grad():
    """elementwise_grad must match grad applied to each element separately."""

    def simple_fun(a):
        return a + np.sin(a) + np.cosh(a)

    A = npr.randn(10)

    wrapped = elementwise_grad(simple_fun)(A)
    per_element = []
    for index in range(len(A)):
        per_element.append(grad(simple_fun)(A[index]))
    explicit = np.array(per_element)
    check_equivalent(wrapped, explicit)


def test_elementwise_grad_multiple_args():
    """elementwise_grad with respect to argument 1 must match per-element grad calls."""

    def simple_fun(a, b):
        return a + np.sin(a) + np.cosh(b)

    A = 0.9
    B = npr.randn(10)
    argnum = 1

    wrapped = elementwise_grad(simple_fun, argnum)(A, B)
    per_element = []
    for index in range(len(B)):
        per_element.append(grad(simple_fun, argnum)(A, B[index]))
    explicit = np.array(per_element)
    check_equivalent(wrapped, explicit)


def test_hessian_tensor_product():
    """hessian_tensor_product on vectors must equal the explicit Hessian dotted with v."""

    def fun(a):
        return np.sum(np.sin(a))

    a = npr.randn(5)
    v = npr.randn(5)
    H = hessian(fun)(a)
    expected = np.dot(H, v)
    check_equivalent(expected, hessian_tensor_product(fun)(a, v))


def test_hvp():
    """The first item returned by make_hvp(fun)(a) must compute the explicit Hessian-vector product."""

    def fun(a):
        return np.sum(np.sin(a))

    a = npr.randn(5)
    v = npr.randn(5)
    H = hessian(fun)(a)
    hvp_outputs = make_hvp(fun)(a)
    hvp = hvp_outputs[0]
    expected = np.dot(H, v)
    check_equivalent(expected, hvp(v))


def test_hessian_matrix_product():
    """hessian_tensor_product on 5x4 inputs must equal tensordot of the Hessian with V."""

    def fun(a):
        return np.sum(np.sin(a))

    a = npr.randn(5, 4)
    V = npr.randn(5, 4)
    H = hessian(fun)(a)
    expected = np.tensordot(H, V)
    check_equivalent(expected, hessian_tensor_product(fun)(a, V))


def test_hessian_tensor_product_3d():
    """hessian_tensor_product on 5x4x3 inputs must equal tensordot over all axes of V."""

    def fun(a):
        return np.sum(np.sin(a))

    shape = (5, 4, 3)
    a = npr.randn(*shape)
    V = npr.randn(*shape)
    H = hessian(fun)(a)
    expected = np.tensordot(H, V, axes=np.ndim(V))
    check_equivalent(expected, hessian_tensor_product(fun)(a, V))


def test_tensor_jacobian_product():
    """tensor_jacobian_product on vectors must equal V.T dotted with the explicit Jacobian."""

    def fun(a):
        # This function will have an asymmetric jacobian matrix.
        return np.roll(np.sin(a), 1)

    a = npr.randn(5)
    V = npr.randn(5)
    J = jacobian(fun)(a)
    expected = np.dot(V.T, J)
    check_equivalent(expected, tensor_jacobian_product(fun)(a, V))


def test_matrix_jacobian_product():
    """tensor_jacobian_product on 5x4 inputs must equal tensordot of V with the Jacobian."""

    def fun(a):
        return np.roll(np.sin(a), 1)

    a = npr.randn(5, 4)
    V = npr.randn(5, 4)
    J = jacobian(fun)(a)
    expected = np.tensordot(V, J)
    check_equivalent(expected, tensor_jacobian_product(fun)(a, V))


def test_tensor_jacobian_product_3d():
    """tensor_jacobian_product with a 5x4x3 input and a 5x4 tensor V.

    The expected value contracts every axis of V against the leading axes of
    the explicit Jacobian via ``np.tensordot``.

    The ``_3d`` suffix gives this test a unique name: it previously reused the
    name of the 1-D test defined earlier in this module, so the later
    definition replaced the earlier one and the 1-D case was never collected.
    """
    fun = lambda a: np.roll(np.sin(a), 1)
    a = npr.randn(5, 4, 3)
    V = npr.randn(5, 4)
    J = jacobian(fun)(a)
    check_equivalent(np.tensordot(V, J, axes=np.ndim(V)), tensor_jacobian_product(fun)(a, V))


def test_deprecated_defgrad_wrapper():
    """Gradients registered through the deprecated ``defgrad`` method must still pass check_grads."""
    from autograd.core import primitive

    @primitive
    def new_mul(x, y):
        return x * y

    def grad_wrt_x(ans, x, y):
        return lambda g: y * g

    def grad_wrt_y(ans, x, y):
        return lambda g: x * g

    # Record warnings raised during registration instead of letting them escape.
    with warnings.catch_warnings(record=True) as recorded:
        new_mul.defgrad(grad_wrt_x)
        new_mul.defgrad(grad_wrt_y, argnum=1)

    def fun(x, y):
        return new_mul(x, y)

    mat1 = npr.randn(2, 2)
    mat2 = npr.randn(2, 2)
    checker = check_grads(fun, modes=["rev"])
    checker(mat1, mat2)


def test_deprecated_defvjp_wrapper():
    """VJPs registered through the deprecated ``defvjp`` method must still pass check_grads."""
    from autograd.core import primitive

    @primitive
    def new_mul(x, y):
        return x * y

    def vjp_wrt_x(g, ans, vs, gvs, x, y):
        return y * g

    def vjp_wrt_y(g, ans, vs, gvs, x, y):
        return x * g

    # Record warnings raised during registration instead of letting them escape.
    with warnings.catch_warnings(record=True) as recorded:
        new_mul.defvjp(vjp_wrt_x)
        new_mul.defvjp(vjp_wrt_y, argnum=1)

    def fun(x, y):
        return new_mul(x, y)

    mat1 = npr.randn(2, 2)
    mat2 = npr.randn(2, 2)
    checker = check_grads(fun, modes=["rev"])
    checker(mat1, mat2)


def test_deprecated_defvjp_is_zero_wrapper():
    """Zero VJPs declared through the deprecated ``defvjp_is_zero`` method must still pass check_grads."""
    from autograd.core import primitive

    @primitive
    def new_mul(x, y):
        return 0 * x * y

    zero_argnums = [0, 1]
    with warnings.catch_warnings(record=True) as recorded:
        new_mul.defvjp_is_zero(zero_argnums)

    def fun(x, y):
        return new_mul(x, y)

    mat1 = npr.randn(2, 2)
    mat2 = npr.randn(2, 2)
    # The gradient check itself also runs with warnings recorded.
    with warnings.catch_warnings(record=True) as recorded:
        checker = check_grads(fun, modes=["rev"])
        checker(mat1, mat2)


def test_deprecated_quick_grad_check_wrapper():
    """The deprecated ``quick_grad_check`` helper must still be callable."""
    from autograd.util import quick_grad_check

    def fun(x, y):
        return x**2 + y

    with warnings.catch_warnings(record=True) as recorded:
        quick_grad_check(fun, 1.0, (2.0,))


def test_partial():
    """grad must accept a functools.partial object without raising."""

    def f(x, y):
        return x

    bound = partial(f, y=1)
    grad(bound)


@pytest.mark.skip(reason="fails with NumPy nightlies")
def test_dtypes():
    """Gradients keep float32 / float16 dtypes and can be computed for the long-double dtypes."""

    def f(x):
        return np.real(np.sum(x**2))

    # For these dtypes the gradient must come back in the same dtype.
    for preserved in (np.float32, np.float16):
        y = np.random.randn(10, 10).astype(preserved)
        assert grad(f)(y).dtype.type is preserved

    # For these dtypes the test only requires that the gradient can be computed.
    for extended in (np.longdouble, np.clongdouble):
        y = np.random.randn(10, 10).astype(extended)
        grad(f)(y)


def test_checkpoint_correctness():
    """Checkpointed functions must give the same values and gradients as the originals."""

    def bar(x, y):
        return 2 * x + y + 5

    checkpointed_bar = checkpoint(bar)

    def foo(x):
        return bar(x, x / 3.0) + bar(x, x**2)

    def foo2(x):
        return checkpointed_bar(x, x / 3.0) + checkpointed_bar(x, x**2)

    assert np.allclose(foo(3.0), foo2(3.0))
    assert np.allclose(grad(foo)(3.0), grad(foo2)(3.0))

    # Same comparison for a function taking *args.
    def baz(*args):
        return sum(args)

    checkpointed_baz = checkpoint(baz)

    def foobaz(x):
        return baz(x, x / 3.0)

    def foobaz2(x):
        return checkpointed_baz(x, x / 3.0)

    assert np.allclose(foobaz(3.0), foobaz2(3.0))
    assert np.allclose(grad(foobaz)(3.0), grad(foobaz2)(3.0))


def checkpoint_memory():
    """Manual check that checkpointing more than halves peak memory use.

    Not collected automatically: it relies on memory_profiler, whose behavior
    may vary, and returns silently when that package is not installed.
    """
    try:
        from memory_profiler import memory_usage
    except ImportError:
        return

    def f(a):
        for _ in range(10):
            a = np.sin(a**2 + 1)
        return a

    checkpointed_f = checkpoint(f)

    def testfun(f, x):
        for _ in range(5):
            x = f(x)
        return np.sum(x)

    # Differentiate with respect to x (argument 1), not the function argument.
    gradfun = grad(testfun, 1)

    A = npr.RandomState(0).randn(100000)
    plain_samples = memory_usage((gradfun, (f, A)))
    max_usage = max(plain_samples)
    checkpointed_samples = memory_usage((gradfun, (checkpointed_f, A)))
    max_checkpointed_usage = max(checkpointed_samples)

    assert max_checkpointed_usage < max_usage / 2.0


def test_make_jvp():
    """Element 1 of make_jvp(fun)(x)(v) must equal the explicit Jacobian dotted with v."""
    A = npr.randn(3, 5)
    x = npr.randn(5)
    v = npr.randn(5)

    def fun(x):
        return np.tanh(np.dot(A, x))

    def jvp_explicit(point):
        def apply(tangent):
            return np.dot(jacobian(fun)(point), tangent)

        return apply

    jvp = make_jvp(fun)

    expected = jvp_explicit(x)(v)
    check_equivalent(expected, jvp(x)(v)[1])


def _make_explicit_ggnvp(f, g=lambda x: 1.0 / 2 * np.dot(x, x)):
    """Reference generalized Gauss-Newton vector product, built from explicit matrices.

    Returns a maker: called with ``x`` it gives a function ``v -> J.T @ H @ J @ v``,
    where ``J`` is the Jacobian of ``f`` at ``x`` and ``H`` is the Hessian of
    ``g`` at ``f(x)``.
    """

    def ggnvp_maker(x):
        jac = jacobian(f)(x)
        hess = hessian(g)(f(x))
        return lambda v: np.dot(jac.T, np.dot(hess, np.dot(jac, v)))

    return ggnvp_maker


def test_make_ggnvp():
    """make_ggnvp with the default g must match the explicit construction, for a linear and a tanh map."""
    A = npr.randn(5, 4)
    x = npr.randn(4)
    v = npr.randn(4)

    def linear(x):
        return np.dot(A, x)

    def squashed(x):
        return np.tanh(np.dot(A, x))

    for candidate in (linear, squashed):
        check_equivalent(make_ggnvp(candidate)(x)(v), _make_explicit_ggnvp(candidate)(x)(v))


def test_make_ggnvp_nondefault_g():
    """make_ggnvp with a user-supplied g must match the explicit construction."""
    A = npr.randn(5, 4)
    x = npr.randn(4)
    v = npr.randn(4)

    def g(y):
        return np.sum(2.0 * y**2 + y**4)

    def linear(x):
        return np.dot(A, x)

    def squashed(x):
        return np.tanh(np.dot(A, x))

    for candidate in (linear, squashed):
        check_equivalent(make_ggnvp(candidate, g)(x)(v), _make_explicit_ggnvp(candidate, g)(x)(v))


def test_grad_and_aux():
    """grad_and_aux must return a pair: the gradient of the first output and the untouched second output."""
    A = npr.randn(5, 4)
    x = npr.randn(4)

    def f(x):
        # (value to differentiate, auxiliary output)
        return np.sum(np.dot(A, x)), x**2

    def g(x):
        return np.sum(np.dot(A, x))

    assert len(grad_and_aux(f)(x)) == 2

    gradient_part = grad_and_aux(f)(x)[0]
    check_equivalent(gradient_part, grad(g)(x))
    aux_part = grad_and_aux(f)(x)[1]
    check_equivalent(aux_part, x**2)


## This behavior is no longer supported
# def test_make_ggnvp_broadcasting():
#   A = npr.randn(4, 5)
#   x = npr.randn(10, 4)
#   v = npr.randn(10, 4)

#   fun = lambda x: np.tanh(np.dot(x, A))
#   res1 = np.stack([_make_explicit_ggnvp(fun)(xi)(vi) for xi, vi in zip(x, v)])
#   res2 = make_ggnvp(fun)(x)(v)
#   check_equivalent(res1, res2)


def test_wrapped_name_and_docs():
    """Check ``__name__`` and ``__doc__`` of ``grad`` and of the function it returns."""

    def foo(x):
        pass

    # grad itself must keep its own name.
    assert grad.__name__ == "grad"
    # Python 3.13: the compiler now strips indents from docstrings, so both
    # the indented and the unindented form of the first line are accepted.
    # https://docs.python.org/3.13/whatsnew/3.13.html#other-language-changes
    assert grad.__doc__.startswith(tuple(f"\n{indent}Returns a function which" for indent in ("    ", "")))
    # The returned function's name and docstring mention the wrapped function
    # (foo) and, for the name, the argnum (1).
    assert grad(foo, 1).__name__ == "grad_of_foo_wrt_argnum_1"
    assert grad(foo, 1).__doc__.startswith("    grad of function foo with")