Repository: HIPS/autograd Branch: master Commit: 994362fdbcc8 Files: 120 Total size: 426.4 KB Directory structure: gitextract_4gygwh8h/ ├── .github/ │ └── workflows/ │ ├── check.yml │ ├── publish.yml │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── README.md ├── autograd/ │ ├── __init__.py │ ├── builtins.py │ ├── core.py │ ├── differential_operators.py │ ├── extend.py │ ├── misc/ │ │ ├── __init__.py │ │ ├── fixed_points.py │ │ ├── flatten.py │ │ ├── optimizers.py │ │ └── tracers.py │ ├── numpy/ │ │ ├── __init__.py │ │ ├── fft.py │ │ ├── linalg.py │ │ ├── numpy_boxes.py │ │ ├── numpy_jvps.py │ │ ├── numpy_vjps.py │ │ ├── numpy_vspaces.py │ │ ├── numpy_wrapper.py │ │ └── random.py │ ├── scipy/ │ │ ├── __init__.py │ │ ├── integrate.py │ │ ├── linalg.py │ │ ├── signal.py │ │ ├── special.py │ │ └── stats/ │ │ ├── __init__.py │ │ ├── beta.py │ │ ├── chi2.py │ │ ├── dirichlet.py │ │ ├── gamma.py │ │ ├── multivariate_normal.py │ │ ├── norm.py │ │ ├── poisson.py │ │ └── t.py │ ├── test_util.py │ ├── tracer.py │ ├── util.py │ └── wrap_util.py ├── benchmarks/ │ ├── __init__.py │ ├── asv.conf.json.sample │ ├── bench_core.py │ ├── bench_mem.py │ ├── bench_numpy_vjps.py │ ├── bench_rnn.py │ └── bench_util.py ├── conda_recipe/ │ └── conda.yaml ├── docs/ │ ├── tutorial.md │ └── updateguide.md ├── examples/ │ ├── README.md │ ├── __init__.py │ ├── bayesian_neural_net.py │ ├── bayesian_optimization.py │ ├── black_box_svi.py │ ├── convnet.py │ ├── data.py │ ├── data_mnist.py │ ├── deep_gaussian_process.py │ ├── define_gradient.py │ ├── dot_graph.py │ ├── fixed_points.py │ ├── fluidsim/ │ │ ├── fluidsim.py │ │ └── wing.py │ ├── gaussian_process.py │ ├── generative_adversarial_net.py │ ├── gmm.py │ ├── gplvm.py │ ├── hmm_em.py │ ├── ica.py │ ├── logistic_regression.py │ ├── lstm.py │ ├── mixture_variational_inference.py │ ├── natural_gradient_black_box_svi.py │ ├── negative_binomial_maxlike.py │ ├── neural_net.py │ ├── neural_net_regression.py │ ├── 
ode_net.py │ ├── print_trace.py │ ├── rkhs.py │ ├── rnn.py │ ├── rosenbrock.py │ ├── sinusoid.py │ ├── tanh.py │ └── variational_autoencoder.py ├── license.txt ├── noxfile.py ├── pyproject.toml └── tests/ ├── _test_complexity.py ├── check_examples_run.sh ├── conftest.py ├── numpy_utils.py ├── profiling.py ├── test_binary_ops.py ├── test_builtins.py ├── test_complex.py ├── test_core.py ├── test_dict.py ├── test_direct.py ├── test_fft.py ├── test_graphs.py ├── test_jacobian.py ├── test_linalg.py ├── test_list.py ├── test_logic.py ├── test_misc.py ├── test_numpy.py ├── test_performance.py ├── test_scalar_ops.py ├── test_scipy.py ├── test_systematic.py ├── test_tests.py ├── test_truediv.py ├── test_tuple.py ├── test_vspaces.py └── test_wrappers.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/check.yml ================================================ name: Style and package checks on: pull_request: branches: - master push: branches: - master workflow_dispatch: env: PIP_DISABLE_PIP_VERSION_CHECK: "1" FORCE_COLOR: "3" concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true jobs: check: name: ${{ matrix.session }} runs-on: ubuntu-latest strategy: fail-fast: false matrix: session: # - lint - validate-package steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 - uses: yezz123/setup-uv@ab6be5a42627f19dc36e57b548592a5e52cece4a # v4.1 - name: Run ${{ matrix.session }} run: uvx nox -s ${{ matrix.session }} ================================================ FILE: .github/workflows/publish.yml ================================================ name: Publish on: workflow_dispatch: release: types: [published] env: PIP_DISABLE_PIP_VERSION_CHECK: '1' FORCE_COLOR: '3' jobs: build: name: Build sdist and 
wheel runs-on: ubuntu-latest steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 name: Checkout repository - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: python-version: "3.12" - name: Install build tools run: | pipx run build --outdir dist - name: Upload wheel and sdist artifacts uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: artifacts path: ./dist/* if-no-files-found: error publish: needs: [build] name: Upload to PyPI runs-on: ubuntu-latest environment: name: release url: https://pypi.org/p/autograd permissions: id-token: write # mandatory for trusted publishing steps: - name: Download artifacts uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: path: dist merge-multiple: true - name: Sanity check artifacts run: ls -la dist/ - name: Publish sdist and wheel to PyPI uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4 with: packages-dir: dist/ ================================================ FILE: .github/workflows/test.yml ================================================ name: CI on: pull_request: branches: - master push: branches: - master workflow_dispatch: schedule: - cron: "0 4 * * *" env: PIP_DISABLE_PIP_VERSION_CHECK: "1" FORCE_COLOR: "3" concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true jobs: test: name: Regular tests / ${{ matrix.platform }} / Python ${{ matrix.python-version }} runs-on: ${{ matrix.platform }} strategy: fail-fast: false matrix: platform: [ubuntu-latest, ubuntu-22.04-arm, macos-15-intel, macos-latest, windows-latest] python-version: ["3.10", "3.11", "3.12", "3.13", "3.14", "pypy-3.10"] steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: python-version: ${{ matrix.python-version }} 
allow-prereleases: true - uses: yezz123/setup-uv@ab6be5a42627f19dc36e57b548592a5e52cece4a # v4.1 # On PyPy, we skip SciPy because we don't have wheels # available, see noxfile.py for more details. - name: Run tests run: uvx nox -s tests # In this job, we test against the NumPy nightly wheels hosted on # https://anaconda.org/scientific-python-nightly-wheels/numpy # on the latest Python version available across platforms, instead of # testing all Python versions and implementations on all platforms. # We do not test on PyPy. # # However, "nox -s nightly-tests" can be used locally anywhere, on # any Python version and implementation on any platform and we leave # it to the user to decide what Python version to test against, which # might or might not have a corresponding NumPy nightly wheel present. nightlies: name: Nightly tests / ${{ matrix.platform }} / Python ${{ matrix.python-version }} runs-on: ${{ matrix.platform }} strategy: fail-fast: false matrix: platform: [ubuntu-latest, ubuntu-22.04-arm, macos-15-intel, macos-latest, windows-latest] python-version: ["3.x"] steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: python-version: ${{ matrix.python-version }} allow-prereleases: true - uses: yezz123/setup-uv@ab6be5a42627f19dc36e57b548592a5e52cece4a # v4.1 - name: Run tests against nightly wheels for NumPy and SciPy run: uvx nox -s nightly-tests ================================================ FILE: .gitignore ================================================ __pycache__/ *.py[cod] *$py.class # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache coverage.* *.cover .hypothesis/ nosetests.xml .pytest_cache/ 
junit-report.xml # pyenv .python-version # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # mypy .mypy_cache/ # OS and IDE config files .DS_Store .idea/ # project-specific data/ *.so *.c scratch/ examples/data .asv/ asv.conf.json benchmarks/asv.conf.js ================================================ FILE: .pre-commit-config.yaml ================================================ ci: autoupdate_commit_msg: "chore: update pre-commit hooks" autofix_commit_msg: "style: pre-commit fixes" repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v6.0.0 hooks: - id: check-added-large-files - id: check-case-conflict - id: check-merge-conflict - id: check-yaml exclude: conda_recipe/conda.yaml - id: debug-statements - id: end-of-file-fixer - id: mixed-line-ending - id: trailing-whitespace - repo: https://github.com/asottile/pyupgrade rev: v3.21.2 hooks: - id: pyupgrade args: [--py310-plus] - repo: https://github.com/astral-sh/ruff-pre-commit rev: "v0.15.6" hooks: - id: ruff args: ["--fix", "--show-fixes"] - id: ruff-format - repo: https://github.com/pre-commit/pygrep-hooks rev: v1.10.0 hooks: - id: python-check-blanket-type-ignore exclude: ^src/vector/backends/_numba_object.py$ - id: rst-backticks - id: rst-directive-colons - id: rst-inline-touching-normal ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing Use [Nox](https://nox.thea.codes/en/stable/) to run tests and linting, e.g., ```shell pip install nox ``` `nox` will run all checks in an isolated virtual environment with Autograd and its dependencies, including its optional dependencies, installed. 
## Run tests, linting, packaging checks | Command | Description | | ------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `nox --list` | Lists all available Nox sessions, including selected ones | | `nox -s lint` | Runs code style checks with pre-commit and pre-commit hooks as listed in `.pre-commit-config.yaml`. Accepts posargs to pass additional arguments to the linter. | | `nox -s tests` | Runs tests with your default Python interpreter. Accepts posargs to pass additional arguments and configuration to `pytest`. | | `nox -s nightly-tests` | Similar to `nox -s tests`, except that it runs tests with nightly versions of dependencies (NumPy, SciPy, etc.). | | `nox -s validate-package` | Builds a source distribution and a wheel using `pypa/build` and checks the package with `twine` in strict mode. | | `nox` | Runs all selected sessions, as listed in `nox.options.sessions` in `noxfile.py`. | Additionally, `nox` supports tags to run specific sessions, e.g., `nox --tags tests` runs all sessions tagged with `tests`. Make sure all tests pass before you push your changes to GitHub. GitHub Actions will run the tests across all supported Python versions. ## Using positional arguments (reformat, upload package, help) You can use additional arguments for the tools (`pytest`, `pre-commit`, etc.) called by Nox by separating them from the Nox arguments by a double-hyphen `--`, e.g., - `nox -s tests -- tests/test_tuple.py` runs just the tests in `tests/test_tuple.py`. - `nox -s lint -- --fix` runs the linter with the `--fix` flag. - and so on. 
================================================ FILE: README.md ================================================ # Autograd [![Checks status][checks-badge]][checks-url] [![Tests status][tests-badge]][tests-url] [![Publish status][publish-badge]][publish-url] [![asv][asv-badge]](#) [publish-badge]: https://github.com/HIPS/autograd/actions/workflows/publish.yml/badge.svg [checks-badge]: https://github.com/HIPS/autograd/actions/workflows/check.yml/badge.svg [tests-badge]: https://github.com/HIPS/autograd/actions/workflows/test.yml/badge.svg [asv-badge]: http://img.shields.io/badge/benchmarked%20by-asv-green.svg?style=flat [publish-url]: https://github.com/HIPS/autograd/actions/workflows/publish.yml [checks-url]: https://github.com/HIPS/autograd/actions/workflows/check.yml [tests-url]: https://github.com/HIPS/autograd/actions/workflows/test.yml Autograd can automatically differentiate native Python and Numpy code. It can handle a large subset of Python's features, including loops, ifs, recursion and closures, and it can even take derivatives of derivatives of derivatives. It supports reverse-mode differentiation (a.k.a. backpropagation), which means it can efficiently take gradients of scalar-valued functions with respect to array-valued arguments, as well as forward-mode differentiation, and the two can be composed arbitrarily. The main intended application of Autograd is gradient-based optimization. For more information, check out the [tutorial](docs/tutorial.md) and the [examples directory](examples/). Example use: ```python >>> import autograd.numpy as np # Thinly-wrapped numpy >>> from autograd import grad # The only autograd function you may ever need >>> >>> def tanh(x): # Define a function ... return (1.0 - np.exp((-2 * x))) / (1.0 + np.exp(-(2 * x))) ... 
>>> grad_tanh = grad(tanh) # Obtain its gradient function >>> grad_tanh(1.0) # Evaluate the gradient at x = 1.0 np.float64(0.419974341614026) >>> (tanh(1.0001) - tanh(0.9999)) / 0.0002 # Compare to finite differences np.float64(0.41997434264973155) ``` We can continue to differentiate as many times as we like, and use numpy's vectorization of scalar-valued functions across many different input values: ```python >>> from autograd import elementwise_grad as egrad # for functions that vectorize over inputs >>> import matplotlib.pyplot as plt >>> x = np.linspace(-7, 7, 700) >>> plt.plot(x, tanh(x), ... x, egrad(tanh)(x), # first derivative ... x, egrad(egrad(tanh))(x), # second derivative ... x, egrad(egrad(egrad(tanh)))(x), # third derivative ... x, egrad(egrad(egrad(egrad(tanh))))(x),) # fourth derivative >>> plt.show() ``` See the [tanh example file](examples/tanh.py) for the code. ## Documentation You can find a tutorial [here.](docs/tutorial.md) ## End-to-end examples * [Simple neural net](examples/neural_net.py) * [Convolutional neural net](examples/convnet.py) * [Recurrent neural net](examples/rnn.py) * [LSTM](examples/lstm.py) * [Neural Turing Machine](https://github.com/DoctorTeeth/diffmem/blob/512aadeefd6dbafc1bdd253a64b6be192a435dc3/ntm/ntm.py) * [Backpropagating through a fluid simulation](examples/fluidsim/fluidsim.py) * [Variational inference in Bayesian neural network](examples/bayesian_neural_net.py) * [Gaussian process regression](examples/gaussian_process.py) * [Sampyl, a pure Python MCMC package with HMC and NUTS](https://github.com/mcleonard/sampyl) ## How to install Install Autograd using Pip: ```shell pip install autograd ``` Some features require SciPy, which you can install separately or as an optional dependency along with Autograd: ```shell pip install "autograd[scipy]" ``` ## Authors and maintainers Autograd was written by [Dougal Maclaurin](https://dougalmaclaurin.com), [David Duvenaud](https://www.cs.toronto.edu/~duvenaud/), [Matt 
Johnson](http://people.csail.mit.edu/mattjj/), [Jamie Townsend](https://github.com/j-towns) and many other contributors. The package is currently being maintained by [Agriya Khetarpal](https://github.com/agriyakhetarpal), [Fabian Joswig](https://github.com/fjosw) and [Jamie Townsend](https://github.com/j-towns). Please feel free to submit any bugs or feature requests. We'd also love to hear about your experiences with Autograd in general. Drop us an email! We want to thank Jasper Snoek and the rest of the HIPS group (led by Prof. Ryan P. Adams) for helpful contributions and advice; Barak Pearlmutter for foundational work on automatic differentiation and for guidance on our implementation; and Analog Devices Inc. (Lyric Labs) and Samsung Advanced Institute of Technology for their generous support. ================================================ FILE: autograd/__init__.py ================================================ from autograd.core import primitive_with_deprecation_warnings as primitive from .builtins import dict, isinstance, list, tuple, type from .differential_operators import ( checkpoint, deriv, elementwise_grad, grad, grad_and_aux, grad_named, hessian, hessian_tensor_product, hessian_vector_product, holomorphic_grad, jacobian, make_ggnvp, make_hvp, make_jvp, make_vjp, multigrad_dict, tensor_jacobian_product, value_and_grad, vector_jacobian_product, ) ================================================ FILE: autograd/builtins.py ================================================ from .extend import ( Box, SparseObject, VSpace, defjvp, defjvp_argnum, defvjp, defvjp_argnum, notrace_primitive, primitive, vspace, ) from .util import subvals isinstance_ = isinstance isinstance = notrace_primitive(isinstance) type_ = type type = notrace_primitive(type) tuple_, list_, dict_ = tuple, list, dict @primitive def container_take(A, idx): return A[idx] def grad_container_take(ans, A, idx): return lambda g: container_untake(g, idx, vspace(A)) defvjp(container_take, 
grad_container_take) defjvp(container_take, "same") class SequenceBox(Box): __slots__ = [] __getitem__ = container_take def __len__(self): return len(self._value) def __add__(self, other): return sequence_extend_right(self, *other) def __radd__(self, other): return sequence_extend_left(self, *other) def __contains__(self, elt): return elt in self._value def index(self, elt): return self._value.index(elt) SequenceBox.register(tuple_) SequenceBox.register(list_) class DictBox(Box): __slots__ = [] __getitem__ = container_take def __len__(self): return len(self._value) def __iter__(self): return self._value.__iter__() def __contains__(self, elt): return elt in self._value def items(self): return list(self.iteritems()) def keys(self): return list(self.iterkeys()) def values(self): return list(self.itervalues()) def iteritems(self): return ((k, self[k]) for k in self) def iterkeys(self): return iter(self) def itervalues(self): return (self[k] for k in self) def get(self, k, d=None): return self[k] if k in self else d DictBox.register(dict_) @primitive def container_untake(x, idx, vs): if isinstance(idx, slice): accum = lambda result: [elt_vs._mut_add(a, b) for elt_vs, a, b in zip(vs.shape[idx], result, x)] else: accum = lambda result: vs.shape[idx]._mut_add(result, x) def mut_add(A): return vs._subval(A, idx, accum(A[idx])) return SparseObject(vs, mut_add) defvjp(container_untake, lambda ans, x, idx, _: lambda g: container_take(g, idx)) defjvp(container_untake, "same") @primitive def sequence_extend_right(seq, *elts): return seq + type(seq)(elts) def grad_sequence_extend_right(argnum, ans, args, kwargs): seq, elts = args[0], args[1:] return lambda g: g[: len(seq)] if argnum == 0 else g[len(seq) + argnum - 1] defvjp_argnum(sequence_extend_right, grad_sequence_extend_right) @primitive def sequence_extend_left(seq, *elts): return type(seq)(elts) + seq def grad_sequence_extend_left(argnum, ans, args, kwargs): seq, elts = args[0], args[1:] return lambda g: g[len(elts) :] if 
argnum == 0 else g[argnum - 1] defvjp_argnum(sequence_extend_left, grad_sequence_extend_left) @primitive def make_sequence(seq_type, *args): return seq_type(args) defvjp_argnum(make_sequence, lambda argnum, *args: lambda g: g[argnum - 1]) def fwd_grad_make_sequence(argnum, g, ans, seq_type, *args, **kwargs): return container_untake(g, argnum - 1, vspace(ans)) defjvp_argnum(make_sequence, fwd_grad_make_sequence) class TupleMeta(type(tuple_)): def __instancecheck__(self, instance): return isinstance(instance, tuple_) class tuple(tuple_, metaclass=TupleMeta): def __new__(cls, xs): return make_sequence(tuple_, *xs) class ListMeta(type_): def __instancecheck__(self, instance): return isinstance(instance, list_) class list(list_, metaclass=ListMeta): def __new__(cls, xs): return make_sequence(list_, *xs) class DictMeta(type_): def __instancecheck__(self, instance): return isinstance(instance, dict_) class dict(dict_, metaclass=DictMeta): def __new__(cls, *args, **kwargs): result = dict_(*args, **kwargs) if result: return _make_dict(result.keys(), list(result.values())) return result @primitive def _make_dict(keys, vals): return dict_(zip(keys, vals)) defvjp(_make_dict, lambda ans, keys, vals: lambda g: list(g[key] for key in keys), argnums=(1,)) class ContainerVSpace(VSpace): def __init__(self, value): self.shape = value self.shape = self._map(vspace) @property def size(self): return sum(self._values(self._map(lambda vs: vs.size))) def zeros(self): return self._map(lambda vs: vs.zeros()) def ones(self): return self._map(lambda vs: vs.ones()) def randn(self): return self._map(lambda vs: vs.randn()) def standard_basis(self): zero = self.zeros() for i, vs in self._kv_pairs(self.shape): for x in vs.standard_basis(): yield self._subval(zero, i, x) def _add(self, xs, ys): return self._map(lambda vs, x, y: vs._add(x, y), xs, ys) def _mut_add(self, xs, ys): return self._map(lambda vs, x, y: vs._mut_add(x, y), xs, ys) def _scalar_mul(self, xs, a): return self._map(lambda vs, x: 
vs._scalar_mul(x, a), xs) def _inner_prod(self, xs, ys): return sum(self._values(self._map(lambda vs, x, y: vs._inner_prod(x, y), xs, ys))) def _covector(self, xs): return self._map(lambda vs, x: vs._covector(x), xs) class SequenceVSpace(ContainerVSpace): def _values(self, x): return x def _kv_pairs(self, x): return enumerate(x) def _map(self, f, *args): return self.seq_type(map(f, self.shape, *args)) def _subval(self, xs, idx, x): return self.seq_type(subvals(xs, [(idx, x)])) class ListVSpace(SequenceVSpace): seq_type = list_ class TupleVSpace(SequenceVSpace): seq_type = tuple_ class DictVSpace(ContainerVSpace): def _values(self, x): return x.values() def _kv_pairs(self, x): return x.items() def _map(self, f, *args): return {k: f(vs, *[x[k] for x in args]) for k, vs in self.shape.items()} def _subval(self, xs, idx, x): d = dict(xs.items()) d[idx] = x return d ListVSpace.register(list_) TupleVSpace.register(tuple_) DictVSpace.register(dict_) class NamedTupleVSpace(SequenceVSpace): def _map(self, f, *args): return self.seq_type(*map(f, self.shape, *args)) def _subval(self, xs, idx, x): return self.seq_type(*subvals(xs, [(idx, x)])) ================================================ FILE: autograd/core.py ================================================ from functools import reduce from itertools import count from .tracer import Box, Node, getval, isbox, primitive, toposort, trace from .util import func, subval # -------------------- reverse mode -------------------- def make_vjp(fun, x): start_node = VJPNode.new_root() end_value, end_node = trace(start_node, fun, x) if end_node is None: def vjp(g): return vspace(x).zeros() else: def vjp(g): return backward_pass(g, end_node) return vjp, end_value def backward_pass(g, end_node): outgrads = {end_node: (g, False)} for node in toposort(end_node): outgrad = outgrads.pop(node) ingrads = node.vjp(outgrad[0]) for parent, ingrad in zip(node.parents, ingrads): outgrads[parent] = add_outgrads(outgrads.get(parent), ingrad) return 
outgrad[0]


class VJPNode(Node):
    """Reverse-mode trace node holding its parent nodes and the staged VJP of one primitive call."""

    __slots__ = ["parents", "vjp"]

    def __init__(self, value, fun, args, kwargs, parent_argnums, parents):
        """Look up the VJP maker registered for `fun` and stage the VJP for this call.

        Raises:
            NotImplementedError: if no VJP maker has been registered for `fun`.
        """
        self.parents = parents
        try:
            vjpmaker = primitive_vjps[fun]
        except KeyError:
            # Fall back to the object itself when `fun` has no `__name__`.
            fun_name = getattr(fun, "__name__", fun)
            raise NotImplementedError(f"VJP of {fun_name} wrt argnums {parent_argnums} not defined")
        self.vjp = vjpmaker(parent_argnums, value, args, kwargs)

    def initialize_root(self):
        """Set up a root node: no parents, and a VJP that yields no input gradients."""
        self.parents = []
        self.vjp = lambda g: ()


# Maps each primitive function to its registered VJP maker.
primitive_vjps = {}


def defvjp_argnums(fun, vjpmaker):
    """Register `vjpmaker` (called as `vjpmaker(argnums, ans, args, kwargs)`) as the VJP maker of `fun`."""
    primitive_vjps[fun] = vjpmaker


def defvjp_argnum(fun, vjpmaker):
    """Register a VJP maker for `fun` that takes a single argnum as its first argument.

    `vjpmaker(argnum, *args)` is called once per requested argnum; the returned VJPs are
    applied lazily (as a generator) to the incoming gradient `g`, in argnum order.
    """

    def vjp_argnums(argnums, *args):
        vjps = [vjpmaker(argnum, *args) for argnum in argnums]
        return lambda g: (vjp(g) for vjp in vjps)

    defvjp_argnums(fun, vjp_argnums)


def defvjp(fun, *vjpmakers, **kwargs):
    """Register one VJP maker per positional argument of `fun`.

    The makers are paired with the `argnums` keyword argument when it is given, and with
    0, 1, 2, ... otherwise. Each maker is either None (zero gradient for that argument) or
    a callable `maker(ans, *args, **kwargs)` returning a function of the output gradient `g`.

    Raises (when the registered VJP is requested):
        NotImplementedError: if one or two argnums are requested and any has no maker.
        KeyError: if three or more argnums are requested and any has no maker.
    """
    argnums = kwargs.get("argnums", count())
    vjps_dict = {
        argnum: translate_vjp(vjpmaker, fun, argnum) for argnum, vjpmaker in zip(argnums, vjpmakers)
    }

    def vjp_argnums(argnums, ans, args, kwargs):
        L = len(argnums)
        # These first two cases are just optimizations
        if L == 1:
            argnum = argnums[0]
            try:
                vjpfun = vjps_dict[argnum]
            except KeyError:
                # Report the argnum that was actually requested, not a hard-coded 0.
                raise NotImplementedError(f"VJP of {fun.__name__} wrt argnum {argnum} not defined")
            vjp = vjpfun(ans, *args, **kwargs)
            return lambda g: (vjp(g),)
        elif L == 2:
            argnum_0, argnum_1 = argnums
            try:
                vjp_0_fun = vjps_dict[argnum_0]
                vjp_1_fun = vjps_dict[argnum_1]
            except KeyError:
                # Report the argnums that were actually requested, not a hard-coded "0, 1".
                raise NotImplementedError(
                    f"VJP of {fun.__name__} wrt argnums {argnum_0}, {argnum_1} not defined"
                )
            vjp_0 = vjp_0_fun(ans, *args, **kwargs)
            vjp_1 = vjp_1_fun(ans, *args, **kwargs)
            return lambda g: (vjp_0(g), vjp_1(g))
        else:
            vjps = [vjps_dict[argnum](ans, *args, **kwargs) for argnum in argnums]
            return lambda g: (vjp(g) for vjp in vjps)

    defvjp_argnums(fun, vjp_argnums)


def translate_vjp(vjpfun, fun, argnum):
    """Normalise a user-supplied VJP maker for argument `argnum` of `fun`.

    None becomes a maker whose VJP returns zeros in the vector space of that argument;
    a callable is returned unchanged; anything else raises Exception.
    """
    if vjpfun is None:
        return lambda ans, *args, **kwargs: lambda g: vspace(args[argnum]).zeros()
    elif callable(vjpfun):
        return vjpfun
    else:
        raise Exception(f"Bad VJP '{vjpfun}' for '{fun.__name__}'")


# 
-------------------- forward mode -------------------- def make_jvp(fun, x): def jvp(g): start_node = JVPNode.new_root(g) end_value, end_node = trace(start_node, fun, x) if end_node is None: return end_value, vspace(end_value).zeros() else: return end_value, end_node.g return jvp class JVPNode(Node): __slots__ = ["g"] def __init__(self, value, fun, args, kwargs, parent_argnums, parents): parent_gs = [parent.g for parent in parents] try: jvpmaker = primitive_jvps[fun] except KeyError: name = getattr(fun, "__name__", fun) raise NotImplementedError(f"JVP of {name} wrt argnums {parent_argnums} not defined") self.g = jvpmaker(parent_argnums, parent_gs, value, args, kwargs) def initialize_root(self, g): self.g = g primitive_jvps = {} def defjvp_argnums(fun, jvpmaker): primitive_jvps[fun] = jvpmaker def defjvp_argnum(fun, jvpmaker): def jvp_argnums(argnums, gs, ans, args, kwargs): return sum_outgrads(jvpmaker(argnum, g, ans, args, kwargs) for argnum, g in zip(argnums, gs)) defjvp_argnums(fun, jvp_argnums) def defjvp(fun, *jvpfuns, **kwargs): argnums = kwargs.get("argnums", count()) jvps_dict = {argnum: translate_jvp(jvpfun, fun, argnum) for argnum, jvpfun in zip(argnums, jvpfuns)} def jvp_argnums(argnums, gs, ans, args, kwargs): return sum_outgrads(jvps_dict[argnum](g, ans, *args, **kwargs) for argnum, g in zip(argnums, gs)) defjvp_argnums(fun, jvp_argnums) def translate_jvp(jvpfun, fun, argnum): if jvpfun is None: return lambda g, ans, *a, **k: vspace(ans).zeros() elif jvpfun == "same": return lambda g, ans, *args, **kwargs: fun(*subval(args, argnum, g), **kwargs) elif callable(jvpfun): return jvpfun else: raise Exception(f"Bad JVP '{jvpfun}' for '{fun.__name__}'") def def_linear(fun): """Flags that a function is linear wrt all args""" defjvp_argnum(fun, lambda argnum, g, ans, args, kwargs: fun(*subval(args, argnum, g), **kwargs)) # -------------------- vector behavior -------------------- def add_outgrads(prev_g_flagged, g): sparse = type(g) in sparse_object_types if 
prev_g_flagged: vs = vspace(g) prev_g, mutable = prev_g_flagged if mutable: if sparse: return sparse_add(vs, prev_g, g), True else: return vs.mut_add(prev_g, g), True else: if sparse: prev_g_mutable = vs.mut_add(None, prev_g) return sparse_add(vs, prev_g_mutable, g), True else: return vs.add(prev_g, g), True else: if sparse: return sparse_add(vspace(g), None, g), True else: return g, False def sum_outgrads(gs): return reduce(add_outgrads, gs, None)[0] @primitive def sparse_add(vs, x_prev, x_new): x_prev = x_prev if x_prev is not None else vs.zeros() return x_new.mut_add(x_prev) class VSpace: __slots__ = [] mappings = {} iscomplex = False def __init__(self, value): pass def zeros(self): assert False, repr(self) def ones(self): assert False, repr(self) def standard_basis(self): assert False, repr(self) def randn(self): assert False, repr(self) @primitive def mut_add(self, x_prev, x_new): x_prev = x_prev if x_prev is not None else self.zeros() return self._mut_add(x_prev, x_new) @primitive def add(self, x_prev, x_new): return self._add(x_prev, x_new) @primitive def scalar_mul(self, x, a): return self._scalar_mul(x, a) @primitive def inner_prod(self, x, y): return self._inner_prod(x, y) @primitive def covector(self, x): return self._covector(x) def _add(self, x, y): return x + y def _mut_add(self, x, y): x += y return x def _scalar_mul(self, x, a): return x * a def _inner_prod(self, x, y): assert False def _covector(self, x): return x def __eq__(self, other): return type(self) == type(other) and self.__dict__ == other.__dict__ def __repr__(self): return f"{type(self).__name__}_{self.__dict__}" @classmethod def register(cls, value_type, vspace_maker=None): if vspace_maker: VSpace.mappings[value_type] = vspace_maker else: VSpace.mappings[value_type] = cls def vspace(value): try: return VSpace.mappings[type(value)](value) except KeyError: if isbox(value): return vspace(getval(value)) else: raise TypeError( "Can't find vector space for value {} of type {}. 
Valid types are {}".format( value, type(value), VSpace.mappings.keys() ) ) class SparseBox(Box): __slots__ = [] class SparseObject: __slots__ = ["vs", "mut_add"] def __init__(self, vs, mut_add): self.vs = vs self.mut_add = mut_add VSpace.register(SparseObject, lambda x: x.vs) SparseBox.register(SparseObject) sparse_object_types = {SparseObject, SparseBox} # -------------------- core reverse mode grads -------------------- identity_vjp = lambda argnums, *args: lambda g: g defvjp(sparse_add, None, identity_vjp, identity_vjp) defvjp(func(VSpace.add), None, identity_vjp, identity_vjp) defvjp(func(VSpace.mut_add), None, identity_vjp, identity_vjp) defvjp( func(VSpace.inner_prod), None, lambda ans, vs, x, y: lambda g: vs.covector(vs.scalar_mul(y, g)), lambda ans, vs, x, y: lambda g: vs.covector(vs.scalar_mul(x, g)), ) defvjp(func(VSpace.covector), None, lambda ans, vs, x: lambda g: vs.covector(g)) defvjp( func(VSpace.scalar_mul), None, lambda ans, vs, x, a: lambda g: vs.covector(vs.scalar_mul(vs.covector(g), a)), lambda ans, vs, x, a: lambda g: vs.inner_prod(g, vs.covector(x)), ) # -------------------- core forward mode grads -------------------- identity_jvp = lambda g, *args, **kwargs: g defjvp(sparse_add, None, identity_jvp, identity_jvp) defjvp(func(VSpace.mut_add), None, identity_jvp, identity_jvp) defjvp(func(VSpace.add), None, identity_jvp, identity_jvp) defjvp(func(VSpace.scalar_mul), None, "same", "same") defjvp(func(VSpace.inner_prod), None, "same", "same") defjvp(func(VSpace.covector), None, "same") # -------------------- deprecation warnings ----------------------- import warnings deprecated_defvjp_message = """ The {} method is deprecated. 
See the update guide and tutorial: https://github.com/HIPS/autograd/blob/master/docs/updateguide.md https://github.com/HIPS/autograd/blob/master/docs/tutorial.md""" def deprecated_defvjp(primitive_fun): deprecation_msg = deprecated_defvjp_message.format("defvjp") vjpfuns = {} def defvjp_unstaged(vjpmaker, argnum=0): warnings.warn(deprecation_msg) def staged_vjpmaker(ans, *args, **kwargs): def vjp(g): vs, gvs = vspace(args[argnum]), vspace(g) return vjpmaker(g, ans, vs, gvs, *args, **kwargs) return vjp vjpfuns[argnum] = staged_vjpmaker argnums, vjpmakers = zip(*[(argnum, vjpfuns[argnum]) for argnum in sorted(vjpfuns.keys())]) defvjp(primitive_fun, *vjpmakers, argnums=argnums) return defvjp_unstaged def deprecated_defvjp_is_zero(primitive_fun): deprecation_msg = deprecated_defvjp_message.format("defvjp_is_zero") zero_vjps = [set()] def defvjp_is_zero(argnums=(0,)): warnings.warn(deprecation_msg) zero_vjps[0] |= set(argnums) nones = [None] * len(zero_vjps[0]) defvjp(primitive_fun, *nones, argnums=sorted(zero_vjps[0])) return defvjp_is_zero def deprecated_defgrad(primitive_fun): deprecation_msg = deprecated_defvjp_message.format("defgrad") gradfuns = {} def defgrad(gradfun, argnum=0): warnings.warn(deprecation_msg) gradfuns[argnum] = gradfun argnums, vjpmakers = zip(*[(argnum, gradfuns[argnum]) for argnum in sorted(gradfuns.keys())]) defvjp(primitive_fun, *vjpmakers, argnums=argnums) return defgrad primitive_ = primitive def primitive_with_deprecation_warnings(f_raw): f_wrapped = primitive_(f_raw) f_wrapped.defvjp = deprecated_defvjp(f_wrapped) f_wrapped.defvjp_is_zero = deprecated_defvjp_is_zero(f_wrapped) f_wrapped.defgrad = deprecated_defgrad(f_wrapped) return f_wrapped primitive = primitive_with_deprecation_warnings ================================================ FILE: autograd/differential_operators.py ================================================ """Convenience functions built on top of `make_vjp`.""" from collections import OrderedDict try: from inspect 
@unary_to_nary
def grad(fun, x):
    """
    Returns a function which computes the gradient of `fun` with respect to
    positional argument number `argnum`. The returned function takes the same
    arguments as `fun`, but returns the gradient instead. The function `fun`
    should be scalar-valued. The gradient has the same type as the argument.

    Raises TypeError when the vector space of the output does not have size 1.
    """
    vjp, ans = _make_vjp(fun, x)
    ans_vspace = vspace(ans)
    if ans_vspace.size != 1:
        raise TypeError(
            "Grad only applies to real scalar-output functions. "
            "Try jacobian, elementwise_grad or holomorphic_grad."
        )
    # Pull back a unit cotangent to obtain the gradient itself.
    return vjp(ans_vspace.ones())
def grad_named(fun, argname):
    """Takes gradients with respect to a named argument.

    The name is resolved to a position via `_getargspec(fun).args`, so it
    doesn't work on *args or **kwargs.

    Raises:
        ValueError: if `fun` has no named positional argument `argname`. The
            message lists the names that are available.
    """
    argnames = _getargspec(fun).args
    if argname not in argnames:
        # list.index would raise a bare "'z' is not in list"; say which
        # function and which names were actually available instead.
        raise ValueError(
            f"{argname!r} is not a named positional argument of "
            f"{getattr(fun, '__name__', fun)!r}; available arguments: {argnames}"
        )
    return grad(fun, argnames.index(argname))
# TODO(mattjj): update this function using make_jvp and const_graph
def make_ggnvp(f, g=lambda x: 1.0 / 2 * np.sum(x**2, axis=-1), f_argnum=0):
    """Builds a function for evaluating generalized-Gauss-Newton-vector products
    at a point. Slightly more expensive than mixed-mode.

    `g` is applied to the output of `f`; by default it is half the sum of
    squares over the last axis. `f_argnum` is forwarded to `unary_to_nary` to
    pick the argument of `f` that is differentiated.
    """

    @unary_to_nary
    def _ggnvp_at(fun, x):
        pullback_f, f_of_x = _make_vjp(fun, x)
        hvp_g, grad_g_at_fx = _make_vjp(grad(g), f_of_x)
        # Taking the VJP of the (linear) VJP of f gives f's JVP.
        pushforward_f, _ = _make_vjp(pullback_f, vspace(grad_g_at_fx).zeros())

        def ggnvp(v):
            # J^T * H_g * J * v, evaluated right to left
            tangent = pushforward_f(v)
            curved = hvp_g(tangent)
            return pullback_f(curved)

        return ggnvp

    return _ggnvp_at(f, f_argnum)
def multigrad_dict(fun):
    """Takes gradients wrt all arguments simultaneously,
    returns a dict mapping 'argname' to 'gradval'.

    The returned function accepts the same arguments as `fun`. Arguments that
    are not supplied take their declared defaults. The result is an
    OrderedDict keyed by parameter name, in signature order.
    """
    # inspect.signature is the standard-library API that funcsigs backports.
    # Prefer it so no third-party package is required, and keep funcsigs only
    # as a fallback for interpreters where the stdlib lacks it.
    try:
        from inspect import signature
    except ImportError:
        from funcsigs import signature

    sig = signature(fun)

    def select(preds, lst):
        # Partition `lst` into one bucket per predicate (first match wins),
        # plus a final bucket for items that match none.
        idx = lambda item: next((i for i, pred in enumerate(preds) if pred(item)), len(preds))
        results = [[] for _ in preds] + [[]]
        for item in lst:
            results[idx(item)].append(item)
        return results

    is_var_pos = lambda name: sig.parameters[name].kind == sig.parameters[name].VAR_POSITIONAL
    is_var_kwd = lambda name: sig.parameters[name].kind == sig.parameters[name].VAR_KEYWORD
    var_pos, var_kwd, argnames = select([is_var_pos, is_var_kwd], sig.parameters)

    todict = lambda dct: {key: dct[key] for key in dct}

    def apply_defaults(arguments):
        # Fill in declared defaults for parameters the caller did not bind.
        defaults = {
            name: param.default for name, param in sig.parameters.items() if param.default is not param.empty
        }
        return OrderedDict(
            (name, arguments[name] if name in arguments else defaults[name]) for name in sig.parameters
        )

    def gradfun(*args, **kwargs):
        bindings = sig.bind(*args, **kwargs)

        # Accessors that rebuild fun's call from a single dict of arguments,
        # so the gradient can be taken with respect to that dict.
        star_args = lambda dct: tuple(dct[var_pos[0]]) if var_pos else ()
        star_kwargs = lambda dct: todict(dct[var_kwd[0]]) if var_kwd else {}
        others = lambda dct: tuple(dct[argname] for argname in argnames if argname not in var_kwd + var_pos)

        newfun = lambda dct: fun(*(others(dct) + star_args(dct)), **star_kwargs(dct))

        argdict = apply_defaults(bindings.arguments)
        grad_dict = grad(newfun)(dict(argdict))
        return OrderedDict((argname, grad_dict[argname]) for argname in argdict)

    return gradfun
@primitive
def fixed_point(f, a, x0, distance, tol):
    """Iterate `x <- f(a)(x)` from `x0` until successive iterates are close.

    `f(a)` is evaluated once to obtain the update map. Iteration stops as soon
    as `distance(new, old) > tol` no longer holds, and the newest iterate is
    returned. At least one update step is always performed.
    """
    step = f(a)
    previous, current = x0, step(x0)
    while distance(current, previous) > tol:
        previous, current = current, step(current)
    return current
""" import autograd.numpy as np from autograd import make_vjp from autograd.builtins import type def flatten(value): """Flattens any nesting of tuples, lists, or dicts, with numpy arrays or scalars inside. Returns 1D numpy array and an unflatten function. Doesn't preserve mixed numeric types (e.g. floats and ints). Assumes dict keys are sortable.""" unflatten, flat_value = make_vjp(_flatten)(value) return flat_value, unflatten def _flatten(value): t = type(value) if t in (list, tuple): return _concatenate(map(_flatten, value)) elif t is dict: return _concatenate(_flatten(value[k]) for k in sorted(value)) else: return np.ravel(value) def _concatenate(lst): lst = list(lst) return np.concatenate(lst) if lst else np.array([]) def flatten_func(func, example): _ex, unflatten = flatten(example) _func = lambda _x, *args: flatten(func(unflatten(_x), *args))[0] return _func, unflatten, _ex ================================================ FILE: autograd/misc/optimizers.py ================================================ """Some standard gradient-based stochastic optimizers. These are just standard routines that don't make any use of autograd, though you could take gradients of these functions too if you want to do meta-optimization. 
def unflatten_optimizer(optimize):
    """Takes an optimizer that operates on flat 1D numpy arrays and returns a
    wrapped version that handles trees of nested containers (lists/tuples/dicts)
    with arrays/scalars at the leaves.

    The wrapper flattens `x0`, adapts `grad` and `callback` so the wrapped
    optimizer only ever sees flat arrays, and unflattens the final result.
    """

    @wraps(optimize)
    def _optimize(grad, x0, callback=None, *args, **kwargs):
        flat_x0, unflatten = flatten(x0)

        def flat_grad(flat_x, i):
            # structured gradient -> flat vector
            return flatten(grad(unflatten(flat_x), i))[0]

        def flat_callback(flat_x, i, flat_g):
            # the user callback sees structured parameters and gradients
            return callback(unflatten(flat_x), i, unflatten(flat_g))

        result = optimize(flat_grad, flat_x0, flat_callback if callback else None, *args, **kwargs)
        return unflatten(result)

    return _optimize
def const_graph_unary(fun):
    """Wrap a unary `fun` so its computation graph is traced once and replayed.

    The first call traces `fun` on the given input, stores the nodes in
    topological order (root first) and returns the traced value. Later calls
    skip `fun` and re-evaluate each stored node's `partial_fun` on the new
    input.

    Raises:
        Exception: on the first call, if tracing reports no end node (the
            output is independent of the input).
    """
    graph = []
    _fun = [fun]  # Allow fun to be freed, since it may have bound args

    def maybe_cached_fun(x):
        if graph:
            _graph = graph[0]
            vals = {_graph[0]: x}
            for node in _graph[1:]:
                vals[node] = node.partial_fun([vals[p] for p in node.parents])
            # Look up the output node explicitly rather than via the loop
            # variable: when fun returns its input unchanged the graph holds
            # only the root, the loop never runs and `node` would be unbound.
            return vals[_graph[-1]]
        else:
            start_node = ConstGraphNode.new_root()
            end_value, end_node = trace(start_node, _fun.pop(), x)
            if end_node is None:
                raise Exception("Output is independent of input")
            graph.append(list(toposort(end_node))[::-1])
            return end_value

    return maybe_cached_fun
# TODO: make fft gradient work for a repeated axis,
# e.g. by replacing fftn with repeated calls to 1d fft along each axis
def fft_grad(get_args, fft_fun, ans, x, *args, **kwargs):
    """Build the VJP of a complex FFT primitive.

    Args:
        get_args: parser returning `(axes, s, norm)` from the primitive's call
            arguments. It must match the signature of `fft_fun`.
        fft_fun: the transform being differentiated. It is re-applied to the
            incoming cotangent with the same extra arguments.
        ans, x, *args, **kwargs: the primitive's output and call arguments.

    Returns:
        A function mapping the cotangent `g` to `fft_fun(g, ...)`,
        truncated/zero-padded to `x`'s shape and passed through
        `match_complex`.

    Raises:
        NotImplementedError: (via `check_no_repeated_axes`) if an axis is
            listed more than once.
    """
    axes, _s, _norm = get_args(x, *args, **kwargs)
    check_no_repeated_axes(axes)
    vs = vspace(x)
    return lambda g: match_complex(x, truncate_pad(fft_fun(g, *args, **kwargs), vs.shape))


# Each transform is paired with the argument parser for its own signature.
# The 2-D and n-D variants take (a, s, axes, norm); parsing them with the 1-D
# parser would wrap `axes` in a one-element list, so the repeated-axes guard
# could never fire and a positional list `axes` would fail inside set().
defvjp(fft, lambda *args, **kwargs: fft_grad(get_fft_args, fft, *args, **kwargs))
defvjp(ifft, lambda *args, **kwargs: fft_grad(get_fft_args, ifft, *args, **kwargs))

defvjp(fft2, lambda *args, **kwargs: fft_grad(get_fft2_args, fft2, *args, **kwargs))
defvjp(ifft2, lambda *args, **kwargs: fft_grad(get_fft2_args, ifft2, *args, **kwargs))

defvjp(fftn, lambda *args, **kwargs: fft_grad(get_fftn_args, fftn, *args, **kwargs))
defvjp(ifftn, lambda *args, **kwargs: fft_grad(get_fftn_args, ifftn, *args, **kwargs))
def irfft_grad(get_args, rfft_fun, ans, x, *args, **kwargs):
    """Build the VJP of an inverse real FFT primitive.

    `get_args` parses `(axes, shape, norm)` from the call arguments and
    `rfft_fun` is the matching forward real transform, which is applied to the
    incoming cotangent. When no output shape was passed, it is read off `ans`
    along `axes`. Repeated axes and an odd last length are rejected by the
    `check_*` helpers.
    """
    axes, full_shape, norm = get_args(x, *args, **kwargs)
    in_vs = vspace(x)
    out_vs = vspace(ans)
    check_no_repeated_axes(axes)
    if full_shape is None:
        full_shape = [out_vs.shape[axis] for axis in axes]
    check_even_shape(full_shape)

    # full_shape is the real-space (full fft) shape; the compressed shape is
    # the half-spectrum shape, whose last axis keeps n // 2 + 1 entries.
    compressed_shape = list(full_shape)
    compressed_shape[-1] = compressed_shape[-1] // 2 + 1

    def vjp(g):
        spectrum = match_complex(x, truncate_pad((rfft_fun(g, *args, **kwargs)), in_vs.shape))
        weights = make_rfft_factors(axes, in_vs.shape, compressed_shape, full_shape, norm)
        return anp.conj(spectrum) * weights

    return vjp
def make_rfft_factors(axes, resshape, facshape, normshape, norm):
    """Make the compression factors and compute the normalization
    for irfft and rfft.

    Args:
        axes: transformed axes; only the last one is compressed.
        resshape: shape of the returned factor array.
        facshape: compressed (half-spectrum) shape along `axes`.
        normshape: full transform shape; the product of its entries is the
            normalization constant N.
        norm: the `norm` argument given to the FFT call. `None` and
            "backward" (NumPy's spelled-out default) divide the factors by N,
            "forward" multiplies them by N, and any other value (e.g.
            "ortho") leaves them unscaled.

    Returns:
        Array of shape `resshape` holding 2 everywhere except 1 at index 0
        and, when it lies inside `resshape`, at index `facshape[-1] - 1` of
        the last transformed axis, scaled according to `norm`.
    """
    N = 1.0
    for n in normshape:
        N = N * n

    # In-place modification is fine here: the result is a constant that never
    # enters autograd's trace.
    fac = anp.zeros(resshape)
    fac[...] = 2
    index = [slice(None)] * len(resshape)
    if facshape[-1] <= resshape[axes[-1]]:
        index[axes[-1]] = (0, facshape[-1] - 1)
    else:
        index[axes[-1]] = (0,)
    fac[tuple(index)] = 1

    if norm is None or norm == "backward":
        # "backward" is the explicit name of the default convention
        # (unscaled forward transform, 1/N on the inverse).
        fac /= N
    elif norm == "forward":
        # Mirror image of the default: 1/N on the forward transform and none
        # on the inverse, so the correction is inverted as well.
        fac *= N
    return fac
# batched diagonal, similar to matrix_diag in tensorflow
def _matrix_diag(a):
    """Turn the last axis of `a` (length n) into n-by-n diagonal matrices.

    An input of shape (..., n) gives an output of shape (..., n, n) with the
    input values on the diagonal of each trailing matrix.
    """
    n = a.shape[-1]
    # Repeat n times along the last axis only: (..., n) -> (..., n * n).
    reps = [1] * (len(a.shape) - 1) + [n]
    tiled = anp.tile(a, reps)
    # After the reshape every row of each trailing matrix is a copy of the
    # original vector; _diag then masks out everything off the diagonal.
    return _diag(tiled.reshape([*a.shape, n]))
def norm_jvp(g, ans, x, ord=None, axis=None):
    """Forward-mode derivative (JVP) of `norm`.

    Args:
        g: tangent of `x`.
        ans: the value of the norm of `x`.
        x: the input to `norm`.
        ord, axis: the same arguments that were given to `norm`.

    Returns:
        The tangent of the norm: elementwise products with `g` summed over
        `axis` (or over everything when `axis` is None).

    Raises:
        NotImplementedError: for matrix norms other than None/"fro"/"nuc" and
            for vector norms whose `ord` is not greater than 1.
    """

    def check_implemented():
        # A tuple axis, or a 2-D input with no axis, means a matrix norm.
        matrix_norm = (x.ndim == 2 and axis is None) or isinstance(axis, tuple)

        if matrix_norm:
            if not (ord is None or ord == "fro" or ord == "nuc"):
                raise NotImplementedError(f"Gradient of matrix norm not implemented for ord={ord}")
        elif not (ord is None or ord > 1):
            raise NotImplementedError(f"Gradient of norm not implemented for ord={ord}")

    # `contract` sums over the same axes the norm itself reduces over.
    if axis is None:
        contract = lambda a: anp.sum(a)
    else:
        contract = partial(anp.sum, axis=axis)

    if ord == "nuc":
        if axis is None:
            roll = lambda a: a
            unroll = lambda a: a
        else:
            row_axis, col_axis = axis
            # Moving col_axis away first shifts row_axis down by one when it
            # sits after col_axis.
            if row_axis > col_axis:
                row_axis = row_axis - 1
            # Roll matrix axes to the back
            roll = lambda a: anp.rollaxis(anp.rollaxis(a, col_axis, a.ndim), row_axis, a.ndim - 1)
            # Roll matrix axes to their original position
            unroll = lambda a: anp.rollaxis(anp.rollaxis(a, a.ndim - 2, row_axis), a.ndim - 1, col_axis)

    check_implemented()
    if ord in (None, 2, "fro"):
        # Euclidean / Frobenius case: <g, conj(x)> / ||x||
        return contract(g * anp.conj(x)) / ans
    elif ord == "nuc":
        # Nuclear norm: contract g against U V^T from the reduced SVD,
        # computed with the matrix axes rolled to the last two positions.
        x_rolled = roll(x)
        u, s, vt = svd(x_rolled, full_matrices=False)
        uvt_rolled = _dot(u, vt)
        # Roll the matrix axes back to their correct positions
        uvt = unroll(uvt_rolled)
        return contract(g * anp.conj(uvt))
    else:
        # see https://en.wikipedia.org/wiki/Norm_(mathematics)#p-norm
        return contract(g * anp.conj(x) * anp.abs(x) ** (ord - 2)) / ans ** (ord - 1)
# https://arxiv.org/pdf/1701.00392.pdf Eq(4.77)
# Note the formula from Sec3.1 in https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf is incomplete
def grad_eig(ans, x):
    """Gradient of a general square (complex valued) matrix

    Builds the VJP of `eig`. `ans` is the `(eigenvalues, eigenvectors)` pair
    returned by `eig` for `x`. The returned function takes a cotangent pair
    `(ge, gu)` and returns the cotangent with respect to `x`, reduced to its
    real part when `x` is not complex.
    """
    e, u = ans  # eigenvalues as 1d array, eigenvectors in columns
    n = e.shape[-1]

    def vjp(g):
        ge, gu = g
        # Place the eigenvalue cotangent on the diagonal of a (batched) matrix.
        ge = _matrix_diag(ge)
        # f[..., i, j] = 1 / (e_j - e_i); the 1e-20 keeps the diagonal finite,
        # and the diagonal is zeroed on the next line.
        f = 1 / (e[..., anp.newaxis, :] - e[..., :, anp.newaxis] + 1.0e-20)
        f -= _diag(f)
        ut = anp.swapaxes(u, -1, -2)
        r1 = f * _dot(ut, gu)
        r2 = -f * (_dot(_dot(ut, anp.conj(u)), anp.real(_dot(ut, gu)) * anp.eye(n)))
        r = _dot(_dot(inv(ut), ge + r1 + r2), ut)
        if not anp.iscomplexobj(x):
            r = anp.real(r)
            # the derivative is still complex for real input (imaginary delta is allowed), real output
            # but the derivative should be real in real input case when imaginary delta is forbidden
        return r

    return vjp
# https://j-towns.github.io/papers/svd-derivative.pdf
# https://arxiv.org/abs/1909.02659
def grad_svd(usv_, a, full_matrices=True, compute_uv=True):
    """Build the VJP of `svd`.

    Args:
        usv_: the output of `svd(a, ...)`: the singular values alone when
            `compute_uv` is False, otherwise the `(u, s, vt)` triple.
        a: the matrix (or stack of matrices) that was decomposed.
        full_matrices, compute_uv: the flags that were given to `svd`.

    Returns:
        A function mapping the output cotangent to the cotangent of `a`.

    Raises:
        NotImplementedError: (when the VJP is evaluated) if `compute_uv` and
            `full_matrices` are both true.
    """

    def vjp(g):
        usv = usv_

        if not compute_uv:
            # Only singular values were returned. NOTE(review): `s` is
            # assigned but never read in this branch.
            s = usv

            # Need U and V so do the whole svd anyway...
            usv = svd(a, full_matrices=False)
            u = usv[0]
            v = anp.conj(T(usv[2]))

            # conj(U) diag(g) V^T
            return _dot(anp.conj(u) * g[..., anp.newaxis, :], T(v))

        elif full_matrices:
            raise NotImplementedError("Gradient of svd not implemented for full_matrices=True")

        else:
            u = usv[0]
            s = usv[1]
            v = anp.conj(T(usv[2]))

            m, n = a.shape[-2:]

            k = anp.min((m, n))
            # broadcastable identity array with shape (1, 1, ..., 1, k, k)
            i = anp.reshape(anp.eye(k), anp.concatenate((anp.ones(a.ndim - 2, dtype=int), (k, k))))

            # f[..., i, j] = 1 / (s_j^2 - s_i^2); adding the identity keeps
            # the diagonal, where the difference is zero, finite.
            f = 1 / (s[..., anp.newaxis, :] ** 2 - s[..., :, anp.newaxis] ** 2 + i)

            # Cotangents of u, s and v (the third output is vt, hence the
            # conjugate transpose).
            gu = g[0]
            gs = g[1]
            gv = anp.conj(T(g[2]))

            utgu = _dot(T(u), gu)
            vtgv = _dot(T(v), gv)
            t1 = (f * (utgu - anp.conj(T(utgu)))) * s[..., anp.newaxis, :]
            t1 = t1 + i * gs[..., :, anp.newaxis]
            t1 = t1 + s[..., :, anp.newaxis] * (f * (vtgv - anp.conj(T(vtgv))))

            # Extra term that only applies to complex decompositions.
            if anp.iscomplexobj(u):
                t1 = t1 + 1j * anp.imag(_diag(utgu)) / s[..., anp.newaxis, :]

            t1 = _dot(_dot(anp.conj(u), t1), T(v))

            # Non-square inputs need a correction built from the projector
            # I - V V^H (wide case) or I - U U^H (tall case).
            if m < n:
                i_minus_vvt = anp.reshape(
                    anp.eye(n), anp.concatenate((anp.ones(a.ndim - 2, dtype=int), (n, n)))
                ) - _dot(v, anp.conj(T(v)))

                t1 = t1 + anp.conj(_dot(_dot(u / s[..., anp.newaxis, :], T(gv)), i_minus_vvt))

                return t1

            elif m == n:
                return t1

            elif m > n:
                i_minus_uut = anp.reshape(
                    anp.eye(m), anp.concatenate((anp.ones(a.ndim - 2, dtype=int), (m, m)))
                ) - _dot(u, anp.conj(T(u)))

                t1 = t1 + T(_dot(_dot(v / s[..., anp.newaxis, :], T(gu)), i_minus_uut))

                return t1

    return vjp
defvjp(svd, grad_svd) ================================================ FILE: autograd/numpy/numpy_boxes.py ================================================ import numpy as np from autograd.builtins import SequenceBox from autograd.extend import Box, primitive from . import numpy_wrapper as anp Box.__array_priority__ = 90.0 class ArrayBox(Box): __slots__ = [] __array_priority__ = 100.0 @primitive def __getitem__(A, idx): return A[idx] # Constants w.r.t float data just pass though shape = property(lambda self: self._value.shape) ndim = property(lambda self: self._value.ndim) size = property(lambda self: self._value.size) dtype = property(lambda self: self._value.dtype) T = property(lambda self: anp.transpose(self)) def __array_namespace__(self, *, api_version: str | None = None): return anp def __len__(self): return len(self._value) def astype(self, *args, **kwargs): return anp._astype(self, *args, **kwargs) def __neg__(self): return anp.negative(self) def __add__(self, other): return anp.add(self, other) def __sub__(self, other): return anp.subtract(self, other) def __mul__(self, other): return anp.multiply(self, other) def __pow__(self, other): return anp.power(self, other) def __div__(self, other): return anp.divide(self, other) def __mod__(self, other): return anp.mod(self, other) def __truediv__(self, other): return anp.true_divide(self, other) def __matmul__(self, other): return anp.matmul(self, other) def __radd__(self, other): return anp.add(other, self) def __rsub__(self, other): return anp.subtract(other, self) def __rmul__(self, other): return anp.multiply(other, self) def __rpow__(self, other): return anp.power(other, self) def __rdiv__(self, other): return anp.divide(other, self) def __rmod__(self, other): return anp.mod(other, self) def __rtruediv__(self, other): return anp.true_divide(other, self) def __rmatmul__(self, other): return anp.matmul(other, self) def __eq__(self, other): return anp.equal(self, other) def __ne__(self, other): return 
# Register the result types of the linalg routines as sequence boxes. They
# live in a different private module depending on the NumPy version. Both
# bounds are compared through NumpyVersion: a plain string comparison is
# lexicographic (e.g. "1.9.5" >= "1.25" is True) and would send older 1.x
# releases down a branch whose attributes do not exist.
_numpy_version = np.lib.NumpyVersion(np.__version__)
if _numpy_version >= "2.0.0":
    SequenceBox.register(np.linalg._linalg.EigResult)
    SequenceBox.register(np.linalg._linalg.EighResult)
    SequenceBox.register(np.linalg._linalg.QRResult)
    SequenceBox.register(np.linalg._linalg.SlogdetResult)
    SequenceBox.register(np.linalg._linalg.SVDResult)
elif _numpy_version >= "1.25.0":
    SequenceBox.register(np.linalg.linalg.EigResult)
    SequenceBox.register(np.linalg.linalg.EighResult)
    SequenceBox.register(np.linalg.linalg.QRResult)
    SequenceBox.register(np.linalg.linalg.SlogdetResult)
    SequenceBox.register(np.linalg.linalg.SVDResult)
import numpy_wrapper as anp from .numpy_boxes import ArrayBox from .numpy_vjps import ( balanced_eq, dot_adjoint_0, dot_adjoint_1, match_complex, nograd_functions, replace_zero, tensordot_adjoint_0, tensordot_adjoint_1, untake, ) for fun in nograd_functions: register_notrace(JVPNode, fun) defjvp(func(ArrayBox.__getitem__), "same") defjvp(untake, "same") defjvp_argnum(anp.array_from_args, lambda argnum, g, ans, args, kwargs: untake(g, argnum - 2, vspace(ans))) defjvp( anp._array_from_scalar_or_array, None, None, lambda g, ans, args, kwargs, _: anp._array_from_scalar_or_array(args, kwargs, g), ) # ----- Functions that are constant w.r.t. continuous inputs ----- defjvp(anp.nan_to_num, lambda g, ans, x: anp.where(anp.isfinite(x), g, 0.0)) # ----- Binary ufuncs (linear) ----- def_linear(anp.multiply) # ----- Binary ufuncs ----- defjvp(anp.add, lambda g, ans, x, y: broadcast(g, ans), lambda g, ans, x, y: broadcast(g, ans)) defjvp(anp.subtract, lambda g, ans, x, y: broadcast(g, ans), lambda g, ans, x, y: broadcast(-g, ans)) defjvp(anp.divide, "same", lambda g, ans, x, y: -g * x / y**2) defjvp( anp.maximum, lambda g, ans, x, y: g * balanced_eq(x, ans, y), lambda g, ans, x, y: g * balanced_eq(y, ans, x), ) defjvp( anp.minimum, lambda g, ans, x, y: g * balanced_eq(x, ans, y), lambda g, ans, x, y: g * balanced_eq(y, ans, x), ) defjvp( anp.fmax, lambda g, ans, x, y: g * balanced_eq(x, ans, y), lambda g, ans, x, y: g * balanced_eq(y, ans, x), ) defjvp( anp.fmin, lambda g, ans, x, y: g * balanced_eq(x, ans, y), lambda g, ans, x, y: g * balanced_eq(y, ans, x), ) defjvp(anp.logaddexp, lambda g, ans, x, y: g * anp.exp(x - ans), lambda g, ans, x, y: g * anp.exp(y - ans)) defjvp(anp.logaddexp2, lambda g, ans, x, y: g * 2 ** (x - ans), lambda g, ans, x, y: g * 2 ** (y - ans)) defjvp(anp.true_divide, "same", lambda g, ans, x, y: -g * x / y**2) defjvp(anp.mod, lambda g, ans, x, y: broadcast(g, ans), lambda g, ans, x, y: -g * anp.floor(x / y)) defjvp(anp.remainder, lambda g, ans, x, y: 
broadcast(g, ans), lambda g, ans, x, y: -g * anp.floor(x / y)) defjvp( anp.power, lambda g, ans, x, y: g * y * x ** anp.where(y, y - 1, 1.0), lambda g, ans, x, y: g * anp.log(replace_zero(x, 1.0)) * ans, ) defjvp(anp.arctan2, lambda g, ans, x, y: g * y / (x**2 + y**2), lambda g, ans, x, y: g * -x / (x**2 + y**2)) # ----- Simple grads (linear) ----- defjvp(anp.negative, "same") defjvp(anp.rad2deg, "same") defjvp(anp.degrees, "same") defjvp(anp.deg2rad, "same") defjvp(anp.radians, "same") defjvp(anp.reshape, "same") defjvp(anp.roll, "same") defjvp(anp.array_split, "same") defjvp(anp.split, "same") defjvp(anp.vsplit, "same") defjvp(anp.hsplit, "same") defjvp(anp.dsplit, "same") defjvp(anp.ravel, "same") defjvp(anp.expand_dims, "same") defjvp(anp.squeeze, "same") defjvp(anp.diag, "same") defjvp(anp.diagonal, "same") defjvp(anp.make_diagonal, "same") defjvp(anp.flipud, "same") defjvp(anp.fliplr, "same") defjvp(anp.rot90, "same") defjvp(anp.trace, "same") defjvp(anp.full, "same", argnums=(1,)) defjvp(anp.triu, "same") defjvp(anp.tril, "same") defjvp(anp.swapaxes, "same") defjvp(anp.rollaxis, "same") defjvp(anp.moveaxis, "same") defjvp(anp.broadcast_to, "same") def_linear(anp.cross) # ----- Simple grads ----- np_abs_jvp = lambda g, ans, x: anp.real(g * replace_zero(anp.conj(x), 0.0)) / replace_zero(ans, 1.0) defjvp(anp.abs, np_abs_jvp) defjvp(anp.absolute, np_abs_jvp) defjvp(anp.fabs, lambda g, ans, x: anp.sign(x) * g) # fabs doesn't take complex numbers. 
# Forward-mode (JVP) rules for elementwise functions.  Each lambda receives the
# input tangent ``g``, the primal output ``ans`` and the primal input ``x`` and
# returns the output tangent.
defjvp(anp.reciprocal, lambda g, ans, x: -g / x**2)
defjvp(anp.exp, lambda g, ans, x: ans * g)
defjvp(anp.exp2, lambda g, ans, x: ans * anp.log(2) * g)
defjvp(anp.expm1, lambda g, ans, x: (ans + 1) * g)
defjvp(anp.log, lambda g, ans, x: g / x)
defjvp(anp.log2, lambda g, ans, x: g / x / anp.log(2))
defjvp(anp.log10, lambda g, ans, x: g / x / anp.log(10))
defjvp(anp.log1p, lambda g, ans, x: g / (x + 1))
defjvp(anp.sin, lambda g, ans, x: g * anp.cos(x))
defjvp(anp.cos, lambda g, ans, x: -g * anp.sin(x))
defjvp(anp.tan, lambda g, ans, x: g / anp.cos(x) ** 2)
defjvp(anp.arcsin, lambda g, ans, x: g / anp.sqrt(1 - x**2))
defjvp(anp.arccos, lambda g, ans, x: -g / anp.sqrt(1 - x**2))
defjvp(anp.arctan, lambda g, ans, x: g / (1 + x**2))
defjvp(anp.sinh, lambda g, ans, x: g * anp.cosh(x))
defjvp(anp.cosh, lambda g, ans, x: g * anp.sinh(x))
defjvp(anp.tanh, lambda g, ans, x: g / anp.cosh(x) ** 2)
defjvp(anp.arcsinh, lambda g, ans, x: g / anp.sqrt(x**2 + 1))
defjvp(anp.arccosh, lambda g, ans, x: g / anp.sqrt(x**2 - 1))
defjvp(anp.arctanh, lambda g, ans, x: g / (1 - x**2))
defjvp(anp.square, lambda g, ans, x: g * 2 * x)
defjvp(anp.sqrt, lambda g, ans, x: g * 0.5 * x**-0.5)
defjvp(
    anp.sinc,
    lambda g, ans, x: g * (anp.cos(anp.pi * x) * anp.pi * x - anp.sin(anp.pi * x)) / (anp.pi * x**2),
)
defjvp(anp.clip, lambda g, ans, x, a_min, a_max: g * anp.logical_and(ans != a_min, ans != a_max))
defjvp(anp.real_if_close, lambda g, ans, x: match_complex(ans, g))
defjvp(anp.real, lambda g, ans, x: anp.real(g))
defjvp(anp.imag, lambda g, ans, x: match_complex(ans, -1j * g))
np_conj_jvp = lambda g, ans, x: anp.conj(g)
defjvp(anp.conj, np_conj_jvp)
defjvp(anp.conjugate, np_conj_jvp)
defjvp(anp.angle, lambda g, ans, x: match_complex(ans, g * anp.conj(x * 1j) / anp.abs(x) ** 2))
defjvp(
    anp.where,
    None,
    # Both branches use anp.shape(g) (rather than g.shape) so that a tangent
    # without a ``.shape`` attribute is handled the same way for x and y.
    lambda g, ans, c, x=None, y=None: anp.where(c, g, anp.zeros(anp.shape(g))),
    lambda g, ans, c, x=None, y=None: anp.where(c, anp.zeros(anp.shape(g)), g),
)

# ----- Trickier grads -----

defjvp(anp.kron, "same", "same")
defjvp(anp.diff, "same")
defjvp(anp.gradient, "same")
defjvp(anp.repeat, "same")
defjvp(anp.tile, "same")
defjvp(anp.transpose, "same")
defjvp(anp.sum, "same")
defjvp(anp.mean, "same")
defjvp(
    anp.prod, lambda g, ans, x, axis=None, keepdims=False: ans * anp.sum(g / x, axis=axis, keepdims=keepdims)
)
defjvp(
    anp.linspace,
    lambda g, ans, start, stop, *args, **kwargs: anp.linspace(g, 0, *args, **kwargs),
    lambda g, ans, start, stop, *args, **kwargs: anp.linspace(0, g, *args, **kwargs),
)


def forward_grad_np_var(g, ans, x, axis=None, ddof=0, keepdims=False):
    """Forward-mode derivative of ``anp.var`` with respect to ``x``.

    ``g`` is the tangent of ``x``; ``axis``, ``ddof`` and ``keepdims`` mirror
    the arguments of the primal call.  ``num_reps`` is the number of elements
    reduced over, read from the shape of ``g``.
    """
    if axis is None:
        num_reps = anp.size(g)
    elif isinstance(axis, int):
        num_reps = anp.shape(g)[axis]
    elif isinstance(axis, tuple):
        # Was ``np.shape(g)``: ``np`` is not defined in this module (numpy is
        # imported as ``onp``), so a tuple axis raised NameError.
        num_reps = anp.prod(anp.array(anp.shape(g))[list(axis)])

    x_minus_mean = anp.conj(x - anp.mean(x, axis=axis, keepdims=True))
    return 2.0 * anp.sum(anp.real(g * x_minus_mean), axis=axis, keepdims=keepdims) / (num_reps - ddof)


defjvp(anp.var, forward_grad_np_var)


def forward_grad_np_std(g, ans, x, axis=None, ddof=0, keepdims=False):
    """Forward-mode derivative of ``anp.std`` with respect to ``x``.

    Returns zeros shaped like ``ans`` when at most one element is reduced,
    which avoids dividing by ``ans`` in that case.
    """
    if axis is None:
        num_reps = anp.size(g)
    elif isinstance(axis, int):
        num_reps = anp.shape(g)[axis]
    elif isinstance(axis, tuple):
        num_reps = anp.prod(anp.array(anp.shape(g))[list(axis)])

    if num_reps <= 1:
        return anp.zeros_like(ans)
    x_minus_mean = anp.conj(x - anp.mean(x, axis=axis, keepdims=True))
    return anp.sum(anp.real(g * x_minus_mean), axis=axis, keepdims=keepdims) / ((num_reps - ddof) * ans)


defjvp(anp.std, forward_grad_np_std)


def fwd_grad_chooser(g, ans, x, axis=None, keepdims=False):
    """Forward-mode derivative for the reductions registered below (max/min/amax/amin).

    The tangent is averaged over every location of ``x`` that equals the
    selected value.
    """
    if anp.isscalar(x):
        return g
    if not keepdims:
        # Re-insert the reduced axes so ``ans`` broadcasts against ``x``.
        if isinstance(axis, int):
            ans = anp.expand_dims(ans, axis)
        elif isinstance(axis, tuple):
            for ax in sorted(axis):
                ans = anp.expand_dims(ans, ax)
    chosen_locations = x == ans
    return anp.sum((g * chosen_locations), axis=axis, keepdims=keepdims) / anp.sum(
        chosen_locations, axis=axis, keepdims=keepdims
    )


defjvp(anp.max, fwd_grad_chooser)
defjvp(anp.min, fwd_grad_chooser)
defjvp(anp.amax, fwd_grad_chooser)
defjvp(anp.amin, fwd_grad_chooser)

defjvp(anp.cumsum, "same")

def_linear(anp.inner)
def_linear(anp.matmul)
def_linear(anp.dot)
def_linear(anp.tensordot)
def_linear(anp.outer)

def_linear(dot_adjoint_0)
def_linear(dot_adjoint_1)

def_linear(tensordot_adjoint_0)
def_linear(tensordot_adjoint_1)


def fwd_grad_concatenate_args(argnum, g, ans, axis_args, kwargs):
    """JVP of ``anp.concatenate_args`` for the operand at position ``argnum``.

    ``axis_args[0]`` is the concatenation axis; the tangent ``g`` is placed at
    position ``argnum`` and every other operand contributes zeros.
    """
    result = []
    for i in range(1, len(axis_args)):
        if i == argnum:
            result.append(g)
        else:
            result.append(anp.zeros_like(axis_args[i]))
    return anp.concatenate_args(axis_args[0], *result)


defjvp_argnum(anp.concatenate_args, fwd_grad_concatenate_args)


def fwd_grad_sort(g, ans, x, axis=-1, kind="quicksort", order=None):
    """JVP of ``anp.sort``: index the tangent with the permutation that sorts ``x``."""
    sort_perm = anp.argsort(x, axis, kind, order)
    return g[sort_perm]


defjvp(anp.sort, fwd_grad_sort)
if onp.lib.NumpyVersion(onp.__version__) < "2.0.0":
    defjvp(anp.msort, lambda g, ans, x: fwd_grad_sort(g, ans, x, axis=0))


def fwd_grad_partition(g, ans, x, kth, axis=-1, kind="introselect", order=None):
    """JVP of ``anp.partition``: index the tangent with the partitioning permutation of ``x``."""
    partition_perm = anp.argpartition(x, kth, axis, kind, order)
    return g[partition_perm]


defjvp(anp.partition, fwd_grad_partition)


def atleast_jvpmaker(fun):
    """Build a JVP for an ``atleast_*d`` function by applying ``fun`` to the tangent.

    The returned JVP raises NotImplementedError when more than one array is passed.
    """

    def jvp(g, ans, *arys):
        if len(arys) > 1:
            raise NotImplementedError("Can't handle multiple arguments yet.")
        return fun(g)

    return jvp


defjvp(anp.atleast_1d, atleast_jvpmaker(anp.atleast_1d))
defjvp(anp.atleast_2d, atleast_jvpmaker(anp.atleast_2d))
defjvp(anp.atleast_3d, atleast_jvpmaker(anp.atleast_3d))

def_linear(anp.einsum)


# TODO(mattjj): can we call np.broadcast_to or a related function instead?
def broadcast(x, target):
    """Expand ``x`` so that it matches the metadata of ``target``.

    Leading axes are added until ``x`` has ``target``'s number of dimensions,
    every axis of ``x`` that has size 1 is repeated to the corresponding size
    in ``target``'s shape, and ``x`` is made complex when ``target`` is complex.
    """
    target_shape, target_ndim, target_dtype, target_iscomplex = anp.metadata(target)
    # Prepend axes until the ranks agree.
    while anp.ndim(x) < target_ndim:
        x = anp.expand_dims(x, 0)
    # The shape is read once, before any axis is repeated.
    for axis, size in enumerate(anp.shape(x)):
        if size == 1:
            x = anp.repeat(x, target_shape[axis], axis=axis)
    if target_iscomplex and not anp.iscomplexobj(x):
        x = x + 0j  # TODO(mattjj): this might promote the dtype
    return x


# The JVP of pad re-applies pad to the tangent with the same width and mode;
# extra keyword arguments are accepted but not forwarded.
defjvp(anp.pad, lambda g, ans, array, width, mode, **kwargs: anp.pad(g, width, mode))

================================================
FILE: autograd/numpy/numpy_vjps.py
================================================
from functools import partial

import numpy as onp

from autograd.extend import SparseObject, VJPNode, defvjp, defvjp_argnum, primitive, register_notrace, vspace

from ..util import func
from . import numpy_wrapper as anp
from .numpy_boxes import ArrayBox

# ----- Non-differentiable functions -----

# Every function listed here is registered as not traced for VJPNode below.
nograd_functions = [
    anp.floor,
    anp.ceil,
    anp.round,
    anp.rint,
    anp.around,
    anp.fix,
    anp.trunc,
    anp.all,
    anp.any,
    anp.argmax,
    anp.argmin,
    anp.argpartition,
    anp.argsort,
    anp.argwhere,
    anp.nonzero,
    anp.flatnonzero,
    anp.count_nonzero,
    anp.searchsorted,
    anp.sign,
    anp.ndim,
    anp.shape,
    anp.floor_divide,
    anp.logical_and,
    anp.logical_or,
    anp.logical_not,
    anp.logical_xor,
    anp.isfinite,
    anp.isinf,
    anp.isnan,
    anp.isneginf,
    anp.isposinf,
    anp.allclose,
    anp.isclose,
    anp.array_equal,
    anp.array_equiv,
    anp.greater,
    anp.greater_equal,
    anp.less,
    anp.less_equal,
    anp.equal,
    anp.not_equal,
    anp.iscomplexobj,
    anp.iscomplex,
    anp.size,
    anp.isscalar,
    anp.isreal,
    anp.zeros_like,
    anp.ones_like,
    anp.empty_like,
    anp.full_like,
    anp.result_type,
]

for fun in nograd_functions:
    register_notrace(VJPNode, fun)

# ----- Functions that are constant w.r.t.
# continuous inputs -----

defvjp(anp.nan_to_num, lambda ans, x: lambda g: anp.where(anp.isfinite(x), g, 0.0))

# ----- Binary ufuncs -----

# Each VJP maker receives the primal output ``ans`` and the primal inputs and
# returns a function mapping the output cotangent ``g`` to the cotangent of the
# corresponding input.
defvjp(
    anp.add, lambda ans, x, y: unbroadcast_f(x, lambda g: g), lambda ans, x, y: unbroadcast_f(y, lambda g: g)
)
defvjp(
    anp.multiply,
    lambda ans, x, y: unbroadcast_f(x, lambda g: y * g),
    lambda ans, x, y: unbroadcast_f(y, lambda g: x * g),
)
defvjp(
    anp.subtract,
    lambda ans, x, y: unbroadcast_f(x, lambda g: g),
    lambda ans, x, y: unbroadcast_f(y, lambda g: -g),
)
defvjp(
    anp.divide,
    lambda ans, x, y: unbroadcast_f(x, lambda g: g / y),
    lambda ans, x, y: unbroadcast_f(y, lambda g: -g * x / y**2),
)
defvjp(
    anp.maximum,
    lambda ans, x, y: unbroadcast_f(x, lambda g: g * balanced_eq(x, ans, y)),
    lambda ans, x, y: unbroadcast_f(y, lambda g: g * balanced_eq(y, ans, x)),
)
defvjp(
    anp.minimum,
    lambda ans, x, y: unbroadcast_f(x, lambda g: g * balanced_eq(x, ans, y)),
    lambda ans, x, y: unbroadcast_f(y, lambda g: g * balanced_eq(y, ans, x)),
)
defvjp(
    anp.fmax,
    lambda ans, x, y: unbroadcast_f(x, lambda g: g * balanced_eq(x, ans, y)),
    lambda ans, x, y: unbroadcast_f(y, lambda g: g * balanced_eq(y, ans, x)),
)
defvjp(
    anp.fmin,
    lambda ans, x, y: unbroadcast_f(x, lambda g: g * balanced_eq(x, ans, y)),
    lambda ans, x, y: unbroadcast_f(y, lambda g: g * balanced_eq(y, ans, x)),
)
defvjp(
    anp.logaddexp,
    lambda ans, x, y: unbroadcast_f(x, lambda g: g * anp.exp(x - ans)),
    lambda ans, x, y: unbroadcast_f(y, lambda g: g * anp.exp(y - ans)),
)
defvjp(
    anp.logaddexp2,
    lambda ans, x, y: unbroadcast_f(x, lambda g: g * 2 ** (x - ans)),
    lambda ans, x, y: unbroadcast_f(y, lambda g: g * 2 ** (y - ans)),
)
defvjp(
    anp.true_divide,
    lambda ans, x, y: unbroadcast_f(x, lambda g: g / y),
    lambda ans, x, y: unbroadcast_f(y, lambda g: -g * x / y**2),
)
defvjp(
    anp.mod,
    lambda ans, x, y: unbroadcast_f(x, lambda g: g),
    lambda ans, x, y: unbroadcast_f(y, lambda g: -g * anp.floor(x / y)),
)
defvjp(
    anp.remainder,
    lambda ans, x, y: unbroadcast_f(x, lambda g: g),
    lambda ans, x, y: unbroadcast_f(y, lambda g: -g * anp.floor(x / y)),
)
defvjp(
    anp.power,
    lambda ans, x, y: unbroadcast_f(x, lambda g: g * y * x ** anp.where(y, y - 1, 1.0)),
    lambda ans, x, y: unbroadcast_f(y, lambda g: g * anp.log(replace_zero(x, 1.0)) * ans),
)
defvjp(
    anp.arctan2,
    lambda ans, x, y: unbroadcast_f(x, lambda g: g * y / (x**2 + y**2)),
    lambda ans, x, y: unbroadcast_f(y, lambda g: g * -x / (x**2 + y**2)),
)
defvjp(
    anp.hypot,
    lambda ans, x, y: unbroadcast_f(x, lambda g: g * x / ans),
    lambda ans, x, y: unbroadcast_f(y, lambda g: g * y / ans),
)

# ----- Simple grads -----

defvjp(anp.negative, lambda ans, x: lambda g: -g)
np_abs_vjp = lambda ans, x: lambda g: g * replace_zero(anp.conj(x), 0.0) / replace_zero(ans, 1.0)
defvjp(anp.abs, np_abs_vjp)
defvjp(anp.absolute, np_abs_vjp)
defvjp(anp.fabs, lambda ans, x: lambda g: anp.sign(x) * g)  # fabs doesn't take complex numbers.
defvjp(anp.reciprocal, lambda ans, x: lambda g: -g / x**2)
defvjp(anp.exp, lambda ans, x: lambda g: ans * g)
defvjp(anp.exp2, lambda ans, x: lambda g: ans * anp.log(2) * g)
defvjp(anp.expm1, lambda ans, x: lambda g: (ans + 1) * g)
defvjp(anp.log, lambda ans, x: lambda g: g / x)
defvjp(anp.log2, lambda ans, x: lambda g: g / x / anp.log(2))
defvjp(anp.log10, lambda ans, x: lambda g: g / x / anp.log(10))
defvjp(anp.log1p, lambda ans, x: lambda g: g / (x + 1))
defvjp(anp.sin, lambda ans, x: lambda g: g * anp.cos(x))
defvjp(anp.cos, lambda ans, x: lambda g: -g * anp.sin(x))
defvjp(anp.tan, lambda ans, x: lambda g: g / anp.cos(x) ** 2)
defvjp(anp.arcsin, lambda ans, x: lambda g: g / anp.sqrt(1 - x**2))
defvjp(anp.arccos, lambda ans, x: lambda g: -g / anp.sqrt(1 - x**2))
defvjp(anp.arctan, lambda ans, x: lambda g: g / (1 + x**2))
defvjp(anp.sinh, lambda ans, x: lambda g: g * anp.cosh(x))
defvjp(anp.cosh, lambda ans, x: lambda g: g * anp.sinh(x))
defvjp(anp.tanh, lambda ans, x: lambda g: g / anp.cosh(x) ** 2)
defvjp(anp.arcsinh, lambda ans, x: lambda g: g / anp.sqrt(x**2 + 1))
defvjp(anp.arccosh, lambda ans, x: lambda g: g / anp.sqrt(x**2 - 1))
defvjp(anp.arctanh, lambda ans, x: lambda g: g / (1 - x**2))
defvjp(anp.rad2deg, lambda ans, x: lambda g: g / anp.pi * 180.0)
defvjp(anp.degrees, lambda ans, x: lambda g: g / anp.pi * 180.0)
defvjp(anp.deg2rad, lambda ans, x: lambda g: g * anp.pi / 180.0)
defvjp(anp.radians, lambda ans, x: lambda g: g * anp.pi / 180.0)
defvjp(anp.square, lambda ans, x: lambda g: g * 2 * x)
defvjp(anp.sqrt, lambda ans, x: lambda g: g * 0.5 * x**-0.5)
defvjp(
    anp.sinc,
    lambda ans, x: lambda g: g
    * (anp.cos(anp.pi * x) * anp.pi * x - anp.sin(anp.pi * x))
    / (anp.pi * x**2),
)
defvjp(anp.reshape, lambda ans, x, shape, order=None: lambda g: anp.reshape(g, anp.shape(x), order=order))
defvjp(anp.roll, lambda ans, x, shift, axis=None: lambda g: anp.roll(g, -shift, axis=axis))
defvjp(anp.array_split, lambda ans, ary, idxs, axis=0: lambda g: anp.concatenate(g, axis=axis))
defvjp(anp.split, lambda ans, ary, idxs, axis=0: lambda g: anp.concatenate(g, axis=axis))
defvjp(anp.vsplit, lambda ans, ary, idxs: lambda g: anp.concatenate(g, axis=0))
defvjp(anp.hsplit, lambda ans, ary, idxs: lambda g: anp.concatenate(g, axis=1))
defvjp(anp.dsplit, lambda ans, ary, idxs: lambda g: anp.concatenate(g, axis=2))
defvjp(anp.ravel, lambda ans, x, order=None: lambda g: anp.reshape(g, anp.shape(x), order=order))
defvjp(anp.expand_dims, lambda ans, x, axis: lambda g: anp.reshape(g, anp.shape(x)))
defvjp(anp.squeeze, lambda ans, x, axis=None: lambda g: anp.reshape(g, anp.shape(x)))
defvjp(anp.diag, lambda ans, x, k=0: lambda g: anp.diag(g, k))
defvjp(anp.flipud, lambda ans, x,: lambda g: anp.flipud(g))
defvjp(anp.fliplr, lambda ans, x,: lambda g: anp.fliplr(g))
defvjp(anp.rot90, lambda ans, x, k=1: lambda g: anp.rot90(g, -k))
defvjp(
    anp.trace,
    lambda ans, x, offset=0: (
        lambda g: anp.einsum("ij,...->ij...", anp.eye(x.shape[0], x.shape[1], k=offset), g)
    ),
)
defvjp(anp.full, lambda ans, shape, fill_value, dtype=None: lambda g: anp.sum(g), argnums=(1,))
defvjp(anp.triu, lambda ans, x, k=0: lambda g: anp.triu(g, k=k))
defvjp(anp.tril, lambda ans, x, k=0: lambda g: anp.tril(g, k=k))
defvjp(anp.clip, lambda ans, x, a_min, a_max: lambda g: g * anp.logical_and(ans != a_min, ans != a_max))
defvjp(anp.swapaxes, lambda ans, x, axis1, axis2: lambda g: anp.swapaxes(g, axis2, axis1))
defvjp(anp.moveaxis, lambda ans, a, source, destination: lambda g: anp.moveaxis(g, destination, source))
defvjp(anp.real_if_close, lambda ans, x: lambda g: match_complex(x, g))
defvjp(anp.real, lambda ans, x: lambda g: match_complex(x, g))
defvjp(anp.imag, lambda ans, x: lambda g: match_complex(x, -1j * g))
np_conj_vjp = lambda ans, x: lambda g: anp.conj(g)
defvjp(anp.conj, np_conj_vjp)
defvjp(anp.conjugate, np_conj_vjp)
defvjp(anp.angle, lambda ans, x: lambda g: match_complex(x, g * anp.conj(x * 1j) / anp.abs(x) ** 2))
defvjp(
    anp.where,
    None,
    lambda ans, c, x=None, y=None: lambda g: anp.where(c, g, anp.zeros(g.shape)),
    lambda ans, c, x=None, y=None: lambda g: anp.where(c, anp.zeros(g.shape), g),
)
defvjp(
    anp.cross,
    lambda ans, a, b, axisa=-1, axisb=-1, axisc=-1, axis=None: (
        lambda g: anp.cross(b, g, axisb, axisc, axisa, axis)
    ),
    lambda ans, a, b, axisa=-1, axisb=-1, axisc=-1, axis=None: (
        lambda g: anp.cross(g, a, axisc, axisa, axisb, axis)
    ),
)
defvjp(
    anp.linspace,
    lambda ans, start, stop, num: lambda g: anp.dot(anp.linspace(1.0, 0.0, num), g),
    lambda ans, start, stop, num: lambda g: anp.dot(anp.linspace(0.0, 1.0, num), g),
)

defvjp(
    anp._astype,
    lambda ans, A, dtype, order="K", casting="unsafe", subok=True, copy=True: (
        lambda g: anp._astype(g, A.dtype)
    ),
)


# ----- Trickier grads -----
def grad_rollaxis(ans, a, axis, start=0):
    """VJP maker for ``anp.rollaxis``.

    Raises NotImplementedError when ``axis`` or ``start`` is negative.
    """
    if axis < 0:
        raise NotImplementedError(
            "Gradient of rollaxis not implemented for axis < 0. Please use moveaxis instead."
        )
    elif start < 0:
        raise NotImplementedError(
            "Gradient of rollaxis not implemented for start < 0. Please use moveaxis instead."
        )
    return lambda g: anp.rollaxis(g, start - 1, axis) if start > axis else anp.rollaxis(g, start, axis + 1)


defvjp(anp.rollaxis, grad_rollaxis)


def grad_diff(ans, a, n=1, axis=-1):
    """VJP maker for ``anp.diff``: applies the one-step ``undiff`` to the cotangent ``n`` times."""
    nd = anp.ndim(a)
    ans_shape = anp.shape(ans)
    # sl1 selects the first entry along ``axis``; sl2 selects the last one.
    sl1 = [slice(None)] * nd
    sl1[axis] = slice(None, 1)

    sl2 = [slice(None)] * nd
    sl2[axis] = slice(-1, None)

    def undiff(g):
        if g.shape[axis] > 0:
            return anp.concatenate((-g[tuple(sl1)], -anp.diff(g, axis=axis), g[tuple(sl2)]), axis=axis)
        shape = list(ans_shape)
        shape[axis] = 1
        return anp.zeros(shape)

    def helper(g, n):
        if n == 0:
            return g
        return helper(undiff(g), n - 1)

    return lambda g: helper(g, n)


defvjp(anp.diff, grad_diff)


def grad_gradient(ans, x, *vargs, **kwargs):
    """VJP maker for ``anp.gradient``.

    Only the ``axis`` keyword is supported; any other positional or keyword
    argument raises NotImplementedError.
    """
    axis = kwargs.pop("axis", None)
    if vargs or kwargs:
        raise NotImplementedError("The only optional argument currently supported for np.gradient is axis.")
    if axis is None:
        axis = range(x.ndim)
    elif type(axis) is int:
        axis = [axis]
    else:
        axis = list(axis)

    x_dtype = x.dtype
    x_shape = x.shape
    nd = x.ndim

    def vjp(g):
        if anp.ndim(g) == nd:
            # add axis if gradient was along one axis only
            g = g[anp.newaxis]

        # accumulate gradient
        out = anp.zeros(x_shape, dtype=x_dtype)

        for i, a in enumerate(axis):
            # swap gradient axis to the front
            g_swap = anp.swapaxes(g[i], 0, a)[:, anp.newaxis]

            out_axis = anp.concatenate(
                (
                    -g_swap[0] - 0.5 * g_swap[1],
                    g_swap[0] - 0.5 * g_swap[2],
                    (-1.0) * anp.gradient(g_swap, axis=0)[2:-2, 0],
                    0.5 * g_swap[-3] - g_swap[-1],
                    0.5 * g_swap[-2] + g_swap[-1],
                ),
                axis=0,
            )

            out = out + anp.swapaxes(out_axis, 0, a)

        return out

    return vjp


defvjp(anp.gradient, grad_gradient)


def grad_repeat(ans, x, repeats, axis=None):
    """VJP maker for ``anp.repeat``.

    The cotangent is reshaped so that the ``repeats`` copies of each element
    form their own axis, which is then summed out.  ``repeats`` is used as a
    single entry of a shape tuple.
    """
    shape = anp.shape(x)

    def vjp(g):
        if axis is None:  # If axis is none, np.repeat() repeats the flattened array.
            expanded = anp.reshape(g, (anp.prod(shape),) + (repeats,))
            return anp.reshape(anp.sum(expanded, axis=1, keepdims=False), shape)
        else:
            if shape[axis] == 1:  # For this common case, the logic is simple.
                return anp.sum(g, axis=axis, keepdims=True)
            else:
                # Normalise a negative axis first: with e.g. axis=-1 the
                # slices shape[0 : axis + 1] / shape[axis + 1 :] would be
                # () and the whole shape, producing a wrong reshape.
                ax = axis % len(shape)
                expanded = anp.reshape(g, shape[0 : ax + 1] + (repeats,) + shape[ax + 1 :])
                return anp.sum(expanded, axis=ax + 1, keepdims=False)

    return vjp


defvjp(anp.repeat, grad_repeat)


def grad_tile(ans, x, reps):
    """VJP maker for ``anp.tile``: sums the tiled copies along each axis and reshapes to ``x``'s shape."""
    reps = [reps] if anp.isscalar(reps) else reps
    x_shape = anp.shape(x)

    def vjp(g):
        for axis, rep in enumerate(reps):
            g = sum(anp.split(g, rep, axis))
        return anp.reshape(g, x_shape)

    return vjp


defvjp(anp.tile, grad_tile)


def grad_kron(argnum, ans, orig_A, orig_B):
    """VJP maker for ``anp.kron`` with respect to operand ``argnum`` (0 for A, otherwise B)."""
    # kron has different promotion rules than dot. the reshapes are necessary if
    # and only if (1) orig_B is 1D or (2) orig_A and/or orig_B are 0D
    orig_A_shape = anp.shape(orig_A)
    orig_B_shape = anp.shape(orig_B)

    def vjp(G):
        A, B = anp.atleast_2d(orig_A), anp.atleast_2d(orig_B)
        shape = list(A.shape + B.shape)
        n = anp.ndim(A)
        shape[n - 1], shape[n] = shape[n], shape[n - 1]
        reshaped_G = anp.swapaxes(anp.reshape(G, shape), n - 1, n)
        if argnum == 0:
            return match_complex(
                orig_A, anp.reshape(anp.tensordot(reshaped_G, B, axes=anp.ndim(B)), orig_A_shape)
            )
        else:
            return match_complex(
                orig_B, anp.reshape(anp.tensordot(A, reshaped_G, axes=anp.ndim(A)), orig_B_shape)
            )

    return vjp


defvjp(anp.kron, partial(grad_kron, 0), partial(grad_kron, 1))


def grad_transpose(ans, x, axes=None):
    """VJP maker for ``anp.transpose``: transposes the cotangent with the inverse permutation."""
    if axes is not None:
        axes = anp.argsort(axes)
    return lambda g: anp.transpose(g, axes)


defvjp(anp.transpose, grad_transpose)


def repeat_to_match_shape(g, shape, dtype, axis, keepdims):
    """Returns the array g repeated along axis to fit vector space vs.
    Also returns the number of repetitions of the array."""
    if shape == ():
        return g, 1
    axis = list(axis) if isinstance(axis, tuple) else axis
    new_shape = onp.array(shape)
    new_shape[axis] = 1
    num_reps = onp.prod(onp.array(shape)[axis])
    # Can't use broadcast_to because of numpy bug: https://github.com/numpy/numpy/issues/9165
    # return anp.broadcast_to(anp.reshape(g, new_shape), shape), num_reps
    return anp.reshape(g, new_shape) + onp.zeros(shape, dtype=dtype), num_reps


def grad_broadcast_to(ans, x, new_shape):
    """VJP maker for ``anp.broadcast_to``: sums the cotangent over the axes that were broadcast.

    Asserts that ``x`` and ``new_shape`` have the same number of dimensions.
    """
    old_shape = anp.shape(x)
    assert anp.shape(ans) == new_shape
    assert len(old_shape) == len(new_shape), "Can't handle extra leading dims"

    broadcast_axes = tuple(
        onp.where(onp.logical_and(onp.array(old_shape) == 1, onp.array(new_shape) > 1))[0]
    )

    return lambda g: anp.sum(g, axis=broadcast_axes, keepdims=True)


defvjp(anp.broadcast_to, grad_broadcast_to)


def grad_np_sum(ans, x, axis=None, keepdims=False, dtype=None):
    """VJP maker for ``anp.sum``: repeats the cotangent back to the shape of ``x``."""
    shape, dtype = anp.shape(x), anp.result_type(x)
    return lambda g: repeat_to_match_shape(g, shape, dtype, axis, keepdims)[0]


defvjp(anp.sum, grad_np_sum)


def grad_np_mean(ans, x, axis=None, keepdims=False):
    """VJP maker for ``anp.mean``: repeated cotangent divided by the number of averaged elements."""
    shape, dtype = anp.shape(x), anp.result_type(x)

    def vjp(g):
        g_repeated, num_reps = repeat_to_match_shape(g, shape, dtype, axis, keepdims)
        return g_repeated / num_reps

    return vjp


defvjp(anp.mean, grad_np_mean)


def grad_np_prod(ans, x, axis=None, keepdims=False):  # TODO: Support tuples of axes.
    """VJP maker for ``anp.prod``: repeats ``g * ans`` to the shape of ``x`` and divides by ``x``."""
    shape, dtype = anp.shape(x), anp.result_type(x)

    def vjp(g):
        g_repeated, _ = repeat_to_match_shape(g * ans, shape, dtype, axis, keepdims)
        return g_repeated / x

    return vjp


defvjp(anp.prod, grad_np_prod)


def grad_np_var(ans, x, axis=None, ddof=0, keepdims=False):
    """VJP maker for ``anp.var``."""
    shape, _, dtype, iscomplex = anp.metadata(x)

    def vjp(g):
        if iscomplex:
            g = g + 0j
        g_repeated, num_reps = repeat_to_match_shape(g, shape, dtype, axis, keepdims)
        x_minus_mean = anp.conj(x - anp.mean(x, axis=axis, keepdims=True))
        return 2.0 * g_repeated * x_minus_mean / (num_reps - ddof)

    return vjp


defvjp(anp.var, grad_np_var)


def grad_np_std(ans, x, axis=None, ddof=0, keepdims=False):
    """VJP maker for ``anp.std``; returns a zero cotangent when at most one element is reduced."""
    shape, _, dtype, iscomplex = anp.metadata(x)

    def vjp(g):
        if iscomplex:
            g = g + 0j
        g_repeated, num_reps = repeat_to_match_shape(
            g, shape, dtype, axis, keepdims
        )  # Avoid division by zero.
        if num_reps <= 1:
            return g_repeated * 0.0
        else:
            g_repeated, num_reps = repeat_to_match_shape(g / ans, shape, dtype, axis, keepdims)
            x_minus_mean = anp.conj(x - anp.mean(x, axis=axis, keepdims=True))
            return g_repeated * x_minus_mean / (num_reps - ddof)

    return vjp


defvjp(anp.std, grad_np_std)


def grad_chooser(ans, x, axis=None, keepdims=None):
    """VJP maker shared by the max/min/amax/amin registrations below."""
    shape, dtype = anp.shape(x), anp.result_type(x)

    def vjp(g):
        """Builds gradient of functions that choose a
        single item, such as min or max."""
        g_repeated, _ = repeat_to_match_shape(g, shape, dtype, axis, keepdims)
        argmax_locations = x == repeat_to_match_shape(ans, shape, dtype, axis, keepdims)[0]
        return g_repeated * argmax_locations / onp.sum(argmax_locations, axis=axis, keepdims=True)

    return vjp


defvjp(anp.max, grad_chooser)
defvjp(anp.min, grad_chooser)
defvjp(anp.amax, grad_chooser)
defvjp(anp.amin, grad_chooser)


def reverse_axis(x, axis):
    """Return ``x`` with the entries along ``axis`` in reverse order."""
    x = x.swapaxes(axis, 0)
    x = x[::-1, ...]
    return x.swapaxes(0, axis)


def grad_np_cumsum(ans, x, axis=None):
    """VJP maker for ``anp.cumsum``: a reversed cumulative sum of the cotangent.

    A falsy ``axis`` (``None`` or ``0``) takes the second branch, which
    reverses along the leading axis and reshapes to ``x.shape``.
    """

    def vjp(g):
        if axis:
            return reverse_axis(anp.cumsum(reverse_axis(g, axis), axis), axis)
        else:
            return anp.reshape(anp.cumsum(g[::-1], axis)[::-1], x.shape)

    return vjp


defvjp(anp.cumsum, grad_np_cumsum)


def grad_inner(argnum, ans, A, B):
    """VJP maker for ``anp.inner`` with respect to operand ``argnum``, via the tensordot adjoints."""
    A_ndim, B_ndim = anp.ndim(A), anp.ndim(B)
    if A_ndim == 0 or B_ndim == 0:
        axes = ([], [])
    else:
        axes = ([A_ndim - 1], [B_ndim - 1])
    if argnum == 0:
        return lambda G: tensordot_adjoint_0(B, G, axes, A_ndim, B_ndim)
    elif argnum == 1:
        return lambda G: tensordot_adjoint_1(A, G, axes, A_ndim, B_ndim)


defvjp(anp.inner, partial(grad_inner, 0), partial(grad_inner, 1))


def matmul_adjoint_0(B, G, A_meta, B_ndim):
    """Cotangent of the first ``matmul`` operand, un-broadcast to the metadata ``A_meta``."""
    if anp.ndim(G) == 0:  # A_ndim == B_ndim == 1
        return unbroadcast(G * B, A_meta)
    _, A_ndim, _, _ = A_meta
    if A_ndim == 1:
        G = anp.expand_dims(G, anp.ndim(G) - 1)
    if B_ndim == 1:  # The result we need is an outer product
        B = anp.expand_dims(B, 0)
        G = anp.expand_dims(G, anp.ndim(G))
    else:  # We need to swap the last two axes of B
        B = anp.swapaxes(B, B_ndim - 2, B_ndim - 1)
    result = anp.matmul(G, B)
    return unbroadcast(result, A_meta)


def matmul_adjoint_1(A, G, A_ndim, B_meta):
    """Cotangent of the second ``matmul`` operand, un-broadcast to the metadata ``B_meta``."""
    if anp.ndim(G) == 0:  # A_ndim == B_ndim == 1
        return unbroadcast(G * A, B_meta)
    _, B_ndim, _, _ = B_meta
    B_is_vec = B_ndim == 1
    if B_is_vec:
        G = anp.expand_dims(G, anp.ndim(G))
    if A_ndim == 1:  # The result we need is an outer product
        A = anp.expand_dims(A, 1)
        G = anp.expand_dims(G, anp.ndim(G) - 1)
    else:  # We need to swap the last two axes of A
        A = anp.swapaxes(A, A_ndim - 2, A_ndim - 1)
    result = anp.matmul(A, G)
    if B_is_vec:
        result = anp.squeeze(result, anp.ndim(G) - 1)
    return unbroadcast(result, B_meta)


def matmul_vjp_0(ans, A, B):
    """VJP maker for the first operand of ``anp.matmul``."""
    A_meta = anp.metadata(A)
    B_ndim = anp.ndim(B)
    return lambda g: matmul_adjoint_0(B, g, A_meta, B_ndim)


def matmul_vjp_1(ans, A, B):
    """VJP maker for the second operand of ``anp.matmul``."""
    A_ndim = anp.ndim(A)
    B_meta = anp.metadata(B)
    return lambda g: matmul_adjoint_1(A, g, A_ndim, B_meta)


defvjp(anp.matmul, matmul_vjp_0, matmul_vjp_1)


@primitive
def dot_adjoint_0(B, G, A_meta, B_meta):
    """Cotangent of the first ``dot`` operand, cast to A's dtype taken from ``A_meta``."""
    _, A_ndim, A_dtype, _ = A_meta
    _, B_ndim, _, _ = B_meta
    if B_ndim == 0 or B_ndim == 1 or A_ndim == 0:
        contract_num = max(0, B_ndim - (A_ndim != 0))
        out = onp.tensordot(G, B, contract_num)
    else:
        out = onp.tensordot(G, onp.swapaxes(B, -1, -2), B_ndim - 1)
    return onp.asarray(out, dtype=A_dtype)


@primitive
def dot_adjoint_1(A, G, A_meta, B_meta):
    """Cotangent of the second ``dot`` operand, cast to B's dtype taken from ``B_meta``."""
    _, A_ndim, _, _ = A_meta
    _, B_ndim, B_dtype, _ = B_meta
    needs_transpose = B_ndim > 1 and A_ndim != 0
    swap = (lambda x: onp.swapaxes(x, -1, -2)) if needs_transpose else (lambda x: x)
    if A_ndim == 0 or A_ndim == 1 or B_ndim == 0:
        contract_num = max(0, A_ndim - (B_ndim != 0))
        out = swap(onp.tensordot(G, A, contract_num))
    else:
        out = swap(onp.tensordot(G, A, [range(-A_ndim - B_ndim + 2, -B_ndim + 1), range(A_ndim - 1)]))
    return onp.asarray(out, dtype=B_dtype)


def dot_vjp_0(ans, A, B):
    """VJP maker for the first operand of ``anp.dot``."""
    A_meta, B_meta = anp.metadata(A), anp.metadata(B)
    return lambda g: match_complex(A, dot_adjoint_0(B, g, A_meta, B_meta))


def dot_vjp_1(ans, A, B):
    """VJP maker for the second operand of ``anp.dot``."""
    A_meta, B_meta = anp.metadata(A), anp.metadata(B)
    return lambda g: match_complex(B, dot_adjoint_1(A, g, A_meta, B_meta))


defvjp(anp.dot, dot_vjp_0, dot_vjp_1)

# VJPs of the adjoint primitives themselves (used for higher-order derivatives).
defvjp(
    dot_adjoint_0,
    lambda ans, B, g, An, Bn: lambda A: match_complex(B, dot_adjoint_1(A, g, An, Bn)),
    lambda ans, B, g, An, Bn: lambda A: match_complex(g, anp.dot(A, B)),
)

defvjp(
    dot_adjoint_1,
    lambda ans, A, g, An, Bn: lambda B: match_complex(A, dot_adjoint_0(B, g, An, Bn)),
    lambda ans, A, g, An, Bn: lambda B: match_complex(g, anp.dot(A, B)),
)


@primitive
def tensordot_adjoint_0(B, G, axes, A_ndim, B_ndim):
    # The adjoint of the operator
    # A |--> np.tensordot(A, B, axes)
    if B_ndim == 0:
        return G * B

    G_axes = onp.arange(onp.ndim(G))
    if type(axes) is int:
        axes = max(axes, 0)
        B_axes = onp.arange(B_ndim)
        return onp.tensordot(G, B, [G_axes[A_ndim - axes :], B_axes[axes:]])
    else:
        # Promote bare-int axis specifications to one-element lists.
        axes0 = [axes[0]] if type(axes[0]) is int else axes[0]
        axes1 = [axes[1]] if type(axes[1]) is int else axes[1]
        axes = [axes0, axes1]
        A_axes = onp.arange(A_ndim)
        B_axes = onp.arange(B_ndim)
        # The modulo maps negative axis indices onto non-negative ones.
        summed_axes = [
            onp.asarray(axes[0], dtype="int64") % A_ndim,
            onp.asarray(axes[1], dtype="int64") % B_ndim,
        ]
        other_axes = [onp.delete(A_axes, summed_axes[0]), onp.delete(B_axes, summed_axes[1])]
        out = onp.tensordot(G, B, [G_axes[len(other_axes[0]) :], other_axes[1]])
        perm = onp.argsort(onp.concatenate((other_axes[0], summed_axes[0][onp.argsort(summed_axes[1])])))
        return onp.transpose(out, perm)


@primitive
def tensordot_adjoint_1(A, G, axes, A_ndim, B_ndim):
    # The adjoint of the operator
    # B |--> np.tensordot(A, B, axes)
    if A_ndim == 0:
        return G * A

    G_axes = onp.arange(onp.ndim(G))
    if type(axes) is int:
        axes = max(axes, 0)
        A_axes = onp.arange(A_ndim)
        return onp.tensordot(A, G, [A_axes[: A_ndim - axes], G_axes[: A_ndim - axes]])
    else:
        # Promote bare-int axis specifications to one-element lists.
        axes0 = [axes[0]] if type(axes[0]) is int else axes[0]
        axes1 = [axes[1]] if type(axes[1]) is int else axes[1]
        axes = [axes0, axes1]
        A_axes = onp.arange(A_ndim)
        B_axes = onp.arange(B_ndim)
        # The modulo maps negative axis indices onto non-negative ones.
        summed_axes = [
            onp.asarray(axes[0], dtype="int64") % A_ndim,
            onp.asarray(axes[1], dtype="int64") % B_ndim,
        ]
        other_axes = [onp.delete(A_axes, summed_axes[0]), onp.delete(B_axes, summed_axes[1])]
        out = onp.tensordot(A, G, [other_axes[0], G_axes[: len(other_axes[0])]])
        perm = onp.argsort(onp.concatenate((summed_axes[1][onp.argsort(summed_axes[0])], other_axes[1])))
        return onp.transpose(out, perm)


def tensordot_vjp_0(ans, A, B, axes=2):
    """VJP maker for the first operand of ``anp.tensordot``."""
    A_ndim, B_ndim = anp.ndim(A), anp.ndim(B)
    return lambda G: match_complex(A, tensordot_adjoint_0(B, G, axes, A_ndim, B_ndim))


def tensordot_vjp_1(ans, A, B, axes=2):
    """VJP maker for the second operand of ``anp.tensordot``."""
    A_ndim, B_ndim = anp.ndim(A), anp.ndim(B)
    return lambda G: match_complex(B, tensordot_adjoint_1(A, G, axes, A_ndim, B_ndim))


defvjp(anp.tensordot, tensordot_vjp_0, tensordot_vjp_1)
defvjp(
    tensordot_adjoint_0,
    lambda ans, B, G, axes, An, Bn: lambda A: match_complex(B, tensordot_adjoint_1(A, G, axes, An, Bn)),
    lambda ans, B, G, axes, An, Bn: lambda A: match_complex(G, anp.tensordot(A, B, axes)),
)
defvjp(
    tensordot_adjoint_1,
    lambda ans, A, G, axes, An, Bn: lambda B: match_complex(A, tensordot_adjoint_0(B, G, axes, An, Bn)),
    lambda ans, A, G, axes, An, Bn: lambda B: match_complex(G, anp.tensordot(A, B, axes)),
)
defvjp(
    anp.outer,
    lambda ans, a, b: lambda g: match_complex(a, anp.dot(g, b.T)),
    lambda ans, a, b: lambda g: match_complex(b, anp.dot(a.T, g)),
)


def grad_concatenate_args(argnum, ans, axis_args, kwargs):
    """VJP maker for ``anp.concatenate_args``: slices out the part of ``g`` belonging to operand ``argnum``.

    ``axis_args[0]`` is the concatenation axis and the remaining entries are
    the concatenated arrays.
    """
    axis, args = axis_args[0], axis_args[1:]
    # Sizes along ``axis`` of every operand up to and including the target one.
    sizes = [anp.shape(a)[axis] for a in args[:argnum]]
    start = sum(sizes[:-1])
    idxs = [slice(None)] * ans.ndim
    idxs[axis] = slice(start, start + sizes[-1])
    return lambda g: g[tuple(idxs)]


defvjp_argnum(anp.concatenate_args, grad_concatenate_args)


def wrapped_reshape(x, *args, **kwargs):
    # The reshape method can be called like A.reshape((5,4)) or A.reshape(5,4).
    # The reshape function doesn't support both ways, so we have to wrap it.
    if isinstance(args[0], int):
        return anp.reshape(x, args, **kwargs)
    else:
        return anp.reshape(x, *args, **kwargs)


setattr(ArrayBox, "reshape", wrapped_reshape)


def grad_sort(ans, x, axis=-1, kind="quicksort", order=None):
    """VJP maker for ``anp.sort``; raises NotImplementedError for multi-dimensional ``x``."""
    # TODO: Cast input with np.asanyarray()
    if len(x.shape) > 1:
        raise NotImplementedError("Gradient of sort not implemented for multi-dimensional arrays.")
    sort_perm = anp.argsort(x, axis, kind, order)
    return lambda g: unpermuter(g, sort_perm)


defvjp(anp.sort, grad_sort)
if onp.lib.NumpyVersion(onp.__version__) < "2.0.0":
    defvjp(anp.msort, grad_sort)  # Until multi-D is allowed, these are the same.
def grad_partition(ans, x, kth, axis=-1, kind="introselect", order=None):
    """VJP maker for ``anp.partition``; raises NotImplementedError for inputs with more than one dimension."""
    # TODO: Cast input with np.asanyarray()
    if len(x.shape) > 1:
        raise NotImplementedError("Gradient of partition not implemented for multi-dimensional arrays.")
    partition_perm = anp.argpartition(x, kth, axis, kind, order)
    return lambda g: unpermuter(g, partition_perm)


defvjp(anp.partition, grad_partition)


def unpermuter(g, permutation):
    """Index ``g`` with the inverse of ``permutation``."""
    unsort = anp.zeros(len(permutation), dtype=int)
    # unsort[permutation[i]] = i, i.e. unsort is the inverse permutation.
    unsort[permutation] = list(range(len(permutation)))
    return g[unsort]


def grad_reshape_list(ans, *arys):
    """VJP maker for the ``atleast_*d`` family; only a single array argument is supported."""
    if len(arys) > 1:
        raise NotImplementedError("Can't handle multiple arguments yet.")
    return lambda g: anp.reshape(g, anp.shape(arys[0]))


defvjp(anp.atleast_1d, grad_reshape_list)
defvjp(anp.atleast_2d, grad_reshape_list)
defvjp(anp.atleast_3d, grad_reshape_list)


def grad_einsum(argnum, ans, operands_, kwargs):
    """VJP maker for ``anp.einsum`` with respect to ``operands_[argnum]``.

    Handles both the subscript-string convention and the
    ``(op0, sublist0, ..., sublistout)`` convention; the latter requires the
    output sublist to be given explicitly.
    """
    result_meta = anp.metadata(operands_[argnum])

    def vjp(g):
        operands = operands_
        if isinstance(operands[0], str):  # using "ijk" convention.
            in_subs, out_subs, _ = anp.parse_einsum_input(*operands)
            string, operands = operands[0], operands[1:]

            in_subs_list = in_subs.split(",")
            # operands_[0] is the subscript string, so array operands start at argnum 1.
            op_num = argnum - 1
            subs_wrt = in_subs_list[op_num]
            rest_of_ops = operands[:op_num] + operands[op_num + 1 :]
            rest_of_subs = in_subs_list[:op_num] + in_subs_list[op_num + 1 :]

            # subscripts that only appear in subs_wrt (and not in other subscript lists
            # or in the output) are implicitly being summed out, as if contracted
            # against a tensor of ones. we make that tensor of ones explicit to handle
            # the necessary vjp broadcasting inside einsum.
            other_named_subs = set("".join([out_subs] + rest_of_subs))
            naked_summed = [(i, sub) for i, sub in enumerate(subs_wrt) if sub not in other_named_subs]
            if naked_summed:
                naked_summed_dims, ones_subs = zip(*naked_summed)
                ones_subs = "".join(ones_subs)
                ones = onp.ones(onp.array(operands[op_num].shape)[list(naked_summed_dims)])
                new_input_subs = ",".join([out_subs, ones_subs] + rest_of_subs)
                new_operands = (g, ones) + rest_of_ops
            else:
                new_input_subs = ",".join([out_subs] + rest_of_subs)
                new_operands = (g,) + rest_of_ops

            # Contract g (labelled with the output subscripts) against the
            # remaining operands to produce something labelled like the operand.
            new_subscripts = new_input_subs + "->" + subs_wrt
            return unbroadcast(anp.einsum(new_subscripts, *new_operands), result_meta)
        else:  # using (op0, sublist0, op1, sublist1, ..., sublistout) convention
            if len(operands) % 2 == 0:
                raise NotImplementedError("Need sublistout argument")
            operands = list(operands)
            # g takes the place of the operand; the old output sublist labels g
            # and the operand's own sublist becomes the new output sublist.
            rest_of_ops = (
                [operands[-1]] + operands[:argnum] + operands[(argnum + 2) : -1] + [operands[argnum + 1]]
            )
            return unbroadcast_einsum(anp.einsum(g, *rest_of_ops), result_meta, operands[argnum + 1])

    return vjp


defvjp_argnum(anp.einsum, grad_einsum)

# diagonal and make_diagonal are registered as each other's VJP.
defvjp(
    anp.diagonal,
    lambda ans, A, offset=0, axis1=0, axis2=1: lambda g: anp.make_diagonal(g, offset, axis1, axis2),
)
defvjp(
    anp.make_diagonal,
    lambda ans, D, offset=0, axis1=0, axis2=1: lambda g: anp.diagonal(g, offset, axis1, axis2),
)


def match_complex(target, x):
    """Return ``x`` made real or complex so that its complexness matches ``target``."""
    target_iscomplex = anp.iscomplexobj(target)
    x_iscomplex = anp.iscomplexobj(x)
    if x_iscomplex and not target_iscomplex:
        return anp.real(x)
    elif not x_iscomplex and target_iscomplex:
        return x + 0j
    else:
        return x


def unbroadcast(x, target_meta, broadcast_idx=0):
    """Sum ``x`` down to the shape described by ``target_meta``.

    ``target_meta`` is a ``(shape, ndim, dtype, iscomplex)`` tuple.
    """
    target_shape, target_ndim, dtype, target_iscomplex = target_meta
    # Remove extra dimensions by summing over axis `broadcast_idx`.
    while anp.ndim(x) > target_ndim:
        x = anp.sum(x, axis=broadcast_idx)
    # Sum (keeping the axis) over every axis whose target size is 1.
    for axis, size in enumerate(target_shape):
        if size == 1:
            x = anp.sum(x, axis=axis, keepdims=True)
    if anp.iscomplexobj(x) and not target_iscomplex:
        x = anp.real(x)
    return x


def unbroadcast_f(target, f):
    target_meta = anp.metadata(target)
    return lambda g:
unbroadcast(f(g), target_meta)


def unbroadcast_einsum(x, target_meta, subscript):
    """Unbroadcast ``x`` along the position of ``Ellipsis`` in ``subscript``; a no-op when there is none."""
    if Ellipsis not in subscript:
        return x
    elif subscript[0] == Ellipsis:
        return unbroadcast(x, target_meta, 0)
    elif subscript[-1] == Ellipsis:
        return unbroadcast(x, target_meta, -1)
    else:
        return unbroadcast(x, target_meta, subscript.index(Ellipsis))


def balanced_eq(x, z, y):
    """Return ``(x == z) / (1.0 + (x == y))``: the indicator of ``x == z``, halved where ``x == y``."""
    return (x == z) / (1.0 + (x == y))


def replace_zero(x, val):
    """Return ``x`` with its falsy (zero) entries replaced by ``val``."""
    return anp.where(x, x, val)


# ----- extra functions used internally -----


def array_from_args_gradmaker(argnum, ans, args, kwargs):
    """VJP maker for ``array_from_args``: selects entry ``argnum - 2`` of ``g``."""
    return lambda g: g[argnum - 2]


defvjp_argnum(anp.array_from_args, array_from_args_gradmaker)


def array_from_scalar_or_array_gradmaker(ans, array_args, array_kwargs, scarray):
    """VJP maker for ``_array_from_scalar_or_array``.

    When ``ndmin`` exceeds the input's rank, the leading
    ``ndmin - ndim(scarray)`` axes are squeezed out of ``g``.
    """
    ndmin = array_kwargs.get("ndmin", 0)
    scarray_ndim = anp.ndim(scarray)
    if ndmin > scarray_ndim:
        return lambda g: anp.squeeze(g, axis=tuple(range(ndmin - scarray_ndim)))
    else:
        return lambda g: g


defvjp(anp._array_from_scalar_or_array, array_from_scalar_or_array_gradmaker, argnums=(2, 3))


@primitive
def untake(x, idx, vs):
    """Return a ``SparseObject`` on ``vs`` whose mutator adds ``x`` at ``idx`` in place."""
    # A list index (other than a list of slices) is converted to an int64 array.
    if isinstance(idx, list) and (len(idx) == 0 or not isinstance(idx[0], slice)):
        idx = onp.array(idx, dtype="int64")

    def mut_add(A):
        # In-place scatter-add of x into A at idx.
        onp.add.at(A, idx, x)
        return A

    return SparseObject(vs, mut_add)


defvjp(func(ArrayBox.__getitem__), lambda ans, A, idx: lambda g: untake(g, idx, vspace(A)))
defvjp(untake, lambda ans, x, idx, _: lambda g: g[idx])


def _unpad(array, width):
    """Slice ``width`` entries off the ends of ``array``.

    ``width`` may be a scalar, a shape-(1,) or shape-(2,) sequence, or one
    ``(before, after)`` pair per axis.
    """
    if anp.isscalar(width):
        width = [[width, width]]
    elif anp.shape(width) == (1,):
        width = [anp.concatenate((width, width))]
    elif anp.shape(width) == (2,):
        width = [width]
    # A single pair is repeated for every axis of `array`.
    if anp.shape(width)[0] == 1:
        width = anp.repeat(width, anp.ndim(array), 0)
    # `-u or None` turns an upper width of 0 into an open-ended slice.
    idxs = tuple(slice(l, -u or None) for l, u in width)
    return array[idxs]


def pad_vjp(ans, array, pad_width, mode, **kwargs):
    assert mode == "constant", "Only constant mode padding is supported."
return lambda g: _unpad(g, pad_width) defvjp(anp.pad, pad_vjp) ================================================ FILE: autograd/numpy/numpy_vspaces.py ================================================ import numpy as np from autograd.builtins import NamedTupleVSpace from autograd.extend import VSpace class ArrayVSpace(VSpace): def __init__(self, value): value = np.asarray(value) self.shape = value.shape self.dtype = value.dtype @property def size(self): return np.prod(self.shape) @property def ndim(self): return len(self.shape) def zeros(self): return np.zeros(self.shape, dtype=self.dtype) def ones(self): return np.ones(self.shape, dtype=self.dtype) def standard_basis(self): for idxs in np.ndindex(*self.shape): vect = np.zeros(self.shape, dtype=self.dtype) vect[idxs] = 1 yield vect def randn(self): return np.array(np.random.randn(*self.shape)).astype(self.dtype) def _inner_prod(self, x, y): return np.dot(np.ravel(x), np.ravel(y)) class ComplexArrayVSpace(ArrayVSpace): iscomplex = True @property def size(self): return np.prod(self.shape) * 2 def ones(self): return np.ones(self.shape, dtype=self.dtype) + 1.0j * np.ones(self.shape, dtype=self.dtype) def standard_basis(self): for idxs in np.ndindex(*self.shape): for v in [1.0, 1.0j]: vect = np.zeros(self.shape, dtype=self.dtype) vect[idxs] = v yield vect def randn(self): return np.array(np.random.randn(*self.shape)).astype(self.dtype) + 1.0j * np.array( np.random.randn(*self.shape) ).astype(self.dtype) def _inner_prod(self, x, y): return np.real(np.dot(np.conj(np.ravel(x)), np.ravel(y))) def _covector(self, x): return np.conj(x) VSpace.register(np.ndarray, lambda x: ComplexArrayVSpace(x) if np.iscomplexobj(x) else ArrayVSpace(x)) for type_ in [float, np.longdouble, np.float64, np.float32, np.float16]: ArrayVSpace.register(type_) for type_ in [complex, np.clongdouble, np.complex64, np.complex128]: ComplexArrayVSpace.register(type_) if np.lib.NumpyVersion(np.__version__) >= "2.0.0": class EigResultVSpace(NamedTupleVSpace): 
seq_type = np.linalg._linalg.EigResult class EighResultVSpace(NamedTupleVSpace): seq_type = np.linalg._linalg.EighResult class QRResultVSpace(NamedTupleVSpace): seq_type = np.linalg._linalg.QRResult class SlogdetResultVSpace(NamedTupleVSpace): seq_type = np.linalg._linalg.SlogdetResult class SVDResultVSpace(NamedTupleVSpace): seq_type = np.linalg._linalg.SVDResult EigResultVSpace.register(np.linalg._linalg.EigResult) EighResultVSpace.register(np.linalg._linalg.EighResult) QRResultVSpace.register(np.linalg._linalg.QRResult) SlogdetResultVSpace.register(np.linalg._linalg.SlogdetResult) SVDResultVSpace.register(np.linalg._linalg.SVDResult) elif np.__version__ >= "1.25": class EigResultVSpace(NamedTupleVSpace): seq_type = np.linalg.linalg.EigResult class EighResultVSpace(NamedTupleVSpace): seq_type = np.linalg.linalg.EighResult class QRResultVSpace(NamedTupleVSpace): seq_type = np.linalg.linalg.QRResult class SlogdetResultVSpace(NamedTupleVSpace): seq_type = np.linalg.linalg.SlogdetResult class SVDResultVSpace(NamedTupleVSpace): seq_type = np.linalg.linalg.SVDResult EigResultVSpace.register(np.linalg.linalg.EigResult) EighResultVSpace.register(np.linalg.linalg.EighResult) QRResultVSpace.register(np.linalg.linalg.QRResult) SlogdetResultVSpace.register(np.linalg.linalg.SlogdetResult) SVDResultVSpace.register(np.linalg.linalg.SVDResult) ================================================ FILE: autograd/numpy/numpy_wrapper.py ================================================ import warnings import numpy as _np import autograd.builtins as builtins from autograd.extend import notrace_primitive, primitive if _np.lib.NumpyVersion(_np.__version__) >= "2.0.0": from numpy._core.einsumfunc import _parse_einsum_input else: from numpy.core.einsumfunc import _parse_einsum_input numpy_version = _np.__version__ notrace_functions = [_np.ndim, _np.shape, _np.iscomplexobj, _np.result_type] def wrap_intdtype(cls): class IntdtypeSubclass(cls): __new__ = notrace_primitive(cls.__new__) return 
IntdtypeSubclass def wrap_namespace(old, new): unchanged_types = {float, int, type(None), type} int_types = {_np.int8, _np.int16, _np.int32, _np.int64, _np.integer} for name, obj in old.items(): if obj in notrace_functions: new[name] = notrace_primitive(obj) elif callable(obj) and type(obj) is not type: new[name] = primitive(obj) elif type(obj) is type and obj in int_types: new[name] = wrap_intdtype(obj) elif type(obj) in unchanged_types: new[name] = obj wrap_namespace(_np.__dict__, globals()) # ----- Special treatment of list-input functions ----- @primitive def concatenate_args(axis, *args): return _np.concatenate(args, axis).view(ndarray) concatenate = lambda arr_list, axis=0: concatenate_args(axis, *arr_list) vstack = row_stack = lambda tup: concatenate([atleast_2d(_m) for _m in tup], axis=0) def hstack(tup): arrs = [atleast_1d(_m) for _m in tup] if arrs[0].ndim == 1: return concatenate(arrs, 0) return concatenate(arrs, 1) def column_stack(tup): arrays = [] for v in tup: arr = array(v) if arr.ndim < 2: arr = array(arr, ndmin=2).T arrays.append(arr) return concatenate(arrays, 1) def array(A, *args, **kwargs): t = builtins.type(A) if t in (list, tuple): return array_from_args(args, kwargs, *map(array, A)) else: return _array_from_scalar_or_array(args, kwargs, A) def wrap_if_boxes_inside(raw_array, slow_op_name=None): if raw_array.dtype is _np.dtype("O"): if slow_op_name: warnings.warn(f"{slow_op_name} is slow for array inputs. 
np.concatenate() is faster.") return array_from_args((), {}, *raw_array.ravel()).reshape(raw_array.shape) else: return raw_array @primitive def _array_from_scalar_or_array(array_args, array_kwargs, scalar): return _np.array(scalar, *array_args, **array_kwargs) @primitive def array_from_args(array_args, array_kwargs, *args): return _np.array(args, *array_args, **array_kwargs) def select(condlist, choicelist, default=0): raw_array = _np.select(list(condlist), list(choicelist), default=default) return array(list(raw_array.ravel())).reshape(raw_array.shape) def stack(arrays, axis=0): # this code is basically copied from numpy/core/shape_base.py's stack # we need it here because we want to re-implement stack in terms of the # primitives defined in this file arrays = [array(arr) for arr in arrays] if not arrays: raise ValueError("need at least one array to stack") shapes = {arr.shape for arr in arrays} if len(shapes) != 1: raise ValueError("all input arrays must have the same shape") result_ndim = arrays[0].ndim + 1 if not -result_ndim <= axis < result_ndim: raise IndexError("axis {0} out of bounds [-{1}, {1})".format(axis, result_ndim)) if axis < 0: axis += result_ndim sl = (slice(None),) * axis + (None,) return concatenate([arr[sl] for arr in arrays], axis=axis) def append(arr, values, axis=None): # this code is basically copied from numpy/lib/function_base.py's append arr = array(arr) if axis is None: if ndim(arr) != 1: arr = ravel(arr) values = ravel(array(values)) axis = ndim(arr) - 1 return concatenate((arr, values), axis=axis) # ----- Enable functions called using [] ---- class r_class: def __getitem__(self, args): raw_array = _np.r_[args] return wrap_if_boxes_inside(raw_array, slow_op_name="r_") r_ = r_class() class c_class: def __getitem__(self, args): raw_array = _np.c_[args] return wrap_if_boxes_inside(raw_array, slow_op_name="c_") c_ = c_class() # ----- misc ----- @primitive def make_diagonal(D, offset=0, axis1=0, axis2=1): # Numpy doesn't offer a complement 
to np.diagonal: a function to create new # diagonal arrays with extra dimensions. We need such a function for the # gradient of np.diagonal and it's also quite handy to have. So here it is. if not (offset == 0 and axis1 == -1 and axis2 == -2): raise NotImplementedError("Currently make_diagonal only supports offset=0, axis1=-1, axis2=-2") # We use a trick: calling np.diagonal returns a view on the original array, # so we can modify it in-place. (only valid for numpy version >= 1.10.) new_array = _np.zeros(D.shape + (D.shape[-1],)) new_array_diag = _np.diagonal(new_array, offset=0, axis1=-1, axis2=-2) new_array_diag.flags.writeable = True new_array_diag[:] = D return new_array @notrace_primitive def metadata(A): return _np.shape(A), _np.ndim(A), _np.result_type(A), _np.iscomplexobj(A) @notrace_primitive def parse_einsum_input(*args): return _parse_einsum_input(args) if _np.lib.NumpyVersion(_np.__version__) >= "2.0.0": # Wrapped above _astype = astype else: @primitive def _astype(A, dtype, order="K", casting="unsafe", subok=True, copy=True): return A.astype(dtype, order, casting, subok, copy) ================================================ FILE: autograd/numpy/random.py ================================================ import numpy.random as npr from .numpy_wrapper import wrap_namespace wrap_namespace(npr.__dict__, globals()) ================================================ FILE: autograd/scipy/__init__.py ================================================ from . 
import integrate, signal, special, stats

================================================
FILE: autograd/scipy/integrate.py
================================================
import scipy.integrate

import autograd.numpy as np
from autograd import make_vjp
from autograd.builtins import tuple
from autograd.extend import defvjp_argnums, primitive
from autograd.misc import flatten

odeint = primitive(scipy.integrate.odeint)


def grad_odeint(yt, func, y0, t, func_args, **kwargs):
    """Build a function mapping ``g`` to the VJPs of ``odeint`` for all of its arguments.

    The returned function yields ``(None, vjp_y0, vjp_t, vjp_func_args)``;
    no VJP is produced for ``func`` itself.
    """
    # Extended from "Scalable Inference of Ordinary Differential
    # Equation Models of Biochemical Processes", Sec. 2.4.2
    # Fabian Froehlich, Carolin Loos, Jan Hasenauer, 2017
    # https://arxiv.org/abs/1711.08079

    T, D = np.shape(yt)
    flat_args, unflatten = flatten(func_args)

    def flat_func(y, t, flat_args):
        return func(y, t, *unflatten(flat_args))

    def unpack(x):
        #      y,      vjp_y,      vjp_t,    vjp_args
        return x[0:D], x[D : 2 * D], x[2 * D], x[2 * D + 1 :]

    def augmented_dynamics(augmented_state, t, flat_args):
        # Original system augmented with vjp_y, vjp_t and vjp_args.
        y, vjp_y, _, _ = unpack(augmented_state)
        vjp_all, dy_dt = make_vjp(flat_func, argnum=(0, 1, 2))(y, t, flat_args)
        vjp_y, vjp_t, vjp_args = vjp_all(-vjp_y)
        return np.hstack((dy_dt, vjp_y, vjp_t, vjp_args))

    def vjp_all(g):
        vjp_y = g[-1, :]
        vjp_t0 = 0
        time_vjp_list = []
        vjp_args = np.zeros(np.size(flat_args))

        # Walk the time points from last to first.
        for i in range(T - 1, 0, -1):
            # Compute effect of moving measurement time.
            vjp_cur_t = np.dot(func(yt[i, :], t[i], *func_args), g[i, :])
            time_vjp_list.append(vjp_cur_t)
            vjp_t0 = vjp_t0 - vjp_cur_t

            # Run augmented system backwards to the previous observation.
            aug_y0 = np.hstack((yt[i, :], vjp_y, vjp_t0, vjp_args))
            aug_ans = odeint(
                augmented_dynamics, aug_y0, np.array([t[i], t[i - 1]]), tuple((flat_args,)), **kwargs
            )
            _, vjp_y, vjp_t0, vjp_args = unpack(aug_ans[1])

            # Add gradient from current output.
            vjp_y = vjp_y + g[i - 1, :]

        # The list was filled from the last time point backwards, so reverse it.
        time_vjp_list.append(vjp_t0)
        vjp_times = np.hstack(time_vjp_list)[::-1]

        return None, vjp_y, vjp_times, unflatten(vjp_args)

    return vjp_all


def argnums_unpack(all_vjp_builder):
    """Adapt an all-arguments VJP builder to the ``(argnums, ans, args, kwargs)`` calling convention."""
    # A generic autograd helper function.  Takes a function that
    # builds vjps for all arguments, and wraps it to return only required vjps.
    def build_selected_vjps(argnums, ans, combined_args, kwargs):
        vjp_func = all_vjp_builder(ans, *combined_args, **kwargs)

        def chosen_vjps(g):  # Returns whichever vjps were asked for.
            all_vjps = vjp_func(g)
            return [all_vjps[argnum] for argnum in argnums]

        return chosen_vjps

    return build_selected_vjps


defvjp_argnums(odeint, argnums_unpack(grad_odeint))

================================================
FILE: autograd/scipy/linalg.py
================================================
from functools import partial

import scipy.linalg

import autograd.numpy as anp
from autograd.extend import defjvp, defjvp_argnums, defvjp, defvjp_argnums
from autograd.numpy.numpy_wrapper import wrap_namespace

wrap_namespace(scipy.linalg.__dict__, globals())  # populates module namespace


def _vjp_sqrtm(ans, A, disp=True, blocksize=64):
    """VJP maker for ``sqrtm``; only ``disp=True`` is supported."""
    assert disp, "sqrtm vjp not implemented for disp=False"
    ans_transp = anp.transpose(ans)

    def vjp(g):
        return anp.real(solve_sylvester(ans_transp, ans_transp, g))

    return vjp


defvjp(sqrtm, _vjp_sqrtm)


def _flip(a, trans):
    """Return the flipped ``trans`` flag: "N"/0 becomes "H" (complex ``a``) or "T" (real ``a``); anything else becomes "N"."""
    if anp.iscomplexobj(a):
        return "H" if trans in ("N", 0) else "N"
    else:
        return "T" if trans in ("N", 0) else "N"


def grad_solve_triangular(ans, a, b, trans=0, lower=False, **kwargs):
    """VJP maker for ``solve_triangular`` with respect to the matrix ``a``."""
    tri = anp.tril if (lower ^ (_flip(a, trans) == "N")) else anp.triu
    transpose = lambda x: x if _flip(a, trans) != "N" else x.T
    # Promote 1-D vectors to column matrices.
    al2d = lambda x: x if x.ndim > 1 else x[..., None]

    def vjp(g):
        v = al2d(solve_triangular(a, g, trans=_flip(a, trans), lower=lower))
        return -transpose(tri(anp.dot(v, al2d(ans).T)))

    return vjp


defvjp(
    solve_triangular,
    grad_solve_triangular,
    lambda ans, a, b, trans=0, lower=False, **kwargs: (
        lambda g:
solve_triangular(a, g, trans=_flip(a, trans), lower=lower)
    ),
)


def grad_solve_banded(argnum, ans, l_and_u, a, b):
    """VJP maker for ``solve_banded`` with respect to ``a`` (argnum 1) or ``b`` (argnum 2)."""
    # Add a trailing axis when x has a different rank than a.
    updim = lambda x: x if x.ndim == a.ndim else x[..., None]

    def transpose_banded(l_and_u, a):
        # Compute the transpose of a banded matrix.
        # The transpose is itself a banded matrix.

        num_rows = a.shape[0]

        shifts = anp.arange(-l_and_u[1], l_and_u[0] + 1)

        T_a = anp.roll(a[:1, :], shifts[0])
        for rr in range(1, num_rows):
            T_a = anp.vstack([T_a, anp.flipud(anp.roll(a[rr : rr + 1, :], shifts[rr]))])
        T_a = anp.flipud(T_a)

        # The lower and upper band counts swap roles.
        T_l_and_u = anp.flip(l_and_u)

        return T_l_and_u, T_a

    def banded_dot(l_and_u, uu, vv):
        # Compute tensor product of vectors uu and vv.
        # Tensor product elements are restricted to the bands specified by l_and_u.

        # TODO: replace the brute-force ravel() by smarter dimension handling of uu and vv

        # main diagonal
        banded_uv = anp.ravel(uu) * anp.ravel(vv)

        # stack below the sub-diagonals
        for rr in range(1, l_and_u[0] + 1):
            banded_uv_rr = anp.hstack([anp.ravel(uu)[rr:] * anp.ravel(vv)[:-rr], anp.zeros(rr)])
            banded_uv = anp.vstack([banded_uv, banded_uv_rr])

        # stack above the sup-diagonals
        for rr in range(1, l_and_u[1] + 1):
            banded_uv_rr = anp.hstack([anp.zeros(rr), anp.ravel(uu)[:-rr] * anp.ravel(vv)[rr:]])
            banded_uv = anp.vstack([banded_uv_rr, banded_uv])

        return banded_uv

    T_l_and_u, T_a = transpose_banded(l_and_u, a)

    if argnum == 1:
        return lambda g: (
            -banded_dot(l_and_u, updim(solve_banded(T_l_and_u, T_a, g)), anp.transpose(updim(ans)))
        )
    elif argnum == 2:
        return lambda g: solve_banded(T_l_and_u, T_a, g)


defvjp(solve_banded, partial(grad_solve_banded, 1), partial(grad_solve_banded, 2), argnums=[1, 2])


def _jvp_sqrtm(dA, ans, A, disp=True, blocksize=64):
    """JVP of ``sqrtm``; only ``disp=True`` is supported."""
    assert disp, "sqrtm jvp not implemented for disp=False"
    return solve_sylvester(ans, ans, dA)


defjvp(sqrtm, _jvp_sqrtm)


def _jvp_sylvester(argnums, dms, ans, args, _):
    """JVP of ``solve_sylvester`` for whichever of ``(a, b, q)`` appear in ``argnums``."""
    a, b, q = args
    # `dms` holds tangents only for the requested argnums, in order; tangents
    # that were not requested are taken to be 0.
    if 0 in argnums:
        da = dms[0]
        db = dms[1] if 1 in argnums else 0
    else:
        da = 0
        db = dms[0] if 1 in argnums else 0
    dq = dms[-1] if 2 in argnums else 0
    rhs = dq - anp.dot(da, ans) - anp.dot(ans, db)
    return solve_sylvester(a, b, rhs)


defjvp_argnums(solve_sylvester, _jvp_sylvester)


def _vjp_sylvester(argnums, ans, args, _):
    """VJP maker for ``solve_sylvester``; returns one VJP per entry of ``argnums``."""
    a, b, q = args

    def vjp(g):
        vjps = []
        # The VJP for q is a Sylvester solve with transposed coefficients;
        # the VJPs for a and b are products of it with the transposed answer.
        q_vjp = solve_sylvester(anp.transpose(a), anp.transpose(b), g)
        if 0 in argnums:
            vjps.append(-anp.dot(q_vjp, anp.transpose(ans)))
        if 1 in argnums:
            vjps.append(-anp.dot(anp.transpose(ans), q_vjp))
        if 2 in argnums:
            vjps.append(q_vjp)
        return tuple(vjps)

    return vjp


defvjp_argnums(solve_sylvester, _vjp_sylvester)

================================================
FILE: autograd/scipy/signal.py
================================================
from functools import partial

import numpy as npo  # original numpy
from numpy.lib.stride_tricks import as_strided

import autograd.numpy as np
from autograd.extend import defvjp, primitive


@primitive
def convolve(A, B, axes=None, dot_axes=[(), ()], mode="full"):
    assert mode in ["valid", "full"], f"Mode {mode} not yet implemented"
    if axes is None:
        axes = [list(range(A.ndim)), list(range(A.ndim))]
    wrong_order = any([B.shape[ax_B] < A.shape[ax_A] for ax_A, ax_B in zip(*axes)])
    if wrong_order:
        if mode == "valid" and not all([B.shape[ax_B] <= A.shape[ax_A] for ax_A, ax_B in zip(*axes)]):
            raise Exception("One array must be larger than the other along all convolved dimensions")
        elif mode != "full" or B.size <= A.size:  # Tie breaker
            i1 = B.ndim - len(dot_axes[1]) - len(axes[1])  # B ignore
            i2 = i1 + A.ndim - len(dot_axes[0]) - len(axes[0])  # A ignore
            i3 = i2 + len(axes[0])
            ignore_B = list(range(i1))
            ignore_A = list(range(i1, i2))
            conv = list(range(i2, i3))
            return convolve(B, A, axes=axes[::-1], dot_axes=dot_axes[::-1], mode=mode).transpose(
                ignore_A + ignore_B + conv
            )

    if mode == "full":
        B = pad_to_full(B, A, axes[::-1])
    B_view_shape = list(B.shape)
    B_view_strides = list(B.strides)
    flipped_idxs = [slice(None)] * A.ndim
    for ax_A, ax_B in zip(*axes):
        B_view_shape.append(abs(B.shape[ax_B] - A.shape[ax_A]) + 1)
B_view_strides.append(B.strides[ax_B]) B_view_shape[ax_B] = A.shape[ax_A] flipped_idxs[ax_A] = slice(None, None, -1) B_view = as_strided(B, B_view_shape, B_view_strides) A_view = A[tuple(flipped_idxs)] all_axes = [list(axes[i]) + list(dot_axes[i]) for i in [0, 1]] return einsum_tensordot(A_view, B_view, all_axes) def einsum_tensordot(A, B, axes, reverse=False): # Does tensor dot product using einsum, which shouldn't require a copy. A_axnums = list(range(A.ndim)) B_axnums = list(range(A.ndim, A.ndim + B.ndim)) sum_axnum = A.ndim + B.ndim for i_sum, (i_A, i_B) in enumerate(zip(*axes)): A_axnums[i_A] = sum_axnum + i_sum B_axnums[i_B] = sum_axnum + i_sum return npo.einsum(A, A_axnums, B, B_axnums) def pad_to_full(A, B, axes): A_pad = [(0, 0)] * A.ndim for ax_A, ax_B in zip(*axes): A_pad[ax_A] = (B.shape[ax_B] - 1,) * 2 return npo.pad(A, A_pad, mode="constant") def parse_axes(A_shape, B_shape, conv_axes, dot_axes, mode): A_ndim, B_ndim = len(A_shape), len(B_shape) if conv_axes is None: conv_axes = ( tuple(range(A_ndim)), tuple(range(A_ndim)), ) axes = { "A": { "conv": tuple(conv_axes[0]), "dot": tuple(dot_axes[0]), "ignore": tuple(i for i in range(A_ndim) if i not in conv_axes[0] and i not in dot_axes[0]), }, "B": { "conv": tuple(conv_axes[1]), "dot": tuple(dot_axes[1]), "ignore": tuple(i for i in range(B_ndim) if i not in conv_axes[1] and i not in dot_axes[1]), }, } assert len(axes["A"]["dot"]) == len(axes["B"]["dot"]) assert len(axes["A"]["conv"]) == len(axes["B"]["conv"]) i1 = len(axes["A"]["ignore"]) i2 = i1 + len(axes["B"]["ignore"]) i3 = i2 + len(axes["A"]["conv"]) axes["out"] = { "ignore_A": tuple(range(i1)), "ignore_B": tuple(range(i1, i2)), "conv": tuple(range(i2, i3)), } conv_shape = ( compute_conv_size(A_shape[i], B_shape[j], mode) for i, j in zip(axes["A"]["conv"], axes["B"]["conv"]) ) shapes = { "A": {s: tuple(A_shape[i] for i in ax) for s, ax in axes["A"].items()}, "B": {s: tuple(B_shape[i] for i in ax) for s, ax in axes["B"].items()}, } shapes["out"] = { 
"ignore_A": shapes["A"]["ignore"], "ignore_B": shapes["B"]["ignore"], "conv": conv_shape, } return axes, shapes def compute_conv_size(A_size, B_size, mode): if mode == "full": return A_size + B_size - 1 elif mode == "same": return A_size elif mode == "valid": return abs(A_size - B_size) + 1 else: raise Exception(f"Mode {mode} not recognized") def flipped_idxs(ndim, axes): new_idxs = [slice(None)] * ndim for ax in axes: new_idxs[ax] = slice(None, None, -1) return tuple(new_idxs) def grad_convolve(argnum, ans, A, B, axes=None, dot_axes=[(), ()], mode="full"): assert mode in ["valid", "full"], f"Grad for mode {mode} not yet implemented" axes, shapes = parse_axes(A.shape, B.shape, axes, dot_axes, mode) if argnum == 0: X, Y = A, B _X_, _Y_ = "A", "B" ignore_Y = "ignore_B" elif argnum == 1: X, Y = B, A _X_, _Y_ = "B", "A" ignore_Y = "ignore_A" else: raise NotImplementedError(f"Can't take grad of convolve w.r.t. arg {argnum}") if mode == "full": new_mode = "valid" else: if any([x_size > y_size for x_size, y_size in zip(shapes[_X_]["conv"], shapes[_Y_]["conv"])]): new_mode = "full" else: new_mode = "valid" def vjp(g): result = convolve( g, Y[flipped_idxs(Y.ndim, axes[_Y_]["conv"])], axes=[axes["out"]["conv"], axes[_Y_]["conv"]], dot_axes=[axes["out"][ignore_Y], axes[_Y_]["ignore"]], mode=new_mode, ) new_order = npo.argsort(axes[_X_]["ignore"] + axes[_X_]["dot"] + axes[_X_]["conv"]) return np.transpose(result, new_order) return vjp defvjp(convolve, partial(grad_convolve, 0), partial(grad_convolve, 1)) ================================================ FILE: autograd/scipy/special.py ================================================ import scipy.special import autograd.numpy as np from autograd.extend import defjvp, defvjp, primitive from autograd.numpy.numpy_vjps import repeat_to_match_shape, unbroadcast_f ### Beta function ### beta = primitive(scipy.special.beta) betainc = primitive(scipy.special.betainc) betaln = primitive(scipy.special.betaln) defvjp( beta, lambda ans, a, 
b: unbroadcast_f(a, lambda g: g * ans * (psi(a) - psi(a + b))),
    lambda ans, a, b: unbroadcast_f(b, lambda g: g * ans * (psi(b) - psi(a + b))),
)
# betainc: a VJP is registered for x (argument 2) only.
defvjp(
    betainc,
    lambda ans, a, b, x: unbroadcast_f(
        x, lambda g: g * np.power(x, a - 1) * np.power(1 - x, b - 1) / beta(a, b)
    ),
    argnums=[2],
)
defvjp(
    betaln,
    lambda ans, a, b: unbroadcast_f(a, lambda g: g * (psi(a) - psi(a + b))),
    lambda ans, a, b: unbroadcast_f(b, lambda g: g * (psi(b) - psi(a + b))),
)

### Gamma functions ###
polygamma = primitive(scipy.special.polygamma)
psi = primitive(scipy.special.psi)  # psi(x) is just polygamma(0, x)
digamma = primitive(scipy.special.digamma)  # digamma is another name for psi.
gamma = primitive(scipy.special.gamma)
gammaln = primitive(scipy.special.gammaln)
gammainc = primitive(scipy.special.gammainc)
gammaincc = primitive(scipy.special.gammaincc)
gammasgn = primitive(scipy.special.gammasgn)
rgamma = primitive(scipy.special.rgamma)
multigammaln = primitive(scipy.special.multigammaln)

# A None entry means no VJP is supplied for that argument position.
defvjp(gammasgn, None)
defvjp(polygamma, None, lambda ans, n, x: lambda g: g * polygamma(n + 1, x))
defvjp(psi, lambda ans, x: lambda g: g * polygamma(1, x))
defvjp(digamma, lambda ans, x: lambda g: g * polygamma(1, x))
defvjp(gamma, lambda ans, x: lambda g: g * ans * psi(x))
defvjp(gammaln, lambda ans, x: lambda g: g * psi(x))
defvjp(rgamma, lambda ans, x: lambda g: g * psi(x) / -gamma(x))
defvjp(
    multigammaln,
    lambda ans, a, d: lambda g: g * np.sum(digamma(np.expand_dims(a, -1) - np.arange(d) / 2.0), -1),
    None,
)


def make_gammainc_vjp_arg1(sign):
    """Return a VJP maker w.r.t. ``x`` for ``gammainc`` (sign=1) or ``gammaincc`` (sign=-1)."""

    def gammainc_vjp_arg1(ans, a, x):
        coeffs = sign * np.exp(-x) * np.power(x, a - 1) / gamma(a)
        return unbroadcast_f(x, lambda g: g * coeffs)

    return gammainc_vjp_arg1


defvjp(gammainc, make_gammainc_vjp_arg1(1), argnums=[1])
defvjp(gammaincc, make_gammainc_vjp_arg1(-1), argnums=[1])

### Bessel functions ###

j0 = primitive(scipy.special.j0)
y0 = primitive(scipy.special.y0)
j1 = primitive(scipy.special.j1)
y1 = primitive(scipy.special.y1)
jn = primitive(scipy.special.jn)
yn = primitive(scipy.special.yn)

defvjp(j0, lambda ans, x: lambda g: -g * j1(x))
defvjp(y0, lambda ans, x: lambda g: -g * y1(x))
defvjp(j1, lambda ans, x: lambda g: g * (j0(x) - jn(2, x)) / 2.0)
defvjp(y1, lambda ans, x: lambda g: g * (y0(x) - yn(2, x)) / 2.0)
# For jn / yn only the VJP with respect to x is given; the order n gets None.
defvjp(jn, None, lambda ans, n, x: lambda g: g * (jn(n - 1, x) - jn(n + 1, x)) / 2.0)
defvjp(yn, None, lambda ans, n, x: lambda g: g * (yn(n - 1, x) - yn(n + 1, x)) / 2.0)

### Faster versions of common Bessel functions ###
i0 = primitive(scipy.special.i0)
i1 = primitive(scipy.special.i1)
iv = primitive(scipy.special.iv)
ive = primitive(scipy.special.ive)

defvjp(i0, lambda ans, x: lambda g: g * i1(x))
defvjp(i1, lambda ans, x: lambda g: g * (i0(x) + iv(2, x)) / 2.0)
defvjp(iv, None, lambda ans, n, x: lambda g: g * (iv(n - 1, x) + iv(n + 1, x)) / 2.0)
defvjp(ive, None, lambda ans, n, x: lambda g: g * (ans * (n / x - np.sign(x)) + ive(n + 1, x)))

### Error Function ###
inv_root_pi = 0.56418958354775627928
erf = primitive(scipy.special.erf)
erfc = primitive(scipy.special.erfc)

defvjp(erf, lambda ans, x: lambda g: 2.0 * g * inv_root_pi * np.exp(-(x**2)))
defvjp(erfc, lambda ans, x: lambda g: -2.0 * g * inv_root_pi * np.exp(-(x**2)))

### Inverse error function ###
root_pi = 1.7724538509055159
erfinv = primitive(scipy.special.erfinv)
erfcinv = primitive(scipy.special.erfcinv)

defvjp(erfinv, lambda ans, x: lambda g: g * root_pi / 2 * np.exp(erfinv(x) ** 2))
defvjp(erfcinv, lambda ans, x: lambda g: -g * root_pi / 2 * np.exp(erfcinv(x) ** 2))

### Logit and Expit ###
logit = primitive(scipy.special.logit)
expit = primitive(scipy.special.expit)

defvjp(logit, lambda ans, x: lambda g: g / (x * (1 - x)))
defvjp(expit, lambda ans, x: lambda g: g * ans * (1 - ans))

### logsumexp ###
logsumexp = primitive(scipy.special.logsumexp)


def make_grad_logsumexp(ans, x, axis=None, b=1.0, keepdims=False):
    """VJP maker for ``logsumexp`` with respect to ``x``."""
    shape, dtype = np.shape(x), np.result_type(x)

    def vjp(g):
        # Expand g and ans back to the shape of x before combining them.
        g_repeated, _ = repeat_to_match_shape(g, shape, dtype, axis, keepdims)
        ans_repeated, _ = repeat_to_match_shape(ans, shape, dtype, axis, keepdims)
        return g_repeated * b * np.exp(x - ans_repeated)

    return vjp


defvjp(logsumexp, make_grad_logsumexp)


def fwd_grad_logsumexp(g, ans, x, axis=None, b=1.0, keepdims=False):
    """JVP of ``logsumexp`` with respect to ``x``."""
    if not keepdims:
        # Re-insert the reduced axes so that ans broadcasts against x.
        if isinstance(axis, int):
            ans = np.expand_dims(ans, axis)
        elif isinstance(axis, tuple):
            for ax in sorted(axis):
                ans = np.expand_dims(ans, ax)
    return np.sum(g * b * np.exp(x - ans), axis=axis, keepdims=keepdims)


defjvp(logsumexp, fwd_grad_logsumexp)

================================================
FILE: autograd/scipy/stats/__init__.py
================================================
from . import beta, chi2, gamma, norm, poisson, t

# Try block needed in case the user has an
# old version of scipy without multivariate normal.
try:
    from . import multivariate_normal
except AttributeError:
    pass

try:
    from . import dirichlet
except AttributeError:
    pass

================================================
FILE: autograd/scipy/stats/beta.py
================================================
import scipy.stats

import autograd.numpy as np
from autograd.extend import defvjp, primitive
from autograd.numpy.numpy_vjps import unbroadcast_f
from autograd.scipy.special import beta, psi

cdf = primitive(scipy.stats.beta.cdf)
logpdf = primitive(scipy.stats.beta.logpdf)
pdf = primitive(scipy.stats.beta.pdf)


def grad_beta_logpdf_arg0(x, a, b):
    """Derivative factor used for the beta ``logpdf`` VJP with respect to ``x``."""
    return (1 + a * (x - 1) + x * (b - 2)) / (x * (x - 1))


def grad_beta_logpdf_arg1(x, a, b):
    """Derivative factor used for the beta ``logpdf`` VJP with respect to ``a``."""
    return np.log(x) - psi(a) + psi(a + b)


def grad_beta_logpdf_arg2(x, a, b):
    """Derivative factor used for the beta ``logpdf`` VJP with respect to ``b``."""
    return np.log1p(-x) - psi(b) + psi(a + b)


# cdf: a VJP is registered for x (argument 0) only.
defvjp(
    cdf,
    lambda ans, x, a, b: unbroadcast_f(
        x, lambda g: g * np.power(x, a - 1) * np.power(1 - x, b - 1) / beta(a, b)
    ),
    argnums=[0],
)
defvjp(
    logpdf,
    lambda ans, x, a, b: unbroadcast_f(x, lambda g: g * grad_beta_logpdf_arg0(x, a, b)),
    lambda ans, x, a, b: unbroadcast_f(a, lambda g: g * grad_beta_logpdf_arg1(x, a, b)),
    lambda ans, x, a, b: unbroadcast_f(b, lambda g:
def grad_chi2_logpdf(x, df):
    """Slope of the chi-squared log-density with respect to ``x``.

    Evaluates ``(df - x - 2) / (2 * x)`` wherever ``df`` is a whole number and
    yields 0 everywhere else.
    """
    whole_df = df % 1 == 0
    slope = (df - x - 2) / (2 * x)
    return np.where(whole_df, slope, 0)
def grad_gamma_logpdf_arg0(x, a):
    """Partial derivative of the gamma log-density with respect to ``x``."""
    numerator = a - x - 1
    return numerator / x


def grad_gamma_logpdf_arg1(x, a):
    """Partial derivative of the gamma log-density with respect to the shape ``a``."""
    log_x = np.log(x)
    return log_x - psi(a)
def generalized_outer_product(x):
    """Outer product of ``x`` with itself.

    A 1-D input gives the ordinary outer product; anything else is treated as
    a stack of matrices and multiplied by its own transpose over the last two
    axes.
    """
    is_plain_vector = np.ndim(x) == 1
    return np.outer(x, x) if is_plain_vector else np.matmul(x, np.swapaxes(x, -1, -2))


def covgrad(x, mean, cov, allow_singular=False):
    """Derivative of the multivariate normal log-density with respect to ``cov``.

    Raises:
        NotImplementedError: if ``allow_singular`` is true.
    """
    if allow_singular:
        raise NotImplementedError(
            "The multivariate normal pdf is not differentiable w.r.t. a singular covariance matix"
        )
    precision = np.linalg.inv(cov)
    # Column-vector form of the residual so it can be left-multiplied.
    residual = np.expand_dims(x - mean, -1)
    whitened = np.matmul(precision, residual)
    return 0.5 * (generalized_outer_product(whitened) - precision)


def solve(allow_singular):
    """Pick the linear solver to use: pseudo-inverse based when singular
    covariances are allowed, ``np.linalg.solve`` otherwise."""
    if not allow_singular:
        return np.linalg.solve

    def pinv_solve(A, x):
        return np.dot(np.linalg.pinv(A), x)

    return pinv_solve
# Reverse-mode derivative rules for ``pdf`` (the traced scipy.stats.norm.pdf),
# one rule per positional argument in the order (x, loc, scale).  In each rule
# ``ans`` is the already-computed density and ``g`` is the incoming cotangent.
# NOTE(review): unbroadcast_f comes from autograd.numpy.numpy_vjps and is not
# visible here; presumably it reduces the result back to the shape of its
# first argument after broadcasting -- confirm against that module.
defvjp(
    pdf,
    # d pdf / d x = -pdf * (x - loc) / scale**2
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(x, lambda g: -g * ans * (x - loc) / scale**2),
    # d pdf / d loc: same expression with the opposite sign.
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(loc, lambda g: g * ans * (x - loc) / scale**2),
    # d pdf / d scale = pdf * (z**2 - 1) / scale, with z = (x - loc) / scale
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(
        scale, lambda g: g * ans * (((x - loc) / scale) ** 2 - 1.0) / scale
    ),
)
def grad_poisson_logpmf(k, mu):
    """Slope of the Poisson log-mass with respect to the rate ``mu``.

    Evaluates ``k / mu - 1`` wherever ``k`` is a whole number and yields 0
    everywhere else.
    """
    whole_count = k % 1 == 0
    slope = k / mu - 1
    return np.where(whole_count, slope, 0)
def grad_tlogpdf_diff(diff, df):
    """Derivative of the t log-density with respect to the standardized
    residual ``diff``."""
    numerator = -diff * (1.0 + df)
    denominator = diff**2 + df
    return numerator / denominator


def grad_tlogpdf_x(x, df, loc, scale):
    """Derivative of the t log-density with respect to ``x``."""
    standardized = (x - loc) / scale
    return grad_tlogpdf_diff(standardized, df) / scale


def grad_tlogpdf_loc(x, df, loc, scale):
    """Derivative of the t log-density with respect to ``loc``."""
    standardized = (x - loc) / scale
    return -grad_tlogpdf_diff(standardized, df) / scale


def grad_tlogpdf_scale(x, df, loc, scale):
    """Derivative of the t log-density with respect to ``scale``."""
    diff = x - loc
    numerator = df * (scale**2 - diff**2)
    denominator = scale * (df * scale**2 + diff**2)
    return -numerator / denominator


def grad_tlogpdf_df(x, df, loc, scale):
    """Derivative of the t log-density with respect to the degrees of freedom."""
    y = (x - loc) / scale
    ratio_term = (y**2 * (df + 1)) / (df * (y**2 + df))
    log_term = np.log(y**2 / df + 1)
    inverse_df = 1.0 / df
    return 0.5 * (ratio_term - log_term - inverse_df - psi(df / 2.0) + psi((df + 1) / 2.0))
TOL = 1e-6
RTOL = 1e-6


def scalar_close(a, b):
    """Return whether two scalars agree to absolute or relative tolerance.

    The values are close when ``|a - b| < TOL``, or when ``|a - b|`` relative
    to ``|a + b|`` is below ``RTOL``.
    """
    gap = abs(a - b)
    if gap < TOL:
        return True
    magnitude = abs(a + b)
    # a == -b with a gap above TOL: the relative measure is undefined, and the
    # values are clearly different, so report "not close" instead of dividing
    # by zero.
    if magnitude == 0:
        return False
    return gap / magnitude < RTOL
@unary_to_nary
def check_grads(f, x, modes=("fwd", "rev"), order=2):
    """Check the derivatives of ``f`` at ``x`` up to the given order.

    For each requested mode the first derivative is checked with
    ``check_jvp`` ("fwd") or ``check_vjp`` ("rev").  When ``order`` is above
    1, the derivative itself is turned into a function of ``(x, v)`` and
    checked recursively with ``order - 1``.

    Because of the ``unary_to_nary`` decorator, callers use this as
    ``check_grads(fun, argnum, modes, order=...)(*args)``.

    Args:
        f: Function of a single argument.
        x: Point at which to check the derivatives.
        modes: Iterable containing "fwd" and/or "rev".
        order: Highest derivative order to check.

    Raises:
        AssertionError: if ``modes`` holds an unknown mode or a check fails.
    """
    assert all(m in ("fwd", "rev") for m in modes)
    if "fwd" in modes:
        check_jvp(f, x)
        if order > 1:

            def jvp_of_f(x, v):
                return make_jvp(f, x)(v)[1]

            jvp_of_f.__name__ = f"jvp_{get_name(f)}"
            v = vspace(x).randn()
            check_grads(jvp_of_f, (0, 1), modes, order=order - 1)(x, v)
    if "rev" in modes:
        check_vjp(f, x)
        if order > 1:

            def vjp_of_f(x, v):
                return make_vjp(f, x)[0](v)

            vjp_of_f.__name__ = f"vjp_{get_name(f)}"
            # The cotangent lives in the output space of f, hence vspace(f(x)).
            v = vspace(f(x)).randn()
            check_grads(vjp_of_f, (0, 1), modes, order=order - 1)(x, v)
class TraceStack:
    """Counter that hands out nesting levels to active traces.

    ``top`` is the level of the innermost trace currently open, or -1 when no
    trace is open.
    """

    def __init__(self):
        self.top = -1

    @contextmanager
    def new_trace(self):
        """Open a new innermost trace and yield its level.

        The level is released when the ``with`` block exits, including when it
        exits through an exception, so a failing traced function cannot leave
        ``top`` permanently raised.
        """
        self.top += 1
        try:
            yield self.top
        finally:
            self.top -= 1
def subvals(x, ivs):
    """Return ``x`` as a tuple with each ``(index, value)`` pair in ``ivs``
    substituted in."""
    items = list(x)
    for index, value in ivs:
        items[index] = value
    return tuple(items)


def subval(x, i, v):
    """Return ``x`` as a tuple with position ``i`` replaced by ``v``."""
    items = list(x)
    items[i] = v
    return tuple(items)


def func(f):
    """Identity: hand back ``f`` untouched."""
    return f


def toposort(end_node, parents=operator.attrgetter("parents")):
    """Yield the nodes reachable from ``end_node``, each one only after every
    node that lists it as a parent has been yielded.

    ``parents`` maps a node to an iterable of its parent nodes.
    """
    # First sweep: count how many times each node is reached as a parent
    # (the end node is counted once for the initial visit).
    pending = {}
    frontier = [end_node]
    while frontier:
        current = frontier.pop()
        if current in pending:
            pending[current] += 1
            continue
        pending[current] = 1
        frontier.extend(parents(current))

    # Second sweep: a node becomes ready once its last child has been emitted.
    ready = [end_node]
    while ready:
        current = ready.pop()
        yield current
        for parent in parents(current):
            if pending[parent] == 1:
                ready.append(parent)
            else:
                pending[parent] -= 1
def wraps(fun, namestr="{fun}", docstr="{doc}", **kwargs):
    """Build a decorator that copies a formatted name and docstring onto a function.

    ``namestr`` and ``docstr`` are ``str.format`` templates.  They may use
    ``{fun}`` (the name of ``fun``), ``{doc}`` (the docstring of ``fun``,
    available to ``docstr`` only) and any extra keyword passed in ``kwargs``.

    The renaming is best effort: if formatting or attribute assignment fails,
    the decorated function is returned with whatever was set before the
    failure.
    """

    def _wraps(f):
        try:
            f.__name__ = namestr.format(fun=get_name(fun), **kwargs)
            f.__doc__ = docstr.format(fun=get_name(fun), doc=get_doc(fun), **kwargs)
        except Exception:
            # Deliberately ignored: metadata is cosmetic.  Only ordinary errors
            # are swallowed, so KeyboardInterrupt and SystemExit still
            # propagate.
            pass
        return f

    return _wraps


def wrap_nary_f(fun, op, argnum):
    """Decorator naming and documenting the result of applying operator ``op``
    to ``fun`` with respect to ``argnum``."""
    namestr = "{op}_of_{fun}_wrt_argnum_{argnum}"
    docstr = """\
    {op} of function {fun} with respect to argument number {argnum}. Takes the
    same arguments as {fun} but returns the {op}.
    """
    return wraps(fun, namestr, docstr, op=get_name(op), argnum=argnum)


def get_name(f):
    """Return ``f.__name__``, or a placeholder when ``f`` has no name."""
    return getattr(f, "__name__", "[unknown name]")


def get_doc(f):
    """Return the docstring of ``f``, or an empty string when it has none.

    Undocumented functions carry ``__doc__ = None``; mapping that to "" keeps
    the literal text "None" out of generated docstrings.
    """
    return getattr(f, "__doc__", None) or ""
def fan_out_fan_in(x):
    """Apply ``x = (x + x) / 2.0`` ten thousand times, then sum the result.

    Every step uses the previous value twice, so the recorded graph fans out
    and back in at each of the 10**4 steps.
    """
    steps_left = 10**4
    while steps_left > 0:
        x = (x + x) / 2.0
        steps_left -= 1
    return np.sum(x)
# First-order cases: build the VJP of np.dot with respect to argument 0 (a) or
# argument 1 (b), evaluate it at (a, b), and apply the resulting VJP function
# (element [0] of what make_vjp(...)(...) returns) to the cotangent g.
dot_0 = lambda a, b, g: make_vjp(np.dot, argnum=0)(a, b)[0](g)
dot_1 = lambda a, b, g: make_vjp(np.dot, argnum=1)(a, b)[0](g)

# Second-order cases: VJPs of dot_0 itself with respect to each of its three
# arguments (a, b, g).  The resulting VJP function is applied to a.
dot_0_0 = lambda a, b, g: make_vjp(dot_0, argnum=0)(a, b, g)[0](a)
dot_0_1 = lambda a, b, g: make_vjp(dot_0, argnum=1)(a, b, g)[0](a)
dot_0_2 = lambda a, b, g: make_vjp(dot_0, argnum=2)(a, b, g)[0](a)

# Same for dot_1, with the resulting VJP function applied to b.
dot_1_0 = lambda a, b, g: make_vjp(dot_1, argnum=0)(a, b, g)[0](b)
dot_1_1 = lambda a, b, g: make_vjp(dot_1, argnum=1)(a, b, g)[0](b)
dot_1_2 = lambda a, b, g: make_vjp(dot_1, argnum=2)(a, b, g)[0](b)
class RNNSuite:
    """
    Checking speed on a vanilla RNN.

    Two ways of obtaining the gradient of the same recurrent-network loss are
    benchmarked: autograd's ``grad`` applied to ``autograd_rnn`` (built in
    ``setup``) and a hand-written forward/backward pass
    (``manual_rnn_grad``).  The ``time_*`` and ``peakmem_*`` methods are thin
    wrappers around those two.
    """

    # NOTE: this is run each time we run a benchmark.
    # Might want to switch to setup_cache, which has to return an object which is loaded and unpacked in setup().
    def setup(self):
        """Create random float32 inputs and parameters and build the autograd gradient function."""
        self.batch_size = 16
        self.dtype = "float32"
        self.D = 2**10
        # Input batch (batch_size, D), recurrent weights (D, D) and bias (D,),
        # output weights (D, 1) and bias (1,), all scaled by 0.01.
        self.x = 0.01 * np.random.randn(self.batch_size, self.D).astype(self.dtype)
        self.W1 = 0.01 * np.random.randn(self.D, self.D).astype(self.dtype)
        self.b1 = 0.01 * np.random.randn(self.D).astype(self.dtype)
        self.Wout = 0.01 * np.random.randn(self.D, 1).astype(self.dtype)
        self.bout = 0.01 * np.random.randn(1).astype(self.dtype)
        # Random 0/1 labels of shape (batch_size, 1).
        self.l = (np.random.rand(self.batch_size, 1) > 0.5).astype(self.dtype)
        # Number of recurrent steps.
        self.n = 50

        def autograd_rnn(params, x, label, n):
            # n tanh recurrence steps with shared weights, a linear read-out,
            # then the negated sum of label * logit - (logit + log(1 + exp(-logit))).
            W, b, Wout, bout = params
            h1 = x
            for i in range(n):
                h1 = np.tanh(np.dot(h1, W) + b)
            logit = np.dot(h1, Wout) + bout
            loss = -np.sum(label * logit - (logit + np.log(1 + np.exp(-logit))))
            return loss

        self.fn = autograd_rnn
        # grad differentiates with respect to the first argument (the params tuple).
        self.grad_fn = grad(self.fn)

    def rnn_grad(self):
        """Run the autograd gradient once; the result is discarded."""
        self.grad_fn((self.W1, self.b1, self.Wout, self.bout), self.x, self.l, self.n)

    def time_rnn_grad(self):
        self.rnn_grad()

    def peakmem_rnn_grad(self):
        self.rnn_grad()

    def time_manual_rnn_grad(self):
        self.manual_rnn_grad()

    def peakmem_manual_rnn_grad(self):
        self.manual_rnn_grad()

    def manual_rnn_grad(self):
        """Run a hand-unrolled forward and backward pass of the same network; the result is discarded."""

        def repeat_to_match_shape(g, A, axis=None):
            # Expand g to the shape of A; for a 0-d A, g is returned as is.
            # ``axis`` is accepted but not used.
            gout = np.empty_like(A)
            if np.ndim(gout) == 0:
                gout = g
            else:
                gout = np.ones_like(A) * g
            return gout

        def sum_to_match_shape(sum_this, to_match_this):
            # Undo broadcasting: sum away the extra leading axes, then sum
            # (keeping dims) over every axis where the target has size 1.
            sum_this = np.sum(sum_this, axis=tuple(range(0, np.ndim(sum_this) - np.ndim(to_match_this))))
            for axis, size in enumerate(np.shape(to_match_this)):
                if size == 1:
                    sum_this = np.sum(sum_this, axis=axis, keepdims=True)
            return sum_this

        def grad_dot_A(g, A, B):
            # Gradient of np.dot(A, B) with respect to A.
            ga = np.dot(g, B.T)
            ga = np.reshape(ga, np.shape(A))
            return ga

        def grad_dot_B(g, A, B):
            # Gradient of np.dot(A, B) with respect to B.
            gb = np.dot(A.T, g)
            gb = np.reshape(gb, np.shape(B))
            return gb

        def _rnn_grad(x, W, b, Wout, bout, label, n):
            # Forward pass.  Each loop iteration pushes the previous value of
            # every intermediate onto a stack so the backward loop can restore
            # them in reverse order.
            h1__1_stack, h1__1 = [], None
            h1__0_stack, h1__0 = [], None
            out_stack, out = [], None
            h1_stack = []
            h1 = x
            _for1 = list(range(n))

            for i in _for1:
                h1__1_stack.append(h1__1)
                h1__1 = np.dot(h1, W)
                h1__0_stack.append(h1__0)
                h1__0 = h1__1 + b
                h1_stack.append(h1)
                h1 = np.tanh(h1__0)
            out__0 = np.dot(h1, Wout)
            out = out__0 + bout
            loss__2 = label * out
            loss__7 = -out
            loss__6 = np.exp(loss__7)
            loss__5 = 1 + loss__6
            loss__4 = np.log(loss__5)
            loss__3 = out + loss__4
            loss__1 = loss__2 - loss__3
            # The forward pass stops at loss__1: the final sum and negation
            # are never evaluated here, only reversed below.

            # Begin Backward Pass
            g_loss = 1
            g_h1__0 = 0
            g_h1__1 = 0
            g_b = 0
            g_W = 0

            # Reverse of: loss = -loss__0
            g_loss__0 = -g_loss

            # Reverse of: loss__0 = np.sum(loss__1)
            g_loss__1 = repeat_to_match_shape(g_loss__0, loss__1)

            # Reverse of: loss__1 = loss__2 - loss__3
            g_loss__2 = sum_to_match_shape(g_loss__1, loss__2)
            g_loss__3 = sum_to_match_shape(-g_loss__1, loss__3)

            # Reverse of: loss__3 = out + loss__4
            g_out = sum_to_match_shape(g_loss__3, out)
            g_loss__4 = sum_to_match_shape(g_loss__3, loss__4)

            # Reverse of: loss__4 = np.log(loss__5)
            g_loss__5 = g_loss__4 / loss__5

            # Reverse of: loss__5 = 1 + loss__6
            g_loss__6 = sum_to_match_shape(g_loss__5, loss__6)

            # Reverse of: loss__6 = np.exp(loss__7)
            g_loss__7 = g_loss__6 * np.exp(loss__7)

            # Reverse of: loss__7 = -out
            g_out += -g_loss__7
            # Contribution of loss__2 = label * out to the gradient of out.
            g_out += sum_to_match_shape(g_loss__2 * label, out)

            # Reverse of: out = out__0 + bout
            g_out__0 = sum_to_match_shape(g_out, out__0)
            g_bout = sum_to_match_shape(g_out, bout)

            # Reverse of: out__0 = np.dot(h1, Wout)
            g_h1 = grad_dot_A(g_out__0, h1, Wout)
            g_Wout = grad_dot_B(g_out__0, h1, Wout)

            # Walk the recurrence backwards, popping the saved intermediates.
            _for1 = reversed(_for1)
            for i in _for1:
                h1 = h1_stack.pop()
                # tanh'(z) = 1 / cosh(z)**2, using h1__0 before it is restored
                # to the previous step's value.
                tmp_g0 = g_h1 / np.cosh(h1__0) ** 2.0
                g_h1 = 0
                g_h1__0 += tmp_g0
                h1__0 = h1__0_stack.pop()
                tmp_g1 = sum_to_match_shape(g_h1__0, h1__1)
                tmp_g2 = sum_to_match_shape(g_h1__0, b)
                g_h1__0 = 0
                g_h1__1 += tmp_g1
                g_b += tmp_g2
                h1__1 = h1__1_stack.pop()
                tmp_g3 = grad_dot_A(g_h1__1, h1, W)
                tmp_g4 = grad_dot_B(g_h1__1, h1, W)
                g_h1__1 = 0
                g_h1 += tmp_g3
                g_W += tmp_g4
            return g_W, g_b, g_Wout, g_bout

        _rnn_grad(self.x, self.W1, self.b1, self.Wout, self.bout, self.l, self.n)
        pass
version from other sources; for now, it's hard-coded version: 1.1.1 source: {% if not environ.get('BINSTAR_PLATFORM', None) %} git_url: ../ {% else %} # we're building on binstar, we already have the repo; treat as local path path: ../ {% endif %} requirements: build: - python - hatch - hatchling - future - numpy >=1.9 run: - python - future - numpy >=1.9 build: script: pip install . --no-deps test: # Python imports imports: - autograd - autograd.numpy about: home: https://github.com/HIPS/autograd license: MIT summary: 'Efficiently computes derivatives of numpy code.' ================================================ FILE: docs/tutorial.md ================================================ # Autograd tutorial ## Motivation Imagine you want to test out a new machine learning model for your data. This usually means coming up with some loss function to capture how well your model fits the data and optimizing that loss with respect to the model parameters. If there are many model parameters (neural nets can have millions) then you need gradients. You then have two options: derive and code them up yourself, or implement your model using the syntactic and semantic constraints of a system like [Theano](http://deeplearning.net/software/theano/) or [TensorFlow](https://github.com/tensorflow/tensorflow). We want to provide a third way: just write down the loss function using a standard numerical library like Numpy, and Autograd will give you its gradient. ## How to use Autograd Autograd's `grad` function takes in a function, and gives you a function that computes its derivative. Your function must have a scalar-valued output (i.e. a float). This covers the common case when you want to use gradients to optimize something. Autograd works on ordinary Python and Numpy code containing all the usual control structures, including `while` loops, `if` statements, and closures. 
Here's a simple example of using an open-ended loop to compute the sine function: ```python import autograd.numpy as np # Thinly-wrapped version of Numpy from autograd import grad def taylor_sine(x): # Taylor approximation to sine function ans = currterm = x i = 0 while np.abs(currterm) > 0.001: currterm = -currterm * x**2 / ((2 * i + 3) * (2 * i + 2)) ans = ans + currterm i += 1 return ans grad_sine = grad(taylor_sine) print("Gradient of sin(pi) is", grad_sine(np.pi)) ``` ## Complete example: logistic regression A common use case for automatic differentiation is to train a probabilistic model. Here we present a very simple (but complete) example of specifying and training a logistic regression model for binary classification: ```python import autograd.numpy as np from autograd import grad def sigmoid(x): return 0.5 * (np.tanh(x / 2.) + 1) def logistic_predictions(weights, inputs): # Outputs probability of a label being true according to logistic model. return sigmoid(np.dot(inputs, weights)) def training_loss(weights): # Training loss is the negative log-likelihood of the training labels. preds = logistic_predictions(weights, inputs) label_probabilities = preds * targets + (1 - preds) * (1 - targets) return -np.sum(np.log(label_probabilities)) # Build a toy dataset. inputs = np.array([[0.52, 1.12, 0.77], [0.88, -1.08, 0.15], [0.52, 0.06, -1.30], [0.74, -2.49, 1.39]]) targets = np.array([True, True, False, True]) # Define a function that returns gradients of training loss using Autograd. training_gradient_fun = grad(training_loss) # Optimize weights using gradient descent. weights = np.array([0.0, 0.0, 0.0]) print("Initial loss:", training_loss(weights)) for i in range(100): weights -= training_gradient_fun(weights) * 0.01 print("Trained loss:", training_loss(weights)) ``` Python syntax is pretty good for specifying probabilistic models. The biggest win is that it becomes a lot easier to modify a model and rapidly iterate.
For more complex examples, see our [examples directory](../examples/), which includes: * [a simple neural net](../examples/neural_net.py) * [a convolutional neural net](../examples/convnet.py) * [a recurrent neural net](../examples/rnn.py) * [a long short-term memory (LSTM)](../examples/lstm.py) * [backpropagating through a fluid simulation](../examples/fluidsim/fluidsim.py) ## What's going on under the hood? To compute the gradient, Autograd first has to record every transformation that was applied to the input as it was turned into the output of your function. To do this, Autograd wraps functions (using the function `primitive`) so that when they're called, they add themselves to a list of operations performed. Autograd's core has a table mapping these wrapped primitives to their corresponding gradient functions (or, more precisely, their vector-Jacobian product functions). To flag the variables we're taking the gradient with respect to, we wrap them using the `Box` class. You should never have to think about the `Box` class, but you might notice it when printing out debugging info. After the function is evaluated, Autograd has a graph specifying all operations that were performed on the inputs with respect to which we want to differentiate. This is the computational graph of the function evaluation. To compute the derivative, we simply apply the rules of differentiation to each node in the graph. ### Reverse mode differentiation Given a function made up of several nested function calls, there are several ways to compute its derivative. For example, given L(x) = F(G(H(x))), the chain rule says that its gradient is dL/dx = dF/dG * dG/dH * dH/dx. If we evaluate this product from right-to-left: (dF/dG * (dG/dH * dH/dx)), the same order as the computations themselves were performed, this is called forward-mode differentiation. 
If we evaluate this product from left-to-right: ((dF/dG * dG/dH) * dH/dx), the reverse of the order in which the computations themselves were performed, this is called reverse-mode differentiation. Compared to finite differences or forward-mode, reverse-mode differentiation is by far the more practical method for differentiating functions that take in a large vector and output a single number. In the machine learning community, reverse-mode differentiation is known as 'backpropagation', since the gradients propagate backwards through the function. It's particularly nice since you don't need to instantiate the intermediate Jacobian matrices explicitly, and instead only rely on applying a sequence of matrix-free vector-Jacobian product functions (VJPs). Because Autograd supports higher derivatives as well, Hessian-vector products (a form of second-derivative) are also available and efficient to compute. ### How can you support ifs, while loops and recursion? Some autodiff packages (such as [TensorFlow](https://github.com/tensorflow/tensorflow)) work by having you specify a graph of the computation that your function performs, including all the control flow (such as if and for loops), and then turn that graph into another one that computes gradients. This has some benefits (such as allowing compile-time optimizations), but it requires you to express control flow in a limited mini-language that those packages know how to handle. (For example, the `tf.while_loop` and `tf.cond` operations in TensorFlow.) In contrast, Autograd doesn't have to know about any ifs, branches, loops or recursion that were used to decide which operations were called. To compute the gradient of a particular input, one only needs to know which continuous transforms were applied to that particular input, not which other transforms might have been applied.
Since Autograd keeps track of the relevant operations on each function call separately, it's not a problem that all the Python control flow operations are invisible to Autograd. In fact, it greatly simplifies the implementation. ## What can Autograd differentiate? The main constraint is that any function that operates on a `Box` is marked as `primitive`, and has its gradient implemented. This is taken care of for most functions in the Numpy library, and it's easy to write your own gradients. The input can be a scalar, complex number, vector, tuple, a tuple of vectors, a tuple of tuples, etc. When using the `grad` function, the output must be a scalar, but the functions `elementwise_grad` and `jacobian` allow gradients of vectors. ## Supported and unsupported parts of numpy/scipy Numpy has [a lot of features](http://docs.scipy.org/doc/numpy/reference/). We've done our best to support most of them. So far, we've implemented gradients for: * most of the [mathematical operations](../autograd/numpy/numpy_vjps.py) * most of the [array and matrix manipulation routines](../autograd/numpy/numpy_vjps.py) * some [linear algebra](../autograd/numpy/linalg.py) functions * most of the [fast fourier transform](../autograd/numpy/fft.py) routines * full support for complex numbers * [N-dimensional convolutions](../autograd/scipy/signal.py) * Some scipy routines, including [`scipy.stats.norm`](../autograd/scipy/stats/norm.py) Some things remain to be implemented. For example, we support indexing (`x = A[i, j, :]`) but not assignment (`A[i,j] = x`) in arrays that are being differentiated with respect to. Assignment is hard to support because it requires keeping copies of the overwritten data, and so even when you write code that looks like it's performing assignment, the system would have to be making copies behind the scenes, often defeating the purpose of in-place operations. Similarly, we don't support the syntax `A.dot(B)`; use the equivalent `np.dot(A, B)` instead. 
The reason we don't support the first way is that subclassing `ndarray` raises a host of issues. As another consequence of not subclassing `ndarray`, some subclass checks can break, like `isinstance(x, np.ndarray)` can return `False`. However, those `isinstance` checks will work if you instead use Autograd's provided one, writing `from autograd.builtins import isinstance`. In-place modification of arrays not being differentiated with respect to (for example, `A[i] = x` or `A += B`) won't raise an error, but be careful. It's easy to accidentally change something without Autograd knowing about it. This can be a problem because Autograd keeps references to variables used in the forward pass if they will be needed on the reverse pass. Making copies would be too slow. Lists and dicts can be used freely - like control flow, Autograd usually doesn't even need to know about them. The exception is passing in a list to a primitive function, such as `autograd.numpy.sum`. This requires special care, since the list contents need to be examined for boxes. We do support passing lists to `autograd.numpy.array` and `autograd.numpy.concatenate`, but in other cases, you may need to explicitly construct an array using `autograd.numpy.array` before passing a list or tuple argument into a primitive. An alternative is to use the `list`, `dict`, and `tuple` classes in `autograd.builtins`, which should work just like the Python builtins while also ensuring boxes don't get hidden inside those containers. Remember, these issues typically only come up when you're passing a `list` or `tuple` to a primitive function; when passing around lists or tuples in your own (non-primitive) functions, you can put boxed values inside lists, tuples, or dicts without having to worry about it. 
#### TL;DR: Do use * [Most](../autograd/numpy/numpy_vjps.py) of numpy's functions * [Most](../autograd/numpy/numpy_boxes.py) numpy.ndarray methods * [Some](../autograd/scipy/) scipy functions * Indexing and slicing of arrays `x = A[3, :, 2:4]` * Explicit array creation from lists `A = np.array([x, y])` #### Don't use * Assignment to arrays `A[0,0] = x` * Implicit casting of lists to arrays `A = np.sum([x, y])`, use `A = np.sum(np.array([x, y]))` instead. * `A.dot(B)` notation (use `np.dot(A, B)` instead) * In-place operations (such as `a += b`, use `a = a + b` instead) * Some isinstance checks, like `isinstance(x, np.ndarray)` or `isinstance(x, tuple)`, without first doing `from autograd.builtins import isinstance, tuple`. Luckily, it's easy to check gradients numerically if you're worried that something's wrong. ## Extend Autograd by defining your own primitives What if Autograd doesn't support a function you need to take the gradient of? This can happen if your code depends on external library calls or C code. It can sometimes even be a good idea to provide the gradient of a pure Python function for speed or numerical stability. For example, let's add the gradient of a numerically stable version of `log(sum(exp(x)))`. This function is included in `scipy.special` and already supported, but let's make our own version. Next, we define our function using standard Python, using `@primitive` as a decorator: ```python import autograd.numpy as np from autograd.extend import primitive, defvjp @primitive def logsumexp(x): """Numerically stable log(sum(exp(x)))""" max_x = np.max(x) return max_x + np.log(np.sum(np.exp(x - max_x))) ``` `@primitive` tells Autograd not to look inside the function, but instead to treat it as a black box whose gradient can be specified later. Functions with this decorator can contain anything that Python knows how to execute, including calls to other languages. 
Next, we write a function that specifies the gradient of the primitive `logsumexp`: ```python def logsumexp_vjp(ans, x): x_shape = x.shape return lambda g: np.full(x_shape, g) * np.exp(x - np.full(x_shape, ans)) ``` `logsumexp_vjp` returns a vector-Jacobian product (VJP) operator, which is a function that right-multiplies its argument `g` by the Jacobian matrix of `logsumexp` (without explicitly forming the matrix's coefficients). `g` will be the gradient of the final objective with respect to `ans` (the output of `logsumexp`). The calculation can depend on both the input (`x`) and the output (`ans`) of the original function. If you want to be able to take higher-order derivatives, then the code inside the VJP function must be itself differentiable by Autograd, which usually just means you write it in terms of other primitives which themselves have VJPs (like Numpy functions). The final step is to tell Autograd about `logsumexp`'s vector-Jacobian product function: ```python defvjp(logsumexp, logsumexp_vjp) ``` Now we can use `logsumexp` anywhere, including inside of a larger function that we want to differentiate: ```python from autograd import grad def example_func(y): z = y**2 lse = logsumexp(z) return np.sum(lse) grad_of_example = grad(example_func) print("Gradient: ", grad_of_example(np.array([1.5, 6.7, 1e-10]))) ``` This example can be found as a Python script [here](../examples/define_gradient.py). ## Complex numbers Autograd supports complex arrays and scalars using a convention described as follows. Consider a complex-to-complex function, `f`, expressed in terms of real-to-real components, `u` and `v`: ```python def f(z): x, y = real(z), imag(z) return u(x, y) + v(x, y) * 1j ``` We define `grad` of `f` as ```python def grad_f(z): x, y = real(z), imag(z) return grad(u, 0)(x, y) - i * grad(u, 1)(x, y) ``` (The second argument of `grad` specifies which argument we're differentiating with respect to.) So we throw out v, the imaginary part of f, entirely.
Our convention covers three important cases: * If `f` is holomorphic, we get the usual complex derivative (since `grad(u, 0) == grad(v, 1)` and `grad(u, 1) == - grad(v, 0)`). * If `f` is a real-valued loss function of a complex parameter, `x`, we get a result that we can use in a gradient-based optimizer, by taking steps in the direction of the complex conjugate of `grad(f)(x)`. * If `f` is a real-to-real function that happens to use complex primitives internally, some of which must necessarily be non-holomorphic (maybe you use FFTs to implement convolutions for example) then we get the same result that a purely real implementation would have given. Our convention doesn't handle the case where `f` is a non-holomorphic function and you're interested in all of du/dx, du/dy, dv/dx and dv/dy. But then the answer would have to contain four real values and there would be no way to express it as a single complex number. We define primitive vector-Jacobian products of complex functions like this: ```python def f_vjp(g, z): x, y = real(z), imag(z) g_x, g_y = real(g), imag(g) return ( g_x * grad(u, 0)(x, y) - i * g_x * grad(u, 1)(x, y) - g_y * grad(v, 0)(x, y) + i * g_y * grad(v, 1)(x, y)) ``` For holomorphic primitives, this is just the regular complex derivative multiplied by `g`, so most simple math primitives don't need to be changed from their real implementations. For non-holomorphic primitives, it preserves all four real partial derivatives as if we were treating complex numbers as real 2-tuples (though it throws a couple of negative signs in there). Chapter 4 of [Dougal's PhD thesis](https://dougalmaclaurin.com/phd-thesis.pdf) goes into a bit more detail about how we define the primitive vector-Jacobian products.
## Autograd Lecture For more information on automatic differentiation, autograd's implementation, and advanced automatic differentiation techniques, see a [talk by Matt at the Deep Learning Summer School, Montreal 2017](https://videolectures.net/videos/deeplearning2017_johnson_automatic_differentiation/). ## Support Autograd was written by [Dougal Maclaurin](https://dougalmaclaurin.com), [David Duvenaud](http://mlg.eng.cam.ac.uk/duvenaud/), and [Matthew Johnson](http://www.mit.edu/~mattjj/) and we're actively developing it. Please feel free to submit any bugs or feature requests. We'd also love to hear about your experiences with Autograd in general. Drop us an email! ================================================ FILE: docs/updateguide.md ================================================ # Autograd v1.2 update guide Autograd v1.2 changed the interface for defining custom vector-Jacobian products (VJPs). Luckily the change only affects users writing custom VJPs, and should only require minor updates to the custom VJP code. This guide is meant to explain why we made these changes (and others) in Autograd v1.2, and to summarize everything you need to know to update your custom VJP code. - [Reasoning for the changes](#reasoning-for-the-changes) - [New defvjp interface](#new-defvjp-interface) - [Gradient checking](#gradient-checking) ## Reasoning for the changes Here are some of the most important reasons for this update: 1. To allow us to make Autograd faster and more memory efficient, we staged the VJP functions to allow more garbage collection and eliminated almost all of the vspace metadata checks. 1. Forward-mode now comes built-in with `make_jvp`. 1. There's now a clear extension API in `autograd.extend`, so you can write custom VJPs or wrap your own numerical libraries. 1. Autograd is now backend-independent, making it easy to wrap other numerical libraries. 1. 
Autograd's tracing functionality is now parameterized and easily reusable, and we added some new tracers for [computation graph visualization](https://github.com/hips/autograd/blob/master/examples/dot_graph.py) and [pure-Python constant folding](https://github.com/hips/autograd/blob/master/autograd/misc/tracers.py). 1. More exhaustive, fast reverse- and forward-mode checking with `autograd.test_util.check_grads`. 1. Expensive VJPs can share work across arguments using `defvjp_argnums`. 1. These changes enabled some internal cleanups, and more features to come! ## New defvjp interface First, here's an example of the old way to write custom primitives and VJPs: ```python import autograd.numpy as np from autograd import primitive @primitive def func(x, y, z): assert z != 0 return x * y**2 func.defvjp(lambda g, ans, vs, gvs, x, y, z: g * y**2) func.defvjp(lambda g, ans, vs, gvs, x, y, z: 2 * g * x * y, argnum=1) func.defvjp_is_zero(argnums=[2]) ``` Here's the new way to write custom VJPs for that same primitive: ```python import autograd.numpy as np from autograd.extend import primitive, defvjp # defvjp is now a function # primitives look the same as before @primitive def func(x, y, z): assert z != 0 return x * y**2 # but we call defvjp differently defvjp(func, lambda ans, x, y, z: lambda g: g * y**2, lambda ans, x, y, z: lambda g: 2 * g * x * y, None) ``` Here's a list of the `defvjp` changes illustrated in that example: 1. `defvjp` is a function, rather than a method on the `primitive` class. (Actually, `primitive` is now just a function, and no longer a class.) As a result, `func.defvjp(...)` became `defvjp(func, ...)`. 1. VJPs are staged, so that instead of writing `lambda g, ans, vs, gvs, *args: ...` we write `lambda ans, *args: lambda g: ...`. This change enables a lot of automatic garbage collection. 
In the above example, if we were differentiating only with respect to `x` argument of `func`, because the VJP for `func` with respect to argument index 0 doesn't need the values of `x` or `z` from the forward pass, those values aren't stored and can instead be immediately garbage-collected. 1. There are no more `vs` and `gvs` arguments. These usually weren't used, and computing vspace metadata for every intermediate value proved to contribute significant overhead for some programs. Autograd now avoids computing vspace metadata unless necessary. 1. `defvjp` lets you define VJPs with respect to multiple arguments at once, and the argnum(s) involved are often implicit. Here's another example, this time showing how to define VJPs with respect to specific argnums, leaving the others undefined. ```python # OLD way to leave some VJPs undefined func.defvjp(lambda g, ans, vs, gvs, x, y, z, w: ..., argnum=2) func.defvjp(lambda g, ans, vs, gvs, x, y, z, w: ..., argnum=3) # NEW way to leave some VJPs undefined defvjp(func, lambda ans, x, y, z, w: lambda g: ..., lambda ans, x, y, z, w: lambda g: ..., argnums=[2, 3]) ``` ## Gradient checking Here's how to do gradient checking, whether on a composite function or on your primitive with a custom VJP: ```python from autograd.test_util import check_grads # check reverse-mode to second order check_grads(my_func, modes=['rev'], order=2)(*args_for_my_func) ``` ================================================ FILE: examples/README.md ================================================ # Autograd examples ## Usage instructions Some of the examples require additional dependencies beyond Autograd and its core dependencies. These are set up under the `examples` dependency group. To install them, navigate to the root directory of where you cloned Autograd and run ```sh pip install --group examples ``` from the command line. 
def make_nn_funs(layer_sizes, L2_reg, noise_variance, nonlinearity=np.tanh):
    """Build a multi-layer perceptron vectorized over data and weight samples.

    ``layer_sizes`` lists the unit count of every layer, input first. The
    return value is ``(num_weights, predictions, logprob)`` where
    ``num_weights`` is the length of one flat weight vector and the two
    functions both take a ``(num_weight_samples, num_weights)`` array.
    """
    layer_dims = list(zip(layer_sizes[:-1], layer_sizes[1:]))
    # Each layer contributes a (fan_in x fan_out) matrix plus fan_out biases.
    num_weights = sum((fan_in + 1) * fan_out for fan_in, fan_out in layer_dims)

    def unpack_layers(weights):
        """Yield one ``(W, b)`` pair per layer, consuming ``weights`` column-wise."""
        n_sets = len(weights)
        remaining = weights
        for fan_in, fan_out in layer_dims:
            n_matrix = fan_in * fan_out
            W = remaining[:, :n_matrix].reshape((n_sets, fan_in, fan_out))
            b = remaining[:, n_matrix : n_matrix + fan_out].reshape((n_sets, 1, fan_out))
            yield (W, b)
            remaining = remaining[:, (fan_in + 1) * fan_out :]

    def predictions(weights, inputs):
        """Run the network for every weight sample.

        weights is shape (num_weight_samples x num_weights)
        inputs  is shape (num_datapoints x D)
        """
        activations = np.expand_dims(inputs, 0)
        for W, b in unpack_layers(weights):
            outputs = np.einsum("mnd,mdo->mno", activations, W) + b
            activations = nonlinearity(outputs)
        # The last layer's pre-activation values are returned (no nonlinearity).
        return outputs

    def logprob(weights, inputs, targets):
        """Return, per weight sample, the L2 log-prior plus the scaled squared-error term."""
        log_prior = -L2_reg * np.sum(weights**2, axis=1)
        residuals = predictions(weights, inputs) - targets
        log_lik = -np.sum(residuals**2, axis=1)[:, 0] / noise_variance
        return log_prior + log_lik

    return num_weights, predictions, logprob


def build_toy_dataset(n_data=40, noise_std=0.1):
    """Create a 1-D regression set: noisy cosine sampled on [0, 2] and [6, 8].

    Inputs are rescaled by ``(x - 4) / 4`` after the targets are computed.
    Both returned arrays have shape ``(n, 1)``.
    """
    D = 1
    rs = npr.RandomState(0)
    half = n_data // 2
    raw_inputs = np.concatenate([np.linspace(0, 2, num=half), np.linspace(6, 8, num=half)])
    targets = np.cos(raw_inputs) + rs.randn(n_data) * noise_std
    inputs = (raw_inputs - 4.0) / 4.0
    return inputs.reshape((len(inputs), D)), targets.reshape((len(targets), D))
def probability_of_improvement(mean, std, max_so_far):
    """Return the normal CDF (location ``mean``, scale ``std``) at ``max_so_far``.

    NOTE(review): as written this is P(value <= max_so_far), which looks like
    the complement of an improvement probability -- confirm the intended
    direction before using it as an acquisition function.
    """
    return norm.cdf(max_so_far, mean, std)


def expected_new_max(mean, std, max_so_far):
    """Acquisition value used by ``bayesian_optimize``.

    Combines ``max_so_far`` with a CDF-weighted ``mean - max_so_far`` term and
    a PDF term, both taken from a normal centred at ``max_so_far`` with scale
    ``std`` and evaluated at ``mean``.
    """
    return (
        max_so_far
        - (mean - max_so_far) * norm.cdf(mean, max_so_far, std)
        + std * norm.pdf(mean, max_so_far, std)
    )


def init_covariance_params(num_params):
    """Return the all-zero initial parameter vector of length ``num_params``."""
    return np.zeros(num_params)


def defaultmax(x, default=-np.inf):
    """Return ``np.max(x)``, or ``default`` when the array ``x`` is empty."""
    if x.size == 0:
        return default
    return np.max(x)


def bayesian_optimize(func, domain_min, domain_max, num_iters=20, callback=None):
    """Maximize ``func`` over a box using a GP model and a gradient-based acquisition search.

    Args:
        func: objective; called with a 2-D array holding one query point per row.
        domain_min, domain_max: length-``D`` arrays with the lower and upper
            bounds of the box (array arithmetic is applied to them).
        num_iters: number of query points chosen after the initial one.
        callback: optional; called after every iteration as
            ``callback(X, y, predict_func, acquisition_function, next_point, new_value)``.

    Returns:
        ``(best_x, best_y)``: the evaluated point with the largest observed
        value and that value.
    """
    D = len(domain_min)

    num_params, predict, log_marginal_likelihood = make_gp_funs(rbf_covariance, num_cov_params=D + 1)

    model_params = init_covariance_params(num_params)

    def optimize_gp_params(init_params, X, y):
        """Fit the model parameters by minimizing the negative log posterior with CG."""
        log_hyperprior = lambda params: np.sum(norm.logpdf(params, 0.0, 100.0))
        objective = lambda params: -log_marginal_likelihood(params, X, y) - log_hyperprior(params)
        return minimize(value_and_grad(objective), init_params, jac=True, method="CG").x

    # The default RandomState is created once, when this ``def`` runs, and is
    # then shared by every call, so successive iterations draw different
    # restart points. Do not replace it with a per-call ``RandomState(0)``.
    def choose_next_point(domain_min, domain_max, acquisition_function, num_tries=15, rs=npr.RandomState(0)):
        """Uses gradient-based optimization to find next query point."""
        init_points = rs.rand(num_tries, D) * (domain_max - domain_min) + domain_min

        grad_obj = value_and_grad(lambda x: -acquisition_function(x))

        def optimize_point(init_point):
            print(".", end="")
            result = minimize(
                grad_obj,
                x0=init_point,
                jac=True,
                method="L-BFGS-B",
                options={"maxiter": 10},
                bounds=list(zip(domain_min, domain_max)),
            )
            return result.x, acquisition_function(result.x)

        restarts = [optimize_point(point) for point in init_points]
        optimized_points, optimized_values = zip(*restarts)
        print()
        best_ix = np.argmax(optimized_values)
        return np.atleast_2d(optimized_points[best_ix])

    # Start by evaluating once in the middle of the domain. The midpoint must
    # include the ``domain_min`` offset and be laid out as a single row of D
    # coordinates so it can be stacked with later (1, D) query points.
    midpoint = domain_min + (domain_max - domain_min) / 2.0
    X = np.concatenate((np.zeros((0, D)), np.reshape(midpoint, (1, D))))
    y = np.concatenate((np.zeros(0), np.reshape(np.array(func(X)), (1,))))

    for i in range(num_iters):
        # Model parameters are refit only from the third iteration onwards.
        if i > 1:
            print("Optimizing model parameters...")
            model_params = optimize_gp_params(model_params, X, y)

        print("Choosing where to look next", end="")

        def predict_func(xstar):
            mean, cov = predict(model_params, X, y, xstar)
            return mean, np.sqrt(np.diag(cov))

        def acquisition_function(xstar):
            xstar = np.atleast_2d(xstar)  # To work around a bug in scipy.minimize
            mean, std = predict_func(xstar)
            return expected_new_max(mean, std, defaultmax(y))

        next_point = choose_next_point(domain_min, domain_max, acquisition_function)

        print("Evaluating expensive function...")
        new_value = func(next_point)

        X = np.concatenate((X, next_point))
        y = np.concatenate((y, np.reshape(np.array(new_value), (1,))))

        if callback:
            callback(X, y, predict_func, acquisition_function, next_point, new_value)

    best_ix = np.argmax(y)
    return X[best_ix, :], y[best_ix]
def black_box_variational_inference(logprob, D, num_samples):
    """Build a stochastic variational objective for a diagonal Gaussian.

    Implements http://arxiv.org/abs/1401.0118, and uses the
    local reparameterization trick from http://arxiv.org/abs/1506.02557

    ``logprob(samples, t)`` is evaluated on ``num_samples`` draws of dimension
    ``D``. Returns ``(variational_objective, gradient, unpack_params)``; the
    parameter vector is the mean followed by the log standard deviations.
    """
    # One generator shared by every objective evaluation.
    sampler = npr.RandomState(0)

    def unpack_params(params):
        # Variational dist is a diagonal Gaussian: first D entries are the
        # mean, the rest are the log standard deviations.
        return params[:D], params[D:]

    def gaussian_entropy(log_std):
        return 0.5 * D * (1.0 + np.log(2 * np.pi)) + np.sum(log_std)

    def variational_objective(params, t):
        """Provides a stochastic estimate of the variational lower bound."""
        mean, log_std = unpack_params(params)
        # Reparameterized draws: standard normal noise scaled and shifted.
        samples = sampler.randn(num_samples, D) * np.exp(log_std) + mean
        entropy = gaussian_entropy(log_std)
        expected_logprob = np.mean(logprob(samples, t))
        # Negated so that minimizing the objective maximizes the bound.
        return -(entropy + expected_logprob)

    return variational_objective, grad(variational_objective), unpack_params
class WeightsParser:
    """A helper class to index into a parameter vector.

    Named blocks are laid out back to back; ``N`` is the total number of
    entries registered so far.
    """

    def __init__(self):
        # name -> (slice into the flat vector, shape to reshape that slice to)
        self.idxs_and_shapes = {}
        self.N = 0

    def add_weights(self, name, shape):
        """Reserve the next ``prod(shape)`` entries of the vector for ``name``."""
        start = self.N
        self.N += np.prod(shape)
        self.idxs_and_shapes[name] = (slice(start, self.N), shape)

    def get(self, vect, name):
        """Return the entries of ``vect`` registered under ``name``, reshaped."""
        idxs, shape = self.idxs_and_shapes[name]
        return np.reshape(vect[idxs], shape)


def make_batches(N_total, N_batch):
    """Return consecutive slices of width ``N_batch`` that cover ``range(N_total)``.

    The last slice may extend past ``N_total``; slicing clips it.
    """
    start = 0
    batches = []
    while start < N_total:
        batches.append(slice(start, start + N_batch))
        start += N_batch
    return batches


def logsumexp(X, axis, keepdims=False):
    """Return ``log(sum(exp(X)))`` along ``axis``.

    The maximum over all of ``X`` (not per ``axis``) is subtracted before
    exponentiating and added back afterwards.
    """
    max_X = np.max(X)
    return max_X + np.log(np.sum(np.exp(X - max_X), axis=axis, keepdims=keepdims))


def make_nn_funs(input_shape, layer_specs, L2_reg):
    """Assemble a network from layer objects sharing one flat weight vector.

    Each entry of ``layer_specs`` must provide
    ``build_weights_dict(input_shape) -> (num_weights, output_shape)`` and
    ``forward_pass(inputs, weights)``.

    Returns ``(num_weights, predictions, loss, frac_err)``.
    """
    parser = WeightsParser()
    cur_shape = input_shape
    for layer in layer_specs:
        # The layer object itself is the key for its block of weights.
        N_weights, cur_shape = layer.build_weights_dict(cur_shape)
        parser.add_weights(layer, (N_weights,))

    def predictions(W_vect, inputs):
        """Outputs normalized log-probabilities.
        shape of inputs : [data, color, y, x]"""
        cur_units = inputs
        for layer in layer_specs:
            cur_weights = parser.get(W_vect, layer)
            cur_units = layer.forward_pass(cur_units, cur_weights)
        return cur_units

    def loss(W_vect, X, T):
        """Return the negated sum of the L2 log-prior and ``sum(predictions * T)``."""
        log_prior = -L2_reg * np.dot(W_vect, W_vect)
        log_lik = np.sum(predictions(W_vect, X) * T)
        return -log_prior - log_lik

    def frac_err(W_vect, X, T):
        """Return the fraction of rows whose argmax prediction differs from ``T``'s."""
        # Must call the local ``predictions``; the name ``pred_fun`` used here
        # previously is not defined anywhere and raised NameError.
        predicted = np.argmax(predictions(W_vect, X), axis=1)
        return np.mean(np.argmax(T, axis=1) != predicted)

    return parser.N, predictions, loss, frac_err


class conv_layer:
    """Convolution-plus-bias layer usable in ``make_nn_funs``.

    ``build_weights_dict`` must be called before ``forward_pass`` because it
    creates the parser that ``forward_pass`` reads.
    """

    def __init__(self, kernel_shape, num_filters):
        self.kernel_shape = kernel_shape
        self.num_filters = num_filters

    def forward_pass(self, inputs, param_vector):
        # Input dimensions:  [data, color_in, y, x]
        # Params dimensions: [color_in, color_out, y, x]
        # Output dimensions: [data, color_out, y, x]
        params = self.parser.get(param_vector, "params")
        biases = self.parser.get(param_vector, "biases")
        conv = convolve(inputs, params, axes=([2, 3], [2, 3]), dot_axes=([1], [0]), mode="valid")
        return conv + biases

    def build_weights_dict(self, input_shape):
        # Input shape : [color, y, x] (don't need to know number of data yet)
        self.parser = WeightsParser()
        self.parser.add_weights("params", (input_shape[0], self.num_filters) + self.kernel_shape)
        self.parser.add_weights("biases", (1, self.num_filters, 1, 1))
        output_shape = (self.num_filters,) + self.conv_output_shape(input_shape[1:], self.kernel_shape)
        return self.parser.N, output_shape

    def conv_output_shape(self, A, B):
        """Return the 2-D output size of a "valid" convolution of ``A`` with ``B``."""
        return (A[0] - B[0] + 1, A[1] - B[1] + 1)
output_shape[i + 1] = input_shape[i + 1] / self.pool_shape[i] return 0, output_shape def forward_pass(self, inputs, param_vector): new_shape = inputs.shape[:2] for i in [0, 1]: pool_width = self.pool_shape[i] img_width = inputs.shape[i + 2] new_shape += (img_width // pool_width, pool_width) result = inputs.reshape(new_shape) return np.max(np.max(result, axis=3), axis=4) class full_layer: def __init__(self, size): self.size = size def build_weights_dict(self, input_shape): # Input shape is anything (all flattened) input_size = np.prod(input_shape, dtype=int) self.parser = WeightsParser() self.parser.add_weights("params", (input_size, self.size)) self.parser.add_weights("biases", (self.size,)) return self.parser.N, (self.size,) def forward_pass(self, inputs, param_vector): params = self.parser.get(param_vector, "params") biases = self.parser.get(param_vector, "biases") if inputs.ndim > 2: inputs = inputs.reshape((inputs.shape[0], np.prod(inputs.shape[1:]))) return self.nonlinearity(np.dot(inputs[:, :], params) + biases) class tanh_layer(full_layer): def nonlinearity(self, x): return np.tanh(x) class softmax_layer(full_layer): def nonlinearity(self, x): return x - logsumexp(x, axis=1, keepdims=True) if __name__ == "__main__": # Network parameters L2_reg = 1.0 input_shape = (1, 28, 28) layer_specs = [ conv_layer((5, 5), 6), maxpool_layer((2, 2)), conv_layer((5, 5), 16), maxpool_layer((2, 2)), tanh_layer(120), tanh_layer(84), softmax_layer(10), ] # Training parameters param_scale = 0.1 learning_rate = 1e-3 momentum = 0.9 batch_size = 256 num_epochs = 50 # Load and process MNIST data print("Loading training data...") add_color_channel = lambda x: x.reshape((x.shape[0], 1, x.shape[1], x.shape[2])) one_hot = lambda x, K: np.array(x[:, None] == np.arange(K)[None, :], dtype=int) train_images, train_labels, test_images, test_labels = data_mnist.mnist() train_images = add_color_channel(train_images) / 255.0 test_images = add_color_channel(test_images) / 255.0 train_labels = 
one_hot(train_labels, 10) test_labels = one_hot(test_labels, 10) N_data = train_images.shape[0] # Make neural net functions N_weights, pred_fun, loss_fun, frac_err = make_nn_funs(input_shape, layer_specs, L2_reg) loss_grad = grad(loss_fun) # Initialize weights rs = npr.RandomState() W = rs.randn(N_weights) * param_scale # Check the gradients numerically, just to be safe # quick_grad_check(loss_fun, W, (train_images[:50], train_labels[:50])) print(" Epoch | Train err | Test error ") def print_perf(epoch, W): test_perf = frac_err(W, test_images, test_labels) train_perf = frac_err(W, train_images, train_labels) print(f"{epoch:15}|{train_perf:15}|{test_perf:15}") # Train with sgd batch_idxs = make_batches(N_data, batch_size) cur_dir = np.zeros(N_weights) for epoch in range(num_epochs): print_perf(epoch, W) for idxs in batch_idxs: grad_W = loss_grad(W, train_images[idxs], train_labels[idxs]) cur_dir = momentum * cur_dir + (1.0 - momentum) * grad_W W -= learning_rate * cur_dir ================================================ FILE: examples/data.py ================================================ import data_mnist import matplotlib.image import matplotlib.pyplot as plt import autograd.numpy as np import autograd.numpy.random as npr def load_mnist(): partial_flatten = lambda x: np.reshape(x, (x.shape[0], np.prod(x.shape[1:]))) one_hot = lambda x, k: np.array(x[:, None] == np.arange(k)[None, :], dtype=int) train_images, train_labels, test_images, test_labels = data_mnist.mnist() train_images = partial_flatten(train_images) / 255.0 test_images = partial_flatten(test_images) / 255.0 train_labels = one_hot(train_labels, 10) test_labels = one_hot(test_labels, 10) N_data = train_images.shape[0] return N_data, train_images, train_labels, test_images, test_labels def plot_images( images, ax, ims_per_row=5, padding=5, digit_dimensions=(28, 28), cmap=matplotlib.cm.binary, vmin=None, vmax=None, ): """Images should be a (N_images x pixels) matrix.""" N_images = images.shape[0] N_rows 
= (N_images - 1) // ims_per_row + 1 pad_value = np.min(images.ravel()) concat_images = np.full( ( (digit_dimensions[0] + padding) * N_rows + padding, (digit_dimensions[1] + padding) * ims_per_row + padding, ), pad_value, ) for i in range(N_images): cur_image = np.reshape(images[i, :], digit_dimensions) row_ix = i // ims_per_row col_ix = i % ims_per_row row_start = padding + (padding + digit_dimensions[0]) * row_ix col_start = padding + (padding + digit_dimensions[1]) * col_ix concat_images[ row_start : row_start + digit_dimensions[0], col_start : col_start + digit_dimensions[1] ] = cur_image cax = ax.matshow(concat_images, cmap=cmap, vmin=vmin, vmax=vmax) plt.xticks(np.array([])) plt.yticks(np.array([])) return cax def save_images(images, filename, **kwargs): fig = plt.figure(1) fig.clf() ax = fig.add_subplot(111) plot_images(images, ax, **kwargs) fig.patch.set_visible(False) ax.patch.set_visible(False) plt.savefig(filename) def make_pinwheel(radial_std, tangential_std, num_classes, num_per_class, rate, rs=npr.RandomState(0)): """Based on code by Ryan P. 
Adams.""" rads = np.linspace(0, 2 * np.pi, num_classes, endpoint=False) features = rs.randn(num_classes * num_per_class, 2) * np.array([radial_std, tangential_std]) features[:, 0] += 1 labels = np.repeat(np.arange(num_classes), num_per_class) angles = rads[labels] + rate * np.exp(features[:, 0]) rotations = np.stack([np.cos(angles), -np.sin(angles), np.sin(angles), np.cos(angles)]) rotations = np.reshape(rotations.T, (-1, 2, 2)) return np.einsum("ti,tij->tj", features, rotations) ================================================ FILE: examples/data_mnist.py ================================================ import array import gzip import os import struct from urllib.request import urlretrieve import numpy as np def download(url, filename): if not os.path.exists("data"): os.makedirs("data") out_file = os.path.join("data", filename) if not os.path.isfile(out_file): urlretrieve(url, out_file) def mnist(): base_url = "https://storage.googleapis.com/cvdf-datasets/mnist/" def parse_labels(filename): with gzip.open(filename, "rb") as fh: magic, num_data = struct.unpack(">II", fh.read(8)) return np.array(array.array("B", fh.read()), dtype=np.uint8) def parse_images(filename): with gzip.open(filename, "rb") as fh: magic, num_data, rows, cols = struct.unpack(">IIII", fh.read(16)) return np.array(array.array("B", fh.read()), dtype=np.uint8).reshape(num_data, rows, cols) for filename in [ "train-images-idx3-ubyte.gz", "train-labels-idx1-ubyte.gz", "t10k-images-idx3-ubyte.gz", "t10k-labels-idx1-ubyte.gz", ]: download(base_url + filename, filename) train_images = parse_images("data/train-images-idx3-ubyte.gz") train_labels = parse_labels("data/train-labels-idx1-ubyte.gz") test_images = parse_images("data/t10k-images-idx3-ubyte.gz") test_labels = parse_labels("data/t10k-labels-idx1-ubyte.gz") return train_images, train_labels, test_images, test_labels ================================================ FILE: examples/deep_gaussian_process.py 
================================================ import matplotlib.pyplot as plt from gaussian_process import make_gp_funs, rbf_covariance from scipy.optimize import minimize import autograd.numpy as np import autograd.numpy.random as npr from autograd import value_and_grad def build_step_function_dataset(D=1, n_data=40, noise_std=0.1): rs = npr.RandomState(0) inputs = np.linspace(-2, 2, num=n_data) targets = np.sign(inputs) + rs.randn(n_data) * noise_std inputs = inputs.reshape((len(inputs), D)) return inputs, targets def build_deep_gp(input_dimension, hidden_dimension, covariance_function): # GP going from input to hidden num_params_layer1, predict_layer1, log_marginal_likelihood_layer1 = make_gp_funs( covariance_function, num_cov_params=input_dimension + 1 ) # GP going from hidden to output num_params_layer2, predict_layer2, log_marginal_likelihood_layer2 = make_gp_funs( covariance_function, num_cov_params=hidden_dimension + 1 ) num_hidden_params = hidden_dimension * n_data total_num_params = num_params_layer1 + num_params_layer2 + num_hidden_params def unpack_all_params(all_params): layer1_params = all_params[:num_params_layer1] layer2_params = all_params[num_params_layer1 : num_params_layer1 + num_params_layer2] hiddens = all_params[num_params_layer1 + num_params_layer2 :] return layer1_params, layer2_params, hiddens def combined_predict_fun(all_params, X, y, xs): layer1_params, layer2_params, hiddens = unpack_all_params(all_params) h_star_mean, h_star_cov = predict_layer1(layer1_params, X, hiddens, xs) y_star_mean, y_star_cov = predict_layer2( layer2_params, np.atleast_2d(hiddens).T, y, np.atleast_2d(h_star_mean).T ) return y_star_mean, y_star_cov def log_marginal_likelihood(all_params): layer1_params, layer2_params, h = unpack_all_params(all_params) return log_marginal_likelihood_layer1(layer1_params, X, h) + log_marginal_likelihood_layer2( layer2_params, np.atleast_2d(h).T, y ) predict_layer_funcs = [predict_layer1, predict_layer2] return ( 
total_num_params, log_marginal_likelihood, combined_predict_fun, unpack_all_params, predict_layer_funcs, ) if __name__ == "__main__": n_data = 20 input_dimension = 1 hidden_dimension = 1 X, y = build_step_function_dataset(D=input_dimension, n_data=n_data) ( total_num_params, log_marginal_likelihood, combined_predict_fun, unpack_all_params, predict_layer_funcs, ) = build_deep_gp(input_dimension, hidden_dimension, rbf_covariance) # Set up figure. fig = plt.figure(figsize=(12, 8), facecolor="white") ax_end_to_end = fig.add_subplot(311, frameon=False) ax_x_to_h = fig.add_subplot(312, frameon=False) ax_h_to_y = fig.add_subplot(313, frameon=False) plt.show(block=False) def plot_gp(ax, X, y, pred_mean, pred_cov, plot_xs): ax.cla() marg_std = np.sqrt(np.diag(pred_cov)) ax.plot(plot_xs, pred_mean, "b") ax.fill( np.concatenate([plot_xs, plot_xs[::-1]]), np.concatenate([pred_mean - 1.96 * marg_std, (pred_mean + 1.96 * marg_std)[::-1]]), alpha=0.15, fc="Blue", ec="None", ) # Show samples from posterior. rs = npr.RandomState(0) sampled_funcs = rs.multivariate_normal(pred_mean, pred_cov, size=10) ax.plot(plot_xs, sampled_funcs.T) ax.plot(X, y, "kx") ax.set_ylim([-1.5, 1.5]) ax.set_xticks([]) ax.set_yticks([]) def callback(params): print(f"Log marginal likelihood {log_marginal_likelihood(params)}") # Show posterior marginals. 
plot_xs = np.reshape(np.linspace(-5, 5, 300), (300, 1)) pred_mean, pred_cov = combined_predict_fun(params, X, y, plot_xs) plot_gp(ax_end_to_end, X, y, pred_mean, pred_cov, plot_xs) ax_end_to_end.set_title("X to y") layer1_params, layer2_params, hiddens = unpack_all_params(params) h_star_mean, h_star_cov = predict_layer_funcs[0](layer1_params, X, hiddens, plot_xs) y_star_mean, y_star_cov = predict_layer_funcs[0](layer2_params, np.atleast_2d(hiddens).T, y, plot_xs) plot_gp(ax_x_to_h, X, hiddens, h_star_mean, h_star_cov, plot_xs) ax_x_to_h.set_title("X to hiddens") plot_gp(ax_h_to_y, np.atleast_2d(hiddens).T, y, y_star_mean, y_star_cov, plot_xs) ax_h_to_y.set_title("hiddens to y") plt.draw() plt.pause(1.0 / 60.0) # Initialize covariance parameters and hiddens. rs = npr.RandomState(0) init_params = 0.1 * rs.randn(total_num_params) print("Optimizing covariance parameters...") objective = lambda params: -log_marginal_likelihood(params) cov_params = minimize(value_and_grad(objective), init_params, jac=True, method="CG", callback=callback) plt.pause(10.0) ================================================ FILE: examples/define_gradient.py ================================================ """This example shows how to define the gradient of your own functions. This can be useful for speed, numerical stability, or in cases where your code depends on external library calls.""" import autograd.numpy as np import autograd.numpy.random as npr from autograd import grad from autograd.extend import defvjp, primitive from autograd.test_util import check_grads # @primitive tells Autograd not to look inside this function, but instead # to treat it as a black box, whose gradient might be specified later. # Functions with this decorator can contain anything that Python knows # how to execute, and you can do things like in-place operations on arrays. 
@primitive def logsumexp(x): """Numerically stable log(sum(exp(x))), also defined in scipy.special""" max_x = np.max(x) return max_x + np.log(np.sum(np.exp(x - max_x))) # Next, we write a function that specifies the gradient with a closure. # The reason for the closure is so that the gradient can depend # on both the input to the original function (x), and the output of the # original function (ans). def logsumexp_vjp(ans, x): # If you want to be able to take higher-order derivatives, then all the # code inside this function must be itself differentiable by Autograd. # This closure multiplies g with the Jacobian of logsumexp (d_ans/d_x). # Because Autograd uses reverse-mode differentiation, g contains # the gradient of the objective w.r.t. ans, the output of logsumexp. # This returned VJP function doesn't close over `x`, so Python can # garbage-collect `x` if there are no references to it elsewhere. x_shape = x.shape return lambda g: np.full(x_shape, g) * np.exp(x - np.full(x_shape, ans)) # Now we tell Autograd that logsumexmp has a gradient-making function. defvjp(logsumexp, logsumexp_vjp) if __name__ == "__main__": # Now we can use logsumexp() inside a larger function that we want # to differentiate. def example_func(y): z = y**2 lse = logsumexp(z) return np.sum(lse) grad_of_example = grad(example_func) print("Gradient: \n", grad_of_example(npr.randn(10))) # Check the gradients numerically, just to be safe. check_grads(example_func, modes=["rev"])(npr.randn(10)) ================================================ FILE: examples/dot_graph.py ================================================ """Generates a graphviz DOT file of an evaluation trace. 
Usage (need the dot binary, from the graphviz package, www.graphviz.org): python2 dot_graph.py | dot -Tpdf -o graph.pdf """ import autograd.numpy as np from autograd.tracer import Node, trace class GraphNode(Node): # Records the full graph (could having this in tracer.py) def __init__(self, value, fun, args, kwargs, parent_argnums, parents): self.fun_name = fun.__name__ self.args = args self.parents = dict(zip(parent_argnums, parents)) self.isroot = False def initialize_root(self, x): self.isroot = True def __repr__(self): return f"node_{id(self)}" def trace_graph(f, x): start_node = GraphNode.new_root(x) _, node = trace(start_node, f, x) return node dot_edge = "{} -> {} [color=gray30];\n".format dot_function_node = '{} [label="{}", shape=box, color=lightblue, style=filled];\n'.format dot_variable_node = '{} [label="{}", color=orange, style=filled];\n'.format dot_graph = "digraph G {{{}}}".format def graph_to_dotfile(graph): visited = set() def node_to_fragment(node): visited.add(node) if node.isroot: return dot_variable_node(node, "input") fragment = dot_function_node(node, node.fun_name) for argnum, arg in enumerate(node.args): if argnum in node.parents: parent = node.parents[argnum] fragment += dot_edge(parent, node) if parent not in visited: fragment += node_to_fragment(parent) else: argnode = f"{node}_arg_{argnum}" fragment += dot_edge(argnode, node) fragment += dot_variable_node(argnode, arg) return fragment dot_body = node_to_fragment(graph) dot_body += dot_variable_node("output", "output") dot_body += dot_edge(graph, "output") return dot_graph(dot_body) if __name__ == "__main__": def fun(x): y = np.sin(x) return (y + np.exp(x) - 0.5) * y print(graph_to_dotfile(trace_graph(fun, 1.0))) ================================================ FILE: examples/fixed_points.py ================================================ import autograd.numpy as np from autograd import grad from autograd.misc.fixed_points import fixed_point def newton_sqrt_iter(a): return lambda x: 
0.5 * (x + a / x) def grad_descent_sqrt_iter(a): return lambda x: x - 0.05 * (x**2 - a) def sqrt(a, guess=10.0): # return fixed_point(newton_sqrt_iter, a, guess, distance, 1e-4) return fixed_point(grad_descent_sqrt_iter, a, guess, distance, 1e-4) def distance(x, y): return np.abs(x - y) print(np.sqrt(2.0)) print(sqrt(2.0)) print() print(grad(np.sqrt)(2.0)) print(grad(sqrt)(2.0)) print() print(grad(grad(np.sqrt))(2.0)) print(grad(grad(sqrt))(2.0)) print() ================================================ FILE: examples/fluidsim/fluidsim.py ================================================ import os import matplotlib import matplotlib.pyplot as plt from matplotlib.pyplot import imread from scipy.optimize import minimize import autograd.numpy as np from autograd import value_and_grad # Fluid simulation code based on # "Real-Time Fluid Dynamics for Games" by Jos Stam # https://www.josstam.com/_files/ugd/cf1fd6_9989229efbd34a26ba5ccd913721a2ac.pdf def project(vx, vy): """Project the velocity field to be approximately mass-conserving, using a few iterations of Gauss-Seidel.""" p = np.zeros(vx.shape) h = 1.0 / vx.shape[0] div = ( -0.5 * h * ( np.roll(vx, -1, axis=0) - np.roll(vx, 1, axis=0) + np.roll(vy, -1, axis=1) - np.roll(vy, 1, axis=1) ) ) for k in range(10): p = ( div + np.roll(p, 1, axis=0) + np.roll(p, -1, axis=0) + np.roll(p, 1, axis=1) + np.roll(p, -1, axis=1) ) / 4.0 vx -= 0.5 * (np.roll(p, -1, axis=0) - np.roll(p, 1, axis=0)) / h vy -= 0.5 * (np.roll(p, -1, axis=1) - np.roll(p, 1, axis=1)) / h return vx, vy def advect(f, vx, vy): """Move field f according to x and y velocities (u and v) using an implicit Euler integrator.""" rows, cols = f.shape cell_ys, cell_xs = np.meshgrid(np.arange(rows), np.arange(cols)) center_xs = (cell_xs - vx).ravel() center_ys = (cell_ys - vy).ravel() # Compute indices of source cells. left_ix = np.floor(center_xs).astype(int) top_ix = np.floor(center_ys).astype(int) rw = center_xs - left_ix # Relative weight of right-hand cells. 
bw = center_ys - top_ix # Relative weight of bottom cells. left_ix = np.mod(left_ix, rows) # Wrap around edges of simulation. right_ix = np.mod(left_ix + 1, rows) top_ix = np.mod(top_ix, cols) bot_ix = np.mod(top_ix + 1, cols) # A linearly-weighted sum of the 4 surrounding cells. flat_f = (1 - rw) * ((1 - bw) * f[left_ix, top_ix] + bw * f[left_ix, bot_ix]) + rw * ( (1 - bw) * f[right_ix, top_ix] + bw * f[right_ix, bot_ix] ) return np.reshape(flat_f, (rows, cols)) def simulate(vx, vy, smoke, num_time_steps, ax=None, render=False): print("Running simulation...") for t in range(num_time_steps): if ax: plot_matrix(ax, smoke, t, render) vx_updated = advect(vx, vx, vy) vy_updated = advect(vy, vx, vy) vx, vy = project(vx_updated, vy_updated) smoke = advect(smoke, vx, vy) if ax: plot_matrix(ax, smoke, num_time_steps, render) return smoke def plot_matrix(ax, mat, t, render=False): plt.cla() ax.matshow(mat) ax.set_xticks([]) ax.set_yticks([]) plt.draw() if render: matplotlib.image.imsave(f"step{t:03d}.png", mat) plt.pause(0.001) if __name__ == "__main__": simulation_timesteps = 100 basepath = os.path.dirname(__file__) print("Loading initial and target states...") init_smoke = imread(os.path.join(basepath, "init_smoke.png"))[:, :, 0] # target = imread('peace.png')[::2,::2,3] target = imread(os.path.join(basepath, "skull.png"))[::2, ::2] rows, cols = target.shape init_dx_and_dy = np.zeros((2, rows, cols)).ravel() def distance_from_target_image(smoke): return np.mean((target - smoke) ** 2) def convert_param_vector_to_matrices(params): vx = np.reshape(params[: (rows * cols)], (rows, cols)) vy = np.reshape(params[(rows * cols) :], (rows, cols)) return vx, vy def objective(params): init_vx, init_vy = convert_param_vector_to_matrices(params) final_smoke = simulate(init_vx, init_vy, init_smoke, simulation_timesteps) return distance_from_target_image(final_smoke) # Specify gradient of objective function using autograd. 
# (Continuation of the `if __name__ == "__main__":` block of examples/fluidsim/fluidsim.py.)
    objective_with_grad = value_and_grad(objective)

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111, frameon=False)

    def callback(params):
        # Re-run the simulation with drawing enabled to show the current iterate.
        init_vx, init_vy = convert_param_vector_to_matrices(params)
        simulate(init_vx, init_vy, init_smoke, simulation_timesteps, ax)

    print("Optimizing initial conditions...")
    result = minimize(
        objective_with_grad,
        init_dx_and_dy,
        jac=True,
        method="CG",
        options={"maxiter": 25, "disp": True},
        callback=callback,
    )

    print("Rendering optimized flow...")
    init_vx, init_vy = convert_param_vector_to_matrices(result.x)
    simulate(init_vx, init_vy, init_smoke, simulation_timesteps, ax, render=True)

    print("Converting frames to an animated GIF...")
    os.system("convert -delay 5 -loop 0 step*.png -delay 250 step100.png surprise.gif")  # Using imagemagick.
    os.system("rm step*.png")


================================================
FILE: examples/fluidsim/wing.py
================================================
import os

import matplotlib.pyplot as plt
from scipy.optimize import minimize

import autograd.numpy as np
from autograd import value_and_grad

rows, cols = 40, 60

# Fluid simulation code based on
# "Real-Time Fluid Dynamics for Games" by Jos Stam
# http://www.intpowertechcorp.com/GDC03.pdf


def occlude(f, occlusion):
    """Scale field f by (1 - occlusion)."""
    return f * (1 - occlusion)


def project(vx, vy, occlusion):
    """Project the velocity field to be approximately mass-conserving,
    using a few iterations of Gauss-Seidel."""
    p = np.zeros(vx.shape)
    div = -0.5 * (
        np.roll(vx, -1, axis=1) - np.roll(vx, 1, axis=1) + np.roll(vy, -1, axis=0) - np.roll(vy, 1, axis=0)
    )
    div = make_continuous(div, occlusion)

    for k in range(50):
        p = (
            div
            + np.roll(p, 1, axis=1)
            + np.roll(p, -1, axis=1)
            + np.roll(p, 1, axis=0)
            + np.roll(p, -1, axis=0)
        ) / 4.0
        p = make_continuous(p, occlusion)

    vx = vx - 0.5 * (np.roll(p, -1, axis=1) - np.roll(p, 1, axis=1))
    vy = vy - 0.5 * (np.roll(p, -1, axis=0) - np.roll(p, 1, axis=0))

    vx = occlude(vx, occlusion)
    vy = occlude(vy, occlusion)
    return vx, vy


def advect(f, vx, vy):
    """Move field f according to x and y velocities (u and v)
    using an implicit Euler integrator."""
    rows, cols = f.shape
    cell_xs, cell_ys = np.meshgrid(np.arange(cols), np.arange(rows))
    center_xs = (cell_xs - vx).ravel()
    center_ys = (cell_ys - vy).ravel()

    # Compute indices of source cells.
    left_ix = np.floor(center_ys).astype(int)
    top_ix = np.floor(center_xs).astype(int)
    rw = center_ys - left_ix  # Relative weight of right-hand cells.
    bw = center_xs - top_ix  # Relative weight of bottom cells.
    left_ix = np.mod(left_ix, rows)  # Wrap around edges of simulation.
    right_ix = np.mod(left_ix + 1, rows)
    top_ix = np.mod(top_ix, cols)
    bot_ix = np.mod(top_ix + 1, cols)

    # A linearly-weighted sum of the 4 surrounding cells.
    flat_f = (1 - rw) * ((1 - bw) * f[left_ix, top_ix] + bw * f[left_ix, bot_ix]) + rw * (
        (1 - bw) * f[right_ix, top_ix] + bw * f[right_ix, bot_ix]
    )
    return np.reshape(flat_f, (rows, cols))


def make_continuous(f, occlusion):
    """Blend f, where occluded, toward the weighted mean of its four rolled neighbours.

    Neighbours are weighted by their own (1 - occlusion); the 0.001 in the
    denominator keeps the division finite when all four weights are zero.
    """
    non_occluded = 1 - occlusion
    num = (
        np.roll(f, 1, axis=0) * np.roll(non_occluded, 1, axis=0)
        + np.roll(f, -1, axis=0) * np.roll(non_occluded, -1, axis=0)
        + np.roll(f, 1, axis=1) * np.roll(non_occluded, 1, axis=1)
        + np.roll(f, -1, axis=1) * np.roll(non_occluded, -1, axis=1)
    )
    den = (
        np.roll(non_occluded, 1, axis=0)
        + np.roll(non_occluded, -1, axis=0)
        + np.roll(non_occluded, 1, axis=1)
        + np.roll(non_occluded, -1, axis=1)
    )
    return f * non_occluded + (1 - non_occluded) * num / (den + 0.001)


def sigmoid(x):
    return 0.5 * (np.tanh(x) + 1.0)  # Output ranges from 0 to 1.


def simulate(vx, vy, num_time_steps, occlusion, ax=None, render=False):
    occlusion = sigmoid(occlusion)

    # Disallow occlusion outside a certain area.
    mask = np.zeros((rows, cols))
    mask[10:30, 10:30] = 1.0
    occlusion = occlusion * mask

    # Initialize smoke bands.
# (Continuation of simulate() in examples/fluidsim/wing.py.)
    red_smoke = np.zeros((rows, cols))
    red_smoke[rows // 4 : rows // 2] = 1

    blue_smoke = np.zeros((rows, cols))
    blue_smoke[rows // 2 : 3 * rows // 4] = 1

    print("Running simulation...")
    vx, vy = project(vx, vy, occlusion)
    for t in range(num_time_steps):
        plot_matrix(ax, red_smoke, occlusion, blue_smoke, t, render)
        # Self-advect the velocity, re-project it, then carry both smoke bands along it.
        vx_updated = advect(vx, vx, vy)
        vy_updated = advect(vy, vx, vy)
        vx, vy = project(vx_updated, vy_updated, occlusion)
        red_smoke = advect(red_smoke, vx, vy)
        red_smoke = occlude(red_smoke, occlusion)
        blue_smoke = advect(blue_smoke, vx, vy)
        blue_smoke = occlude(blue_smoke, occlusion)
    plot_matrix(ax, red_smoke, occlusion, blue_smoke, num_time_steps, render)
    return vx, vy


def plot_matrix(ax, r, g, b, t, render=False):
    """Draw r, g, b as the colour channels of one image on `ax`; does nothing when `ax` is falsy.

    With render=True the figure is also saved as step<t>.png.
    """
    if ax:
        plt.cla()
        ax.imshow(np.concatenate((r[..., np.newaxis], g[..., np.newaxis], b[..., np.newaxis]), axis=2))
        ax.set_xticks([])
        ax.set_yticks([])
        plt.draw()
        if render:
            plt.savefig(f"step{t:03d}.png", bbox_inches="tight")
        plt.pause(0.001)


if __name__ == "__main__":
    simulation_timesteps = 20

    print("Loading initial and target states...")
    init_vx = np.ones((rows, cols))
    init_vy = np.zeros((rows, cols))

    # Initialize the occlusion to be a block.
    init_occlusion = -np.ones((rows, cols))
    init_occlusion[15:25, 15:25] = 0.0
    init_occlusion = init_occlusion.ravel()

    def drag(vx):
        return np.mean(init_vx - vx)

    def lift(vy):
        return np.mean(vy - init_vy)

    def objective(params):
        # Minimising -lift/drag maximises the lift-to-drag ratio.
        cur_occlusion = np.reshape(params, (rows, cols))
        final_vx, final_vy = simulate(init_vx, init_vy, simulation_timesteps, cur_occlusion)
        return -lift(final_vy) / drag(final_vx)

    # Specify gradient of objective function using autograd.
objective_with_grad = value_and_grad(objective) fig = plt.figure(figsize=(8, 8)) ax = fig.add_subplot(111, frameon=False) def callback(weights): cur_occlusion = np.reshape(weights, (rows, cols)) simulate(init_vx, init_vy, simulation_timesteps, cur_occlusion, ax) print("Rendering initial flow...") callback(init_occlusion) print("Optimizing initial conditions...") result = minimize( objective_with_grad, init_occlusion, jac=True, method="CG", options={"maxiter": 50, "disp": True}, callback=callback, ) print("Rendering optimized flow...") final_occlusion = np.reshape(result.x, (rows, cols)) simulate(init_vx, init_vy, simulation_timesteps, final_occlusion, ax, render=True) print("Converting frames to an animated GIF...") # Using imagemagick. os.system(f"convert -delay 5 -loop 0 step*.png -delay 250 step{simulation_timesteps:03d}.png wing.gif") os.system("rm step*.png") ================================================ FILE: examples/gaussian_process.py ================================================ import matplotlib.pyplot as plt from scipy.optimize import minimize import autograd.numpy as np import autograd.numpy.random as npr import autograd.scipy.stats.multivariate_normal as mvn from autograd import value_and_grad from autograd.numpy.linalg import solve def make_gp_funs(cov_func, num_cov_params): """Functions that perform Gaussian process regression. 
cov_func has signature (cov_params, x, x')""" def unpack_kernel_params(params): mean = params[0] cov_params = params[2:] noise_scale = np.exp(params[1]) + 0.0001 return mean, cov_params, noise_scale def predict(params, x, y, xstar): """Returns the predictive mean and covariance at locations xstar, of the latent function value f (without observation noise).""" mean, cov_params, noise_scale = unpack_kernel_params(params) cov_f_f = cov_func(cov_params, xstar, xstar) cov_y_f = cov_func(cov_params, x, xstar) cov_y_y = cov_func(cov_params, x, x) + noise_scale * np.eye(len(y)) pred_mean = mean + np.dot(solve(cov_y_y, cov_y_f).T, y - mean) pred_cov = cov_f_f - np.dot(solve(cov_y_y, cov_y_f).T, cov_y_f) return pred_mean, pred_cov def log_marginal_likelihood(params, x, y): mean, cov_params, noise_scale = unpack_kernel_params(params) cov_y_y = cov_func(cov_params, x, x) + noise_scale * np.eye(len(y)) prior_mean = mean * np.ones(len(y)) return mvn.logpdf(y, prior_mean, cov_y_y) return num_cov_params + 2, predict, log_marginal_likelihood # Define an example covariance function. def rbf_covariance(kernel_params, x, xp): output_scale = np.exp(kernel_params[0]) lengthscales = np.exp(kernel_params[1:]) diffs = np.expand_dims(x / lengthscales, 1) - np.expand_dims(xp / lengthscales, 0) return output_scale * np.exp(-0.5 * np.sum(diffs**2, axis=2)) def build_toy_dataset(D=1, n_data=20, noise_std=0.1): rs = npr.RandomState(0) inputs = np.concatenate([np.linspace(0, 3, num=n_data // 2), np.linspace(6, 8, num=n_data // 2)]) targets = (np.cos(inputs) + rs.randn(n_data) * noise_std) / 2.0 inputs = (inputs - 4.0) / 2.0 inputs = inputs.reshape((len(inputs), D)) return inputs, targets if __name__ == "__main__": D = 1 # Build model and objective function. num_params, predict, log_marginal_likelihood = make_gp_funs(rbf_covariance, num_cov_params=D + 1) X, y = build_toy_dataset(D=D) objective = lambda params: -log_marginal_likelihood(params, X, y) # Set up figure. 
# (Continuation of the `if __name__ == "__main__":` block of examples/gaussian_process.py.)
    fig = plt.figure(figsize=(12, 8), facecolor="white")
    ax = fig.add_subplot(111, frameon=False)
    plt.show(block=False)

    def callback(params):
        print(f"Log likelihood {-objective(params)}")
        plt.cla()

        # Show posterior marginals.
        plot_xs = np.reshape(np.linspace(-7, 7, 300), (300, 1))
        pred_mean, pred_cov = predict(params, X, y, plot_xs)
        marg_std = np.sqrt(np.diag(pred_cov))
        ax.plot(plot_xs, pred_mean, "b")
        ax.fill(
            np.concatenate([plot_xs, plot_xs[::-1]]),
            np.concatenate([pred_mean - 1.96 * marg_std, (pred_mean + 1.96 * marg_std)[::-1]]),
            alpha=0.15,
            fc="Blue",
            ec="None",
        )

        # Show samples from posterior.
        rs = npr.RandomState(0)
        sampled_funcs = rs.multivariate_normal(pred_mean, pred_cov, size=10)
        ax.plot(plot_xs, sampled_funcs.T)

        ax.plot(X, y, "kx")
        ax.set_ylim([-1.5, 1.5])
        ax.set_xticks([])
        ax.set_yticks([])
        plt.draw()
        plt.pause(1.0 / 60.0)

    # Initialize covariance parameters
    rs = npr.RandomState(0)
    init_params = 0.1 * rs.randn(num_params)

    print("Optimizing covariance parameters...")
    cov_params = minimize(value_and_grad(objective), init_params, jac=True, method="CG", callback=callback)
    plt.pause(10.0)


================================================
FILE: examples/generative_adversarial_net.py
================================================
# Implements a Generative Adversarial Network, from
# arxiv.org/abs/1406.2661
# but, it always collapses to generating a single image.
# Let me know if you can get it to work! - David Duvenaud

from data import load_mnist, save_images

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad
from autograd.misc import flatten

### Define generator, discriminator, and objective ###


def relu(x):
    return np.maximum(0, x)


def sigmoid(x):
    return 0.5 * (np.tanh(x) + 1.0)


def logsigmoid(x):
    return x - np.logaddexp(0, x)


# NOTE: the default `rs` is created once at definition time, so successive
# calls that rely on the default continue the same random stream.
def init_random_params(scale, layer_sizes, rs=npr.RandomState(0)):
    """Build a list of (weights, biases) tuples,
    one for each layer in the net."""
    return [
        (
            scale * rs.randn(m, n),  # weight matrix
            scale * rs.randn(n),
        )  # bias vector
        for m, n in zip(layer_sizes[:-1], layer_sizes[1:])
    ]


def batch_normalize(activations):
    """Centre each column on its batch mean and divide by (batch std + 1)."""
    mbmean = np.mean(activations, axis=0, keepdims=True)
    return (activations - mbmean) / (np.std(activations, axis=0, keepdims=True) + 1)


def neural_net_predict(params, inputs):
    """Params is a list of (weights, bias) tuples.
    inputs is an (N x D) matrix."""
    inpW, inpb = params[0]
    inputs = relu(np.dot(inputs, inpW) + inpb)
    # Hidden layers (all but the first and last) are batch-normalized before the relu.
    for W, b in params[1:-1]:
        outputs = batch_normalize(np.dot(inputs, W) + b)
        inputs = relu(outputs)
    outW, outb = params[-1]
    outputs = np.dot(inputs, outW) + outb
    return outputs


def generate_from_noise(gen_params, num_samples, noise_dim, rs):
    """Push uniform noise from `rs` through the generator and squash the output with sigmoid."""
    noise = rs.rand(num_samples, noise_dim)
    samples = neural_net_predict(gen_params, noise)
    return sigmoid(samples)


def gan_objective(gen_params, dsc_params, real_data, num_samples, noise_dim, rs):
    """Mean discriminator log-sigmoid on real data minus the same on generated data."""
    fake_data = generate_from_noise(gen_params, num_samples, noise_dim, rs)
    logprobs_fake = logsigmoid(neural_net_predict(dsc_params, fake_data))
    logprobs_real = logsigmoid(neural_net_predict(dsc_params, real_data))
    return np.mean(logprobs_real) - np.mean(logprobs_fake)


### Define minimax version of adam optimizer ###


def adam_minimax(
    grad_both,
    init_params_max,
    init_params_min,
    callback=None,
    num_iters=100,
    step_size_max=0.001,
    step_size_min=0.001,
    b1=0.9,
    b2=0.999,
    eps=10**-8,
):
    """Adam modified to do minimax optimization, for instance to help with
    training generative adversarial networks."""

    x_max, unflatten_max = flatten(init_params_max)
    x_min, unflatten_min = flatten(init_params_min)

    m_max = np.zeros(len(x_max))
    v_max = np.zeros(len(x_max))
    m_min = np.zeros(len(x_min))
    v_min = np.zeros(len(x_min))
    for i in range(num_iters):
        g_max_uf, g_min_uf = grad_both(unflatten_max(x_max), unflatten_min(x_min), i)
        g_max, _ = flatten(g_max_uf)
        g_min, _ = flatten(g_min_uf)

        if callback:
            callback(
                unflatten_max(x_max), unflatten_min(x_min), i, unflatten_max(g_max), unflatten_min(g_min)
            )

        # The "max" parameters take an ascent step (note the `+` in the update).
        m_max = (1 - b1) * g_max + b1 * m_max  # First moment estimate.
        v_max = (1 - b2) * (g_max**2) + b2 * v_max  # Second moment estimate.
        mhat_max = m_max / (1 - b1 ** (i + 1))  # Bias correction.
        vhat_max = v_max / (1 - b2 ** (i + 1))
        x_max = x_max + step_size_max * mhat_max / (np.sqrt(vhat_max) + eps)

        m_min = (1 - b1) * g_min + b1 * m_min  # First moment estimate.
        v_min = (1 - b2) * (g_min**2) + b2 * v_min  # Second moment estimate.
        mhat_min = m_min / (1 - b1 ** (i + 1))  # Bias correction.
vhat_min = v_min / (1 - b2 ** (i + 1)) x_min = x_min - step_size_min * mhat_min / (np.sqrt(vhat_min) + eps) return unflatten_max(x_max), unflatten_min(x_min) ### Setup and run on MNIST ### if __name__ == "__main__": # Model hyper-parameters noise_dim = 10 gen_layer_sizes = [noise_dim, 200, 784] dsc_layer_sizes = [784, 200, 1] # Training parameters param_scale = 0.001 batch_size = 100 num_epochs = 50 step_size_max = 0.01 step_size_min = 0.01 print("Loading training data...") N, train_images, _, test_images, _ = load_mnist() init_gen_params = init_random_params(param_scale, gen_layer_sizes) init_dsc_params = init_random_params(param_scale, dsc_layer_sizes) num_batches = int(np.ceil(len(train_images) / batch_size)) def batch_indices(iter): idx = iter % num_batches return slice(idx * batch_size, (idx + 1) * batch_size) # Define training objective seed = npr.RandomState(0) def objective(gen_params, dsc_params, iter): idx = batch_indices(iter) return gan_objective(gen_params, dsc_params, train_images[idx], batch_size, noise_dim, seed) # Get gradients of objective using autograd. both_objective_grad = grad(objective, argnum=(0, 1)) print(" Epoch | Objective | Fake probability | Real Probability ") def print_perf(gen_params, dsc_params, iter, gen_gradient, dsc_gradient): if iter % 10 == 0: ability = np.mean(objective(gen_params, dsc_params, iter)) fake_data = generate_from_noise(gen_params, 20, noise_dim, seed) real_data = train_images[batch_indices(iter)] probs_fake = np.mean(sigmoid(neural_net_predict(dsc_params, fake_data))) probs_real = np.mean(sigmoid(neural_net_predict(dsc_params, real_data))) print(f"{iter // num_batches:15}|{ability:20}|{probs_fake:20}|{probs_real:20}") save_images(fake_data, "gan_samples.png", vmin=0, vmax=1) # The optimizers provided can optimize lists, tuples, or dicts of parameters. 
def plot_ellipse(ax, mean, cov_sqrt, alpha, num_points=100):
    """Draw the outline of one Gaussian component on ``ax``.

    A circle of radius 2 is mapped through ``cov_sqrt`` and shifted by
    ``mean``; the resulting closed curve is drawn as a solid line.

    Args:
        ax: Object with a matplotlib-style ``plot`` method.
        mean: Centre of the ellipse (length-2 vector).
        cov_sqrt: 2x2 matrix applied to the circle points from the right.
        alpha: Opacity forwarded to ``ax.plot``.
        num_points: Number of points used to trace the outline.
    """
    theta = np.linspace(0, 2 * np.pi, num_points)
    circle = np.vstack([np.cos(theta), np.sin(theta)]).T
    outline = mean + np.dot(circle * 2.0, cov_sqrt)
    xs, ys = outline[:, 0], outline[:, 1]
    ax.plot(xs, ys, "-", alpha=alpha)
if __name__ == "__main__":
    # Demo script: jointly optimize GP hyperparameters and latent coordinates
    # for pinwheel data, redrawing the latent embedding after every optimizer step.
    data_dimension = 2  # Normally the data dimension would be much higher.
    latent_dimension = 2

    # Build model and objective function.
    # NOTE(review): make_gp_funs lives in gaussian_process.py; judging by the
    # names it returns the per-GP parameter count, a predictor and the log
    # marginal likelihood -- confirm against that module.
    params_per_gp, predict, log_marginal_likelihood = make_gp_funs(
        rbf_covariance, num_cov_params=latent_dimension + 1
    )
    # One independent GP (with its own parameters) per observed dimension.
    total_gp_params = data_dimension * params_per_gp

    data = make_pinwheel(radial_std=0.3, tangential_std=0.05, num_classes=3, num_per_class=30, rate=0.4)
    datalen = data.shape[0]

    # Every data point gets its own latent coordinate vector.
    num_latent_params = datalen * latent_dimension

    def unpack_params(params):
        """Split the flat vector into GP parameters and latent coordinates.

        The first ``total_gp_params`` entries become a
        ``(data_dimension, params_per_gp)`` array; the remainder becomes a
        ``(datalen, latent_dimension)`` array.
        """
        gp_params = np.reshape(params[:total_gp_params], (data_dimension, params_per_gp))
        latents = np.reshape(params[total_gp_params:], (datalen, latent_dimension))
        return gp_params, latents

    def objective(params):
        """Return the negative of (summed GP log marginal likelihoods + latent log prior)."""
        gp_params, latents = unpack_params(params)
        # Each observed column is modelled by its own GP over the shared latents.
        gp_likelihood = sum(
            [log_marginal_likelihood(gp_params[i], latents, data[:, i]) for i in range(data_dimension)]
        )
        # norm.logpdf is called with its default location and scale.
        latent_prior_likelihood = np.sum(norm.logpdf(latents))
        return -gp_likelihood - latent_prior_likelihood

    # Set up figure.
    fig = plt.figure(figsize=(12, 8), facecolor="white")
    latent_ax = fig.add_subplot(121, frameon=False)
    data_ax = fig.add_subplot(122, frameon=False)
    plt.show(block=False)

    def callback(params):
        """Print progress and redraw both panels; passed to ``minimize`` as its callback."""
        # The printed value is -objective, i.e. it includes the latent prior term.
        print(f"Log likelihood {-objective(params)}")
        gp_params, latents = unpack_params(params)

        # Right panel: the (fixed) observed data.
        data_ax.cla()
        data_ax.plot(data[:, 0], data[:, 1], "bx")
        data_ax.set_xticks([])
        data_ax.set_yticks([])
        data_ax.set_title("Observed Data")

        # Left panel: current estimate of the latent coordinates.
        latent_ax.cla()
        latent_ax.plot(latents[:, 0], latents[:, 1], "kx")
        latent_ax.set_xticks([])
        latent_ax.set_yticks([])
        latent_ax.set_xlim([-2, 2])
        latent_ax.set_ylim([-2, 2])
        latent_ax.set_title("Latent coordinates")

        plt.draw()
        # Short pause so the non-blocking figure window gets a chance to refresh.
        plt.pause(1.0 / 60.0)

    # Initialize covariance parameters and latent locations with small random values.
    rs = npr.RandomState(1)
    init_params = rs.randn(total_gp_params + num_latent_params) * 0.1

    print("Optimizing covariance parameters and latent variable locations...")
    # jac=True: value_and_grad(objective) returns (value, gradient) in one call.
    minimize(value_and_grad(objective), init_params, jac=True, method="CG", callback=callback)
def build_dataset(filename, max_lines=-1):
    """Load a text file and turn each non-blank line into an encoded sequence.

    Every character of ``string.printable`` is mapped to its index in that
    string; any other character is mapped to one extra "unknown" code. Lines
    of two characters or fewer (counting the trailing newline) are treated as
    blank and dropped.

    Args:
        filename: Path of the text file to read.
        max_lines: Maximum number of non-blank lines to keep. A negative value
            (the default) or ``None`` keeps every line.

    Returns:
        Tuple ``(encoded_lines, num_outputs)`` where ``encoded_lines`` is a
        list of integer arrays (one per kept line) and ``num_outputs`` is the
        number of distinct codes, ``len(string.printable) + 1``.
    """
    encodings = {char: code for code, char in enumerate(string.printable)}
    unknown_code = len(encodings)

    with open(filename) as f:
        lines = [line for line in f if len(line) > 2]

    # A plain ``lines[:max_lines]`` with the default of -1 would silently drop
    # the last line, so only slice when a real (non-negative) limit is given.
    if max_lines is not None and max_lines >= 0:
        lines = lines[:max_lines]

    encoded_lines = [np.array([encodings.get(char, unknown_code) for char in line]) for line in lines]
    num_outputs = len(encodings) + 1
    return encoded_lines, num_outputs
def color_scatter(ax, xs, ys):
    """Scatter the points ``(xs[i], ys[i])`` on ``ax``, one colour per point.

    Colours are taken from the ``rainbow`` colormap sampled evenly over
    ``len(ys)`` steps, so a point's colour encodes its position in the
    sequence.
    """
    palette = cm.rainbow(np.linspace(0, 1, len(ys)))
    for index, (x_val, y_val) in enumerate(zip(xs, ys)):
        ax.scatter(x_val, y_val, color=palette[index])
def callback(params):
    """Report progress and redraw the estimated weights and latents.

    Passed to ``minimize`` as its ``callback`` with the current flat
    parameter vector.
    """
    weights, latents, noise_std = unpack_params(params)
    # -objective(params) is the log-probability divided by n_samples.
    print(f"Log likelihood {-objective(params)}, noise_std {noise_std}")

    # Estimated weights: first weight column against the second.
    ax_est_weights.cla()
    ax_est_weights.scatter(weights[:, 0], weights[:, 1])
    ax_est_weights.set_title("Estimated weights")

    # Estimated latents, coloured by sample index via color_scatter.
    ax_est_latents.cla()
    color_scatter(ax_est_latents, latents[0, :], latents[1, :])
    ax_est_latents.set_title("Estimated latents")

    # cla() restores default ticks, so hide them again on every redraw.
    ax_est_weights.set_yticks([])
    ax_est_latents.set_yticks([])
    ax_est_weights.set_xticks([])
    ax_est_latents.set_xticks([])

    plt.draw()
    # Short pause so the non-blocking figure window gets a chance to refresh.
    plt.pause(1.0 / 60.0)
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of ``x``, elementwise.

    Implemented through the identity ``sigmoid(x) = (tanh(x / 2) + 1) / 2``.

    Args:
        x: Scalar or array of real values.

    Returns:
        Value(s) in the open interval (0, 1), same shape as ``x``.
    """
    # The argument must be halved: 0.5 * (tanh(x) + 1) equals sigmoid(2 * x).
    return 0.5 * (np.tanh(x / 2.0) + 1)
def init_lstm_params(input_size, state_size, output_size, param_scale=0.01, rs=npr.RandomState(0)):
    """Draw random initial parameters for the LSTM.

    Every array is filled with ``rs.randn(...) * param_scale``. The gate
    matrices have ``input_size + state_size + 1`` rows and the output matrix
    ``state_size + 1`` rows.

    Returns:
        Dict with keys "init cells", "init hiddens", "change", "forget",
        "ingate", "outgate" and "predict".
    """
    gate_shape = (input_size + state_size + 1, state_size)
    # Listed in draw order so the random stream is consumed deterministically.
    layout = [
        ("init cells", (1, state_size)),
        ("init hiddens", (1, state_size)),
        ("change", gate_shape),
        ("forget", gate_shape),
        ("ingate", gate_shape),
        ("outgate", gate_shape),
        ("predict", (state_size + 1, output_size)),
    ]
    params = {}
    for key, shape in layout:
        params[key] = rs.randn(*shape) * param_scale
    return params
def lstm_log_likelihood(params, inputs, targets):
    """Return the average log-probability the LSTM assigns to ``targets``.

    For each time step ``t`` the predicted log-probabilities are multiplied
    elementwise with ``targets[t]`` and summed; the total is divided by the
    number of time steps times the number of examples, both taken from
    ``inputs.shape``.
    """
    predicted = lstm_predict(params, inputs)
    num_time_steps, num_examples, _ = inputs.shape
    per_step = [np.sum(predicted[step] * targets[step]) for step in range(num_time_steps)]
    total = 0.0
    for contribution in per_step:
        total = total + contribution
    normalizer = num_time_steps * num_examples
    return total / normalizer
def diag_gaussian_log_density(x, mu, log_std):
    """Log-density of ``x`` under a Gaussian with diagonal covariance.

    The per-dimension normal log-densities (mean ``mu``, standard deviation
    ``exp(log_std)``) are summed over the last axis.
    """
    std = np.exp(log_std)
    per_dimension = norm.logpdf(x, mu, std)
    return np.sum(per_dimension, axis=-1)
def build_mog_bbsvi(logprob, num_samples, k=10, rs=npr.RandomState(0)):
    """Build black-box variational inference with a mixture-of-Gaussians posterior.

    The variational parameter vector consists of ``k`` unnormalized log mixture
    weights followed by the parameters of ``k`` diagonal-Gaussian components.

    Args:
        logprob: Callable ``(samples, t)`` returning the (unnormalized) target
            log-density of each sample.
        num_samples: Number of samples drawn per component when estimating
            the lower bound.
        k: Number of mixture components.
        rs: Random state used for the samples drawn inside ``mixture_elbo``.

    Returns:
        Tuple ``(init_var_params, mixture_elbo, mixture_log_density, sample)``.
    """
    init_component_var_params = init_gaussian_var_params
    component_log_density = variational_log_density_gaussian
    component_sample = sample_diag_gaussian

    def unpack_mixture_params(mixture_params):
        """Split into normalized log weights and a (k, -1) array of component parameters."""
        log_weights = log_normalize(mixture_params[:k])
        var_params = np.reshape(mixture_params[k:], (k, -1))
        return log_weights, var_params

    def init_var_params(D, rs=npr.RandomState(0), **kwargs):
        """Return initial mixture parameters: equal log weights plus k random components."""
        log_weights = np.ones(k)
        component_weights = [init_component_var_params(D, rs=rs, **kwargs) for _ in range(k)]
        return np.concatenate([log_weights] + component_weights)

    def sample(var_mixture_params, num_samples, rs):
        """Draw ``num_samples`` points from the mixture using random state ``rs``.

        Sample locations aren't a continuous function of parameters
        due to multinomial sampling.
        """
        log_weights, var_params = unpack_mixture_params(var_mixture_params)
        # One candidate per component for every draw: (num_samples, k, D).
        samples = np.concatenate(
            [component_sample(params_k, num_samples, rs)[:, np.newaxis, :] for params_k in var_params],
            axis=1,
        )
        # Pick the component with the caller's random state (not the global
        # generator) so a seeded ``rs`` makes the whole draw reproducible.
        ixs = rs.choice(k, size=num_samples, p=np.exp(log_weights))
        return np.array([samples[i, ix, :] for i, ix in enumerate(ixs)])

    def mixture_log_density(var_mixture_params, x):
        """Returns a weighted average over component densities."""
        log_weights, var_params = unpack_mixture_params(var_mixture_params)
        component_log_densities = np.vstack(
            [component_log_density(params_k, x) for params_k in var_params]
        ).T
        return logsumexp(component_log_densities + log_weights, axis=1, keepdims=False)

    def mixture_elbo(var_mixture_params, t):
        """Return a stochastic estimate of the mixture's variational lower bound."""

        # We need to only sample the continuous component parameters,
        # and integrate over the discrete component choice
        def mixture_lower_bound(params):
            """Provides a stochastic estimate of the variational lower bound."""
            samples = component_sample(params, num_samples, rs)
            log_qs = mixture_log_density(var_mixture_params, samples)
            log_ps = logprob(samples, t)
            log_ps = np.reshape(log_ps, (num_samples, -1))
            log_qs = np.reshape(log_qs, (num_samples, -1))
            return np.mean(log_ps - log_qs)

        log_weights, var_params = unpack_mixture_params(var_mixture_params)
        component_elbos = np.stack([mixture_lower_bound(params_k) for params_k in var_params])
        # Expectation over the component choice, taken exactly.
        return np.sum(component_elbos * np.exp(log_weights))

    return init_var_params, mixture_elbo, mixture_log_density, sample
def log_density(x, t):
    """Unnormalized log-density of the two-mode target distribution.

    Each row of ``x`` is ``(mu, log_sigma)``. The density is the sum of two
    terms, each a normal density on ``log_sigma`` times a normal density on
    ``mu`` with standard deviation ``exp(log_sigma)``. ``t`` is unused.
    """
    mu = x[:, 0]
    log_sigma = x[:, 1]
    sigma = np.exp(log_sigma)
    first_mode = norm.logpdf(log_sigma, 0, 1.35) + norm.logpdf(mu, -0.5, sigma)
    second_mode = norm.logpdf(log_sigma, 0.1, 1.35) + norm.logpdf(mu, 0.5, sigma)
    return np.logaddexp(first_mode, second_mode)
def log_density(x, t):
    """Unnormalized log-density of the model for each row of ``x``.

    The first ``obs_dim`` columns of ``x`` are means and the remaining columns
    log standard deviations. The result adds a normal log-density on the log
    standard deviations to the normal log-density of the observations ``Y``,
    each summed over columns. ``t`` is unused.
    """
    means = x[:, :obs_dim]
    log_sigmas = x[:, obs_dim:]
    prior_term = norm.logpdf(log_sigmas, 0, 1.35)
    likelihood_term = norm.logpdf(Y, means, np.exp(log_sigmas))
    return np.sum(prior_term, axis=1) + np.sum(likelihood_term, axis=1)
def optimize_and_lls(optfun):
    """Run ``optfun`` for 200 iterations and return the ELBO at every step.

    Args:
        optfun: Callable ``(num_iters, init_params, callback)`` that runs an
            optimizer and invokes ``callback(params, t, g)`` each iteration.

    Returns:
        Array of the ELBO values recorded by the callback, in iteration order.
    """
    elbo_trace = []

    def record_elbo(params, t, g):
        # objective is the negative ELBO, so flip the sign before storing it.
        current = -objective(params, t)
        elbo_trace.append(current)
        if t % 50 == 0:
            print(f"Iteration {t} lower bound {current}")

    start_mean = -1 * np.ones(D)
    start_log_std = -5 * np.ones(D)
    # Only the recorded trace is needed; the optimizer's result is discarded.
    optfun(200, np.concatenate([start_mean, start_log_std]), record_elbo)
    return np.array(elbo_trace)
def negbin_loglike(r, p, x):
    """Negative binomial log-likelihood of count(s) ``x``.

    Computes ``log Gamma(r + x) - log Gamma(r) - log Gamma(x + 1)
    + x * log(p) + r * log(1 - p)``; this is the quantity to maximize.
    """
    log_coefficient = gammaln(r + x) - gammaln(r) - gammaln(x + 1)
    success_term = x * np.log(p)
    failure_term = r * np.log(1 - p)
    return log_coefficient + success_term + failure_term
def neural_net_predict(params, inputs):
    """Run the classifier network and return normalized class log-probabilities.

    params is a list of (weights, bias) tuples, one per layer.
    inputs is an (N x D) matrix.
    A tanh nonlinearity is applied between layers; the last layer's
    pre-activations are normalized with logsumexp over the class axis.
    """
    activations = inputs
    for weights, biases in params:
        logits = np.dot(activations, weights) + biases
        activations = np.tanh(logits)
    log_normalizer = logsumexp(logits, axis=1, keepdims=True)
    return logits - log_normalizer
def build_toy_dataset(n_data=80, noise_std=0.1):
    """Create a small, reproducible 1-D regression problem.

    Inputs are two evenly spaced clusters (on [0, 3] and on [6, 8], each with
    ``n_data // 2`` points); targets are ``cos(input)`` plus Gaussian noise of
    standard deviation ``noise_std`` drawn from a generator seeded with 0.
    Inputs are then shifted by 4 and halved, targets are halved, and both are
    returned as column vectors of shape (n_data, 1).
    """
    half = n_data // 2
    rng = npr.RandomState(0)
    raw_inputs = np.concatenate([np.linspace(0, 3, num=half), np.linspace(6, 8, num=half)])
    noisy_targets = np.cos(raw_inputs) + rng.randn(n_data) * noise_std
    scaled_inputs = ((raw_inputs - 4.0) / 2.0)[:, np.newaxis]
    scaled_targets = noisy_targets[:, np.newaxis] / 2.0
    return scaled_inputs, scaled_targets
def nn_predict(inputs, t, params):
    """Evaluate a ReLU multi-layer perceptron; used as the learned ODE dynamics.

    ``params`` is a list of (weights, bias) tuples.  Each layer applies an
    affine map; a ReLU feeds the result into the next layer, while the final
    layer's affine output is returned as is.  ``t`` is accepted so the function
    has the ``f(y, t, *args)`` shape it is called with here, but it is not used.
    """
    hidden = inputs
    for weights, bias in params:
        pre_activation = np.dot(hidden, weights) + bias
        hidden = np.maximum(0, pre_activation)
    return pre_activation
def callback(params, iter, g):
    """Print the current training loss and redraw the three diagnostic panels.

    Handed to ``adam`` as its ``callback``.  ``params`` are the current network
    parameters and ``iter`` the iteration number; ``g`` is not used.  The
    function reads the script-level data (``true_y0``, ``t``, ``true_y``) and
    axes (``ax_traj``, ``ax_phase``, ``ax_vecfield``, ``fig``).
    """
    pred = ode_pred(params, true_y0, t)

    print(f"Iteration {iter:d} train loss {L1_loss(pred, true_y):.6f}")

    # Panel 1: both state components against time, true versus predicted.
    ax_traj.cla()
    ax_traj.set_title("Trajectories")
    ax_traj.set_xlabel("t")
    ax_traj.set_ylabel("x,y")
    # One plot call per curve so each carries a label; without labels the
    # legend() call below has no entries to display.
    ax_traj.plot(t, true_y[:, 0], "-", label="true x")
    ax_traj.plot(t, true_y[:, 1], "g-", label="true y")
    ax_traj.plot(t, pred[:, 0], "--", label="predicted x")
    ax_traj.plot(t, pred[:, 1], "b--", label="predicted y")
    ax_traj.set_xlim(t.min(), t.max())
    ax_traj.set_ylim(-2, 2)
    ax_traj.xaxis.set_ticklabels([])
    ax_traj.yaxis.set_ticklabels([])
    ax_traj.legend()

    # Panel 2: the same trajectories in the (x, y) plane.
    ax_phase.cla()
    ax_phase.set_title("Phase Portrait")
    ax_phase.set_xlabel("x")
    ax_phase.set_ylabel("y")
    ax_phase.plot(true_y[:, 0], true_y[:, 1], "g-")
    ax_phase.plot(pred[:, 0], pred[:, 1], "b--")
    ax_phase.set_xlim(-2, 2)
    ax_phase.set_ylim(-2, 2)
    ax_phase.xaxis.set_ticklabels([])
    ax_phase.yaxis.set_ticklabels([])

    # Panel 3: direction field of the learned dynamics on a 21 x 21 grid.
    ax_vecfield.cla()
    ax_vecfield.set_title("Learned Vector Field")
    ax_vecfield.set_xlabel("x")
    ax_vecfield.set_ylabel("y")
    ax_vecfield.xaxis.set_ticklabels([])
    ax_vecfield.yaxis.set_ticklabels([])

    # vector field plot
    y, x = npo.mgrid[-2:2:21j, -2:2:21j]
    dydt = nn_predict(np.stack([x, y], -1).reshape(21 * 21, 2), 0, params).reshape(-1, 2)
    # Normalize to unit length so only the direction is drawn.
    # NOTE(review): a grid point where the network outputs exactly zero would
    # divide by zero here - confirm whether that needs guarding.
    mag = np.sqrt(dydt[:, 0] ** 2 + dydt[:, 1] ** 2).reshape(-1, 1)
    dydt = dydt / mag
    dydt = dydt.reshape(21, 21, 2)

    ax_vecfield.streamplot(x, y, dydt[:, :, 0], dydt[:, :, 1], color="black")
    ax_vecfield.set_xlim(-2, 2)
    ax_vecfield.set_ylim(-2, 2)

    fig.tight_layout()
    plt.draw()
    plt.pause(0.001)
def make_varname_generator():
    """Yield the single-letter names "A" through "Z" in order.

    Asking for a 27th name raises ``Exception("Ran out of alphabet!")``
    instead of letting the generator finish quietly.
    """
    first_code, last_code = ord("A"), ord("Z")
    yield from (chr(code) for code in range(first_code, last_code + 1))
    raise Exception("Ran out of alphabet!")
class RKHSFun:
    """A function represented as a weighted sum of kernel evaluations.

    ``alphas`` maps representer points to coefficients, so calling the object
    at ``x`` returns ``sum(a * kernel(x, x_repr))`` over all entries.  With no
    ``alphas`` the function is identically zero.  Addition and scalar
    multiplication are delegated to the associated ``RKHSFunVSpace``.
    """

    def __init__(self, kernel, alphas=None):
        # Build a fresh dict per instance: a literal ``{}`` default would be one
        # object shared by every RKHSFun constructed without explicit alphas.
        self.alphas = {} if alphas is None else alphas
        self.kernel = kernel
        self.vs = RKHSFunVSpace(self)

    @primitive
    def __call__(self, x):
        """Evaluate the function at ``x``."""
        # The explicit 0.0 start value makes the empty sum a float.
        return sum([a * self.kernel(x, x_repr) for x_repr, a in self.alphas.items()], 0.0)

    def __add__(self, f):
        """Return the sum of this function and ``f``."""
        return self.vs.add(self, f)

    def __mul__(self, a):
        """Return this function scaled by the scalar ``a``."""
        return self.vs.scalar_mul(self, a)
def concat_and_multiply(weights, *args):
    """Apply one affine map to several arrays concatenated side by side.

    The arrays in ``args`` are stacked horizontally, a trailing column of ones
    is appended (so the last row of ``weights`` acts as the bias), and the
    result is multiplied by ``weights``.  The row count is taken from the
    first array.
    """
    n_rows = args[0].shape[0]
    bias_column = np.ones((n_rows, 1))
    stacked = np.hstack((*args, bias_column))
    return np.dot(stacked, weights)
def build_dataset(filename, sequence_length, alphabet_size, max_lines=-1):
    """Load a text file and turn each line into a one-hot encoded sequence.

    Args:
        filename: path of the text file to read.
        sequence_length: every line is padded with spaces or truncated to
            exactly this many characters.
        alphabet_size: size of the one-hot dimension.
        max_lines: keep at most this many leading lines of the file.  A
            negative value (the default) or ``None`` means no limit.

    Returns:
        An array of shape (sequence_length, number of kept lines, alphabet_size).
    """
    with open(filename) as f:
        content = f.readlines()
    # Only slice for a real limit.  Using the -1 default directly as the slice
    # bound would silently discard the last line of the file.
    if max_lines is not None and max_lines >= 0:
        content = content[:max_lines]
    # Drop lines of at most two characters (the newline counts as one).
    content = [line for line in content if len(line) > 2]  # Remove blank lines
    seqs = np.zeros((sequence_length, len(content), alphabet_size))
    for ix, line in enumerate(content):
        padded_line = (line + " " * sequence_length)[:sequence_length]
        seqs[:, ix, :] = string_to_one_hot(padded_line, alphabet_size)
    return seqs
def rosenbrock(x):
    """Evaluate the two-dimensional Rosenbrock function.

    Only ``x[0]`` and ``x[1]`` are read.  The value is zero at (1, 1).
    """
    first, second = x[0], x[1]
    valley_term = 100 * (second - first ** 2) ** 2
    offset_term = (1 - first) ** 2
    return valley_term + offset_term
# Taylor approximation to sin function
def fun(x):
    """Approximate sin(x) with a truncated Taylor series.

    Starts from the first term ``x`` and keeps adding the next odd-power term
    until a term's magnitude drops below 0.2 (or 1000 terms have been added).
    The loop index is printed, followed by a space, on every iteration.
    """
    term = x
    total = term
    for k in range(1000):
        print(k, end=" ")
        denominator = (2 * k + 3) * (2 * k + 2)
        term = -term * x**2 / denominator
        total = total + term
        if np.abs(term) < 0.2:
            break  # (Very generous tolerance!)
    return total
def tanh(x):
    """Hyperbolic tangent spelled out with exponentials.

    Computes ``(1 - exp(-2x)) / (1 + exp(-2x))`` elementwise, so it works on
    scalars and arrays alike.
    """
    numerator = 1.0 - np.exp(-2 * x)
    denominator = 1.0 + np.exp(-(2 * x))
    return numerator / denominator
def bernoulli_log_density(targets, unnormalized_logprobs):
    """Log-probability of +/-1 targets given real-valued logits.

    ``unnormalized_logprobs`` may be any real numbers; ``targets`` must be
    -1 or 1.  Each element contributes ``-log(1 + exp(-logit * target))`` and
    the contributions are summed over the last axis (the pixels).
    """
    negated_margin = -unnormalized_logprobs * targets
    per_element = -np.logaddexp(0, negated_margin)
    return np.sum(per_element, axis=-1)
def binarise(images):
    """Threshold pixel values into the labels -1 and 1.

    Entries strictly greater than 0.5 become 1.0; the rest are set via
    ``images * 0 - 1``.  A new array is returned and the argument is left
    unmodified.
    """
    bright = images > 0.5
    # Multiplying by zero keeps the input's shape and dtype for the -1 background.
    binary = images * 0 - 1
    binary[bright] = 1.0
    return binary
def print_perf(combined_params, iter, grad):
    # Progress callback handed to adam: every 10th iteration print one table
    # row and save a sheet of generated samples.
    #
    # combined_params is the (gen_params, rec_params) pair being optimized and
    # iter the iteration number.  The third argument is unused; inside this
    # function its name hides the imported autograd `grad`.
    #
    # Every evaluation below draws from the shared RandomState `seed`, the same
    # generator the training objective samples from, so the order of the calls
    # here is part of the random stream.
    if iter % 10 == 0:
        gen_params, rec_params = combined_params
        # Objective on the minibatch selected by `iter`; it is the negated
        # lower bound divided by data_dim.
        bound = np.mean(objective(combined_params, iter))
        message = f"{iter // num_batches:15}|{bound:20}|"
        if iter % 100 == 0:
            # The third column is filled in only every 100th iteration because
            # it evaluates the bound on all of test_images.
            test_bound = -vae_lower_bound(gen_params, rec_params, test_images, seed) / data_dim
            message += f"{test_bound:20}"
        print(message)

        # Decode 20 latent draws from the prior and overwrite the sample image.
        fake_data = generate_from_prior(gen_params, 20, latent_dim, seed)
        save_images(fake_data, "vae_samples.png", vmin=0, vmax=1)
@nox.session(name="tests", tags=["tests"])
def run_tests(session):
    """Run unit tests and generate a coverage report"""
    # Test-only dependencies come from the "test" dependency group in pyproject.toml.
    test_deps = nox.project.dependency_groups(nox.project.load_toml("pyproject.toml"), "test")
    session.install(*test_deps)
    # SciPy doesn't have wheels on PyPy, so the scipy extra is requested
    # everywhere except there.
    on_pypy = platform.python_implementation() == "PyPy"
    editable_spec = ("-e.",) if on_pypy else ("-e", ".[scipy]")
    session.install(*editable_spec, silent=False)
    # Extra command-line arguments given to nox are forwarded to pytest.
    pytest_command = ["pytest", "--cov=autograd", "--cov-report=xml", "--cov-append", *session.posargs]
    session.run(*pytest_command)
dependencies""" session.install("-e.", silent=False) pyproject = nox.project.load_toml("pyproject.toml") session.install(*nox.project.dependency_groups(pyproject, "test")) # SciPy doesn't have wheels on PyPy if platform.python_implementation() == "PyPy": session.install( "numpy", "--upgrade", "--only-binary", ":all:", silent=False, env=UV_NIGHTLY_ENV_VARS ) else: session.install( "numpy", "scipy", "--upgrade", "--only-binary", ":all:", silent=False, env=UV_NIGHTLY_ENV_VARS ) session.run("pytest", "--cov=autograd", "--cov-report=xml", "--cov-append", *session.posargs) ================================================ FILE: pyproject.toml ================================================ [build-system] requires = ["hatchling"] build-backend = "hatchling.build" [project] name = "autograd" version = "1.8.0" requires-python = ">=3.10" description = "Efficiently computes derivatives of NumPy code." readme = "README.md" license = {file = "license.txt"} authors = [ {name = "Dougal Maclaurin", email = "maclaurin@physics.harvard.edu"}, {name = "David Duvenaud", email = "duvenaud@cs.toronto.edu"}, {name = "Matthew Johnson", email = "mattjj@csail.mit.edu"}, {name = "Jamie Townsend", email = "j.h.n.townsend@uva.nl"}, ] maintainers = [ {name = "Jamie Townsend", email = "j.h.n.townsend@uva.nl"}, {name = "Fabian Joswig", email = "fabian.joswig@uni-muenster.de"}, {name = "Agriya Khetarpal", email = "agriyakhetarpal@outlook.com"}, ] classifiers = [ "Development Status :: 4 - Beta", "Intended Audience :: Information Technology", "Intended Audience :: Science/Research", "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Programming Language :: Python :: 3.14", "Topic :: Scientific/Engineering", ] keywords = [ "Automatic differentiation", "backpropagation", "gradients", "machine learning", "optimization", "neural networks", 
def timefunction(f):
    """Return the elapsed seconds taken by a single call to ``f()``.

    Uses ``time.perf_counter``, a monotonic high-resolution clock meant for
    measuring short intervals.  ``time.time`` can jump when the system clock is
    adjusted and may be too coarse for fast calls, which would distort the
    timing ratios the linear-time checks rely on.
    """
    start = time.perf_counter()
    f()
    return time.perf_counter() - start
#!/bin/bash
# Smoke-test every script under examples/: run each one for at most 15 seconds
# and print which ones appear to work and which appear to be broken.

# Export the variable so the example subprocesses actually see it; a plain
# assignment only reaches children when PYTHONPATH was already exported.
export PYTHONPATH=".:$PYTHONPATH"

# The repository's code uses Python-3-only syntax (f-strings), so `python2`
# cannot run it. Set PYTHON to pick a specific interpreter.
PYTHON="${PYTHON:-python3}"

# On Ctrl-C, interrupt the example that is currently running and abort.
trap 'kill -INT -$pid && exit 1' INT

working=()
failing=()

examples=$(find examples -name '*.py' -not -name '__init__.py')

echo 'Running all the examples...'
for f in $examples; do
    # Run in the background so the INT trap above can fire while we wait.
    timeout 15s "$PYTHON" "$f" > /dev/null 2>&1 &
    pid=$!
    wait "$pid"
    status=$?
    # 0: finished cleanly; 124: `timeout` stopped it after 15 seconds, i.e.
    # it ran that long without crashing.
    if [ "$status" -eq 0 ] || [ "$status" -eq 124 ]; then
        echo "$f" "seems to work"
        working+=("$f")
    # 137 = 128 + 9: the process was terminated with SIGKILL.
    elif [ "$status" -eq 137 ]; then
        echo "$f" "might be working, but had to be killed"
        working+=("$f")
    else
        echo "$f" "seems broken, try running manually"
        failing+=("$f")
    fi
done

if [ "${#working[@]}" -ne 0 ]; then
    echo -e '\033[01;36m'
    echo "These seemed to WORK:"
    echo -en '\033[00m'
    printf '%s\n' "${working[@]}"
    echo
fi

if [ "${#failing[@]}" -ne 0 ]; then
    echo -e '\033[01;31m'
    echo "These seemed to FAIL:"
    echo -en '\033[00m'
    printf '%s\n' "${failing[@]}"
    echo
fi
@contextmanager
def tictoc(text=""):
    """Context manager that prints how long its ``with`` body took to run.

    Prints a start banner on entry and, on exit, a stop banner containing
    ``text`` and the elapsed wall-clock seconds.

    The stop banner is printed from a ``finally`` clause, so the elapsed time
    is still reported when the timed body raises; the exception then
    propagates to the caller unchanged.
    """
    print("--- Start clock ---")
    t1 = time()
    try:
        yield
    finally:
        dt = time() - t1
        print(f"--- Stop clock {text}: {dt} seconds elapsed ---")
def test_comparison_grads():
    """Gradients of functions built only from comparisons must be all zeros.

    Each function sums the boolean result of one comparison operator applied
    to ``x`` and ``y``; the gradient with respect to either argument is
    compared against a zero array of the broadcast shape.
    """
    compare_funs = [
        # Was ``x < x``, which never exercised ``<`` between two operands.
        lambda x, y: np.sum(x < y) + 0.0,
        lambda x, y: np.sum(x <= y) + 0.0,
        lambda x, y: np.sum(x > y) + 0.0,
        lambda x, y: np.sum(x >= y) + 0.0,
        lambda x, y: np.sum(x == y) + 0.0,
        lambda x, y: np.sum(x != y) + 0.0,
    ]

    # Warnings raised while differentiating are recorded instead of shown
    # (presumably autograd warns that the output is independent of the input).
    with warnings.catch_warnings(record=True):
        for arg1, arg2 in arg_pairs():
            zeros = (arg1 + arg2) * 0  # get correct shape
            for fun in compare_funs:
                assert np.all(grad(fun)(arg1, arg2) == zeros)
                assert np.all(grad(fun, argnum=1)(arg1, arg2) == zeros)


def test_comparison_values():
    """The value returned by ``value_and_grad`` must equal the plain call."""
    compare_funs = [
        # Was ``x < x``, which never exercised ``<`` between two operands.
        lambda x, y: np.sum(x < y) + 0.0,
        lambda x, y: np.sum(x <= y) + 0.0,
        lambda x, y: np.sum(x > y) + 0.0,
        lambda x, y: np.sum(x >= y) + 0.0,
        lambda x, y: np.sum(x == y) + 0.0,
        lambda x, y: np.sum(x != y) + 0.0,
    ]

    for arg1, arg2 in arg_pairs():
        for fun in compare_funs:
            fun_val = fun(arg1, arg2)
            fun_val_from_grad, _ = value_and_grad(fun)(arg1, arg2)
            assert fun_val == fun_val_from_grad, (fun_val, fun_val_from_grad)
test_angle_real(): fun = lambda x: np.angle(x) d_fun = lambda x: grad(fun)(x) check_grads(fun)(npr.rand()) check_grads(d_fun)(npr.rand()) def test_angle_complex(): fun = lambda x: np.angle(x) d_fun = lambda x: grad(fun)(x) check_grads(fun)(npr.rand() + 1j * npr.rand()) check_grads(d_fun)(npr.rand() + 1j * npr.rand()) def test_abs_real(): fun = lambda x: np.abs(x) d_fun = lambda x: grad(fun)(x) check_grads(fun)(1.1) check_grads(d_fun)(2.1) def test_abs_complex(): fun = lambda x: np.abs(x) d_fun = lambda x: grad(fun)(x) check_grads(fun)(1.1 + 1.2j) check_grads(d_fun)(1.1 + 1.3j) ================================================ FILE: tests/test_core.py ================================================ """This file doesn't import the numpy wrapper, to check if core works on basic operations even without numpy.""" import warnings from autograd.core import make_vjp from autograd.wrap_util import unary_to_nary @unary_to_nary def grad(fun, x): vjp, _ = make_vjp(fun, x) return vjp(1.0) # Non-numpy gradient checking functions. 
def nd(f, x, eps=1e-4): return (f(x + eps / 2) - f(x - eps / 2)) / eps def check_close(a, b, atol=1e-4, rtol=1e-4): assert abs(a - b) < atol + rtol * abs(b), f"Diffs are: {a - b}" def check_binary_func(fun, independent=False): with warnings.catch_warnings(record=independent) as w: x, y = 0.7, 1.8 a = grad(fun)(x, y) b = nd(lambda x: fun(x, y), x) check_close(a, b) a = grad(fun, 1)(x, y) b = nd(lambda y: fun(x, y), y) check_close(a, b) def test_add(): check_binary_func(lambda x, y: x + y) def test_sub(): check_binary_func(lambda x, y: x - y) def test_div(): check_binary_func(lambda x, y: x / y) def test_mul(): check_binary_func(lambda x, y: x * y) def test_pow(): check_binary_func(lambda x, y: x**y) def test_mod(): check_binary_func(lambda x, y: x % y) def test_eq(): check_binary_func(lambda x, y: x == y, independent=True) def test_neq(): check_binary_func(lambda x, y: x != y, independent=True) def test_leq(): check_binary_func(lambda x, y: x <= y, independent=True) def test_geq(): check_binary_func(lambda x, y: x >= y, independent=True) def test_lt(): check_binary_func(lambda x, y: x < y, independent=True) def test_gt(): check_binary_func(lambda x, y: x > y, independent=True) ================================================ FILE: tests/test_dict.py ================================================ import operator as op import autograd.numpy as np import autograd.numpy.random as npr from autograd import dict as ag_dict from autograd import grad from autograd import isinstance as ag_isinstance from autograd.test_util import check_grads npr.seed(0) def test_getter(): def fun(input_dict): A = np.sum(input_dict["item_1"]) B = np.sum(input_dict["item_2"]) C = np.sum(input_dict["item_2"]) return A + B + C d_fun = grad(fun) input_dict = {"item_1": npr.randn(5, 6), "item_2": npr.randn(4, 3), "item_X": npr.randn(2, 4)} result = d_fun(input_dict) assert np.allclose(result["item_1"], np.ones((5, 6))) assert np.allclose(result["item_2"], 2 * np.ones((4, 3))) assert 
np.allclose(result["item_X"], np.zeros((2, 4))) def test_grads(): def fun(input_dict): A = np.sum(np.sin(input_dict["item_1"])) B = np.sum(np.cos(input_dict["item_2"])) return A + B def d_fun(input_dict): g = grad(fun)(input_dict) A = np.sum(g["item_1"]) B = np.sum(np.sin(g["item_1"])) C = np.sum(np.sin(g["item_2"])) return A + B + C input_dict = {"item_1": npr.randn(5, 6), "item_2": npr.randn(4, 3), "item_X": npr.randn(2, 4)} check_grads(fun)(input_dict) check_grads(d_fun)(input_dict) def test_iter(): def fun(input_dict): A = 0.0 B = 0.0 for i, k in enumerate(sorted(input_dict)): A = A + np.sum(np.sin(input_dict[k])) * (i + 1.0) B = B + np.sum(np.cos(input_dict[k])) return A + B def d_fun(input_dict): g = grad(fun)(input_dict) A = np.sum(g["item_1"]) B = np.sum(np.sin(g["item_1"])) C = np.sum(np.sin(g["item_2"])) return A + B + C input_dict = {"item_1": npr.randn(5, 6), "item_2": npr.randn(4, 3), "item_X": npr.randn(2, 4)} check_grads(fun)(input_dict) check_grads(d_fun)(input_dict) def test_items_values_keys(): def fun(input_dict): A = 0.0 B = 0.0 for i, (k, v) in enumerate(sorted(input_dict.items(), key=op.itemgetter(0))): A = A + np.sum(np.sin(v)) * (i + 1.0) B = B + np.sum(np.cos(v)) for v in input_dict.values(): A = A + np.sum(np.sin(v)) for k in sorted(input_dict.keys()): A = A + np.sum(np.cos(input_dict[k])) return A + B def d_fun(input_dict): g = grad(fun)(input_dict) A = np.sum(g["item_1"]) B = np.sum(np.sin(g["item_1"])) C = np.sum(np.sin(g["item_2"])) return A + B + C input_dict = {"item_1": npr.randn(5, 6), "item_2": npr.randn(4, 3), "item_X": npr.randn(2, 4)} check_grads(fun)(input_dict) check_grads(d_fun)(input_dict) def test_get(): def fun(d, x): return d.get("item_1", x) ** 2 check_grads(fun, argnum=(0, 1))({"item_1": 3.0}, 2.0) check_grads(fun, argnum=(0, 1))({"item_2": 4.0}, 2.0) check_grads(fun, argnum=(0, 1))({}, 2.0) def test_make_dict(): def fun(x): return ag_dict([("a", x)], b=x) check_grads(fun, modes=["rev"])(1.0) def fun(x): return 
ag_dict({"a": x}) check_grads(fun, modes=["rev"])(1.0) # check some other forms of the constructor ag_dict() ag_dict(()) ag_dict({}) def test_isinstance(): def fun(x): assert ag_isinstance(x, dict) assert ag_isinstance(x, ag_dict) return x["x"] fun({"x": 1.0}) grad(fun)({"x": 1.0}) ================================================ FILE: tests/test_direct.py ================================================ """ Set of tests that are as explicit as possible, in case the test helpers like autograd.test_util break and start letting everything pass """ import numpy as onp import pytest import autograd.numpy as np from autograd import deriv, grad, holomorphic_grad def test_grad(): def fun(x): return (x + np.sin(x**2)) * x assert 3.190948746871 - 1e-6 < grad(fun)(1.3) < 3.190948746871 + 1e-6 def test_deriv(): def fun(x): return (x + np.sin(x**2)) * x assert 3.190948746871 - 1e-6 < deriv(fun)(1.3) < 3.190948746871 + 1e-6 def test_grad_complex_output(): def fun(x): return x * (1.0 + 0.2j) with pytest.raises(TypeError): grad(fun)(1.0) def test_holomorphic_grad(): def fun(x): return x * (1.0 + 0.2j) g = holomorphic_grad(fun)(1.0 + 0.0j) assert 0.9999 < onp.real(g) < 1.0001 assert 0.1999 < onp.imag(g) < 0.2001 ================================================ FILE: tests/test_fft.py ================================================ from functools import partial import pytest import autograd.numpy as np import autograd.numpy.random as npr from autograd.test_util import check_grads npr.seed(1) ### fwd mode not yet implemented check_grads = partial(check_grads, modes=["rev"]) def test_fft(): def fun(x): return np.fft.fft(x) D = 5 mat = npr.randn(D, D) check_grads(fun)(mat) def test_fft_ortho(): def fun(x): return np.fft.fft(x, norm="ortho") D = 5 mat = npr.randn(D, D) check_grads(fun)(mat) def test_fft_axis(): def fun(x): return np.fft.fft(x, axis=0) D = 5 mat = npr.randn(D, D) check_grads(fun)(mat) def match_complex(fft_fun, mat): # ensure hermitian by doing a fft if 
fft_fun.__name__.startswith("ir"): return getattr(np.fft, fft_fun.__name__[1:])(mat) else: return mat def check_fft_n(fft_fun, D, n): def fun(x): return fft_fun(x, D + n) mat = npr.randn(D, D) mat = match_complex(fft_fun, mat) check_grads(fun)(mat) def test_fft_n_smaller(): check_fft_n(np.fft.fft, 5, -2) def test_fft_n_bigger(): check_fft_n(np.fft.fft, 5, 2) def test_ifft_n_smaller(): check_fft_n(np.fft.ifft, 5, -2) def test_ifft_n_bigger(): check_fft_n(np.fft.ifft, 5, 2) def test_rfft_n_smaller(): check_fft_n(np.fft.rfft, 4, -2) def test_rfft_n_bigger(): check_fft_n(np.fft.rfft, 4, 2) def test_irfft_n_smaller(): check_fft_n(np.fft.irfft, 4, -2) def test_irfft_n_bigger(): check_fft_n(np.fft.irfft, 4, 2) def check_fft_s(fft_fun, D): def fun(x): return fft_fun(x, s=s, axes=axes) mat = npr.randn(D, D, D) / 10.0 mat = match_complex(fft_fun, mat) s = [D + 2, D - 2] axes = [0, 2] check_grads(fun)(mat) def test_fft2_s(): check_fft_s(np.fft.fft2, 5) def test_ifft2_s(): check_fft_s(np.fft.ifft2, 5) def test_fftn_s(): check_fft_s(np.fft.fftn, 5) def test_ifftn_s(): check_fft_s(np.fft.ifftn, 5) def test_rfft2_s(): check_fft_s(np.fft.rfft2, 4) def test_irfft2_s(): check_fft_s(np.fft.irfft2, 4) def test_rfftn_s(): check_fft_s(np.fft.rfftn, 4) def test_irfftn_s(): check_fft_s(np.fft.irfftn, 4) ## TODO: fft gradient not implemented for repeated axes # def test_fft_repeated_axis(): # D = 5 # for fft_fun in (np.fft.fft2,np.fft.ifft2,np.fft.fftn, np.fft.ifftn): # def fun(x): return fft_fun(x, s=s, axes=axes) # mat = npr.randn(D,D,D) / 10.0 # s = [D + 2, D - 2] # axes = [0,0] # check_grads(rad)(fun) def test_ifft(): def fun(x): return np.fft.ifft(x) D = 5 mat = npr.randn(D, D) check_grads(fun)(mat) def test_fft2(): def fun(x): return np.fft.fft2(x) D = 5 mat = npr.randn(D, D) / 10.0 check_grads(fun)(mat) def test_ifft2(): def fun(x): return np.fft.ifft2(x) D = 5 mat = npr.randn(D, D) check_grads(fun)(mat) def test_fftn(): def fun(x): return np.fft.fftn(x) D = 5 mat = npr.randn(D, D) 
/ 10.0 check_grads(fun)(mat) def test_ifftn(): def fun(x): return np.fft.ifftn(x) D = 5 mat = npr.randn(D, D) check_grads(fun)(mat) def test_rfft(): def fun(x): return np.fft.rfft(x) D = 4 mat = npr.randn(D, D) / 10.0 check_grads(fun)(mat) def test_rfft_ortho(): def fun(x): return np.fft.rfft(x, norm="ortho") D = 4 mat = npr.randn(D, D) / 10.0 check_grads(fun)(mat) def test_rfft_axes(): def fun(x): return np.fft.rfft(x, axis=0) D = 4 mat = npr.randn(D, D) / 10.0 check_grads(fun)(mat) def test_irfft(): def fun(x): return np.fft.irfft(x) D = 4 mat = npr.randn(D, D) / 10.0 # ensure hermitian by doing a fft mat = np.fft.rfft(mat) check_grads(fun)(mat) def test_irfft_ortho(): def fun(x): return np.fft.irfft(x, norm="ortho") D = 4 mat = npr.randn(D, D) / 10.0 # ensure hermitian by doing a fft mat = np.fft.rfft(mat) check_grads(fun)(mat) def test_rfft2(): def fun(x): return np.fft.rfft2(x) D = 4 mat = npr.randn(D, D) / 10.0 check_grads(fun)(mat) def test_irfft2(): def fun(x): return np.fft.irfft2(x) D = 4 mat = npr.randn(D, D) / 10.0 # ensure hermitian by doing a fft mat = np.fft.rfft2(mat) check_grads(fun)(mat) def test_rfftn(): def fun(x): return np.fft.rfftn(x) D = 4 mat = npr.randn(D, D, D) / 10.0 check_grads(fun)(mat) def test_rfftn_odd_not_implemented(): def fun(x): return np.fft.rfftn(x) D = 5 mat = npr.randn(D, D, D) / 10.0 with pytest.raises(NotImplementedError): check_grads(fun)(mat) def test_rfftn_subset(): def fun(x): return np.fft.rfftn(x)[(0, 1, 0), (3, 3, 2)] D = 4 mat = npr.randn(D, D, D) / 10.0 check_grads(fun)(mat) def test_rfftn_axes(): def fun(x): return np.fft.rfftn(x, axes=(0, 2)) D = 4 mat = npr.randn(D, D, D) / 10.0 check_grads(fun)(mat) def test_irfftn(): def fun(x): return np.fft.irfftn(x) D = 4 mat = npr.randn(D, D, D) / 10.0 # ensure hermitian by doing a fft mat = np.fft.rfftn(mat) check_grads(fun)(mat) def test_irfftn_subset(): def fun(x): return np.fft.irfftn(x)[(0, 1, 0), (3, 3, 2)] D = 4 mat = npr.randn(D, D, D) / 10.0 # ensure hermitian by 
doing a fft mat = np.fft.rfftn(mat) check_grads(fun)(mat) def test_fftshift(): def fun(x): return np.fft.fftshift(x) D = 5 mat = npr.randn(D, D) / 10.0 check_grads(fun)(mat) def test_fftshift_even(): def fun(x): return np.fft.fftshift(x) D = 4 mat = npr.randn(D, D) / 10.0 check_grads(fun)(mat) def test_fftshift_axes(): def fun(x): return np.fft.fftshift(x, axes=1) D = 5 mat = npr.randn(D, D) / 10.0 check_grads(fun)(mat) def test_ifftshift(): def fun(x): return np.fft.ifftshift(x) D = 5 mat = npr.randn(D, D) check_grads(fun)(mat) def test_ifftshift_even(): def fun(x): return np.fft.ifftshift(x) D = 4 mat = npr.randn(D, D) check_grads(fun)(mat) def test_ifftshift_axes(): def fun(x): return np.fft.ifftshift(x, axes=1) D = 5 mat = npr.randn(D, D) check_grads(fun)(mat) ================================================ FILE: tests/test_graphs.py ================================================ import warnings import pytest import autograd.numpy as np import autograd.numpy.random as npr from autograd import grad from autograd.test_util import check_grads npr.seed(1) def test_grad_fanout(): fun = lambda x: np.sin(np.sin(x) + np.sin(x)) df = grad(fun) check_grads(fun)(npr.randn()) check_grads(df)(npr.rand()) def test_grad_const(): fun = lambda x: 1.0 with warnings.catch_warnings(record=True) as w: warnings.simplefilter("ignore") df = grad(fun) assert np.allclose(df(2.0), 0.0) def test_grad_identity(): fun = lambda x: x df = grad(fun) ddf = grad(df) assert np.allclose(df(2.0), 1.0) assert np.allclose(ddf(2.0), 0.0) def test_hess_vector_prod(): npr.seed(1) randv = npr.randn(10) def fun(x): return np.sin(np.dot(x, randv)) df = grad(fun) def vector_product(x, v): return np.sin(np.dot(v, df(x))) ddf = grad(vector_product) A = npr.randn(10) B = npr.randn(10) check_grads(fun)(A) check_grads(vector_product)(A, B) def test_enclosing_scope_ref(): def fun(x): inner_fun = lambda y: x * y return x * grad(inner_fun)(2.0) check_grads(fun)(1.0) def test_enclosing_scope_ref_2(): def fun(x): 
def test_third_derivative():
    """Check gradients of a function and of its first three derivatives."""
    fun = lambda x: np.sin(np.sin(x) + np.sin(x))
    # Each derivative is built from the previous one. The original code called
    # ``grad(fun)`` three times, so only first derivatives were ever checked.
    df = grad(fun)
    ddf = grad(df)
    dddf = grad(ddf)
    check_grads(fun)(npr.randn())
    check_grads(df)(npr.rand())
    check_grads(ddf)(npr.rand())
    check_grads(dddf)(npr.rand())


def test_third_derivative_other_args():
    """Like ``test_third_derivative``, differentiating by argument 0, 1, 0."""
    fun = lambda x, y: np.sin(np.sin(x) + np.sin(y))
    # Chain the derivatives (original re-differentiated ``fun`` every time).
    df = grad(fun)
    ddf = grad(df, 1)
    dddf = grad(ddf)
    check_grads(fun)(npr.randn(), npr.randn())
    check_grads(df)(npr.randn(), npr.randn())
    check_grads(ddf)(npr.randn(), npr.randn())
    check_grads(dddf)(npr.randn(), npr.randn())


def test_third_derivative_other_args2():
    """Like ``test_third_derivative``, differentiating by argument 1, 0, 1."""
    fun = lambda x, y: np.sin(np.sin(x) + np.sin(y))
    # Chain the derivatives (original re-differentiated ``fun`` every time).
    df = grad(fun, 1)
    ddf = grad(df)
    dddf = grad(ddf, 1)
    check_grads(fun)(npr.randn(), npr.randn())
    check_grads(df)(npr.randn(), npr.randn())
    check_grads(ddf)(npr.randn(), npr.randn())
    check_grads(dddf)(npr.randn(), npr.randn())
keepdims=False) check_grads(fun)(npr.randn(3, 1)) check_grads(lambda x: np.sum(grad(fun)(x)))(npr.randn(3, 1)) def test_singleton_array_output_axis1(): fun = lambda x: np.sum(np.sin(x), axis=1, keepdims=False) check_grads(fun)(npr.randn(1, 3)) check_grads(lambda x: np.sum(grad(fun)(x)))(npr.randn(1, 3)) def test_singleton_array_output_axis0_keepdims(): fun = lambda x: np.sum(np.sin(x), axis=0, keepdims=True) check_grads(fun)(npr.randn(3, 1)) check_grads(lambda x: np.sum(grad(fun)(x)))(npr.randn(3, 1)) def test_singleton_array_output_axis1_keepdims(): fun = lambda x: np.sum(np.sin(x), axis=1, keepdims=True) check_grads(fun)(npr.randn(1, 3)) check_grads(lambda x: np.sum(grad(fun)(x)))(npr.randn(1, 3)) def test_assignment_raises_error(): def fun(A, b): A[1] = b return A A = npr.randn(5) with pytest.raises(TypeError): check_grads(fun)(A, 3.0) # def test_nonscalar_output_1(): # with pytest.raises(TypeError): # grad(lambda x: x * 2)(np.zeros(2)) # def test_nonscalar_output_2(): # with pytest.raises(TypeError): # grad(lambda x: x * 2)(np.zeros(2)) # TODO: # Diamond patterns # Taking grad again after returning const # Empty functions # 2nd derivatives with fanout, thinking about the outgrad adder ================================================ FILE: tests/test_jacobian.py ================================================ import autograd.numpy as np import autograd.numpy.random as npr from autograd import grad, jacobian from autograd.test_util import check_grads npr.seed(1) def test_jacobian_against_grad(): fun = lambda x: np.sum(np.sin(x), axis=1, keepdims=True) A = npr.randn(1, 3) assert np.allclose(grad(fun)(A), jacobian(fun)(A)) def test_jacobian_scalar_to_vector(): fun = lambda x: np.array([x, x**2, x**3]) val = npr.randn() assert np.allclose(jacobian(fun)(val), np.array([1.0, 2 * val, 3 * val**2])) def test_jacobian_against_stacked_grads(): scalar_funs = [ lambda x: np.sum(x**3), lambda x: np.prod(np.sin(x) + np.sin(x)), lambda x: grad(lambda y: np.exp(y) * 
np.tanh(x[0]))(x[1]), ] vector_fun = lambda x: np.array([f(x) for f in scalar_funs]) x = npr.randn(5) jac = jacobian(vector_fun)(x) grads = [grad(f)(x) for f in scalar_funs] assert np.allclose(jac, np.vstack(grads)) def test_jacobian_higher_order(): fun = lambda x: np.sin(np.outer(x, x)) + np.cos(np.dot(x, x)) assert jacobian(fun)(npr.randn(2)).shape == (2, 2, 2) assert jacobian(jacobian(fun))(npr.randn(2)).shape == (2, 2, 2, 2) # assert jacobian(jacobian(jacobian(fun)))(npr.randn(2)).shape == (2,2,2,2,2) check_grads(lambda x: np.sum(np.sin(jacobian(fun)(x))))(npr.randn(2)) check_grads(lambda x: np.sum(np.sin(jacobian(jacobian(fun))(x))))(npr.randn(2)) ================================================ FILE: tests/test_linalg.py ================================================ from functools import partial import numpy as onp import pytest import autograd.numpy as np import autograd.numpy.random as npr from autograd import tuple from autograd.test_util import check_grads npr.seed(1) # Fwd mode not yet implemented check_grads = partial(check_grads, modes=["rev"]) def check_symmetric_matrix_grads(fun, **grad_check_kwargs): def check(*args): def symmetrize(A): L = np.tril(A) return (L + T(L)) / 2.0 new_fun = lambda *args: fun(symmetrize(args[0]), *args[1:]) check_grads(new_fun, **grad_check_kwargs)(*args) return check T = lambda A: np.swapaxes(A, -1, -2) def rand_psd(D): mat = npr.randn(D, D) return np.dot(mat, mat.T) def test_inv(): def fun(x): return np.linalg.inv(x) D = 8 mat = npr.randn(D, D) mat = np.dot(mat, mat) + 1.0 * np.eye(D) check_grads(fun)(mat) def test_pinv(): def fun(x): return np.linalg.pinv(x) N = 5 D = 2 ## Non-square matrices: for M in range(N // 2, N + N // 2 + 1): mat = npr.randn(N, M) check_grads(fun)(mat) # Stacked mat = npr.randn(D, N, M) check_grads(fun)(mat) ## Square, low (fixed) rank matrices def fun_low_rank(x): return np.linalg.pinv(np.linalg._dot(np.linalg.T(x), x)) for M in range(N // 2, N + N // 2 + 1): mat = npr.randn(N, M) 
check_grads(fun_low_rank)(mat) # Stacked mat = npr.randn(D, N, M) check_grads(fun_low_rank)(mat) def test_inv_3d(): fun = lambda x: np.linalg.inv(x) D = 4 mat = npr.randn(D, D, D) + 5 * np.eye(D) check_grads(fun)(mat) mat = npr.randn(D, D, D, D) + 5 * np.eye(D) check_grads(fun)(mat) def test_solve_arg1(): D = 8 A = npr.randn(D, D) + 10.0 * np.eye(D) B = npr.randn(D, D - 1) def fun(a): return np.linalg.solve(a, B) check_grads(fun)(A) def test_solve_arg1_1d(): D = 8 A = npr.randn(D, D) + 10.0 * np.eye(D) B = npr.randn(D) def fun(a): return np.linalg.solve(a, B) check_grads(fun)(A) def test_solve_arg2(): D = 6 A = npr.randn(D, D) + 1.0 * np.eye(D) B = npr.randn(D, D - 1) def fun(b): return np.linalg.solve(A, b) check_grads(fun)(B) def test_solve_arg1_3d(): D = 4 A = npr.randn(D + 1, D, D) + 5 * np.eye(D) B = npr.randn(D + 1, D) if onp.lib.NumpyVersion(onp.__version__) < "2.0.0": fun = lambda A: np.linalg.solve(A, B) else: fun = lambda A: np.linalg.solve(A, B[..., None])[..., 0] check_grads(fun)(A) def test_solve_arg1_3d_3d(): D = 4 A = npr.randn(D + 1, D, D) + 5 * np.eye(D) B = npr.randn(D + 1, D, D + 2) fun = lambda A: np.linalg.solve(A, B) check_grads(fun)(A) def test_det(): def fun(x): return np.linalg.det(x) D = 6 mat = npr.randn(D, D) check_grads(fun)(mat) def test_det_3d(): fun = lambda x: np.linalg.det(x) D = 3 mat = npr.randn(D, D, D) check_grads(fun)(mat) def test_slogdet(): def fun(x): sign, logdet = np.linalg.slogdet(x) return logdet D = 6 mat = npr.randn(D, D) check_grads(fun)(mat) check_grads(fun)(-mat) def test_slogdet_3d(): fun = lambda x: np.sum(np.linalg.slogdet(x)[1]) mat = np.concatenate([(rand_psd(5) + 5 * np.eye(5))[None, ...] 
for _ in range(3)]) check_grads(fun)(mat) def test_vector_2norm(): def fun(x): return np.linalg.norm(x) D = 6 vec = npr.randn(D) check_grads(fun, modes=["fwd", "rev"])(vec) def test_vector_2norm_complex(): def fun(x): return np.linalg.norm(x) D = 6 vec = npr.randn(D) + 1j * npr.randn(D) check_grads(fun)(vec) def test_frobenius_norm(): def fun(x): return np.linalg.norm(x) D = 6 mat = npr.randn(D, D - 1) check_grads(fun, modes=["fwd", "rev"])(mat) def test_frobenius_norm_complex(): def fun(x): return np.linalg.norm(x) D = 6 mat = npr.randn(D, D - 1) + 1j * npr.randn(D, D - 1) check_grads(fun)(mat) def test_frobenius_norm_axis(): def fun(x): return np.linalg.norm(x, axis=(0, 1)) D = 6 mat = npr.randn(D, D - 1, D - 2) check_grads(fun, modes=["fwd", "rev"])(mat) def test_frobenius_norm_axis_complex(): def fun(x): return np.linalg.norm(x, axis=(0, 1)) D = 6 mat = npr.randn(D, D - 1, D - 2) + 1j * npr.randn(D, D - 1, D - 2) check_grads(fun)(mat) @pytest.mark.parametrize("ord", range(2, 5)) @pytest.mark.parametrize("size", [6]) def test_vector_norm_ord(size, ord): def fun(x): return np.linalg.norm(x, ord=ord) vec = npr.randn(size) check_grads(fun, modes=["fwd", "rev"])(vec) @pytest.mark.parametrize("ord", range(2, 5)) @pytest.mark.parametrize("size", [6]) def test_vector_norm_ord_complex(size, ord): def fun(x): return np.linalg.norm(x, ord=ord) vec = npr.randn(size) + 1j * npr.randn(size) check_grads(fun)(vec) @pytest.mark.parametrize("axis", range(3)) @pytest.mark.parametrize("shape", [(6, 5, 4)]) def test_norm_axis(shape, axis): def fun(x): return np.linalg.norm(x, axis=axis) arr = npr.randn(*shape) check_grads(fun, modes=["fwd", "rev"])(arr) @pytest.mark.parametrize("axis", range(3)) @pytest.mark.parametrize("shape", [(6, 5, 4)]) def test_norm_axis_complex(shape, axis): def fun(x): return np.linalg.norm(x, axis=axis) arr = npr.randn(*shape) + 1j * npr.randn(*shape) check_grads(fun)(arr) def test_norm_nuclear(): def fun(x): return np.linalg.norm(x, ord="nuc") D = 6 mat = 
npr.randn(D, D - 1) # Order 1 because the jvp of the svd is not implemented check_grads(fun, modes=["fwd", "rev"], order=1)(mat) def test_norm_nuclear_complex(): def fun(x): return np.linalg.norm(x, ord="nuc") D = 6 mat = npr.randn(D, D - 1) + 1j * npr.randn(D, D - 1) check_grads(fun)(mat) def test_norm_nuclear_axis(): def fun(x): return np.linalg.norm(x, ord="nuc", axis=(0, 1)) D = 6 mat = npr.randn(D, D - 1, D - 2) # Order 1 because the jvp of the svd is not implemented check_grads(fun, modes=["fwd", "rev"], order=1)(mat) def test_norm_nuclear_axis_complex(): def fun(x): return np.linalg.norm(x, ord="nuc", axis=(0, 1)) D = 6 mat = npr.randn(D, D - 1, D - 2) + 1j * npr.randn(D, D - 1, D - 2) check_grads(fun)(mat) def test_eigvalh_lower(): def fun(x): w, v = np.linalg.eigh(x) return tuple((w, v)) D = 6 mat = npr.randn(D, D) check_grads(fun)(mat) def test_eigvalh_upper(): def fun(x): w, v = np.linalg.eigh(x, "U") return tuple((w, v)) D = 6 mat = npr.randn(D, D) check_grads(fun)(mat) broadcast_dot_transpose = partial(np.einsum, "...ij,...kj->...ik") def test_eigvalh_lower_broadcasting(): def fun(x): w, v = np.linalg.eigh(x) return tuple((w, v)) D = 6 mat = npr.randn(2, 3, D, D) + 10 * np.eye(D)[None, None, ...] hmat = broadcast_dot_transpose(mat, mat) check_grads(fun)(hmat) def test_eigvalh_upper_broadcasting(): def fun(x): w, v = np.linalg.eigh(x, "U") return tuple((w, v)) D = 6 mat = npr.randn(2, 3, D, D) + 10 * np.eye(D)[None, None, ...] hmat = broadcast_dot_transpose(mat, mat) check_grads(fun)(hmat) # For complex-valued matrices, the eigenvectors could have arbitrary phases (gauge) # which makes it impossible to compare to numerical derivatives. So we take the # absolute value to get rid of that phase. 
def test_eigvalh_lower_complex():
    """Check ``np.linalg.eigh`` gradients on a complex 6x6 matrix (no UPLO given)."""
    dim = 6
    # Real part is drawn first, then the imaginary part.
    real_part = npr.randn(dim, dim)
    imag_part = npr.randn(dim, dim)

    def eigh_phase_free(x):
        eigvals, eigvecs = np.linalg.eigh(x)
        # Each eigenvector is only defined up to an arbitrary phase, so the
        # magnitudes are returned to keep the numerical comparison meaningful.
        return tuple((eigvals, np.abs(eigvecs)))

    check_grads(eigh_phase_free)(real_part + 1j * imag_part)
def test_svd_square_2d():
    """Check gradients of ``np.linalg.svd(full_matrices=False)`` on a real 4x4 matrix."""
    rows = 4
    cols = 4
    square = npr.randn(rows, cols)

    def reduced_svd(x):
        factors = np.linalg.svd(x, full_matrices=False)
        left, singular_values, right = factors
        # All three factors are returned so each one is gradient-checked.
        return tuple((left, singular_values, right))

    check_grads(reduced_svd)(square)
def test_getter():
    """Gradient w.r.t. a list input reflects how often each element is read."""

    def total(items):
        # Element 0 is read once, element 1 twice, element 2 never.
        first = np.sum(items[0])
        second = np.sum(items[1])
        second_again = np.sum(items[1])
        return first + second + second_again

    arrays = [npr.randn(5, 6), npr.randn(4, 3), npr.randn(2, 4)]
    gradient = grad(total)(arrays)

    # One read -> ones, two reads -> twos, no read -> zeros.
    expected = [np.ones((5, 6)), 2 * np.ones((4, 3)), np.zeros((2, 4))]
    for position, wanted in enumerate(expected):
        assert np.allclose(gradient[position], wanted)
def test_unimplemented_falseyness():
    """Branching on ``np.iscomplex`` works with its ``primitive_vjps`` entry removed.

    The entry for ``np.iscomplex`` (if any) is popped from ``primitive_vjps``
    while the gradient checks run and is put back afterwards.
    """

    @contextmanager
    def remove_grad_definitions(fun):
        # Temporarily drop whatever is registered for ``fun``; None means
        # nothing was registered, so there is nothing to restore.
        vjpmaker = primitive_vjps.pop(fun, None)
        try:
            yield
        finally:
            # Restore even when the body raises, so a failing check cannot
            # leave the shared registry mutated for later tests.
            if vjpmaker is not None:
                primitive_vjps[fun] = vjpmaker

    with remove_grad_definitions(np.iscomplex):
        fun = lambda x: np.real(x**2 if np.iscomplex(x) else np.sum(x))
        check_grads(fun)(5.0)
        check_grads(fun)(2.0 + 1j)
def unflatten_tracing():
    """Build a VJP through ``unflatten`` and compare its result to the flat vector.

    NOTE(review): the name has no ``test_`` prefix, so pytest's default
    collection rules will presumably skip this function -- confirm whether it
    is meant to run before renaming it.
    """
    # Nested list/tuple structure mixing arrays, scalars and an empty tuple.
    val = [npr.randn(4), [npr.randn(3, 4), 2.5], (), (2.0, [1.0, npr.randn(2)])]
    vect, unflatten = flatten(val)

    def f(vect):
        return unflatten(vect)

    # make_vjp(f)(vect) yields a pair; only its first element is used, and it
    # is called with the original nested structure.
    flatten2, _ = make_vjp(f)(vect)
    # The result is expected to equal the flat vector element-wise.
    assert np.all(vect == flatten2(val))
def test_dot():
    """Check ``np.dot`` gradients for matrix/vector operand combinations."""
    # Operands are drawn in a fixed order: two 10x11 matrices, one length-10
    # vector, two length-11 vectors.
    wide_a = npr.randn(10, 11)
    wide_b = npr.randn(10, 11)
    short_vec = npr.randn(10)
    long_vec_a = npr.randn(11)
    long_vec_b = npr.randn(11)

    def dot(x, y):
        return np.dot(x, y)

    operand_pairs = (
        (wide_a, long_vec_a),  # matrix . vector
        (wide_a, wide_b.T),  # matrix . matrix
        (short_vec, wide_a),  # vector . matrix
        (long_vec_a, long_vec_b),  # vector . vector
    )
    for left, right in operand_pairs:
        check_grads(dot)(left, right)
def test_cumsum_no_axis():
    """Check ``np.cumsum`` gradients on a 10x11 input when no axis is passed."""
    matrix = npr.randn(10, 11)
    cumsum_default_axis = lambda x: np.cumsum(x)
    check_grads(cumsum_default_axis)(matrix)
def test_index_slice_fanout():
    """Check gradients when two slices of the same array are added together."""
    source = npr.randn(5, 6, 4)

    def sum_of_slices(x):
        # Along axis 1 the ranges 2:4 and 3:5 overlap at index 3, so that
        # position feeds both terms of the sum.
        return x[::-1, 2:4, :] + x[::-1, 3:5, :]

    check_grads(sum_of_slices)(source)
def test_trace():
    """Check ``np.trace`` gradients on a 10x11 matrix at a randomly drawn offset."""
    matrix = npr.randn(10, 11)
    # The offset is drawn after the matrix, via npr.randint(-9, 11).
    diagonal_offset = npr.randint(-9, 11)

    def offset_trace(x):
        return np.trace(x, offset=diagonal_offset)

    check_grads(offset_trace)(matrix)
def test_roll_no_axis():
    """Check ``np.roll`` gradients when no ``axis`` argument is given."""

    def fun(x):
        # ``axis`` is deliberately omitted; the explicit-axis call is already
        # exercised by test_roll.
        return np.roll(x, 2)

    mat = npr.randn(4, 5)
    check_grads(fun)(mat)
def test_no_relation():
    """Check gradients of a function whose output ignores its input."""
    # record=True captures warnings raised inside the block instead of
    # displaying them; the captured list is never inspected, so it is not bound.
    with warnings.catch_warnings(record=True):
        c = npr.randn(3, 2)

        def fun(x):
            # Returns the constant captured from the enclosing scope.
            return c

        A = npr.randn(3, 2)
        check_grads(fun)(A)
def test_where():
    """Check ``np.where`` gradients for both branches under a fixed boolean mask."""
    # Draw order: mask first, then the two 4x5 operands.
    mask = npr.randn(4, 5) > 0
    when_true = npr.randn(4, 5)
    when_false = npr.randn(4, 5)

    def select(x, y):
        return np.where(mask, x, y)

    check_grads(select)(when_true, when_false)
def test_cast_to_int():
    """Indices stored in a float array can be cast to int64 and used to index.

    A column of ones is appended to ``W``, split off again, cast with
    ``np.int64`` and used to index the remaining columns.
    """
    inds = np.ones(5)[:, None]

    def fun(W):
        # glue W and inds together
        glued_together = np.concatenate((W, inds), axis=1)

        # separate W and inds back out -- read from the glued array so the
        # concatenate/split round trip is what actually gets differentiated
        new_W = glued_together[:, :-1]
        new_inds = np.int64(glued_together[:, -1])

        assert new_inds.dtype == np.int64
        return new_W[new_inds].sum()

    W = np.random.randn(5, 10)
    check_grads(fun)(W)
def test_linspace():
    """Check ``np.linspace`` gradients for lengths 0, 1 and 5.

    Each length is tried with increasing, decreasing and equal endpoints.
    """
    endpoint_pairs = ((1.2, 3.4), (1.2, -3.4), (1.2, 1.2))
    for num in [0, 1, 5]:

        def spaced(x, y):
            # ``num`` is read from the loop; every call happens within the
            # same iteration that defined this function.
            return np.linspace(x, y, num)

        for start, stop in endpoint_pairs:
            check_grads(spaced)(start, stop)
def test_abs():
    """Check gradients of ``3.0 * np.abs(x)`` on both sides of zero and at zero."""

    def scaled_abs(x):
        return 3.0 * np.abs(x)

    for point in (1.1, -1.1):
        check_grads(scaled_abs)(point)
    # At zero only first-order derivatives are checked.
    check_grads(scaled_abs, order=1)(0.0)
# The *_arg0 / *_arg1 pairs below were previously byte-identical, so nothing
# selected the second argument.  Each test now names the argument it checks.
# NOTE(review): without ``argnum``, check_grads presumably differentiates only
# the first argument -- confirm against autograd.test_util.


def test_mod_arg0():
    """Check ``np.mod`` gradients with respect to argument 0."""
    fun = lambda x, y: np.mod(x, y)
    check_grads(fun, argnum=0)(npr.rand(), npr.rand())


def test_mod_arg1():
    """Check ``np.mod`` gradients with respect to argument 1."""
    fun = lambda x, y: np.mod(x, y)
    check_grads(fun, argnum=1)(npr.rand(), npr.rand())


def test_divide_arg0():
    """Check ``np.divide`` gradients with respect to argument 0."""
    fun = lambda x, y: np.divide(x, y)
    check_grads(fun, argnum=0)(npr.rand(), npr.rand())


def test_divide_arg1():
    """Check ``np.divide`` gradients with respect to argument 1."""
    fun = lambda x, y: np.divide(x, y)
    check_grads(fun, argnum=1)(npr.rand(), npr.rand())


def test_multiply_arg0():
    """Check ``np.multiply`` gradients with respect to argument 0."""
    fun = lambda x, y: np.multiply(x, y)
    check_grads(fun, argnum=0)(npr.rand(), npr.rand())


def test_multiply_arg1():
    """Check ``np.multiply`` gradients with respect to argument 1."""
    fun = lambda x, y: np.multiply(x, y)
    check_grads(fun, argnum=1)(npr.rand(), npr.rand())


def test_true_divide_arg0():
    """Check ``np.true_divide`` gradients with respect to argument 0."""
    fun = lambda x, y: np.true_divide(x, y)
    check_grads(fun, argnum=0)(npr.rand(), npr.rand())


def test_true_divide_arg1():
    """Check ``np.true_divide`` gradients with respect to argument 1."""
    fun = lambda x, y: np.true_divide(x, y)
    check_grads(fun, argnum=1)(npr.rand(), npr.rand())
def symmetrize_matrix_arg(fun, argnum):
    """Wrap ``fun`` so that positional argument ``argnum`` is symmetrized first.

    The selected argument ``X`` is replaced by ``0.5 * (X + X^T)``, where the
    transpose swaps the last two axes.  An argument with fewer than two
    dimensions is averaged with itself instead.  All other positional and
    keyword arguments are forwarded unchanged.
    """

    def symmetrized_fun(*args, **kwargs):
        target = args[argnum]
        if np.ndim(target) > 1:
            mirrored = np.swapaxes(target, -1, -2)
        else:
            mirrored = target
        forwarded = list(args)
        forwarded[argnum] = 0.5 * (target + mirrored)
        return fun(*forwarded, **kwargs)

    return symmetrized_fun
def test_norm_sf_broadcast():
    """Check ``stats.norm.sf`` gradients with broadcasting x, loc and scale."""
    # Previously called stats.norm.cdf, so sf was never tested with broadcasting.
    combo_check(stats.norm.sf, [0, 1, 2])([R(4, 3)], [R(1, 3)], [R(4, 1) ** 2 + 1.1])


def test_norm_logpdf_broadcast():
    """Check ``stats.norm.logpdf`` gradients with broadcasting x, loc and scale."""
    combo_check(stats.norm.logpdf, [0, 1, 2])([R(4, 3)], [R(1, 3)], [R(4, 1) ** 2 + 1.1])


def test_norm_logcdf_broadcast():
    """Check ``stats.norm.logcdf`` gradients with broadcasting x, loc and scale."""
    combo_check(stats.norm.logcdf, [0, 1, 2])([R(4, 3)], [R(1, 3)], [R(4, 1) ** 2 + 1.1])


def test_norm_logsf_broadcast():
    """Check ``stats.norm.logsf`` gradients with broadcasting x, loc and scale."""
    # Previously called stats.norm.logcdf, duplicating the test above.
    combo_check(stats.norm.logsf, [0, 1, 2])([R(4, 3)], [R(1, 3)], [R(4, 1) ** 2 + 1.1])
[1])([np.round(R(4) ** 2)], [R(4) ** 2 + 1.1]) def test_poisson_pmf(): combo_check(stats.poisson.pmf, [1])([np.round(R(4) ** 2)], [R(4) ** 2 + 1.1]) def test_poisson_cdf_broadcast(): combo_check(stats.poisson.cdf, [1])([np.round(R(4, 3) ** 2)], [R(4, 1) ** 2 + 1.1]) def test_poisson_logpmf_broadcast(): combo_check(stats.poisson.logpmf, [1])([np.round(R(4, 3) ** 2)], [R(4, 1) ** 2 + 1.1]) def test_poisson_pmf_broadcast(): combo_check(stats.poisson.pmf, [1])([np.round(R(4, 3) ** 2)], [R(4, 1) ** 2 + 1.1]) def test_t_pdf(): combo_check(stats.t.pdf, [0, 1, 2, 3])([R(4)], [R(4) ** 2 + 2.1], [R(4)], [R(4) ** 2 + 2.1]) def test_t_cdf(): combo_check(stats.t.cdf, [0, 2])([R(4)], [R(4) ** 2 + 2.1], [R(4)], [R(4) ** 2 + 2.1]) def test_t_logpdf(): combo_check(stats.t.logpdf, [0, 1, 2, 3])([R(4)], [R(4) ** 2 + 2.1], [R(4)], [R(4) ** 2 + 2.1]) def test_t_logcdf(): combo_check(stats.t.logcdf, [0, 2])([R(4)], [R(4) ** 2 + 2.1], [R(4)], [R(4) ** 2 + 2.1]) def test_t_pdf_broadcast(): combo_check(stats.t.pdf, [0, 1, 2, 3])( [R(4, 3)], [R(1, 3) ** 2 + 2.1], [R(4, 3)], [R(4, 1) ** 2 + 2.1] ) def test_t_cdf_broadcast(): combo_check(stats.t.cdf, [0, 2])([R(4, 3)], [R(1, 3) ** 2 + 2.1], [R(4, 3)], [R(4, 1) ** 2 + 2.1]) def test_t_logpdf_broadcast(): combo_check(stats.t.logpdf, [0, 1, 2, 3])( [R(4, 3)], [R(1, 3) ** 2 + 2.1], [R(4, 3)], [R(4, 1) ** 2 + 2.1] ) def test_t_logcdf_broadcast(): combo_check(stats.t.logcdf, [0, 2])([R(4, 3)], [R(1, 3) ** 2 + 2.1], [R(4, 3)], [R(4, 1) ** 2 + 2.1]) def make_psd(mat): return np.dot(mat.T, mat) + np.eye(mat.shape[0]) def test_mvn_pdf(): combo_check(symmetrize_matrix_arg(mvn.pdf, 2), [0, 1, 2])( [R(4)], [R(4)], [make_psd(R(4, 4))], allow_singular=[False] ) def test_mvn_logpdf(): combo_check(symmetrize_matrix_arg(mvn.logpdf, 2), [0, 1, 2])( [R(4)], [R(4)], [make_psd(R(4, 4))], allow_singular=[False] ) def test_mvn_entropy(): combo_check(symmetrize_matrix_arg(mvn.entropy, 1), [0, 1])([10 * R(4)], [make_psd(R(4, 4))]) def test_mvn_sing_cov(): cov = 
np.zeros((4, 4)) cov[0, 0] = cov[1, 1] = 1 # Only allow variations in x along the first two dimensions, because # variance is zero in the last two. def pdf(x, mean, cov): x = np.concatenate([x[:2], mean[2:]]) return symmetrize_matrix_arg(partial(mvn.pdf, allow_singular=True), 2)(x, mean, cov) combo_check(pdf, [0, 1])( [np.concatenate((R(2), np.zeros(2)))], [np.concatenate((R(2), np.zeros(2)))], [cov] ) def logpdf(x, mean, cov): x = np.concatenate([x[:2], mean[2:]]) return symmetrize_matrix_arg(partial(mvn.logpdf, allow_singular=True), 2)(x, mean, cov) combo_check(logpdf, [0, 1])( [np.concatenate((R(2), np.zeros(2)))], [np.concatenate((R(2), np.zeros(2)))], [cov] ) def test_mvn_pdf_broadcast(): combo_check(symmetrize_matrix_arg(mvn.pdf, 2), [0, 1, 2])([R(5, 4)], [R(4)], [make_psd(R(4, 4))]) def test_mvn_logpdf_broadcast(): combo_check(symmetrize_matrix_arg(mvn.logpdf, 2), [0, 1, 2])([R(5, 4)], [R(4)], [make_psd(R(4, 4))]) alpha = npr.random(4) ** 2 + 1.2 x = stats.dirichlet.rvs(alpha, size=1)[0, :] # Need to normalize input so that x's sum to one even when we perturb them to compute numeric gradient. 
def normalize(x): return x / sum(x) def normalized_dirichlet_pdf(x, alpha): return stats.dirichlet.pdf(normalize(x), alpha) def normalized_dirichlet_logpdf(x, alpha): return stats.dirichlet.logpdf(normalize(x), alpha) def test_dirichlet_pdf_x(): combo_check(normalized_dirichlet_pdf, [0])([x], [alpha]) def test_dirichlet_pdf_alpha(): combo_check(stats.dirichlet.pdf, [1])([x], [alpha]) def test_dirichlet_logpdf_x(): combo_check(normalized_dirichlet_logpdf, [0])([x], [alpha]) def test_dirichlet_logpdf_alpha(): combo_check(stats.dirichlet.logpdf, [1])([x], [alpha]) ### Misc ### def test_logsumexp1(): combo_check(special.logsumexp, [0], modes=["fwd", "rev"])( [np.array([1.1]), R(4), R(3, 4)], axis=[None, 0], keepdims=[True, False] ) def test_logsumexp2(): combo_check(special.logsumexp, [0], modes=["fwd", "rev"])( [R(3, 4), R(4, 5, 6), R(1, 5)], axis=[None, 0, 1], keepdims=[True, False] ) def test_logsumexp3(): combo_check(special.logsumexp, [0], modes=["fwd", "rev"])( [R(4)], b=[np.exp(R(4))], axis=[None, 0], keepdims=[True, False] ) def test_logsumexp4(): combo_check(special.logsumexp, [0], modes=["fwd", "rev"])( [ R(3, 4), ], b=[np.exp(R(3, 4))], axis=[None, 0, 1], keepdims=[True, False], ) def test_logsumexp5(): combo_check(special.logsumexp, [0], modes=["fwd", "rev"])( [R(2, 3, 4)], b=[np.exp(R(2, 3, 4))], axis=[None, 0, 1], keepdims=[True, False] ) def test_logsumexp6(): x = npr.randn(1, 5) def f(a): return special.logsumexp(a, axis=1, keepdims=True) check_grads(f, modes=["fwd", "rev"])(x) check_grads(lambda a: grad(f)(a), modes=["fwd", "rev"])(x) ### Signal ### def test_convolve_generalization(): ag_convolve = autograd.scipy.signal.convolve A_35 = R(3, 5) A_34 = R(3, 4) A_342 = R(3, 4, 2) A_2543 = R(2, 5, 4, 3) A_24232 = R(2, 4, 2, 3, 2) for mode in ["valid", "full"]: assert npo.allclose( ag_convolve(A_35, A_34, axes=([1], [0]), mode=mode)[1, 2], sp_convolve(A_35[1, :], A_34[:, 2], mode), ) assert npo.allclose( ag_convolve(A_35, A_34, axes=([], []), dot_axes=([0], 
[0]), mode=mode), npo.tensordot(A_35, A_34, axes=([0], [0])), ) assert npo.allclose( ag_convolve(A_35, A_342, axes=([1], [2]), dot_axes=([0], [0]), mode=mode)[2], sum([sp_convolve(A_35[i, :], A_342[i, 2, :], mode) for i in range(3)]), ) assert npo.allclose( ag_convolve(A_2543, A_24232, axes=([1, 2], [2, 4]), dot_axes=([0, 3], [0, 3]), mode=mode)[2], sum( [ sum( [sp_convolve(A_2543[i, :, :, j], A_24232[i, 2, :, j, :], mode) for i in range(2)] ) for j in range(3) ] ), ) def test_convolve(): combo_check(autograd.scipy.signal.convolve, [0, 1])( [R(4), R(5), R(6)], [R(2), R(3), R(4)], mode=["full", "valid"] ) def test_convolve_2d(): combo_check(autograd.scipy.signal.convolve, [0, 1])( [R(4, 3), R(5, 4), R(6, 7)], [R(2, 2), R(3, 2), R(4, 2), R(4, 1)], mode=["full", "valid"] ) def test_convolve_ignore(): combo_check(autograd.scipy.signal.convolve, [0, 1])( [R(4, 3)], [R(3, 2)], axes=[([0], [0]), ([1], [1]), ([0], [1]), ([1], [0]), ([0, 1], [0, 1]), ([1, 0], [1, 0])], mode=["full", "valid"], ) def test_convolve_ignore_dot(): combo_check(autograd.scipy.signal.convolve, [0, 1])( [R(3, 3, 2)], [R(3, 2, 3)], axes=[([1], [1])], dot_axes=[([0], [2]), ([0], [0])], mode=["full", "valid"], ) ### Special ### def test_beta(): combo_check(special.beta, [0, 1])([R(4) ** 2 + 1.1], [R(4) ** 2 + 1.1]) def test_betainc(): combo_check(special.betainc, [2])([R(4) ** 2 + 1.1], [R(4) ** 2 + 1.1], [U(0.0, 1.0, 4)]) def test_betaln(): combo_check(special.betaln, [0, 1])([R(4) ** 2 + 1.1], [R(4) ** 2 + 1.1]) def test_gammainc(): combo_check(special.gammainc, [1])([1], R(4) ** 2 + 1.3) def test_gammaincc(): combo_check(special.gammaincc, [1])([1], R(4) ** 2 + 1.3) def test_polygamma(): combo_check(special.polygamma, [1])([0], R(4) ** 2 + 1.3) def test_jn(): combo_check(special.jn, [1])([2], R(4) ** 2 + 1.3) def test_yn(): combo_check(special.yn, [1])([2], R(4) ** 2 + 1.3) def test_psi(): unary_ufunc_check(special.psi, lims=[0.3, 2.0], test_complex=False) def test_digamma(): 
unary_ufunc_check(special.digamma, lims=[0.3, 2.0], test_complex=False) def test_gamma(): unary_ufunc_check(special.gamma, lims=[0.3, 2.0], test_complex=False) def test_gammaln(): unary_ufunc_check(special.gammaln, lims=[0.3, 2.0], test_complex=False) def test_gammasgn(): unary_ufunc_check(special.gammasgn, lims=[0.3, 2.0], test_complex=False) def test_rgamma(): unary_ufunc_check(special.rgamma, lims=[0.3, 2.0], test_complex=False) def test_multigammaln(): combo_check(special.multigammaln, [0])([U(4.0, 5.0), U(4.0, 5.0, (2, 3))], [1, 2, 3]) def test_j0(): unary_ufunc_check(special.j0, lims=[0.2, 20.0], test_complex=False) def test_j1(): unary_ufunc_check(special.j1, lims=[0.2, 20.0], test_complex=False) def test_y0(): unary_ufunc_check(special.y0, lims=[0.2, 20.0], test_complex=False) def test_y1(): unary_ufunc_check(special.y1, lims=[0.2, 20.0], test_complex=False) def test_i0(): unary_ufunc_check(special.i0, lims=[0.2, 20.0], test_complex=False) def test_i1(): unary_ufunc_check(special.i1, lims=[0.2, 20.0], test_complex=False) def test_iv(): combo_check(special.iv, [1])(U(1.0, 50.0, 4), R(4) ** 2 + 1.3) def test_ive(): combo_check(special.ive, [1])(U(1.0, 50.0, 4), R(4) ** 2 + 1.3) def test_erf(): unary_ufunc_check(special.erf, lims=[-3.0, 3.0], test_complex=True) def test_erfc(): unary_ufunc_check(special.erfc, lims=[-3.0, 3.0], test_complex=True) def test_erfinv(): unary_ufunc_check(special.erfinv, lims=[-0.95, 0.95], test_complex=False) def test_erfcinv(): unary_ufunc_check(special.erfcinv, lims=[0.05, 1.95], test_complex=False) def test_logit(): unary_ufunc_check(special.logit, lims=[0.10, 0.90], test_complex=False) def test_expit(): unary_ufunc_check(special.expit, lims=[-4.05, 4.95], test_complex=False) ### ODE integrator ### def func(y, t, arg1, arg2): return -np.sqrt(t) - y + arg1 - np.mean((y + arg2) ** 2) def test_odeint(): combo_check(integrate.odeint, [1, 2, 3])([func], [R(3)], [np.linspace(0.1, 0.2, 4)], [(R(3), R(3))]) ## Linalg def test_sqrtm(): 
combo_check(spla.sqrtm, modes=["fwd"], order=2)([R(3, 3)]) def test_sqrtm(): combo_check(symmetrize_matrix_arg(spla.sqrtm, 0), modes=["fwd", "rev"], order=2)([R(3, 3)]) def test_solve_sylvester(): combo_check(spla.solve_sylvester, [0, 1, 2], modes=["rev", "fwd"], order=2)( [R(3, 3)], [R(3, 3)], [R(3, 3)] ) def test_solve_banded(): combo_check(spla.solve_banded, [1, 2], modes=["rev"], order=1)([(1, 1)], [R(3, 5)], [R(5)]) ================================================ FILE: tests/test_systematic.py ================================================ import operator as op import numpy as onp from numpy_utils import binary_ufunc_check, binary_ufunc_check_no_same_args, stat_check, unary_ufunc_check import autograd.numpy as np import autograd.numpy.random as npr from autograd.test_util import combo_check npr.seed(0) # Array statistics functions def test_max(): stat_check(np.max) # def test_all(): stat_check(np.all) # def test_any(): stat_check(np.any) def test_max(): stat_check(np.max) def test_mean(): stat_check(np.mean) def test_min(): stat_check(np.min) def test_sum(): stat_check(np.sum) def test_prod(): stat_check(np.prod) def test_var(): stat_check(np.var) def test_std(): stat_check(np.std) # Unary ufunc tests def test_sin(): unary_ufunc_check(np.sin) def test_abs(): unary_ufunc_check(np.abs, lims=[0.1, 4.0]) def test_absolute(): unary_ufunc_check(np.absolute, lims=[0.1, 4.0]) def test_arccosh(): unary_ufunc_check(np.arccosh, lims=[1.1, 4.0]) def test_arcsinh(): unary_ufunc_check(np.arcsinh, lims=[-0.9, 0.9]) def test_arctanh(): unary_ufunc_check(np.arctanh, lims=[-0.9, 0.9]) def test_ceil(): unary_ufunc_check(np.ceil, lims=[-1.5, 1.5], test_complex=False) def test_cos(): unary_ufunc_check(np.cos) def test_cosh(): unary_ufunc_check(np.cosh) def test_deg2rad(): unary_ufunc_check(np.deg2rad, test_complex=False) def test_degrees(): unary_ufunc_check(lambda x: np.degrees(x) / 50.0, test_complex=False) def test_exp(): unary_ufunc_check(np.exp) def test_exp2(): 
unary_ufunc_check(np.exp2) def test_expm1(): unary_ufunc_check(np.expm1) def test_fabs(): unary_ufunc_check(np.fabs, test_complex=False) def test_floor(): unary_ufunc_check(np.floor, lims=[-1.5, 1.5], test_complex=False) def test_log(): unary_ufunc_check(np.log, lims=[0.2, 2.0]) def test_log10(): unary_ufunc_check(np.log10, lims=[0.2, 2.0]) def test_log1p(): unary_ufunc_check(np.log1p, lims=[0.2, 2.0]) def test_log2(): unary_ufunc_check(np.log2, lims=[0.2, 2.0]) def test_rad2deg(): unary_ufunc_check(lambda x: np.rad2deg(x) / 50.0, test_complex=False) def test_radians(): unary_ufunc_check(np.radians, test_complex=False) def test_sign(): unary_ufunc_check(np.sign, test_complex=False) def test_sin(): unary_ufunc_check(np.sin) def test_sinh(): unary_ufunc_check(np.sinh) def test_sqrt(): unary_ufunc_check(np.sqrt, lims=[1.0, 3.0]) def test_square(): unary_ufunc_check(np.square, test_complex=False) def test_tan(): unary_ufunc_check(np.tan, lims=[-1.1, 1.1]) def test_tanh(): unary_ufunc_check(np.tanh) def test_real(): unary_ufunc_check(np.real) def test_real_ic(): unary_ufunc_check(np.real_if_close) def test_imag(): unary_ufunc_check(np.imag) def test_conj(): unary_ufunc_check(np.conj) def test_conjugate(): unary_ufunc_check(np.conjugate) def test_angle(): unary_ufunc_check(np.angle) # Binary ufunc tests def test_add(): binary_ufunc_check(np.add) def test_logaddexp(): binary_ufunc_check(np.logaddexp, test_complex=False) def test_logaddexp2(): binary_ufunc_check(np.logaddexp2, test_complex=False) def test_remainder(): binary_ufunc_check_no_same_args(np.remainder, lims_A=[-0.9, 0.9], lims_B=[0.7, 1.9], test_complex=False) def test_true_divide(): binary_ufunc_check(np.true_divide, lims_B=[0.8, 1.2], test_complex=False) def test_mod(): binary_ufunc_check_no_same_args(np.mod, lims_B=[0.8, 2.1], test_complex=False) def test_true_divide_neg(): binary_ufunc_check(np.true_divide, lims_B=[-0.3, -2.0], test_complex=False) def test_mod_neg(): binary_ufunc_check_no_same_args(np.mod, 
lims_B=[-0.3, -2.0], test_complex=False) def test_op_mul(): binary_ufunc_check(op.mul) def test_op_add(): binary_ufunc_check(op.add) def test_op_sub(): binary_ufunc_check(op.sub) def test_op_mod(): binary_ufunc_check_no_same_args(op.mod, lims_B=[0.3, 2.0], test_complex=False) def test_op_mod_neg(): binary_ufunc_check_no_same_args(op.mod, lims_B=[-0.3, -2.0], test_complex=False) # Misc tests R = npr.randn C = lambda *shape: npr.randn(*shape) + 1j * npr.randn(*shape) def test_transpose(): combo_check(np.transpose, [0])( [R(2, 3, 4)], axes=[None, [0, 1, 2], [0, 2, 1], [2, 0, 1], [2, 1, 0], [1, 0, 2], [1, 2, 0]] ) def test_moveaxis(): combo_check(np.moveaxis, [0])([R(2, 3, 4)], source=[0, 1, 2], destination=[0, 1, 2]) def test_repeat(): combo_check(np.repeat, [0])([R(2, 3, 4), R(3, 1)], repeats=[0, 1, 2], axis=[None, 0, 1]) def test_diff(): combo_check(np.diff, [0])([R(5, 5), R(5, 5, 5)], n=[1, 2], axis=[0, 1]) combo_check(np.diff, [0])([R(1), R(1, 1)], axis=[0]) combo_check(np.diff, [0])([R(1, 1), R(3, 1)], axis=[1]) def test_gradient(): combo_check(np.gradient, [0])([R(5, 5), R(5, 5, 5)], axis=[None, 0, 1, -1]) combo_check(np.gradient, [0])([R(5, 5, 5)], axis=[(0, 1), (0, -1)]) def test_tile(): combo_check(np.tile, [0])([R(2, 1, 3, 1)], reps=[(1, 4, 1, 2)]) combo_check(np.tile, [0])([R(1, 2)], reps=[(1, 2), (2, 3), (3, 2, 1)]) combo_check(np.tile, [0])([R(1)], reps=[(2,), 2]) def test_kron(): combo_check(np.kron, [0, 1])( [R(5, 5), R(4, 4), R(5), R(5, 1), R(1, 5), R(), C(5, 5)], [R(3, 3), R(2, 2), R(3), R(1, 3), R(3, 1), R(), C(3, 3)], ) def test_inner(): combo_check(np.inner, [0, 1])([1.5, R(3), R(2, 3)], [0.3, R(3), R(4, 3)]) def test_dot(): combo_check(np.dot, [0, 1], order=3)( [1.5, R(3), R(2, 3), R(2, 2, 3), C(3), C(2, 3)], [0.3, R(3), R(3, 4), R(2, 3, 4), C(3)] ) def test_outer(): combo_check(np.outer, [0, 1], order=3)([R(3), C(3)], [R(3), C(3)]) def test_matmul(): combo_check(np.matmul, [0, 1])( [R(3), R(2, 3), R(2, 2, 3), C(3), C(2, 3)], [R(3), R(3, 4), R(2, 
3, 4), C(3), C(3, 4)] ) def test_matmul_broadcast(): combo_check(np.matmul, [0, 1])([R(1, 2, 2)], [R(3, 2, 1)]) def test_tensordot_1(): combo_check(np.tensordot, [0, 1], order=3)( [R(1, 3), R(2, 3, 2), C(1, 3)], [R(3), R(3, 1), R(3, 4, 2), C(3)], axes=[[(1,), (0,)]] ) def test_tensordot_2(): combo_check(np.tensordot, [0, 1], order=3)( [R(3), R(3, 1), R(3, 4, 2)], [R(1, 3), R(2, 3, 2)], axes=[[(0,), (1,)]] ) def test_tensordot_3(): combo_check(np.tensordot, [0, 1], order=3)( [R(2, 3), R(2, 3, 4)], [R(1, 2, 3), R(2, 2, 3, 4)], axes=[[(0, 1), (1, 2)], [(1, 0), (2, 1)]] ) def test_tensordot_4(): combo_check(np.tensordot, [0, 1], order=3)([R(2, 2), R(4, 2, 2)], [R(2, 2), R(2, 2, 4)], axes=[1, 2]) def test_tensordot_5(): combo_check(np.tensordot, [0, 1], order=3)([R(4)], [R()], axes=[0]) def test_tensordot_6(): combo_check(np.tensordot, [0, 1], order=3)([R(2, 6)], [R(6, 3)], axes=[[[-1], [0]]]) def test_tensordot_7(): combo_check(np.tensordot, [0, 1], order=3)([R(2, 6)], [R(6, 3)], axes=[[-1, 0]]) def test_tensordot_8(): combo_check(np.tensordot, [0, 1], order=3)([R(2)], [R(2, 2)], axes=[[0, 1]]) # Need custom tests because gradient is undefined when arguments are identical. 
def test_maximum(): combo_check(np.maximum, [0, 1])([R(1), R(1, 4), R(3, 4)], [R(1), R(1, 4), R(3, 4)]) def test_fmax(): combo_check(np.fmax, [0, 1])([R(1), R(1, 4), R(3, 4)], [R(1), R(1, 4), R(3, 4)]) def test_minimum(): combo_check(np.minimum, [0, 1])([R(1), R(1, 4), R(3, 4)], [R(1), R(1, 4), R(3, 4)]) def test_fmin(): combo_check(np.fmin, [0, 1])([R(1), R(1, 4), R(3, 4)], [R(1), R(1, 4), R(3, 4)]) def test_sort(): combo_check(np.sort, [0])([R(1), R(7)]) if onp.lib.NumpyVersion(onp.__version__) < "2.0.0": def test_msort(): combo_check(np.msort, [0])([R(1), R(7)]) def test_partition(): combo_check(np.partition, [0])([R(7), R(14)], kth=[0, 3, 6]) def test_atleast_1d(): combo_check(np.atleast_1d, [0])([1.2, R(1), R(7), R(1, 4), R(2, 4), R(2, 4, 5)]) def test_atleast_2d(): combo_check(np.atleast_2d, [0])([1.2, R(1), R(7), R(1, 4), R(2, 4), R(2, 4, 5)]) def test_atleast_3d(): combo_check(np.atleast_3d, [0])([1.2, R(1), R(7), R(1, 4), R(2, 4), R(2, 4, 5), R(2, 4, 3, 5)]) def test_einsum_transpose(): combo_check(np.einsum, [1])(["ij->ji"], [R(1, 1), R(4, 4), R(3, 4)]) def test_einsum_matmult(): combo_check(np.einsum, [1, 2])(["ij,jk->ik"], [R(2, 3), C(2, 3)], [R(3, 4), C(3, 4)]) def test_einsum_matmult_broadcast(): combo_check(np.einsum, [1, 2])(["...ij,...jk->...ik"], [R(2, 3), R(2, 2, 3)], [R(3, 4), R(2, 3, 4)]) def test_einsum_matmult_broadcast_leadzero(): combo_check(np.einsum, [1, 2])(["...ij,...jk->...ik"], [R(0, 2, 3)], [R(0, 3, 4)]) def test_einsum_covsum(): combo_check(np.einsum, [1, 2])(["ijk,lji->lki"], [R(3, 4, 4)], [R(4, 4, 3)]) def test_einsum_ellipses(): combo_check(np.einsum, [1, 2])( ["...jk,...lj->...lk", "...,...->..."], [R(4, 4), R(3, 4, 4)], [R(4, 4), R(3, 4, 4)] ) def test_einsum_ellipses_tail(): combo_check(np.einsum, [1, 2])(["jk...,lj...->lk..."], [R(3, 2), R(3, 2, 4)], [R(2, 3), R(2, 3, 4)]) def test_einsum_ellipses_center(): combo_check(np.einsum, [1, 2])(["j...k,lj...->lk..."], [R(2, 2), R(2, 2, 2)], [R(2, 2), R(2, 2, 2)]) def 
test_einsum_three_args(): combo_check(np.einsum, [1, 2])(["ijk,lji,lli->lki"], [R(3, 4, 4)], [R(4, 4, 3)], [R(4, 4, 3)]) def test_einsum2_transpose(): combo_check(np.einsum, [0])([R(1, 1), R(4, 4), R(3, 4)], [(0, 1)], [(1, 0)]) def test_einsum2_matmult(): combo_check(np.einsum, [0, 2])([R(2, 3)], [(0, 1)], [R(3, 4)], [(1, 2)], [(0, 2)]) def test_einsum2_matmult_broadcast(): combo_check(np.einsum, [0, 2])( [R(2, 3), R(2, 2, 3)], [(Ellipsis, 0, 1)], [R(3, 4), R(2, 3, 4)], [(Ellipsis, 1, 2)], [(Ellipsis, 0, 2)], ) def test_einsum2_covsum(): combo_check(np.einsum, [0, 2])([R(3, 4, 4)], [(0, 1, 2)], [R(4, 4, 3)], [(3, 1, 0)], [(3, 2, 0)]) def test_einsum2_three_args(): combo_check(np.einsum, [0, 2])( [R(3, 4, 4)], [(0, 1, 2)], [R(4, 4, 3)], [(3, 1, 0)], [R(4, 4, 3)], [(3, 3, 0)], [(3, 2, 0)] ) def test_einsum_naked_sum(): combo_check(np.einsum, [1, 2])(["k,nk->"], [R(5)], [R(10, 5)]) def test_einsum_naked_sum2(): combo_check(np.einsum, [1])(["abcd->bd"], [R(3, 2, 3, 2)]) def test_einsum_naked_sum_ellipsis(): combo_check(np.einsum, [1, 2])(["...k,...nk->..."], [R(3, 5)], [R(3, 10, 5)]) def test_einsum_no_output_indices(): combo_check(np.einsum, [1, 2])(["ij,k"], [R(3, 4)], [R(3)]) def test_trace(): combo_check(np.trace, [0])([R(5, 5), R(4, 5), R(5, 4), R(3, 4, 5)], offset=[-1, 0, 1]) def test_diag(): combo_check(np.diag, [0])([R(5, 5)], k=[-1, 0, 1]) def test_diag_flat(): combo_check(np.diag, [0])([R(5)], k=[-1, 0, 1]) def test_tril(): combo_check(np.tril, [0])([R(5, 5)], k=[-1, 0, 1]) def test_triu(): combo_check(np.triu, [0])([R(5, 5)], k=[-1, 0, 1]) def test_tril_3d(): combo_check(np.tril, [0])([R(5, 5, 4)], k=[-1, 0, 1]) def test_triu_3d(): combo_check(np.triu, [0])([R(5, 5, 4)], k=[-1, 0, 1]) def test_swapaxes(): combo_check(np.swapaxes, [0])([R(3, 4, 5)], axis1=[0, 1, 2], axis2=[0, 1, 2]) def test_rollaxis(): combo_check(np.rollaxis, [0])([R(2, 3, 4)], axis=[0, 1, 2], start=[0, 1, 2]) def test_cross(): combo_check(np.cross, [0, 1])( [R(3, 3)], [R(3, 3)], axisa=[-1, 
0, 1], axisb=[-1, 0, 1], axisc=[-1, 0, 1], axis=[None, -1, 0, 1] ) def test_vsplit_2d(): combo_check(np.vsplit, [0])([R(4, 8)], [4, [1, 2]]) def test_vsplit_3d(): combo_check(np.vsplit, [0])([R(4, 4, 4)], [2, [1, 2]]) def test_hsplit_2d(): combo_check(np.hsplit, [0])([R(4, 8)], [4, [1, 2]]) def test_hsplit_3d(): combo_check(np.hsplit, [0])([R(4, 4, 4)], [2, [1, 2]]) def test_dsplit_3d(): combo_check(np.dsplit, [0])([R(4, 4, 4)], [2, [1, 2]]) def test_split_1d(): combo_check(np.split, [0])([R(1), R(7)], [1], axis=[0]) def test_split_2d(): combo_check(np.split, [0])([R(4, 8)], [4, [1, 2]], axis=[0, 1]) def test_split_3d(): combo_check(np.split, [0])([R(4, 4, 4)], [2, [1, 2]], axis=[0, 1, 2]) def test_array_split_1d(): combo_check(np.array_split, [0])([R(1), R(7)], [1, 3], axis=[0]) def test_array_split_2d(): combo_check(np.array_split, [0])([R(7, 7)], [4, [3, 5]], axis=[0, 1]) def test_array_split_3d(): combo_check(np.array_split, [0])([R(7, 7, 7)], [4, [3, 5]], axis=[0, 1, 2]) def test_concatenate_1ist(): combo_check(np.concatenate, [0])([(R(1), R(3))], axis=[0]) def test_concatenate_tuple(): combo_check(np.concatenate, [0])([[R(1), R(3)]], axis=[0]) def test_concatenate_2d(): combo_check(np.concatenate, [0])([(R(2, 2), R(2, 2))], axis=[0, 1]) def test_concatenate_3d(): combo_check(np.concatenate, [0])([(R(2, 2, 2), R(2, 2, 2))], axis=[0, 1, 2]) def test_vstack_1d(): combo_check(np.vstack, [0])([R(2), (R(2), R(2))]) def test_vstack_2d(): combo_check(np.vstack, [0])([R(2, 3), (R(2, 4), R(1, 4))]) def test_vstack_3d(): combo_check(np.vstack, [0])([R(2, 3, 4), (R(2, 3, 4), R(5, 3, 4))]) def test_hstack_1d(): combo_check(np.hstack, [0])([R(2), (R(2), R(2))]) def test_hstack_2d(): combo_check(np.hstack, [0])([R(3, 2), (R(3, 4), R(3, 5))]) def test_hstack_3d(): combo_check(np.hstack, [0])([R(2, 3, 4), (R(2, 1, 4), R(2, 5, 4))]) def test_stack_1d(): combo_check(np.stack, [0])([(R(2),), (R(2), R(2))], axis=[0, 1]) def test_row_stack_1d(): combo_check(np.row_stack, 
[0])([R(2), (R(2), R(2))]) def test_row_stack_2d(): combo_check(np.row_stack, [0])([R(2, 3), (R(2, 4), R(1, 4))]) def test_column_stack_1d(): combo_check(np.column_stack, [0])([R(2), (R(2), R(2))]) def test_column_stack_2d(): combo_check(np.column_stack, [0])([R(2, 2), (R(2, 2), R(2, 2))]) def test_select(): combo_check(np.select, [1])( [[R(3, 4, 5) > 0, R(3, 4, 5) > 0, R(3, 4, 5) > 0]], [[R(3, 4, 5), R(3, 4, 5), R(3, 4, 5)]], default=[0.0, 1.1], ) def test_pad(): combo_check(np.pad, [0])( [R(2, 2)], [0, 3, (3,), (3, 2), ((3, 2),), ((1, 2), (3, 4)), ((0, 0), (0, 0))], ["constant"] ) ================================================ FILE: tests/test_tests.py ================================================ from pytest import raises from autograd.extend import defvjp from autograd.test_util import check_grads from autograd.tracer import primitive def test_check_vjp_1st_order_fail(): @primitive def foo(x): return x * 2.0 defvjp(foo, lambda ans, x: lambda g: g * 2.001) with raises(AssertionError, match="\\(VJP\\) check of foo failed"): check_grads(foo, modes=["rev"])(1.0) def test_check_vjp_2nd_order_fail(): @primitive def foo(x): return x * 2.0 defvjp(foo, lambda ans, x: lambda g: bar(g) * 2) @primitive def bar(x): return x defvjp(bar, lambda ans, x: lambda g: g * 1.001) with raises(AssertionError, match="\\(VJP\\) check of vjp_foo failed"): check_grads(foo, modes=["rev"])(1.0) ================================================ FILE: tests/test_truediv.py ================================================ # This file is to check that future division works. 
from test_binary_ops import arg_pairs

import autograd.numpy as np
from autograd.test_util import check_grads


def test_div():
    """Gradient-check ``x / y`` on every argument pair from ``arg_pairs()``."""
    fun = lambda x, y: x / y
    # Push each operand away from zero before dividing; for real x,
    # sqrt(x**2 + 0.5) is never smaller than sqrt(0.5).
    make_gap_from_zero = lambda x: np.sqrt(x**2 + 0.5)
    for arg1, arg2 in arg_pairs():
        arg1 = make_gap_from_zero(arg1)
        arg2 = make_gap_from_zero(arg2)
        check_grads(fun)(arg1, arg2)


================================================
FILE: tests/test_tuple.py
================================================
import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad
from autograd import isinstance as ag_isinstance
from autograd import tuple as ag_tuple
from autograd.test_util import check_grads

npr.seed(1)


def test_getter():
    """Check the gradient with respect to a tuple whose items are indexed.

    Item 0 is summed once, item 1 twice and item 2 never, so the expected
    gradients are ones, ``2 * ones`` and zeros respectively.
    """

    def fun(input_tuple):
        A = np.sum(input_tuple[0])
        B = np.sum(input_tuple[1])
        C = np.sum(input_tuple[1])  # second read of item 1 (see docstring)
        return A + B + C

    d_fun = grad(fun)
    input_tuple = (npr.randn(5, 6), npr.randn(4, 3), npr.randn(2, 4))

    result = d_fun(input_tuple)
    assert np.allclose(result[0], np.ones((5, 6)))
    assert np.allclose(result[1], 2 * np.ones((4, 3)))
    assert np.allclose(result[2], np.zeros((2, 4)))


def test_grads():
    """Run ``check_grads`` on a tuple-input function and on a function of its gradient."""

    def fun(input_tuple):
        A = np.sum(np.sin(input_tuple[0]))
        B = np.sum(np.cos(input_tuple[1]))
        return A + B

    def d_fun(input_tuple):
        # Scalar function built from grad(fun), so checking it exercises
        # differentiation through the gradient itself.
        g = grad(fun)(input_tuple)
        A = np.sum(g[0])
        B = np.sum(np.sin(g[0]))
        C = np.sum(np.sin(g[1]))
        return A + B + C

    input_tuple = (npr.randn(5, 6), npr.randn(4, 3), npr.randn(2, 4))

    check_grads(fun)(input_tuple)
    check_grads(d_fun)(input_tuple)


def test_nested_higher_order():
    """Check first, second and third derivatives through an inner ``grad`` over an ``ag_tuple``."""

    def outer_fun(x):
        def inner_fun(y):
            return y[0] * y[1]

        return np.sum(np.sin(np.array(grad(inner_fun)(ag_tuple((x, x))))))

    check_grads(outer_fun)(5.0)
    check_grads(grad(outer_fun))(10.0)
    check_grads(grad(grad(outer_fun)))(10.0)


def test_isinstance():
    """``ag_isinstance`` must accept both ``tuple`` and ``ag_tuple``, called directly and under ``grad``."""

    def fun(x):
        assert ag_isinstance(x, tuple)
        assert ag_isinstance(x, ag_tuple)
        return x[0]

    fun((1.0, 2.0, 3.0))
    grad(fun)((1.0, 2.0, 3.0))


================================================
FILE: tests/test_vspaces.py
================================================ import itertools as it from functools import reduce import numpy as np from autograd.core import vspace from autograd.test_util import check_grads, scalar_close def check_vspace(value): vs = vspace(value) # --- required attributes --- size = vs.size add = vs.add scalar_mul = vs.scalar_mul inner_prod = vs.inner_prod randn = vs.randn zeros = vs.zeros ones = vs.ones standard_basis = vs.standard_basis # --- util --- def randns(N=2): return [randn() for i in range(N)] def rand_scalar(): return float(np.random.randn()) def rand_scalars(N=2): return [rand_scalar() for i in range(N)] def vector_close(x, y): z = randn() return scalar_close(inner_prod(z, x), inner_prod(z, y)) # --- vector space axioms --- def associativity_of_add(x, y, z): return vector_close(add(x, add(y, z)), add(add(x, y), z)) def commutativity_of_add(x, y): return vector_close(add(x, y), add(y, x)) def identity_element_of_add(x): return vector_close(add(zeros(), x), x) def inverse_elements_of_add(x): return vector_close(zeros(), add(x, scalar_mul(x, -1.0))) def compatibility_of_scalar_mul_with_field_mul(x, a, b): return vector_close(scalar_mul(x, a * b), scalar_mul(scalar_mul(x, a), b)) def identity_element_of_scalar_mul(x): return vector_close(scalar_mul(x, 1.0), x) def distributivity_of_scalar_mul_wrt_vector_add(x, y, a): return vector_close(scalar_mul(add(x, y), a), add(scalar_mul(x, a), scalar_mul(y, a))) def distributivity_of_scalar_mul_wrt_scalar_add(x, a, b): return vector_close(scalar_mul(x, a + b), add(scalar_mul(x, a), scalar_mul(x, b))) # --- closure --- def add_preserves_vspace(x, y): return vs == vspace(add(x, y)) def scalar_mul_preserves_vspace(x, a): return vs == vspace(scalar_mul(x, a)) # --- inner product axioms --- def symmetry(x, y): return scalar_close(inner_prod(x, y), inner_prod(y, x)) def linearity(x, y, a): return scalar_close(inner_prod(scalar_mul(x, a), y), a * inner_prod(x, y)) def positive_definitive(x): return 0 < inner_prod(x, 
x) def inner_zeros(): return scalar_close(0, inner_prod(zeros(), zeros())) # --- basis vectors and special vectors--- def basis_orthonormality(): return all( [ scalar_close(inner_prod(x, y), 1.0 * (ix == iy)) for (ix, x), (iy, y) in it.product(enumerate(standard_basis()), enumerate(standard_basis())) ] ) def ones_sum_of_basis_vects(): return vector_close(reduce(add, standard_basis()), ones()) def basis_correct_size(): return len(list(standard_basis())) == size def basis_correct_vspace(): return (vs == vspace(x) for x in standard_basis()) def zeros_correct_vspace(): return vs == vspace(zeros()) def ones_correct_vspace(): return vs == vspace(ones()) def randn_correct_vspace(): return vs == vspace(randn()) assert associativity_of_add(*randns(3)) assert commutativity_of_add(*randns()) assert identity_element_of_add(randn()) assert inverse_elements_of_add(randn()) assert compatibility_of_scalar_mul_with_field_mul(randn(), *rand_scalars()) assert identity_element_of_scalar_mul(randn()) assert distributivity_of_scalar_mul_wrt_vector_add(randn(), randn(), rand_scalar()) assert distributivity_of_scalar_mul_wrt_scalar_add(randn(), *rand_scalars()) assert add_preserves_vspace(*randns()) assert scalar_mul_preserves_vspace(randn(), rand_scalar()) assert symmetry(*randns()) assert linearity(randn(), randn(), rand_scalar()) assert positive_definitive(randn()) assert inner_zeros() assert basis_orthonormality() assert ones_sum_of_basis_vects() assert basis_correct_size() assert basis_correct_vspace() assert zeros_correct_vspace() assert ones_correct_vspace() assert randn_correct_vspace() # --- grads of basic operations --- check_grads(add)(*randns()) check_grads(scalar_mul)(randn(), rand_scalar()) check_grads(inner_prod)(*randns()) def test_array_vspace(): check_vspace(np.zeros((3, 2))) def test_array_vspace_0_dim(): check_vspace(0.0) def test_array_vspace_complex(): check_vspace(1.0j * np.zeros((2, 1))) def test_list_vspace(): check_vspace([1.0, np.zeros((2, 1))]) def 
test_tuple_vspace(): check_vspace((1.0, np.zeros((2, 1)))) def test_dict_vspace(): check_vspace({"a": 1.0, "b": np.zeros((2, 1))}) def test_mixed_vspace(): check_vspace({"x": [0.0, np.zeros((3, 1))], "y": ({"a": 0.0}, [0.0])}) ================================================ FILE: tests/test_wrappers.py ================================================ import warnings from functools import partial import pytest import autograd.numpy as np import autograd.numpy.random as npr from autograd import ( checkpoint, elementwise_grad, grad, grad_and_aux, hessian, hessian_tensor_product, jacobian, make_ggnvp, make_hvp, make_jvp, tensor_jacobian_product, value_and_grad, ) from autograd.test_util import check_equivalent, check_grads # , nd from autograd.tracer import isbox npr.seed(1) def test_return_both(): fun = lambda x: 3.0 * x**3.2 d_fun = grad(fun) f_and_d_fun = value_and_grad(fun) test_x = 1.7 f, d = f_and_d_fun(test_x) assert f == fun(test_x) assert d == d_fun(test_x) def test_value_and_grad(): fun = lambda x: np.sum(np.sin(x) ** 2) dfun = grad(fun) dfun_both = value_and_grad(fun) x = npr.randn(5) assert not isbox(dfun_both(x)[0]) check_equivalent(fun(x), dfun_both(x)[0]) check_equivalent(dfun(x), dfun_both(x)[1]) def fun2(x): return dfun_both(x)[0] check_grads(fun2)(x) def test_hessian(): # Check Hessian of a quadratic function. 
D = 5 H = npr.randn(D, D) def fun(x): return np.dot(np.dot(x, H), x) hess = hessian(fun) x = npr.randn(D) check_equivalent(hess(x), H + H.T) def test_multigrad(): def complicated_fun(a, b, c, d, e, f=1.1, g=9.0): return a + np.sin(b) + np.cosh(c) + np.cos(d) + np.tan(e) + f + g def complicated_fun_3_1(d_b): d, b = d_b return complicated_fun(A, b, C, d, E, f=F, g=G) A = 0.5 B = -0.3 C = 0.2 D = -1.1 E = 0.7 F = 0.6 G = -0.1 wrapped = grad(complicated_fun, argnum=[3, 1])(A, B, C, D, E, f=F, g=G) explicit = grad(complicated_fun_3_1)((D, B)) check_equivalent(wrapped, explicit) def test_value_and_multigrad(): def complicated_fun(a, b, c, d, e, f=1.1, g=9.0): return a + np.sin(b) + np.cosh(c) + np.cos(d) + np.tan(e) + f + g A = 0.5 B = -0.3 C = 0.2 D = -1.1 E = 0.7 F = 0.6 G = -0.1 dfun = grad(complicated_fun, argnum=[3, 1]) dfun_both = value_and_grad(complicated_fun, argnum=[3, 1]) check_equivalent(complicated_fun(A, B, C, D, E, f=F, g=G), dfun_both(A, B, C, D, E, f=F, g=G)[0]) check_equivalent(dfun(A, B, C, D, E, f=F, g=G), dfun_both(A, B, C, D, E, f=F, g=G)[1]) def test_multigrad_onearg(): fun = lambda x, y: np.sum(x + np.sin(y)) packed_fun = lambda xy: np.sum(xy[0] + np.sin(xy[1])) A, B = npr.randn(3), npr.randn(3) check_equivalent(grad(fun, argnum=[0])(A, B), (grad(packed_fun)((A, B))[0],)) def test_elementwise_grad(): def simple_fun(a): return a + np.sin(a) + np.cosh(a) A = npr.randn(10) wrapped = elementwise_grad(simple_fun)(A) explicit = np.array([grad(simple_fun)(A[i]) for i in range(len(A))]) check_equivalent(wrapped, explicit) def test_elementwise_grad_multiple_args(): def simple_fun(a, b): return a + np.sin(a) + np.cosh(b) A = 0.9 B = npr.randn(10) argnum = 1 wrapped = elementwise_grad(simple_fun, argnum)(A, B) explicit = np.array([grad(simple_fun, argnum)(A, B[i]) for i in range(len(B))]) check_equivalent(wrapped, explicit) def test_hessian_tensor_product(): fun = lambda a: np.sum(np.sin(a)) a = npr.randn(5) v = npr.randn(5) H = hessian(fun)(a) 
check_equivalent(np.dot(H, v), hessian_tensor_product(fun)(a, v)) def test_hvp(): fun = lambda a: np.sum(np.sin(a)) a = npr.randn(5) v = npr.randn(5) H = hessian(fun)(a) hvp = make_hvp(fun)(a)[0] check_equivalent(np.dot(H, v), hvp(v)) def test_hessian_matrix_product(): fun = lambda a: np.sum(np.sin(a)) a = npr.randn(5, 4) V = npr.randn(5, 4) H = hessian(fun)(a) check_equivalent(np.tensordot(H, V), hessian_tensor_product(fun)(a, V)) def test_hessian_tensor_product_3d(): fun = lambda a: np.sum(np.sin(a)) a = npr.randn(5, 4, 3) V = npr.randn(5, 4, 3) H = hessian(fun)(a) check_equivalent(np.tensordot(H, V, axes=np.ndim(V)), hessian_tensor_product(fun)(a, V)) def test_tensor_jacobian_product(): # This function will have an asymmetric jacobian matrix. fun = lambda a: np.roll(np.sin(a), 1) a = npr.randn(5) V = npr.randn(5) J = jacobian(fun)(a) check_equivalent(np.dot(V.T, J), tensor_jacobian_product(fun)(a, V)) def test_matrix_jacobian_product(): fun = lambda a: np.roll(np.sin(a), 1) a = npr.randn(5, 4) V = npr.randn(5, 4) J = jacobian(fun)(a) check_equivalent(np.tensordot(V, J), tensor_jacobian_product(fun)(a, V)) def test_tensor_jacobian_product(): fun = lambda a: np.roll(np.sin(a), 1) a = npr.randn(5, 4, 3) V = npr.randn(5, 4) J = jacobian(fun)(a) check_equivalent(np.tensordot(V, J, axes=np.ndim(V)), tensor_jacobian_product(fun)(a, V)) def test_deprecated_defgrad_wrapper(): from autograd.core import primitive @primitive def new_mul(x, y): return x * y with warnings.catch_warnings(record=True) as w: new_mul.defgrad(lambda ans, x, y: lambda g: y * g) new_mul.defgrad(lambda ans, x, y: lambda g: x * g, argnum=1) def fun(x, y): return new_mul(x, y) mat1 = npr.randn(2, 2) mat2 = npr.randn(2, 2) check_grads(fun, modes=["rev"])(mat1, mat2) def test_deprecated_defvjp_wrapper(): from autograd.core import primitive @primitive def new_mul(x, y): return x * y with warnings.catch_warnings(record=True) as w: new_mul.defvjp(lambda g, ans, vs, gvs, x, y: y * g) new_mul.defvjp(lambda g, 
ans, vs, gvs, x, y: x * g, argnum=1) def fun(x, y): return new_mul(x, y) mat1 = npr.randn(2, 2) mat2 = npr.randn(2, 2) check_grads(fun, modes=["rev"])(mat1, mat2) def test_deprecated_defvjp_is_zero_wrapper(): from autograd.core import primitive @primitive def new_mul(x, y): return 0 * x * y with warnings.catch_warnings(record=True) as w: new_mul.defvjp_is_zero([0, 1]) def fun(x, y): return new_mul(x, y) mat1 = npr.randn(2, 2) mat2 = npr.randn(2, 2) with warnings.catch_warnings(record=True) as w: check_grads(fun, modes=["rev"])(mat1, mat2) def test_deprecated_quick_grad_check_wrapper(): from autograd.util import quick_grad_check with warnings.catch_warnings(record=True) as w: quick_grad_check(lambda x, y: x**2 + y, 1.0, (2.0,)) def test_partial(): def f(x, y): return x grad(partial(f, y=1)) @pytest.mark.skip(reason="fails with NumPy nightlies") def test_dtypes(): def f(x): return np.real(np.sum(x**2)) # Array y with dtype np.float32 y = np.random.randn(10, 10).astype(np.float32) assert grad(f)(y).dtype.type is np.float32 y = np.random.randn(10, 10).astype(np.float16) assert grad(f)(y).dtype.type is np.float16 y = np.random.randn(10, 10).astype(np.longdouble) grad(f)(y) y = np.random.randn(10, 10).astype(np.clongdouble) grad(f)(y) def test_checkpoint_correctness(): bar = lambda x, y: 2 * x + y + 5 checkpointed_bar = checkpoint(bar) foo = lambda x: bar(x, x / 3.0) + bar(x, x**2) foo2 = lambda x: checkpointed_bar(x, x / 3.0) + checkpointed_bar(x, x**2) assert np.allclose(foo(3.0), foo2(3.0)) assert np.allclose(grad(foo)(3.0), grad(foo2)(3.0)) baz = lambda *args: sum(args) checkpointed_baz = checkpoint(baz) foobaz = lambda x: baz(x, x / 3.0) foobaz2 = lambda x: checkpointed_baz(x, x / 3.0) assert np.allclose(foobaz(3.0), foobaz2(3.0)) assert np.allclose(grad(foobaz)(3.0), grad(foobaz2)(3.0)) def checkpoint_memory(): """This test is meant to be run manually, since it depends on memory_profiler and its behavior may vary.""" try: from memory_profiler import memory_usage 
except ImportError: return def f(a): for _ in range(10): a = np.sin(a**2 + 1) return a checkpointed_f = checkpoint(f) def testfun(f, x): for _ in range(5): x = f(x) return np.sum(x) gradfun = grad(testfun, 1) A = npr.RandomState(0).randn(100000) max_usage = max(memory_usage((gradfun, (f, A)))) max_checkpointed_usage = max(memory_usage((gradfun, (checkpointed_f, A)))) assert max_checkpointed_usage < max_usage / 2.0 def test_make_jvp(): A = npr.randn(3, 5) x = npr.randn(5) v = npr.randn(5) fun = lambda x: np.tanh(np.dot(A, x)) jvp_explicit = lambda x: lambda v: np.dot(jacobian(fun)(x), v) jvp = make_jvp(fun) check_equivalent(jvp_explicit(x)(v), jvp(x)(v)[1]) def _make_explicit_ggnvp(f, g=lambda x: 1.0 / 2 * np.dot(x, x)): def ggnvp_maker(x): J = jacobian(f)(x) H = hessian(g)(f(x)) def ggnvp(v): return np.dot(J.T, np.dot(H, np.dot(J, v))) return ggnvp return ggnvp_maker def test_make_ggnvp(): A = npr.randn(5, 4) x = npr.randn(4) v = npr.randn(4) fun = lambda x: np.dot(A, x) check_equivalent(make_ggnvp(fun)(x)(v), _make_explicit_ggnvp(fun)(x)(v)) fun2 = lambda x: np.tanh(np.dot(A, x)) check_equivalent(make_ggnvp(fun2)(x)(v), _make_explicit_ggnvp(fun2)(x)(v)) def test_make_ggnvp_nondefault_g(): A = npr.randn(5, 4) x = npr.randn(4) v = npr.randn(4) g = lambda y: np.sum(2.0 * y**2 + y**4) fun = lambda x: np.dot(A, x) check_equivalent(make_ggnvp(fun, g)(x)(v), _make_explicit_ggnvp(fun, g)(x)(v)) fun2 = lambda x: np.tanh(np.dot(A, x)) check_equivalent(make_ggnvp(fun2, g)(x)(v), _make_explicit_ggnvp(fun2, g)(x)(v)) def test_grad_and_aux(): A = npr.randn(5, 4) x = npr.randn(4) f = lambda x: (np.sum(np.dot(A, x)), x**2) g = lambda x: np.sum(np.dot(A, x)) assert len(grad_and_aux(f)(x)) == 2 check_equivalent(grad_and_aux(f)(x)[0], grad(g)(x)) check_equivalent(grad_and_aux(f)(x)[1], x**2) ## No longer support this behavior # def test_make_ggnvp_broadcasting(): # A = npr.randn(4, 5) # x = npr.randn(10, 4) # v = npr.randn(10, 4) # fun = lambda x: np.tanh(np.dot(x, A)) # res1 = 
np.stack([_make_explicit_ggnvp(fun)(xi)(vi) for xi, vi in zip(x, v)]) # res2 = make_ggnvp(fun)(x)(v) # check_equivalent(res1, res2) def test_wrapped_name_and_docs(): def foo(x): pass assert grad.__name__ == "grad" # Python 3.13: Compiler now strip indents from docstrings. # https://docs.python.org/3.13/whatsnew/3.13.html#other-language-changes assert grad.__doc__.startswith(tuple(f"\n{indent}Returns a function which" for indent in (" ", ""))) assert grad(foo, 1).__name__ == "grad_of_foo_wrt_argnum_1" assert grad(foo, 1).__doc__.startswith(" grad of function foo with")