Repository: ddbourgin/numpy-ml
Branch: master
Commit: b0359af5285f
Files: 194
Total size: 1.3 MB

Directory structure:
gitextract_t47luwfk/

├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   └── a--bug-performance-issue.md
│   └── PULL_REQUEST_TEMPLATE.md
├── .gitignore
├── .readthedocs.yml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── docs/
│   ├── Makefile
│   ├── README.md
│   ├── conf.py
│   ├── index.rst
│   ├── make.bat
│   ├── numpy_ml.bandits.bandits.rst
│   ├── numpy_ml.bandits.policies.rst
│   ├── numpy_ml.bandits.rst
│   ├── numpy_ml.bandits.trainer.rst
│   ├── numpy_ml.factorization.factors.rst
│   ├── numpy_ml.factorization.rst
│   ├── numpy_ml.gmm.gmm.rst
│   ├── numpy_ml.gmm.rst
│   ├── numpy_ml.hmm.MultinomialHMM.rst
│   ├── numpy_ml.hmm.rst
│   ├── numpy_ml.lda.lda.rst
│   ├── numpy_ml.lda.rst
│   ├── numpy_ml.lda.smoothed_lda.rst
│   ├── numpy_ml.linear_models.lm.rst
│   ├── numpy_ml.linear_models.rst
│   ├── numpy_ml.neural_nets.activations.rst
│   ├── numpy_ml.neural_nets.initializers.rst
│   ├── numpy_ml.neural_nets.layers.rst
│   ├── numpy_ml.neural_nets.losses.rst
│   ├── numpy_ml.neural_nets.models.rst
│   ├── numpy_ml.neural_nets.modules.rst
│   ├── numpy_ml.neural_nets.optimizers.rst
│   ├── numpy_ml.neural_nets.rst
│   ├── numpy_ml.neural_nets.schedulers.rst
│   ├── numpy_ml.neural_nets.utils.rst
│   ├── numpy_ml.neural_nets.wrappers.rst
│   ├── numpy_ml.ngram.additive.rst
│   ├── numpy_ml.ngram.goodturing.rst
│   ├── numpy_ml.ngram.mle.rst
│   ├── numpy_ml.ngram.rst
│   ├── numpy_ml.nonparametric.gp.rst
│   ├── numpy_ml.nonparametric.kernel_regression.rst
│   ├── numpy_ml.nonparametric.knn.rst
│   ├── numpy_ml.nonparametric.rst
│   ├── numpy_ml.preprocessing.dsp.rst
│   ├── numpy_ml.preprocessing.general.rst
│   ├── numpy_ml.preprocessing.nlp.rst
│   ├── numpy_ml.preprocessing.rst
│   ├── numpy_ml.rl_models.agents.rst
│   ├── numpy_ml.rl_models.rl_utils.rst
│   ├── numpy_ml.rl_models.rst
│   ├── numpy_ml.rl_models.trainer.rst
│   ├── numpy_ml.trees.dt.rst
│   ├── numpy_ml.trees.gbdt.rst
│   ├── numpy_ml.trees.losses.rst
│   ├── numpy_ml.trees.rf.rst
│   ├── numpy_ml.trees.rst
│   ├── numpy_ml.utils.data_structures.rst
│   ├── numpy_ml.utils.distance_metrics.rst
│   ├── numpy_ml.utils.graphs.rst
│   ├── numpy_ml.utils.kernels.rst
│   ├── numpy_ml.utils.rst
│   ├── numpy_ml.utils.testing.rst
│   ├── numpy_ml.utils.windows.rst
│   └── requirements.txt
├── numpy_ml/
│   ├── README.md
│   ├── __init__.py
│   ├── bandits/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── bandits.py
│   │   ├── policies.py
│   │   └── trainer.py
│   ├── factorization/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   └── factors.py
│   ├── gmm/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   └── gmm.py
│   ├── hmm/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   └── hmm.py
│   ├── lda/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── lda.py
│   │   └── lda_smoothed.py
│   ├── linear_models/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── bayesian_regression.py
│   │   ├── glm.py
│   │   ├── linear_regression.py
│   │   ├── logistic.py
│   │   ├── naive_bayes.py
│   │   └── ridge.py
│   ├── neural_nets/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── activations/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   └── activations.py
│   │   ├── initializers/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   └── initializers.py
│   │   ├── layers/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   └── layers.py
│   │   ├── losses/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   └── losses.py
│   │   ├── models/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── vae.py
│   │   │   ├── w2v.py
│   │   │   └── wgan_gp.py
│   │   ├── modules/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   └── modules.py
│   │   ├── optimizers/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   └── optimizers.py
│   │   ├── schedulers/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   └── schedulers.py
│   │   ├── utils/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   └── utils.py
│   │   └── wrappers/
│   │       ├── README.md
│   │       ├── __init__.py
│   │       └── wrappers.py
│   ├── ngram/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   └── ngram.py
│   ├── nonparametric/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── gp.py
│   │   ├── kernel_regression.py
│   │   └── knn.py
│   ├── plots/
│   │   ├── bandit_plots.py
│   │   ├── gmm_plots.py
│   │   ├── hmm_plots.py
│   │   ├── lda_plots.py
│   │   ├── lm_plots.py
│   │   ├── ngram_plots.py
│   │   ├── nn_activations_plots.py
│   │   ├── nn_schedulers_plots.py
│   │   ├── nonparametric_plots.py
│   │   ├── rl_plots.py
│   │   └── trees_plots.py
│   ├── preprocessing/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── dsp.py
│   │   ├── general.py
│   │   └── nlp.py
│   ├── rl_models/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── agents.py
│   │   ├── rl_utils.py
│   │   ├── tiles/
│   │   │   ├── __init__.py
│   │   │   └── tiles3.py
│   │   └── trainer.py
│   ├── tests/
│   │   ├── __init__.py
│   │   ├── nn_torch_models.py
│   │   ├── test_glm.py
│   │   ├── test_linear_regression.py
│   │   ├── test_naive_bayes.py
│   │   ├── test_ngram.py
│   │   ├── test_nn.py
│   │   ├── test_nn_activations.py
│   │   ├── test_nonparametric.py
│   │   ├── test_preprocessing.py
│   │   ├── test_trees.py
│   │   └── test_utils.py
│   ├── trees/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── dt.py
│   │   ├── gbdt.py
│   │   ├── losses.py
│   │   └── rf.py
│   └── utils/
│       ├── README.md
│       ├── __init__.py
│       ├── data_structures.py
│       ├── distance_metrics.py
│       ├── graphs.py
│       ├── kernels.py
│       ├── misc.py
│       ├── testing.py
│       └── windows.py
├── requirements-dev.txt
├── requirements-test.txt
├── requirements.txt
├── setup.py
└── tox.ini

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/ISSUE_TEMPLATE/a--bug-performance-issue.md
================================================
---
name: Bug/Performance Issue
about: Use this template for reporting a bug or a performance issue.
labels: bugfix
---

**System information**
- OS Platform and Distribution (e.g., Linux Ubuntu 16.04):
- Python version:
- NumPy version:

**Describe the current behavior**

**Describe the expected behavior**

**Code to reproduce the issue**
<!-- Provide a reproducible test case that is the bare minimum necessary to generate the problem. -->

**Other info / logs**
<!-- Include any logs or source code that would be helpful to diagnose the problem.
If including tracebacks, please include the full traceback. Large logs and files should be attached. -->


================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
### All Submissions

* [ ] Is the code you are submitting your own work?
* [ ] Have you followed the [contributing guidelines](https://github.com/ddbourgin/numpy-ml/blob/master/CONTRIBUTING.md)?
* [ ] Have you checked to ensure there aren't other open [Pull Requests](https://github.com/ddbourgin/numpy-ml/pulls) for the same update/change?

### New Model Submissions

* [ ] Is the code you are submitting your own work?
* [ ] Did you properly attribute the authors of any code you referenced?
* [ ] Did you write unit tests for your new model?
* [ ] Does your submission pass the unit tests?
* [ ] Did you write documentation for your new model?
* [ ] Have you formatted your code using the [black](https://black.now.sh/) defaults?

### Changes to Existing Models

* [ ] Have you added an explanation of what your changes do and why you'd like us to include them?
* [ ] Have you written new tests for your changes, as applicable?
* [ ] Have you successfully run tests with your changes locally?


================================================
FILE: .gitignore
================================================
### OSX ###
# General
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

### Python Patch ###
.venv/

### Vim ###
# Swap
[._]*.s[a-v][a-z]
[._]*.sw[a-p]
[._]s[a-rt-v][a-z]
[._]ss[a-gi-z]
[._]sw[a-p]

# Session
Session.vim

# Temporary
.netrwhist
*~

# Auto-generated tag files
tags

# Persistent undo
[._]*.un~

# No pdfs
*.pdf

# No TODOs ;-)
TODO

_build
_static


================================================
FILE: .readthedocs.yml
================================================
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

# Build documentation in the docs/ directory with Sphinx
sphinx:
  configuration: docs/conf.py

# Build documentation with MkDocs
#mkdocs:
#  configuration: mkdocs.yml

# Optionally build your docs in additional formats such as PDF and ePub
formats:
    - htmlzip

# Optionally set the version of Python and requirements required to build your docs
python:
  version: 3.7
  install:
    - requirements: docs/requirements.txt


================================================
FILE: CODE_OF_CONDUCT.md
================================================
# NumPy-ML Code of Conduct

## Our Pledge

In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to making participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, sex characteristics, gender identity and expression,
level of experience, education, socio-economic status, nationality, personal
appearance, race, religion, or sexual identity and orientation.

## Our Standards

Examples of behavior that contributes to creating a positive environment
include:

* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members

Examples of unacceptable behavior by participants include:

* The use of sexualized language or imagery and unwelcome sexual attention or
  advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
  address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
  professional setting

## Our Responsibilities

Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.

Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.

## Scope

This Code of Conduct applies within all project spaces, and it also applies when
an individual is representing the project or its community in public spaces.
Examples of representing a project or community include using an official
project e-mail address, posting via an official social media account, or acting
as an appointed representative at an online or offline event. Representation of
a project may be further defined and clarified by project maintainers.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at ddbourgin@gmail.com. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.

Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html

[homepage]: https://www.contributor-covenant.org

For answers to common questions about this code of conduct, see
https://www.contributor-covenant.org/faq


================================================
FILE: CONTRIBUTING.md
================================================
## Contributing

Thank you for contributing to numpy-ml!

| <p align="center">⚠️ ⚠️ All PRs should reflect earnest attempts at implementing a model yourself. ⚠️⚠️ </p> It is fine to reference others' code. It is not fine to blindly copy without attribution. When in doubt, please ask. |
| --- |

### General guidelines
1. Please include a clear list of what you've done
2. For pull requests, please make sure all commits are [*atomic*](https://en.wikipedia.org/wiki/Atomic_commit) (i.e., one feature per commit)
3. If you're submitting a new model / feature / module, **please include proper documentation and unit tests.**
    - See the `test_*.py` files in the `numpy_ml/tests/` directory for examples of unit tests.
    - Documentation is loosely based on the [NumPy docstring style](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_numpy.html). When in doubt, refer to existing examples.
4. Please format your code using the [black](https://github.com/python/black) defaults. You can use this [online formatter](https://black.now.sh/).

### Specific guidelines
#### I have a new model / model component to contribute
 Awesome - create a [pull request](https://github.com/ddbourgin/numpy-ml/pulls)! When preparing your PR, please include a brief description of the model, the canonical reference(s) in the literature, and, most importantly, unit tests against an existing implementation!
  - Refer to the `test_*.py` files in the `numpy_ml/tests/` directory for examples.

#### I have a major new enhancement / adjustment that will affect multiple models
 Please post an [issue](https://github.com/ddbourgin/numpy-ml/issues) with your proposal before you begin working on it. When outlining your proposal, please include as much detail about your intended changes as possible.

#### I found a bug
 If there isn't already an [open issue](https://github.com/ddbourgin/numpy-ml/issues), please start one! When creating your issue, include:
  1. A title and clear description
  2. As much relevant information as possible
  3. A code sample demonstrating the expected behavior that is not occurring

#### I fixed a bug
 Thank you! Please open a new [pull request](https://github.com/ddbourgin/numpy-ml/pulls) with the patch. When doing so, ensure the PR description clearly describes the problem and solution. Include the relevant issue number if applicable.


================================================
FILE: LICENSE
================================================
                    GNU GENERAL PUBLIC LICENSE
                       Version 3, 29 June 2007

 Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.

                            Preamble

  The GNU General Public License is a free, copyleft license for
software and other kinds of works.

  The licenses for most software and other practical works are designed
to take away your freedom to share and change the works.  By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users.  We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors.  You can apply it to
your programs, too.

  When we speak of free software, we are referring to freedom, not
price.  Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.

  To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights.  Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.

  For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received.  You must make sure that they, too, receive
or can get the source code.  And you must show them these terms so they
know their rights.

  Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.

  For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software.  For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.

  Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so.  This is fundamentally incompatible with the aim of
protecting users' freedom to change the software.  The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable.  Therefore, we
have designed this version of the GPL to prohibit the practice for those
products.  If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.

  Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary.  To prevent this, the GPL assures that
patents cannot be used to render the program non-free.

  The precise terms and conditions for copying, distribution and
modification follow.

                       TERMS AND CONDITIONS

  0. Definitions.

  "This License" refers to version 3 of the GNU General Public License.

  "Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.

  "The Program" refers to any copyrightable work licensed under this
License.  Each licensee is addressed as "you".  "Licensees" and
"recipients" may be individuals or organizations.

  To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy.  The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.

  A "covered work" means either the unmodified Program or a work based
on the Program.

  To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy.  Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.

  To "convey" a work means any kind of propagation that enables other
parties to make or receive copies.  Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.

  An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License.  If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.

  1. Source Code.

  The "source code" for a work means the preferred form of the work
for making modifications to it.  "Object code" means any non-source
form of a work.

  A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.

  The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form.  A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.

  The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities.  However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work.  For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.

  The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.

  The Corresponding Source for a work in source code form is that
same work.

  2. Basic Permissions.

  All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met.  This License explicitly affirms your unlimited
permission to run the unmodified Program.  The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work.  This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.

  You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force.  You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright.  Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.

  Conveying under any other circumstances is permitted solely under
the conditions stated below.  Sublicensing is not allowed; section 10
makes it unnecessary.

  3. Protecting Users' Legal Rights From Anti-Circumvention Law.

  No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.

  When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.

  4. Conveying Verbatim Copies.

  You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.

  You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.

  5. Conveying Modified Source Versions.

  You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:

    a) The work must carry prominent notices stating that you modified
    it, and giving a relevant date.

    b) The work must carry prominent notices stating that it is
    released under this License and any conditions added under section
    7.  This requirement modifies the requirement in section 4 to
    "keep intact all notices".

    c) You must license the entire work, as a whole, under this
    License to anyone who comes into possession of a copy.  This
    License will therefore apply, along with any applicable section 7
    additional terms, to the whole of the work, and all its parts,
    regardless of how they are packaged.  This License gives no
    permission to license the work in any other way, but it does not
    invalidate such permission if you have separately received it.

    d) If the work has interactive user interfaces, each must display
    Appropriate Legal Notices; however, if the Program has interactive
    interfaces that do not display Appropriate Legal Notices, your
    work need not make them do so.

  A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit.  Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.

  6. Conveying Non-Source Forms.

  You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:

    a) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by the
    Corresponding Source fixed on a durable physical medium
    customarily used for software interchange.

    b) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by a
    written offer, valid for at least three years and valid for as
    long as you offer spare parts or customer support for that product
    model, to give anyone who possesses the object code either (1) a
    copy of the Corresponding Source for all the software in the
    product that is covered by this License, on a durable physical
    medium customarily used for software interchange, for a price no
    more than your reasonable cost of physically performing this
    conveying of source, or (2) access to copy the
    Corresponding Source from a network server at no charge.

    c) Convey individual copies of the object code with a copy of the
    written offer to provide the Corresponding Source.  This
    alternative is allowed only occasionally and noncommercially, and
    only if you received the object code with such an offer, in accord
    with subsection 6b.

    d) Convey the object code by offering access from a designated
    place (gratis or for a charge), and offer equivalent access to the
    Corresponding Source in the same way through the same place at no
    further charge.  You need not require recipients to copy the
    Corresponding Source along with the object code.  If the place to
    copy the object code is a network server, the Corresponding Source
    may be on a different server (operated by you or a third party)
    that supports equivalent copying facilities, provided you maintain
    clear directions next to the object code saying where to find the
    Corresponding Source.  Regardless of what server hosts the
    Corresponding Source, you remain obligated to ensure that it is
    available for as long as needed to satisfy these requirements.

    e) Convey the object code using peer-to-peer transmission, provided
    you inform other peers where the object code and Corresponding
    Source of the work are being offered to the general public at no
    charge under subsection 6d.

  A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.

  A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling.  In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage.  For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product.  A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.

  "Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source.  The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.

  If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information.  But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).

  The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed.  Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.

  Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.

  7. Additional Terms.

  "Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law.  If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.

  When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it.  (Additional permissions may be written to require their own
removal in certain cases when you modify the work.)  You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.

  Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:

    a) Disclaiming warranty or limiting liability differently from the
    terms of sections 15 and 16 of this License; or

    b) Requiring preservation of specified reasonable legal notices or
    author attributions in that material or in the Appropriate Legal
    Notices displayed by works containing it; or

    c) Prohibiting misrepresentation of the origin of that material, or
    requiring that modified versions of such material be marked in
    reasonable ways as different from the original version; or

    d) Limiting the use for publicity purposes of names of licensors or
    authors of the material; or

    e) Declining to grant rights under trademark law for use of some
    trade names, trademarks, or service marks; or

    f) Requiring indemnification of licensors and authors of that
    material by anyone who conveys the material (or modified versions of
    it) with contractual assumptions of liability to the recipient, for
    any liability that these contractual assumptions directly impose on
    those licensors and authors.

  All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10.  If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term.  If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.

  If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.

  Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.

  8. Termination.

  You may not propagate or modify a covered work except as expressly
provided under this License.  Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).

  However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.

  Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.

  Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License.  If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.

  9. Acceptance Not Required for Having Copies.

  You are not required to accept this License in order to receive or
run a copy of the Program.  Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance.  However,
nothing other than this License grants you permission to propagate or
modify any covered work.  These actions infringe copyright if you do
not accept this License.  Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.

  10. Automatic Licensing of Downstream Recipients.

  Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License.  You are not responsible
for enforcing compliance by third parties with this License.

  An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations.  If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.

  You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License.  For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.

  11. Patents.

  A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based.  The
work thus licensed is called the contributor's "contributor version".

  A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version.  For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.

  Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.

  In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement).  To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.

  If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients.  "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.

  If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.

  A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License.  You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.

  Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.

  12. No Surrender of Others' Freedom.

  If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License.  If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all.  For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.

  13. Use with the GNU Affero General Public License.

  Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work.  The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.

  14. Revised Versions of this License.

  The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time.  Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.

  Each version is given a distinguishing version number.  If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation.  If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.

  If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.

  Later license versions may give you additional or different
permissions.  However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.

  15. Disclaimer of Warranty.

  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.

  16. Limitation of Liability.

  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.

  17. Interpretation of Sections 15 and 16.

  If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.

                     END OF TERMS AND CONDITIONS

            How to Apply These Terms to Your New Programs

  If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.

  To do so, attach the following notices to the program.  It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.

    <one line to give the program's name and a brief idea of what it does.>
    Copyright (C) <year>  <name of author>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.

Also add information on how to contact you by electronic and paper mail.

  If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:

    <program>  Copyright (C) <year>  <name of author>
    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
    This is free software, and you are welcome to redistribute it
    under certain conditions; type `show c' for details.

The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License.  Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".

  You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<https://www.gnu.org/licenses/>.

  The GNU General Public License does not permit incorporating your program
into proprietary programs.  If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library.  If this is what you want to do, use the GNU Lesser General
Public License instead of this License.  But first, please read
<https://www.gnu.org/licenses/why-not-lgpl.html>.


================================================
FILE: MANIFEST.in
================================================
include README.md
include requirements*.txt
include docs/*.rst
include docs/img/*.png


================================================
FILE: README.md
================================================
# numpy-ml
Ever wish you had an inefficient but somewhat legible collection of machine
learning algorithms implemented exclusively in NumPy? No?

## Installation

### For rapid experimentation
To use this code as a starting point for ML prototyping / experimentation, just clone the repository, create a new [virtualenv](https://pypi.org/project/virtualenv/), and start hacking:

```sh
$ git clone https://github.com/ddbourgin/numpy-ml.git
$ cd numpy-ml && virtualenv npml && source npml/bin/activate
$ pip3 install -r requirements-dev.txt
```

### As a package
If you don't plan to modify the source, you can also install numpy-ml as a
Python package: `pip3 install -U numpy_ml`.

The reinforcement learning agents train on environments defined in the [OpenAI
gym](https://github.com/openai/gym). To install these alongside numpy-ml, you
can use `pip3 install -U 'numpy_ml[rl]'`.

## Documentation
For more details on the available models, see the [project documentation](https://numpy-ml.readthedocs.io/).

## Available models
<details>
  <summary>Click to expand!</summary>

1. **Gaussian mixture model**
    - EM training

2. **Hidden Markov model**
    - Viterbi decoding
    - Likelihood computation
    - MLE parameter estimation via Baum-Welch/forward-backward algorithm

3. **Latent Dirichlet allocation** (topic model)
    - Standard model with MLE parameter estimation via variational EM
    - Smoothed model with MAP parameter estimation via MCMC

4. **Neural networks**
    * Layers / Layer-wise ops
        - Add
        - Flatten
        - Multiply
        - Softmax
        - Fully-connected/Dense
        - Sparse evolutionary connections
        - LSTM
        - Elman-style RNN
        - Max + average pooling
        - Dot-product attention
        - Embedding layer
        - Restricted Boltzmann machine (w. CD-n training)
        - 2D deconvolution (w. padding and stride)
        - 2D convolution (w. padding, dilation, and stride)
        - 1D convolution (w. padding, dilation, stride, and causality)
    * Modules
        - Bidirectional LSTM
        - ResNet-style residual blocks (identity and convolution)
        - WaveNet-style residual blocks with dilated causal convolutions
        - Transformer-style multi-headed scaled dot product attention
    * Regularizers
        - Dropout
    * Normalization
        - Batch normalization (spatial and temporal)
        - Layer normalization (spatial and temporal)
    * Optimizers
        - SGD w/ momentum
        - AdaGrad
        - RMSProp
        - Adam
    * Learning Rate Schedulers
        - Constant
        - Exponential
        - Noam/Transformer
        - Dlib scheduler
    * Weight Initializers
        - Glorot/Xavier uniform and normal
        - He/Kaiming uniform and normal
        - Standard and truncated normal
    * Losses
        - Cross entropy
        - Squared error
        - Bernoulli VAE loss
        - Wasserstein loss with gradient penalty
        - Noise contrastive estimation loss
    * Activations
        - ReLU
        - Tanh
        - Affine
        - Sigmoid
        - Leaky ReLU
        - ELU
        - SELU
        - GELU
        - Exponential
        - Hard Sigmoid
        - Softplus
    * Models
        - Bernoulli variational autoencoder
        - Wasserstein GAN with gradient penalty
        - word2vec encoder with skip-gram and CBOW architectures
    * Utilities
        - `col2im` (MATLAB port)
        - `im2col` (MATLAB port)
        - `conv1D`
        - `conv2D`
        - `deconv2D`
        - `minibatch`

5. **Tree-based models**
    - Decision trees (CART)
    - [Bagging] Random forests
    - [Boosting] Gradient-boosted decision trees

6. **Linear models**
    - Ridge regression
    - Logistic regression
    - Ordinary least squares
    - Weighted linear regression
    - Generalized linear model (log, logit, and identity link)
    - Gaussian naive Bayes classifier
    - Bayesian linear regression w/ conjugate priors
        - Unknown mean, known variance (Gaussian prior)
        - Unknown mean, unknown variance (Normal-Gamma / Normal-Inverse-Wishart prior)

7. **n-Gram sequence models**
    - Maximum likelihood scores
    - Additive/Lidstone smoothing
    - Simple Good-Turing smoothing

8. **Multi-armed bandit models**
    - UCB1
    - LinUCB
    - Epsilon-greedy
    - Thompson sampling w/ conjugate priors
        - Beta-Bernoulli sampler

8. **Reinforcement learning models**
    - Cross-entropy method agent
    - First visit on-policy Monte Carlo agent
    - Weighted incremental importance sampling Monte Carlo agent
    - Expected SARSA agent
    - TD-0 Q-learning agent
    - Dyna-Q / Dyna-Q+ with prioritized sweeping

9. **Nonparametric models**
    - Nadaraya-Watson kernel regression
    - k-Nearest neighbors classification and regression
    - Gaussian process regression

10. **Matrix factorization**
    - Regularized alternating least-squares
    - Non-negative matrix factorization

11. **Preprocessing**
    - Discrete Fourier transform (1D signals)
    - Discrete cosine transform (type-II) (1D signals)
    - Bilinear interpolation (2D signals)
    - Nearest neighbor interpolation (1D and 2D signals)
    - Autocorrelation (1D signals)
    - Signal windowing
    - Text tokenization
    - Feature hashing
    - Feature standardization
    - One-hot encoding / decoding
    - Huffman coding / decoding
    - Byte pair encoding / decoding
    - Term frequency-inverse document frequency (TF-IDF) encoding
    - MFCC encoding

12. **Utilities**
    - Similarity kernels
    - Distance metrics
    - Priority queue
    - Ball tree
    - Discrete sampler
    - Graph processing and generators
</details>

## Contributing

Am I missing your favorite model? Is there something that could be cleaner /
less confusing? Did I mess something up? Submit a PR! The only requirement is
that your models are written with just the [Python standard
library](https://docs.python.org/3/library/) and [NumPy](https://www.numpy.org/). The
[SciPy library](https://scipy.github.io/devdocs/) is also permitted under special
circumstances ;)

See full contributing guidelines [here](./CONTRIBUTING.md).


================================================
FILE: docs/Makefile
================================================
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = .
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)


================================================
FILE: docs/README.md
================================================
To build the documentation locally, [install sphinx](http://www.sphinx-doc.org/en/master/usage/installation.html), cd into the `docs` directory, and run `make html`. Local files will be generated in the `docs/_build/html` directory.


================================================
FILE: docs/conf.py
================================================
# -*- coding: utf-8 -*-
#
# Configuration file for the Sphinx documentation builder.
#
# This file contains only a selection of the most common options. For a
# full list see the documentation:
# http://www.sphinx-doc.org/en/master/config

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
import inspect

sys.path.insert(0, os.path.abspath(".."))


# Base URL of the GitHub repository; linkcode_resolve() below prefixes it to
# the relative source path when building "view source" links.
gh_url = "https://github.com/ddbourgin/numpy-ml"

# -- Project information -----------------------------------------------------

project = "numpy-ml"
# NOTE: `copyright` shadows the Python builtin of the same name; presumably
# this exact name is required by Sphinx, so it is left as-is.
copyright = "2022, David Bourgin"
author = "David Bourgin"

# The short X.Y version
version = "0.1"
# The full version, including alpha/beta/rc tags
release = "0.1.0"


# -- General configuration ---------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    "sphinx.ext.autodoc",
    "sphinx.ext.doctest",
    "sphinx.ext.intersphinx",
    "sphinx.ext.todo",
    "sphinx.ext.coverage",
    "sphinx.ext.mathjax",
    "sphinx.ext.ifconfig",
    "sphinx.ext.githubpages",
    "sphinx.ext.napoleon",
    # Keep the trailing comma: without it, re-enabling the entry below would
    # silently concatenate the two adjacent string literals into one name.
    "sphinx.ext.linkcode",
    #  "numpydoc",
]

# to avoid memory errors in the read-the-docs build process
autodoc_mock_imports = ["tensorflow", "torch", "gym"]

# Try to link to source code on GitHub
def linkcode_resolve(domain, info):
    """
    Build the GitHub URL pointing at the source of a documented object.

    Resolution is best-effort: whenever the object cannot be located or its
    source cannot be read, ``None`` is returned instead of raising, so that a
    single unresolvable object never aborts the documentation build.

    Parameters
    ----------
    domain : str
        Domain of the documented object. Only ``"py"`` is handled.
    info : dict
        Mapping expected to contain the keys ``"module"`` (dotted module
        name) and ``"fullname"`` (dotted object name within that module).

    Returns
    -------
    url : str or None
        A URL of the form ``<gh_url>/blob/master/<path>#L<start>-L<end>``, or
        ``None`` if no link could be produced.
    """
    if domain != "py":
        return None

    module = info.get("module")
    fullname = info.get("fullname")
    if not module or not fullname:
        return None

    # Only modules that have already been imported can be inspected
    obj = sys.modules.get(module)
    if obj is None:
        return None

    # Walk the dotted name (e.g. "Class.method") down from the module object
    for part in fullname.split("."):
        try:
            obj = getattr(obj, part)
        except AttributeError:
            # e.g. attributes that only exist on instances
            return None
        if isinstance(obj, property):
            # inspect works on the getter function, not the property object
            obj = obj.fget

    try:
        source_file = inspect.getsourcefile(obj)
        if source_file is None:
            return None
        source, line_start = inspect.getsourcelines(obj)
    except Exception:
        # Builtins, C extensions, mocked imports, or missing source files.
        # Unlike a bare ``except``, this lets KeyboardInterrupt / SystemExit
        # propagate.
        return None

    rel_path = os.path.relpath(source_file, start=os.path.abspath(".."))
    # URLs always use forward slashes, regardless of the build platform
    rel_path = rel_path.replace(os.sep, "/")
    line_end = line_start + len(source) - 1
    return f"{gh_url}/blob/master/{rel_path}#L{line_start}-L{line_end}"


# Napoleon settings
# https://sphinxcontrib-napoleon.readthedocs.io/en/latest/sphinxcontrib.napoleon.html#sphinxcontrib.napoleon.Config
# NumPy-style docstring support is switched on and Google-style is switched
# off; see the link above for the meaning of the individual flags.
napoleon_google_docstring = False
napoleon_numpy_docstring = True
napoleon_include_init_with_doc = False
napoleon_include_private_with_doc = False
napoleon_include_special_with_doc = False
napoleon_use_admonition_for_examples = False
napoleon_use_admonition_for_notes = False
napoleon_use_admonition_for_references = False
napoleon_use_ivar = True
napoleon_use_param = True
napoleon_use_rtype = False
napoleon_use_keyword = True


# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = ".rst"

# The master toctree document.
master_doc = "index"

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
#
# Use an explicit language code: Sphinx >= 5 warns about ``language = None``
# and falls back to "en" anyway.
language = "en"

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = "friendly"

# Controls which docstrings are inserted into the body of an autoclass
# directive; see the Sphinx autodoc documentation for the accepted values.
autoclass_content = "both"


# -- Options for HTML output -------------------------------------------------

# Theme applied to the HTML and HTML Help pages (see the Sphinx documentation
# for the list of builtin themes).
html_theme = "alabaster"

# Additional stylesheet(s) to include in the generated pages.
html_css_files = ["css/custom.css"]

# Directories (relative to this one) holding custom static files such as
# style sheets. They are copied after the builtin static files, so a file
# named "default.css" here overwrites the builtin "default.css".
html_static_path = ["_static"]

# Sidebar templates: a dictionary mapping document-name patterns to lists of
# template names. Documents that match no pattern get the theme's defaults
# (for builtin themes: ``['localtoc.html', 'relations.html',
# 'sourcelink.html', 'searchbox.html']``).
html_sidebars = {
    "**": [
        "about.html",
        "navigation.html",
        "relations.html",
        "searchbox.html",
        "donate.html",
    ],
}

# Theme-specific options customizing the look and feel of the theme; the
# available options are listed in the theme's own documentation.
html_theme_options = dict(
    github_user="ddbourgin",
    github_repo="numpy-ml",
    description="Machine learning, in NumPy",
    github_button=True,
    show_powered_by=False,
    fixed_sidebar=True,
    analytics_id="UA-65839510-3",
    #  logo="logo.png",
)


# -- Options for HTMLHelp output ---------------------------------------------

# Base name of the file written by the HTML help builder.
htmlhelp_basename = "numpy-mldoc"


# -- Options for LaTeX output ------------------------------------------------

# No LaTeX overrides are currently set. Keys that may be supplied here:
#
#   'papersize': 'letterpaper'   paper size ('letterpaper' or 'a4paper')
#   'pointsize': '10pt'          font size ('10pt', '11pt' or '12pt')
#   'preamble': ''               additional stuff for the LaTeX preamble
#   'figure_align': 'htbp'       LaTeX figure (float) alignment
latex_elements = {}

# How the document tree is grouped into LaTeX files. Each tuple is
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (
        master_doc,
        "numpy-ml.tex",
        "numpy-ml Documentation",
        "David Bourgin",
        "manual",
    )
]


# -- Options for manual page output ------------------------------------------

# One tuple per manual page:
# (source start file, name, description, authors, manual section).
man_pages = [
    (
        master_doc,
        "numpy-ml",
        "numpy-ml Documentation",
        [author],
        1,
    )
]


# -- Options for Texinfo output ----------------------------------------------

# How the document tree is grouped into Texinfo files. Each tuple is
# (source start file, target name, title, author,
#  dir menu entry, description, category).
texinfo_documents = [
    (
        master_doc,
        "numpy-ml",
        "numpy-ml Documentation",
        author,
        "numpy-ml",
        "Machine learning, in NumPy.",
        "Miscellaneous",
    )
]


# -- Options for Epub output -------------------------------------------------

# Bibliographic Dublin Core info.
epub_title = project

# Optional identifiers, currently left unset:
#
#   epub_identifier = ''   unique identifier of the text (an ISBN number or
#                          the project homepage)
#   epub_uid = ''          a unique identification for the text

# Files that should not be packed into the epub file.
epub_exclude_files = ["search.html"]

# Document members in the order in which they appear in the source.
autodoc_member_order = "bysource"


# -- Extension configuration -------------------------------------------------

# -- Options for intersphinx extension ---------------------------------------

# Cross-reference the Python standard library and NumPy documentation.
# The NumPy reference moved from docs.scipy.org to numpy.org; point at the
# current location (over HTTPS) instead of relying on a redirect.
intersphinx_mapping = {
    "python": ("https://docs.python.org/3/", None),
    "numpy": ("https://numpy.org/doc/stable/", None),
}

# -- Options for todo extension ----------------------------------------------

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = True

# -- Options for numpydocs extension -----------------------------------------
# https://numpydoc.readthedocs.io/en/latest/install.html
#
# NOTE: these settings only take effect when "numpydoc" is listed in
# `extensions`.

# Whether to produce plot:: directives for Examples sections that contain
# import matplotlib or from matplotlib import.
numpydoc_use_plots = True

# Whether to show all members of a class in the Methods and Attributes sections
# automatically. True by default.
numpydoc_show_class_members = True

#  Whether to show all inherited members of a class in the Methods and
#  Attributes sections automatically. If it's false, inherited members won't
#  be shown. True by default.
numpydoc_show_inherited_class_members = True

#  Whether to create a Sphinx table of contents for the lists of class methods
#  and attributes. If a table of contents is made, Sphinx expects each entry to
#  have a separate page. True by default.
numpydoc_class_members_toctree = False

# A regular expression matching citations which should be mangled to avoid
# conflicts due to duplication across the documentation. Defaults to [\w-]+.
numpydoc_citation_re = r"[\w-]+"

# Until version 0.8, parameter definitions were shown as blockquotes, rather
# than in a definition list. If your styling requires blockquotes, switch this
# config option to True. This option will be removed in version 0.10.
numpydoc_use_blockquotes = False

# Whether to format the Attributes section of a class page in the same way as
# the Parameter section. If it's False, the Attributes section will be
# formatted as the Methods section using an autosummary table. True by default.
numpydoc_attributes_as_param_list = False

# Whether to create cross-references for the parameter types in the Parameters,
# Other Parameters, Returns and Yields sections of the docstring. False by
# default.
numpydoc_xref_param_type = False

#  Mappings to fully qualified paths (or correct ReST references) for the
#  aliases/shortcuts used when specifying the types of parameters. The keys
#  should not have any spaces. Together with the intersphinx extension, you can
#  map to links in any documentation. The default is an empty dict.  This
#  option depends on the numpydoc_xref_param_type option being True.
numpydoc_xref_aliases = {}

#  Words not to cross-reference. Most likely, these are common words used in
#  parameter type descriptions that may be confused for classes of the same
#  name. For example: {'type', 'optional', 'default'}. The default is an empty
#  set.
numpydoc_xref_ignore = set()

#  Whether to insert an edit link after docstrings. Deprecated by numpydoc:
#  edit your HTML template instead. Assigned explicitly here because a bare
#  `name: bool` annotation does not define a value.
numpydoc_edit_link = False


================================================
FILE: docs/index.rst
================================================
Welcome to numpy-ml
===================
`numpy-ml`_ is a growing collection of machine learning models, algorithms, and
tools written exclusively in `NumPy`_ and the Python `standard library`_.

The purpose of the project is to provide reference implementations of common
machine learning components for rapid prototyping and experimentation. With
that in mind, don't just read the docs -- read the source!

.. _numpy-ml: https://www.github.com/ddbourgin/numpy-ml
.. _NumPy: https://numpy.org/
.. _standard library: https://docs.python.org/3/library/

.. topic:: This documentation is under development!

    We're working to expand our coverage. During this time there are likely to
    be typos, bugs, and poorly-worded sections. If you encounter any of the
    above, please file an `issue`_ or submit a `pull request`_!

.. _issue: https://github.com/ddbourgin/numpy-ml/issues
.. _pull request: https://github.com/ddbourgin/numpy-ml/pulls

.. toctree::
   :maxdepth: 3
   :hidden:

   numpy_ml.hmm

   numpy_ml.gmm

   numpy_ml.lda

   numpy_ml.ngram

   numpy_ml.bandits

   numpy_ml.rl_models

   numpy_ml.nonparametric

   numpy_ml.factorization

   numpy_ml.trees

   numpy_ml.neural_nets

   numpy_ml.linear_models

   numpy_ml.preprocessing

   numpy_ml.utils

##########
Disclaimer
##########

This software is provided as-is: there are no guarantees that it fits your
purposes or that it is bug-free. Use it at your own risk!


================================================
FILE: docs/make.bat
================================================
@ECHO OFF

REM Run from the directory containing this script; the matching popd is at :end
pushd %~dp0

REM Command file for Sphinx documentation

REM Default to the sphinx-build found on PATH unless SPHINXBUILD is already set
if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build

REM With no target argument, show the Sphinx help text instead of building
if "%1" == "" goto help

REM Probe for the sphinx-build executable, discarding its output
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

REM Hand the requested target (first argument) to sphinx-build in make mode
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%

:end
popd


================================================
FILE: docs/numpy_ml.bandits.bandits.rst
================================================
Bandit Environments
===================

``Bandit``
-----------
.. autoclass:: numpy_ml.bandits.bandits.Bandit
    :members:
    :undoc-members:
    :inherited-members:


``MultinomialBandit``
-------------------------
.. autoclass:: numpy_ml.bandits.MultinomialBandit
    :members:
    :undoc-members:
    :show-inheritance:

``BernoulliBandit``
-----------------------
.. autoclass:: numpy_ml.bandits.BernoulliBandit
    :members:
    :undoc-members:
    :show-inheritance:


``GaussianBandit``
----------------------
.. autoclass:: numpy_ml.bandits.GaussianBandit
    :members:
    :undoc-members:
    :show-inheritance:

``ShortestPathBandit``
-----------------------
.. autoclass:: numpy_ml.bandits.ShortestPathBandit
    :members:
    :undoc-members:
    :show-inheritance:

``ContextualBernoulliBandit``
------------------------------
.. autoclass:: numpy_ml.bandits.ContextualBernoulliBandit
    :members:
    :undoc-members:
    :show-inheritance:

``ContextualLinearBandit``
------------------------------
.. autoclass:: numpy_ml.bandits.ContextualLinearBandit
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/numpy_ml.bandits.policies.rst
================================================
Policies
=========

``BanditPolicyBase``
--------------------
.. autoclass:: numpy_ml.bandits.policies.BanditPolicyBase
    :members:
    :undoc-members:
    :inherited-members:

``EpsilonGreedy``
-----------------
.. autoclass:: numpy_ml.bandits.policies.EpsilonGreedy
    :members:
    :undoc-members:
    :show-inheritance:

``UCB1``
--------
.. autoclass:: numpy_ml.bandits.policies.UCB1
    :members:
    :undoc-members:
    :show-inheritance:

``ThompsonSamplingBetaBinomial``
--------------------------------
.. autoclass:: numpy_ml.bandits.policies.ThompsonSamplingBetaBinomial
    :members:
    :undoc-members:
    :show-inheritance:

``LinUCB``
--------------------------------
.. autoclass:: numpy_ml.bandits.policies.LinUCB
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/numpy_ml.bandits.rst
================================================
Multi-armed bandits
###################

.. toctree::
   :maxdepth: 3

   numpy_ml.bandits.bandits

   numpy_ml.bandits.policies

   numpy_ml.bandits.trainer


================================================
FILE: docs/numpy_ml.bandits.trainer.rst
================================================
Trainer
=======

``BanditTrainer``
------------------
.. autoclass:: numpy_ml.bandits.trainer.BanditTrainer
    :members:
    :undoc-members:
    :inherited-members:


================================================
FILE: docs/numpy_ml.factorization.factors.rst
================================================
``VanillaALS``
--------------
.. autoclass:: numpy_ml.factorization.VanillaALS
    :members:
    :undoc-members:

``NMF``
--------
.. autoclass:: numpy_ml.factorization.NMF
    :members:
    :undoc-members:


================================================
FILE: docs/numpy_ml.factorization.rst
================================================
Matrix factorization
####################

.. toctree::
   :maxdepth: 3

   numpy_ml.factorization.factors


================================================
FILE: docs/numpy_ml.gmm.gmm.rst
================================================
``GMM``
-------

.. autoclass:: numpy_ml.gmm.GMM
	:members:
	:undoc-members:
	:inherited-members:


================================================
FILE: docs/numpy_ml.gmm.rst
================================================
#######################
Gaussian mixture models
#######################

A `Gaussian mixture model`_ (GMM) is a latent variable model commonly used for
unsupervised clustering.

.. figure:: img/gmm_model.png
    :scale: 30 %
    :align: center

    Graphical model for a GMM with `K` mixture components and `N` data points.

.. _`Gaussian mixture model` : https://en.wikipedia.org/wiki/Mixture_model#Gaussian_mixture_model

A GMM assumes that:

    1. The observed data are generated from a `mixture distribution`_, `P`,
       made up of `K` mixture components.

    2. Each mixture component is a multivariate Gaussian with its own mean
       :math:`\mu`, covariance matrix, :math:`\Sigma`, and mixture weight,
       :math:`\pi`.

    .. 3. To generate a new data point, we sample a mixture component in
    .. proportion to its prior probability, then draw a sample from the
    .. distribution parameterized by that component's mean and covariance.

.. _mixture distribution: https://en.wikipedia.org/wiki/Mixture_distribution

The parameters of a GMM model are:

    - :math:`\theta`, the set of parameters for each of the `K` mixture
      components. :math:`\theta = \{ \mu_1, \Sigma_1, \pi_1, \ldots, \mu_K,
      \Sigma_K, \pi_K \}`.

Under a GMM, the joint probability of a sequence of cluster assignments `Z` and an observed
dataset :math:`X = \{x_1, \ldots, x_N \}`, is:

.. math::
    p(Z, X \mid \theta) =
        \prod_{i=1}^N p(z_i, x_i \mid \theta) =
            \prod_{i=1}^N \prod_{k=1}^K
                [\mathcal{N}(x_i \mid \mu_k, \Sigma_k) \pi_k ]^{\mathbb{1}_{[z_{i} = k]}}

where

    - :math:`\theta` is the set of GMM parameters: :math:`\theta = \{ \mu_1,
      \Sigma_1, \pi_1, \ldots, \mu_K, \Sigma_K, \pi_K \}`.

    - :math:`z_i \in \{ 1, \ldots, K \}` is a latent variable reflecting the ID
      of the mixture component that generated data point `i`.

    - :math:`\mathbb{1}_{[z_i = k]}` is a binary indicator function returning
      1 if data point :math:`x_i` was sampled from mixture component :math:`k`
      and 0 otherwise.

As with other latent-variable models, we use the `expectation-maximization (EM)
algorithm`_ to learn the GMM parameters.

.. _expectation-maximization (EM) algorithm : https://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm

**Models**

- :class:`~numpy_ml.gmm.GMM`

**References**

.. [1] Bilmes, J. A. (1998). "A gentle tutorial of the EM algorithm and its
   application to parameter estimation for Gaussian mixture and hidden
   Markov models" *International Computer Science Institute, 4(510)*
   https://www.inf.ed.ac.uk/teaching/courses/pmr/docs/EM.pdf


.. toctree::
   :maxdepth: 2
   :hidden:

   numpy_ml.gmm.gmm


================================================
FILE: docs/numpy_ml.hmm.MultinomialHMM.rst
================================================
``MultinomialHMM``
------------------

.. autoclass:: numpy_ml.hmm.MultinomialHMM
	:members:
	:undoc-members:
	:inherited-members:


================================================
FILE: docs/numpy_ml.hmm.rst
================================================
####################
Hidden Markov models
####################

A `hidden Markov model`_ (HMM) is a generative model for sequences of observations.

.. _`hidden Markov model` : https://en.wikipedia.org/wiki/Hidden_Markov_model

.. figure:: img/hmm_model.png
    :scale: 25 %
    :align: center

    Graphical model for an HMM with :math:`T=4` timesteps.


An HMM assumes:

    1. The observations, `O`, are generated by a process whose states,
       :math:`S`, are *hidden* from the observer.

    2. Each hidden state is a discrete random variable.

    3. The hidden state at time `t` is conditionally independent of all hidden
       states before time :math:`t - 1`, given the hidden state at time
       :math:`t - 1` (the Markov property).

    4. The observation :math:`O_t` is independent of all previous states and
       observations given the current hidden state, :math:`S_t`.

The parameters of an HMM model are:

    - :math:`\pi`, the prior specifying :math:`P(S_1)`.

    - :math:`\theta`, the :math:`K \times K` transition matrix specifying
      :math:`P(S_t \mid S_{t-1})`.

    - :math:`\phi`, the output model defining :math:`P(O_t \mid S_t)`. If the
      observations are discrete, this is a :math:`K \times L` emission matrix,
      where `L` is the number of unique observation symbols.

The HMM joint distribution of a sequence of states and observations is:

.. math::

    P(S_{1:T}, O_{1:T}) = P(S_1) P(O_1 \mid S_1) \prod_{t=2}^T P(S_t \mid S_{t-1})P(O_t \mid S_t)

where :math:`X_{1:T}` is shorthand for :math:`X_1, \ldots, X_T`.

As with other latent-variable models, we use the `expectation-maximization
(EM) algorithm`_ to learn the model parameters.  The HMM-optimized version of
the EM algorithm is known as the `forward-backward`_ / `Baum-Welch algorithm`_.

.. _expectation-maximization (EM) algorithm : https://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm
.. _forward-backward: https://en.wikipedia.org/wiki/Forward%E2%80%93backward_algorithm
.. _Baum-Welch algorithm: https://en.wikipedia.org/wiki/Baum%E2%80%93Welch_algorithm

**Models**

- :class:`~numpy_ml.hmm.MultinomialHMM`

**References**

.. [1] Ghahramani, Z. (2001). "An Intro to HMMs and Bayesian networks".
       *International Journal of Pattern Recognition and AI, 15(1)*: 9-42.

.. toctree::
   :maxdepth: 2
   :hidden:

   numpy_ml.hmm.MultinomialHMM


================================================
FILE: docs/numpy_ml.lda.lda.rst
================================================
``LDA``
=======

.. autoclass:: numpy_ml.lda.LDA
    :members:
    :undoc-members:
    :inherited-members:


================================================
FILE: docs/numpy_ml.lda.rst
================================================
###########################
Latent Dirichlet allocation
###########################

`Latent Dirichlet allocation`_ (LDA, commonly known as a topic model) is a
generative model for `bags of words`_.

.. _`Latent Dirichlet allocation` : https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation
.. _bags of words : https://en.wikipedia.org/wiki/Bag-of-words_model


.. figure:: img/lda_model_smoothed.png
    :scale: 25 %
    :align: center

    The smoothed LDA model with `T` topics, `D` documents, and :math:`N_d` words per document.

In LDA, each word in a piece of text is associated with one of `T` latent
`topics`. A `document` is an unordered collection (bag) of words. During
inference, the goal is to estimate the probability of each word token under
each topic, along with the per-document topic mixture weights, using only the
observed text.

The parameters of the LDA model are:

    - :math:`\theta`, the document-topic distribution. We use
      :math:`\theta^{(i)}` to denote the parameters of the `categorical`_
      distribution over topics associated with document :math:`i`.

    - :math:`\phi`, the topic-word distribution. We use :math:`\phi^{(j)}` to
      denote the parameters of the `categorical`_ distribution over words
      associated with topic :math:`j`.

.. _categorical : https://en.wikipedia.org/wiki/Categorical_distribution

The standard LDA model [1]_ places a `Dirichlet`_ prior on :math:`\theta`:

.. math::
    \theta^{(d)}  \sim  \text{Dir}(\alpha)

The smoothed/fully-Bayesian LDA model [2]_ adds an additional `Dirichlet`_ prior on :math:`\phi`:

.. math::
    \phi^{(j)}  \sim  \text{Dir}(\beta)

.. _Dirichlet : https://en.wikipedia.org/wiki/Dirichlet_distribution

To generate a document with the smoothed LDA model, we:

    1. Sample the parameters for the distribution over topics,
       :math:`\theta \sim \text{Dir}(\alpha)`.

    2. Sample a topic, :math:`z \sim \text{Cat}(\theta)`.

    3. If we haven't already, sample the parameters for topic `z`'s categorical
       distribution over words, :math:`\phi^{(z)} \sim \text{Dir}(\beta)`.

    4. Sample a word, :math:`w \sim \text{Cat}(\phi^{(z)})`.

    5. Repeat steps 2 through 4 until we have a bag of `N` words.

The joint distribution over words, topics, :math:`\theta`, and :math:`\phi`
under the smoothed LDA model is:

.. math::

    P(w, z, \phi, \theta \mid \alpha, \beta) = \left( \prod_{t=1}^T \text{Dir}(\phi^{(t)}; \beta) \right) \prod_{d=1}^D \text{Dir}(\theta^{(d)}; \alpha) \prod_{n=1}^{N_d} P(z_n \mid \theta^{(d)}) P(w_n \mid \phi^{(z_n)})

The parameters of the LDA model can be learned using `variational expectation
maximization`_ or Markov chain Monte Carlo (e.g., `collapsed Gibbs sampling`_).

.. _`variational expectation maximization`: https://en.wikipedia.org/wiki/Variational_Bayesian_methods
.. _`collapsed Gibbs sampling`: https://en.wikipedia.org/wiki/Gibbs_sampling#Collapsed_Gibbs_sampler

**Models**

- :class:`~numpy_ml.lda.LDA`
- :class:`~numpy_ml.lda.SmoothedLDA`

**References**

.. [1]  Blei, D., Ng, A., & Jordan, M. (2003). "Latent Dirichlet allocation". *Journal of
   Machine Learning Research*, *3*, 993–1022.
.. [2]  Griffiths, T. & Steyvers, M. (2004). "Finding scientific topics".
   *PNAS*, *101(1)*, 5228-5235.

.. toctree::
   :maxdepth: 3
   :hidden:

   numpy_ml.lda.lda
   numpy_ml.lda.smoothed_lda


================================================
FILE: docs/numpy_ml.lda.smoothed_lda.rst
================================================
``SmoothedLDA``
===============

.. autoclass:: numpy_ml.lda.SmoothedLDA
    :members:
    :undoc-members:
    :inherited-members:
    :show-inheritance:


================================================
FILE: docs/numpy_ml.linear_models.lm.rst
================================================
``LinearRegression``
--------------------

.. autoclass:: numpy_ml.linear_models.LinearRegression
	:members:
	:undoc-members:
	:inherited-members:

``RidgeRegression``
-------------------

.. autoclass:: numpy_ml.linear_models.RidgeRegression
	:members:
	:undoc-members:
	:inherited-members:

``LogisticRegression``
----------------------

.. autoclass:: numpy_ml.linear_models.LogisticRegression
	:members:
	:undoc-members:
	:inherited-members:

``BayesianLinearRegressionUnknownVariance``
-------------------------------------------

.. autoclass:: numpy_ml.linear_models.BayesianLinearRegressionUnknownVariance
	:members:
	:undoc-members:
	:inherited-members:

``BayesianLinearRegressionKnownVariance``
-----------------------------------------

.. autoclass:: numpy_ml.linear_models.BayesianLinearRegressionKnownVariance
	:members:
	:undoc-members:
	:inherited-members:

``GaussianNBClassifier``
-----------------------------------------

.. autoclass:: numpy_ml.linear_models.GaussianNBClassifier
	:members:
	:undoc-members:
	:inherited-members:

``GeneralizedLinearModel``
-----------------------------------------

.. autoclass:: numpy_ml.linear_models.GeneralizedLinearModel
	:members:
	:undoc-members:
	:inherited-members:


================================================
FILE: docs/numpy_ml.linear_models.rst
================================================
Linear models
#############

.. raw:: html

   <h2>Ordinary and Weighted Linear Least Squares</h2>

In weighted linear least-squares regression (WLS), a real-valued target
:math:`y_i`, is modeled as a linear combination of covariates
:math:`\mathbf{x}_i` and model coefficients **b**:

.. math::

    y_i = \mathbf{b}^\top \mathbf{x}_i + \epsilon_i

In the above equation, :math:`\epsilon_i \sim \mathcal{N}(0, \sigma_i^2)` is a
normally distributed error term with variance :math:`\sigma_i^2`. Ordinary
least squares (OLS) is a special case of this model where the variance is fixed
across all examples, i.e., :math:`\sigma_i = \sigma_j \ \forall i,j`. The
maximum likelihood model parameters, :math:`\hat{\mathbf{b}}_{WLS}`, are those
that minimize the weighted squared error between the model predictions and the
true values:

.. math::

    \mathcal{L} = ||\mathbf{W}^{0.5}(\mathbf{y} - \mathbf{Xb})||_2^2

where :math:`\mathbf{W}` is a diagonal matrix of the example weights. In OLS,
:math:`\mathbf{W}` is the identity matrix. The maximum likelihood estimate for
the model parameters can be computed in closed-form using the normal equations:

.. math::

    \hat{\mathbf{b}}_{WLS} =
        (\mathbf{X}^\top \mathbf{WX})^{-1} \mathbf{X}^\top \mathbf{Wy}


**Models**

- :class:`~numpy_ml.linear_models.LinearRegression`

.. raw:: html

   <h2>Ridge Regression</h2>

Ridge regression uses the same simple linear regression model but adds an
additional penalty on the `L2`-norm of the coefficients to the loss function.
This is sometimes known as Tikhonov regularization.

In particular, the ridge model is the same as the OLS model:

.. math::

    \mathbf{y} = \mathbf{Xb} + \mathbf{\epsilon}

where :math:`\epsilon \sim \mathcal{N}(\mathbf{0}, \sigma^2 \mathbf{I})`,
except now the error for the model is calculated as

.. math::

    \mathcal{L} = ||\mathbf{y} - \mathbf{Xb}||_2^2 + \alpha ||\mathbf{b}||_2^2

The MLE for the model parameters **b** can be computed in closed form via
the adjusted normal equation:

.. math::

    \hat{\mathbf{b}}_{Ridge} =
        (\mathbf{X}^\top \mathbf{X} + \alpha \mathbf{I})^{-1} \mathbf{X}^\top \mathbf{y}

where :math:`(\mathbf{X}^\top \mathbf{X} + \alpha \mathbf{I})^{-1}
\mathbf{X}^\top` is the pseudoinverse / Moore-Penrose inverse adjusted for
the `L2` penalty on the model coefficients.

**Models**

- :class:`~numpy_ml.linear_models.RidgeRegression`

.. raw:: html

   <h2>Bayesian Linear Regression</h2>

In its general form, Bayesian linear regression extends the simple linear
regression model by introducing priors on model parameters *b* and/or the
error variance :math:`\sigma^2`.

The introduction of a prior allows us to quantify the uncertainty in our
parameter estimates for b by replacing the MLE point estimate in simple
linear regression with an entire posterior *distribution*, :math:`p(b \mid X, y,
\sigma)`, simply by applying Bayes rule:

.. math::

    p(b \mid X, y) = \frac{ p(y \mid X, b) p(b \mid \sigma) }{p(y \mid X)}

We can also quantify the uncertainty in our predictions :math:`y^*` for some new
data :math:`X^*` with the posterior predictive distribution:

.. math::

    p(y^* \mid X^*, X, Y) = \int_{b} p(y^* \mid X^*, b) p(b \mid X, y) \ \text{d}b

Depending on the choice of prior it may be impossible to compute an
analytic form for the posterior / posterior predictive distribution. In
these cases, it is common to use approximations, either via MCMC or
variational inference.

.. raw:: html

   <h4>Known variance</h4>

--------------------------------

If we happen to already know the error variance :math:`\sigma^2`, the conjugate
prior on `b` is Gaussian. A common parameterization is:

.. math::

    b | \sigma, V  \sim  \mathcal{N}(\mu, \sigma^2 V)

where :math:`\mu`, :math:`\sigma` and :math:`V` are hyperparameters. Ridge
regression is a special case of this model where :math:`\mu = 0`,
:math:`\sigma = 1` and :math:`V = I` (i.e., the prior on *b* is a zero-mean,
unit covariance Gaussian).

Due to the conjugacy of the above prior with the Gaussian likelihood, there
exists a closed-form solution for the posterior over the model
parameters:

.. math::

    A  &=  (V^{-1} + X^\top X)^{-1} \\
    \mu_b  &=  A V^{-1} \mu + A X^\top y \\
    \Sigma_b  &=  \sigma^2 A \\

The model posterior is then

.. math::

    b \mid X, y  \sim  \mathcal{N}(\mu_b, \Sigma_b)

We can also compute a closed-form solution for the posterior predictive
distribution:

.. math::

    y^* \mid X^*, X, Y \sim \mathcal{N}(X^* \mu_b, \ \ X^* \Sigma_b X^{* \top} + I)

where :math:`X^*` is the matrix of new data we wish to predict, and :math:`y^*`
are the predicted targets for those data.

**Models**

- :class:`~numpy_ml.linear_models.BayesianLinearRegressionKnownVariance`


.. raw:: html

   <h4>Unknown variance</h4>

--------------------------------

If *both* *b* and the error variance :math:`\sigma^2` are unknown, the
conjugate prior for the Gaussian likelihood is the Normal-Gamma
distribution (univariate likelihood) or the Normal-Inverse-Wishart
distribution (multivariate likelihood).

    **Univariate**

    .. math::

        b, \sigma^2  &\sim  \text{NG}(\mu, V, \alpha, \beta) \\
        \sigma^2  &\sim  \text{InverseGamma}(\alpha, \beta) \\
        b \mid \sigma^2  &\sim  \mathcal{N}(\mu, \sigma^2 V)

    where :math:`\alpha, \beta, V`, and :math:`\mu` are parameters of the
    prior.

    **Multivariate**

    .. math::

        b, \Sigma  &\sim  \mathcal{NIW}(\mu, \lambda, \Psi, \rho) \\
        \Sigma  &\sim  \mathcal{W}^{-1}(\Psi, \rho) \\
        b \mid \Sigma  &\sim  \mathcal{N}(\mu, \frac{1}{\lambda} \Sigma)

    where :math:`\mu, \lambda, \Psi`, and :math:`\rho` are
    parameters of the prior.


Due to the conjugacy of the above priors with the Gaussian likelihood,
there exists a closed-form solution for the posterior over the model
parameters:

.. math::

    B  &=  y - X \mu \\
    \text{shape}  &=  N + \alpha \\
    \text{scale}  &=  \frac{1}{\text{shape}} (\alpha \beta + B^\top (X V X^\top + I)^{-1} B) \\

where

.. math::

    \sigma^2 \mid X, y  &\sim  \text{InverseGamma}(\text{shape}, \text{scale}) \\
    A  &=  (V^{-1} + X^\top X)^{-1} \\
    \mu_b  &=  A V^{-1} \mu + A X^\top y \\
    \Sigma_b  &=  \sigma^2 A

The model posterior is then

.. math::

    b | X, y, \sigma^2 \sim \mathcal{N}(\mu_b, \Sigma_b)

We can also compute a closed-form solution for the posterior predictive distribution:

.. math::

    y^* \mid X^*, X, Y \sim \mathcal{N}(X^* \mu_b, \ X^* \Sigma_b X^{* \top} + I)

**Models**

- :class:`~numpy_ml.linear_models.BayesianLinearRegressionUnknownVariance`

.. raw:: html

   <h2>Naive Bayes Classifier</h2>

The naive Bayes model assumes the features of a training example
:math:`\mathbf{x}` are mutually independent given the example label :math:`y`:

.. math::

    P(\mathbf{x}_i \mid y_i) = \prod_{j=1}^M P(x_{i,j} \mid y_i)

where :math:`M` is the number of features (i.e., the dimensionality) of the
:math:`i^{th}` example :math:`\mathbf{x}_i` and :math:`y_i` is the label
associated with the :math:`i^{th}` example.

Combining this conditional independence assumption with a simple application of
Bayes' theorem gives the naive Bayes classification rule:

.. math::

    \hat{y} &= \arg \max_y P(y \mid \mathbf{x}) \\
            &= \arg \max_y  P(y) P(\mathbf{x} \mid y) \\
            &= \arg \max_y  P(y) \prod_{j=1}^M P(x_j \mid y)

The prior class probability :math:`P(y)` can be specified in advance or
estimated empirically from the training data.

**Models**

- :class:`~numpy_ml.linear_models.GaussianNBClassifier`

.. raw:: html

   <h2>Generalized Linear Model</h2>

The generalized linear model (GLM) assumes that each target/dependent variable
:math:`y_i` in the target vector :math:`\mathbf{y} = (y_1, \ldots, y_n)` has been
drawn independently from a pre-specified distribution in the exponential family
with unknown mean :math:`\mu_i`. The GLM models a (one-to-one, continuous,
differentiable) function, *g*, of this mean value as a linear combination of
the model parameters :math:`\mathbf{b}` and observed covariates
:math:`\mathbf{x}_i`:

.. math::

    g(\mathbb{E}[y_i \mid \mathbf{x}_i]) =
        g(\mu_i) = \mathbf{b}^\top \mathbf{x}_i

where *g* is known as the link function.  The choice of link function is
informed by the instance of the exponential family the target is drawn from.

**Models**

- :class:`~numpy_ml.linear_models.GeneralizedLinearModel`

.. toctree::
   :maxdepth: 2
   :hidden:

   numpy_ml.linear_models.lm


================================================
FILE: docs/numpy_ml.neural_nets.activations.rst
================================================
Activations
===========

Popular (and some not-so-popular) activation functions for use within arbitrary
neural networks.

``Affine``
-----------
.. autoclass:: numpy_ml.neural_nets.activations.Affine
    :members:
    :undoc-members:
    :inherited-members:

``ELU``
-----------
.. autoclass:: numpy_ml.neural_nets.activations.ELU
    :members:
    :undoc-members:
    :inherited-members:

``Exponential``
---------------
.. autoclass:: numpy_ml.neural_nets.activations.Exponential
    :members:
    :undoc-members:
    :inherited-members:

``HardSigmoid``
---------------
.. autoclass:: numpy_ml.neural_nets.activations.HardSigmoid
    :members:
    :undoc-members:
    :inherited-members:

``Identity``
---------------
.. autoclass:: numpy_ml.neural_nets.activations.Identity
    :members:
    :undoc-members:
    :inherited-members:

``LeakyReLU``
-------------
.. autoclass:: numpy_ml.neural_nets.activations.LeakyReLU
    :members:
    :undoc-members:
    :inherited-members:

``ReLU``
---------
.. autoclass:: numpy_ml.neural_nets.activations.ReLU
    :members:
    :undoc-members:
    :inherited-members:

``SELU``
---------
.. autoclass:: numpy_ml.neural_nets.activations.SELU
    :members:
    :undoc-members:
    :inherited-members:

``GELU``
-----------
.. autoclass:: numpy_ml.neural_nets.activations.GELU
    :members:
    :undoc-members:
    :inherited-members:

``Sigmoid``
------------
.. autoclass:: numpy_ml.neural_nets.activations.Sigmoid
    :members:
    :undoc-members:
    :inherited-members:

``SoftPlus``
------------
.. autoclass:: numpy_ml.neural_nets.activations.SoftPlus
    :members:
    :undoc-members:
    :inherited-members:

``Tanh``
---------
.. autoclass:: numpy_ml.neural_nets.activations.Tanh
    :members:
    :undoc-members:
    :inherited-members:


================================================
FILE: docs/numpy_ml.neural_nets.initializers.rst
================================================
Initializers
=============

``ActivationInitializer``
--------------------------
.. autoclass:: numpy_ml.neural_nets.initializers.ActivationInitializer
    :members:
    :undoc-members:
    :inherited-members:

``OptimizerInitializer``
--------------------------
.. autoclass:: numpy_ml.neural_nets.initializers.OptimizerInitializer
    :members:
    :undoc-members:
    :inherited-members:

``SchedulerInitializer``
--------------------------
.. autoclass:: numpy_ml.neural_nets.initializers.SchedulerInitializer
    :members:
    :undoc-members:
    :inherited-members:

``WeightInitializer``
------------------------
.. autoclass:: numpy_ml.neural_nets.initializers.WeightInitializer
    :members:
    :undoc-members:
    :inherited-members:


================================================
FILE: docs/numpy_ml.neural_nets.layers.rst
================================================
Layers
======

``LayerBase``
-------------
.. autoclass:: numpy_ml.neural_nets.layers.layers.LayerBase
    :members:
    :undoc-members:
    :inherited-members:

``Add``
-------
.. autoclass:: numpy_ml.neural_nets.layers.Add
    :members:
    :undoc-members:
    :show-inheritance:

``BatchNorm1D``
---------------
.. autoclass:: numpy_ml.neural_nets.layers.BatchNorm1D
    :members:
    :undoc-members:
    :show-inheritance:

``BatchNorm2D``
---------------
.. autoclass:: numpy_ml.neural_nets.layers.BatchNorm2D
    :members:
    :undoc-members:
    :show-inheritance:

``Conv1D``
----------
.. autoclass:: numpy_ml.neural_nets.layers.Conv1D
    :members:
    :undoc-members:
    :show-inheritance:

``Conv2D``
----------
.. autoclass:: numpy_ml.neural_nets.layers.Conv2D
    :members:
    :undoc-members:
    :show-inheritance:

``Deconv2D``
------------
.. autoclass:: numpy_ml.neural_nets.layers.Deconv2D
    :members:
    :undoc-members:
    :show-inheritance:

``DotProductAttention``
-----------------------
.. autoclass:: numpy_ml.neural_nets.layers.DotProductAttention
    :members:
    :undoc-members:
    :show-inheritance:

``Embedding``
-------------
.. autoclass:: numpy_ml.neural_nets.layers.Embedding
    :members:
    :undoc-members:
    :show-inheritance:

``Flatten``
-----------
.. autoclass:: numpy_ml.neural_nets.layers.Flatten
    :members:
    :undoc-members:
    :show-inheritance:

``FullyConnected``
------------------
.. autoclass:: numpy_ml.neural_nets.layers.FullyConnected
    :members:
    :undoc-members:
    :show-inheritance:

``LSTM``
--------
.. autoclass:: numpy_ml.neural_nets.layers.LSTM
    :members:
    :undoc-members:
    :show-inheritance:

``LSTMCell``
------------
.. autoclass:: numpy_ml.neural_nets.layers.LSTMCell
    :members:
    :undoc-members:
    :show-inheritance:

``LayerNorm1D``
---------------
.. autoclass:: numpy_ml.neural_nets.layers.LayerNorm1D
    :members:
    :undoc-members:
    :show-inheritance:

``LayerNorm2D``
---------------
.. autoclass:: numpy_ml.neural_nets.layers.LayerNorm2D
    :members:
    :undoc-members:
    :show-inheritance:

``Multiply``
------------
.. autoclass:: numpy_ml.neural_nets.layers.Multiply
    :members:
    :undoc-members:
    :show-inheritance:

``Pool2D``
------------
.. autoclass:: numpy_ml.neural_nets.layers.Pool2D
    :members:
    :undoc-members:
    :show-inheritance:

``RNN``
-------
.. autoclass:: numpy_ml.neural_nets.layers.RNN
    :members:
    :undoc-members:
    :show-inheritance:

``RNNCell``
-----------
.. autoclass:: numpy_ml.neural_nets.layers.RNNCell
    :members:
    :undoc-members:
    :show-inheritance:

``RBM``
-------------------------------
.. autoclass:: numpy_ml.neural_nets.layers.RBM
    :members:
    :undoc-members:
    :show-inheritance:

``Softmax``
-----------
.. autoclass:: numpy_ml.neural_nets.layers.Softmax
    :members:
    :undoc-members:
    :show-inheritance:

``SparseEvolution``
-------------------
.. autoclass:: numpy_ml.neural_nets.layers.SparseEvolution
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/numpy_ml.neural_nets.losses.rst
================================================
Loss functions
==============

``CrossEntropy``
----------------
.. autoclass:: numpy_ml.neural_nets.losses.CrossEntropy
    :members:
    :undoc-members:
    :inherited-members:

``SquaredError``
----------------
.. autoclass:: numpy_ml.neural_nets.losses.SquaredError
    :members:
    :undoc-members:
    :inherited-members:

``NCELoss``
-----------
.. autoclass:: numpy_ml.neural_nets.losses.NCELoss
    :members:
    :undoc-members:
    :inherited-members:

``VAELoss``
-----------
.. autoclass:: numpy_ml.neural_nets.losses.VAELoss
    :members:
    :undoc-members:
    :inherited-members:

``WGAN_GPLoss``
---------------
.. autoclass:: numpy_ml.neural_nets.losses.WGAN_GPLoss
    :members:
    :undoc-members:
    :inherited-members:


================================================
FILE: docs/numpy_ml.neural_nets.models.rst
================================================
Full networks
==============

``WGAN_GP``
-----------
.. autoclass:: numpy_ml.neural_nets.models.WGAN_GP
    :members:
    :undoc-members:
    :inherited-members:

``BernoulliVAE``
----------------
.. autoclass:: numpy_ml.neural_nets.models.BernoulliVAE
    :members:
    :undoc-members:
    :inherited-members:

``Word2Vec``
------------
.. autoclass:: numpy_ml.neural_nets.models.Word2Vec
    :members:
    :undoc-members:
    :inherited-members:


================================================
FILE: docs/numpy_ml.neural_nets.modules.rst
================================================
Modules
========

``BidirectionalLSTM``
---------------------
.. autoclass:: numpy_ml.neural_nets.modules.BidirectionalLSTM
    :members:
    :undoc-members:

``MultiHeadedAttentionModule``
------------------------------
.. autoclass:: numpy_ml.neural_nets.modules.MultiHeadedAttentionModule
    :members:
    :undoc-members:

``SkipConnectionConvModule``
------------------------------
.. autoclass:: numpy_ml.neural_nets.modules.SkipConnectionConvModule
    :members:
    :undoc-members:

``SkipConnectionIdentityModule``
--------------------------------
.. autoclass:: numpy_ml.neural_nets.modules.SkipConnectionIdentityModule
    :members:
    :undoc-members:

``WavenetResidualModule``
------------------------------
.. autoclass:: numpy_ml.neural_nets.modules.WavenetResidualModule
    :members:
    :undoc-members:


================================================
FILE: docs/numpy_ml.neural_nets.optimizers.rst
================================================
Optimizers
===========
Popular gradient-based strategies for optimizing parameters in neural networks.

For a discussion regarding the generalization performance of the solutions
found via different optimization strategies, see:

.. [1] Wilson et al. (2017) "The marginal value of adaptive gradient methods in machine
   learning", *Proceedings of the 31st Conference on Neural Information Processing Systems*
   https://arxiv.org/pdf/1705.08292.pdf

``OptimizerBase``
-----------------
.. autoclass:: numpy_ml.neural_nets.optimizers.optimizers.OptimizerBase
    :members:
    :undoc-members:
    :show-inheritance:

``SGD``
-----------
.. autoclass:: numpy_ml.neural_nets.optimizers.SGD
    :members:
    :undoc-members:
    :show-inheritance:

``AdaGrad``
-----------
.. autoclass:: numpy_ml.neural_nets.optimizers.AdaGrad
    :members:
    :undoc-members:
    :show-inheritance:

``Adam``
-----------
.. autoclass:: numpy_ml.neural_nets.optimizers.Adam
    :members:
    :undoc-members:
    :show-inheritance:

``RMSProp``
-----------
.. autoclass:: numpy_ml.neural_nets.optimizers.RMSProp
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/numpy_ml.neural_nets.rst
================================================
Neural networks
###############
The neural network module includes common building blocks for implementing
modern `deep learning`_ models.

.. _`deep learning`: https://en.wikipedia.org/wiki/Deep_learning

.. raw:: html

   <h2>Layers</h2>

Most modern neural networks can be represented as a `composition`_ of
many small, parametric functions. The functions in this composition are
commonly referred to as the "layers" of the network. As an example, the
multilayer perceptron (MLP) below computes the function :math:`(f
\circ g \circ h)`, where `f`, `g`, and `h` are the individual network layers.

.. figure:: img/mlp_model.png
    :scale: 40 %
    :align: center

    A multilayer perceptron with three layers labeled `f`, `g`, and `h`.

Many neural network layers are parametric: they express different
transformations depending on the setting of their weights (coefficients),
biases (intercepts), and/or other tunable values. These parameters are adjusted
during training to improve the performance of the network on a particular
metric.

The :doc:`numpy_ml.neural_nets.layers` module contains a number of common
transformations that can be composed to create larger networks.

.. _`composition`: https://en.wikipedia.org/wiki/Function_composition

**Layers**

+-----------------------------------------------------+-------------------------------------------------------------+---------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.layers.Add`         | - :class:`~numpy_ml.neural_nets.layers.Deconv2D`            | - :class:`~numpy_ml.neural_nets.layers.LSTM`            |
+-----------------------------------------------------+-------------------------------------------------------------+---------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.layers.BatchNorm1D` | - :class:`~numpy_ml.neural_nets.layers.DotProductAttention` | - :class:`~numpy_ml.neural_nets.layers.LSTMCell`        |
+-----------------------------------------------------+-------------------------------------------------------------+---------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.layers.BatchNorm2D` | - :class:`~numpy_ml.neural_nets.layers.Embedding`           | - :class:`~numpy_ml.neural_nets.layers.LayerNorm1D`     |
+-----------------------------------------------------+-------------------------------------------------------------+---------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.layers.Conv1D`      | - :class:`~numpy_ml.neural_nets.layers.Flatten`             | - :class:`~numpy_ml.neural_nets.layers.LayerNorm2D`     |
+-----------------------------------------------------+-------------------------------------------------------------+---------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.layers.Conv2D`      | - :class:`~numpy_ml.neural_nets.layers.FullyConnected`      | - :class:`~numpy_ml.neural_nets.layers.Multiply`        |
+-----------------------------------------------------+-------------------------------------------------------------+---------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.layers.Pool2D`      | - :class:`~numpy_ml.neural_nets.layers.RNN`                 | - :class:`~numpy_ml.neural_nets.layers.RNNCell`         |
+-----------------------------------------------------+-------------------------------------------------------------+---------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.layers.RBM`         | - :class:`~numpy_ml.neural_nets.layers.Softmax`             | - :class:`~numpy_ml.neural_nets.layers.SparseEvolution` |
+-----------------------------------------------------+-------------------------------------------------------------+---------------------------------------------------------+

.. raw:: html

   <h2>Activations</h2>

Each unit in a neural network sums its input and passes it through an
`activation function`_ before sending it on to its outgoing weights. Activation
functions in most modern networks are real-valued, non-linear functions that
are inexpensive to compute and easily differentiable.

The :doc:`Activations <numpy_ml.neural_nets.activations>` module contains a
number of common activation functions.

.. _`activation function`: https://en.wikipedia.org/wiki/Activation_function

**Activations**

+----------------------------------------------------------+--------------------------------------------------------+-------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.activations.Affine`      | - :class:`~numpy_ml.neural_nets.activations.Identity`  | - :class:`~numpy_ml.neural_nets.activations.Sigmoid`  |
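+----------------------------------------------------------+--------------------------------------------------------+-------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.activations.ELU`         | - :class:`~numpy_ml.neural_nets.activations.LeakyReLU` | - :class:`~numpy_ml.neural_nets.activations.SoftPlus` |
+----------------------------------------------------------+--------------------------------------------------------+-------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.activations.Exponential` | - :class:`~numpy_ml.neural_nets.activations.ReLU`      | - :class:`~numpy_ml.neural_nets.activations.Tanh`     |
+----------------------------------------------------------+--------------------------------------------------------+-------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.activations.HardSigmoid` | - :class:`~numpy_ml.neural_nets.activations.SELU`      |                                                       |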
+----------------------------------------------------------+--------------------------------------------------------+-------------------------------------------------------+

.. raw:: html

   <h2>Losses</h2>

Training a neural network involves searching for layer parameters that optimize
the network's performance on a given task. `Loss functions`_ are the
quantitative metrics we use to measure how well the network is performing. Loss
functions are typically scalar-valued functions of a network's output on some
training data.

The :doc:`Losses <numpy_ml.neural_nets.losses>` module contains loss functions
for a number of common tasks.

.. _`Loss functions`: https://en.wikipedia.org/wiki/Loss_function

**Losses**

+------------------------------------------------------+-------------------------------------------------+-----------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.losses.CrossEntropy` | - :class:`~numpy_ml.neural_nets.losses.NCELoss` | - :class:`~numpy_ml.neural_nets.losses.WGAN_GPLoss` |
+------------------------------------------------------+-------------------------------------------------+-----------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.losses.SquaredError` | - :class:`~numpy_ml.neural_nets.losses.VAELoss` |                                                     |
+------------------------------------------------------+-------------------------------------------------+-----------------------------------------------------+

.. raw:: html

   <h2>Optimizers</h2>

The :doc:`Optimizers <numpy_ml.neural_nets.optimizers>` module contains several
popular gradient-based strategies for adjusting the parameters of a neural
network to optimize a loss function. The proper choice of optimization strategy
can help reduce training time / speed up convergence, though see [1]_ for a
discussion on the generalization performance of the solutions identified via
different strategies.

.. [1] Wilson, A. C., Roelofs, R., Stern, M., Srebro, N., & Recht, B. (2017)
   "The marginal value of adaptive gradient methods in machine learning",
   *Proceedings of the 31st Conference on Neural Information Processing
   Systems*. https://arxiv.org/pdf/1705.08292.pdf

**Optimizers**

+-------------------------------------------------+-----------------------------------------------------+--------------------------------------------------+-----------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.optimizers.SGD` | - :class:`~numpy_ml.neural_nets.optimizers.AdaGrad` | - :class:`~numpy_ml.neural_nets.optimizers.Adam` | - :class:`~numpy_ml.neural_nets.optimizers.RMSProp` |
+-------------------------------------------------+-----------------------------------------------------+--------------------------------------------------+-----------------------------------------------------+

.. raw:: html

   <h2>Learning Rate Schedulers</h2>

It is common to reduce an optimizer's learning rate(s) over the course of
training in order to eke out additional performance improvements. The
:doc:`Schedulers <numpy_ml.neural_nets.schedulers>` module contains several
strategies for automatically adjusting the learning rate as a function of the
number of elapsed training steps.

**Schedulers**

+---------------------------------------------------------------+------------------------------------------------------------------+-----------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.schedulers.ConstantScheduler` | - :class:`~numpy_ml.neural_nets.schedulers.ExponentialScheduler` | - :class:`~numpy_ml.neural_nets.schedulers.KingScheduler` |
+---------------------------------------------------------------+------------------------------------------------------------------+-----------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.schedulers.NoamScheduler`     |                                                                  |                                                           |
+---------------------------------------------------------------+------------------------------------------------------------------+-----------------------------------------------------------+

.. raw:: html

   <h2>Wrappers</h2>

The :doc:`Wrappers <numpy_ml.neural_nets.wrappers>` module contains classes
that wrap or otherwise modify the behavior of a network layer.

**Wrappers**

- :class:`~numpy_ml.neural_nets.wrappers.Dropout`

.. raw:: html

   <h2>Modules</h2>

Many deep networks consist of stacks of repeated modules. These modules, often
consisting of several layers / layer operations, can themselves be abstracted
in order to simplify the building of more complex networks. The :doc:`Modules
<numpy_ml.neural_nets.modules>` module contains a few common architectural
patterns that appear across a number of popular deep learning approaches.

**Modules**

+-----------------------------------------------------------------------+---------------------------------------------------------------------+-------------------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.modules.BidirectionalLSTM`            | - :class:`~numpy_ml.neural_nets.modules.MultiHeadedAttentionModule` | - :class:`~numpy_ml.neural_nets.modules.SkipConnectionConvModule` |
+-----------------------------------------------------------------------+---------------------------------------------------------------------+-------------------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.modules.SkipConnectionIdentityModule` | - :class:`~numpy_ml.neural_nets.modules.WavenetResidualModule`      |                                                                   |
+-----------------------------------------------------------------------+---------------------------------------------------------------------+-------------------------------------------------------------------+


.. raw:: html

   <h2>Full Networks</h2>

The :doc:`Models <numpy_ml.neural_nets.models>` module contains implementations
of several well-known neural networks from recent papers.

**Full Networks**

- :class:`~numpy_ml.neural_nets.models.WGAN_GP`
- :class:`~numpy_ml.neural_nets.models.BernoulliVAE`
- :class:`~numpy_ml.neural_nets.models.Word2Vec`


.. raw:: html

   <h2>Utilities</h2>

The :doc:`Utilities <numpy_ml.neural_nets.utils>` module contains a number of
helper functions for dealing with weight initialization, convolution
arithmetic, padding, and minibatching.

**Utilities**

+---------------------------------------------------------+---------------------------------------------------------+-----------------------------------------------------------+--------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.utils.minibatch`        | - :class:`~numpy_ml.neural_nets.utils.pad1D`            | - :class:`~numpy_ml.neural_nets.utils.calc_fan`           | - :class:`~numpy_ml.neural_nets.utils.col2im`    |
+---------------------------------------------------------+---------------------------------------------------------+-----------------------------------------------------------+--------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.utils.calc_pad_dims_2D` | - :class:`~numpy_ml.neural_nets.utils.pad2D`            | - :class:`~numpy_ml.neural_nets.utils.calc_conv_out_dims` | - :class:`~numpy_ml.neural_nets.utils.conv2D`    |
+---------------------------------------------------------+---------------------------------------------------------+-----------------------------------------------------------+--------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.utils.calc_pad_dims_1D` | - :class:`~numpy_ml.neural_nets.utils.dilate`           | - :class:`~numpy_ml.neural_nets.utils.im2col`             | - :class:`~numpy_ml.neural_nets.utils.conv1D`    |
+---------------------------------------------------------+---------------------------------------------------------+-----------------------------------------------------------+--------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.utils.deconv2D_naive`   | - :class:`~numpy_ml.neural_nets.utils.conv2D_naive`     | - :class:`~numpy_ml.neural_nets.utils.he_uniform`         | - :class:`~numpy_ml.neural_nets.utils.he_normal` |
+---------------------------------------------------------+---------------------------------------------------------+-----------------------------------------------------------+--------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.utils.glorot_uniform`   | - :class:`~numpy_ml.neural_nets.utils.truncated_normal` | - :class:`~numpy_ml.neural_nets.utils.glorot_normal`      |                                                  |
+---------------------------------------------------------+---------------------------------------------------------+-----------------------------------------------------------+--------------------------------------------------+


.. toctree::
   :maxdepth: 3
   :hidden:

   numpy_ml.neural_nets.layers

   numpy_ml.neural_nets.activations

   numpy_ml.neural_nets.losses

   numpy_ml.neural_nets.optimizers

   numpy_ml.neural_nets.schedulers

   numpy_ml.neural_nets.wrappers

   numpy_ml.neural_nets.modules

   numpy_ml.neural_nets.models

   numpy_ml.neural_nets.utils


================================================
FILE: docs/numpy_ml.neural_nets.schedulers.rst
================================================
Learning rate schedulers
=========================

``ConstantScheduler``
---------------------
.. autoclass:: numpy_ml.neural_nets.schedulers.ConstantScheduler
    :members:
    :undoc-members:
    :inherited-members:

``ExponentialScheduler``
------------------------
.. autoclass:: numpy_ml.neural_nets.schedulers.ExponentialScheduler
    :members:
    :undoc-members:
    :inherited-members:

``KingScheduler``
------------------------
.. autoclass:: numpy_ml.neural_nets.schedulers.KingScheduler
    :members:
    :undoc-members:
    :inherited-members:

``NoamScheduler``
------------------------
.. autoclass:: numpy_ml.neural_nets.schedulers.NoamScheduler
    :members:
    :undoc-members:
    :inherited-members:


================================================
FILE: docs/numpy_ml.neural_nets.utils.rst
================================================
Utilities
==========

``minibatch``
-------------
.. autofunction:: numpy_ml.neural_nets.utils.minibatch

``calc_pad_dims_2D``
--------------------
.. autofunction:: numpy_ml.neural_nets.utils.calc_pad_dims_2D

``calc_pad_dims_1D``
--------------------
.. autofunction:: numpy_ml.neural_nets.utils.calc_pad_dims_1D

``pad1D``
--------------------
.. autofunction:: numpy_ml.neural_nets.utils.pad1D

``pad2D``
--------------------
.. autofunction:: numpy_ml.neural_nets.utils.pad2D

``dilate``
--------------------
.. autofunction:: numpy_ml.neural_nets.utils.dilate

``calc_fan``
--------------------
.. autofunction:: numpy_ml.neural_nets.utils.calc_fan

``calc_conv_out_dims``
-----------------------
.. autofunction:: numpy_ml.neural_nets.utils.calc_conv_out_dims

``im2col``
-----------------------
.. autofunction:: numpy_ml.neural_nets.utils.im2col

``col2im``
-----------------------
.. autofunction:: numpy_ml.neural_nets.utils.col2im

``conv2D``
-----------------------
.. autofunction:: numpy_ml.neural_nets.utils.conv2D

``conv1D``
-----------------------
.. autofunction:: numpy_ml.neural_nets.utils.conv1D

``deconv2D_naive``
-----------------------
.. autofunction:: numpy_ml.neural_nets.utils.deconv2D_naive

``conv2D_naive``
-----------------------
.. autofunction:: numpy_ml.neural_nets.utils.conv2D_naive

``he_uniform``
-----------------------
.. autofunction:: numpy_ml.neural_nets.utils.he_uniform

``he_normal``
-----------------------
.. autofunction:: numpy_ml.neural_nets.utils.he_normal

``glorot_uniform``
-----------------------
.. autofunction:: numpy_ml.neural_nets.utils.glorot_uniform

``glorot_normal``
-----------------------
.. autofunction:: numpy_ml.neural_nets.utils.glorot_normal

``truncated_normal``
-----------------------
.. autofunction:: numpy_ml.neural_nets.utils.truncated_normal


================================================
FILE: docs/numpy_ml.neural_nets.wrappers.rst
================================================
Wrappers
=========

``WrapperBase``
---------------
.. autoclass:: numpy_ml.neural_nets.wrappers.wrappers.WrapperBase
    :members:
    :undoc-members:
    :inherited-members:

``Dropout``
-----------
.. autoclass:: numpy_ml.neural_nets.wrappers.Dropout
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/numpy_ml.ngram.additive.rst
================================================
``AdditiveNGram``
-----------------

.. autoclass:: numpy_ml.ngram.AdditiveNGram
	:members:
	:undoc-members:
	:inherited-members:


================================================
FILE: docs/numpy_ml.ngram.goodturing.rst
================================================
``GoodTuringNGram``
-------------------

.. autoclass:: numpy_ml.ngram.GoodTuringNGram
	:members:
	:undoc-members:
	:inherited-members:


================================================
FILE: docs/numpy_ml.ngram.mle.rst
================================================
``MLENGram``
------------

.. autoclass:: numpy_ml.ngram.MLENGram
        :members:
	:undoc-members:
	:inherited-members:


================================================
FILE: docs/numpy_ml.ngram.rst
================================================
#######################
N-gram smoothing models
#######################

When dealing with `n-gram`_ models, smoothing refers to the practice of
adjusting empirical probability estimates to account for insufficient data.

In the descriptions below, we use the notation :math:`w^{j}_{i}`, :math:`i < j`, to
denote the `(j - i + 1)`-gram :math:`(w_{i}, w_{i+1}, \ldots, w_{j})`.

.. raw:: html

   <h3>Laplace Smoothing</h3>

`Laplace smoothing`_ is the assumption that each `n`-gram in a corpus occurs
exactly one more time than it actually does.

.. math::

    p(w_i \mid w^{i-1}_{i-n+1}) = \frac{1 + c(w^{i}_{i-n+1})}{|V| + \sum_{w_i} c(w^{i}_{i-n+1})}

where :math:`c(a)` denotes the empirical count of the `n`-gram :math:`a` in the
corpus, and :math:`|V|` corresponds to the number of unique `n`-grams in the
corpus.

.. _`Laplace smoothing`: https://en.wikipedia.org/wiki/Additive_smoothing

**Models**

- :class:`~numpy_ml.ngram.AdditiveNGram`

.. raw:: html

   <h3>Additive/Lidstone Smoothing</h3>

`Additive/Lidstone smoothing`_ is a generalization of Laplace smoothing, where we
assume that each `n`-gram in a corpus occurs `k` more times than it actually
does (where `k` can be any non-negative value, but typically ranges between `[0, 1]`):

.. math::

    p(w_i \mid w^{i-1}_{i-n+1}) = \frac{k + c(w^{i}_{i-n+1})}{k |V| + \sum_{w_i} c(w^{i}_{i-n+1})}

where :math:`c(a)` denotes the empirical count of the `n`-gram :math:`a` in the
corpus, and :math:`|V|` corresponds to the number of unique `n`-grams in the
corpus.

.. _`Additive/Lidstone smoothing`: https://en.wikipedia.org/wiki/Additive_smoothing

**Models**

- :class:`~numpy_ml.ngram.AdditiveNGram`


.. raw:: html

   <h3>Good-Turing Smoothing</h3>

`Good-Turing smoothing`_ is a more sophisticated technique which takes into
account the identity of the particular `n`-gram when deciding the amount of
smoothing to apply. It proceeds by allocating a portion of the probability
space occupied by `n`-grams which occur with count `r+1` and dividing it among
the `n`-grams which occur with count `r`.

.. math::
    r^*  =  (r + 1) \frac{g(r + 1)}{g(r)} \\
    p(w^{i}_{i-n+1} \mid c(w^{i}_{i-n+1}) = r)  =  \frac{r^*}{N}

where :math:`r^*` is the adjusted count for an `n`-gram which occurs `r` times,
`g(x)` is the number of `n`-grams in the corpus which occur `x` times, and `N`
is the total number of `n`-grams in the corpus.

.. _n-gram: https://en.wikipedia.org/wiki/N-gram
.. _`Good-Turing smoothing`: https://en.wikipedia.org/wiki/Good%E2%80%93Turing_frequency_estimation

**Models**

- :class:`~numpy_ml.ngram.GoodTuringNGram`

**References**

.. [1]  Chen & Goodman (1998). "An empirical study of smoothing techniques
   for language modeling".  *Harvard Computer Science Group Technical Report
   TR-10-98*.
.. [2] Gale & Sampson (1995). "Good-Turing frequency estimation without
   tears". *Journal of Quantitative Linguistics*, 2(3), 217-237.

.. toctree::
   :maxdepth: 3
   :hidden:

   numpy_ml.ngram.mle

   numpy_ml.ngram.additive

   numpy_ml.ngram.goodturing


================================================
FILE: docs/numpy_ml.nonparametric.gp.rst
================================================
``GPRegression``
#################

.. autoclass:: numpy_ml.nonparametric.GPRegression
	:members:
	:undoc-members:
	:inherited-members:


================================================
FILE: docs/numpy_ml.nonparametric.kernel_regression.rst
================================================
``KernelRegression``
#####################

.. autoclass:: numpy_ml.nonparametric.KernelRegression
	:members:
	:undoc-members:
	:inherited-members:


================================================
FILE: docs/numpy_ml.nonparametric.knn.rst
================================================
``KNN``
#######

.. autoclass:: numpy_ml.nonparametric.KNN
	:members:
	:undoc-members:
	:inherited-members:


================================================
FILE: docs/numpy_ml.nonparametric.rst
================================================
Nonparametric models
####################

.. raw:: html

   <h2>K-Nearest Neighbors</h2>

The `k-nearest neighbors`_ (KNN) model is a nonparametric supervised learning
approach that can be applied to classification or regression problems. In a
classification context, the KNN model assigns a class label for a new datapoint
by taking a majority vote amongst the labels for the `k` closest points
("neighbors") in the training data. Similarly, in a regression context, the KNN
model predicts the target value associated with a new datapoint by taking the
average of the targets associated with the `k` closest points in the training
data.

.. _`k-nearest neighbors`: https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm

**Models**

- :class:`~numpy_ml.nonparametric.KNN`

.. raw:: html

   <h2>Gaussian Process Regression</h2>

A `Gaussian process`_ defines a prior distribution over functions mapping
:math:`X \rightarrow \mathbb{R}`, where `X` can be any finite (or
infinite!)-dimensional set.

Let :math:`f(x_k)` be the random variable corresponding to
the value of a function `f` at a point :math:`x_k \in X`. Define a random
variable :math:`z = [f(x_1), \ldots, f(x_N)]` for any finite set of points
:math:`\{x_1, \ldots, x_N\} \subset X`. If `f` is distributed according to a
Gaussian Process, it is the case that

.. math::

    z \sim \mathcal{N}(\mu, K)

for

.. math::

    \mu  &=  [\text{mean}(x_1), \ldots, \text{mean}(x_N)] \\
    K_{ij}  &=  \text{kernel}(x_i, x_j)

where mean is the mean function (in Gaussian process regression it is common
to define mean(`x`) = 0), and `kernel` is a :doc:`kernel
<numpy_ml.utils.kernels>` / covariance function that determines the general
shape of the GP prior over functions, `p(f)`.

In `Gaussian process regression`_ (AKA simple Kriging [2]_ [3]_), a Gaussian
process is used as a prior on functions and is combined with the Gaussian
likelihood from the linear model via Bayes' rule to compute a posterior over
functions `f`:

.. math::

    y \mid X, f  &\sim  \mathcal{N}( [f(x_1), \ldots, f(x_n)], \alpha I ) \\
    f \mid X     &\sim  \text{GP}(0, K)

Due to the conjugacy of the Gaussian Process prior with the regression model's
Gaussian likelihood, the posterior will also be Gaussian and can be computed in
closed form.

.. _`Gaussian process`: https://en.wikipedia.org/wiki/Gaussian_process
.. _`Gaussian process regression`: https://en.wikipedia.org/wiki/Kriging

**Models**

- :class:`~numpy_ml.nonparametric.GPRegression`

**References**

.. [1] Rasmussen, C. E., & Williams, C. K. I. (2006). Gaussian Processes for
   Machine Learning. MIT Press, Cambridge, MA.
.. [2] Krige, D. G., (1951). "A statistical approach to some mine valuations and
   allied problems at the Witwatersrand", *Master's thesis of the University of
   Witwatersrand*.
.. [3] Matheron, G., (1963). "Principles of geostatistics", *Economic Geology, 58*, 1246-1266.

.. raw:: html

   <h2>Kernel Regression</h2>

Kernel regression is another nonparametric approach to nonlinear regression.
Like the Gaussian Process regression approach (or, more generally, all
regression models), kernel regression attempts to learn a function `f` which
captures the conditional expectation of some targets **y** given the data
**X**, under the assumption that

.. math::
    y_i = f(x_i) + \epsilon_i \ \ \ \ \text{where } \mathbb{E}[\epsilon | \mathbf{x}] = \mathbb{E}[\epsilon] = 0

Unlike the Gaussian Process regression approach, however, kernel regression
does not place a prior over `f`. Instead, it models :math:`f = \mathbb{E}[y |
X] = \int_y \frac{p(X, y)}{p(X)} y \ \text{d}y` using a :doc:`kernel function
<numpy_ml.utils.kernels>`, `k`, to estimate the smoothed data probabilities.
For example, the :class:`Nadaraya-Watson <numpy_ml.nonparametric.KernelRegression>`
estimator [4]_ [5]_ uses the following probability estimates:

.. math::
    \hat{p}(X)  &=  \prod_{i=1}^N \hat{p}(x_i) = \prod_{i=1}^N \sum_{j=1}^N \frac{k(x_i - x_j)}{N} \\
    \hat{p}(X, y)  &=  \prod_{i=1}^N \hat{p}(x_i, y_i) = \prod_{i=1}^N \sum_{j=1}^N \frac{k(x_i - x_j) k(y_i - y_j)}{N}


**Models**

- :class:`~numpy_ml.nonparametric.KernelRegression`

**References**

.. [4] Nadaraya, E. A. (1964). "On estimating regression". *Theory of
   Probability and Its Applications, 9 (1)*, 141-2.
.. [5] Watson, G. S. (1964). "Smooth regression analysis". *Sankhyā: The Indian
   Journal of Statistics, Series A. 26 (4)*, 359–372.

.. raw:: html

   <h2>See Also</h2>

The :doc:`trees <numpy_ml.trees>` module contains other classic nonparametric
approaches, including :doc:`decision trees <numpy_ml.trees.dt>`,
:doc:`random forests <numpy_ml.trees.rf>`, and :doc:`gradient
boosted decision trees <numpy_ml.trees.gbdt>`.

.. toctree::
   :maxdepth: 2
   :hidden:

   numpy_ml.nonparametric.knn
   numpy_ml.nonparametric.gp
   numpy_ml.nonparametric.kernel_regression


================================================
FILE: docs/numpy_ml.preprocessing.dsp.rst
================================================
Digital signal processing
#########################

``DCT``
-------
.. autofunction:: numpy_ml.preprocessing.dsp.DCT

``DFT``
-------
.. autofunction:: numpy_ml.preprocessing.dsp.DFT

``dft_bins``
------------
.. autofunction:: numpy_ml.preprocessing.dsp.dft_bins

``magnitude_spectrum``
----------------------
.. autofunction:: numpy_ml.preprocessing.dsp.magnitude_spectrum

``power_spectrum``
------------------
.. autofunction:: numpy_ml.preprocessing.dsp.power_spectrum

``batch_resample``
------------------
.. autofunction:: numpy_ml.preprocessing.dsp.batch_resample

``nn_interpolate_2D``
---------------------
.. autofunction:: numpy_ml.preprocessing.dsp.nn_interpolate_2D

``nn_interpolate_1D``
---------------------
.. autofunction:: numpy_ml.preprocessing.dsp.nn_interpolate_1D

``bilinear_interpolate``
-------------------------
.. autofunction:: numpy_ml.preprocessing.dsp.bilinear_interpolate

``to_frames``
-------------

.. autofunction:: numpy_ml.preprocessing.dsp.to_frames

``autocorrelate1D``
-------------------

.. autofunction:: numpy_ml.preprocessing.dsp.autocorrelate1D

``preemphasis``
---------------

.. autofunction:: numpy_ml.preprocessing.dsp.preemphasis

``cepstral_lifter``
-------------------

.. autofunction:: numpy_ml.preprocessing.dsp.cepstral_lifter

``mel_spectrogram``
-------------------

.. autofunction:: numpy_ml.preprocessing.dsp.mel_spectrogram

``mfcc``
--------

.. autofunction:: numpy_ml.preprocessing.dsp.mfcc

``mel2hz``
----------

.. autofunction:: numpy_ml.preprocessing.dsp.mel2hz

``hz2mel``
----------

.. autofunction:: numpy_ml.preprocessing.dsp.hz2mel

``mel_filterbank``
------------------

.. autofunction:: numpy_ml.preprocessing.dsp.mel_filterbank


================================================
FILE: docs/numpy_ml.preprocessing.general.rst
================================================
General
#######

``FeatureHasher``
-----------------

.. autoclass:: numpy_ml.preprocessing.general.FeatureHasher
	:members:
	:undoc-members:
	:inherited-members:

``OneHotEncoder``
-----------------

.. autoclass:: numpy_ml.preprocessing.general.OneHotEncoder
	:members:
	:undoc-members:
	:inherited-members:

``Standardizer``
----------------

.. autoclass:: numpy_ml.preprocessing.general.Standardizer
	:members:
	:undoc-members:
	:inherited-members:

``minibatch``
-------------

.. automodule:: numpy_ml.preprocessing.general
	:members: minibatch


================================================
FILE: docs/numpy_ml.preprocessing.nlp.rst
================================================
Natural language processing
###########################

``BytePairEncoder``
-------------------

.. autoclass:: numpy_ml.preprocessing.nlp.BytePairEncoder
	:members:
	:undoc-members:
	:inherited-members:

``HuffmanEncoder``
------------------

.. autoclass:: numpy_ml.preprocessing.nlp.HuffmanEncoder
	:members:
	:undoc-members:
	:inherited-members:

``TFIDFEncoder``
------------------

.. autoclass:: numpy_ml.preprocessing.nlp.TFIDFEncoder
	:members:
	:undoc-members:
	:inherited-members:

``Vocabulary``
--------------

.. autoclass:: numpy_ml.preprocessing.nlp.Vocabulary
	:members:
	:undoc-members:
	:inherited-members:

``Token``
---------

.. autoclass:: numpy_ml.preprocessing.nlp.Token
	:members:
	:undoc-members:
	:inherited-members:

``ngrams``
-----------

.. autofunction:: numpy_ml.preprocessing.nlp.ngrams

``remove_stop_words``
---------------------

.. autofunction:: numpy_ml.preprocessing.nlp.remove_stop_words

``strip_punctuation``
---------------------

.. autofunction:: numpy_ml.preprocessing.nlp.strip_punctuation

``tokenize_words``
-------------------

.. autofunction:: numpy_ml.preprocessing.nlp.tokenize_words

``tokenize_whitespace``
------------------------

.. autofunction:: numpy_ml.preprocessing.nlp.tokenize_whitespace

``tokenize_chars``
-------------------

.. autofunction:: numpy_ml.preprocessing.nlp.tokenize_chars

``tokenize_bytes_raw``
-----------------------

.. autofunction:: numpy_ml.preprocessing.nlp.tokenize_bytes_raw

``bytes_to_chars``
-----------------------

.. autofunction:: numpy_ml.preprocessing.nlp.bytes_to_chars


================================================
FILE: docs/numpy_ml.preprocessing.rst
================================================
Preprocessing
#############

.. toctree::
   :maxdepth: 3

   numpy_ml.preprocessing.general

   numpy_ml.preprocessing.dsp

   numpy_ml.preprocessing.nlp


================================================
FILE: docs/numpy_ml.rl_models.agents.rst
================================================
Agents
======

``CrossEntropyAgent``
---------------------
.. autoclass:: numpy_ml.rl_models.agents.CrossEntropyAgent
    :members:
    :undoc-members:
    :inherited-members:

``DynaAgent``
-------------
.. autoclass:: numpy_ml.rl_models.agents.DynaAgent
    :members:
    :undoc-members:
    :inherited-members:

``MonteCarloAgent``
-------------------
Monte Carlo methods are ways of solving RL problems based on averaging
sample returns for each state-action pair. Parameters are updated only at
the completion of an episode.

In on-policy learning, the agent maintains a single policy that it updates
over the course of training. In order to ensure the policy converges to a
(near-) optimal policy, the agent must maintain that the policy assigns
non-zero probability to ALL state-action pairs during training to ensure
continual exploration.

- Thus on-policy learning is a compromise--it learns action values not for the optimal policy, but for a *near*-optimal policy that still explores.

In off-policy learning, the agent maintains two separate policies:

1. **Target policy**: The policy that is learned during training and that will eventually become the optimal policy.
2. **Behavior policy**: A policy that is more exploratory and is used to generate behavior during training.

Off-policy methods are often of greater variance and are slower to
converge. On the other hand, off-policy methods are more powerful and
general than on-policy methods.

.. autoclass:: numpy_ml.rl_models.agents.MonteCarloAgent
    :members:
    :undoc-members:
    :inherited-members:

``TemporalDifferenceAgent``
---------------------------

Temporal difference methods are examples of bootstrapping in that they update
their estimate for the value of state `s` on the basis of a previous estimate.

Advantages of TD algorithms:

1. They do not require a model of the environment, its reward, or its next-state probability distributions.
2. They are implemented in an online, fully incremental fashion. This allows them to be used with infinite-horizons / when episodes take prohibitively long to finish.
3. TD algorithms learn from each transition regardless of what subsequent actions are taken.
4. In practice, TD methods have usually been found to converge faster than constant-:math:`\alpha` Monte Carlo methods on stochastic tasks.

.. autoclass:: numpy_ml.rl_models.agents.TemporalDifferenceAgent
    :members:
    :undoc-members:
    :inherited-members:


================================================
FILE: docs/numpy_ml.rl_models.rl_utils.rst
================================================
Utilities
=========

.. automodule:: numpy_ml.rl_models.rl_utils
    :members:
    :inherited-members:


================================================
FILE: docs/numpy_ml.rl_models.rst
================================================
Reinforcement learning
######################

.. toctree::
   :maxdepth: 3

   numpy_ml.rl_models.agents

   numpy_ml.rl_models.trainer

   numpy_ml.rl_models.rl_utils


================================================
FILE: docs/numpy_ml.rl_models.trainer.rst
================================================
Training
========

``Trainer``
-----------

.. automodule:: numpy_ml.rl_models.trainer
    :members:
    :undoc-members:
    :inherited-members:


================================================
FILE: docs/numpy_ml.trees.dt.rst
================================================
################
``DecisionTree``
################

.. autoclass:: numpy_ml.trees.DecisionTree
	:members:
	:undoc-members:
	:inherited-members:


================================================
FILE: docs/numpy_ml.trees.gbdt.rst
================================================
``GradientBoostedDecisionTree``
###############################

.. autoclass:: numpy_ml.trees.GradientBoostedDecisionTree
	:members:
	:undoc-members:
	:inherited-members:


================================================
FILE: docs/numpy_ml.trees.losses.rst
================================================
#########################
Losses (``trees.losses``)
#########################

.. automodule:: numpy_ml.trees.losses
	:members:
	:undoc-members:
	:inherited-members:
	:show-inheritance:


================================================
FILE: docs/numpy_ml.trees.rf.rst
================================================
``RandomForest``
################

.. autoclass:: numpy_ml.trees.RandomForest
	:members:
	:undoc-members:
	:inherited-members:


================================================
FILE: docs/numpy_ml.trees.rst
================================================
Tree-based models
#################
.. raw:: html

   <h2>Decision Trees</h2>

`Decision trees`_ [1]_ are popular nonparametric models that iteratively split a
training dataset into smaller, more homogeneous subsets. Each node in the tree
is associated with a decision rule, which dictates how to divide the data the
node inherits from its parent among each of its children. Each leaf node is
associated with at least one data point from the original training set.

.. figure:: img/decision_tree.png
    :width: 95%
    :align: center

    A binary decision tree trained on the dataset :math:`X = \{ \mathbf{x}_1,
    \ldots, \mathbf{x}_{10} \}`. Each example in the dataset is a 4-dimensional
    vector of real-valued features labeled :math:`x_1, \ldots, x_4`. Unshaded
    circles correspond to internal decision nodes, while shaded circles
    correspond to leaf nodes. Each leaf node is associated with a subset of the
    examples in `X`, selected based on the decision rules along the path from
    root to leaf.

At test time, new examples travel from the tree root to one of the leaves,
their path through the tree determined by the decision rules at each of the
nodes it visits. When a test example arrives at a leaf node, the targets for
the training examples at that leaf node are used to compute the model's
prediction.

Training decision trees corresponds to learning the set of decision rules to
partition the training data. This learning process proceeds greedily by
selecting the decision rule at each node that results in the greatest reduction
in an inhomogeneity or "impurity" metric, :math:`\mathcal{L}`. One popular
metric is the **information entropy**:

.. math::

    -\sum_j P_n(\omega_j) \log P_n(\omega_j)

where :math:`P_n(\omega_j)` is the fraction of data points at split `n` that are
associated with category :math:`\omega_j`. Another useful metric is the **Gini
impurity**:

.. math::

    \sum_{i \neq j} P_n(\omega_i) P_n(\omega_j) = 1 - \sum_{j} P_n(\omega_j)^2

For a binary tree (where each node has only two children), the reduction in
impurity after a particular split is

.. math::

    \Delta \mathcal{L} = \mathcal{L}(\text{Parent}) -
        P_{\text{left}} \mathcal{L}(\text{Left child}) -
            (1 - P_{\text{left}})\mathcal{L}(\text{Right child})

where :math:`\mathcal{L}(x)` is the impurity of the dataset at node `x`,
and :math:`P_{\text{left}}`/:math:`P_{\text{right}}` are the proportion of
examples at the current node that are partitioned into the left / right
children, respectively, by the proposed split.

.. _`Decision trees`: https://en.wikipedia.org/wiki/Decision_tree_learning

**Models**

- :class:`~numpy_ml.trees.DecisionTree`

**References**

.. [1] Breiman, L., Friedman, J. H., Olshen, R. A., and Stone, C. J. (1984).
   Classification and regression trees. Monterey, CA: Wadsworth & Brooks/Cole
   Advanced Books & Software.

.. raw:: html

   <h2>Bootstrap Aggregating</h2>

`Bootstrap aggregating`_ (bagging) methods [2]_ are an `ensembling approach`_ that
proceeds by creating `n` bootstrapped samples of a training dataset by sampling
from it with replacement. A separate learner is fit on each of the `n`
bootstrapped datasets, with the final bootstrap aggregated model prediction
corresponding to the average (or majority vote, for classifiers) across each
of the `n` learners' predictions for a given datapoint.

The `random forest`_ model [3]_ [4]_ is a canonical example of bootstrap
aggregating. For this approach, each of the `n` learners is a different
decision tree. In addition to training each decision tree on a different
bootstrapped dataset, random forests employ a `random subspace`_ approach [5]_:
each decision tree is trained on a subsample (without replacement) of the full
collection of dataset features.

.. _`Bootstrap aggregating`: https://en.wikipedia.org/wiki/Bootstrap_aggregating
.. _`random forest`: https://en.wikipedia.org/wiki/Random_forest
.. _`ensembling approach`: https://en.wikipedia.org/wiki/Ensemble_learning
.. _`random subspace`: https://en.wikipedia.org/wiki/Random_subspace_method

**Models**

- :class:`~numpy_ml.trees.RandomForest`

**References**

.. [2] Breiman, L. (1994). "Bagging predictors". *Technical Report 421.
   Statistics Department, UC Berkeley*.
.. [3] Ho, T. K. (1995). "Random decision forests". *Proceedings of the Third
   International Conference on Document Analysis and Recognition, 1*: 278-282.
.. [4] Breiman, L. (2001). "Random forests". *Machine Learning. 45(1)*: 5-32.
.. [5] Ho, T. K. (1998). "The random subspace method for constructing decision
   forests". *IEEE Transactions on Pattern Analysis and Machine Intelligence.
   20(8)*: 832-844.

.. raw:: html

   <h2>Gradient Boosting</h2>

`Gradient boosting`_ [6]_ [7]_ [8]_ is another popular `ensembling technique`_
that proceeds by iteratively fitting a sequence of `m` weak learners such that:

.. math::

    f_m(X) = b(X) + \eta w_1 g_1 + \ldots + \eta w_m g_m

where `b` is a fixed initial estimate for the targets, :math:`\eta` is
a learning rate parameter, and :math:`w_{i}` and :math:`g_{i}`
denote the weights and predictions of the :math:`i^{th}` learner.

At each training iteration a new weak learner is fit to predict the negative
gradient of the loss with respect to the previous prediction,
:math:`\nabla_{f_{i-1}} \mathcal{L}(y, \ f_{i-1}(X))`.  We then use the
element-wise product of the predictions of this weak learner, :math:`g_i`, with
a weight, :math:`w_i`, computed via, e.g., `line-search`_ on the objective
:math:`w_i = \arg \min_{w} \sum_{j=1}^n \mathcal{L}(y_j, f_{i-1}(x_j) + w g_i)`
, to adjust the predictions of the model from the previous iteration,
:math:`f_{i-1}(X)`:

.. math::

    f_i(X) := f_{i-1}(X) + w_i g_i

The current module implements gradient boosting using decision trees as the
weak learners.

.. _`Gradient boosting`: https://en.wikipedia.org/wiki/Gradient_boosting
.. _`ensembling technique`: https://en.wikipedia.org/wiki/Ensemble_learning
.. _`line-search`: https://en.wikipedia.org/wiki/Line_search

**Models**

- :class:`~numpy_ml.trees.GradientBoostedDecisionTree`

**References**

.. [6]  Breiman, L. (1997). "Arcing the edge". *Technical Report 486.
   Statistics Department, UC Berkeley*.
.. [7] Friedman, J. H. (1999). "Greedy function approximation: A gradient
   boosting machine". *IMS 1999 Reitz Lecture*.
.. [8]  Mason, L., Baxter, J., Bartlett, P. L., Frean, M. (1999). "Boosting
   algorithms as gradient descent". *Advances in Neural Information Processing
   Systems, 12*: 512–518.

.. toctree::
   :maxdepth: 3
   :hidden:

   numpy_ml.trees.dt

   numpy_ml.trees.rf

   numpy_ml.trees.gbdt


================================================
FILE: docs/numpy_ml.utils.data_structures.rst
================================================
Data structures
================

``BallTree``
------------

.. autoclass:: numpy_ml.utils.data_structures.BallTree
	:members:
	:undoc-members:
	:inherited-members:

``DiscreteSampler``
-------------------

.. autoclass:: numpy_ml.utils.data_structures.DiscreteSampler
	:members:
	:undoc-members:
	:inherited-members:

``PriorityQueue``
-----------------

.. autoclass:: numpy_ml.utils.data_structures.PriorityQueue
	:members:
	:undoc-members:
	:inherited-members:

``PQNode``
-----------------

.. autoclass:: numpy_ml.utils.data_structures.PQNode
	:members:
	:undoc-members:
	:inherited-members:

``Dict``
--------

.. autoclass:: numpy_ml.utils.data_structures.Dict
	:members:
	:undoc-members:
        :show-inheritance:


================================================
FILE: docs/numpy_ml.utils.distance_metrics.rst
================================================
Distance metrics
================

Common distance functions.

``euclidean``
---------------
.. autofunction:: numpy_ml.utils.distance_metrics.euclidean

``chebyshev``
---------------
.. autofunction:: numpy_ml.utils.distance_metrics.chebyshev

``hamming``
-------------
.. autofunction:: numpy_ml.utils.distance_metrics.hamming

``manhattan``
--------------
.. autofunction:: numpy_ml.utils.distance_metrics.manhattan

``minkowski``
--------------
.. autofunction:: numpy_ml.utils.distance_metrics.minkowski


================================================
FILE: docs/numpy_ml.utils.graphs.rst
================================================
Graphs
======

``Graph``
---------
.. autoclass:: numpy_ml.utils.graphs.Graph
    :members:
    :undoc-members:
    :inherited-members:

``Edge``
--------
.. autoclass:: numpy_ml.utils.graphs.Edge
    :members:
    :undoc-members:
    :inherited-members:

``DiGraph``
-----------
.. autoclass:: numpy_ml.utils.graphs.DiGraph
    :members:
    :undoc-members:
    :show-inheritance:

``UndirectedGraph``
-------------------
.. autoclass:: numpy_ml.utils.graphs.UndirectedGraph
    :members:
    :undoc-members:
    :show-inheritance:

``random_unweighted_graph``
---------------------------

.. autofunction:: numpy_ml.utils.graphs.random_unweighted_graph

``random_DAG``
--------------

.. autofunction:: numpy_ml.utils.graphs.random_DAG


================================================
FILE: docs/numpy_ml.utils.kernels.rst
================================================
Kernels
=======

A collection of common kernel / similarity functions. All kernels are
continuous, bounded, and symmetric real functions which integrate to 1.

``LinearKernel``
----------------

.. autoclass:: numpy_ml.utils.kernels.LinearKernel
    :members:
    :undoc-members:
    :inherited-members:

``PolynomialKernel``
--------------------

.. autoclass:: numpy_ml.utils.kernels.PolynomialKernel
    :members:
    :undoc-members:
    :inherited-members:

``RBFKernel``
-------------

.. autoclass:: numpy_ml.utils.kernels.RBFKernel
    :members:
    :undoc-members:
    :inherited-members:


================================================
FILE: docs/numpy_ml.utils.rst
================================================
Utilities
#########

.. toctree::
   :maxdepth: 3

   numpy_ml.utils.data_structures

   numpy_ml.utils.distance_metrics

   numpy_ml.utils.graphs

   numpy_ml.utils.kernels

   numpy_ml.utils.windows

   numpy_ml.utils.testing


================================================
FILE: docs/numpy_ml.utils.testing.rst
================================================
Testing
-------
Common helper functions for testing the ML algorithms in the rest of the repo.

.. automodule:: numpy_ml.utils.testing
    :members:
    :undoc-members:
    :inherited-members:
    :show-inheritance:


================================================
FILE: docs/numpy_ml.utils.windows.rst
================================================
Window functions
================
In digital signal processing, windowing functions are useful to counteract the
assumption made by the FFT that data is infinite and to reduce spectral
leakage.

``blackman_harris``
-------------------

.. autofunction:: numpy_ml.utils.windows.blackman_harris

``generalized_cosine``
----------------------

.. autofunction:: numpy_ml.utils.windows.generalized_cosine

``hamming``
-----------

.. autofunction:: numpy_ml.utils.windows.hamming

``hann``
-----------

.. autofunction:: numpy_ml.utils.windows.hann


================================================
FILE: docs/requirements.txt
================================================
numpy
scipy

# all this is for the dang tests
matplotlib
seaborn
pandas
sklearn
huffman


================================================
FILE: numpy_ml/README.md
================================================
# Models
This repo includes code for the following models:

1. **Gaussian mixture model**
    - EM training

2. **Hidden Markov model**
    - Viterbi decoding
    - Likelihood computation
    - MLE parameter estimation via Baum-Welch/forward-backward algorithm

3. **Latent Dirichlet allocation** (topic model)
    - Standard model with MLE parameter estimation via variational EM
    - Smoothed model with MAP parameter estimation via MCMC

4. **Neural networks**
    * Layers / Layer-wise ops
        - Add
        - Flatten
        - Multiply
        - Softmax
        - Fully-connected/Dense
        - Sparse evolutionary connections
        - LSTM
        - Elman-style RNN
        - Max + average pooling
        - Dot-product attention
        - Embedding layer
        - Restricted Boltzmann machine (w. CD-n training)
        - 2D deconvolution (w. padding and stride)
        - 2D convolution (w. padding, dilation, and stride)
        - 1D convolution (w. padding, dilation, stride, and causality)
    * Modules
        - Bidirectional LSTM
        - ResNet-style residual blocks (identity and convolution)
        - WaveNet-style residual blocks with dilated causal convolutions
        - Transformer-style multi-headed scaled dot product attention
    * Regularizers
        - Dropout
    * Normalization
        - Batch normalization (spatial and temporal)
        - Layer normalization (spatial and temporal)
    * Optimizers
        - SGD w/ momentum
        - AdaGrad
        - RMSProp
        - Adam
    * Learning Rate Schedulers
        - Constant
        - Exponential
        - Noam/Transformer
        - Dlib scheduler
    * Weight Initializers
        - Glorot/Xavier uniform and normal
        - He/Kaiming uniform and normal
        - Standard and truncated normal
    * Losses
        - Cross entropy
        - Squared error
        - Bernoulli VAE loss
        - Wasserstein loss with gradient penalty
        - Noise contrastive estimation loss
    * Activations
        - ReLU
        - Tanh
        - Affine
        - Sigmoid
        - Leaky ReLU
        - ELU
        - SELU
        - Exponential
        - Hard Sigmoid
        - Softplus
    * Models
        - Bernoulli variational autoencoder
        - Wasserstein GAN with gradient penalty
        - word2vec encoder with skip-gram and CBOW architectures
    * Utilities
        - `col2im` (MATLAB port)
        - `im2col` (MATLAB port)
        - `conv1D`
        - `conv2D`
        - `deconv2D`
        - `minibatch`

5. **Tree-based models**
    - Decision trees (CART)
    - [Bagging] Random forests
    - [Boosting] Gradient-boosted decision trees

6. **Linear models**
    - Ridge regression
    - Logistic regression
    - Ordinary least squares
    - Gaussian naive Bayes classifier
    - Generalized linear model (identity, log, and logit links)
    - Bayesian linear regression w/ conjugate priors
        - Unknown mean, known variance (Gaussian prior)
        - Unknown mean, unknown variance (Normal-Gamma / Normal-Inverse-Wishart prior)

7. **n-Gram sequence models**
    - Maximum likelihood scores
    - Additive/Lidstone smoothing
    - Simple Good-Turing smoothing

8. **Multi-armed bandit models**
    - UCB1
    - LinUCB
    - Epsilon-greedy
    - Thompson sampling w/ conjugate priors
        - Beta-Bernoulli sampler

8. **Reinforcement learning models**
    - Cross-entropy method agent
    - First visit on-policy Monte Carlo agent
    - Weighted incremental importance sampling Monte Carlo agent
    - Expected SARSA agent
    - TD-0 Q-learning agent
    - Dyna-Q / Dyna-Q+ with prioritized sweeping

9. **Nonparametric models**
    - Nadaraya-Watson kernel regression
    - k-Nearest neighbors classification and regression
    - Gaussian process regression

10. **Matrix factorization**
    - Regularized alternating least-squares
    - Non-negative matrix factorization

11. **Preprocessing**
    - Discrete Fourier transform (1D signals)
    - Discrete cosine transform (type-II) (1D signals)
    - Bilinear interpolation (2D signals)
    - Nearest neighbor interpolation (1D and 2D signals)
    - Autocorrelation (1D signals)
    - Signal windowing
    - Text tokenization
    - Feature hashing
    - Feature standardization
    - One-hot encoding / decoding
    - Huffman coding / decoding
    - Byte pair encoding / decoding
    - Term frequency-inverse document frequency (TF-IDF) encoding
    - MFCC encoding

12. **Utilities**
    - Similarity kernels
    - Distance metrics
    - Priority queue
    - Ball tree
    - Discrete sampler
    - Graph processing and generators


================================================
FILE: numpy_ml/__init__.py
================================================
# noqa
"""Common ML and ML-adjacent algorithms implemented in NumPy"""

from . import utils
from . import preprocessing

from . import gmm
from . import hmm
from . import lda
from . import linear_models
from . import neural_nets
from . import ngram
from . import nonparametric
from . import rl_models
from . import trees
from . import bandits
from . import factorization


================================================
FILE: numpy_ml/bandits/README.md
================================================
# Bandits
The `bandits.py` module includes several simple multi-arm bandit
environments.

The `policies.py` module implements a number of standard multi-arm bandit
policies.

1. **Bandits**
    - MAB: Bernoulli, Multinomial, and Gaussian payout distributions
    - Contextual MAB: Linear contextual bandits

2. **Policies**
    - Epsilon-greedy
    - UCB1 ([Auer, Cesa-Bianchi, & Fischer, 2002](https://link.springer.com/content/pdf/10.1023/A:1013689704352.pdf))
    - Conjugate Thompson sampler for Bernoulli bandits ([Thompson, 1933](https://www.gwern.net/docs/statistics/decision/1933-thompson.pdf); [Chapelle & Li, 2010](https://papers.nips.cc/paper/4321-an-empirical-evaluation-of-thompson-sampling.pdf))
    - LinUCB ([Li, Chu, Langford, & Schapire, 2010](http://rob.schapire.net/papers/www10.pdf))

## Plots
<p align="center">
<img src="img/ThompsonSamplingBetaBinomial.png" align='center' height="400" />

<img src="img/UCB1.png" align='center' height="400" />

<img src="img/EpsilonGreedy.png" align='center' height="400" />
</p>


================================================
FILE: numpy_ml/bandits/__init__.py
================================================
from .bandits import *
from . import policies
from . import trainer


================================================
FILE: numpy_ml/bandits/bandits.py
================================================
"""A module containing different variations on multi-armed bandit environments."""

from abc import ABC, abstractmethod

import numpy as np

from numpy_ml.utils.testing import random_one_hot_matrix, is_number


class Bandit(ABC):
    """
    Abstract base class for multi-armed bandit environments.

    Subclasses implement :meth:`oracle_payoff` and :meth:`_pull`. The public
    :meth:`pull` method validates the arm index, advances the step counter,
    and delegates the actual sampling to :meth:`_pull`.
    """

    def __init__(self, rewards, reward_probs, context=None):
        """
        Parameters
        ----------
        rewards : list of length `K`
            One entry per arm. Only the length is used here (to set
            ``n_arms``); subclasses store the values themselves.
        reward_probs : list of length `K`
            One entry per arm. Must have the same length as `rewards`.
        context : object or None
            Unused by the base class. Default is None.
        """
        assert len(rewards) == len(reward_probs)
        self.step = 0
        self.n_arms = len(rewards)

        super().__init__()

    def __repr__(self):
        """A string representation for the bandit"""
        HP = self.hyperparameters
        # fall back to the class name when a subclass does not supply an "id"
        name = HP.get("id", type(self).__name__)
        params = ", ".join(["{}={}".format(k, v) for (k, v) in HP.items() if k != "id"])
        return "{}({})".format(name, params)

    @property
    def hyperparameters(self):
        """A dictionary of the bandit hyperparameters"""
        return {}

    @abstractmethod
    def oracle_payoff(self, context=None):
        """
        Return the expected reward for an optimal agent.

        Parameters
        ----------
        context : :py:class:`ndarray <numpy.ndarray>` of shape `(D, K)` or None
            The current context matrix for each of the bandit arms, if
            applicable. Default is None.

        Returns
        -------
        optimal_rwd : float
            The expected reward under an optimal policy.
        """
        pass

    def pull(self, arm_id, context=None):
        """
        "Pull" (i.e., sample from) a given arm's payoff distribution.

        Parameters
        ----------
        arm_id : int
            The integer ID of the arm to sample from
        context : :py:class:`ndarray <numpy.ndarray>` of shape `(D,)` or None
            The context vector for the current timestep if this is a contextual
            bandit. Otherwise, this argument is unused and defaults to None.

        Returns
        -------
        reward : float
            The reward sampled from the given arm's payoff distribution
        """
        # reject negative ids as well: they would silently wrap around when
        # used to index per-arm arrays
        assert 0 <= arm_id < self.n_arms

        self.step += 1
        return self._pull(arm_id, context)

    def reset(self):
        """Reset the bandit step and action counters to zero."""
        self.step = 0

    @abstractmethod
    def _pull(self, arm_id, context):
        """
        Sample a reward for arm `arm_id`. Called by :meth:`pull` with both the
        arm index and the (possibly None) context, so subclasses must accept
        both arguments.
        """
        pass


class MultinomialBandit(Bandit):
    def __init__(self, payoffs, payoff_probs):
        """
        A multi-armed bandit where each arm is associated with a different
        multinomial payoff distribution.

        Parameters
        ----------
        payoffs : ragged list of length `K`
            The payoff values for each of the `K` arms. ``payoffs[k][i]``
            holds the `i` th payoff value for arm `k`.
        payoff_probs : ragged list of length `K`
            A list of the probabilities associated with each of the payoff
            values in ``payoffs``. ``payoff_probs[k][i]`` holds the probability
            of payoff index `i` for arm `k`.
        """
        super().__init__(payoffs, payoff_probs)

        for r, rp in zip(payoffs, payoff_probs):
            assert len(r) == len(rp)
            np.testing.assert_almost_equal(sum(rp), 1.0)

        payoffs = self._to_row_array(payoffs)
        payoff_probs = self._to_row_array(payoff_probs)

        self.payoffs = payoffs
        self.payoff_probs = payoff_probs
        self.arm_evs = np.array([np.sum(p * v) for p, v in zip(payoff_probs, payoffs)])
        self.best_ev = np.max(self.arm_evs)
        self.best_arm = np.argmax(self.arm_evs)

    @staticmethod
    def _to_row_array(rows):
        """
        Convert a list of per-arm sequences into an array indexable by arm.

        Rows of equal length are stacked into an ordinary 2D array. Ragged
        rows are stored in a 1D object array, because recent NumPy versions
        refuse to build ragged arrays implicitly.
        """
        rows = [np.asarray(r) for r in rows]
        if len({len(r) for r in rows}) <= 1:
            return np.array(rows)

        ragged = np.empty(len(rows), dtype=object)
        for ix, row in enumerate(rows):
            ragged[ix] = row
        return ragged

    @property
    def hyperparameters(self):
        """A dictionary of the bandit hyperparameters"""
        return {
            "id": "MultinomialBandit",
            "payoffs": self.payoffs,
            "payoff_probs": self.payoff_probs,
        }

    def oracle_payoff(self, context=None):
        """
        Return the expected reward for an optimal agent.

        Parameters
        ----------
        context : :py:class:`ndarray <numpy.ndarray>` of shape `(D, K)` or None
            Unused. Default is None.

        Returns
        -------
        optimal_rwd : float
            The expected reward under an optimal policy.
        optimal_arm : int
            The arm ID with the largest expected reward.
        """
        return self.best_ev, self.best_arm

    def _pull(self, arm_id, context):
        """Sample one payoff value for arm `arm_id`; `context` is unused."""
        payoffs = self.payoffs[arm_id]
        probs = self.payoff_probs[arm_id]
        return np.random.choice(payoffs, p=probs)


class BernoulliBandit(Bandit):
    def __init__(self, payoff_probs):
        """
        A multi-armed bandit where each arm is associated with an independent
        Bernoulli payoff distribution.

        Parameters
        ----------
        payoff_probs : list of length `K`
            A list of the payoff probability for each arm. ``payoff_probs[k]``
            holds the probability of payoff for arm `k`.
        """
        payoffs = [1] * len(payoff_probs)
        super().__init__(payoffs, payoff_probs)

        assert all(0 <= p <= 1 for p in payoff_probs)

        self.payoffs = np.array(payoffs)
        self.payoff_probs = np.array(payoff_probs)

        # every arm pays 1 on success, so the expected value is the success
        # probability itself
        self.arm_evs = self.payoff_probs
        self.best_ev = np.max(self.arm_evs)
        self.best_arm = np.argmax(self.arm_evs)

    @property
    def hyperparameters(self):
        """A dictionary of the bandit hyperparameters"""
        return {
            "id": "BernoulliBandit",
            "payoff_probs": self.payoff_probs,
        }

    def oracle_payoff(self, context=None):
        """
        Return the expected reward for an optimal agent.

        Parameters
        ----------
        context : :py:class:`ndarray <numpy.ndarray>` of shape `(D, K)` or None
            Unused. Default is None.

        Returns
        -------
        optimal_rwd : float
            The expected reward under an optimal policy.
        optimal_arm : int
            The arm ID with the largest expected reward.
        """
        return self.best_ev, self.best_arm

    def _pull(self, arm_id, context):
        """Return 1 with probability ``payoff_probs[arm_id]``, else 0."""
        # `rand()` is uniform on [0, 1): a strict comparison makes the success
        # probability exactly p, so an arm with p == 0 can never pay out
        return int(np.random.rand() < self.payoff_probs[arm_id])


class GaussianBandit(Bandit):
    def __init__(self, payoff_dists, payoff_probs):
        """
        A multi-armed bandit that is similar to
        :class:`BernoulliBandit`, but instead of each arm having
        a fixed payout of 1, the payoff values are sampled from independent
        Gaussian RVs.

        Parameters
        ----------
        payoff_dists : list of 2-tuples of length `K`
            The parameters of the distributions over payoff values for each of
            the `K` arms. Specifically, ``payoff_dists[k]`` is a tuple of
            (mean, variance) for the Gaussian distribution over payoffs
            associated with arm `k`.
        payoff_probs : list of length `K`
            ``payoff_probs[k]`` holds the probability that arm `k` pays out at
            all on a given pull. Each entry must lie in [0, 1].
        """
        super().__init__(payoff_dists, payoff_probs)

        for (mean, var), rp in zip(payoff_dists, payoff_probs):
            assert var > 0
            # each entry is a single per-arm probability, not a distribution
            assert 0 <= rp <= 1

        self.payoff_dists = payoff_dists
        self.payoff_probs = payoff_probs

        # an arm pays N(mu, var) with probability p and 0 otherwise, so its
        # expected reward is p * mu
        self.arm_evs = np.array(
            [p * mu for (mu, _), p in zip(payoff_dists, payoff_probs)]
        )
        self.best_ev = np.max(self.arm_evs)
        self.best_arm = np.argmax(self.arm_evs)

    @property
    def hyperparameters(self):
        """A dictionary of the bandit hyperparameters"""
        return {
            "id": "GaussianBandit",
            "payoff_dists": self.payoff_dists,
            "payoff_probs": self.payoff_probs,
        }

    def _pull(self, arm_id, context):
        """
        Sample a reward for arm `arm_id`: a Gaussian draw with probability
        ``payoff_probs[arm_id]``, otherwise 0. `context` is unused.
        """
        mean, var = self.payoff_dists[arm_id]

        reward = 0
        if np.random.rand() < self.payoff_probs[arm_id]:
            # np.random.normal takes a standard deviation, not a variance
            reward = np.random.normal(mean, np.sqrt(var))

        return reward

    def oracle_payoff(self, context=None):
        """
        Return the expected reward for an optimal agent.

        Parameters
        ----------
        context : :py:class:`ndarray <numpy.ndarray>` of shape `(D, K)` or None
            Unused. Default is None.

        Returns
        -------
        optimal_rwd : float
            The expected reward under an optimal policy.
        optimal_arm : int
            The arm ID with the largest expected reward.
        """
        return self.best_ev, self.best_arm


class ShortestPathBandit(Bandit):
    def __init__(self, G, start_vertex, end_vertex):
        """
        A multi-armed bandit view of the shortest path problem on a weighted
        graph.

        Notes
        -----
        Every valid path from the start vertex to the end vertex is one arm.
        Rewards are the negated sum of the edge weights along the chosen path,
        so maximizing reward corresponds to minimizing path cost.

        Parameters
        ----------
        G : :class:`Graph <numpy_ml.utils.graphs.Graph>` instance
            A weighted graph object. Weights can be fixed or probabilistic.
        start_vertex : int
            The index of the path's start vertex in the graph
        end_vertex : int
            The index of the path's end vertex in the graph
        """
        self.G = G
        self.start_vertex = start_vertex
        self.end_vertex = end_vertex
        self.adj_dict = G.to_adj_dict()
        self.paths = G.all_paths(start_vertex, end_vertex)

        self.arm_evs = self._calc_arm_evs()
        self.best_ev = np.max(self.arm_evs)
        self.best_arm = np.argmax(self.arm_evs)

        # the base class only needs one entry per arm (i.e., per path)
        dummy = [None] * len(self.paths)
        super().__init__(dummy, dummy)

    @property
    def hyperparameters(self):
        """A dictionary of the bandit hyperparameters"""
        hparams = {"id": "ShortestPathBandit", "G": self.G}
        hparams["end_vertex"] = self.end_vertex
        hparams["start_vertex"] = self.start_vertex
        return hparams

    def oracle_payoff(self, context=None):
        """
        Return the expected reward for an optimal agent.

        Parameters
        ----------
        context : :py:class:`ndarray <numpy.ndarray>` of shape `(D, K)` or None
            Unused. Default is None.

        Returns
        -------
        optimal_rwd : float
            The expected reward under an optimal policy.
        optimal_arm : float
            The arm ID with the largest expected reward.
        """
        return self.best_ev, self.best_arm

    def _edges_along(self, path):
        """Yield, hop by hop, the first edge joining consecutive path vertices."""
        to_vertex = self.G.get_vertex
        for src, dst in zip(path[:-1], path[1:]):
            yield [edge for edge in self.adj_dict[src] if edge.to == to_vertex(dst)][0]

    def _calc_arm_evs(self):
        """Compute the negated total edge weight of every candidate path."""
        totals = np.zeros(len(self.paths))
        for path_ix, path in enumerate(self.paths):
            for edge in self._edges_along(path):
                totals[path_ix] -= edge.weight
        return totals

    def _pull(self, arm_id, context):
        """Traverse path `arm_id` and return the negated sum of its edge weights."""
        total = 0
        for edge in self._edges_along(self.paths[arm_id]):
            total -= edge.weight
        return total


class ContextualBernoulliBandit(Bandit):
    def __init__(self, context_probs):
        """
        A contextual version of :class:`BernoulliBandit` where each binary
        context feature is associated with an independent Bernoulli payoff
        distribution.

        Parameters
        ----------
        context_probs : :py:class:`ndarray <numpy.ndarray>` of shape `(D, K)`
            A matrix of the payoff probabilities associated with each of the
            `D` context features, for each of the `K` arms. Index `(i, j)`
            contains the probability of payoff for arm `j` under context `i`.
        """
        D, K = context_probs.shape

        # use a dummy placeholder variable to initialize the Bandit superclass
        placeholder = [None] * K
        super().__init__(placeholder, placeholder)

        self.context_probs = context_probs
        self.arm_evs = self.context_probs
        # best expected payoff / best arm for each of the D contexts
        self.best_evs = self.arm_evs.max(axis=1)
        self.best_arms = self.arm_evs.argmax(axis=1)

    @property
    def hyperparameters(self):
        """A dictionary of the bandit hyperparameters"""
        return {
            "id": "ContextualBernoulliBandit",
            "context_probs": self.context_probs,
        }

    def get_context(self):
        """
        Sample a random one-hot context vector. This vector will be the same
        for all arms.

        Returns
        -------
        context : :py:class:`ndarray <numpy.ndarray>` of shape `(D, K)`
            A random `D`-dimensional one-hot context vector repeated for each
            of the `K` bandit arms.
        """
        D, K = self.context_probs.shape
        context = np.zeros((D, K))
        context[np.random.choice(D), :] = 1
        # return the (D, K) matrix: `oracle_payoff` and `_pull` both index the
        # context by column, so a flat (D,) vector would break them
        return context

    def oracle_payoff(self, context):
        """
        Return the expected reward for an optimal agent.

        Parameters
        ----------
        context : :py:class:`ndarray <numpy.ndarray>` of shape `(D, K)`
            The current context matrix for each of the bandit arms.

        Returns
        -------
        optimal_rwd : float
            The expected reward under an optimal policy.
        optimal_arm : int
            The arm ID with the largest expected reward.
        """
        # every column holds the same one-hot vector, so the first one
        # identifies the active context
        context_id = context[:, 0].argmax()
        return self.best_evs[context_id], self.best_arms[context_id]

    def _pull(self, arm_id, context):
        """Sample a Bernoulli reward for arm `arm_id` under `context`."""
        K = self.context_probs.shape[1]
        # one-hot column @ (D, K) selects the active context's row of payoff
        # probabilities, one per arm
        arm_probs = context[:, arm_id] @ self.context_probs
        arm_rwds = (np.random.rand(K) <= arm_probs).astype(int)
        return arm_rwds[arm_id]


class ContextualLinearBandit(Bandit):
    def __init__(self, K, D, payoff_variance=1):
        r"""
        A contextual linear multi-armed bandit.

        Notes
        -----
        In a contextual linear bandit the expected payoff of an arm :math:`a
        \in \mathcal{A}` at time `t` is a linear combination of its context
        vector :math:`\mathbf{x}_{t,a}` with a coefficient vector
        :math:`\theta_a`:

        .. math::

            \mathbb{E}[r_{t, a} \mid \mathbf{x}_{t, a}] = \mathbf{x}_{t,a}^\top \theta_a

        In this implementation, the arm coefficient vectors :math:`\theta` are
        initialized independently from a uniform distribution on the interval
        [-1, 1] (the resulting matrix is then divided by its 2-norm), and the
        specific reward at timestep `t` is normally distributed:

        .. math::

            r_{t, a} \mid \mathbf{x}_{t, a} \sim
                \mathcal{N}(\mathbf{x}_{t,a}^\top \theta_a, \sigma_a^2)

        Parameters
        ----------
        K : int
            The number of bandit arms
        D : int
            The dimensionality of the context vectors
        payoff_variance : float or :py:class:`ndarray <numpy.ndarray>` of shape `(K,)`
            The variance of the random noise in the arm payoffs. If a float,
            the variance is assumed to be equal for each arm. Default is 1.
        """
        if is_number(payoff_variance):
            payoff_variance = [payoff_variance] * K

        assert len(payoff_variance) == K
        assert all(v > 0 for v in payoff_variance)

        self.K = K
        self.D = D
        self.payoff_variance = payoff_variance

        # use a dummy placeholder variable to initialize the Bandit superclass
        placeholder = [None] * K
        super().__init__(placeholder, placeholder)

        # initialize the theta matrix
        self.thetas = np.random.uniform(-1, 1, size=(D, K))
        self.thetas /= np.linalg.norm(self.thetas, 2)

    @property
    def hyperparameters(self):
        """A dictionary of the bandit hyperparameters"""
        return {
            "id": "ContextualLinearBandit",
            "K": self.K,
            "D": self.D,
            "payoff_variance": self.payoff_variance,
        }

    @property
    def parameters(self):
        """A dictionary of the current bandit parameters"""
        return {"thetas": self.thetas}

    def get_context(self):
        """
        Sample the context vectors for each arm from a multivariate standard
        normal distribution.

        Returns
        -------
        context : :py:class:`ndarray <numpy.ndarray>` of shape `(D, K)`
            A `D`-dimensional context vector sampled from a standard normal
            distribution for each of the `K` bandit arms.
        """
        return np.random.normal(size=(self.D, self.K))

    def _expected_payoffs(self, context):
        """Return the length-`K` vector of expected arm payoffs under `context`."""
        thetas = self.thetas
        return np.array([context[:, k] @ thetas[:, k] for k in range(self.K)])

    def oracle_payoff(self, context):
        """
        Return the expected reward for an optimal agent.

        Parameters
        ----------
        context : :py:class:`ndarray <numpy.ndarray>` of shape `(D, K)`
            The current context matrix for each of the bandit arms.

        Returns
        -------
        optimal_rwd : float
            The expected reward under an optimal policy.
        optimal_arm : int
            The arm ID with the largest expected reward.
        """
        # compute the expected values from the supplied context rather than
        # from whatever the last pull happened to cache, so the oracle is
        # valid before any arm has been pulled
        arm_evs = self._expected_payoffs(context)
        best_arm = np.argmax(arm_evs)
        return arm_evs[best_arm], best_arm

    def _pull(self, arm_id, context):
        """Sample a noisy linear reward for arm `arm_id` under `context`."""
        # `scale` is a standard deviation, so convert from the stored variance
        noise_std = np.sqrt(self.payoff_variance)
        self._noise = np.random.normal(scale=noise_std, size=self.K)
        self.arm_evs = self._expected_payoffs(context)
        return (self.arm_evs + self._noise)[arm_id]


================================================
FILE: numpy_ml/bandits/policies.py
================================================
"""A module containing exploration policies for various multi-armed bandit problems."""

from abc import ABC, abstractmethod
from collections import defaultdict

import numpy as np

from ..utils.testing import is_number


class BanditPolicyBase(ABC):
    def __init__(self):
        """A simple base class for multi-armed bandit policies"""
        self.step = 0
        self.ev_estimates = {}
        self.is_initialized = False
        super().__init__()

    def __repr__(self):
        """Return a string representation of the policy"""
        # tolerate subclasses that do not override `hyperparameters` (the base
        # property returns None) or that omit the "id" entry
        HP = self.hyperparameters or {}
        name = HP.get("id", type(self).__name__)
        params = ", ".join(["{}={}".format(k, v) for (k, v) in HP.items() if k != "id"])
        return "{}({})".format(name, params)

    @property
    def hyperparameters(self):
        """A dictionary containing the policy hyperparameters"""
        pass

    @property
    def parameters(self):
        """A dictionary containing the current policy parameters"""
        pass

    def act(self, bandit, context=None):
        """
        Select an arm and sample from its payoff distribution.

        Parameters
        ----------
        bandit : :class:`Bandit <numpy_ml.bandits.bandits.Bandit>` instance
            The multi-armed bandit to act upon
        context : :py:class:`ndarray <numpy.ndarray>` of shape `(D,)` or None
            The context vector for the current timestep if interacting with a
            contextual bandit. Otherwise, this argument is unused. Default is
            None.

        Returns
        -------
        rwd : float
            The reward received after pulling ``arm_id``.
        arm_id : int
            The arm that was pulled to generate ``rwd``.
        """
        if not self.is_initialized:
            self._initialize_params(bandit)

        arm_id = self._select_arm(bandit, context)
        rwd = self._pull_arm(bandit, arm_id, context)
        self._update_params(arm_id, rwd, context)
        return rwd, arm_id

    def reset(self):
        """Reset the policy parameters and counters to their initial states."""
        self.step = 0
        self._reset_params()
        self.is_initialized = False

    def _pull_arm(self, bandit, arm_id, context):
        """Execute a bandit action and return the received reward."""
        self.step += 1
        return bandit.pull(arm_id, context)

    @abstractmethod
    def _select_arm(self, bandit, context):
        """Select an arm based on the current context"""
        pass

    @abstractmethod
    def _update_params(self, arm_id, reward, context):
        """
        Update the policy parameters after an interaction. Called by
        :meth:`act` with the pulled arm, the reward it produced, and the
        (possibly None) context.
        """
        pass

    @abstractmethod
    def _initialize_params(self, bandit):
        """
        Initialize any policy-specific parameters that depend on information
        from the bandit environment.
        """
        pass

    @abstractmethod
    def _reset_params(self):
        """
        Reset any model-specific parameters. This gets called within the
        public `self.reset()` method.
        """
        pass


class EpsilonGreedy(BanditPolicyBase):
    def __init__(self, epsilon=0.05, ev_prior=0.5):
        r"""
        An epsilon-greedy policy for multi-armed bandit problems.

        Notes
        -----
        With probability :math:`\epsilon` the policy explores by picking an
        arm uniformly at random; otherwise it exploits by picking the arm with
        the highest current payoff estimate. The resulting action
        distribution is:

        .. math::

            P(a) = \left\{
                 \begin{array}{lr}
                   \epsilon / N + (1 - \epsilon) &\text{if }
                        a = \arg \max_{a' \in \mathcal{A}}
                            \mathbb{E}_{q_{\hat{\theta}}}[r \mid a']\\
                   \epsilon / N &\text{otherwise}
                 \end{array}
               \right.

        where :math:`N = |\mathcal{A}|` is the number of arms,
        :math:`q_{\hat{\theta}}` is the estimate of the arm payoff
        distribution under current model parameters :math:`\hat{\theta}`, and
        :math:`\mathbb{E}_{q_{\hat{\theta}}}[r \mid a']` is the expected
        reward under :math:`q_{\hat{\theta}}` of receiving reward `r` after
        taking action :math:`a'`.

        Parameters
        ----------
        epsilon : float in [0, 1]
            The probability of taking a random action. Default is 0.05.
        ev_prior : float
            The starting expected payoff for each arm before any data has been
            observed. Default is 0.5.
        """
        super().__init__()
        self.epsilon, self.ev_prior = epsilon, ev_prior
        self.pull_counts = defaultdict(lambda: 0)

    @property
    def parameters(self):
        """The current per-arm payoff estimates, keyed by ``"ev_estimates"``"""
        return dict(ev_estimates=self.ev_estimates)

    @property
    def hyperparameters(self):
        """The policy identifier together with `epsilon` and `ev_prior`"""
        hparams = {"id": "EpsilonGreedy"}
        hparams["epsilon"] = self.epsilon
        hparams["ev_prior"] = self.ev_prior
        return hparams

    def _initialize_params(self, bandit):
        """
        Seed the payoff estimate of every arm in `bandit` with the prior and
        mark the policy as initialized.
        """
        self.ev_estimates = dict.fromkeys(range(bandit.n_arms), self.ev_prior)
        self.is_initialized = True

    def _select_arm(self, bandit, context=None):
        """Explore with probability `epsilon`, otherwise pick the best arm."""
        explore = np.random.rand() < self.epsilon
        if explore:
            return np.random.choice(bandit.n_arms)

        # exploit: the first arm attaining the largest estimate wins ties
        estimates = self.ev_estimates
        return max(estimates, key=estimates.get)

    def _update_params(self, arm_id, reward, context=None):
        """Fold `reward` into the running mean estimate for `arm_id`."""
        self.pull_counts[arm_id] += 1
        n_pulls = self.pull_counts[arm_id]

        previous = self.ev_estimates[arm_id]
        self.ev_estimates[arm_id] = previous + (reward - previous) / n_pulls

    def _reset_params(self):
        """
        Discard the payoff estimates and pull counts. Invoked by the public
        `self.reset()` method.
        """
        self.pull_counts = defaultdict(lambda: 0)
        self.ev_estimates = {}


class UCB1(BanditPolicyBase):
    def __init__(self, C=1, ev_prior=0.5):
        r"""
        A UCB1 policy for multi-armed bandit problems.

        Notes
        -----
        The UCB1 algorithm [*]_ guarantees the cumulative regret is bounded by log
        `t`, where `t` is the current timestep. To make this guarantee UCB1
        assumes all arm payoffs are between 0 and 1.

        Under UCB1, the upper confidence bound on the expected value for
        pulling arm `a` at timestep `t` is:

        .. math::

            \text{UCB}(a, t) = \text{EV}_t(a) + C \sqrt{\frac{2 \log t}{N_t(a)}}

        where :math:`\text{EV}_t(a)` is the average of the rewards received so
        far from pulling arm `a`, `C` is a free parameter controlling the
        "optimism" of the confidence upper bound for :math:`\text{UCB}(a, t)`
        (for logarithmic regret bounds, `C` must equal 1), and :math:`N_t(a)`
        is the number of times arm `a` has been pulled during the previous `t -
        1` timesteps.

        References
        ----------
        .. [*] Auer, P., Cesa-Bianchi, N., & Fischer, P. (2002). Finite-time
           analysis of the multiarmed bandit problem. *Machine Learning,
           47(2)*.

        Parameters
        ----------
        C : float in (0, +infinity)
            A confidence/optimism parameter affecting the degree of
            exploration, where larger values encourage greater exploration. The
            UCB1 algorithm assumes `C=1`. Default is 1.
        ev_prior : float
            The starting expected value for each arm before any data has been
            observed. Default is 0.5.
        """
        self.C = C
        self.ev_prior = ev_prior
        super().__init__()
        # must exist before the first `act()`; previously it was only created
        # by `_reset_params`, so a fresh policy raised AttributeError
        self.pull_counts = defaultdict(lambda: 0)

    @property
    def parameters(self):
        """A dictionary containing the current policy parameters"""
        return {"ev_estimates": self.ev_estimates}

    @property
    def hyperparameters(self):
        """A dictionary containing the policy hyperparameters"""
        return {
            "C": self.C,
            "id": "UCB1",
            "ev_prior": self.ev_prior,
        }

    def _initialize_params(self, bandit):
        """
        Initialize any policy-specific parameters that depend on information
        from the bandit environment.
        """
        self.ev_estimates = {i: self.ev_prior for i in range(bandit.n_arms)}
        self.is_initialized = True

    def _select_arm(self, bandit, context=None):
        """Return the arm with the largest upper confidence bound."""
        # add eps to avoid divide-by-zero errors on the first pull of each arm
        eps = np.finfo(float).eps
        N, T = bandit.n_arms, self.step + 1
        E, C = self.ev_estimates, self.pull_counts
        # exploration bonus is C * sqrt(2 log t / N_t(a)), per Auer et al.
        scores = [
            E[a] + self.C * np.sqrt(2 * np.log(T) / (C[a] + eps)) for a in range(N)
        ]
        return np.argmax(scores)

    def _update_params(self, arm_id, reward, context=None):
        """Update the pull count and running mean payoff for `arm_id`."""
        E, C = self.ev_estimates, self.pull_counts
        C[arm_id] += 1
        E[arm_id] += (reward - E[arm_id]) / (C[arm_id])

    def _reset_params(self):
        """
        Reset any model-specific parameters. This gets called within the
        public `self.reset()` method.
        """
        self.ev_estimates = {}
        self.pull_counts = defaultdict(lambda: 0)


class ThompsonSamplingBetaBinomial(BanditPolicyBase):
    def __init__(self, alpha=1, beta=1):
        r"""
        A conjugate Thompson sampling [1]_ [2]_ policy for multi-armed bandits with
        Bernoulli likelihoods.

        Notes
        -----
        The policy assumes independent Beta priors on the Bernoulli arm payoff
        probabilities, :math:`\theta`:

        .. math::

            \theta_k \sim \text{Beta}(\alpha_k, \beta_k) \\
            r \mid \theta_k \sim \text{Bernoulli}(\theta_k)

        where :math:`k \in \{1,\ldots,K \}` indexes arms in the MAB and
        :math:`\theta_k` is the parameter of the Bernoulli likelihood for arm
        `k`. The sampler begins by selecting an arm with probability
        proportional to its payoff probability under the initial Beta prior.
        After pulling the sampled arm and receiving a reward, `r`, the sampler
        computes the posterior over the model parameters (arm payoffs) via
        Bayes' rule, and then samples a new action in proportion to its payoff
        probability under this posterior. This process (i.e., sample action
        from posterior, take action and receive reward, compute updated
        posterior) is repeated until the number of trials is exhausted.

        Note that due to the conjugacy between the Beta prior and Bernoulli
        likelihood the posterior for each arm will also be Beta-distributed and
        can be computed and sampled from efficiently:

        .. math::

            \theta_k \mid r \sim \text{Beta}(\alpha_k + r, \beta_k + 1 - r)

        References
        ----------
        .. [1] Thompson, W. (1933). On the likelihood that one unknown
           probability exceeds another in view of the evidence of two samples.
           *Biometrika, 25(3/4)*, 285-294.
        .. [2] Chapelle, O., & Li, L. (2011). An empirical evaluation of
           Thompson sampling. *Advances in Neural Information Processing
           Systems, 24*, 2249-2257.

        Parameters
        ----------
        alpha : float or list of length `K`
            Parameter for the Beta prior on arm payouts. If a float, this value
            will be used in the prior for all of the `K` arms.
        beta : float or list of length `K`
            Parameter for the Beta prior on arm payouts. If a float, this value
            will be used in the prior for all of the `K` arms.
        """
        super().__init__()
        self.alphas, self.betas = [], []
        self.alpha, self.beta = alpha, beta
        self.is_initialized = False

    @property
    def parameters(self):
        """A dictionary containing the current policy parameters"""
        return {
            "ev_estimates": self.ev_estimates,
            "alphas": self.alphas,
            "betas": self.betas,
        }

    @property
    def hyperparameters(self):
        """A dictionary containing the policy hyperparameters"""
        return {
            "id": "ThompsonSamplingBetaBinomial",
            "alpha": self.alpha,
            "beta": self.beta,
        }

    @staticmethod
    def _expand_prior(value, n_arms):
        """
        Return a fresh per-arm list of prior parameters: a scalar is repeated
        `n_arms` times, a sequence is copied so posterior updates never mutate
        the caller's object.
        """
        if is_number(value):
            return [value] * n_arms
        return list(value)

    def _initialize_params(self, bandit):
        """
        Initialize the per-arm Beta prior and the payoff estimates from the
        bandit environment. Only :class:`BernoulliBandit` is supported.
        """
        bhp = bandit.hyperparameters
        fstr = "ThompsonSamplingBetaBinomial only defined for BernoulliBandit, got: {}"
        assert bhp["id"] == "BernoulliBandit", fstr.format(bhp["id"])

        # initialize the model prior (scalar or per-arm list)
        self.alphas = self._expand_prior(self.alpha, bandit.n_arms)
        self.betas = self._expand_prior(self.beta, bandit.n_arms)
        assert len(self.alphas) == len(self.betas) == bandit.n_arms

        self.ev_estimates = {i: self._map_estimate(i, 1) for i in range(bandit.n_arms)}
        self.is_initialized = True

    def _select_arm(self, bandit, context):
        """Sample payoff probabilities from the posterior and act greedily."""
        if not self.is_initialized:
            self._initialize_params(bandit)

        # draw a sample from the current model posterior
        posterior_sample = np.random.beta(self.alphas, self.betas)

        # greedily select an action based on this sample
        return np.argmax(posterior_sample)

    def _update_params(self, arm_id, rwd, context):
        """
        Compute the parameters of the Beta posterior, P(payoff prob | rwd),
        for arm `arm_id`.
        """
        self.alphas[arm_id] += rwd
        self.betas[arm_id] += 1 - rwd
        self.ev_estimates[arm_id] = self._map_estimate(arm_id, rwd)

    def _map_estimate(self, arm_id, rwd):
        """Compute the current MAP estimate for an arm's payoff probability"""
        A, B = self.alphas, self.betas
        if A[arm_id] > 1 and B[arm_id] > 1:
            map_payoff_prob = (A[arm_id] - 1) / (A[arm_id] + B[arm_id] - 2)
        elif A[arm_id] < 1 and B[arm_id] < 1:
            map_payoff_prob = rwd  # 0 or 1 equally likely, make a guess
        elif A[arm_id] <= 1 and B[arm_id] > 1:
            map_payoff_prob = 0
        elif A[arm_id] > 1 and B[arm_id] <= 1:
            map_payoff_prob = 1
        else:
            # remaining cases (e.g. alpha == beta == 1) have no unique mode
            map_payoff_prob = 0.5
        return map_payoff_prob

    def _reset_params(self):
        """
        Reset any model-specific parameters. This gets called within the
        public `self.reset()` method.
        """
        self.alphas, self.betas = [], []
        self.ev_estimates = {}


class LinUCB(BanditPolicyBase):
    def __init__(self, alpha=1):
        """
        A disjoint linear UCB policy [*]_ for contextual linear bandits.

        Notes
        -----
        LinUCB is only defined for :class:`ContextualLinearBandit <numpy_ml.bandits.ContextualLinearBandit>` environments.

        References
        ----------
        .. [*] Li, L., Chu, W., Langford, J., & Schapire, R. (2010). A
           contextual-bandit approach to personalized news article
           recommendation. In *Proceedings of the 19th International Conference
           on World Wide Web*, 661-670.

        Parameters
        ----------
        alpha : float
            A confidence/optimism parameter affecting the amount of
            exploration. Default is 1.
        """  # noqa
        super().__init__()

        self.alpha = alpha
        # per-arm design matrices and reward-weighted feature sums; filled in
        # by `_initialize_params` once the bandit dimensions are known
        self.A, self.b = [], []
        self.is_initialized = False

    @property
    def parameters(self):
        """A dictionary containing the current policy parameters"""
        return {"ev_estimates": self.ev_estimates, "A": self.A, "b": self.b}

    @property
    def hyperparameters(self):
        """A dictionary containing the policy hyperparameters"""
        return {
            "id": "LinUCB",
            "alpha": self.alpha,
        }

    def _initialize_params(self, bandit):
        """
        Initialize any policy-specific parameters that depend on information
        from the bandit environment.

        Each arm gets a `(D, D)` identity matrix in `self.A` and a length-`D`
        zero vector in `self.b`, where `D = bandit.D`.
        """
        bhp = bandit.hyperparameters
        fstr = "LinUCB only defined for contextual linear bandits, got: {}"
        assert bhp["id"] == "ContextualLinearBandit", fstr.format(bhp["id"])

        self.A, self.b = [], []
        for _ in range(bandit.n_arms):
            self.A.append(np.eye(bandit.D))
            self.b.append(np.zeros(bandit.D))

        self.is_initialized = True

    def _select_arm(self, bandit, context):
        """
        Return the index of the arm with the largest upper confidence bound.

        Column `a` of `context` is used as the feature vector for arm `a`.
        """
        probs = []
        for a in range(bandit.n_arms):
            C, A, b = context[:, a], self.A[a], self.b[a]
            A_inv = np.linalg.inv(A)
            theta_hat = A_inv @ b  # ridge-regression coefficient estimate
            # predicted payoff plus an exploration bonus scaled by `alpha`
            p = theta_hat @ C + self.alpha * np.sqrt(C.T @ A_inv @ C)

            probs.append(p)
        return np.argmax(probs)

    def _update_params(self, arm_id, rwd, context):
        """
        Update `A` and `b` for the arm that was just played.

        With `x = context[:, arm_id]`, the update is ``A += x x^T`` and
        ``b += rwd * x``.
        """
        x = context[:, arm_id]
        # `x @ x.T` on a 1-D array is the inner product (a scalar), which would
        # be broadcast onto every entry of A; the update needs the rank-one
        # outer product instead
        self.A[arm_id] += np.outer(x, x)
        self.b[arm_id] += rwd * x

    def _reset_params(self):
        """
        Reset any model-specific parameters. This gets called within the
        public `self.reset()` method.
        """
        self.A, self.b = [], []
        self.ev_estimates = {}


================================================
FILE: numpy_ml/bandits/trainer.py
================================================
"""A trainer/runner object for executing and comparing MAB policies."""

import warnings
import os.path as op
from collections import defaultdict

import numpy as np

from numpy_ml.utils.testing import DependencyWarning

try:
    import matplotlib.pyplot as plt

    _PLOTTING = True
except ImportError:
    fstr = "Cannot import matplotlib. Plotting functionality disabled."
    warnings.warn(fstr, DependencyWarning)
    _PLOTTING = False


def get_scriptdir():
    """Return the directory that holds this script, with symlinks resolved"""
    script_path = op.realpath(__file__)
    return op.dirname(script_path)


def mse(bandit, policy):
    """
    Mean squared error between a policy's expected-payout estimates and the
    bandit's true expected payouts.

    The entries of `policy.ev_estimates` are ordered by key and paired
    position-wise with `bandit.arm_evs`. NaN is returned when the policy has no
    `ev_estimates` attribute or the attribute is empty.
    """
    has_estimates = hasattr(policy, "ev_estimates") and len(policy.ev_estimates) > 0
    if not has_estimates:
        return np.nan

    true_evs = bandit.arm_evs
    ordered = sorted(policy.ev_estimates.items(), key=lambda kv: kv[0])
    squared_errors = [(est - ev) ** 2 for (_, est), ev in zip(ordered, true_evs)]
    return np.mean(squared_errors)


def smooth(prev, cur, weight):
    r"""
    Blend the previous smoothed value with the current observation.

    Notes
    -----
    The smoothed value at timestep `t`, :math:`\tilde{X}_t`, is

    .. math::

        \tilde{X}_t = \epsilon \tilde{X}_{t-1} + (1 - \epsilon) X_t

    where :math:`X_t` is the raw value at timestep `t`,
    :math:`\tilde{X}_{t-1}` is the smoothed value at timestep `t-1`, and
    :math:`\epsilon` is the smoothing weight.

    Parameters
    ----------
    prev : float or :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        The smoothed signal at the immediately preceding timestep.
    cur : float or :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        The raw signal at the current timestep.
    weight : float or :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        The smoothing weight. Values near 0 give little smoothing, values
        near 1 give aggressive smoothing. If an array is passed, each entry
        acts as the smoothing weight for the corresponding entry of `cur`.

    Returns
    -------
    smoothed : float or :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        The smoothed signal
    """
    carried_over = weight * prev
    fresh = (1 - weight) * cur
    return carried_over + fresh


class BanditTrainer:
    def __init__(self):
        """
        An object to facilitate multi-armed bandit training, comparison, and
        evaluation.
        """
        self.logs = {}

    def compare(
        self,
        policies,
        bandit,
        n_trials,
        n_duplicates,
        plot=True,
        seed=None,
        smooth_weight=0.999,
        out_dir=None,
    ):
        """
        Compare the performance of multiple policies on the same bandit
        environment, generating a plot for each.

        Parameters
        ----------
        policies : list of :class:`BanditPolicyBase <numpy_ml.bandits.policies.BanditPolicyBase>` instances
            The multi-armed bandit policies to compare.
        bandit : :class:`Bandit <numpy_ml.bandits.bandits.Bandit>` instance
            The environment to train the policies on.
        n_trials : int
            The number of trials per run.
        n_duplicates: int
            The number of times to evaluate each policy on the bandit
            environment. Larger values permit a better estimate of the
            variance in payoff / cumulative regret for each policy.
        plot : bool
            Whether to generate a plot of the policy's average reward and
            regret across the episodes. Default is True.
        seed : int
            The seed for the random number generator. The generator is
            re-seeded before each policy is trained. If None, the generator is
            left untouched. Default is None.
        smooth_weight : float in [0, 1]
            The smoothing weight. Values closer to 0 result in less smoothing,
            values closer to 1 produce more aggressive smoothing. Default is
            0.999.
        out_dir : str or None
            Plots will be saved to this directory if `plot` is True. If
            `out_dir` is None, plots will not be saved. Default is None.
        """  # noqa: E501
        self.init_logs(policies)

        # `and` short-circuits, so the module-level matplotlib flag is only
        # consulted when a plot was actually requested
        do_plot = plot and _PLOTTING

        all_axes = [None] * len(policies)
        if do_plot:
            # squeeze=False keeps `all_axes` two-dimensional, so a single
            # policy still yields one (reward, regret) row of axes
            fig, all_axes = plt.subplots(len(policies), 2, sharex=True, squeeze=False)
            fig.set_size_inches(10.5, len(policies) * 5.25)

        for policy, axes in zip(policies, all_axes):
            # 0 is a valid seed, so test against None rather than truthiness
            if seed is not None:
                np.random.seed(seed)

            bandit.reset()
            policy.reset()

            self.train(
                policy,
                bandit,
                n_trials,
                n_duplicates,
                axes=axes,
                plot=plot,
                verbose=False,
                out_dir=out_dir,
                smooth_weight=smooth_weight,
            )

        # there are no axes to adjust or figures to save unless we plotted
        if do_plot:
            # enforce the same y-ranges across plots for straightforward
            # comparison
            a1_lims = [a1.get_ylim() for (a1, _) in all_axes]
            a2_lims = [a2.get_ylim() for (_, a2) in all_axes]

            a1_min = min(lo for (lo, _) in a1_lims)
            a1_max = max(hi for (_, hi) in a1_lims)
            a2_min = min(lo for (lo, _) in a2_lims)
            a2_max = max(hi for (_, hi) in a2_lims)

            for (a1, a2) in all_axes:
                a1.set_ylim(a1_min, a1_max)
                a2.set_ylim(a2_min, a2_max)

            if out_dir is not None:
                plt.savefig(op.join(out_dir, "bandit_comparison.png"), dpi=300)
            plt.show()

    def train(
        self,
        policy,
        bandit,
        n_trials,
        n_duplicates,
        plot=True,
        axes=None,
        verbose=True,
        print_every=100,
        smooth_weight=0.999,
        out_dir=None,
    ):
        """
        Train a MAB policy on a multi-armed bandit problem, logging training
        statistics along the way.

        Parameters
        ----------
        policy : :class:`BanditPolicyBase <numpy_ml.bandits.policies.BanditPolicyBase>` instance
            The multi-armed bandit policy to train.
        bandit : :class:`Bandit <numpy_ml.bandits.bandits.Bandit>` instance
            The environment to run the policy on.
        n_trials : int
            The number of trials per run.
        n_duplicates: int
            The number of runs to evaluate
        plot : bool
            Whether to generate a plot of the policy's average reward and
            regret across the episodes. Default is True.
        axes : list of :py:class:`Axis <matplotlib.axes.Axis>` instances or None
            If not None and ``plot = True``, these are the axes that will be
            used to plot the cumulative reward and regret, respectively.
            Default is None.
        verbose : boolean
            Whether to print run statistics during training. Default is True.
        print_every : int
            The number of episodes to run before printing loss values to
            stdout. This is ignored if ``verbose`` is false. Default is 100.
        smooth_weight : float in [0, 1]
            The smoothing weight. Values closer to 0 result in less smoothing,
            values closer to 1 produce more aggressive smoothing. Default is
            0.999.
        out_dir : str or None
            Plots will be saved to this directory if `plot` is True. If
            `out_dir` is None, plots will not be saved. Default is None.

        Returns
        -------
        policy : :class:`BanditPolicyBase <numpy_ml.bandits.policies.BanditPolicyBase>` instance
            The policy trained during the last (i.e. most recent) duplicate
            run.
        """  # noqa: E501
        # NOTE: `init_logs` replaces the whole log dictionary, so logs of any
        # previously trained policies are discarded here
        if str(policy) not in self.logs:
            self.init_logs(policy)

        p = str(policy)
        D, L = n_duplicates, self.logs

        for d in range(D):
            if verbose:
                print("\nDUPLICATE {}/{}\n".format(d + 1, D))

            bandit.reset()
            policy.reset()

            avg_oracle_reward, cregret = 0, 0
            for trial_id in range(n_trials):
                rwd, arm, orwd, oarm = self._train_step(bandit, policy)

                loss = mse(bandit, policy)
                regret = orwd - rwd

                avg_oracle_reward += orwd
                cregret += regret

                # trials are logged with 1-based indices
                L[p]["mse"][trial_id + 1].append(loss)
                L[p]["reward"][trial_id + 1].append(rwd)
                L[p]["regret"][trial_id + 1].append(regret)
                L[p]["cregret"][trial_id + 1].append(cregret)
                L[p]["optimal_arm"][trial_id + 1].append(oarm)
                L[p]["selected_arm"][trial_id + 1].append(arm)
                L[p]["optimal_reward"][trial_id + 1].append(orwd)

                if (trial_id + 1) % print_every == 0 and verbose:
                    fstr = "Trial {}/{}, {}/{}, Regret: {:.4f}"
                    print(fstr.format(trial_id + 1, n_trials, d + 1, D, regret))

            avg_oracle_reward /= n_trials

            if verbose:
                self._print_run_summary(bandit, policy, regret)

        if plot and _PLOTTING:
            # the oracle reward of the final duplicate is used as the
            # reference line in the reward plot
            self._plot_reward(avg_oracle_reward, policy, smooth_weight, axes, out_dir)

        return policy

    def _train_step(self, bandit, policy):
        """
        Run a single trial and return the tuple
        `(reward, arm, oracle_reward, oracle_arm)`.

        A context is requested from the bandit only if it exposes a
        `get_context` method; otherwise None is passed to the policy.
        """
        P, B = policy, bandit
        C = B.get_context() if hasattr(B, "get_context") else None
        rwd, arm = P.act(B, C)
        oracle_rwd, oracle_arm = B.oracle_payoff(C)
        return rwd, arm, oracle_rwd, oracle_arm

    def init_logs(self, policies):
        """
        Initialize the episode logs.

        Notes
        -----
        Training logs are represented as a nested set of dictionaries with the
        following structure:

            log[model_id][metric][trial_number][duplicate_number]

        For example, ``logs['model1']['regret'][3][1]`` holds the regret value
        accrued on the 3rd trial of the 2nd duplicate run for model1.

        Available fields are 'regret', 'cregret' (cumulative regret), 'reward',
        'mse' (mean-squared error between estimated arm EVs and the true EVs),
        'optimal_arm', 'selected_arm', and 'optimal_reward'.

        Any existing logs are discarded. `policies` may be a single policy or
        a list of policies.
        """
        if not isinstance(policies, list):
            policies = [policies]

        metrics = (
            "mse",
            "regret",
            "reward",
            "cregret",
            "optimal_arm",
            "selected_arm",
            "optimal_reward",
        )
        self.logs = {str(p): {m: defaultdict(list) for m in metrics} for p in policies}

    def _print_run_summary(self, bandit, policy, regret):
        """
        Print the estimated vs. true expected value of each arm, followed by
        the final MSE and regret. Does nothing if the policy has no expected
        value estimates.
        """
        if not hasattr(policy, "ev_estimates") or len(policy.ev_estimates) == 0:
            return None

        evs, se = bandit.arm_evs, []
        fstr = "Arm {}: {:.4f} v. {:.4f}"
        ests = sorted(policy.ev_estimates.items(), key=lambda x: x[0])
        print("\n\nEstimated vs. Real EV\n" + "-" * 21)
        for ix, (est, ev) in enumerate(zip(ests, evs), start=1):
            print(fstr.format(ix, est[1], ev))
            se.append((est[1] - ev) ** 2)
        fstr = "\nFinal MSE: {:.4f}\nFinal Regret: {:.4f}\n\n"
        print(fstr.format(np.mean(se), regret))

    def _plot_reward(self, optimal_rwd, policy, smooth_weight, axes=None, out_dir=None):
        """
        Plot the smoothed average reward and cumulative regret for `policy`.

        If `axes` is None a new two-panel figure is created, optionally saved
        to `out_dir`, and shown. Otherwise the curves are drawn on the two
        supplied axes and the figure is left for the caller to save and show.
        The two axes used are returned.
        """
        L = self.logs[str(policy)]
        smds = self._smoothed_metrics(policy, optimal_rwd, smooth_weight)

        if axes is None:
            fig, [ax1, ax2] = plt.subplots(1, 2)
        else:
            assert len(axes) == 2
            ax1, ax2 = axes

        e_ids = range(1, len(L["reward"]) + 1)
        plot_params = [[ax1, ax2], ["reward", "cregret"], ["b", "r"], [optimal_rwd, 0]]

        for (ax, m, c, opt) in zip(*plot_params):
            avg, std = "sm_{}_avg sm_{}_std".format(m, m).split()
            ax.plot(e_ids, smds[avg], color=c)
            ax.axhline(opt, 0, 1, color=c, ls="--")  # reference value
            ax.fill_between(
                e_ids,
                smds[avg] + smds[std],
                smds[avg] - smds[std],
                color=c,
                alpha=0.25,
            )
            ax.set_xlabel("Trial")
            m = "Cumulative Regret" if m == "cregret" else m
            ax.set_ylabel("Smoothed Avg. {}".format(m.title()))

            if axes is None:
                # make the panel square; pass a plain scalar rather than a
                # one-element array
                aspect = np.diff(ax.get_xlim()) / np.diff(ax.get_ylim())
                ax.set_aspect(aspect.item())
            else:
                ax.set_title(str(policy))

        if axes is None:
            fig.suptitle(str(policy))
            fig.tight_layout()

            if out_dir is not None:
                bid = policy.hyperparameters["id"]
                plt.savefig(op.join(out_dir, f"{bid}.png"), dpi=300)
            plt.show()
        return ax1, ax2

    def _smoothed_metrics(self, policy, optimal_rwd, smooth_weight):
        """
        Compute, for every logged metric, the per-trial mean and standard
        deviation (across duplicates) of the exponentially smoothed values.

        Returns a dict with keys ``"sm_<metric>_avg"`` and
        ``"sm_<metric>_std"``, each an array with one entry per logged trial.
        `optimal_rwd` is accepted but not used.
        """
        L = self.logs[str(policy)]
        n_logged = len(L["reward"])

        # `init_logs` never creates a "selections" entry; the filter is kept
        # so that such an entry, if present, is still skipped
        metrics = [m for m in L if m != "selections"]

        # pre-allocate smoothed data structure, seeded with the first trial
        smds = {}
        for m in metrics:
            smds["sm_{}_avg".format(m)] = np.zeros(n_logged)
            smds["sm_{}_avg".format(m)][0] = np.mean(L[m][1])

            smds["sm_{}_std".format(m)] = np.zeros(n_logged)
            smds["sm_{}_std".format(m)][0] = np.std(L[m][1])

        # running smoothed value of each metric, one entry per duplicate
        smoothed = {m: L[m][1] for m in metrics}
        for e_id in range(2, n_logged + 1):
            for m in metrics:
                prev, cur = smoothed[m], L[m][e_id]
                smoothed[m] = [smooth(p, c, smooth_weight) for p, c in zip(prev, cur)]
                smds["sm_{}_avg".format(m)][e_id - 1] = np.mean(smoothed[m])
                smds["sm_{}_std".format(m)][e_id - 1] = np.std(smoothed[m])
        return smds


================================================
FILE: numpy_ml/factorization/README.md
================================================
# Factors
The `factors.py` module implements common approximate matrix-factorization
algorithms, including:

- Regularized alternating least squares (ALS)
- Non-negative matrix factorization via fast hierarchical least squares (HALS) ([Cichocki & Phan, 2008](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.214.6398&rep=rep1&type=pdf))


================================================
FILE: numpy_ml/factorization/__init__.py
================================================
"""Algorithms for approximate matrix factorization"""

from .factors import *


================================================
FILE: numpy_ml/factorization/factors.py
================================================
"""Algorithms for approximate matrix factorization"""

from copy import deepcopy

import numpy as np


class VanillaALS:
    def __init__(self, K, alpha=1, max_iter=200, tol=1e-4):
        r"""
        Approximately factor a real-valued matrix using regularized alternating
        least-squares (ALS).

        Notes
        -----
        The regularized ALS minimization problem is

        .. math::

            \min_{\mathbf{W}, \mathbf{H}} ||\mathbf{X} - \mathbf{WH}||^2 +
                \alpha \left(
                    ||\mathbf{W}||^2 + ||\mathbf{H}||^2
                \right)

        where :math:`||\cdot||` denotes the Frobenius norm, **X** is the
        :math:`N \times M` data matrix, :math:`\mathbf{W}` and
        :math:`\mathbf{H}` are learned factor matrices with dimensions :math:`N
        \times K` and :math:`K \times M`, respectively, and :math:`\alpha` is a
        user-defined regularization weight.

        ALS proceeds by alternating between fixing **W** and optimizing for
        **H** and fixing **H** and optimizing for **W**. Vanilla ALS has no
        convergence guarantees and the objective function is prone to
        oscillation across updates, particularly for dense input matrices [1]_.

        References
        ----------
        .. [1] Gillis, N. (2014). The why and how of nonnegative matrix
           factorization.  *Regularization, optimization, kernels, and support
           vector machines, 12(257)*, 257-291.

        Parameters
        ----------
        K : int
            The number of latent factors to include in the factor matrices W
            and H.
        alpha : float
            The L2 regularization weight on the factor matrices. Larger
            values result in more aggressive regularization. Default is 1.
        max_iter : int
            The maximum number of iterations to run before stopping. Default is
            200.
        tol : float
            The tolerance for the stopping condition. Iteration stops once the
            regularized loss is at or below this value. Default is 1e-4.
        """
        self.K = K
        self.W = None
        self.H = None
        self.tol = tol
        self.alpha = alpha
        self.max_iter = max_iter

    @property
    def parameters(self):
        """Return a dictionary of the current model parameters"""
        return {"W": self.W, "H": self.H}

    @property
    def hyperparameters(self):
        """Return a dictionary of the model hyperparameters"""
        return {
            "id": "ALSFactor",
            "K": self.K,
            "tol": self.tol,
            "alpha": self.alpha,
            "max_iter": self.max_iter,
        }

    def _init_factor_matrices(self, X, W=None, H=None):
        """
        Randomly initialize any factor matrix that was not supplied and store
        both on the instance.
        """
        N, M = X.shape

        # the scale is the square root of a mean magnitude; a non-positive
        # mean would give NaN (or all-zero) factors, so fall back to the mean
        # absolute value in that case
        mean_mag = X.mean()
        if not mean_mag > 0:
            mean_mag = np.abs(X).mean()
        scale = np.sqrt(mean_mag / self.K)

        self.W = np.random.rand(N, self.K) * scale if W is None else W
        self.H = np.random.rand(self.K, M) * scale if H is None else H

        assert self.W.shape == (N, self.K)
        assert self.H.shape == (self.K, M)

    def _loss(self, X, Xhat):
        """
        Regularized Frobenius loss of the reconstruction `Xhat`, with the
        penalty evaluated on the factors currently stored in `self.W` and
        `self.H`.
        """
        alpha, W, H = self.alpha, self.W, self.H
        sq_fnorm = lambda x: np.sum(x ** 2)  # noqa: E731
        return sq_fnorm(X - Xhat) + alpha * (sq_fnorm(W) + sq_fnorm(H))

    def _update_factor(self, X, A):
        """
        Perform the ALS update: the ridge-regression solution
        ``X A (A^T A + alpha I)^{-1}`` for one factor with the other held
        fixed.
        """
        T1 = np.linalg.inv(A.T @ A + self.alpha * np.eye(self.K))
        return X @ A @ T1

    def fit(self, X, W=None, H=None, n_initializations=10, verbose=False):
        """
        Factor a data matrix into two low rank factors via ALS.

        Parameters
        ----------
        X : numpy array of shape `(N, M)`
            The data matrix to factor.
        W : numpy array of shape `(N, K)` or None
            An initial value for the `W` factor matrix. If None, initialize `W`
            randomly. Default is None.
        H : numpy array of shape `(K, M)` or None
            An initial value for the `H` factor matrix. If None, initialize `H`
            randomly. Default is None.
        n_initializations : int
            Number of re-initializations of the algorithm to perform before
            taking the answer with the lowest reconstruction error. This value
            is ignored and set to 1 if both `W` and `H` are not None. Default
            is 10.
        verbose : bool
            Whether to print the loss at each iteration. Default is False.

        Raises
        ------
        ValueError
            If `n_initializations` is less than 1.
        """
        if W is not None and H is not None:
            n_initializations = 1

        if n_initializations < 1:
            raise ValueError("n_initializations must be at least 1")

        best_loss, best_W, best_H = np.inf, None, None
        for f in range(n_initializations):
            if verbose:
                print("\nINITIALIZATION {}".format(f + 1))

            new_W, new_H, loss = self._fit(X, W, H, verbose)

            # always keep the first run, and let any later run replace a NaN
            # best, so that a result is recorded even when losses are NaN
            if best_W is None or np.isnan(best_loss) or loss <= best_loss:
                best_loss = loss
                best_W, best_H = deepcopy(new_W), deepcopy(new_H)

        self.W, self.H = best_W, best_H

        if verbose:
            print("\nFINAL LOSS: {}".format(best_loss))

    def _fit(self, X, W, H, verbose):
        """
        Run one ALS optimization from a single initialization and return the
        tuple `(W, H, loss)`.
        """
        self._init_factor_matrices(X, W, H)

        # defined up front so a value is returned even when max_iter is 0
        loss = self._loss(X, self.W @ self.H)

        for i in range(self.max_iter):
            # keep the current factors on the instance so that the penalty
            # term in `_loss` is evaluated on them rather than on the initial
            # matrices
            self.W = self._update_factor(X, self.H.T)
            self.H = self._update_factor(X.T, self.W).T

            loss = self._loss(X, self.W @ self.H)

            if verbose:
                print("[Iter {}] Loss: {:.8f}".format(i + 1, loss))

            if loss <= self.tol:
                break

        return self.W, self.H, loss


class NMF:
    def __init__(self, K, max_iter=200, tol=1e-4):
        r"""
        Nonnegative matrix factorization (NMF) performed using fast
        hierarchical alternating least squares (HALS) [*]_.

        Notes
        -----
        The NMF minimization problem is

        .. math::

            \min_{\mathbf{W}, \mathbf{H}} ||\mathbf{X} - \mathbf{WH}||^2
                \ \ \ \ \text{subject to } \mathbf{W}, \mathbf{H} \geq 0

        where :math:`||\cdot||` denotes the Frobenius norm, and the notation
        :math:`\mathbf{A} \geq 0` indicates that each element of **A** is
        greater than or equal to 0. In the above equation, **X** is the
        :math:`N \times M` data matrix, :math:`\mathbf{W}` and
        :math:`\mathbf{H}` are learned factor matrices with dimensions :math:`N
        \times K` and :math:`K \times M`, respectively.

        As with other ALS-based approaches, there is no guarantee that NMF will
        converge to a stationary point, let alone a global minimum. As a result
        it is generally good practice to run the algorithm multiple times with
        different initializations, taking the outcome that achieves the lowest
        reconstruction error.

        References
        ----------
        .. [*] Cichocki, A., & Phan, A. (2009). Fast local algorithms for
           large scale nonnegative matrix and tensor factorizations. *IEICE
           Transactions on Fundamentals of Electronics, Communications and
           Computer Sciences, 92(3)*, 708-721.

        Parameters
        ----------
        K : int
            The number of latent factors to include in the factor matrices **W**
            and **H**.
        max_iter : int
            The maximum number of iterations to run before stopping. Default is
            200.
        tol : float
            The tolerance for the stopping condition. Iteration stops once the
            reconstruction loss is at or below this value. Default is 1e-4.
        """
        self.K = K
        self.W = None
        self.H = None
        self.tol = tol
        self.max_iter = max_iter

    @property
    def parameters(self):
        """Return a dictionary of the current model parameters"""
        return {"W": self.W, "H": self.H}

    @property
    def hyperparameters(self):
        """Return a dictionary of the model hyperparameters"""
        return {
            "id": "NMF",
            "K": self.K,
            "tol": self.tol,
            "max_iter": self.max_iter,
        }

    def _init_factor_matrices(self, X, W, H):
        """
        Initialize the factor matrices, using vanilla ALS when `W` is not
        supplied.

        Supplied matrices are copied into float arrays: the HALS updates write
        into the factors in place, and the caller's arrays must not change.
        """
        ALS = None
        N, M = X.shape

        # initialize factors using ALS if not already defined
        if W is None:
            ALS = VanillaALS(self.K, alpha=0, max_iter=200)
            ALS.fit(X, verbose=False)
            W = ALS.W / np.linalg.norm(ALS.W, axis=0)
        else:
            W = np.array(W, dtype=float)  # private float copy

        if H is None:
            H = np.random.rand(self.K, M) if ALS is None else ALS.H
        else:
            H = np.array(H, dtype=float)  # private float copy

        assert W.shape == (N, self.K)
        assert H.shape == (self.K, M)

        self.H = H
        self.W = W

    def _loss(self, X, Xhat):
        """Return the least-squares reconstruction loss between X and Xhat"""
        return np.sum((X - Xhat) ** 2)

    def _update_H(self, X, W, H):
        """
        Perform the fast HALS update for H. `H` is modified in place and
        returned.
        """
        eps = np.finfo(float).eps
        XtW = X.T @ W  # dim: (M, K)
        WtW = W.T @ W  # dim: (K, K)

        # no explicit division by the squared column norm of W appears here;
        # `_update_W` and the ALS initialization rescale the columns of W to
        # unit norm
        for k in range(self.K):
            H[k, :] += XtW[:, k] - H.T @ WtW[:, k]
            H[k, :] = np.clip(H[k, :], eps, np.inf)  # enforce nonnegativity
        return H

    def _update_W(self, X, W, H):
        """
        Perform the fast HALS update for W. `W` is modified in place and
        returned, with each column rescaled to unit norm.
        """
        eps = np.finfo(float).eps
        XHt = X @ H.T  # dim: (N, K)
        HHt = H @ H.T  # dim: (K, K)

        for k in range(self.K):
            W[:, k] = W[:, k] * HHt[k, k] + XHt[:, k] - W @ HHt[:, k]
            W[:, k] = np.clip(W[:, k], eps, np.inf)  # enforce nonnegativity

            # renormalize the new column
            n = np.linalg.norm(W[:, k])
            W[:, k] /= n if n > 0 else 1.0
        return W

    def fit(self, X, W=None, H=None, n_initializations=10, verbose=False):
        r"""
        Factor a data matrix into two nonnegative low rank factor matrices via
        fast HALS.

        Notes
        -----
        This method implements Algorithm 2 from [*]_. In contrast to vanilla
        ALS, HALS proceeds by minimizing a *set* of local cost functions with
        the same global minima. Each cost function is defined on a "residue" of
        the factor matrices **W** and **H**:

        .. math::

           \mathbf{X}^{(j)} :=
                \mathbf{X} - \mathbf{WH} + \mathbf{w}_j \mathbf{h}_j^\top

        where :math:`\mathbf{X}^{(j)}` is the :math:`j^{th}` residue, **X** is
        the input data matrix, :math:`\mathbf{w}_j` is the :math:`j^{th}`
        column of the current factor matrix **W**, and
        :math:`\mathbf{h}_j^\top` is the :math:`j^{th}` row of the current
        factor matrix **H**. HALS proceeds by minimizing the cost for each
        residue with respect to :math:`\mathbf{h}_j` and
        :math:`\mathbf{w}_j` in turn. In either case, the cost for
        residue `j`, :math:`\mathcal{L}^{(j)}` is simply:

        .. math::

            \mathcal{L}^{(j)} :=
                || \mathbf{X}^{(j)} - \mathbf{w}_j \mathbf{h}_j^\top ||

        where :math:`||\cdot||` denotes the Frobenius norm. For NMF,
        minimization is performed under the constraint that all elements of
        both **W** and **H** are nonnegative.

        References
        ----------
        .. [*] Cichocki, A., & Phan, A. (2009). Fast local algorithms for
           large scale nonnegative matrix and tensor factorizations. *IEICE
           Transactions on Fundamentals of Electronics, Communications and
           Computer Sciences, 92(3)*, 708-721.

        Parameters
        ----------
        X : numpy array of shape `(N, M)`
            The data matrix to factor.
        W : numpy array of shape `(N, K)` or None
            An initial value for the `W` factor matrix. If None, initialize
            **W** using vanilla ALS. The array is copied and never modified.
            Default is None.
        H : numpy array of shape `(K, M)` or None
            An initial value for the `H` factor matrix. If None, initialize
            **H** using vanilla ALS (or randomly, if `W` was supplied). The
            array is copied and never modified. Default is None.
        n_initializations : int
            Number of re-initializations of the algorithm to perform before
            taking the answer with the lowest reconstruction error. This value
            is ignored and set to 1 if both `W` and `H` are not None. Default
            is 10.
        verbose : bool
            Whether to print the loss at each iteration. Default is False.

        Raises
        ------
        ValueError
            If `n_initializations` is less than 1.
        """
        if W is not None and H is not None:
            n_initializations = 1

        if n_initializations < 1:
            raise ValueError("n_initializations must be at least 1")

        best_loss, best_W, best_H = np.inf, None, None
        for f in range(n_initializations):
            if verbose:
                print("\nINITIALIZATION {}".format(f + 1))

            new_W, new_H, loss = self._fit(X, W, H, verbose)

            # always keep the first run, and let any later run replace a NaN
            # best, so that a result is recorded even when losses are NaN
            if best_W is None or np.isnan(best_loss) or loss <= best_loss:
                best_loss = loss
                best_W, best_H = deepcopy(new_W), deepcopy(new_H)

        self.W, self.H = best_W, best_H
        if verbose:
            print("\nFINAL LOSS: {}".format(best_loss))

    def _fit(self, X, W, H, verbose):
        """
        Run one HALS optimization from a single initialization and return the
        tuple `(W, H, loss)`.
        """
        self._init_factor_matrices(X, W, H)

        W, H = self.W, self.H

        # defined up front so a value is returned even when max_iter is 0
        loss = self._loss(X, W @ H)

        for i in range(self.max_iter):
            H = self._update_H(X, W, H)
            W = self._update_W(X, W, H)
            loss = self._loss(X, W @ H)

            if verbose:
                print("[Iter {}] Loss: {:.8f}".format(i + 1, loss))

            if loss <= self.tol:
                break
        return W, H, loss


================================================
FILE: numpy_ml/gmm/README.md
================================================
# Gaussian Mixture Models
The `gmm.py` module implements the standard (i.e., non-Bayesian) [Gaussian mixture model](https://en.wikipedia.org/wiki/Mixture_model#Gaussian_mixture_model) with maximum-likelihood parameter estimates via the [EM algorithm](https://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm).

## Plots
<p align="center">
  <img src="img/plot.png" height="700" />
</p>


================================================
FILE: numpy_ml/gmm/__init__.py
================================================
from .gmm import *


================================================
FILE: numpy_ml/gmm/gmm.py
================================================
"""A Gaussian mixture model class"""
import numpy as np

from numpy_ml.utils.misc import logsumexp, log_gaussian_pdf


class GMM(object):
    def __init__(self, C=3, seed=None):
        """
        A Gaussian mixture model trained via the expectation maximization
        algorithm.

        Parameters
        ----------
        C : int
            The number of clusters / mixture components in the GMM. Default is
            3.
        seed : int
            Seed for the random number generator. Default is None.

        Attributes
        ----------
        N : int
            The number of examples in the training dataset.
        d : int
            The dimension of each example in the training dataset.
        pi : :py:class:`ndarray <numpy.ndarray>` of shape `(C,)`
            The cluster priors.
        Q : :py:class:`ndarray <numpy.ndarray>` of shape `(N, C)`
            The variational distribution `q(T)`.
        mu : :py:class:`ndarray <numpy.ndarray>` of shape `(C, d)`
            The cluster means.
        sigma : :py:class:`ndarray <numpy.ndarray>` of shape `(C, d, d)`
            The cluster covariance matrices.
        """
        self.elbo = None
        self.parameters = {}
        self.hyperparameters = {
            "C": C,
            "seed": seed,
        }

        self.is_fit = False

        if seed:
            np.random.seed(seed)

    def _initialize_params(self, X):
        """Randomly initialize the starting GMM parameters."""
        N, d = X.shape
        C = self.hyperparameters["C"]

        rr = np.random.rand(C)

        self.parameters = {
            "pi": rr / rr.sum(),  # cluster priors
            "Q": np.zeros((N, C)),  # variational distribution q(T)
            "mu": np.random.uniform(-5, 10, C * d).reshape(C, d),  # cluster means
            "sigma": np.array([np.eye(d) for _ in range(C)]),  # cluster covariances
        }

        self.elbo = None
        self.is_fit = False

    def likelihood_lower_bound(self, X):
        """Compute the LLB under the current GMM parameters."""
        N = X.shape[0]
        P = self.parameters
        C = self.hyperparameters["C"]
        pi, Q, mu, sigma = P["pi"], P["Q"], P["mu"], P["sigma"]

        eps = np.finfo(float).eps
        expec1, expec2 = 0.0, 0.0
        for i in range(N):
            x_i = X[i]

            for c in range(C):
                pi_k = pi[c]
                z_nk = Q[i, c]
                mu_k = mu[c, :]
                sigma_k = sigma[c, :, :]

                log_pi_k = np.log(pi_k + eps)
                log_p_x_i = log_gaussian_pdf(x_i, mu_k, sigma_k)
                prob = z_nk * (log_p_x_i + log_pi_k)

                expec1 += prob
                expec2 += z_nk * np.log(z_nk + eps)

        loss = expec1 - expec2
        return loss

    def fit(self, X, max_iter=100, tol=1e-3, verbose=False):
        """
        Fit the parameters of the GMM on some training data.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, d)`
            A collection of `N` training data points, each with dimension `d`.
        max_iter : int
            The maximum number of EM updates to perform before terminating
            training. Default is 100.
        tol : float
            The convergence tolerance. Training is terminated if the difference
            in VLB between the current and previous iteration is less than
            `tol`. Default is 1e-3.
        verbose : bool
            Whether to print the VLB at each training iteration. Default is
            False.

        Returns
        -------
        success : {0, -1}
            Whether training terminated without incident (0) or one of the
            mixture components collapsed and training was halted prematurely
            (-1).
        """
        prev_vlb = -np.inf
        self._initialize_params(X)

        for _iter in range(max_iter):
            try:
                self._E_step(X)
                self._M_step(X)
                vlb = self.likelihood_lower_bound(X)

                if verbose:
                    print(f"{_iter + 1}. Lower bound: {vlb}")

                converged = _iter > 0 and np.abs(vlb - prev_vlb) <= tol
                if np.isnan(vlb) or converged:
                    break

                prev_vlb = vlb

            except np.linalg.LinAlgError:
                print("Singular matrix: components collapsed")
                return -1

        self.elbo = vlb
        self.is_fit = True
        return 0

    def predict(self, X, soft_labels=True):
        """
        Return the log probability of each data point in `X` under each
        mixture components.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(M, d)`
            A collection of `M` data points, each with dimension `d`.
        soft_labels : bool
            If True, return the log probabilities of the M data points in X
            under each mixture component. If False, return only the ID of the
            most probable mixture. Default is True.

        Returns
        -------
        y : :py:class:`ndarray <numpy.ndarray>` of shape `(M, C)` or `(M,)`
            If `soft_labels` is True, `y` is a 2D array where index (i,j) gives
            the log probability of the `i` th data point under the `j` th
            mixture component. If `soft_labels` is False, `y` is a 1D array
            where the `i` th index contains the ID of the most probable mixture
            component.
        """
        assert self.is_fit, "Must call the `.fit` method before making predictions"

        P = self.parameters
        C = self.hyperparameters["C"]
        mu, sigma = P["mu"], P["sigma"]

        y = []
        for x_i in X:
            cprobs = [log_gaussian_pdf(x_i, mu[c, :], sigma[c, :, :]) for c in range(C)]

            if not soft_labels:
                y.append(np.argmax(cprobs))
            else:
                y.append(cprobs)

        return np.array(y)

    def _E_step(self, X):
        P = self.parameters
        C = self.hyperparameters["C"]
        pi, Q, mu, sigma = P["pi"], P["Q"], P["mu"], P["sigma"]

        for i, x_i in enumerate(X):
            denom_vals = []
            for c in range(C):
                pi_c = pi[c]
                mu_c = mu[c, :]
                sigma_c = sigma[c, :, :]

                log_pi_c = np.log(pi_c)
                log_p_x_i = log_gaussian_pdf(x_i, mu_c, sigma_c)

                # log N(X_i | mu_c, Sigma_c) + log pi_c
                denom_vals.append(log_p_x_i + log_pi_c)

            # log \sum_c exp{ log N(X_i | mu_c, Sigma_c) + log pi_c } ]
            log_denom = logsumexp(denom_vals)
            q_i = np.exp([num - log_denom for num in denom_vals])
            np.testing.assert_allclose(np.sum(q_i), 1, err_msg="{}".format(np.sum(q_i)))

            Q[i, :] = q_i

    def _M_step(self, X):
        N, d = X.shape
        P = self.parameters
        C = self.hyperparameters["C"]
        pi, Q, mu, sigma = P["pi"], P["Q"], P["mu"], P["sigma"]

        denoms = np.sum(Q, axis=0)

        # update cluster priors
        pi = denoms / N

        # update cluster means
        nums_mu = [np.dot(Q[:, c], X) for c in range(C)]
        for ix, (num, den) in enumerate(zip(nums_mu, denoms)):
            mu[ix, :] = num / den if den > 0 else np.zeros_like(num)

        # update cluster covariances
        for c in range(C):
            mu_c = mu[c, :]
            n_c = denoms[c]

            outer = np.zeros((d, d))
            for i in range(N):
                wic = Q[i, c]
                xi = X[i, :]
                outer += wic * np.outer(xi - mu_c, xi - mu_c)

            outer = outer / n_c if n_c > 0 else outer
            sigma[c, :, :] = outer

        np.testing.assert_allclose(np.sum(pi), 1, err_msg="{}".format(np.sum(pi)))


================================================
FILE: numpy_ml/hmm/README.md
================================================
# Hidden Markov model
The `hmm.py` module implements a standard (i.e., non-Bayesian) [Hidden Markov
model](https://en.wikipedia.org/wiki/Hidden_Markov_model) with
maximum-likelihood parameter estimation via the
[EM-algorithm](https://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) (specifically, [Baum-Welch](https://en.wikipedia.org/wiki/Baum%E2%80%93Welch_algorithm)).

## Plots
<p align="center">
  <img src="img/plot.png" height="700" />
</p>


================================================
FILE: numpy_ml/hmm/__init__.py
================================================
from .hmm import *


================================================
FILE: numpy_ml/hmm/hmm.py
================================================
"""Hidden Markov model module"""

import numpy as np
from numpy_ml.utils.misc import logsumexp


class MultinomialHMM:
    def __init__(self, A=None, B=None, pi=None, eps=None):
        r"""
        A simple hidden Markov model with multinomial emission distribution.

        Parameters
        ----------
        A : :py:class:`ndarray <numpy.ndarray>` of shape `(N, N)` or None
            The transition matrix between latent states in the HMM. Index `i`,
            `j` gives the probability of transitioning from latent state `i` to
            latent state `j`. Default is None.
        B : :py:class:`ndarray <numpy.ndarray>` of shape `(N, V)` or None
            The emission matrix. Entry `i`, `j` gives the probability of latent
            state i emitting an observation of type `j`. Default is None.
        pi : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)` or None
            The prior probability of each latent state. If None, use a uniform
            prior over states. Default is None.
        eps : float or None
            Epsilon value to avoid :math:`\log(0)` errors. If None, defaults to
            the machine epsilon. Default is None.

        Attributes
        ----------
        A : :py:class:`ndarray <numpy.ndarray>` of shape `(N, N)`
            The transition matrix between latent states in the HMM. Index `i`,
            `j` gives the probability of transitioning from latent state `i` to
            latent state `j`.
        B : :py:class:`ndarray <numpy.ndarray>` of shape `(N, V)`
            The emission matrix. Entry `i`, `j` gives the probability of latent
            state `i` emitting an observation of type `j`.
        N : int
            The number of unique latent states
        V : int
            The number of unique observation types
        O : :py:class:`ndarray <numpy.ndarray>` of shape `(I, T)`
            The collection of observed training sequences.
        I : int
            The number of sequences in `O`.
        T : int
            The number of observations in each sequence in `O`.
        """
        eps = np.finfo(float).eps if eps is None else eps

        # prior probability of each latent state
        if pi is not None:
            pi[pi == 0] = eps

        # number of latent state types
        N = None
        if A is not None:
            N = A.shape[0]
            A[A == 0] = eps

        # number of observation types
        V = None
        if B is not None:
            V = B.shape[1]
            B[B == 0] = eps

        self.parameters = {
            "A": A,  # transition matrix
            "B": B,  # emission matrix
            "pi": pi,  # prior probability of each latent state
        }

        self.hyperparameters = {
            "eps": eps,  # epsilon
        }

        self.derived_variables = {
            "N": N,  # number of latent state types
            "V": V,  # number of observation types
        }

    def generate(self, n_steps, latent_state_types, obs_types):
        """
        Sample a sequence from the HMM.

        Parameters
        ----------
        n_steps : int
            The length of the generated sequence
        latent_state_types : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
            A collection of labels for the latent states
        obs_types : :py:class:`ndarray <numpy.ndarray>` of shape `(V,)`
            A collection of labels for the observations

        Returns
        -------
        states : :py:class:`ndarray <numpy.ndarray>` of shape `(n_steps,)`
            The sampled latent states.
        emissions : :py:class:`ndarray <numpy.ndarray>` of shape `(n_steps,)`
            The sampled emissions.
        """
        P = self.parameters
        A, B, pi = P["A"], P["B"], P["pi"]

        # sample the initial latent state
        s = np.random.multinomial(1, pi).argmax()
        states = [latent_state_types[s]]

        # generate an emission given latent state
        v = np.random.multinomial(1, B[s, :]).argmax()
        emissions = [obs_types[v]]

        # sample a latent transition, rinse, and repeat
        for i in range(n_steps - 1):
            s = np.random.multinomial(1, A[s, :]).argmax()
            states.append(latent_state_types[s])

            v = np.random.multinomial(1, B[s, :]).argmax()
            emissions.append(obs_types[v])

        return np.array(states), np.array(emissions)

    def log_likelihood(self, O):
        r"""
        Given the HMM parameterized by :math:`(A`, B, \pi)` and an observation
        sequence `O`, compute the marginal likelihood of `O`,
        :math:`P(O \mid A,B,\pi)`, by marginalizing over latent states.

        Notes
        -----
        The log likelihood is computed efficiently via DP using the forward
        algorithm, which produces a 2D trellis, ``forward`` (sometimes referred
        to as `alpha` in the literature), where entry `i`, `j` represents the
        probability under the HMM of being in latent state `i` after seeing the
        first `j` observations:

        .. math::

            \mathtt{forward[i,j]} = P(o_1, \ldots, o_j, q_j=i \mid A, B, \pi)

        Here :math:`q_j = i` indicates that the hidden state at time `j` is of
        type `i`.

        The DP step is:

        .. math::

            \mathtt{forward[i,j]}
               &= \sum_{s'=1}^N \mathtt{forward[s',j-1]} \cdot
                   \mathtt{A[s',i]} \cdot \mathtt{B[i,o_j]} \\
               &= \sum_{s'=1}^N P(o_1, \ldots, o_{j-1}, q_{j-1}=s' \mid A, B, \pi)
                    P(q_j=i \mid q_{j-1}=s') P(o_j \mid q_j=i)

        In words, ``forward[i,j]`` is the weighted sum of the values computed on
        the previous timestep. The weight on each previous state value is the
        product of the probability of transitioning from that state to state `i`
        and the probability of emitting observation `j` in state `i`.

        Parameters
        ----------
        O : :py:class:`ndarray <numpy.ndarray>` of shape `(1, T)`
            A single set of observations.

        Returns
        -------
        likelihood : float
            The likelihood of the observations `O` under the HMM.
        """
        if O.ndim == 1:
            O = O.reshape(1, -1)  # noqa: E741

        I, T = O.shape  # noqa: E741

        if I != 1:  # noqa: E741
            raise ValueError("Likelihood only accepts a single sequence")

        forward = self._forward(O[0])
        log_likelihood = logsumexp(forward[:, T - 1])
        return log_likelihood

    def decode(self, O):
        r"""
        Given the HMM parameterized by :math:`(A, B, \pi)` and an observation
        sequence :math:`O = o_1, \ldots, o_T`, compute the most probable
        sequence of latent states, :math:`Q = q_1, \ldots, q_T`.

        Notes
        -----
        HMM decoding is done efficiently via DP using the Viterbi algorithm,
        which produces a 2D trellis, ``viterbi``, where entry `i`, `j` represents the
        probability under the HMM of being in state `i` at time `j` after having
        passed through the *most probable* state sequence :math:`q_1,\ldots,q_{j-1}`:

        .. math::

            \mathtt{viterbi[i,j]} =
                \max_{q_1, \ldots, q_{j-1}}
                    P(o_1, \ldots, o_j, q_1, \ldots, q_{j-1}, q_j=i \mid A, B, \pi)

        Here :math:`q_j = i` indicates that the hidden state at time `j` is of
        type `i`, and :math:`\max_{q_1,\ldots,q_{j-1}}` represents the maximum over
        all possible latent state sequences for the first `j-1` observations.

        The DP step is:

        .. math::

            \mathtt{viterbi[i,j]} &=
                \max_{s'=1}^N \mathtt{viterbi[s',j-1]} \cdot
                    \mathtt{A[s',i]} \cdot \mathtt{B[i,o_j]} \\
               &=  \max_{s'=1}^N
                   P(o_1,\ldots, o_j, q_1, \ldots, q_{j-1}, q_j=i \mid A, B, \pi)
                   P(q_j=i \mid q_{j-1}=s') P(o_j \mid q_j=i)

        In words, ``viterbi[i,j]`` is the weighted sum of the values computed
        on the previous timestep. The weight on each value is the product of
        the probability of transitioning from that state to state `i` and the
        probability of emitting observation `j` in state `i`.

        To compute the most probable state sequence we maintain a second
        trellis, ``back_pointer``, whose `i`, `j` entry contains the value of the
        latent state at timestep `j-1` that is most likely to lead to latent
        state `i` at timestep `j`.

        When we have completed the ``viterbi`` and ``back_pointer`` trellises for
        all `T` timseteps/observations, we greedily move backwards through the
        ``back_pointer`` trellis to construct the best path for the full
        sequence of observations.

        Parameters
        ----------
        O : :py:class:`ndarray <numpy.ndarray>` of shape `(T,)`
            An observation sequence of length `T`.

        Returns
        -------
        best_path : list of length `T`
            The most probable sequence of latent states for observations `O`.
        best_path_prob : float
            The probability of the latent state sequence in `best_path` under
            the HMM.
        """
        P = self.parameters
        N = self.derived_variables["N"]
        eps = self.hyperparameters["eps"]
        A, B, pi = P["A"], P["B"], P["pi"]

        if O.ndim == 1:
            O = O.reshape(1, -1)  # noqa: E741

        # number of observations in each sequence
        T = O.shape[1]

        # number of training sequences
        I = O.shape[0]  # noqa: E741
        if I != 1:  # noqa: E741
            raise ValueError("Can only decode a single sequence (O.shape[0] must be 1)")

        # initialize the viterbi and back_pointer matrices
        viterbi = np.zeros((N, T))
        back_pointer = np.zeros((N, T)).astype(int)

        ot = O[0, 0]
        for s in range(N):
            back_pointer[s, 0] = 0
            viterbi[s, 0] = np.log(pi[s] + eps) + np.log(B[s, ot] + eps)

        for t in range(1, T):
            ot = O[0, t]
            for s in range(N):
                seq_probs = [
                    viterbi[s_, t - 1] + np.log(A[s_, s] + eps) + np.log(B[s, ot] + eps)
                    for s_ in range(N)
                ]

                viterbi[s, t] = np.max(seq_probs)
                back_pointer[s, t] = np.argmax(seq_probs)

        best_path_log_prob = viterbi[:, T - 1].max()

        # backtrack through the trellis to get the most likely sequence of
        # latent states
        pointer = viterbi[:, T - 1].argmax()
        best_path = [pointer]
        for t in reversed(range(1, T)):
            pointer = back_pointer[pointer, t]
            best_path.append(pointer)
        best_path = best_path[::-1]

        return best_path, best_path_log_prob

    def _forward(self, Obs):
        r"""
        Computes the forward probability trellis for an HMM parameterized by
        :math:`(A, B, \pi)`.

        Notes
        -----
        The forward trellis (sometimes referred to as `alpha` in the HMM
        literature), is a 2D array where entry `i`, `j` represents the probability
        under the HMM of being in latent state `i` after seeing the first `j`
        observations:

        .. math::

            \mathtt{forward[i,j]} =
                P(o_1, \ldots, o_j, q_j=i \mid A, B, \pi)

        Here :math:`q_j = i` indicates that the hidden state at time `j` is of
        type `i`.

        The DP step is::

        .. math::

            forward[i,j] &=
                \sum_{s'=1}^N forward[s',j-1] \times A[s',i] \times B[i,o_j] \\
                &= \sum_{s'=1}^N P(o_1, \ldots, o_{j-1}, q_{j-1}=s' \mid A, B, \pi)
                    \times P(q_j=i \mid q_{j-1}=s') \times P(o_j \mid q_j=i)

        In words, ``forward[i,j]`` is the weighted sum of the values computed
        on the previous timestep. The weight on each previous state value is
        the product of the probability of transitioning from that state to
        state `i` and the probability of emitting observation `j` in state `i`.

        Parameters
        ----------
        Obs : :py:class:`ndarray <numpy.ndarray>` of shape `(T,)`
            An observation sequence of length `T`.

        Returns
        -------
        forward : :py:class:`ndarray <numpy.ndarray>` of shape `(N, T)`
            The forward trellis.
        """
        P = self.parameters
        N = self.derived_variables["N"]
        eps = self.hyperparameters["eps"]
        A, B, pi = P["A"], P["B"], P["pi"]

        T = Obs.shape[0]

        # initialize the forward probability matrix
        forward = np.zeros((N, T))

        ot = Obs[0]
        for s in range(N):
            forward[s, 0] = np.log(pi[s] + eps) + np.log(B[s, ot] + eps)

        for t in range(1, T):
            ot = Obs[t]
            for s in range(N):
                forward[s, t] = logsumexp(
                    [
                        forward[s_, t - 1]
                        + np.log(A[s_, s] + eps)
                        + np.log(B[s, ot] + eps)
                        for s_ in range(N)
                    ]  # noqa: C812
                )
        return forward

    def _backward(self, Obs):
        r"""
        Compute the backward probability trellis for an HMM parameterized by
        :math:`(A, B, \pi)`.

        Notes
        -----
        The backward trellis (sometimes referred to as `beta` in the HMM
        literature), is a 2D array where entry `i`,`j` represents the probability
        of seeing the observations from time `j+1` onward given that the HMM is
        in state `i` at time `j`

        .. math::

            \mathtt{backward[i,j]} = P(o_{j+1},o_{j+2},\ldots,o_T \mid q_j=i,A,B,\pi)

        Here :math:`q_j = i` indicates that the hidden state at time `j` is of type `i`.

        The DP step is::

            backward[i,j] &=
                \sum_{s'=1}^N backward[s',j+1] \times A[i, s'] \times B[s',o_{j+1}] \\
                &= \sum_{s'=1}^N P(o_{j+1}, o_{j+2}, \ldots, o_T \mid q_j=i, A, B, pi)
                    \times P(q_{j+1}=s' \mid q_{j}=i) \times P(o_{j+1} \mid q_{j+1}=s')

        In words, ``backward[i,j]`` is the weighted sum of the values computed
        on the following timestep. The weight on each state value from the
        `j+1`'th timestep is the product of the probability of transitioning from
        state i to that state and the probability of emitting observation `j+1`
        from that state.

        Parameters
        ----------
        Obs : :py:class:`ndarray <numpy.ndarray>` of shape `(T,)`
            A single observation sequence of length `T`.

        Returns
        -------
        backward : :py:class:`ndarray <numpy.ndarray>` of shape `(N, T)`
            The backward trellis.
        """
        P = self.parameters
        A, B = P["A"], P["B"]
        N = self.derived_variables["N"]
        eps = self.hyperparameters["eps"]

        T = Obs.shape[0]

        # initialize the backward trellis
        backward = np.zeros((N, T))

        for s in range(N):
            backward[s, T - 1] = 0

        for t in reversed(range(T - 1)):
            ot1 = Obs[t + 1]
            for s in range(N):
                backward[s, t] = logsumexp(
                    [
                        np.log(A[s, s_] + eps)
                        + np.log(B[s_, ot1] + eps)
                        + backward[s_, t + 1]
                        for s_ in range(N)
                    ]  # noqa: C812
                )
        return backward

    def _initialize_parameters(self):
        P = self.parameters
        A, B, pi = P["A"], P["B"], P["pi"]
        N, V = self.derived_variables["N"], self.derived_variables["V"]

        # Uniform initialization of prior over latent states
        if pi is None:
            pi = np.ones(N)
            pi = pi / pi.sum()

        # Uniform initialization of A
        if A is None:
            A = np.ones((N, N))
            A = A / A.sum(axis=1)[:, None]

        # Random initialization of B
        if B is None:
            B = np.random.rand(N, V)
            B = B / B.sum(axis=1)[:, None]

        P["A"], P["B"], P["pi"] = A, B, pi

    def fit(
        self,
        O,
        latent_state_types,
        observation_types,
        pi=None,
        tol=1e-5,
        verbose=False,
    ):
        """
        Given an observation sequence `O` and the set of possible latent states,
        learn the MLE HMM parameters `A`, `B` and `pi`.

        Notes
        -----
        Model fitting is done iteratively using the Baum-Welch/Forward-Backward
        algorithm, a special case of the EM algorithm.

        We begin with an initial estimate for the transition (`A`) and emission
        (`B`) matrices and then use these to derive better and better estimates
        by computing the forward probability for an observation and then
        dividing that probability mass among all the paths that contributed to
        it.

        Parameters
        ----------
        O : :py:class:`ndarray <numpy.ndarray>` of shape `(I, T)`
            The set of `I` training observations, each of length `T`.
        latent_state_types : list of length `N`
            The collection of valid latent states.
        observation_types : list of length `V`
            The collection of valid observation states.
        pi : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
            The starting estimate for the prior probability of each latent
            state; it is re-estimated on every M-step. If None, any prior
            already stored on the model is kept, and otherwise each latent
            state is assumed to be equally likely a priori. Default is None.
        tol : float
            The tolerance value. If the difference in log likelihood between
            two epochs is less than or equal to this value, terminate
            training. Default is 1e-5.
        verbose : bool
            Print training stats after each epoch. Default is False.

        Returns
        -------
        A : :py:class:`ndarray <numpy.ndarray>` of shape `(N, N)`
            The estimated transition matrix.
        B : :py:class:`ndarray <numpy.ndarray>` of shape `(N, V)`
            The estimated emission matrix.
        pi : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
            The estimated prior probabilities of each latent state.
        """
        # observations
        if O.ndim == 1:
            O = O.reshape(1, -1)  # noqa: E741

        # number of types of observation
        self.derived_variables["V"] = len(observation_types)

        # number of latent state types
        self.derived_variables["N"] = len(latent_state_types)

        P = self.parameters

        # honour an explicitly supplied prior; a float copy keeps the model
        # from aliasing the caller's array
        if pi is not None:
            P["pi"] = np.array(pi, dtype=float)

        self._initialize_parameters()

        # iterate E and M steps until convergence criteria is met
        step, delta = 0, np.inf
        ll_prev = np.sum([self.log_likelihood(o) for o in O])

        while delta > tol:
            gamma, xi, phi = self._E_step(O)
            P["A"], P["B"], P["pi"] = self._M_step(O, gamma, xi, phi)
            ll = np.sum([self.log_likelihood(o) for o in O])
            delta = ll - ll_prev
            ll_prev = ll
            step += 1

            if verbose:
                fstr = "[Epoch {}] LL: {:.3f} Delta: {:.5f}"
                print(fstr.format(step, ll_prev, delta))

        return P["A"], P["B"], P["pi"]

    def _E_step(self, O):
        r"""
        Run a single E-step update for the Baum-Welch/Forward-Backward
        algorithm. This step estimates ``xi`` and ``gamma``, the excepted
        state-state transition counts and the expected state-occupancy counts,
        respectively.

        ``xi[i,j,k]`` gives the probability of being in state `i` at time `k`
        and state `j` at time `k+1` given the observed sequence `O` and the
        current estimates for transition (`A`) and emission (`B`) matrices::

        .. math::

            xi[i,j,k] &= P(q_k=i,q_{k+1}=j \mid O,A,B,pi) \\
                      &= \frac{
                            P(q_k=i,q_{k+1}=j,O \mid A,B,pi)
                         }{P(O \mid A,B,pi)} \\
                      &= \frac{
                            P(o_1,o_2,\ldots,o_k,q_k=i \mid A,B,pi) \times
                            P(q_{k+1}=j \mid q_k=i) \times
                            P(o_{k+1} \mid q_{k+1}=j) \times
                            P(o_{k+2},o_{k+3},\ldots,o_T \mid q_{k+1}=j,A,B,pi)
                         }{P(O \mid A,B,pi)} \\
                      &= \frac{
                            \mathtt{fwd[j, k] * self.A[j, i] *
                            self.B[i, o_{k+1}] * bwd[i, k + 1]}
                         }{\mathtt{fwd[:, T].sum()}}

        The expected number of transitions from state `i` to state `j` across the
        entire sequence is then the sum over all timesteps: ``xi[i,j,:].sum()``.

        ``gamma[i,j]`` gives the probability of being in state `i` at time `j`

        .. math:: \mathtt{gamma[i,j]} = P(q_j = i \mid O, A, B, \pi)

        Parameters
        ----------
        O : :py:class:`ndarray <numpy.ndarray>` of shape `(I, T)`
            The set of `I` training observations, each of length `T`.

        Returns
        -------
        gamma : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N, T)`
            The estimated state-occupancy count matrix.
        xi : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N, N, T)`
            The estimated state-state transition count matrix.
        phi : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N)`
            The estimated prior counts for each latent state.
        """
        I, T = O.shape
        P = self.parameters
        A, B = P["A"], P["B"]
        N = self.derived_variables["N"]
        eps = self.hyperparameters["eps"]

        phi = np.zeros((I, N))
        gamma = np.zeros((I, N, T))
        xi = np.zeros((I, N, N, T))

        for i in range(I):
            Obs = O[i, :]
            fwd = self._forward(Obs)
            bwd = self._backward(Obs)
            log_likelihood = logsumexp(fwd[:, T - 1])

            t = T - 1
            for si in range(N):
                gamma[i, si, t] = fwd[si, t] + bwd[si, t] - log_likelihood
                phi[i, si] = fwd[si, 0] + bwd[si, 0] - log_likelihood

            for t in range(T - 1):
                ot1 = Obs[t + 1]
                for si in range(N):
                    gamma[i, si, t] = fwd[si, t] + bwd[si, t] - log_likelihood
                    for sj in range(N):
                        xi[i, si, sj, t] = (
                            fwd[si, t]
                            + np.log(A[si, sj] + eps)
                            + np.log(B[sj, ot1] + eps)
                            + bwd[sj, t + 1]
                            - log_likelihood
                        )

        return gamma, xi, phi

    def _M_step(self, O, gamma, xi, phi):
        """
        Run a single M-step update for the Baum-Welch/Forward-Backward
        algorithm.

        Notes
        -----
        `gamma`, `xi` and `phi` are treated as log-space quantities: counts
        are accumulated with ``logsumexp`` and the normalized estimates are
        exponentiated only on return.

        Only the first `T - 1` time slices of `xi` are summed. A sequence of
        length `T` contains `T - 1` transitions, so the last slice carries no
        transition information; including it would add a spurious count to
        every entry of `A`.

        Parameters
        ----------
        O : :py:class:`ndarray <numpy.ndarray>` of shape `(I, T)`
            The set of `I` training observations, each of length `T`.
            Observations are compared against ``range(V)``, i.e. they are
            expected to be integer codes.
        gamma : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N, T)`
            The estimated state-occupancy count matrix.
        xi : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N, N, T)`
            The estimated state-state transition count matrix.
        phi : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N)`
            The estimated starting count matrix for each latent state.

        Returns
        -------
        A : :py:class:`ndarray <numpy.ndarray>` of shape `(N, N)`
            The estimated transition matrix.
        B : :py:class:`ndarray <numpy.ndarray>` of shape `(N, V)`
            The estimated emission matrix.
        pi : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
            The estimated prior probabilities for each latent state.
        """
        I, T = O.shape
        DV = self.derived_variables
        eps = self.hyperparameters["eps"]

        N, V = DV["N"], DV["V"]

        # log-space estimates of the transition (A) and emission (B) matrices
        A = np.zeros((N, N))
        B = np.zeros((N, V))

        count_gamma = np.zeros((I, N, V))
        count_xi = np.zeros((I, N, N))

        for i in range(I):
            Obs = O[i, :]
            # positions at which each observation type occurs in this sequence
            masks = [Obs == vk for vk in range(V)]

            for si in range(N):
                for vk in range(V):
                    if not masks[vk].any():
                        # unseen symbol: use a tiny count instead of log(0)
                        count_gamma[i, si, vk] = np.log(eps)
                    else:
                        count_gamma[i, si, vk] = logsumexp(gamma[i, si, masks[vk]])

                for sj in range(N):
                    if T > 1:
                        # slice T - 1 of `xi` holds no transition; leave it out
                        count_xi[i, si, sj] = logsumexp(xi[i, si, sj, : T - 1])
                    else:
                        # a length-1 sequence has no transitions at all
                        count_xi[i, si, sj] = np.log(eps)

        pi = logsumexp(phi, axis=0) - np.log(I + eps)
        np.testing.assert_almost_equal(np.exp(pi).sum(), 1)

        for si in range(N):
            for vk in range(V):
                B[si, vk] = logsumexp(count_gamma[:, si, vk]) - logsumexp(
                    count_gamma[:, si, :]  # noqa: C812
                )

            for sj in range(N):
                A[si, sj] = logsumexp(count_xi[:, si, sj]) - logsumexp(
                    count_xi[:, si, :]  # noqa: C812
                )

            np.testing.assert_almost_equal(np.exp(A[si, :]).sum(), 1)
            np.testing.assert_almost_equal(np.exp(B[si, :]).sum(), 1)
        return np.exp(A), np.exp(B), np.exp(pi)


================================================
FILE: numpy_ml/lda/README.md
================================================
# Latent Dirichlet allocation
The `lda.py` module implements:

1. [Standard (i.e., non-Bayesian) latent Dirichlet
   allocation](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation) with MLE parameter
   estimates via variational EM (Blei, Ng, & Jordan, 2003).
2. [Fully-Bayesian (i.e., smoothed) latent Dirichlet allocation](https://people.cs.umass.edu/~wallach/courses/s11/cmpsci791ss/readings/griffiths02gibbs.pdf) with MAP parameter
   estimates via collapsed Gibbs sampling (Griffiths & Steyvers, 2004).

## Plots

### Unsmoothed
<p align="center">
   <img src="img/plot_unsmoothed.png" height="400" />
</p>

### Smoothed
TODO


================================================
FILE: numpy_ml/lda/__init__.py
================================================
from .lda import *
from .lda_smoothed import *


================================================
FILE: numpy_ml/lda/lda.py
================================================
import numpy as np
from scipy.special import digamma, polygamma, gammaln


class LDA(object):
    def __init__(self, T=10):
        """
        Vanilla (non-smoothed) LDA model trained using variational EM.
        Generates maximum-likelihood estimates for model parameters
        `alpha` and `beta`.

        Parameters
        ----------
        T : int
            Number of topics

        Attributes
        ----------
        D : int
            Number of documents
        N : :py:class:`ndarray <numpy.ndarray>` of shape `(D,)`
            Number of words in each document
        V : int
            Number of unique word tokens across all documents
        phi : list of length `D`
            Variational approximation to the word-topic distribution. Entry
            `d` is an :py:class:`ndarray <numpy.ndarray>` of shape
            `(N[d], T)`. A list is used because documents generally have
            different lengths and so cannot be stacked into one array.
        gamma : :py:class:`ndarray <numpy.ndarray>` of shape `(D, T)`
            Variational approximation to document-topic distribution
        alpha : :py:class:`ndarray <numpy.ndarray>` of shape `(T,)`
            Parameter for the Dirichlet prior on the document-topic distribution
        beta  : :py:class:`ndarray <numpy.ndarray>` of shape `(V, T)`
            Word-topic distribution
        """
        self.T = T

    @staticmethod
    def _expected_log_theta(gamma):
        r"""
        Row-wise expectation of log θ under a Dirichlet with parameters
        `gamma`: Ψ(γ_{d,t}) - Ψ(\sum_t γ_{d,t}).

        Parameters
        ----------
        gamma : :py:class:`ndarray <numpy.ndarray>` of shape `(D, T)`

        Returns
        -------
        :py:class:`ndarray <numpy.ndarray>` of shape `(D, T)`
        """
        return digamma(gamma) - digamma(np.sum(gamma, axis=1, keepdims=True))

    def _maximize_phi(self):
        """
        Optimize variational parameter phi
        ϕ_{t, n} ∝ β_{t, w_n}  e^( Ψ(γ_t) )

        Returns
        -------
        phi : list of length `D` of arrays of shape `(N[d], T)`
            The updated, per-word normalized topic responsibilities.
        """
        beta = self.beta
        exp_elog_theta = np.exp(self._expected_log_theta(self.gamma))

        phi = []
        for d, doc in enumerate(self.corpus):
            word_ids = np.asarray(doc, dtype=int)

            # rows of beta for the words in this document, weighted per topic
            phi_d = beta[word_ids, :] * exp_elog_theta[d]

            # Normalize over topics
            phi.append(phi_d / np.sum(phi_d, axis=1, keepdims=True))
        return phi

    def _maximize_gamma(self):
        r"""
        Optimize variational parameter gamma
        γ_t = α_t + \sum_{n=1}^{N_d} ϕ_{t, n}

        Returns
        -------
        gamma : :py:class:`ndarray <numpy.ndarray>` of shape `(D, T)`
        """
        topic_mass = np.array([np.sum(phi_d, axis=0) for phi_d in self.phi])
        return self.alpha + topic_mass

    def _maximize_beta(self):
        r"""
        Optimize model parameter beta
        β_{t, n} ∝ \sum_{d=1}^D \sum_{i=1}^{N_d} ϕ_{d, t, n} [ i = n]

        Returns
        -------
        beta : :py:class:`ndarray <numpy.ndarray>` of shape `(V, T)`
            The updated word-topic distribution; each column sums to 1.
        """
        beta = np.zeros((self.V, self.T))

        for doc, phi_d in zip(self.corpus, self.phi):
            # unbuffered scatter-add so repeated words in a document all
            # contribute to that word's row
            np.add.at(beta, np.asarray(doc, dtype=int), phi_d)

        # Normalize over words
        return beta / np.sum(beta, axis=0, keepdims=True)

    def _maximize_alpha(self, max_iters=1000, tol=0.1):
        """
        Optimize alpha using Blei's O(n) Newton-Raphson modification
        for a Hessian with special structure

        Parameters
        ----------
        max_iters : int
            Maximum number of Newton steps. Default is 1000.
        tol : float
            Stop once the RMS change in `alpha` between successive steps
            falls below this value. Default is 0.1.

        Returns
        -------
        alpha : :py:class:`ndarray <numpy.ndarray>` of shape `(T,)`
        """
        D = self.D
        alpha = self.alpha

        # the data-dependent part of the gradient does not change across steps
        suff_stats = np.sum(self._expected_log_theta(self.gamma), axis=0)

        for _ in range(max_iters):
            alpha_old = alpha

            #  Calculate gradient
            g = D * (digamma(np.sum(alpha)) - digamma(alpha)) + suff_stats

            #  Calculate Hessian diagonal component
            h = -D * polygamma(1, alpha)

            #  Calculate Hessian constant component
            z = D * polygamma(1, np.sum(alpha))

            #  Calculate constant
            c = np.sum(g / h) / (z ** (-1.0) + np.sum(h ** (-1.0)))

            #  Update alpha
            alpha = alpha - (g - c) / h

            #  Check convergence
            if np.sqrt(np.mean(np.square(alpha - alpha_old))) < tol:
                break

        return alpha

    def _E_step(self):
        """
        Maximize the VLB with respect to the variational parameters, γ and ϕ
        """
        self.phi = self._maximize_phi()
        self.gamma = self._maximize_gamma()

    def _M_step(self):
        """
        Maximize the VLB with respect to the model parameters, α and β
        """
        self.beta = self._maximize_beta()
        self.alpha = self._maximize_alpha()

    def VLB(self):
        """
        Return the variational lower bound associated with the current model
        parameters.

        Returns
        -------
        vlb : float
        """
        alpha = self.alpha
        beta = self.beta
        gamma = self.gamma

        elog_theta = self._expected_log_theta(gamma)

        # Dirichlet prior term, accumulated over all D documents
        a = self.D * (gammaln(np.sum(alpha)) - np.sum(gammaln(alpha))) + np.sum(
            (alpha - 1) * elog_theta
        )

        # terms of the variational distribution (subtracted from the bound)
        _d = np.sum(
            gammaln(np.sum(gamma, axis=1)) - np.sum(gammaln(gamma), axis=1)
        ) + np.sum((gamma - 1) * elog_theta)

        b, c = 0.0, 0.0
        for d, (doc, phi_d) in enumerate(zip(self.corpus, self.phi)):
            word_ids = np.asarray(doc, dtype=int)

            b += np.sum(phi_d * elog_theta[d])
            c += np.sum(phi_d * np.log(beta[word_ids, :]))
            _d += np.sum(phi_d * np.log(phi_d))

        return a + b + c - _d

    def initialize_parameters(self):
        """
        Provide reasonable initializations for model and variational parameters.
        """
        T = self.T
        V = self.V
        D = self.D
        N = np.asarray(self.N, dtype=float)

        # initialize model parameters
        self.alpha = 100 * np.random.dirichlet(10 * np.ones(T), 1)[0]
        self.beta = np.random.dirichlet(np.ones(V), T).T

        # initialize variational parameters. phi is kept as a list because
        # stacking ragged per-document arrays with np.array is an error in
        # recent NumPy releases
        self.phi = [np.ones([int(N[d]), T]) / T for d in range(D)]
        self.gamma = self.alpha[np.newaxis, :] + (N / T)[:, np.newaxis]

    def train(self, corpus, verbose=False, max_iter=1000, tol=5):
        """
        Train the LDA model on a corpus of documents (bags of words).

        Parameters
        ----------
        corpus : list of length `D`
            A list of lists (or 1D arrays), with each entry containing the
            tokenized text of a single document. Tokens are used directly as
            row indices into `beta`, so they must be integer ids in
            `[0, V)`.
        verbose : bool
            Whether to print the VLB at each training iteration. Default is
            False.
        max_iter : int
            The maximum number of training iterations to perform before
            breaking. Default is 1000.
        tol : float
            Break the training loop if the difference between the VLB on the
            current iteration and the previous iteration is less than `tol`.
            Default is 5.
        """
        # normalize every document to an integer array so that plain Python
        # lists and ndarrays behave identically downstream
        docs = [np.asarray(doc, dtype=int) for doc in corpus]

        self.D = len(docs)
        self.V = int(np.unique(np.concatenate(docs)).size)
        self.N = np.array([len(doc) for doc in docs])
        self.corpus = docs

        self.initialize_parameters()
        vlb = -np.inf

        for i in range(max_iter):
            old_vlb = vlb

            self._E_step()
            self._M_step()

            vlb = self.VLB()
            delta = vlb - old_vlb

            if verbose:
                print("Iteration {}: {:.3f} (delta: {:.2f})".format(i + 1, vlb, delta))

            if delta < tol:
                break


#######################################################################
#                                Utils                                #
#######################################################################


def dg(gamma, d, t):
    """
    Expected log of component `t` of a Dirichlet random vector whose
    parameters are row `d` of `gamma`, i.e.
    ``digamma(gamma[d, t]) - digamma(sum(gamma[d, :]))``.
    """
    row_total = np.sum(gamma[d, :])
    component = gamma[d, t]
    return digamma(component) - digamma(row_total)


================================================
FILE: numpy_ml/lda/lda_smoothed.py
================================================
import numpy as np


class SmoothedLDA(object):
    def __init__(self, T, **kwargs):
        """
        A smoothed LDA model trained using collapsed Gibbs sampling. Generates
        posterior mean estimates for model parameters `phi` and `theta`.

        Parameters
        ----------
        T : int
            Number of topics
        alpha : float, optional
            Value of the symmetric Dirichlet prior on the document-topic
            distribution. Default is ``50 / T``.
        beta : float, optional
            Value of the symmetric Dirichlet prior on the topic-word
            distribution. Default is 0.01.

        Attributes
        ----------
        D : int
            Number of documents
        N : int
            Total number of words across all documents
        V : int
            Number of unique word tokens across all documents
        phi : :py:class:`ndarray <numpy.ndarray>` of shape `(V, T)`
            The word-topic distribution
        theta : :py:class:`ndarray <numpy.ndarray>` of shape `(D, T)`
            The document-topic distribution
        alpha : :py:class:`ndarray <numpy.ndarray>` of shape `(T,)`
            Parameter for the Dirichlet prior on the document-topic distribution
        beta  : float, or :py:class:`ndarray <numpy.ndarray>` of shape `(V,)` after training
            Parameter for the Dirichlet prior on the topic-word distribution
        """
        self.T = T
        self.alpha = kwargs.get("alpha", 50.0 / self.T) * np.ones(self.T)
        self.beta = kwargs.get("beta", 0.01)

    def _init_params(self, texts, tokens):
        """
        Record corpus statistics (`D`, `V`, `N`), the document index of every
        word position, and expand `beta` to one entry per vocabulary item.
        """
        self.tokens = np.asarray(tokens)
        self.D = len(texts)
        self.V = len(np.unique(self.tokens))
        self.N = int(np.sum(np.array([len(doc) for doc in texts], dtype=int)))

        # integer dtype: these values are used as row indices into C_dt
        self.word_document = np.zeros(self.N, dtype=int)

        # now that we know the number of tokens in our corpus, we can set
        # beta. The prior is symmetric, so take its single value; this also
        # keeps a repeated call to `train` from re-multiplying an array that
        # was already expanded.
        self.beta = np.ravel(self.beta)[0] * np.ones(self.V)

        start = 0
        for doc_idx, doc in enumerate(texts):
            stop = start + len(doc)
            self.word_document[start:stop] = doc_idx
            start = stop

    def train(self, texts, tokens, n_gibbs=2000):
        """
        Trains a topic model on the documents in texts.

        Parameters
        ----------
        texts : array of length `(D,)`
            The training corpus represented as an array of subarrays, where
            each subarray corresponds to the tokenized words of a single
            document. Words are used directly as row indices into the
            word-topic count matrix, so they must be integer ids in `[0, V)`.
        tokens : array of length `(V,)`
            The set of unique tokens in the documents in `texts`.
        n_gibbs : int
            The number of steps to run the collapsed Gibbs sampler during
            training. Default is 2000.

        Returns
        -------
        C_wt : :py:class:`ndarray <numpy.ndarray>` of shape (V, T)
            The word-topic count matrix
        C_dt : :py:class:`ndarray <numpy.ndarray>` of shape (D, T)
            The document-topic count matrix
        assignments : :py:class:`ndarray <numpy.ndarray>` of shape (N, n_gibbs + 1)
            The topic assignments for each word in the corpus. Column 0 holds
            the random initialization; column `g + 1` holds the assignments
            after Gibbs step `g`.
        """
        self._init_params(texts, tokens)
        C_wt, C_dt, assignments = self._gibbs_sampler(n_gibbs, texts)
        self.fit_params(C_wt, C_dt)
        return C_wt, C_dt, assignments

    def what_did_you_learn(self, top_n=10):
        """
        Print the `top_n` most probable words under each topic
        """
        tokens = np.asarray(self.tokens)
        for tt in range(self.T):
            top_idx = np.argsort(self.phi[:, tt])[::-1][:top_n]
            print("\nTop Words for Topic %s:\n" % (str(tt)))
            for token in tokens[top_idx]:
                print("\t%s\n" % (str(token)))

    def fit_params(self, C_wt, C_dt):
        """
        Estimate `phi`, the word-topic distribution, and `theta`, the
        topic-document distribution.

        Parameters
        ----------
        C_wt : :py:class:`ndarray <numpy.ndarray>` of shape (V, T)
            The word-topic count matrix
        C_dt : :py:class:`ndarray <numpy.ndarray>` of shape (D, T)
            The document-topic count matrix

        Returns
        -------
        phi : :py:class:`ndarray <numpy.ndarray>` of shape `(V, T)`
            The word-topic distribution
        theta : :py:class:`ndarray <numpy.ndarray>` of shape `(D, T)`
            The document-topic distribution
        """
        b, a = self.beta[0], self.alpha[0]

        # smoothed counts normalized per topic (columns of phi sum to 1)
        self.phi = (C_wt + b) / (np.sum(C_wt, axis=0, keepdims=True) + self.V * b)

        # smoothed counts normalized per document (rows of theta sum to 1)
        self.theta = (C_dt + a) / (np.sum(C_dt, axis=1, keepdims=True) + self.T * a)
        return self.phi, self.theta

    def _estimate_topic_prob(self, ii, d, C_wt, C_dt):
        """
        Compute an approximation of the conditional probability that token ii
        is assigned to topic jj given all previous topic assignments and the
        current document d: p(t_i = j | t_{-i}, w_i, d_i)

        Returns
        -------
        p_vec : :py:class:`ndarray <numpy.ndarray>` of shape `(T,)`
            Normalized probability of each topic.
        """
        b, a = self.beta[0], self.alpha[0]

        # prob of word ii under each topic
        frac1 = (C_wt[ii, :] + b) / (np.sum(C_wt, axis=0) + self.V * b)
        # prob of each topic under document d
        frac2 = (C_dt[d, :] + a) / (np.sum(C_dt[d, :]) + self.T * a)

        p_vec = frac1 * frac2
        return p_vec / np.sum(p_vec)

    def _gibbs_sampler(self, n_gibbs, texts):
        """
        Collapsed Gibbs sampler for estimating the posterior distribution over
        topic assignments.

        Returns
        -------
        C_wt : :py:class:`ndarray <numpy.ndarray>` of shape (V, T)
        C_dt : :py:class:`ndarray <numpy.ndarray>` of shape (D, T)
        assignments : :py:class:`ndarray <numpy.ndarray>` of shape (N, n_gibbs + 1)
        """
        # Initialize count matrices
        C_wt = np.zeros([self.V, self.T])
        C_dt = np.zeros([self.D, self.T])

        # integer dtype: entries are used as column indices into the counts
        assignments = np.zeros([self.N, n_gibbs + 1], dtype=int)

        # flatten the corpus once rather than once per token
        token_ids = np.concatenate(texts).astype(int)
        doc_ids = self.word_document

        # Randomly initialize topic assignments for words
        for ii in range(self.N):
            topic = np.random.randint(0, self.T)
            assignments[ii, 0] = topic
            C_dt[doc_ids[ii], topic] += 1
            C_wt[token_ids[ii], topic] += 1

        # run collapsed Gibbs sampler
        for gg in range(n_gibbs):
            print("Gibbs iteration {} of {}".format(gg + 1, n_gibbs))
            for jj in range(self.N):
                token_idx = token_ids[jj]
                doc = doc_ids[jj]
                current = assignments[jj, gg]

                # Decrement count matrices by 1
                C_wt[token_idx, current] -= 1
                C_dt[doc, current] -= 1

                # Draw new topic from our approximation of the conditional dist.
                p_topics = self._estimate_topic_prob(token_idx, doc, C_wt, C_dt)
                sampled_topic = np.nonzero(np.random.multinomial(1, p_topics))[0][0]

                # Update count matrices
                C_wt[token_idx, sampled_topic] += 1
                C_dt[doc, sampled_topic] += 1
                assignments[jj, gg + 1] = sampled_topic
        return C_wt, C_dt, assignments


================================================
FILE: numpy_ml/linear_models/README.md
================================================
# Linear Models
The `linear_models` module includes:

1. [OLS linear regression](https://en.wikipedia.org/wiki/Ordinary_least_squares) with maximum likelihood parameter estimates via the normal equation. 
    - Includes optional weight arguments for [weighted least squares](https://en.wikipedia.org/wiki/Weighted_least_squares)
    - Supports batch and online coefficient updates.
2. [Ridge regression / Tikhonov regularization](https://en.wikipedia.org/wiki/Tikhonov_regularization)
   with maximum likelihood parameter estimates via the normal equation.
3. [Logistic regression](https://en.wikipedia.org/wiki/Logistic_regression) with maximum likelihood parameter estimates via gradient descent.
4. [Bayesian linear regression](https://en.wikipedia.org/wiki/Bayesian_linear_regression) with maximum a posteriori parameter estimates via [conjugacy](https://en.wikipedia.org/wiki/Conjugate_prior#Table_of_conjugate_distributions)
    - Known coefficient prior mean and known error variance
    - Known coefficient prior mean and unknown error variance
5. [Naive Bayes classifier](https://en.wikipedia.org/wiki/Naive_Bayes_classifier) with Gaussian feature likelihoods.
6. [Generalized linear model](https://en.wikipedia.org/wiki/Generalized_linear_model) with identity, log, and logit link functions.

## Plots
<p align="center">
<img src="img/plot_logistic.png" align='center' height="550" />

<img src="img/plot_bayes.png" align='center' height="300" />

<img src="img/plot_regression.png" align='center' height="550" />
</p>


================================================
FILE: numpy_ml/linear_models/__init__.py
================================================
"""A module containing assorted linear models."""

from .ridge import RidgeRegression
from .glm import GeneralizedLinearModel
from .logistic import LogisticRegression
from .bayesian_regression import (
    BayesianLinearRegressionKnownVariance,
    BayesianLinearRegressionUnknownVariance,
)
from .naive_bayes import GaussianNBClassifier
from .linear_regression import LinearRegression


================================================
FILE: numpy_ml/linear_models/bayesian_regression.py
================================================
"""A module of Bayesian linear regression models."""
import numpy as np
import scipy.stats as stats

from numpy_ml.utils.testing import is_number, is_symmetric_positive_definite


class BayesianLinearRegressionUnknownVariance:
    def __init__(self, alpha=1, beta=2, mu=0, V=None, fit_intercept=True):
        r"""
        Bayesian linear regression in which the error variance is unknown.
        A conjugate normal-inverse-gamma joint prior is placed on the
        coefficients and the error variance.

        Notes
        -----
        With a conjugate normal-inverse-gamma joint prior on the model
        parameters **b** and error variance :math:`\sigma^2`, the joint and
        marginal posteriors over each are:

        .. math::

            \mathbf{b}, \sigma^2 &\sim
                \text{N-\Gamma^{-1}}(\mu, \mathbf{V}^{-1}, \alpha, \beta) \\
            \sigma^2 &\sim \text{InverseGamma}(\alpha, \beta) \\
            \mathbf{b} \mid \sigma^2 &\sim \mathcal{N}(\mu, \sigma^2 \mathbf{V})

        Parameters
        ----------
        alpha : float
            Shape parameter of the Inverse-Gamma prior on :math:`\sigma^2`.
            Must be strictly greater than 0. Default is 1.
        beta : float
            Scale parameter of the Inverse-Gamma prior on :math:`\sigma^2`.
            Must be strictly greater than 0. Default is 2.
        mu : :py:class:`ndarray <numpy.ndarray>` of shape `(M,)` or float
            Mean of the Gaussian prior on `b`. A float is expanded to
            ``np.ones(M) * mu`` when the model is fit. Default is 0.
        V : :py:class:`ndarray <numpy.ndarray>` of shape `(N, N)` or `(N,)` or None
            A symmetric positive definite matrix that when multiplied
            element-wise by :math:`\sigma^2` gives the covariance matrix for
            the Gaussian prior on `b`. A list or 1D array is interpreted as
            the diagonal, ``V = diag(V)``. If None, the identity matrix is
            used. Default is None.
        fit_intercept : bool
            Whether to fit an intercept term in addition to the coefficients
            in b. If True, the estimates for b will have `M + 1` dimensions,
            where the first dimension corresponds to the intercept. Default
            is True.

        Attributes
        ----------
        posterior : dict or None
            Frozen random variables for the posterior distributions
            :math:`P(\sigma^2 \mid X)` and :math:`P(b \mid X, \sigma^2)`,
            stored under the keys ``"sigma**2"`` and ``"b | sigma**2"``.
            None until :meth:`fit` is called.
        posterior_predictive : frozen random variable or None
            Frozen random variable for the posterior predictive distribution,
            :math:`P(y \mid X)`. This value is only set following a call to
            :meth:`predict <numpy_ml.linear_models.BayesianLinearRegressionUnknownVariance.predict>`.
        """  # noqa: E501
        if V is None:
            # scalar placeholder; expanded to a matrix once the dimension of
            # X is known in `fit`
            V = 1.0
        elif isinstance(V, list):
            V = np.array(V)

        if isinstance(V, np.ndarray):
            if V.ndim == 1:
                V = np.diag(V)
            elif V.ndim == 2:
                fstr = "V must be symmetric positive definite"
                assert is_symmetric_positive_definite(V), fstr

        self.alpha = alpha
        self.beta = beta
        self.mu = mu
        self.V = V
        self.fit_intercept = fit_intercept

        self.posterior_predictive = None
        self.posterior = None

    def fit(self, X, y):
        """
        Compute the posterior over model parameters from the data in `X` and
        `y`.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A dataset consisting of `N` examples, each of dimension `M`.
        y : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
            The targets for each of the `N` examples in `X`.
            NOTE(review): the quadratic form used for the scale below is
            written for 1D targets; confirm whether `(N, K)` targets are
            meant to be supported.

        Returns
        -------
        self : :class:`BayesianLinearRegressionUnknownVariance<numpy_ml.linear_models.BayesianLinearRegressionUnknownVariance>` instance
        """  # noqa: E501
        # convert X to a design matrix if we're fitting an intercept
        design = np.c_[np.ones(X.shape[0]), X] if self.fit_intercept else X
        n_ex, n_dim = design.shape

        alpha, beta = self.alpha, self.beta

        # expand scalar hyperparameters into local arrays of the right size
        prior_cov = self.V * np.eye(n_dim) if is_number(self.V) else self.V
        prior_mean = self.mu * np.ones(n_dim) if is_number(self.mu) else self.mu

        # sigma
        residual = y - (design @ prior_mean)
        inv_term = np.linalg.inv(design @ prior_cov @ design.T + np.eye(n_ex))

        shape = n_ex + alpha
        sigma = (1 / shape) * (alpha * beta ** 2 + residual @ inv_term @ residual)
        scale = sigma ** 2

        # NOTE(review): scale / (shape - 1) is the mean of an
        # InverseGamma(shape, scale); its mode would be scale / (shape + 1).
        sigma = scale / (shape - 1)

        # mean
        prior_prec = np.linalg.inv(prior_cov)
        post_cov_unscaled = np.linalg.inv(prior_prec + design.T @ design)
        post_mean = post_cov_unscaled @ (prior_prec @ prior_mean + design.T @ y)
        post_cov = post_cov_unscaled * sigma

        # posterior distribution for sigma^2 and b
        self.posterior = {
            "sigma**2": stats.distributions.invgamma(a=shape, scale=scale),
            "b | sigma**2": stats.multivariate_normal(mean=post_mean, cov=post_cov),
        }
        return self

    def predict(self, X):
        """
        Return the MAP prediction for the targets associated with `X`, and
        store the posterior predictive distribution on the instance.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(Z, M)`
            A dataset consisting of `Z` new examples, each of dimension `M`.

        Returns
        -------
        y_pred : :py:class:`ndarray <numpy.ndarray>` of shape `(Z,)`
            The model predictions for the items in `X`.
        """
        # convert X to a design matrix if we're fitting an intercept
        design = np.c_[np.ones(X.shape[0]), X] if self.fit_intercept else X

        coef_rv = self.posterior["b | sigma**2"]
        y_pred = design @ coef_rv.mean
        pred_cov = design @ coef_rv.cov @ design.T + np.eye(design.shape[0])

        # MAP estimate for y corresponds to the mean of the posterior
        # predictive
        self.posterior_predictive = stats.multivariate_normal(y_pred, pred_cov)
        return y_pred


class BayesianLinearRegressionKnownVariance:
    def __init__(self, mu=0, sigma=1, V=None, fit_intercept=True):
        r"""
        Bayesian linear regression model with known error variance and
        conjugate Gaussian prior on model parameters.

        Notes
        -----
        Uses a conjugate Gaussian prior on the model coefficients **b**. The
        posterior over model coefficients is then

        .. math::

            \mathbf{b} \mid \mu, \sigma^2, \mathbf{V}
                \sim \mathcal{N}(\mu, \sigma^2 \mathbf{V})

        Ridge regression is a special case of this model where :math:`\mu =
        \mathbf{0}`, :math:`\sigma = 1` and :math:`\mathbf{V} = \mathbf{I}`
        (i.e., the prior on the model coefficients **b** is a zero-mean, unit
        covariance Gaussian).

        Parameters
        ----------
        mu : :py:class:`ndarray <numpy.ndarray>` of shape `(M,)` or float
            The mean of the Gaussian prior on `b`. If a float, assume `mu` is
            ``np.ones(M) * mu``. Default is 0.
        sigma : float
            The square root of the scaling term for covariance of the Gaussian
            prior on `b`. Default is 1.
        V : :py:class:`ndarray <numpy.ndarray>` of shape `(N,N)` or `(N,)` or None
            A symmetric positive definite matrix that when multiplied
            element-wise by ``sigma ** 2`` gives the covariance matrix for the
            Gaussian prior on `b`. If a list or 1D array, assume
            ``V = diag(V)``. If None, assume `V` is the identity matrix.
            Default is None.
        fit_intercept : bool
            Whether to fit an intercept term in addition to the coefficients in
            `b`. If True, the estimates for `b` will have `M + 1` dimensions, where
            the first dimension corresponds to the intercept. Default is True.

        Attributes
        ----------
        posterior : dict
            Empty until :meth:`fit` is called; afterwards the key ``"b"``
            holds the frozen random variable for the posterior distribution
            :math:`P(b \mid X, \sigma^2)`.
        posterior_predictive : dict or frozen random variable
            Empty dict until :meth:`predict <numpy_ml.linear_models.BayesianLinearRegressionKnownVariance.predict>`
            is called; afterwards the frozen random variable for the
            posterior predictive distribution, :math:`P(y \mid X)`.
        """  # noqa: E501
        # this is a placeholder until we know the dimensions of X
        V = 1.0 if V is None else V

        if isinstance(V, list):
            V = np.array(V)

        if isinstance(V, np.ndarray):
            if V.ndim == 1:
                V = np.diag(V)
            elif V.ndim == 2:
                fstr = "V must be symmetric positive definite"
                assert is_symmetric_positive_definite(V), fstr

        self.posterior = {}
        self.posterior_predictive = {}

        self.V = V
        self.mu = mu
        self.sigma = sigma
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        """
        Compute the posterior over model parameters using the data in `X` and
        `y`.

        Scalar hyperparameters `V` and `mu` are expanded to the dimension of
        `X` in local variables only, so the stored hyperparameters are left
        untouched and the model can be re-fit on data with a different number
        of features.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A dataset consisting of `N` examples, each of dimension `M`.
        y : :py:class:`ndarray <numpy.ndarray>` of shape `(N, K)`
            The targets for each of the `N` examples in `X`, where each target
            has dimension `K`.

        Returns
        -------
        self : :class:`BayesianLinearRegressionKnownVariance<numpy_ml.linear_models.BayesianLinearRegressionKnownVariance>` instance
        """  # noqa: E501
        # convert X to a design matrix if we're fitting an intercept
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]

        M = X.shape[1]

        V = self.V * np.eye(M) if is_number(self.V) else self.V
        mu = self.mu * np.ones(M) if is_number(self.mu) else self.mu
        sigma = self.sigma

        V_inv = np.linalg.inv(V)
        L = np.linalg.inv(V_inv + X.T @ X)
        R = V_inv @ mu + X.T @ y

        mu = L @ R
        cov = L * sigma ** 2

        # posterior distribution over b conditioned on sigma
        self.posterior["b"] = stats.multivariate_normal(mu, cov)
        return self

    def predict(self, X):
        """
        Return the MAP prediction for the targets associated with `X`.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(Z, M)`
            A dataset consisting of `Z` new examples, each of dimension `M`.

        Returns
        -------
        y_pred : :py:class:`ndarray <numpy.ndarray>` of shape `(Z, K)`
            The MAP predictions for the targets associated with the items in
            `X`.
        """
        # convert X to a design matrix if we're fitting an intercept
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]

        I = np.eye(X.shape[0])  # noqa: E741
        mu = X @ self.posterior["b"].mean
        cov = X @ self.posterior["b"].cov @ X.T + I

        # MAP estimate for y corresponds to the mean/mode of the gaussian
        # posterior predictive distribution
        self.posterior_predictive = stats.multivariate_normal(mu, cov)
        return mu


================================================
FILE: numpy_ml/linear_models/glm.py
================================================
"""A module for the generalized linear model."""
import numpy as np

from numpy_ml.linear_models.linear_regression import LinearRegression

eps = np.finfo(float).eps

_GLM_LINKS = {
    "logit": {
        "link": lambda mu: np.log((mu + eps) / (1 - mu + eps)),
        "inv_link": lambda eta: 1.0 / (1.0 + np.exp(-eta)),
        "link_prime": lambda x: (1 / (x + eps)) + (1 / (1 - x + eps)),
        "theta": lambda mu: np.log((mu + eps) / (1 - mu + eps)),
        "phi": lambda x: np.ones(x.shape[0]),
        "a": lambda phi: phi,
        "b": lambda theta: np.log(1 + np.exp(theta)),
        "p": 1,
        "b_prime": lambda theta: np.exp(theta) / (1 + np.exp(theta)),
        "b_prime2": lambda theta: np.exp(theta) / ((1 + np.exp(theta)) ** 2),
    },
    "identity": {
        "link": lambda mu: mu,
        "inv_link": lambda eta: eta,
        "link_prime": lambda x: np.ones_like(x),
        "theta": lambda mu: mu,
        "phi": lambda x: np.var(x, axis=0),
        "a": lambda phi: phi,
        "b": lambda theta: 0.5 * theta ** 2,
        "p": 1,
        "b_prime": lambda theta: theta,
        "b_prime2": lambda theta: np.ones_like(theta),
    },
    "log": {
        "link": lambda mu: np.log(mu + eps),
        "inv_link": lambda eta: np.exp(eta),
        "link_prime": lambda x: 1 / (x + eps),
        "theta": lambda mu: np.log(mu + eps),
        "phi": lambda x: np.ones(x.shape[0]),
        "a": lambda phi: phi,
        "p": 1,
        "b": lambda theta: np.exp(theta),
        "b_prime": lambda theta: np.exp(theta),
        "b_prime2": lambda theta: np.exp(theta),
    },
}


class GeneralizedLinearModel:
    def __init__(self, link, fit_intercept=True, tol=1e-5, max_iter=100):
        r"""
        A generalized linear model with maximum likelihood fit via
        iteratively reweighted least squares (IRLS).

        Notes
        -----
        The generalized linear model (GLM) [7]_ [8]_ assumes that each target/dependent
        variable :math:`y_i` in target vector :math:`\mathbf{y} = (y_1, \ldots,
        y_n)`, has been drawn independently from a pre-specified distribution
        in the exponential family [11]_ with unknown mean :math:`\mu_i`. The GLM
        models a (one-to-one, continuous, differentiable) function, *g*, of
        this mean value as a linear combination of the model parameters
        :math:`\mathbf{b}` and observed covariates, :math:`\mathbf{x}_i`:

        .. math::

            g(\mathbb{E}[y_i \mid \mathbf{x}_i]) =
                g(\mu_i) = \mathbf{b}^\top \mathbf{x}_i

        where *g* is known as the "link function" associated with the GLM.  The
        choice of link function is informed by the instance of the exponential
        family the target is drawn from. Common examples:

        .. csv-table::
           :header: "Distribution", "Link", "Formula"
           :widths: 25, 20, 30

           "Normal", "Identity", ":math:`g(x) = x`"
           "Bernoulli", "Logit", ":math:`g(x) = \log(x) - \log(1 - x)`"
           "Binomial", "Logit", ":math:`g(x) = \log(x) - \log(n - x)`"
           "Poisson", "Log", ":math:`g(x) = \log(x)`"

        An iteratively re-weighted least squares (IRLS) algorithm [9]_ can be
        employed to find the maximum likelihood estimate for the model
        parameters :math:`\mathbf{b}` in any instance of the generalized
        linear model. IRLS is equivalent to Fisher scoring [10]_, which itself
        is a slight modification of classic Newton-Raphson for finding the
        zeros of the first derivative of the model log-likelihood.

        References
        ----------
        .. [7] Nelder, J., & Wedderburn, R. (1972). Generalized linear
               models. *Journal of the Royal Statistical Society, Series A
               (General), 135(3)*: 370–384.
        .. [8] https://en.wikipedia.org/wiki/Generalized_linear_model
        .. [9] https://en.wikipedia.org/wiki/Iteratively_reweighted_least_squares
        .. [10] https://en.wikipedia.org/wiki/Scoring_algorithm
        .. [11] https://en.wikipedia.org/wiki/Exponential_family

        Parameters
        ----------
        link: {'identity', 'logit', 'log'}
            The link function to use during modeling.
        fit_intercept: bool
            Whether to fit an intercept term in addition to the model
            coefficients. Default is True.
        tol : float
            Convergence tolerance. IRLS stops once the L1 distance between the
            coefficient vectors of two successive iterations is no greater
            than ``tol * M``, where `M` is the number of columns in the
            training data. Default is 1e-5.
        max_iter: int
            The maximum number of iteratively reweighted least squares
            iterations to run during fitting. At least one iteration is
            always performed. Default is 100.

        Attributes
        ----------
        beta : :py:class:`ndarray <numpy.ndarray>` of shape `(M,)` or None
            Fitted model coefficients. If `fit_intercept` is True the vector
            has `M + 1` entries and the first one is the intercept. None until
            :meth:`fit` has been called.
        """
        err_str = f"Valid link functions are {list(_GLM_LINKS.keys())} but got {link}"
        assert link in _GLM_LINKS, err_str

        self._is_fit = False

        self.tol = tol
        self.link = link
        self.beta = None
        self.max_iter = max_iter
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        """
        Find the maximum likelihood GLM coefficients via IRLS.

        Notes
        -----
        If the coefficients have not converged after `max_iter` iterations a
        warning is printed and the most recent estimate is kept.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A dataset consisting of `N` examples, each of dimension `M`.
        y : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
            The targets for each of the `N` examples in `X`.

        Returns
        -------
        self : :class:`GeneralizedLinearModel <numpy_ml.linear_models.GeneralizedLinearModel>` instance
        """  # noqa: E501
        y = np.squeeze(y)
        assert y.ndim == 1

        N, M = X.shape
        L = _GLM_LINKS[self.link]

        # starting values for parameters: every mean starts at the grand mean
        mu = np.ones_like(y) * np.mean(y)
        eta = L["link"](mu)
        theta = L["theta"](mu)

        # convert X to a design matrix if we're fitting an intercept
        if self.fit_intercept:
            X = np.c_[np.ones(N), X]

        # the intercept is already part of X, so the inner solver never adds
        # one; `fit` overwrites all solver state, so one instance is reused
        wlr = LinearRegression(fit_intercept=False)

        threshold = self.tol * M
        # always take at least one step so that `beta` is a real estimate
        max_iter = max(self.max_iter, 1)

        # IRLS for GLM
        n_iter = 0
        converged = False
        beta = np.inf
        while n_iter < max_iter:
            # compute first-order Taylor approx. (the "working response")
            z = eta + (y - mu) * L["link_prime"](mu)
            w = L["p"] / (L["b_prime2"](theta) * L["link_prime"](mu) ** 2)

            # perform weighted least-squares on z
            beta_new = wlr.fit(X, z, weights=w).beta.ravel()

            eta = X @ beta_new
            mu = L["inv_link"](eta)
            theta = L["theta"](mu)

            diff = np.linalg.norm(beta - beta_new, ord=1)
            beta = beta_new
            n_iter += 1

            # written as `not >` so that a NaN difference also ends the loop
            if not diff > threshold:
                converged = True
                break

        if not converged:
            print("Warning: Model did not converge")

        self.beta = beta
        self._is_fit = True
        return self

    def predict(self, X):
        r"""
        Use the trained model to generate predictions for the distribution
        means, :math:`\mu`, associated with the collection of data points in
        **X**.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(Z, M)`
            A dataset consisting of `Z` new examples, each of dimension `M`.

        Returns
        -------
        mu_pred : :py:class:`ndarray <numpy.ndarray>` of shape `(Z,)`
            The model predictions for the expected value of the target
            associated with each item in `X`.
        """
        assert self._is_fit, "Must call `fit` before generating predictions"
        L = _GLM_LINKS[self.link]

        # convert X to a design matrix if we're using an intercept
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]

        # map the linear predictor back to the mean scale
        mu_pred = L["inv_link"](X @ self.beta)
        return mu_pred.ravel()


================================================
FILE: numpy_ml/linear_models/linear_regression.py
================================================
"""Linear regression module."""

import numpy as np


class LinearRegression:
    def __init__(self, fit_intercept=True):
        r"""
        A weighted linear least-squares regression model.

        Notes
        -----
        In weighted linear least-squares regression [1]_, a real-valued target
        vector, **y**, is modeled as a linear combination of covariates, **X**,
        and model coefficients, :math:`\beta`:

        .. math::

            y_i = \beta^\top \mathbf{x}_i + \epsilon_i

        In this equation :math:`\epsilon_i \sim \mathcal{N}(0, \sigma^2_i)` is
        the error term associated with example :math:`i`, and
        :math:`\sigma^2_i` is the variance of the corresponding example.

        Under this model, the maximum-likelihood estimate for the regression
        coefficients, :math:`\beta`, is:

        .. math::

            \hat{\beta} = \Sigma^{-1} \mathbf{X}^\top \mathbf{Wy}

        where :math:`\Sigma^{-1} = (\mathbf{X}^\top \mathbf{WX})^{-1}` and
        **W** is a diagonal matrix of weights, with each entry inversely
        proportional to the variance of the corresponding measurement. When
        **W** is the identity matrix the examples are weighted equally and the
        model reduces to standard linear least squares [2]_.

        References
        ----------
        .. [1] https://en.wikipedia.org/wiki/Weighted_least_squares
        .. [2] https://en.wikipedia.org/wiki/General_linear_model

        Parameters
        ----------
        fit_intercept : bool
            Whether to fit an intercept term in addition to the model
            coefficients. Default is True.

        Attributes
        ----------
        beta : :py:class:`ndarray <numpy.ndarray>` of shape `(M, K)` or None
            Fitted model coefficients. Has `M + 1` rows, the first being the
            intercept, when `fit_intercept` is True.
        sigma_inv : :py:class:`ndarray <numpy.ndarray>` of shape `(M, M)` or None
            (Pseudo-)inverse of the weighted Gram matrix
            :math:`\mathbf{X}^\top \mathbf{WX}`; `(M + 1, M + 1)` when
            `fit_intercept` is True.
        """
        self.beta = None
        self.sigma_inv = None
        self.fit_intercept = fit_intercept

        self._is_fit = False

    @staticmethod
    def _sqrt_weights(weights, n_examples):
        """
        Validate the example weights and return their element-wise square
        root as an array of shape `(n_examples,)`. Missing weights default to
        one per example.
        """
        weights = np.ones(n_examples) if weights is None else np.atleast_1d(weights)
        weights = np.squeeze(weights) if weights.size > 1 else weights

        err_str = f"weights must have shape ({n_examples},) but got {weights.shape}"
        assert weights.shape == (n_examples,), err_str
        return np.sqrt(weights)

    @staticmethod
    def _check_targets(y, n_examples):
        """Ensure the 2D target array `y` has one row per example."""
        if y.ndim != 2 or y.shape[0] != n_examples:
            raise ValueError(
                f"y must have shape ({n_examples}, K) but got {y.shape}"
            )
        return y

    @staticmethod
    def _intercept_column(w):
        """
        Return the (weight-scaled) intercept column for a design matrix.

        `w` is normally the 1D vector of square-rooted example weights. A 2D
        diagonal weight matrix is accepted as well, in which case its diagonal
        is used.
        """
        w = np.asarray(w)
        return np.diag(w) if w.ndim == 2 else w.ravel()

    def update(self, X, y, weights=None):
        r"""
        Incrementally update the linear least-squares coefficients for a set of
        new examples.

        Notes
        -----
        The recursive least-squares algorithm [3]_ [4]_ is used to efficiently
        update the regression parameters as new examples become available. For
        a single new example :math:`(\mathbf{x}_{t+1}, \mathbf{y}_{t+1})`, the
        parameter updates are

        .. math::

            \beta_{t+1} = \left(
                \mathbf{X}_{1:t}^\top \mathbf{X}_{1:t} +
                    \mathbf{x}_{t+1}\mathbf{x}_{t+1}^\top \right)^{-1}
                \left(
                    \mathbf{X}_{1:t}^\top \mathbf{Y}_{1:t} +
                        \mathbf{x}_{t+1} \mathbf{y}_{t+1}^\top
                \right)

        where :math:`\beta_{t+1}` are the updated regression coefficients,
        :math:`\mathbf{X}_{1:t}` and :math:`\mathbf{Y}_{1:t}` are the set of
        examples observed from timestep 1 to *t*.

        In the single-example case, the RLS algorithm uses the Sherman-Morrison
        formula [5]_ to avoid re-inverting the covariance matrix on each new
        update. In the multi-example case (i.e., where :math:`\mathbf{X}_{t+1}`
        and :math:`\mathbf{y}_{t+1}` are matrices of `N` examples each), we use
        the generalized Woodbury matrix identity [6]_ to update the inverse
        covariance. This comes at a performance cost, but is still more
        performant than doing multiple single-example updates if *N* is large.

        References
        ----------
        .. [3] Gauss, C. F. (1821) *Theoria combinationis observationum
           erroribus minimis obnoxiae*, Werke, 4. Gottinge
        .. [4] https://en.wikipedia.org/wiki/Recursive_least_squares_filter
        .. [5] https://en.wikipedia.org/wiki/Sherman%E2%80%93Morrison_formula
        .. [6] https://en.wikipedia.org/wiki/Woodbury_matrix_identity

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A dataset consisting of `N` examples, each of dimension `M`. A 1D
            array is treated as a single example.
        y : :py:class:`ndarray <numpy.ndarray>` of shape `(N, K)`
            The targets for each of the `N` examples in `X`, where each target
            has dimension `K`. A 1D array is treated as the `K` targets of the
            one example when `N = 1`, and as one scalar target per example
            otherwise.
        weights : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)` or None
            Weights associated with the examples in `X`. Examples
            with larger weights exert greater influence on model fit.  When
            `y` is a vector (i.e., `K = 1`), weights should be set to the
            reciprocal of the variance for each measurement (i.e., :math:`w_i
            = 1/\sigma^2_i`). When `K > 1`, it is assumed that all columns of
            `y` share the same weight :math:`w_i`. If None, examples are
            weighted equally, resulting in the standard linear least squares
            update.  Default is None.

        Returns
        -------
        self : :class:`LinearRegression <numpy_ml.linear_models.LinearRegression>` instance

        Raises
        ------
        RuntimeError
            If the model has not been fit yet.
        ValueError
            If `y` does not have one row per example in `X`.
        """  # noqa: E501
        if not self._is_fit:
            raise RuntimeError("You must call the `fit` method before calling `update`")

        X = np.atleast_2d(X)
        X1 = X.shape[0]

        y = np.asarray(y)
        if y.ndim == 1 and X1 > 1:
            y = y[:, None]  # one scalar target per example
        else:
            y = np.atleast_2d(y)
        y = self._check_targets(y, X1)

        sqrt_w = self._sqrt_weights(weights, X1)

        # scale X and y by the weight associated with each example; row-wise
        # broadcasting is equivalent to multiplying by diag(sqrt_w) without
        # materializing an N x N matrix
        X, y = sqrt_w[:, None] * X, sqrt_w[:, None] * y

        if X1 == 1:
            self._update1D(X, y, sqrt_w)
        else:
            self._update2D(X, y, sqrt_w)
        return self

    def _update1D(self, x, y, w):
        """
        Sherman-Morrison update for a single (already weight-scaled) example.

        `self.sigma_inv` and `self.beta` are modified in place.
        """
        beta, S_inv = self.beta, self.sigma_inv

        # convert x to a design vector if we're fitting an intercept
        if self.fit_intercept:
            x = np.c_[self._intercept_column(w), x]

        # update the inverse of the covariance matrix via Sherman-Morrison
        S_inv -= (S_inv @ x.T @ x @ S_inv) / (1 + x @ S_inv @ x.T)

        # update the model coefficients
        beta += S_inv @ x.T @ (y - x @ beta)

    def _update2D(self, X, y, W):
        """
        Woodbury update for multiple (already weight-scaled) examples.

        `self.sigma_inv` and `self.beta` are modified in place.
        """
        beta, S_inv = self.beta, self.sigma_inv

        # convert X to a design matrix if we're fitting an intercept
        if self.fit_intercept:
            X = np.c_[self._intercept_column(W), X]

        I = np.eye(X.shape[0])  # noqa: E741

        # update the inverse of the covariance matrix via Woodbury identity
        S_inv -= S_inv @ X.T @ np.linalg.pinv(I + X @ S_inv @ X.T) @ X @ S_inv

        # update the model coefficients
        beta += S_inv @ X.T @ (y - X @ beta)

    def fit(self, X, y, weights=None):
        r"""
        Fit regression coefficients via maximum likelihood.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A dataset consisting of `N` examples, each of dimension `M`. A 1D
            array of length `N` is treated as `N` examples of a single
            feature.
        y : :py:class:`ndarray <numpy.ndarray>` of shape `(N, K)`
            The targets for each of the `N` examples in `X`, where each target
            has dimension `K`. A 1D array of length `N` is treated as a single
            target column (`K = 1`).
        weights : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)` or None
            Weights associated with the examples in `X`. Examples
            with larger weights exert greater influence on model fit.  When
            `y` is a vector (i.e., `K = 1`), weights should be set to the
            reciprocal of the variance for each measurement (i.e., :math:`w_i
            = 1/\sigma^2_i`). When `K > 1`, it is assumed that all columns of
            `y` share the same weight :math:`w_i`. If None, examples are
            weighted equally, resulting in the standard linear least squares
            update.  Default is None.

        Returns
        -------
        self : :class:`LinearRegression <numpy_ml.linear_models.LinearRegression>` instance

        Raises
        ------
        ValueError
            If `y` does not have one row per example in `X`.
        """  # noqa: E501
        if np.ndim(X) == 1:
            X = np.reshape(X, (-1, 1))
        N = X.shape[0]

        sqrt_w = self._sqrt_weights(weights, N)

        # promote vector targets to a column so that `beta` is always (M, K)
        y = np.asarray(y)
        if y.ndim == 1:
            y = y[:, None]
        y = self._check_targets(y, N)

        # scale X and y by the weight associated with each example; row-wise
        # broadcasting is equivalent to multiplying by diag(sqrt_w) without
        # materializing an N x N matrix
        X, y = sqrt_w[:, None] * X, sqrt_w[:, None] * y

        # convert X to a design matrix if we're fitting an intercept (the
        # column of ones is scaled by the weights like every other column)
        if self.fit_intercept:
            X = np.c_[sqrt_w, X]

        self.sigma_inv = np.linalg.pinv(X.T @ X)
        self.beta = self.sigma_inv @ X.T @ y

        self._is_fit = True
        return self

    def predict(self, X):
        """
        Use the trained model to generate predictions on a new collection of
        data points.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(Z, M)`
            A dataset consisting of `Z` new examples, each of dimension `M`.

        Returns
        -------
        y_pred : :py:class:`ndarray <numpy.ndarray>` of shape `(Z, K)`
            The model predictions for the items in `X`.

        Raises
        ------
        RuntimeError
            If no coefficients are available, i.e. `fit` was never called.
        """
        if self.beta is None:
            raise RuntimeError("You must call the `fit` method before calling `predict`")

        # convert X to a design matrix if we're fitting an intercept
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]
        return X @ self.beta


================================================
FILE: numpy_ml/linear_models/logistic.py
================================================
"""Logistic regression module"""
import numpy as np


class LogisticRegression:
    def __init__(self, penalty="l2", gamma=0, fit_intercept=True):
        r"""
        A simple binary logistic regression model fit via gradient descent on
        the penalized negative log likelihood.

        Notes
        -----
        In simple binary logistic regression, the entries in a binary target
        vector :math:`\mathbf{y} = (y_1, \ldots, y_N)` are assumed to have been
        drawn from a series of independent Bernoulli random variables with
        expected values :math:`p_1, \ldots, p_N`. The binary logistic regression
        model models the logit of these unknown mean parameters as a linear
        function of the model coefficients, :math:`\mathbf{b}`, and the
        covariates for the corresponding example, :math:`\mathbf{x}_i`:

        .. math::

            \text{Logit}(p_i) =
                \log \left( \frac{p_i}{1 - p_i} \right) = \mathbf{b}^\top\mathbf{x}_i

        The model predictions :math:`\hat{\mathbf{y}}` are the expected values
        of the Bernoulli parameters for each example:

        .. math::

            \hat{y}_i =
                \mathbb{E}[y_i \mid \mathbf{x}_i] = \sigma(\mathbf{b}^\top \mathbf{x}_i)

        where :math:`\sigma` is the logistic sigmoid function :math:`\sigma(x)
        = \frac{1}{1 + e^{-x}}`. Under this model, the (penalized) negative log
        likelihood of the targets **y** is

        .. math::

            - \log \mathcal{L}(\mathbf{b}, \mathbf{y}) = -\frac{1}{N} \left[
                \left(
                    \sum_{i=1}^N y_i \log(\hat{y}_i) +
                      (1-y_i) \log(1-\hat{y}_i)
                \right) - R(\mathbf{b}, \gamma)
            \right]

        where

        .. math::

            R(\mathbf{b}, \gamma) = \left\{
                \begin{array}{lr}
                    \frac{\gamma}{2} ||\mathbf{b}||_2^2 & :\texttt{ penalty = 'l2'}\\
                    \gamma ||\mathbf{b}||_1 & :\texttt{ penalty = 'l1'}
                \end{array}
                \right.

        is a regularization penalty, :math:`\gamma` is a regularization weight,
        `N` is the number of examples in **y**, :math:`\hat{y}_i` is the model
        prediction on example *i*, and **b** is the vector of model
        coefficients.

        Parameters
        ----------
        penalty : {'l1', 'l2'}
            The type of regularization penalty to apply on the coefficients
            `beta`. Default is 'l2'.
        gamma : float
            The regularization weight. Larger values correspond to larger
            regularization penalties, and a value of 0 indicates no penalty.
            Default is 0.
        fit_intercept : bool
            Whether to fit an intercept term in addition to the coefficients in
            b. If True, the estimates for `beta` will have `M + 1` dimensions,
            where the first dimension corresponds to the intercept. Default is
            True.

        Attributes
        ----------
        beta : :py:class:`ndarray <numpy.ndarray>` of shape `(M,)` or None
            Fitted model coefficients (`(M + 1,)` when `fit_intercept` is
            True). None until :meth:`fit` has been called.
        """
        err_msg = "penalty must be 'l1' or 'l2', but got: {}".format(penalty)
        assert penalty in ["l2", "l1"], err_msg
        self.beta = None
        self.gamma = gamma
        self.penalty = penalty
        self.fit_intercept = fit_intercept

    def fit(self, X, y, lr=0.01, tol=1e-7, max_iter=1e7):
        """
        Fit the regression coefficients via gradient descent on the negative
        log likelihood.

        Notes
        -----
        The coefficients are initialized with draws from ``np.random.rand``,
        so results depend on NumPy's global random state.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A dataset consisting of `N` examples, each of dimension `M`.
        y : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
            The binary targets for each of the `N` examples in `X`.
        lr : float
            The gradient descent learning rate. Default is 0.01.
        tol : float
            Convergence tolerance. Gradient descent stops as soon as the loss
            decreases by less than `tol` between two successive iterations
            (which includes the case where the loss increases). Default is
            1e-7.
        max_iter : float
            The maximum number of iterations to run the gradient descent
            solver. Default is 1e7.

        Returns
        -------
        self : :class:`LogisticRegression <numpy_ml.linear_models.LogisticRegression>` instance
        """  # noqa: E501
        # convert X to a design matrix if we're fitting an intercept
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]

        l_prev = np.inf
        self.beta = np.random.rand(X.shape[1])
        for _ in range(int(max_iter)):
            y_pred = _sigmoid(X @ self.beta)
            loss = self._NLL(X, y, y_pred)
            if l_prev - loss < tol:
                break
            l_prev = loss
            self.beta -= lr * self._NLL_grad(X, y, y_pred)
        return self

    def _NLL(self, X, y, y_pred):
        r"""
        Penalized negative log likelihood of the targets under the current
        model.

        .. math::

            \text{NLL} = -\frac{1}{N} \left[
                \left(
                    \sum_{i=1}^N y_i \log(\hat{y}_i) + (1-y_i) \log(1-\hat{y}_i)
                \right) - R(\mathbf{b}, \gamma)
            \right]
        """
        N = X.shape[0]
        beta, gamma = self.beta, self.gamma
        order = 2 if self.penalty == "l2" else 1
        norm_beta = np.linalg.norm(beta, ord=order)

        # only the matching log term contributes for each (binary) target
        nll = -np.log(y_pred[y == 1]).sum() - np.log(1 - y_pred[y == 0]).sum()
        penalty = (gamma / 2) * norm_beta ** 2 if order == 2 else gamma * norm_beta
        return (penalty + nll) / N

    def _NLL_grad(self, X, y, y_pred):
        r"""
        Gradient of the penalized negative log likelihood wrt beta.

        .. math::

            \nabla_{\mathbf{b}} \text{NLL} = \frac{1}{N} \left[
                \nabla_{\mathbf{b}} R(\mathbf{b}, \gamma) -
                    \mathbf{X}^\top (\mathbf{y} - \hat{\mathbf{y}})
            \right]
        """
        N = X.shape[0]
        p, beta, gamma = self.penalty, self.beta, self.gamma
        d_penalty = gamma * beta if p == "l2" else gamma * np.sign(beta)
        # the penalty is *added* to the NLL, so its gradient enters with a
        # positive sign while the data term enters with a negative one
        return (d_penalty - (y - y_pred) @ X) / N

    def predict(self, X):
        """
        Use the trained model to generate prediction probabilities on a new
        collection of data points.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(Z, M)`
            A dataset consisting of `Z` new examples, each of dimension `M`.

        Returns
        -------
        y_pred : :py:class:`ndarray <numpy.ndarray>` of shape `(Z,)`
            The model prediction probabilities for the items in `X`.
        """
        # convert X to a design matrix if we're fitting an intercept
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]
        return _sigmoid(X @ self.beta)


def _sigmoid(x):
    """
    The logistic sigmoid function, ``1 / (1 + exp(-x))``.

    Evaluated as ``exp(-log(1 + exp(-x)))`` via ``np.logaddexp`` so that very
    negative inputs do not overflow inside ``exp``.
    """
    return np.exp(-np.logaddexp(0, -x))


================================================
FILE: numpy_ml/linear_models/naive_bayes.py
================================================
"""A module for naive Bayes classifiers"""
import numpy as np


class GaussianNBClassifier:
    def __init__(self, eps=1e-6):
        r"""
        A naive Bayes classifier for real-valued data.

        Notes
        -----
        Naive Bayes treats the individual features of an example
        :math:`\mathbf{x}` as mutually independent once the example's label
        *y* is known:

        .. math::

            P(\mathbf{x}_i \mid y_i) = \prod_{j=1}^M P(x_{i,j} \mid y_i)

        Here :math:`M` is the number of features in the :math:`i^{th}` example
        :math:`\mathbf{x}_i`, and :math:`y_i` is that example's label.

        Plugging this conditional independence assumption into Bayes' theorem
        yields the naive Bayes classification rule:

        .. math::

            \hat{y} &= \arg \max_y P(y \mid \mathbf{x}) \\
                    &= \arg \max_y  P(y) P(\mathbf{x} \mid y) \\
                    &= \arg \max_y  P(y) \prod_{j=1}^M P(x_j \mid y)

        The class prior :math:`P(y)` in the last line may either be fixed
        ahead of time or estimated from label frequencies in the training
        data; this implementation does the latter.

        The Gaussian flavor of the model takes the per-class feature
        likelihood to be normal:

        .. math::

            \mathbf{x}_i \mid y_i = c, \theta \sim \mathcal{N}(\mu_c, \Sigma_c)

        with parameter set :math:`\theta = \{\mu_1, \Sigma_1, \ldots, \mu_K,
        \Sigma_K\}`, where :math:`K` is the number of distinct classes seen
        during training. For each class :math:`c` (:math:`1 \leq c \leq K`),
        :math:`\mu_c` and :math:`\Sigma_c` are the maximum likelihood
        estimates computed from the training examples carrying label
        :math:`c`.

        Parameters
        ----------
        eps : float
            A value added to the variance to prevent numerical error. Default
            is 1e-6.

        Attributes
        ----------
        parameters : dict
            Dictionary of model parameters: "mean", the `(K, M)` array of
            feature means under each class, "sigma", the `(K, M)` array of
            feature variances under each class, and "prior", the `(K,)` array of
            empirical prior probabilities for each class label.
        hyperparameters : dict
            Dictionary of model hyperparameters
        labels : :py:class:`ndarray <numpy.ndarray>` of shape `(K,)`
            An array containing the unique class labels for the training
            examples.
        """
        self.labels = None
        self.hyperparameters = {"eps": eps}
        # every entry is filled in by `fit`
        self.parameters = {
            "mean": None,  # shape: (K, M)
            "sigma": None,  # shape: (K, M)
            "prior": None,  # shape: (K,)
        }

    def fit(self, X, y):
        """
        Estimate the per-class Gaussian parameters and class priors via
        maximum likelihood.

        Notes
        -----
        Results are written to the :py:attr:`parameters
        <numpy_ml.linear_models.GaussianNBClassifier.parameters>` attribute
        under the following keys:

            "mean": :py:class:`ndarray <numpy.ndarray>` of shape `(K, M)`
                Feature means for each of the `K` label classes
            "sigma": :py:class:`ndarray <numpy.ndarray>` of shape `(K, M)`
                Feature variances (plus `eps`) for each of the `K` label
                classes
            "prior": :py:class:`ndarray <numpy.ndarray>` of shape `(K,)`
                Prior probability of each of the `K` label classes, estimated
                empirically from the training data

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A dataset consisting of `N` examples, each of dimension `M`
        y: :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
            The class label for each of the `N` examples in `X`

        Returns
        -------
        self : :class:`GaussianNBClassifier <numpy_ml.linear_models.GaussianNBClassifier>` instance
        """  # noqa: E501
        params = self.parameters
        var_floor = self.hyperparameters["eps"]

        self.labels = np.unique(y)

        n_examples, n_features = X.shape
        n_classes = len(self.labels)

        params["mean"] = means = np.zeros((n_classes, n_features))
        params["sigma"] = variances = np.zeros((n_classes, n_features))
        params["prior"] = priors = np.zeros((n_classes,))

        for row, label in enumerate(self.labels):
            # all training examples that carry the current label
            members = X[y == label, :]

            means[row, :] = members.mean(axis=0)
            variances[row, :] = members.var(axis=0) + var_floor
            priors[row] = members.shape[0] / n_examples
        return self

    def predict(self, X):
        """
        Assign each example in **X** the label whose unnormalized log
        posterior is largest.

        Parameters
        ----------
        X: :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A dataset of `N` examples, each of dimension `M`

        Returns
        -------
        labels : :py:class:`ndarray <numpy.ndarray>` of shape `(N)`
            The predicted class labels for each example in `X`
        """
        best_class = self._log_posterior(X).argmax(axis=1)
        return self.labels[best_class]

    def _log_posterior(self, X):
        r"""
        Evaluate the (unnormalized) log posterior of every class for every
        example.

        Parameters
        ----------
        X: :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A dataset of `N` examples, each of dimension `M`

        Returns
        -------
        log_posterior : :py:class:`ndarray <numpy.ndarray>` of shape `(N, K)`
            Unnormalized log posterior probability of each class for each
            example in `X`
        """
        n_classes = len(self.labels)
        scores = np.zeros((X.shape[0], n_classes))
        # one column per class, in the order the classes appear in `labels`
        for col in range(n_classes):
            scores[:, col] = self._log_class_posterior(X, col)
        return scores

    def _log_class_posterior(self, X, class_idx):
        r"""
        Evaluate the (unnormalized) log posterior of the label stored at index
        `class_idx` of :py:attr:`labels <numpy_ml.linear_models.GaussianNBClassifier.labels>`.

        Notes
        -----
        For example :math:`\mathbf{x}_i` and class :math:`c`, the unnormalized
        log posterior is

        .. math::

            \log P(y_i = c \mid \mathbf{x}_i, \theta)
                &\propto \log P(y=c \mid \theta) +
                    \log P(\mathbf{x}_i \mid y_i = c, \theta) \\
                &\propto \log P(y=c \mid \theta) +
                    \sum_{j=1}^M \log P(x_j \mid y_i = c, \theta)

        Under the Gaussian naive Bayes model the feature likelihood for class
        :math:`c`, :math:`P(\mathbf{x}_i \mid y_i = c, \theta)`, is taken to
        be normal:

        .. math::

            \mathbf{x}_i \mid y_i = c, \theta \sim \mathcal{N}(\mu_c, \Sigma_c)

        Parameters
        ----------
        X: :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A dataset of `N` examples, each of dimension `M`
        class_idx : int
            The index of the current class in :py:attr:`labels`

        Returns
        -------
        log_class_posterior : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
            Unnormalized log probability of the label at index `class_idx`
            in :py:attr:`labels <numpy_ml.linear_models.GaussianNBClassifier.labels>`
            for each example in `X`
        """  # noqa: E501
        params = self.parameters
        class_mean = params["mean"][class_idx]
        class_var = params["sigma"][class_idx]
        class_prior = params["prior"][class_idx]

        # log N(x | mean, var) summed over independent features: a
        # normalization constant shared by all examples ...
        log_norm = -0.5 * np.log(2 * np.pi * class_var).sum()
        # ... minus half the variance-scaled squared distance to the mean
        scaled_sq_dist = (((X - class_mean) ** 2) / class_var).sum(axis=1)

        log_likelihood = log_norm - 0.5 * scaled_sq_dist
        return log_likelihood + np.log(class_prior)


================================================
FILE: numpy_ml/linear_models/ridge.py
================================================
"""Ridge regression module"""

import numpy as np


class RidgeRegression:
    def __init__(self, alpha=1, fit_intercept=True):
        r"""
        A ridge regression model fit in closed form via the (regularized)
        normal equations.

        Notes
        -----
        Ridge regression is a biased estimator for linear models which adds an
        additional penalty proportional to the L2-norm of the model
        coefficients to the standard mean-squared-error loss:

        .. math::

            \mathcal{L}_{Ridge} = (\mathbf{y} - \mathbf{X} \beta)^\top
                (\mathbf{y} - \mathbf{X} \beta) + \alpha ||\beta||_2^2

        where :math:`\alpha` is a weight controlling the severity of the
        penalty.

        Given data matrix **X** and target vector **y**, the estimate for the
        ridge coefficients, :math:`\beta`, is:

        .. math::

            \hat{\beta} =
                \left(\mathbf{X}^\top \mathbf{X} + \alpha \mathbf{I} \right)^{-1}
                    \mathbf{X}^\top \mathbf{y}

        It turns out that this estimate for :math:`\beta` also corresponds to
        the MAP estimate if we assume a multivariate Gaussian prior on the
        model coefficients, assuming that the data matrix **X** has been
        standardized and the target values **y** centered at 0:

        .. math::

            \beta \sim \mathcal{N}\left(\mathbf{0}, \frac{1}{2M} \mathbf{I}\right)

        When `fit_intercept` is True, a column of ones is prepended to the data
        matrix and the penalty :math:`\alpha \mathbf{I}` spans every column of
        the resulting design matrix, i.e., the intercept is penalized as well.

        Parameters
        ----------
        alpha : float
            L2 regularization coefficient. Larger values correspond to larger
            penalty on the L2 norm of the model coefficients. Default is 1.
        fit_intercept : bool
            Whether to fit an additional intercept term. Default is True.

        Attributes
        ----------
        beta : :py:class:`ndarray <numpy.ndarray>` of shape `(M, K)` or None
            Fitted model coefficients. None until :meth:`fit` has been called.
        """
        self.beta = None
        self.alpha = alpha
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        """
        Fit the regression coefficients by solving the regularized normal
        equations.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A dataset consisting of `N` examples, each of dimension `M`.
        y : :py:class:`ndarray <numpy.ndarray>` of shape `(N, K)`
            The targets for each of the `N` examples in `X`, where each target
            has dimension `K`.

        Returns
        -------
        self : :class:`RidgeRegression <numpy_ml.linear_models.RidgeRegression>` instance
        """  # noqa: E501
        # convert X to a design matrix if we're fitting an intercept
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]

        # Solve (X^T X + alpha I) beta = X^T y directly. This yields the same
        # coefficients as multiplying by the explicit inverse, but avoids
        # forming the inverse, which is slower and less accurate numerically.
        gram = X.T @ X + self.alpha * np.eye(X.shape[1])
        self.beta = np.linalg.solve(gram, X.T @ y)
        return self

    def predict(self, X):
        """
        Use the trained model to generate predictions on a new collection of
        data points.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(Z, M)`
            A dataset consisting of `Z` new examples, each of dimension `M`.

        Returns
        -------
        y_pred : :py:class:`ndarray <numpy.ndarray>` of shape `(Z, K)`
            The model predictions for the items in `X`.
        """
        # convert X to a design matrix if we're fitting an intercept
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]
        return np.dot(X, self.beta)


================================================
FILE: numpy_ml/neural_nets/README.md
================================================
# Neural network models
This module implements building-blocks for larger neural network models in the
Keras-style. This module does _not_ implement a general autograd system in order
to emphasize conceptual understanding over flexibility.

1. **Activations**. Common activation nonlinearities. Includes:
    - Rectified linear units (ReLU) ([Hahnloser et al., 2000](http://invibe.net/biblio_database_dyva/woda/data/att/6525.file.pdf))
    - Leaky rectified linear units
      ([Maas, Hannun, & Ng, 2013](https://ai.stanford.edu/~amaas/papers/relu_hybrid_icml2013_final.pdf))
    - Exponential linear units (ELU) ([Clevert, Unterthiner, & Hochreiter, 2016](http://arxiv.org/abs/1511.07289))
    - Scaled exponential linear units ([Klambauer, Unterthiner, & Mayr, 2017](https://arxiv.org/pdf/1706.02515.pdf))
    - Softplus units
    - Hard sigmoid units
    - Exponential units
    - Hyperbolic tangent (tanh)
    - Logistic sigmoid
    - Affine

2. **Losses**. Common loss functions. Includes:
    - Squared error
    - Categorical cross entropy
    - VAE Bernoulli loss ([Kingma & Welling, 2014](https://arxiv.org/abs/1312.6114))
    - Wasserstein loss with gradient penalty ([Gulrajani et al., 2017](https://arxiv.org/pdf/1704.00028.pdf))
    - Noise contrastive estimation (NCE) loss ([Gutmann & Hyv&auml;rinen, 2010](https://www.cs.helsinki.fi/u/ahyvarin/papers/Gutmann10AISTATS.pdf); [Mnih & Teh, 2012](https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf))

3. **Wrappers**. Layer wrappers. Includes:
    - Dropout ([Srivastava, et al., 2014](http://www.jmlr.org/papers/volume15/srivastava14a/srivastava14a.pdf))

4. **Layers**. Common layers / layer-wise operations that can be composed to
   create larger neural networks. Includes:
    - Fully-connected
    - Sparse evolutionary ([Mocanu et al., 2018](https://www.nature.com/articles/s41467-018-04316-3))
    - Dot-product attention ([Luong, Pham, & Manning, 2015](https://arxiv.org/pdf/1508.04025.pdf); [Vaswani et al., 2017](https://arxiv.org/pdf/1706.03762.pdf))
    - 1D and 2D convolution (with stride, padding, and dilation) ([van den Oord et al., 2016](https://arxiv.org/pdf/1609.03499.pdf); [Yu & Koltun, 2016](https://arxiv.org/pdf/1511.07122.pdf))
    - 2D "deconvolution" (with stride and padding) ([Zeiler et al., 2010](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf))
    - Restricted Boltzmann machines (with CD-_n_ training) ([Smolensky, 1986](http://stanford.edu/~jlmcc/papers/PDP/Volume%201/Chap6_PDP86.pdf); [Carreira-Perpiñán & Hinton, 2005](http://www.cs.toronto.edu/~fritz/absps/cdmiguel.pdf))
    - Elementwise multiplication
    - Embedding
    - Summation
    - Flattening
    - Softmax
    - Max & average pooling
    - 1D and 2D batch normalization ([Ioffe & Szegedy, 2015](http://proceedings.mlr.press/v37/ioffe15.pdf))
    - 1D and 2D layer normalization ([Ba, Kiros, & Hinton, 2016](https://arxiv.org/pdf/1607.06450.pdf))
    - Recurrent ([Elman, 1990](https://crl.ucsd.edu/~elman/Papers/fsit.pdf))
    - Long short-term memory (LSTM) ([Hochreiter & Schmidhuber, 1997](http://www.bioinf.jku.at/publications/older/2604.pdf))

5. **Optimizers**. Common modifications to stochastic gradient descent.
   Includes:
    - SGD with momentum ([Rumelhart, Hinton, & Williams, 1986](https://www.cs.princeton.edu/courses/archive/spring18/cos495/res/backprop_old.pdf))
    - AdaGrad ([Duchi, Hazan, & Singer, 2011](http://jmlr.org/papers/volume12/duchi11a/duchi11a.pdf))
    - RMSProp ([Tieleman & Hinton, 2012](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf))
    - Adam ([Kingma & Ba, 2015](https://arxiv.org/pdf/1412.6980v8.pdf))

6. **Learning Rate Schedulers**. Common learning rate decay schedules.
    - Constant
    - Exponential decay
    - Noam/Transformer scheduler ([Vaswani et al., 2017](https://arxiv.org/pdf/1706.03762.pdf))
    - King/Dlib scheduler ([King, 2018](http://blog.dlib.net/2018/02/automatic-learning-rate-scheduling-that.html))

6. **Initializers**. Common weight initialization strategies.
    - Glorot/Xavier uniform and normal ([Glorot & Bengio, 2010](http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf))
    - He/Kaiming uniform and normal ([He et al., 2015](https://arxiv.org/pdf/1502.01852v1.pdf))
    - Standard normal
    - Truncated normal

7. **Modules**. Common multi-layer blocks that appear across many deep networks.
   Includes:
    - Bidirectional LSTMs ([Schuster & Paliwal, 1997](https://pdfs.semanticscholar.org/4b80/89bc9b49f84de43acc2eb8900035f7d492b2.pdf))
    - ResNet-style "identity" (i.e., `same`-convolution) residual blocks ([He et al., 2015](https://arxiv.org/pdf/1512.03385.pdf))
    - ResNet-style "convolutional" (i.e., parametric) residual blocks ([He et al., 2015](https://arxiv.org/pdf/1512.03385.pdf))
    - WaveNet-style residual block with dilated causal convolutions ([van den Oord et al., 2016](https://arxiv.org/pdf/1609.03499.pdf))
    - Transformer-style multi-headed dot-product attention ([Vaswani et al., 2017](https://arxiv.org/pdf/1706.03762.pdf))

8. **Models**. Well-known network architectures. Includes:
    - `vae.py`: Bernoulli variational autoencoder ([Kingma & Welling, 2014](https://arxiv.org/abs/1312.6114))
    - `wgan_gp.py`: Wasserstein generative adversarial network with gradient
      penalty ([Gulrajani et al., 2017](https://arxiv.org/pdf/1704.00028.pdf);
[Goodfellow et al., 2014](https://papers.nips.cc/paper/5423-generative-adversarial-nets.pdf))
    - `w2v.py`: word2vec model with CBOW and skip-gram architectures and
      training via noise contrastive estimation ([Mikolov et al., 2013](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf))

8. **Utils**. Common helper functions, primarily for dealing with CNNs.
   Includes:
    - `im2col`
    - `col2im`
    - `conv1D`
    - `conv2D`
    - `dilate`
    - `deconv2D`
    - `minibatch`
    - Various weight initialization utilities
    - Various padding and convolution arithmetic utilities


================================================
FILE: numpy_ml/neural_nets/__init__.py
================================================
"""A module of basic building blcoks for constructing neural networks"""
from . import utils
from . import losses
from . import activations
from . import schedulers
from . import optimizers
from . import wrappers
from . import layers
from . import initializers
from . import modules
from . import models


================================================
FILE: numpy_ml/neural_nets/activations/README.md
================================================
# Activation Functions
The `activations` module implements several common activation functions:

- Rectified linear units (ReLU) ([Hahnloser et al., 2000](http://invibe.net/biblio_database_dyva/woda/data/att/6525.file.pdf))
- Leaky rectified linear units
  ([Maas, Hannun, & Ng, 2013](https://ai.stanford.edu/~amaas/papers/relu_hybrid_icml2013_final.pdf))
- Exponential linear units ([Clevert, Unterthiner, & Hochreiter, 2016](https://arxiv.org/pdf/1511.07289.pdf))
- Scaled exponential linear units ([Klambauer, Unterthiner, & Mayr, 2017](https://arxiv.org/pdf/1706.02515.pdf))
- Softplus units
- Hard sigmoid units
- Exponential units
- Hyperbolic tangent (tanh)
- Logistic sigmoid
- Affine


## Plots
<p align="center">
<img src="img/plot.png" align='center' width="850" />
</p>


================================================
FILE: numpy_ml/neural_nets/activations/__init__.py
================================================
from .activations import *


================================================
FILE: numpy_ml/neural_nets/activations/activations.py
================================================
"""A collection of activation function objects for building neural networks"""
from math import erf
from abc import ABC, abstractmethod

import numpy as np


class ActivationBase(ABC):
    def __init__(self, **kwargs):
        """Set up the base activation object; keyword arguments are ignored."""
        super().__init__()

    def __call__(self, z):
        """
        Evaluate the activation on `z` by delegating to :meth:`fn`.

        A 1D input is first viewed as a single-row 2D array; inputs of any
        other dimensionality are forwarded unchanged.
        """
        batch = z.reshape(1, -1) if z.ndim == 1 else z
        return self.fn(batch)

    @abstractmethod
    def fn(self, z):
        """Evaluate the activation function on `z` (implemented by subclasses)"""
        raise NotImplementedError

    @abstractmethod
    def grad(self, x, **kwargs):
        """Evaluate the derivative of the activation wrt. its input `x`
        (implemented by subclasses)"""
        raise NotImplementedError


class Sigmoid(ActivationBase):
    def __init__(self):
        """The logistic sigmoid activation."""
        super().__init__()

    def __str__(self):
        """Human-readable name of the activation"""
        return "Sigmoid"

    def fn(self, z):
        r"""
        Apply the logistic sigmoid, :math:`\sigma`, elementwise to `z`.

        .. math::

            \sigma(x_i) = \frac{1}{1 + e^{-x_i}}
        """
        denominator = 1 + np.exp(-z)
        return 1 / denominator

    def grad(self, x):
        r"""
        First derivative of the logistic sigmoid, evaluated elementwise on `x`.

        .. math::

            \frac{\partial \sigma}{\partial x_i} = \sigma(x_i) (1 - \sigma(x_i))
        """
        sig = self.fn(x)
        return sig * (1 - sig)

    def grad2(self, x):
        r"""
        Second derivative of the logistic sigmoid, evaluated elementwise on `x`.

        .. math::

            \frac{\partial^2 \sigma}{\partial x_i^2} =
                \frac{\partial \sigma}{\partial x_i} (1 - 2 \sigma(x_i))
        """
        sig = self.fn(x)
        first_deriv = sig * (1 - sig)
        return first_deriv * (1 - 2 * sig)


class ReLU(ActivationBase):
    """
    The rectified linear unit (ReLU) activation.

    Notes
    -----
    ReLU units can "die" during training: a large gradient flowing through a
    unit may update its weights so that the unit never activates on any
    datapoint again, after which the gradient through it stays at zero. With
    a learning rate that is set too high, as much as 40% of a network can
    end up "dead" in this way; a properly chosen learning rate makes this
    less frequent. [*]_

    References
    ----------
    .. [*] Karpathy, A. "CS231n: Convolutional neural networks for visual recognition."
    """

    def __init__(self):
        super().__init__()

    def __str__(self):
        """Human-readable name of the activation"""
        return "ReLU"

    def fn(self, z):
        r"""
        Apply the ReLU function elementwise to `z`.

        .. math::

            \text{ReLU}(z_i)
                &=  z_i \ \ \ \ &&\text{if }z_i > 0 \\
                &=  0 \ \ \ \ &&\text{otherwise}
        """
        # clip from below at 0 and leave the upper end unbounded
        return np.clip(z, a_min=0, a_max=np.inf)

    def grad(self, x):
        r"""
        First derivative of the ReLU function, evaluated elementwise on `x`.
        The result is an integer array of 0s and 1s.

        .. math::

            \frac{\partial \text{ReLU}}{\partial x_i}
                &=  1 \ \ \ \ &&\text{if }x_i > 0 \\
                &=  0   \ \ \ \ &&\text{otherwise}
        """
        is_positive = x > 0
        return is_positive.astype(int)

    def grad2(self, x):
        r"""
        Second derivative of the ReLU function, evaluated elementwise on `x`.

        .. math::

            \frac{\partial^2 \text{ReLU}}{\partial x_i^2}  =  0
        """
        return np.zeros_like(x)


class LeakyReLU(ActivationBase):
    """
    'Leaky' version of a rectified linear unit (ReLU).

    Notes
    -----
    Leaky ReLUs [*]_ are designed to address the vanishing gradient problem in
    ReLUs by allowing a small non-zero gradient when `x` is negative.

    Parameters
    ----------
    alpha: float
        Activation slope when x < 0. Default is 0.3.

    References
    ----------
    .. [*] Maas, A. L., Hannun, A. Y, & Ng, A. Y. (2013). "Rectifier
       nonlinearities improve neural network acoustic models." *Proceedings of
       the 30th International Conference of Machine Learning, 30*.
    """

    def __init__(self, alpha=0.3):
        self.alpha = alpha
        super().__init__()

    def __str__(self):
        """Return a string representation of the activation function"""
        return "Leaky ReLU(alpha={})".format(self.alpha)

    def fn(self, z):
        r"""
        Evaluate the leaky ReLU function on the elements of input `z`.

        The input is not modified; a new array is returned.

        .. math::

            \text{LeakyReLU}(z_i)
                &=  z_i \ \ \ \ &&\text{if } z_i > 0 \\
                &=  \alpha z_i \ \ \ \ &&\text{otherwise}
        """
        # np.where promotes to a floating dtype when needed, so the scaled
        # negative entries of an integer-valued input are not truncated
        return np.where(z < 0, self.alpha * z, z)

    def grad(self, x):
        r"""
        Evaluate the first derivative of the leaky ReLU function on the elements
        of input `x`.

        .. math::

            \frac{\partial \text{LeakyReLU}}{\partial x_i}
                &=  1 \ \ \ \ &&\text{if }x_i > 0 \\
                &=  \alpha \ \ \ \ &&\text{otherwise}
        """
        # As in `fn`, the slope is applied for x < 0 only. Building the result
        # out-of-place avoids an in-place float-into-int cast (which raises)
        # when `x` has an integer dtype.
        ones = np.ones_like(x)
        return np.where(x < 0, self.alpha * ones, ones)

    def grad2(self, x):
        r"""
        Evaluate the second derivative of the leaky ReLU function on the
        elements of input `x`.

        .. math::

            \frac{\partial^2 \text{LeakyReLU}}{\partial x_i^2}  =  0
        """
        return np.zeros_like(x)


class GELU(ActivationBase):
    def __init__(self, approximate=True):
        r"""
        A Gaussian error linear unit (GELU). [*]_

        Notes
        -----
        A ReLU alternative. GELU weights inputs by their value, rather than
        gates inputs by their sign, as in vanilla ReLUs.

        References
        ----------
        .. [*] Hendrycks, D., & Gimpel, K. (2016). "Bridging nonlinearities and
           stochastic regularizers with Gaussian error linear units." *CoRR*.

        Parameters
        ----------
        approximate : bool
            Whether to use a faster but less precise approximation to the Gauss
            error function when calculating the unit activation and gradient.
            Default is True.
        """
        self.approximate = approximate
        super().__init__()

    def __str__(self):
        """Return a string representation of the activation function"""
        return f"GELU(approximate={self.approximate})"

    @staticmethod
    def _erf(x):
        """Elementwise Gauss error function for array input."""
        # `math.erf` only accepts scalars, so it is mapped over the array;
        # `otypes` keeps this well-defined for empty inputs as well.
        return np.vectorize(erf, otypes=[float])(x)

    @staticmethod
    def _erf_prime(x):
        r"""First derivative of erf: :math:`\frac{2}{\sqrt{\pi}} e^{-x^2}`."""
        return (2 / np.sqrt(np.pi)) * np.exp(-(x ** 2))

    @staticmethod
    def _erf_prime2(x):
        r"""Second derivative of erf: :math:`\frac{-4x}{\sqrt{\pi}} e^{-x^2}`."""
        return -4 * x * np.exp(-(x ** 2)) / np.sqrt(np.pi)

    def fn(self, z):
        r"""
        Compute the GELU function on the elements of input `z`.

        .. math::

            \text{GELU}(z_i) = z_i P(Z \leq z_i) = z_i \Phi(z_i)
                = z_i \cdot \frac{1}{2}(1 + \text{erf}(z_i/\sqrt{2}))

        When `approximate` is True, the erf term is replaced with the tanh
        expression :math:`\tanh(\sqrt{2/\pi} (z_i + 0.044715 z_i^3))`.
        """
        pi, sqrt, tanh = np.pi, np.sqrt, np.tanh

        if self.approximate:
            return 0.5 * z * (1 + tanh(sqrt(2 / pi) * (z + 0.044715 * z ** 3)))
        return 0.5 * z * (1 + self._erf(z / sqrt(2)))

    def grad(self, x):
        r"""
        Evaluate the first derivative of the GELU function on the elements
        of input `x`.

        .. math::

            \frac{\partial \text{GELU}}{\partial x_i}  =
                \frac{1}{2} + \frac{1}{2}\left(\text{erf}(\frac{x_i}{\sqrt{2}}) +
                    \frac{x_i \cdot \text{erf}'(\frac{x_i}{\sqrt{2}})}{\sqrt{2}}\right)

        where :math:`\text{erf}'(x) = \frac{2}{\sqrt{\pi}} \cdot \exp\{-x^2\}`.

        When `approximate` is True, only the :math:`\text{erf}` term is swapped
        for the tanh expression used in :meth:`fn`; the :math:`\text{erf}'`
        term is still evaluated exactly.
        """
        pi, sqrt, tanh = np.pi, np.sqrt, np.tanh

        s = x / sqrt(2)
        if self.approximate:
            erf_term = tanh(sqrt(2 / pi) * (x + 0.044715 * x ** 3))
        else:
            erf_term = self._erf(s)
        return 0.5 + 0.5 * erf_term + ((0.5 * x * self._erf_prime(s)) / sqrt(2))

    def grad2(self, x):
        r"""
        Evaluate the second derivative of the GELU function on the elements
        of input `x`. The exact expression is used regardless of the
        `approximate` setting.

        .. math::

            \frac{\partial^2 \text{GELU}}{\partial x_i^2} =
                \frac{1}{2\sqrt{2}} \left[
                    2 \text{erf}'(\frac{x_i}{\sqrt{2}}) +
                    \frac{x_i}{\sqrt{2}} \text{erf}''(\frac{x_i}{\sqrt{2}})
                \right]

        where :math:`\text{erf}'(x) = \frac{2}{\sqrt{\pi}} \cdot \exp\{-x^2\}` and
        :math:`\text{erf}''(x) = \frac{-4x}{\sqrt{\pi}} \cdot \exp\{-x^2\}`.
        Equivalently, the result is :math:`\phi(x_i) (2 - x_i^2)` with
        :math:`\phi` the standard normal density.
        """
        sqrt = np.sqrt
        s = x / sqrt(2)

        # d/dx [Phi(x) + x Phi'(x)] = 2 Phi'(x) + x Phi''(x), with
        # Phi'(x) = erf'(s) / (2 sqrt(2)) and Phi''(x) = erf''(s) / 4
        inner = 2 * self._erf_prime(s) + (x * self._erf_prime2(s) / sqrt(2))
        return inner / (2 * sqrt(2))


class Tanh(ActivationBase):
    def __init__(self):
        """The hyperbolic tangent activation."""
        super().__init__()

    def __str__(self):
        """Human-readable name of the activation"""
        return "Tanh"

    def fn(self, z):
        """Apply the tanh function elementwise to `z`."""
        return np.tanh(z)

    def grad(self, x):
        r"""
        First derivative of the tanh function, evaluated elementwise on `x`.

        .. math::

            \frac{\partial \tanh}{\partial x_i}  =  1 - \tanh(x)^2
        """
        t = np.tanh(x)
        return 1 - t ** 2

    def grad2(self, x):
        r"""
        Second derivative of the tanh function, evaluated elementwise on `x`.

        .. math::

            \frac{\partial^2 \tanh}{\partial x_i^2} =
                -2 \tanh(x) \left(\frac{\partial \tanh}{\partial x_i}\right)
        """
        t = np.tanh(x)
        first_deriv = 1 - t ** 2
        return -2 * t * first_deriv


class Affine(ActivationBase):
    def __init__(self, slope=1, intercept=0):
        """
        An activation that applies an affine map to its input.

        Parameters
        ----------
        slope: float
            Multiplicative factor applied to the input. Default is 1.
        intercept: float
            Additive offset applied after scaling. Default is 0.
        """
        self.slope = slope
        self.intercept = intercept
        super().__init__()

    def __str__(self):
        """Human-readable description of the activation and its parameters"""
        return f"Affine(slope={self.slope}, intercept={self.intercept})"

    def fn(self, z):
        r"""
        Apply the affine map elementwise to `z`.

        .. math::

            \text{Affine}(z_i)  =  \text{slope} \times z_i + \text{intercept}
        """
        scaled = self.slope * z
        return scaled + self.intercept

    def grad(self, x):
        r"""
        First derivative of the affine map, evaluated elementwise on `x`.

        .. math::

            \frac{\partial \text{Affine}}{\partial x_i}  =  \text{slope}
        """
        ones = np.ones_like(x)
        return self.slope * ones

    def grad2(self, x):
        r"""
        Second derivative of the affine map, evaluated elementwise on `x`.

        .. math::

            \frac{\partial^2 \text{Affine}}{\partial x_i^2}  =  0
        """
        return np.zeros_like(x)


class Identity(Affine):
    def __init__(self):
        """
        The identity activation.

        Notes
        -----
        :class:`Identity` is a convenience wrapper around :class:`Affine`
        configured with slope = 1 and intercept = 0.
        """
        super().__init__(intercept=0, slope=1)

    def __str__(self):
        """Human-readable name of the activation"""
        return "Identity"


class ELU(ActivationBase):
    def __init__(self, alpha=1.0):
        r"""
        An exponential linear unit (ELU).

        Notes
        -----
        ELUs are intended to address the fact that ReLUs are strictly nonnegative
        and thus have an average activation > 0, increasing the chances of internal
        covariate shift and slowing down learning. ELU units address this by (1)
        allowing negative values when :math:`x < 0`, which (2) are bounded by a value
        :math:`-\alpha`. Similar to :class:`LeakyReLU`, the negative activation
        values help to push the average unit activation towards 0. Unlike
        :class:`LeakyReLU`, however, the boundedness of the negative activation
        allows for greater robustness in the face of large negative values,
        allowing the function to avoid conveying the *degree* of "absence"
        (negative activation) in the input. [*]_

        Parameters
        ----------
        alpha : float
            Slope of negative segment. Default is 1.

        References
        ----------
        .. [*] Clevert, D. A., Unterthiner, T., Hochreiter, S. (2016). "Fast
           and accurate deep network learning by exponential linear units
           (ELUs)". *4th International Conference on Learning
           Representations*.
        """
        self.alpha = alpha
        super().__init__()

    def __str__(self):
        """Return a string representation of the activation function"""
        return "ELU(alpha={})".format(self.alpha)

    def _neg_exp(self, x):
        """Return `alpha * exp(x)` with the exponent clamped from above at 0."""
        # np.where evaluates both branches for every element. The exponential
        # branch is only ever selected where x <= 0, so clamping leaves the
        # selected values untouched while preventing overflow in `exp` for
        # large positive entries.
        return self.alpha * np.exp(np.minimum(x, 0))

    def fn(self, z):
        r"""
        Evaluate the ELU activation on the elements of input `z`.

        .. math::

            \text{ELU}(z_i)
                &=  z_i \ \ \ \ &&\text{if }z_i > 0 \\
                &=  \alpha (e^{z_i} - 1) \ \ \ \ &&\text{otherwise}
        """
        # z if z > 0  else alpha * (e^z - 1)
        return np.where(z > 0, z, self._neg_exp(z) - self.alpha)

    def grad(self, x):
        r"""
        Evaluate the first derivative of the ELU activation on the elements
        of input `x`.

        .. math::

            \frac{\partial \text{ELU}}{\partial x_i}
                &=  1 \ \ \ \ &&\text{if } x_i > 0 \\
                &=  \alpha e^{x_i} \ \ \ \ &&\text{otherwise}
        """
        # 1 if x > 0 else alpha * e^(x)
        return np.where(x > 0, np.ones_like(x), self._neg_exp(x))

    def grad2(self, x):
        r"""
        Evaluate the second derivative of the ELU activation on the elements
        of input `x`.

        .. math::

            \frac{\partial^2 \text{ELU}}{\partial x_i^2}
                &=  0 \ \ \ \ &&\text{if } x_i > 0 \\
                &=  \alpha e^{x_i} \ \ \ \ &&\text{otherwise}
        """
        # 0 if x > 0 else alpha * e^(x); x == 0 is assigned to the exponential
        # branch, matching the convention used by `fn` and `grad`
        return np.where(x > 0, np.zeros_like(x), self._neg_exp(x))


class Exponential(ActivationBase):
    def __init__(self):
        """The base-e exponential activation"""
        super().__init__()

    def __str__(self):
        """Human-readable name of the activation"""
        return "Exponential"

    def fn(self, z):
        r"""
        Apply the exponential function elementwise to `z`.

        .. math::
            \text{Exponential}(z_i) = e^{z_i}
        """
        activation = np.exp(z)
        return activation

    def grad(self, x):
        r"""
        First derivative of the exponential activation, evaluated elementwise
        on `x`.

        .. math::

            \frac{\partial \text{Exponential}}{\partial x_i}  =  e^{x_i}
        """
        first_deriv = np.exp(x)
        return first_deriv

    def grad2(self, x):
        r"""
        Second derivative of the exponential activation, evaluated elementwise
        on `x`.

        .. math::

            \frac{\partial^2 \text{Exponential}}{\partial x_i^2}  =  e^{x_i}
        """
        second_deriv = np.exp(x)
        return second_deriv


class SELU(ActivationBase):
    r"""
    A scaled exponential linear unit (SELU).

    Notes
    -----
    SELU units, when used in conjunction with proper weight initialization and
    regularization techniques, encourage neuron activations to converge to
    zero-mean and unit variance without explicit use of e.g., batchnorm.

    For SELU units, the :math:`\alpha` and :math:`\text{scale}` values are
    constants chosen so that the mean and variance of the inputs are preserved
    between consecutive layers. As such the authors propose weights be
    initialized using Lecun-Normal initialization: :math:`w_{ij} \sim
    \mathcal{N}(0, 1 / \text{fan_in})`, and to use the dropout variant
    :math:`\alpha`-dropout during regularization. [*]_

    See the reference for more information (especially the appendix ;-) ).

    References
    ----------
    .. [*] Klambauer, G., Unterthiner, T., & Hochreiter, S. (2017).
       "Self-normalizing neural networks." *Advances in Neural Information
       Processing Systems, 30.*
    """

    def __init__(self):
        self.alpha = 1.6732632423543772848170429916717
        self.scale = 1.0507009873554804934193349852946
        self.elu = ELU(alpha=self.alpha)
        super().__init__()

    def __str__(self):
        """Return a string representation of the activation function"""
        return "SELU"

    def _scaled_exp(self, x):
        """Return `scale * alpha * exp(x)` with the exponent clamped at 0."""
        # The exponential branch is only selected where x <= 0, so clamping
        # the exponent leaves the selected values unchanged while preventing
        # overflow in `exp` for large positive entries.
        return np.exp(np.minimum(x, 0)) * self.alpha * self.scale

    def fn(self, z):
        r"""
        Evaluate the SELU activation on the elements of input `z`.

        .. math::

            \text{SELU}(z_i)  =  \text{scale} \times \text{ELU}(z_i, \alpha)

        which is simply

        .. math::

            \text{SELU}(z_i)
                &= \text{scale} \times z_i \ \ \ \ &&\text{if }z_i > 0 \\
                &= \text{scale} \times \alpha (e^{z_i} - 1) \ \ \ \ &&\text{otherwise}
        """
        return self.scale * self.elu.fn(z)

    def grad(self, x):
        r"""
        Evaluate the first derivative of the SELU activation on the elements
        of input `x`.

        .. math::

            \frac{\partial \text{SELU}}{\partial x_i}
                &=  \text{scale} \ \ \ \ &&\text{if } x_i > 0 \\
                &=  \text{scale} \times \alpha e^{x_i} \ \ \ \ &&\text{otherwise}
        """
        # x == 0 falls in the exponential branch, matching `fn` and `grad2`
        return np.where(x > 0, np.ones_like(x) * self.scale, self._scaled_exp(x))

    def grad2(self, x):
        r"""
        Evaluate the second derivative of the SELU activation on the elements
        of input `x`.

        .. math::

            \frac{\partial^2 \text{SELU}}{\partial x_i^2}
                &=  0 \ \ \ \ &&\text{if } x_i > 0 \\
                &=  \text{scale} \times \alpha e^{x_i} \ \ \ \ &&\text{otherwise}
        """
        return np.where(x > 0, np.zeros_like(x), self._scaled_exp(x))


class HardSigmoid(ActivationBase):
    def __init__(self):
        """
        The "hard" sigmoid activation.

        Notes
        -----
        A piecewise linear stand-in for the logistic sigmoid which is cheaper
        to evaluate.
        """
        super().__init__()

    def __str__(self):
        """Human-readable name of the activation"""
        return "Hard Sigmoid"

    def fn(self, z):
        r"""
        Apply the hard sigmoid elementwise to `z`.

        .. math::

            \text{HardSigmoid}(z_i)
                &= 0 \ \ \ \ &&\text{if }z_i < -2.5 \\
                &= 0.2 z_i + 0.5 \ \ \ \ &&\text{if }-2.5 \leq z_i \leq 2.5 \\
                &= 1 \ \ \ \ &&\text{if }z_i > 2.5
        """
        linear = (0.2 * z) + 0.5
        # saturate the linear segment to the unit interval
        return np.clip(linear, 0.0, 1.0)

    def grad(self, x):
        r"""
        First derivative of the hard sigmoid, evaluated elementwise on `x`.

        .. math::

            \frac{\partial \text{HardSigmoid}}{\partial x_i}
                &=  0.2 \ \ \ \ &&\text{if } -2.5 \leq x_i \leq 2.5\\
                &=  0 \ \ \ \ &&\text{otherwise}
        """
        on_linear_segment = np.logical_and(x >= -2.5, x <= 2.5)
        return np.where(on_linear_segment, 0.2, 0)

    def grad2(self, x):
        r"""
        Second derivative of the hard sigmoid, evaluated elementwise on `x`.

        .. math::

            \frac{\partial^2 \text{HardSigmoid}}{\partial x_i^2} =  0
        """
        return np.zeros_like(x)


class SoftPlus(ActivationBase):
    def __init__(self):
        """
        A softplus activation function.

        Notes
        -----
        In contrast to :class:`ReLU`, the softplus activation is differentiable
        everywhere (including 0). It is, however, less computationally efficient to
        compute.

        The derivative of the softplus activation is the logistic sigmoid.

        The activation and both derivatives are evaluated via
        :func:`numpy.logaddexp` instead of a bare ``exp``: for large inputs
        ``exp`` overflows to ``inf``, which would make the activation ``inf``
        and turn the derivative quotients into ``inf / inf = nan``.
        """
        super().__init__()

    def __str__(self):
        """Return a string representation of the activation function"""
        return "SoftPlus"

    def fn(self, z):
        r"""
        Evaluate the softplus activation on the elements of input `z`.

        .. math::

            \text{SoftPlus}(z_i) = \log(1 + e^{z_i})
        """
        # log(1 + e^z) == log(e^0 + e^z), computed without forming e^z directly
        return np.logaddexp(0, z)

    def grad(self, x):
        r"""
        Evaluate the first derivative of the softplus activation on the elements
        of input `x`.

        .. math::

            \frac{\partial \text{SoftPlus}}{\partial x_i} = \frac{e^{x_i}}{1 + e^{x_i}}
        """
        # e^x / (1 + e^x) = 1 / (1 + e^{-x}) = exp(-log(1 + e^{-x}))
        return np.exp(-np.logaddexp(0, -x))

    def grad2(self, x):
        r"""
        Evaluate the second derivative of the softplus activation on the elements
        of input `x`.

        .. math::

            \frac{\partial^2 \text{SoftPlus}}{\partial x_i^2} =
                \frac{e^{x_i}}{(1 + e^{x_i})^2}
        """
        # e^x / (1 + e^x)^2 = sigmoid(x) * sigmoid(-x)
        #                   = exp(-(log(1 + e^{-x}) + log(1 + e^{x})))
        return np.exp(-(np.logaddexp(0, x) + np.logaddexp(0, -x)))


================================================
FILE: numpy_ml/neural_nets/initializers/README.md
================================================
# Initializers
The `initializers.py` module contains objects for initializing optimizers,
activation functions, weight initializers, and learning rate schedulers from
strings or parameter dictionaries.


================================================
FILE: numpy_ml/neural_nets/initializers/__init__.py
================================================
from .initializers import *


================================================
FILE: numpy_ml/neural_nets/initializers/initializers.py
================================================
"""A module containing objects to instantiate various neural network components."""
import re
from functools import partial
from ast import literal_eval as _eval

import numpy as np

from ..optimizers import OptimizerBase, SGD, AdaGrad, RMSProp, Adam
from ..activations import (
    ELU,
    GELU,
    SELU,
    ReLU,
    Tanh,
    Affine,
    Sigmoid,
    Identity,
    SoftPlus,
    LeakyReLU,
    Exponential,
    HardSigmoid,
    ActivationBase,
)
from ..schedulers import (
    SchedulerBase,
    ConstantScheduler,
    ExponentialScheduler,
    NoamScheduler,
    KingScheduler,
)

from ..utils import (
    he_normal,
    he_uniform,
    glorot_normal,
    glorot_uniform,
    truncated_normal,
)


class ActivationInitializer(object):
    def __init__(self, param=None):
        """
        A class for initializing activation functions. Valid `param` values
        are:
            (a) ``__str__`` representations of an `ActivationBase` instance
            (b) `ActivationBase` instance

        If `param` is `None`, return the identity function: f(X) = X
        """
        self.param = param

    def __call__(self):
        """
        Initialize activation function

        Returns
        -------
        act : `ActivationBase` instance
            ``Identity()`` if `param` is `None`, `param` itself if it already
            is an `ActivationBase` instance, otherwise the activation parsed
            from the `param` string.

        Raises
        ------
        ValueError
            If `param` has an unsupported type, or is a string that does not
            describe a known activation.
        """
        param = self.param
        if param is None:
            act = Identity()
        elif isinstance(param, ActivationBase):
            act = param
        elif isinstance(param, str):
            act = self.init_from_str(param)
        else:
            raise ValueError("Unknown activation: {}".format(param))
        return act

    @staticmethod
    def _parse_args(pattern, act_str):
        """
        Return the groups captured by matching `pattern` at the start of
        `act_str`.

        Raises a `ValueError` when the string names a parameterized activation
        but does not follow the expected ``name(arg=value)`` layout, instead
        of failing with an `AttributeError` on ``None.groups()``.
        """
        match = re.match(pattern, act_str)
        if match is None:
            raise ValueError("Unknown activation: {}".format(act_str))
        return match.groups()

    def init_from_str(self, act_str):
        """
        Initialize activation function from the `param` string

        Parameters
        ----------
        act_str : str
            Case-insensitive activation name, with any parameters written as
            in the patterns below (e.g., ``"elu(alpha=1.0)"``).

        Returns
        -------
        act_fn : `ActivationBase` instance

        Raises
        ------
        ValueError
            If `act_str` does not describe a known activation.
        """
        act_str = act_str.lower()
        if act_str == "relu":
            act_fn = ReLU()
        elif act_str == "tanh":
            act_fn = Tanh()
        elif act_str == "selu":
            act_fn = SELU()
        elif act_str == "sigmoid":
            act_fn = Sigmoid()
        elif act_str == "identity":
            act_fn = Identity()
        elif act_str in ("hardsigmoid", "hard sigmoid"):
            # `HardSigmoid.__str__` returns "Hard Sigmoid" (with a space), so
            # accept that spelling alongside the legacy "hardsigmoid"
            act_fn = HardSigmoid()
        elif act_str == "softplus":
            act_fn = SoftPlus()
        elif act_str == "exponential":
            act_fn = Exponential()
        elif "affine" in act_str:
            slope, intercept = self._parse_args(
                r"affine\(slope=(.*), intercept=(.*)\)", act_str
            )
            act_fn = Affine(float(slope), float(intercept))
        elif "leaky relu" in act_str:
            # must be tested before "elu": "leaky relu" contains that substring
            (alpha,) = self._parse_args(r"leaky relu\(alpha=(.*)\)", act_str)
            act_fn = LeakyReLU(float(alpha))
        elif "gelu" in act_str:
            # must be tested before "elu" for the same reason
            (flag,) = self._parse_args(r"gelu\(approximate=(.*)\)", act_str)
            # NOTE(review): the flag was previously passed as
            # `approximation=`, which disagrees with the `approximate=`
            # spelling parsed above. Passing it positionally avoids depending
            # on the keyword name; this assumes it is GELU's first
            # constructor argument -- confirm against `GELU.__init__`.
            act_fn = GELU(flag == "true")
        elif "elu" in act_str:
            # the parsed value used to be bound to `approx`, leaving `alpha`
            # undefined when it was read on the next line
            (alpha,) = self._parse_args(r"elu\(alpha=(.*)\)", act_str)
            act_fn = ELU(alpha=float(alpha))
        else:
            raise ValueError("Unknown activation: {}".format(act_str))
        return act_fn


class SchedulerInitializer(object):
    def __init__(self, param=None, lr=None):
        """
        A class for initializing learning rate schedulers. Valid `param` values
        are:
            (a) __str__ representations of `SchedulerBase` instances
            (b) `SchedulerBase` instances
            (c) Parameter dicts (e.g., as produced via the `summary` method in
                `LayerBase` instances)

        If `param` is `None`, return the ConstantScheduler with learning rate
        equal to `lr`.

        Raises
        ------
        ValueError
            If both `param` and `lr` are `None`.
        """
        if lr is None and param is None:
            raise ValueError("lr and param cannot both be `None`")

        self.lr = lr
        self.param = param

    def __call__(self):
        """
        Initialize scheduler

        Raises
        ------
        ValueError
            If `param` is not `None`, a `SchedulerBase` instance, a string, or
            a dict.
        """
        param = self.param
        if param is None:
            scheduler = ConstantScheduler(self.lr)
        elif isinstance(param, SchedulerBase):
            scheduler = param
        elif isinstance(param, str):
            scheduler = self.init_from_str()
        elif isinstance(param, dict):
            scheduler = self.init_from_dict()
        else:
            # previously fell through and failed on the unbound `scheduler`
            raise ValueError("Unknown scheduler: {}".format(param))
        return scheduler

    @staticmethod
    def _parse_kwargs(param_str):
        """
        Extract ``name=value`` pairs from a lowercased parameter string.

        Names may contain underscores and digits (e.g. ``initial_lr``). Values
        are evaluated with `ast.literal_eval`; because the string has been
        lowercased, ``true`` / ``false`` / ``none`` are mapped back to the
        corresponding Python constants first. Any other value that is not a
        valid literal propagates `literal_eval`'s `ValueError`/`SyntaxError`.
        """
        lowered_constants = {"true": True, "false": False, "none": None}
        kwargs = {}
        for name, raw in re.findall(r"(\w+)\s*=\s*([^,)]*)", param_str):
            raw = raw.strip()
            if raw in lowered_constants:
                kwargs[name] = lowered_constants[raw]
            else:
                kwargs[name] = _eval(raw)
        return kwargs

    def init_from_str(self):
        """
        Initialize scheduler from the param string

        Raises
        ------
        NotImplementedError
            If the string does not name a known scheduler.
        """
        sch_str = self.param.lower()
        kwargs = self._parse_kwargs(sch_str)

        if "constant" in sch_str:
            scheduler = ConstantScheduler(**kwargs)
        elif "exponential" in sch_str:
            scheduler = ExponentialScheduler(**kwargs)
        elif "noam" in sch_str:
            scheduler = NoamScheduler(**kwargs)
        elif "king" in sch_str:
            scheduler = KingScheduler(**kwargs)
        else:
            raise NotImplementedError("{}".format(sch_str))
        return scheduler

    def init_from_dict(self):
        """
        Initialize scheduler from the param dictionary

        Raises
        ------
        ValueError
            If the dictionary has no (or an empty) `hyperparameters` entry.
        NotImplementedError
            If the `id` hyperparameter does not name a known scheduler.
        """
        S = self.param
        sc = S.get("hyperparameters")

        if sc is None:
            raise ValueError("Must have `hyperparameters` key: {}".format(S))
        if not sc:
            # an empty dict used to skip every branch below and then fail on
            # the unbound `scheduler`
            raise ValueError("`hyperparameters` cannot be empty: {}".format(S))

        sched_id = sc["id"]
        if sched_id == "ConstantScheduler":
            scheduler = ConstantScheduler()
        elif sched_id == "ExponentialScheduler":
            scheduler = ExponentialScheduler()
        elif sched_id == "NoamScheduler":
            scheduler = NoamScheduler()
        elif sched_id == "KingScheduler":
            # for parity with `init_from_str`; NOTE(review): assumes
            # KingScheduler can be built without arguments like the others
            scheduler = KingScheduler()
        else:
            raise NotImplementedError("{}".format(sched_id))
        scheduler.set_params(sc)
        return scheduler


class OptimizerInitializer(object):
    def __init__(self, param=None):
        """
        A class for initializing optimizers. Valid `param` values are:
            (a) __str__ representations of `OptimizerBase` instances
            (b) `OptimizerBase` instances
            (c) Parameter dicts (e.g., as produced via the `summary` method in
                `LayerBase` instances)

        If `param` is `None`, return the SGD optimizer with default parameters.
        """
        self.param = param

    def __call__(self):
        """
        Initialize the optimizer

        Raises
        ------
        ValueError
            If `param` is not `None`, an `OptimizerBase` instance, a string, or
            a dict.
        """
        param = self.param
        if param is None:
            opt = SGD()
        elif isinstance(param, OptimizerBase):
            opt = param
        elif isinstance(param, str):
            opt = self.init_from_str()
        elif isinstance(param, dict):
            opt = self.init_from_dict()
        else:
            # previously fell through and failed on the unbound `opt`
            raise ValueError("Unknown optimizer: {}".format(param))
        return opt

    @staticmethod
    def _parse_kwargs(param_str):
        """
        Extract ``name=value`` pairs from a lowercased parameter string.

        Names may contain underscores and digits (e.g. ``clip_norm``,
        ``decay1``); the earlier letters-only pattern truncated such names.
        Values are evaluated with `ast.literal_eval`; because the string has
        been lowercased, ``true`` / ``false`` / ``none`` are mapped back to the
        corresponding Python constants first. Any other value that is not a
        valid literal propagates `literal_eval`'s `ValueError`/`SyntaxError`.
        """
        lowered_constants = {"true": True, "false": False, "none": None}
        kwargs = {}
        for name, raw in re.findall(r"(\w+)\s*=\s*([^,)]*)", param_str):
            raw = raw.strip()
            if raw in lowered_constants:
                kwargs[name] = lowered_constants[raw]
            else:
                kwargs[name] = _eval(raw)
        return kwargs

    def init_from_str(self):
        """
        Initialize optimizer from the `param` string

        Raises
        ------
        NotImplementedError
            If the string does not name a known optimizer.
        """
        opt_str = self.param.lower()
        kwargs = self._parse_kwargs(opt_str)
        if "sgd" in opt_str:
            optimizer = SGD(**kwargs)
        elif "adagrad" in opt_str:
            optimizer = AdaGrad(**kwargs)
        elif "rmsprop" in opt_str:
            optimizer = RMSProp(**kwargs)
        elif "adam" in opt_str:
            optimizer = Adam(**kwargs)
        else:
            raise NotImplementedError("{}".format(opt_str))
        return optimizer

    def init_from_dict(self):
        """
        Initialize optimizer from the `param` dictionary

        Raises
        ------
        ValueError
            If the dictionary has no (or an empty) `hyperparameters` entry.
        NotImplementedError
            If the `id` hyperparameter does not name a known optimizer.
        """
        D = self.param
        cc = D.get("cache")
        op = D.get("hyperparameters")

        if op is None:
            raise ValueError("`param` dictionary has no `hyperparameters` key")
        if not op:
            # an empty dict used to skip every branch below and then fail on
            # the unbound `optimizer`
            raise ValueError("`hyperparameters` entry of `param` is empty")

        opt_id = op["id"]
        if opt_id == "SGD":
            optimizer = SGD()
        elif opt_id == "RMSProp":
            optimizer = RMSProp()
        elif opt_id == "AdaGrad":
            optimizer = AdaGrad()
        elif opt_id == "Adam":
            optimizer = Adam()
        else:
            raise NotImplementedError("{}".format(opt_id))
        optimizer.set_params(op, cc)
        return optimizer


class WeightInitializer(object):
    def __init__(self, act_fn_str, mode="glorot_uniform"):
        """
        A factory for weight initializers.

        Parameters
        ----------
        act_fn_str : str
            The string representation for the layer activation function
        mode : str (default: 'glorot_uniform')
            The weight initialization strategy. Valid entries are {"he_normal",
            "he_uniform", "glorot_normal", "glorot_uniform", "std_normal",
            "trunc_normal"}

        Raises
        ------
        ValueError
            If `mode` is not one of the valid entries.
        """
        if mode not in [
            "he_normal",
            "he_uniform",
            "glorot_normal",
            "glorot_uniform",
            "std_normal",
            "trunc_normal",
        ]:
            raise ValueError("Unrecognize initialization mode: {}".format(mode))

        self.mode = mode
        self.act_fn = act_fn_str

        if mode == "glorot_uniform":
            self._fn = glorot_uniform
        elif mode == "glorot_normal":
            self._fn = glorot_normal
        elif mode == "he_uniform":
            self._fn = he_uniform
        elif mode == "he_normal":
            self._fn = he_normal
        elif mode == "std_normal":
            self._fn = np.random.randn
        elif mode == "trunc_normal":
            # NOTE(review): `__call__` passes the shape positionally to this
            # partial; confirm `truncated_normal` takes the shape as the
            # positional argument left over once `mean` and `std` are bound
            self._fn = partial(truncated_normal, mean=0, std=1)

    def __call__(self, weight_shape):
        """
        Initialize weights according to the specified strategy

        Parameters
        ----------
        weight_shape : tuple
            The dimensions of the weight array to create.

        Returns
        -------
        W : the value returned by the selected initialization function
        """
        if "glorot" in self.mode:
            # Glorot schemes are additionally scaled by an activation-specific gain
            gain = self._calc_glorot_gain()
            W = self._fn(weight_shape, gain)
        elif self.mode == "std_normal":
            # `np.random.randn` takes the dimensions as separate arguments
            W = self._fn(*weight_shape)
        else:
            W = self._fn(weight_shape)
        return W

    def _calc_glorot_gain(self):
        """
        Return the gain associated with the layer activation function.

        Values from:
        https://pytorch.org/docs/stable/nn.html?#torch.nn.init.calculate_gain

        Raises
        ------
        ValueError
            If the activation string mentions a leaky ReLU but does not follow
            the ``leaky relu(alpha=<float>)`` layout.
        """
        gain = 1.0
        act_str = self.act_fn.lower()
        if act_str == "tanh":
            gain = 5.0 / 3.0
        elif act_str == "relu":
            gain = np.sqrt(2)
        elif "leaky relu" in act_str:
            match = re.match(r"leaky relu\(alpha=(.*)\)", act_str)
            if match is None:
                raise ValueError("Unknown activation: {}".format(act_str))
            alpha = float(match.groups()[0])
            # sqrt(2 / (1 + alpha^2)); the parentheses matter, since without
            # them `2 / 1 + alpha ** 2` evaluates to 2 + alpha^2
            gain = np.sqrt(2 / (1 + alpha ** 2))
        return gain


================================================
FILE: numpy_ml/neural_nets/layers/README.md
================================================
# Layers
The `layers.py` module implements common layers / layer-wise operations that can
be composed to create larger neural networks. It includes:

- Fully-connected layers
- Sparse evolutionary layers ([Mocanu et al., 2018](https://www.nature.com/articles/s41467-018-04316-3))
- Dot-product attention layers ([Luong, Pham, & Manning, 2015](https://arxiv.org/pdf/1508.04025.pdf); [Vaswani et al., 2017](https://arxiv.org/pdf/1706.03762.pdf))
- 1D and 2D convolution (with stride, padding, and dilation) layers ([van den Oord et al., 2016](https://arxiv.org/pdf/1609.03499.pdf); [Yu & Koltun, 2016](https://arxiv.org/pdf/1511.07122.pdf))
- 2D "deconvolution" (with stride and padding) layers ([Zeiler et al., 2010](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf))
- Restricted Boltzmann machines (with CD-_n_ training) ([Smolensky, 1986](http://stanford.edu/~jlmcc/papers/PDP/Volume%201/Chap6_PDP86.pdf); [Carreira-Perpiñán & Hinton, 2005](http://www.cs.toronto.edu/~fritz/absps/cdmiguel.pdf))
- Elementwise multiplication operation
- Summation operation
- Flattening operation
- Embedding layer
- Softmax layer
- Max & average pooling layer
- 1D and 2D batch normalization layers ([Ioffe & Szegedy, 2015](http://proceedings.mlr.press/v37/ioffe15.pdf))
- 1D and 2D layer normalization layers ([Ba, Kiros, & Hinton, 2016](https://arxiv.org/pdf/1607.06450.pdf))
- Recurrent layers ([Elman, 1990](https://crl.ucsd.edu/~elman/Papers/fsit.pdf))
- Long short-term memory (LSTM) layers ([Hochreiter & Schmidhuber, 1997](http://www.bioinf.jku.at/publications/older/2604.pdf))


================================================
FILE: numpy_ml/neural_nets/layers/__init__.py
================================================
from .layers import *


================================================
FILE: numpy_ml/neural_nets/layers/layers.py
================================================
"""A collection of composable layer objects for building neural networks"""
from abc import ABC, abstractmethod

import numpy as np

from ..wrappers import init_wrappers, Dropout

from ..initializers import (
    WeightInitializer,
    OptimizerInitializer,
    ActivationInitializer,
)

from ..utils import (
    pad1D,
    pad2D,
    conv1D,
    conv2D,
    im2col,
    col2im,
    dilate,
    deconv2D_naive,
    calc_pad_dims_2D,
)


class LayerBase(ABC):
    def __init__(self, optimizer=None):
        """
        An abstract base class inherited by all neural network layers

        Parameters
        ----------
        optimizer : str, optimizer object, dict, or None
            Passed to `OptimizerInitializer` to build the layer optimizer.

        Notes
        -----
        `set_params` and `summary` read ``self.hyperparameters``, which this
        base class does not define; subclasses are expected to provide it.
        """
        self.X = []
        self.act_fn = None
        self.trainable = True
        self.optimizer = OptimizerInitializer(optimizer)()

        self.gradients = {}
        self.parameters = {}
        self.derived_variables = {}

        super().__init__()

    @abstractmethod
    def _init_params(self, **kwargs):
        raise NotImplementedError

    @abstractmethod
    def forward(self, z, **kwargs):
        """Perform a forward pass through the layer"""
        raise NotImplementedError

    @abstractmethod
    def backward(self, out, **kwargs):
        """Perform a backward pass through the layer"""
        raise NotImplementedError

    def freeze(self):
        """
        Freeze the layer parameters at their current values so they can no
        longer be updated.
        """
        self.trainable = False

    def unfreeze(self):
        """Unfreeze the layer parameters so they can be updated."""
        self.trainable = True

    def flush_gradients(self):
        """Erase all the layer's derived variables and gradients."""
        assert self.trainable, "Layer is frozen"
        self.X = []
        # only existing keys are reassigned, so the dicts keep their size
        # while being iterated
        for k in self.derived_variables:
            self.derived_variables[k] = []

        for k, v in self.gradients.items():
            self.gradients[k] = np.zeros_like(v)

    def update(self, cur_loss=None):
        """
        Update the layer parameters using the accrued gradients and layer
        optimizer. Flush all gradients once the update is complete.

        Parameters
        ----------
        cur_loss : float or None
            Forwarded unchanged to the optimizer call. Default is None.
        """
        assert self.trainable, "Layer is frozen"
        self.optimizer.step()
        for k, v in self.gradients.items():
            # gradients without a matching parameter entry are skipped
            if k in self.parameters:
                self.parameters[k] = self.optimizer(self.parameters[k], v, k, cur_loss)
        self.flush_gradients()

    def set_params(self, summary_dict):
        """
        Set the layer parameters from a dictionary of values.

        Parameters
        ----------
        summary_dict : dict
            A dictionary of layer parameters and hyperparameters. If a required
            parameter or hyperparameter is not included within `summary_dict`,
            this method will use the value in the current layer's
            :meth:`summary` method. The dictionary itself is left unmodified.

        Returns
        -------
        layer : :doc:`Layer <numpy_ml.neural_nets.layers>` object
            The newly-initialized layer.
        """
        layer = self

        # Work on a shallow copy: flattening the nested dicts in place would
        # strip the `parameters` / `hyperparameters` keys from the caller's
        # dictionary.
        sd = dict(summary_dict)

        # collapse `parameters` and `hyperparameters` nested dicts into a single
        # merged dictionary
        for k in ("parameters", "hyperparameters"):
            if k in sd:
                sd.update(sd[k])
                del sd[k]

        for k, v in sd.items():
            if k in self.parameters:
                layer.parameters[k] = v
            if k in self.hyperparameters:
                if k == "act_fn":
                    layer.act_fn = ActivationInitializer(v)()
                elif k == "optimizer":
                    layer.optimizer = OptimizerInitializer(v)()
                elif k == "wrappers":
                    # rebinds `layer`; later entries apply to the returned object
                    layer = init_wrappers(layer, v)
                else:
                    setattr(layer, k, v)
        return layer

    def summary(self):
        """Return a dict of the layer parameters, hyperparameters, and ID."""
        return {
            "layer": self.hyperparameters["layer"],
            "parameters": self.parameters,
            "hyperparameters": self.hyperparameters,
        }


class DotProductAttention(LayerBase):
    def __init__(self, scale=True, dropout_p=0, init="glorot_uniform", optimizer=None):
        r"""
        A single "attention head" layer that scores queries against keys with
        a dot product.

        Notes
        -----
        The layer computes:

        .. math::

            \mathbf{Z}  &=  \mathbf{Q K}^\top \ \ \ \ &&\text{if scale = False} \\
                        &=  \mathbf{Q K}^\top / \sqrt{d_k} \ \ \ \ &&\text{if scale = True} \\
            \mathbf{Y}  &=  \text{dropout}(\text{softmax}(\mathbf{Z})) \mathbf{V}

        Parameters
        ----------
        scale : bool
            Whether to divide the query-key dot products by the square root of
            the query/key dimensionality before the softmax. Without it, the
            magnitude of the scores grows with the query/key dimension.
            Default is True.
        dropout_p : float in [0, 1)
            The dropout probability during training, applied to the output of
            the softmax. If 0, no dropout is applied. Default is 0.
        init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}
            The weight initialization strategy. Default is `'glorot_uniform'`.
            Unused.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method.  If None, use the :class:`SGD
            <numpy_ml.neural_nets.optimizers.SGD>` optimizer with
            default parameters. Default is None. Unused.

        Attributes
        ----------
        X : list
            Running list of ``(Q, K, V)`` inputs passed to :meth:`forward <numpy_ml.neural_nets.LayerBase.forward>` with `retain_derived` set to True.
        gradients : dict
            Unused
        parameters : dict
            Unused
        hyperparameters : dict
            Dictionary of layer hyperparameters
        derived_variables : dict
            Dictionary of any intermediate values computed during
            forward/backward propagation.
        """  # noqa: E501
        super().__init__(optimizer)

        self.init, self.scale, self.dropout_p = init, scale, dropout_p
        self._init_params()

    def _init_params(self):
        # the softmax is wrapped so that dropout acts on the attention weights
        self.softmax = Dropout(Softmax(), self.dropout_p)
        dropout_state = self.softmax.derived_variables["wrappers"][0]

        self.gradients, self.parameters = {}, {}
        self.derived_variables = {
            "attention_weights": [],
            # references the entry kept by the softmax's first wrapper
            "dropout_mask": dropout_state["dropout_mask"],
        }

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        hparams = {
            "layer": "DotProductAttention",
            "init": self.init,
            "scale": self.scale,
            "dropout_p": self.dropout_p,
        }
        hparams["optimizer"] = {
            "cache": self.optimizer.cache,
            "hyperparameters": self.optimizer.hyperparameters,
        }
        return hparams

    def freeze(self):
        """
        Mark the layer, and the softmax it wraps, as not trainable so that no
        further updates can be applied.
        """
        self.trainable = False
        self.softmax.freeze()

    def unfreeze(self):
        """Mark the layer, and the softmax it wraps, as trainable again."""
        self.trainable = True
        self.softmax.unfreeze()

    def forward(self, Q, K, V, retain_derived=True):
        r"""
        Compute the attention-weighted output for a collection of queries,
        keys, and values.

        Notes
        -----
        Informally:

            - Query vectors ask questions
            - Key vectors advertise their relevancy to questions
            - Value vectors give possible answers to questions
            - The dot product between Key and Query vectors provides scores for
              each of the `n_ex` different Value vectors

        For a single query and `n` key-value pairs, dot-product attention (with
        scaling) is::

            w0 = dropout(softmax( (query @ key[0]) / sqrt(d_k) ))
            w1 = dropout(softmax( (query @ key[1]) / sqrt(d_k) ))
                                    ...
            wn = dropout(softmax( (query @ key[n]) / sqrt(d_k) ))

            y = np.array([w0, ..., wn]) @ values
                      (1 x n_ex)      (n_ex x d_v)

        That is, each query-key dot product is a score; the softmax turns the
        scores into one weight per value vector, and the output for the query
        is the weighted sum of the value vectors, a :math:`1 \times d_v`
        vector.

        In vectorized form,

        .. math::

            \mathbf{Y} = \text{dropout}(
                \text{softmax}(\mathbf{QK}^\top / \sqrt{d_k})
            ) \mathbf{V}

        Parameters
        ----------
        Q : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, *, d_k)`
            A set of `n_ex` query vectors packed into a single matrix.
            Optional middle dimensions can be used to specify, e.g., the number
            of parallel attention heads.
        K : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, *, d_k)`
            A set of `n_ex` key vectors packed into a single matrix. Optional
            middle dimensions can be used to specify, e.g., the number of
            parallel attention heads.
        V : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, *, d_v)`
            A set of `n_ex` value vectors packed into a single matrix. Optional
            middle dimensions can be used to specify, e.g., the number of
            parallel attention heads.
        retain_derived : bool
            Whether to store the inputs and attention weights for a later call
            to :meth:`backward`. Pass False when no backprop through this
            input is expected. Default is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, *, d_v)`
            The attention-weighted output values
        """
        output, attn = self._fwd(Q, K, V)

        if not retain_derived:
            return output

        self.X.append((Q, K, V))
        self.derived_variables["attention_weights"].append(attn)
        return output

    def _fwd(self, Q, K, V):
        """Compute the layer output and the attention weights that produced it"""
        if self.scale:
            factor = 1 / np.sqrt(Q.shape[-1])
        else:
            factor = 1

        raw_scores = Q @ K.swapaxes(-2, -1)  # query-key dot products
        attn = self.softmax.forward(raw_scores * factor)  # attention weights
        return attn @ V, attn

    def backward(self, dLdy, retain_grads=True):
        r"""
        Backprop from layer outputs to inputs.

        Parameters
        ----------
        dLdy : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, *, d_v)` or list of arrays
            The gradient of the loss wrt. the layer output `Y`, one entry per
            retained forward pass.
        retain_grads : bool
            Accepted for interface compatibility; not read by this method.
            Default is True.

        Returns
        -------
        dQ : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, *, d_k)` or list of arrays
            The gradient of the loss wrt. the layer query matrix/matrices `Q`.
        dK : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, *, d_k)` or list of arrays
            The gradient of the loss wrt. the layer key matrix/matrices `K`.
        dV : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, *, d_v)` or list of arrays
            The gradient of the loss wrt. the layer value matrix/matrices `V`.
        """  # noqa: E501
        assert self.trainable, "Layer is frozen"
        upstream = dLdy if isinstance(dLdy, list) else [dLdy]
        cached_weights = self.derived_variables["attention_weights"]

        per_input = [
            self._bwd(dy, q, k, v, w)
            for dy, (q, k, v), w in zip(upstream, self.X, cached_weights)
        ]
        dQ = [grads[0] for grads in per_input]
        dK = [grads[1] for grads in per_input]
        dV = [grads[2] for grads in per_input]

        # with a single retained input, hand back bare arrays instead of lists
        if len(self.X) == 1:
            return dQ[0], dK[0], dV[0]
        return dQ, dK, dV

    def _bwd(self, dy, q, k, v, weights):
        """Compute the gradient of the loss wrt. q, k, and v for one input"""
        if self.scale:
            factor = 1 / np.sqrt(k.shape[-1])
        else:
            factor = 1

        grad_v = weights.swapaxes(-2, -1) @ dy
        grad_weights = dy @ v.swapaxes(-2, -1)
        grad_scores = self.softmax.backward(grad_weights)
        grad_q = grad_scores @ k * factor
        grad_k = grad_scores.swapaxes(-2, -1) @ q * factor
        return grad_q, grad_k, grad_v


class RBM(LayerBase):
    def __init__(self, n_out, K=1, init="glorot_uniform", optimizer=None):
        """
        A Restricted Boltzmann machine with Bernoulli visible and hidden units.

        Notes
        -----
        The number of visible units `n_in` is unknown at construction time.
        The weights, biases, gradients, and derived variables are therefore
        allocated lazily by the first call to :meth:`forward`, which reads
        `n_in` from the second axis of its input.

        Parameters
        ----------
        n_out : int
            The number of output dimensions/units.
        K : int
            The number of contrastive divergence steps to run before computing
            a single gradient update. Default is 1.
        init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}
            The weight initialization strategy. Default is `'glorot_uniform'`.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method.  If None, use the :class:`SGD
            <numpy_ml.neural_nets.optimizers.SGD>` optimizer with
            default parameters. Default is None.

        Attributes
        ----------
        X : list
            Unused
        gradients : dict
            Dictionary of loss gradients with regard to the layer parameters
        parameters : dict
            Dictionary of layer parameters
        hyperparameters : dict
            Dictionary of layer hyperparameters
        derived_variables : dict
            Dictionary of any intermediate values computed during
            forward/backward propagation.
        """  # noqa: E501
        super().__init__(optimizer)

        self.K = K  # CD-K
        self.init = init
        self.n_in = None
        self.n_out = n_out
        self.is_initialized = False
        self.act_fn_V = ActivationInitializer("Sigmoid")()
        self.act_fn_H = ActivationInitializer("Sigmoid")()
        self.parameters = {"W": None, "b_in": None, "b_out": None}

        # Do NOT call _init_params() here: n_in is still None, so allocating
        # arrays of shape (1, n_in) / (n_in, n_out) would raise. forward()
        # performs the initialization once the input width is known.

    def _init_params(self):
        """
        Allocate the weights and biases and reset gradients and derived
        variables. Requires ``self.n_in`` to have been set.
        """
        init_weights = WeightInitializer(str(self.act_fn_V), mode=self.init)

        b_in = np.zeros((1, self.n_in))
        b_out = np.zeros((1, self.n_out))
        W = init_weights((self.n_in, self.n_out))

        self.parameters = {"W": W, "b_in": b_in, "b_out": b_out}

        self.gradients = {
            "W": np.zeros_like(W),
            "b_in": np.zeros_like(b_in),
            "b_out": np.zeros_like(b_out),
        }

        self.derived_variables = {
            "V": None,
            "p_H": None,
            "p_V_prime": None,
            "p_H_prime": None,
            "positive_grad": None,
            "negative_grad": None,
        }
        self.is_initialized = True

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        return {
            "layer": "RBM",
            "K": self.K,
            "n_in": self.n_in,
            "n_out": self.n_out,
            "init": self.init,
            "optimizer": {
                "cache": self.optimizer.cache,
                "hyperparameters": self.optimizer.hyperparameters,
            },
        }

    def CD_update(self, X):
        """
        Run one contrastive divergence-`k` pass using the visible inputs `X`
        as the starting point for the Gibbs sampler.

        This calls :meth:`forward` followed by :meth:`backward`, which leaves
        the resulting gradients in ``self.gradients``.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)`
            Layer input, representing the `n_in`-dimensional features for a
            minibatch of `n_ex` examples. Each feature in X should ideally be
            binary-valued, although it is possible to also train on real-valued
            features ranging between (0, 1) (e.g., grayscale images).
        """
        self.forward(X)
        self.backward()

    def forward(self, V, K=None, retain_derived=True):
        """
        Perform the CD-`k` "forward pass" of visible inputs into hidden units
        and back.

        Notes
        -----
        This implementation follows [1]_'s recommendations for the RBM forward
        pass:

            - Use real-valued probabilities for both the data and the visible
              unit reconstructions.
            - Only the final update of the hidden units should use the actual
              probabilities -- all others should be sampled binary states.
            - When collecting the pairwise statistics for learning weights or
              the individual statistics for learning biases, use the
              probabilities, not the binary states.

        The method returns nothing; its results are written to
        ``self.derived_variables`` when `retain_derived` is True.

        References
        ----------
        .. [1] Hinton, G. (2010). "A practical guide to training restricted
           Boltzmann machines". *UTML TR 2010-003*

        Parameters
        ----------
        V : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)`
            Visible input, representing the `n_in`-dimensional features for a
            minibatch of `n_ex` examples. Each feature in V should ideally be
            binary-valued, although it is possible to also train on real-valued
            features ranging between (0, 1) (e.g., grayscale images).
        K : int
            The number of steps of contrastive divergence steps to run before
            computing the gradient update. If None, use ``self.K``. Default is
            None.
        retain_derived : bool
            Whether to retain the variables calculated during the forward pass
            for use later during backprop. If False, this suggests the layer
            will not be expected to backprop through wrt. this input. Default
            is True.

        Raises
        ------
        ValueError
            If the effective number of CD steps is less than 1.
        """
        if not self.is_initialized:
            self.n_in = V.shape[1]
            self._init_params()

        # override self.K if necessary
        K = self.K if K is None else K
        if K < 1:
            # with zero steps there is no reconstruction to take statistics of
            raise ValueError("K must be >= 1, but got {}".format(K))

        W = self.parameters["W"]
        b_in = self.parameters["b_in"]
        b_out = self.parameters["b_out"]

        # compute hidden unit probabilities
        Z_H = V @ W + b_out
        p_H = self.act_fn_H.fn(Z_H)

        # sample hidden states (stochastic binary values)
        H = np.random.rand(*p_H.shape) <= p_H
        H = H.astype(float)

        # always use probabilities when computing gradients
        positive_grad = V.T @ p_H

        # perform CD-k
        # TODO: use persistent CD-k
        # https://www.cs.toronto.edu/~tijmen/pcd/pcd.pdf
        H_prime = H.copy()
        for k in range(K):
            # resample v' given h (H_prime is binary for all but final step)
            Z_V_prime = H_prime @ W.T + b_in
            p_V_prime = self.act_fn_V.fn(Z_V_prime)

            # don't resample visual units - always use raw probabilities!
            V_prime = p_V_prime

            # compute p(h' | v')
            Z_H_prime = V_prime @ W + b_out
            p_H_prime = self.act_fn_H.fn(Z_H_prime)

            # if this is the final iteration of CD, keep hidden state
            # probabilities (don't sample). Compare against the effective
            # `K` for this call, not `self.K`, so an override is honoured.
            H_prime = p_H_prime
            if k != K - 1:
                H_prime = np.random.rand(*p_H_prime.shape) <= p_H_prime
                H_prime = H_prime.astype(float)

        negative_grad = p_V_prime.T @ p_H_prime

        if retain_derived:
            self.derived_variables["V"] = V
            self.derived_variables["p_H"] = p_H
            self.derived_variables["p_V_prime"] = p_V_prime
            self.derived_variables["p_H_prime"] = p_H_prime
            self.derived_variables["positive_grad"] = positive_grad
            self.derived_variables["negative_grad"] = negative_grad

    def backward(self, retain_grads=True, *args):
        """
        Compute the contrastive divergence parameter gradients from the
        statistics cached by the most recent :meth:`forward` call and store
        them in ``self.gradients``.

        Parameters
        ----------
        retain_grads : bool
            Whether to include the intermediate parameter gradients computed
            during the backward pass in the final parameter update. If False,
            ``self.gradients`` is left untouched. Default is True.
        *args
            Ignored.
        """
        V = self.derived_variables["V"]
        p_H = self.derived_variables["p_H"]
        p_V_prime = self.derived_variables["p_V_prime"]
        p_H_prime = self.derived_variables["p_H_prime"]
        positive_grad = self.derived_variables["positive_grad"]
        negative_grad = self.derived_variables["negative_grad"]

        if retain_grads:
            # NOTE(review): these are "data minus reconstruction" statistics;
            # whether the optimizer expects an ascent or a descent direction
            # is not visible from this class -- confirm the sign convention.
            #
            # Sum the per-example bias statistics over the batch axis so each
            # gradient has the same (1, n_units) shape as the bias it updates.
            # The W statistics are already summed over examples by the matrix
            # products in forward().
            self.gradients["b_in"] = (V - p_V_prime).sum(axis=0, keepdims=True)
            self.gradients["b_out"] = (p_H - p_H_prime).sum(axis=0, keepdims=True)
            self.gradients["W"] = positive_grad - negative_grad

    def reconstruct(self, X, n_steps=10, return_prob=False):
        """
        Reconstruct an input `X` by running the trained Gibbs sampler for
        `n_steps`-worth of CD-`k`.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)`
            Layer input, representing the `n_in`-dimensional features for a
            minibatch of `n_ex` examples. Each feature in `X` should ideally be
            binary-valued, although it is possible to also train on real-valued
            features ranging between (0, 1) (e.g., grayscale images). If `X` has
            missing values, it may be sufficient to mark them with random
            entries and allow the reconstruction to impute them.
        n_steps : int
            The number of Gibbs sampling steps to perform when generating the
            reconstruction. Default is 10.
        return_prob : bool
            Whether to return the real-valued feature probabilities for the
            reconstruction or the binary samples. Default is False.

        Returns
        -------
        V : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)`
            The reconstruction (or feature probabilities if `return_prob` is
            true) of the visual input `X` after running the Gibbs sampler for
            `n_steps`.
        """
        self.forward(X, K=n_steps)
        p_V_prime = self.derived_variables["p_V_prime"]

        # ignore the gradients produced during this reconstruction
        self.flush_gradients()

        # sample V_prime reconstruction if return_prob is False
        V = p_V_prime
        if not return_prob:
            V = (np.random.rand(*p_V_prime.shape) <= p_V_prime).astype(float)
        return V


#######################################################################
#                              Layer Ops                              #
#######################################################################


class Add(LayerBase):
    def __init__(self, act_fn=None, optimizer=None):
        """
        A layer that adds its inputs together elementwise and then applies
        an optional nonlinearity to the result.

        Parameters
        ----------
        act_fn : str, :doc:`Activation <numpy_ml.neural_nets.activations>` object, or None
            The element-wise nonlinearity applied to the summed inputs. If
            None, use the identity function :math:`f(x) = x`. Default is
            None.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method.  If None, use the :class:`SGD
            <numpy_ml.neural_nets.optimizers.SGD>` optimizer with
            default parameters. Default is None.

        Attributes
        ----------
        X : list
            Running list of inputs to the :meth:`forward <numpy_ml.neural_nets.LayerBase.forward>` method since the last call to :meth:`update <numpy_ml.neural_nets.LayerBase.update>`. Only updated if the `retain_derived` argument was set to True.
        gradients : dict
            Unused
        parameters : dict
            Unused
        hyperparameters : dict
            Dictionary of layer hyperparameters
        derived_variables : dict
            Dictionary of any intermediate values computed during
            forward/backward propagation.
        """  # noqa: E501
        super().__init__(optimizer)
        self.act_fn = ActivationInitializer(act_fn)()
        self._init_params()

    def _init_params(self):
        """Reset the (empty) gradient/parameter dicts and the cached sums."""
        self.gradients = {}
        self.parameters = {}
        # one pre-activation sum is cached per retained forward call
        self.derived_variables = {"sum": []}

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        hparams = {"layer": "Sum", "act_fn": str(self.act_fn)}
        hparams["optimizer"] = {
            "cache": self.optimizer.cache,
            "hyperparameters": self.optimizer.hyperparameters,
        }
        return hparams

    def forward(self, X, retain_derived=True):
        r"""
        Compute the layer output on a single minibatch.

        Parameters
        ----------
        X : list of length `n_inputs`
            A list of tensors, all of the same shape.
        retain_derived : bool
            Whether to cache the inputs and their pre-activation sum for use
            later during backprop. If False, this suggests the layer will not
            be expected to backprop through wrt. this input. Default is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, *)`
            The activation function applied to the elementwise sum of the
            entries of `X`.
        """
        # copy so that the in-place accumulation never mutates the caller's
        # first input
        total = X[0].copy()
        for addend in X[1:]:
            total += addend

        if retain_derived:
            self.X.append(X)
            self.derived_variables["sum"].append(total)

        return self.act_fn(total)

    def backward(self, dLdY, retain_grads=True):
        r"""
        Backprop from layer outputs to inputs.

        Parameters
        ----------
        dLdY : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, *)`
            The gradient of the loss wrt. the layer output `Y`. A single
            array is treated as a one-element list.
        retain_grads : bool
            Kept for interface parity with the other layers; this method
            does not read it. Default is True.

        Returns
        -------
        dX : list of length `n_inputs`
            The gradient of the loss wrt. each input in `X`. When more than
            one forward call has been cached, a list of such lists.
        """
        upstream = dLdY if isinstance(dLdY, list) else [dLdY]

        cached_inputs = self.X
        cached_sums = self.derived_variables["sum"]

        grads = []
        for dy, x, ss in zip(upstream, cached_inputs, cached_sums):
            grads.append(self._bwd(dy, x, ss))

        if len(cached_inputs) == 1:
            return grads[0]
        return grads

    def _bwd(self, dLdY, X, _sum):
        """Actual computation of gradient of the loss wrt. each input"""
        # every addend receives the same gradient: the upstream gradient
        # scaled by the activation gradient at the pre-activation sum
        per_input = []
        for _ in X:
            per_input.append(dLdY * self.act_fn.grad(_sum))
        return per_input


class Multiply(LayerBase):
    def __init__(self, act_fn=None, optimizer=None):
        """
        A layer that multiplies its inputs together *elementwise* and then
        applies an optional nonlinearity to the result.

        Parameters
        ----------
        act_fn : str, :doc:`Activation <numpy_ml.neural_nets.activations>` object, or None
            The element-wise nonlinearity applied to the product of the
            inputs. If None, use the identity function :math:`f(x) = x`.
            Default is None.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method.  If None, use the :class:`SGD
            <numpy_ml.neural_nets.optimizers.SGD>` optimizer with
            default parameters. Default is None.

        Attributes
        ----------
        X : list
            Running list of inputs to the :meth:`forward <numpy_ml.neural_nets.LayerBase.forward>` method since the last call to :meth:`update <numpy_ml.neural_nets.LayerBase.update>`. Only updated if the `retain_derived` argument was set to True.
        gradients : dict
            Unused
        parameters : dict
            Unused
        hyperparameters : dict
            Dictionary of layer hyperparameters
        derived_variables : dict
            Dictionary of any intermediate values computed during
            forward/backward propagation.
        """  # noqa: E501
        super().__init__(optimizer)
        self.act_fn = ActivationInitializer(act_fn)()
        self._init_params()

    def _init_params(self):
        """Reset the (empty) gradient/parameter dicts and the cached products."""
        self.gradients = {}
        self.parameters = {}
        # one pre-activation product is cached per retained forward call
        self.derived_variables = {"product": []}

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        hparams = {"layer": "Multiply", "act_fn": str(self.act_fn)}
        hparams["optimizer"] = {
            "cache": self.optimizer.cache,
            "hyperparameters": self.optimizer.hyperparameters,
        }
        return hparams

    def forward(self, X, retain_derived=True):
        r"""
        Compute the layer output on a single minibatch.

        Parameters
        ----------
        X : list of length `n_inputs`
            A list of tensors, all of the same shape.
        retain_derived : bool
            Whether to cache the inputs and their pre-activation product for
            use later during backprop. If False, this suggests the layer will
            not be expected to backprop through wrt. this input. Default is
            True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, *)`
            The activation function applied to the elementwise product of
            the entries of `X`.
        """  # noqa: E501
        # copy so that the in-place accumulation never mutates the caller's
        # first input
        running = X[0].copy()
        for factor in X[1:]:
            running *= factor

        if retain_derived:
            self.X.append(X)
            self.derived_variables["product"].append(running)

        return self.act_fn(running)

    def backward(self, dLdY, retain_grads=True):
        r"""
        Backprop from layer outputs to inputs.

        Parameters
        ----------
        dLdY : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, *)`
            The gradient of the loss wrt. the layer output `Y`. A single
            array is treated as a one-element list.
        retain_grads : bool
            Kept for interface parity with the other layers; this method
            does not read it. Default is True.

        Returns
        -------
        dX : list of length `n_inputs`
            The gradient of the loss wrt. each input in `X`. When more than
            one forward call has been cached, a list of such lists.
        """
        upstream = dLdY if isinstance(dLdY, list) else [dLdY]

        cached_inputs = self.X
        cached_products = self.derived_variables["product"]

        grads = []
        for dy, x, pr in zip(upstream, cached_inputs, cached_products):
            grads.append(self._bwd(dy, x, pr))

        if len(cached_inputs) == 1:
            return grads[0]
        return grads

    def _bwd(self, dLdY, X, prod):
        """Actual computation of gradient of loss wrt. each input"""
        # shared factor: upstream gradient times the activation gradient at
        # the pre-activation product
        base = dLdY * self.act_fn.grad(prod)

        # the gradient wrt. input j is `base` times every *other* input,
        # multiplied in input order
        grads = []
        for j in range(len(X)):
            g = base
            for i, x in enumerate(X):
                if i != j:
                    g = g * x
            grads.append(g)
        return grads


class Flatten(LayerBase):
    def __init__(self, keep_dim="first", optimizer=None):
        """
        Flatten a multidimensional input into a 2D matrix.

        Parameters
        ----------
        keep_dim : {'first', 'last', -1}
            The dimension of the original input to retain. Typically used for
            retaining the minibatch dimension. If -1, flatten all dimensions
            into a single row. Default is 'first'.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method.  If None, use the :class:`SGD
            <numpy_ml.neural_nets.optimizers.SGD>` optimizer with
            default parameters. Default is None.

        Attributes
        ----------
        X : list
            Unused
        gradients : dict
            Unused
        parameters : dict
            Unused
        hyperparameters : dict
            Dictionary of layer hyperparameters
        derived_variables : dict
            Dictionary of any intermediate values computed during
            forward/backward propagation.
        """  # noqa: E501
        super().__init__(optimizer)

        self.keep_dim = keep_dim
        self._init_params()

    def _init_params(self):
        """Reset the (empty) gradient/parameter dicts and the cached input shapes."""
        self.gradients = {}
        self.parameters = {}
        # one input shape is cached per retained forward call
        self.derived_variables = {"in_dims": []}

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        hparams = {"layer": "Flatten", "keep_dim": self.keep_dim}
        hparams["optimizer"] = {
            "cache": self.optimizer.cache,
            "hyperparameters": self.optimizer.hyperparameters,
        }
        return hparams

    def forward(self, X, retain_derived=True):
        r"""
        Compute the layer output on a single minibatch.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>`
            Input volume to flatten.
        retain_derived : bool
            Whether to cache the shape of `X` so that :meth:`backward` can
            undo the flattening. If False, this suggests the layer will not
            be expected to backprop through wrt. this input. Default is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(*out_dims)`
            Flattened output. If `keep_dim` is -1, `X` is reshaped to
            ``(1, -1)``. If `keep_dim` is `'first'`, `X` is reshaped to
            ``(X.shape[0], -1)``, otherwise to ``(-1, X.shape[-1])``.
        """
        if retain_derived:
            self.derived_variables["in_dims"].append(X.shape)

        if self.keep_dim == -1:
            return X.flatten().reshape(1, -1)

        if self.keep_dim == "first":
            target = (X.shape[0], -1)
        else:
            target = (-1, X.shape[-1])
        return X.reshape(*target)

    def backward(self, dLdy, retain_grads=True):
        r"""
        Backprop from layer outputs to inputs.

        Parameters
        ----------
        dLdy : :py:class:`ndarray <numpy.ndarray>` of shape `(*out_dims)` or list of arrays
            The gradient of the loss wrt. the layer output `Y`. A single
            array is treated as a one-element list.
        retain_grads : bool
            Kept for interface parity with the other layers; this method
            does not read it. Default is True.

        Returns
        -------
        dX : :py:class:`ndarray <numpy.ndarray>` of shape `(*in_dims)` or list of arrays
            The gradient of the loss wrt. the layer input(s) `X`, i.e. each
            upstream gradient reshaped to the corresponding cached input
            shape.
        """  # noqa: E501
        upstream = dLdy if isinstance(dLdy, list) else [dLdy]
        cached_shapes = self.derived_variables["in_dims"]

        restored = []
        for dy, dims in zip(upstream, cached_shapes):
            restored.append(dy.reshape(*dims))

        if len(upstream) == 1:
            return restored[0]
        return restored


#######################################################################
#                        Normalization Layers                         #
#######################################################################


class BatchNorm2D(LayerBase):
    def __init__(self, momentum=0.9, epsilon=1e-5, optimizer=None):
        """
        A batch normalization layer for two-dimensional inputs with an
        additional channel dimension.

        Notes
        -----
        BatchNorm is an attempt to address the problem of internal covariate
        shift (ICS) during training by normalizing layer inputs.

        ICS refers to the change in the distribution of layer inputs during
        training as a result of the changing parameters of the previous
        layer(s). ICS can make it difficult to train models with saturating
        nonlinearities, and in general can slow training by requiring a lower
        learning rate.

        Equations [train]::

            Y = scaler * norm(X) + intercept
            norm(X) = (X - mean(X)) / sqrt(var(X) + epsilon)

        Equations [test]::

            Y = scaler * running_norm(X) + intercept
            running_norm(X) = (X - running_mean) / sqrt(running_var + epsilon)

        In contrast to :class:`LayerNorm2D`, the BatchNorm layer calculates
        the mean and var across the *batch* rather than the output features.
        This has two disadvantages:

            1. It is highly affected by batch size: smaller mini-batch sizes
            increase the variance of the estimates for the global mean and
            variance.

            2. It is difficult to apply in RNNs -- one must fit a separate
            BatchNorm layer for *each* time-step.

        The per-channel parameters are allocated on the first call to
        :meth:`forward`, when the channel count is read from the input.

        Parameters
        ----------
        momentum : float
            The momentum term for the running mean/running std calculations.
            The closer this is to 1, the less weight will be given to the
            mean/std of the current batch (i.e., higher smoothing). Default is
            0.9.
        epsilon : float
            A small smoothing constant to use during computation of ``norm(X)``
            to avoid divide-by-zero errors. Default is 1e-5.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method.  If None, use the :class:`SGD
            <numpy_ml.neural_nets.optimizers.SGD>` optimizer with
            default parameters. Default is None.

        Attributes
        ----------
        X : list
            Running list of inputs to the :meth:`forward <numpy_ml.neural_nets.LayerBase.forward>` method since the last call to :meth:`update <numpy_ml.neural_nets.LayerBase.update>`. Only updated if the `retain_derived` argument was set to True.
        gradients : dict
            Dictionary of loss gradients with regard to the layer parameters
        parameters : dict
            Dictionary of layer parameters
        hyperparameters : dict
            Dictionary of layer hyperparameters
        derived_variables : dict
            Dictionary of any intermediate values computed during
            forward/backward propagation.
        """  # noqa: E501
        super().__init__(optimizer)

        self.in_ch = None
        self.out_ch = None
        self.epsilon = epsilon
        self.momentum = momentum
        # placeholders until the channel count is known
        self.parameters = {
            "scaler": None,
            "intercept": None,
            "running_var": None,
            "running_mean": None,
        }
        self.is_initialized = False

    def _init_params(self):
        """
        Allocate the per-channel parameters, running statistics, and
        gradient buffers. Requires ``self.in_ch`` to have been set.
        """
        n_channels = self.in_ch

        gamma = np.random.rand(n_channels)
        beta = np.zeros(n_channels)

        # running mean starts at 0 and running variance at 1
        self.parameters = {
            "scaler": gamma,
            "intercept": beta,
            "running_var": np.ones(n_channels),
            "running_mean": np.zeros(n_channels),
        }

        self.gradients = {
            "scaler": np.zeros_like(gamma),
            "intercept": np.zeros_like(beta),
        }

        self.is_initialized = True

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        hparams = {
            "layer": "BatchNorm2D",
            "act_fn": None,
            "in_ch": self.in_ch,
            "out_ch": self.out_ch,
            "epsilon": self.epsilon,
            "momentum": self.momentum,
        }
        hparams["optimizer"] = {
            "cache": self.optimizer.cache,
            "hyperparameters": self.optimizer.hyperparameters,
        }
        return hparams

    def reset_running_stats(self):
        """Reset the running mean and variance estimates to 0 and 1."""
        assert self.trainable, "Layer is frozen"
        n_channels = self.in_ch
        self.parameters["running_mean"] = np.zeros(n_channels)
        self.parameters["running_var"] = np.ones(n_channels)

    def forward(self, X, retain_derived=True):
        """
        Compute the layer output on a single minibatch.

        Notes
        -----
        Equations [train]::

            Y = scaler * norm(X) + intercept
            norm(X) = (X - mean(X)) / sqrt(var(X) + epsilon)

        Equations [test]::

            Y = scaler * running_norm(X) + intercept
            running_norm(X) = (X - running_mean) / sqrt(running_var + epsilon)

        In contrast to :class:`LayerNorm2D`, the BatchNorm layer calculates the
        mean and var across the *batch* rather than the output features.

        The batch statistics are used (and folded into the running
        estimates) only when the layer is trainable and `retain_derived` is
        True; otherwise the stored running statistics are used.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
            Input volume containing the `in_rows` x `in_cols`-dimensional
            features for a minibatch of `n_ex` examples.
        retain_derived : bool
            Whether to use the current input to adjust the running mean and
            running_var computations, and to cache `X` for backprop. Setting
            this to False is the same as freezing the layer for the current
            input. Default is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
            Layer output for each of the `n_ex` examples.
        """  # noqa: E501
        if not self.is_initialized:
            self.in_ch = self.out_ch = X.shape[3]
            self._init_params()

        eps = self.hyperparameters["epsilon"]
        mom = self.hyperparameters["momentum"]
        run_mean = self.parameters["running_mean"]
        run_var = self.parameters["running_var"]

        gamma = self.parameters["scaler"]
        beta = self.parameters["intercept"]

        # default to the running statistics (frozen layer / inference)
        mean, var = run_mean, run_var

        if self.trainable and retain_derived:
            # per-channel statistics over the batch and both spatial axes
            reduce_axes = (0, 1, 2)
            mean = X.mean(axis=reduce_axes)
            var = X.var(axis=reduce_axes)  # , ddof=1)
            self.parameters["running_mean"] = mom * run_mean + (1.0 - mom) * mean
            self.parameters["running_var"] = mom * run_var + (1.0 - mom) * var

        if retain_derived:
            self.X.append(X)

        normed = (X - mean) / np.sqrt(var + eps)
        return gamma * normed + beta

    def backward(self, dLdy, retain_grads=True):
        """
        Backprop from layer outputs to inputs.

        Parameters
        ----------
        dLdy : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)` or list of arrays
            The gradient of the loss wrt. the layer output `Y`. A single
            array is treated as a one-element list.
        retain_grads : bool
            Whether to accumulate the scaler and intercept gradients computed
            during the backward pass into ``self.gradients``. Default is
            True.

        Returns
        -------
        dX : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)` or list of arrays
            The gradient of the loss wrt. the layer input `X`.
        """  # noqa: E501
        assert self.trainable, "Layer is frozen"
        upstream = dLdy if isinstance(dLdy, list) else [dLdy]

        cached_inputs = self.X
        input_grads = []
        for dy, x in zip(upstream, cached_inputs):
            dx, dScaler, dIntercept = self._bwd(dy, x)
            input_grads.append(dx)

            if retain_grads:
                self.gradients["scaler"] += dScaler
                self.gradients["intercept"] += dIntercept

        if len(cached_inputs) == 1:
            return input_grads[0]
        return input_grads

    def _bwd(self, dLdy, X):
        """Computation of gradient of loss wrt. X, scaler, and intercept"""
        gamma = self.parameters["scaler"]
        eps = self.hyperparameters["epsilon"]

        # collapse every axis except the channel axis, so the problem
        # becomes a 1D batchnorm backward pass over the rows
        orig_shape = X.shape
        X2d = np.reshape(X, (-1, X.shape[3]))
        dY2d = np.reshape(dLdy, (-1, dLdy.shape[3]))

        n_rows, _ = X2d.shape
        mean = X2d.mean(axis=0)
        var = X2d.var(axis=0)  # , ddof=1)
        std = np.sqrt(var + eps)

        normed = (X2d - mean) / std
        dIntercept = dY2d.sum(axis=0)
        dScaler = np.sum(dY2d * normed, axis=0)

        dNormed = dY2d * gamma
        numerator = (
            n_rows * dNormed
            - dNormed.sum(axis=0)
            - normed * (dNormed * normed).sum(axis=0)
        )
        dX = numerator / (n_rows * std)

        return np.reshape(dX, orig_shape), dScaler, dIntercept


class BatchNorm1D(LayerBase):
    def __init__(self, momentum=0.9, epsilon=1e-5, optimizer=None):
        """
        A batch normalization layer for 1D inputs.

        Notes
        -----
        BatchNorm is an attempt to address the problem of internal covariate
        shift (ICS) during training by normalizing layer inputs.

        ICS is the drift in the distribution of a layer's inputs that occurs
        as the parameters of the preceding layer(s) change during training.
        It can make models with saturating nonlinearities hard to train and,
        more generally, slows training by forcing a lower learning rate.

        Equations [train]::

            Y = scaler * norm(X) + intercept
            norm(X) = (X - mean(X)) / sqrt(var(X) + epsilon)

        Equations [test]::

            Y = scaler * running_norm(X) + intercept
            running_norm(X) = (X - running_mean) / sqrt(running_var + epsilon)

        Unlike :class:`LayerNorm1D`, the statistics here are taken across the
        *batch* rather than across the output features. Two drawbacks follow:

            1. The layer is sensitive to batch size: small minibatches give
            higher-variance estimates of the global mean and variance.

            2. It is awkward to use in RNNs -- a separate BatchNorm layer has
            to be fit for *each* time-step.

        Parameters
        ----------
        momentum : float
            The momentum term for the running mean / running variance
            updates. Values closer to 1 give less weight to the statistics of
            the current batch (i.e., more smoothing). Default is 0.9.
        epsilon : float
            A small smoothing constant added to the variance when computing
            ``norm(X)`` to avoid divide-by-zero errors. Default is 1e-5.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method.  If None, use the :class:`SGD
            <numpy_ml.neural_nets.optimizers.SGD>` optimizer with
            default parameters. Default is None.

        Attributes
        ----------
        X : list
            Running list of inputs to the :meth:`forward <numpy_ml.neural_nets.LayerBase.forward>` method since the last call to :meth:`update <numpy_ml.neural_nets.LayerBase.update>`. Only updated if the `retain_derived` argument was set to True.
        gradients : dict
            Dictionary of loss gradients with regard to the layer parameters
        parameters : dict
            Dictionary of layer parameters
        hyperparameters : dict
            Dictionary of layer hyperparameters
        derived_variables : dict
            Dictionary of any intermediate values computed during
            forward/backward propagation.
        """  # noqa: E501
        super().__init__(optimizer)

        # feature counts are unknown until the first forward pass
        self.n_in = None
        self.n_out = None
        self.epsilon = epsilon
        self.momentum = momentum
        self.parameters = dict.fromkeys(
            ("scaler", "intercept", "running_var", "running_mean")
        )
        self.is_initialized = False

    def _init_params(self):
        """Allocate parameters and gradient buffers for `self.n_in` features."""
        n_feat = self.n_in
        gamma = np.random.rand(n_feat)
        beta = np.zeros(n_feat)

        # running mean starts at 0 and running variance at 1
        self.parameters = {
            "scaler": gamma,
            "intercept": beta,
            "running_mean": np.zeros(n_feat),
            "running_var": np.ones(n_feat),
        }

        # only the learned parameters receive gradients
        self.gradients = {
            "scaler": np.zeros_like(gamma),
            "intercept": np.zeros_like(beta),
        }
        self.is_initialized = True

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        opt = self.optimizer
        optimizer_state = {
            "cache": opt.cache,
            "hyperparameters": opt.hyperparameters,
        }
        return {
            "layer": "BatchNorm1D",
            "act_fn": None,
            "n_in": self.n_in,
            "n_out": self.n_out,
            "epsilon": self.epsilon,
            "momentum": self.momentum,
            "optimizer": optimizer_state,
        }

    def reset_running_stats(self):
        """Reset the running mean and variance estimates to 0 and 1."""
        assert self.trainable, "Layer is frozen"
        n_feat = self.n_in
        self.parameters.update(
            running_mean=np.zeros(n_feat), running_var=np.ones(n_feat)
        )

    def forward(self, X, retain_derived=True):
        """
        Compute the layer output on a single minibatch.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)`
            Layer input, representing the `n_in`-dimensional features for a
            minibatch of `n_ex` examples.
        retain_derived : bool
            If True, `X` is appended to ``self.X`` and -- provided the layer
            is trainable -- the statistics of `X` are used for normalization
            and folded into the running mean and variance. If False (or if
            the layer is frozen), the stored running statistics are used for
            normalization and left unchanged. Default is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)`
            Layer output for each of the `n_ex` examples
        """
        if not self.is_initialized:
            # infer the feature count from the first batch seen
            self.n_in = self.n_out = X.shape[1]
            self._init_params()

        hparams = self.hyperparameters
        eps, mom = hparams["epsilon"], hparams["momentum"]
        run_mean = self.parameters["running_mean"]
        run_var = self.parameters["running_var"]

        gamma = self.parameters["scaler"]
        beta = self.parameters["intercept"]

        if self.trainable and retain_derived:
            # normalize with the batch statistics and update the running ones
            mu, var = X.mean(axis=0), X.var(axis=0)
            self.parameters["running_mean"] = mom * run_mean + (1.0 - mom) * mu
            self.parameters["running_var"] = mom * run_var + (1.0 - mom) * var
        else:
            # frozen layer or throwaway pass: rely on the running statistics
            mu, var = run_mean, run_var

        if retain_derived:
            self.X.append(X)

        X_hat = (X - mu) / np.sqrt(var + eps)
        return gamma * X_hat + beta

    def backward(self, dLdy, retain_grads=True):
        """
        Backprop from layer outputs to inputs.

        Parameters
        ----------
        dLdy : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)` or list of arrays
            The gradient(s) of the loss wrt. the layer output(s) `Y`. Entries
            are paired in order with the inputs stored in ``self.X``.
        retain_grads : bool
            Whether to include the intermediate parameter gradients computed
            during the backward pass in the final parameter update. Default is
            True.

        Returns
        -------
        dX : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)` or list of arrays
            The gradient of the loss wrt. the layer input(s) `X`. A single
            array when ``self.X`` holds exactly one input, else a list.
        """
        assert self.trainable, "Layer is frozen"

        out_grads = dLdy if isinstance(dLdy, list) else [dLdy]
        stored_inputs = self.X

        in_grads = []
        for grad_y, x_in in zip(out_grads, stored_inputs):
            grad_x, grad_scaler, grad_intercept = self._bwd(grad_y, x_in)
            in_grads.append(grad_x)

            if not retain_grads:
                continue
            self.gradients["scaler"] += grad_scaler
            self.gradients["intercept"] += grad_intercept

        if len(stored_inputs) == 1:
            return in_grads[0]
        return in_grads

    def _bwd(self, dLdy, X):
        """
        Computation of gradient of loss wrt X, scaler, and intercept.

        The batch mean and variance are recomputed from `X`.
        """
        gamma = self.parameters["scaler"]
        eps = self.hyperparameters["epsilon"]

        n_rows, _ = X.shape
        mu = X.mean(axis=0)
        sigma = np.sqrt(X.var(axis=0) + eps)
        X_hat = (X - mu) / sigma

        grad_intercept = dLdy.sum(axis=0)
        grad_scaler = np.sum(dLdy * X_hat, axis=0)

        d_hat = dLdy * gamma
        numer = (
            n_rows * d_hat
            - d_hat.sum(axis=0)
            - X_hat * (d_hat * X_hat).sum(axis=0)
        )
        grad_X = numer / (n_rows * sigma)

        return grad_X, grad_scaler, grad_intercept


class LayerNorm2D(LayerBase):
    def __init__(self, epsilon=1e-5, optimizer=None):
        """
        A layer normalization layer for 2D inputs with an additional channel
        dimension.

        Notes
        -----
        Where :class:`BatchNorm2D` takes its statistics over the examples in
        a batch, LayerNorm takes the mean and variance over the *features* of
        each example. The estimates therefore do not depend on batch size,
        and the layer can be applied directly in RNNs.

        Equations [train & test]::

            Y = scaler * norm(X) + intercept
            norm(X) = (X - mean(X)) / sqrt(var(X) + epsilon)

        A further difference from :class:`BatchNorm2D` is that `scaler` and
        `intercept` are applied *elementwise* to ``norm(X)``.

        Parameters
        ----------
        epsilon : float
            A small smoothing constant to use during computation of ``norm(X)``
            to avoid divide-by-zero errors. Default is 1e-5.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method.  If None, use the :class:`SGD
            <numpy_ml.neural_nets.optimizers.SGD>` optimizer with
            default parameters. Default is None.

        Attributes
        ----------
        X : list
            Running list of inputs to the :meth:`forward <numpy_ml.neural_nets.LayerBase.forward>` method since the last call to :meth:`update <numpy_ml.neural_nets.LayerBase.update>`. Only updated if the `retain_derived` argument was set to True.
        gradients : dict
            Dictionary of loss gradients with regard to the layer parameters
        parameters : dict
            Dictionary of layer parameters
        hyperparameters : dict
            Dictionary of layer hyperparameters
        derived_variables : dict
            Dictionary of any intermediate values computed during
            forward/backward propagation.
        """  # noqa: E501
        super().__init__(optimizer)

        # channel counts are filled in on the first forward pass
        self.in_ch = None
        self.out_ch = None
        self.epsilon = epsilon
        self.parameters = dict.fromkeys(("scaler", "intercept"))
        self.is_initialized = False

    def _init_params(self, X_shape):
        """Allocate one scaler/intercept entry per non-batch element of `X_shape`."""
        _, in_rows, in_cols, in_ch = X_shape

        gamma = np.random.rand(in_rows, in_cols, in_ch)
        beta = np.zeros((in_rows, in_cols, in_ch))

        self.parameters = {"scaler": gamma, "intercept": beta}
        self.gradients = {
            "scaler": np.zeros_like(gamma),
            "intercept": np.zeros_like(beta),
        }

        self.is_initialized = True

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        opt = self.optimizer
        optimizer_state = {
            "cache": opt.cache,
            "hyperparameters": opt.hyperparameters,
        }
        return {
            "layer": "LayerNorm2D",
            "act_fn": None,
            "in_ch": self.in_ch,
            "out_ch": self.out_ch,
            "epsilon": self.epsilon,
            "optimizer": optimizer_state,
        }

    def forward(self, X, retain_derived=True):
        """
        Compute the layer output on a single minibatch.

        Notes
        -----
        Equations [train & test]::

            Y = scaler * norm(X) + intercept
            norm(X) = (X - mean(X)) / sqrt(var(X) + epsilon)

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
            Input volume containing the `in_rows` by `in_cols`-dimensional
            features for a minibatch of `n_ex` examples.
        retain_derived : bool
            Whether to store `X` in ``self.X`` for use later during
            backprop. If False, this suggests the layer will not be expected
            to backprop through wrt. this input. Default is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
            Layer output for each of the `n_ex` examples.
        """  # noqa: E501
        if not self.is_initialized:
            self.in_ch = self.out_ch = X.shape[3]
            self._init_params(X.shape)

        gamma = self.parameters["scaler"]
        eps = self.hyperparameters["epsilon"]
        beta = self.parameters["intercept"]

        if retain_derived:
            self.X.append(X)

        # one mean/variance per example, taken over rows, cols and channels
        per_example = (1, 2, 3)
        var = X.var(axis=per_example, keepdims=True)
        mu = X.mean(axis=per_example, keepdims=True)
        X_hat = (X - mu) / np.sqrt(var + eps)
        return gamma * X_hat + beta

    def backward(self, dLdy, retain_grads=True):
        """
        Backprop from layer outputs to inputs.

        Parameters
        ----------
        dLdy : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)` or list of arrays
            The gradient(s) of the loss wrt. the layer output(s) `Y`. Entries
            are paired in order with the inputs stored in ``self.X``.
        retain_grads : bool
            Whether to include the intermediate parameter gradients computed
            during the backward pass in the final parameter update. Default is
            True.

        Returns
        -------
        dX : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)` or list of arrays
            The gradient of the loss wrt. the layer input(s) `X`. A single
            array when ``self.X`` holds exactly one input, else a list.
        """  # noqa: E501
        assert self.trainable, "Layer is frozen"

        out_grads = dLdy if isinstance(dLdy, list) else [dLdy]
        stored_inputs = self.X

        in_grads = []
        for grad_y, x_in in zip(out_grads, stored_inputs):
            grad_x, grad_scaler, grad_intercept = self._bwd(grad_y, x_in)
            in_grads.append(grad_x)

            if not retain_grads:
                continue
            self.gradients["scaler"] += grad_scaler
            self.gradients["intercept"] += grad_intercept

        if len(stored_inputs) == 1:
            return in_grads[0]
        return in_grads

    def _bwd(self, dy, X):
        """
        Computation of gradient of the loss wrt X, scaler, intercept.

        The per-example mean and variance are recomputed from `X`.
        """
        gamma = self.parameters["scaler"]
        eps = self.hyperparameters["epsilon"]

        per_example = (1, 2, 3)
        mu = X.mean(axis=per_example, keepdims=True)
        sigma = np.sqrt(X.var(axis=per_example, keepdims=True) + eps)
        X_hat = (X - mu) / sigma

        # parameter gradients are summed over the batch axis only
        grad_intercept = dy.sum(axis=0)
        grad_scaler = np.sum(dy * X_hat, axis=0)

        # flatten each example to a row so the 1D formula applies
        n_feat = np.prod(X.shape[1:])
        flat_hat = X_hat.reshape(-1, n_feat)
        flat_dhat = (dy * gamma).reshape(flat_hat.shape)
        flat_sigma = sigma.reshape(sigma.shape[:2])

        numer = (
            n_feat * flat_dhat
            - flat_dhat.sum(axis=1, keepdims=True)
            - flat_hat * (flat_dhat * flat_hat).sum(axis=1, keepdims=True)
        )
        flat_dX = numer / (n_feat * flat_sigma)

        # restore the original 4D layout for the input gradient
        return np.reshape(flat_dX, X.shape), grad_scaler, grad_intercept


class LayerNorm1D(LayerBase):
    def __init__(self, epsilon=1e-5, optimizer=None):
        """
        A layer normalization layer for 1D inputs.

        Notes
        -----
        Where :class:`BatchNorm1D` takes its statistics over the examples in
        a batch, LayerNorm takes the mean and variance over the *features* of
        each example. The estimates therefore do not depend on batch size,
        and the layer can be applied directly in RNNs.

        Equations [train & test]::

            Y = scaler * norm(X) + intercept
            norm(X) = (X - mean(X)) / sqrt(var(X) + epsilon)

        A further difference from :class:`BatchNorm1D` is that `scaler` and
        `intercept` are applied *elementwise* to ``norm(X)``.

        Parameters
        ----------
        epsilon : float
            A small smoothing constant to use during computation of ``norm(X)``
            to avoid divide-by-zero errors. Default is 1e-5.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method.  If None, use the :class:`SGD
            <numpy_ml.neural_nets.optimizers.SGD>` optimizer with
            default parameters. Default is None.

        Attributes
        ----------
        X : list
            Running list of inputs to the :meth:`forward <numpy_ml.neural_nets.LayerBase.forward>` method since the last call to :meth:`update <numpy_ml.neural_nets.LayerBase.update>`. Only updated if the `retain_derived` argument was set to True.
        gradients : dict
            Dictionary of loss gradients with regard to the layer parameters
        parameters : dict
            Dictionary of layer parameters
        hyperparameters : dict
            Dictionary of layer hyperparameters
        derived_variables : dict
            Dictionary of any intermediate values computed during
            forward/backward propagation.
        """  # noqa: E501
        super().__init__(optimizer)

        # feature counts are filled in on the first forward pass
        self.n_in = None
        self.n_out = None
        self.epsilon = epsilon
        self.parameters = dict.fromkeys(("scaler", "intercept"))
        self.is_initialized = False

    def _init_params(self):
        """Allocate parameters and gradient buffers for `self.n_in` features."""
        n_feat = self.n_in
        gamma = np.random.rand(n_feat)
        beta = np.zeros(n_feat)

        self.parameters = {"scaler": gamma, "intercept": beta}
        self.gradients = {
            "scaler": np.zeros_like(gamma),
            "intercept": np.zeros_like(beta),
        }
        self.is_initialized = True

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        opt = self.optimizer
        optimizer_state = {
            "cache": opt.cache,
            "hyperparameters": opt.hyperparameters,
        }
        return {
            "layer": "LayerNorm1D",
            "act_fn": None,
            "n_in": self.n_in,
            "n_out": self.n_out,
            "epsilon": self.epsilon,
            "optimizer": optimizer_state,
        }

    def forward(self, X, retain_derived=True):
        """
        Compute the layer output on a single minibatch.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)`
            Layer input, representing the `n_in`-dimensional features for a
            minibatch of `n_ex` examples.
        retain_derived : bool
            Whether to store `X` in ``self.X`` for use later during
            backprop. If False, this suggests the layer will not be expected
            to backprop through wrt. this input. Default is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)`
            Layer output for each of the `n_ex` examples.
        """
        if not self.is_initialized:
            self.n_in = self.n_out = X.shape[1]
            self._init_params()

        gamma = self.parameters["scaler"]
        eps = self.hyperparameters["epsilon"]
        beta = self.parameters["intercept"]

        if retain_derived:
            self.X.append(X)

        # one mean/variance per example (row), kept 2D for broadcasting
        mu = X.mean(axis=1, keepdims=True)
        var = X.var(axis=1, keepdims=True)
        X_hat = (X - mu) / np.sqrt(var + eps)
        return gamma * X_hat + beta

    def backward(self, dLdy, retain_grads=True):
        """
        Backprop from layer outputs to inputs.

        Parameters
        ----------
        dLdy : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)` or list of arrays
            The gradient(s) of the loss wrt. the layer output(s) `Y`. Entries
            are paired in order with the inputs stored in ``self.X``.
        retain_grads : bool
            Whether to include the intermediate parameter gradients computed
            during the backward pass in the final parameter update. Default is
            True.

        Returns
        -------
        dX : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)` or list of arrays
            The gradient of the loss wrt. the layer input(s) `X`. A single
            array when ``self.X`` holds exactly one input, else a list.
        """
        assert self.trainable, "Layer is frozen"

        out_grads = dLdy if isinstance(dLdy, list) else [dLdy]
        stored_inputs = self.X

        in_grads = []
        for grad_y, x_in in zip(out_grads, stored_inputs):
            grad_x, grad_scaler, grad_intercept = self._bwd(grad_y, x_in)
            in_grads.append(grad_x)

            if not retain_grads:
                continue
            self.gradients["scaler"] += grad_scaler
            self.gradients["intercept"] += grad_intercept

        if len(stored_inputs) == 1:
            return in_grads[0]
        return in_grads

    def _bwd(self, dLdy, X):
        """
        Computation of gradient of the loss wrt X, scaler, intercept.

        The per-example mean and variance are recomputed from `X`.
        """
        gamma = self.parameters["scaler"]
        eps = self.hyperparameters["epsilon"]

        _, n_feat = X.shape
        mu = X.mean(axis=1, keepdims=True)
        sigma = np.sqrt(X.var(axis=1, keepdims=True) + eps)
        X_hat = (X - mu) / sigma

        # parameter gradients are summed over the batch axis
        grad_intercept = dLdy.sum(axis=0)
        grad_scaler = np.sum(dLdy * X_hat, axis=0)

        d_hat = dLdy * gamma
        numer = (
            n_feat * d_hat
            - d_hat.sum(axis=1, keepdims=True)
            - X_hat * (d_hat * X_hat).sum(axis=1, keepdims=True)
        )
        grad_X = numer / (n_feat * sigma)

        return grad_X, grad_scaler, grad_intercept


#######################################################################
#                             MLP Layers                              #
#######################################################################


class Embedding(LayerBase):
    def __init__(
        self, n_out, vocab_size, pool=None, init="glorot_uniform", optimizer=None,
    ):
        """
        An embedding layer.

        Notes
        -----
        Equations::

            Y = W[x]

        NB. This layer must be the first in a neural network as the gradients
        do not get passed back through to the inputs.

        Parameters
        ----------
        n_out : int
            The dimensionality of the embeddings
        vocab_size : int
            The total number of items in the vocabulary. All integer indices
            are expected to range between 0 and `vocab_size - 1`.
        pool : {'sum', 'mean', None}
            If not None, apply this function to the collection of `n_in`
            encodings in each example to produce a single, pooled embedding.
            Default is None.
        init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}
            The weight initialization strategy. Default is `'glorot_uniform'`.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method.  If None, use the :class:`SGD
            <numpy_ml.neural_nets.optimizers.SGD>` optimizer with
            default parameters. Default is None.

        Attributes
        ----------
        X : list
            Running list of inputs to the :meth:`forward <numpy_ml.neural_nets.LayerBase.forward>` method since the last call to :meth:`update <numpy_ml.neural_nets.LayerBase.update>`. Only updated if the `retain_derived` argument was set to True.
        gradients : dict
            Dictionary of loss gradients with regard to the layer parameters
        parameters : dict
            Dictionary of layer parameters
        hyperparameters : dict
            Dictionary of layer hyperparameters
        derived_variables : dict
            Dictionary of any intermediate values computed during
            forward/backward propagation.
        """  # noqa: E501
        super().__init__(optimizer)
        fstr = "'pool' must be either 'sum', 'mean', or None but got '{}'"
        assert pool in ["sum", "mean", None], fstr.format(pool)

        self.init = init
        self.pool = pool
        self.n_out = n_out
        self.vocab_size = vocab_size
        self.parameters = {"W": None}
        self.is_initialized = False
        # both dimensions of W are known up front, so initialize eagerly
        self._init_params()

    def _init_params(self):
        """Create the `(vocab_size, n_out)` embedding matrix and its gradient buffer."""
        init_weights = WeightInitializer("Affine(slope=1, intercept=0)", mode=self.init)
        W = init_weights((self.vocab_size, self.n_out))

        self.parameters = {"W": W}
        self.derived_variables = {}
        self.gradients = {"W": np.zeros_like(W)}
        self.is_initialized = True

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        return {
            "layer": "Embedding",
            "init": self.init,
            "pool": self.pool,
            "n_out": self.n_out,
            "vocab_size": self.vocab_size,
            "optimizer": {
                "cache": self.optimizer.cache,
                "hyperparameters": self.optimizer.hyperparameters,
            },
        }

    def lookup(self, ids):
        """
        Return the embeddings associated with the IDs in `ids`.

        Parameters
        ----------
        ids : :py:class:`ndarray <numpy.ndarray>` of shape (`M`,)
            An array of `M` IDs to retrieve embeddings for.

        Returns
        -------
        embeddings : :py:class:`ndarray <numpy.ndarray>` of shape (`M`, `n_out`)
            The embedding vectors for each of the `M` IDs.
        """
        return self.parameters["W"][ids]

    def forward(self, X, retain_derived=True):
        """
        Compute the layer output on a single minibatch.

        Notes
        -----
        Equations:
            Y = W[x]

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)` or list of length `n_ex`
            Layer input, representing a minibatch of `n_ex` examples. If
            ``self.pool`` is None, each example must consist of exactly `n_in`
            integer token IDs. Otherwise, `X` can be a ragged array, with each
            example consisting of a variable number of token IDs.
        retain_derived : bool
            Whether to retain the variables calculated during the forward pass
            for use later during backprop. If False, this suggests the layer
            will not be expected to backprop through with regard to this input.
            Default is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in, n_out)` or `(n_ex, 1, n_out)`
            Embeddings for each coordinate of each of the `n_ex` examples
            when ``self.pool`` is None; otherwise a single pooled embedding
            per example.

        Raises
        ------
        TypeError
            If `X` is an array with a non-integer dtype, or a list whose
            first element has a non-integer dtype.
        """  # noqa: E501
        # if X is a ragged array (only the first example's dtype is inspected)
        if isinstance(X, list) and not issubclass(X[0].dtype.type, np.integer):
            fstr = "Input to Embedding layer must be an array of integers, got '{}'"
            raise TypeError(fstr.format(X[0].dtype.type))

        # otherwise
        if isinstance(X, np.ndarray) and not issubclass(X.dtype.type, np.integer):
            fstr = "Input to Embedding layer must be an array of integers, got '{}'"
            raise TypeError(fstr.format(X.dtype.type))

        Y = self._fwd(X)
        if retain_derived:
            self.X.append(X)
        return Y

    def _fwd(self, X):
        """Actual computation of forward pass"""
        W = self.parameters["W"]
        if self.pool is None:
            emb = W[X]
        elif self.pool == "sum":
            # pooled outputs keep a singleton middle axis: (n_ex, 1, n_out)
            emb = np.array([W[x].sum(axis=0) for x in X])[:, None, :]
        elif self.pool == "mean":
            emb = np.array([W[x].mean(axis=0) for x in X])[:, None, :]
        return emb

    def backward(self, dLdy, retain_grads=True):
        """
        Backprop from layer outputs to embedding weights.

        Notes
        -----
        Because the items in `X` are interpreted as indices, we cannot compute
        the gradient of the layer output wrt. `X`. Consequently this method
        returns nothing.

        Parameters
        ----------
        dLdy : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in, n_out)` or list of arrays
            The gradient(s) of the loss wrt. the layer output(s)
        retain_grads : bool
            Whether to include the intermediate parameter gradients computed
            during the backward pass in the final parameter update. Default is
            True.
        """  # noqa: E501
        assert self.trainable, "Layer is frozen"
        if not isinstance(dLdy, list):
            dLdy = [dLdy]

        for dy, x in zip(dLdy, self.X):
            dw = self._bwd(dy, x)

            if retain_grads:
                self.gradients["W"] += dw

    def _bwd(self, dLdy, X):
        """
        Actual computation of gradient of the loss wrt. W.

        Gradients are scattered into `dW` with :func:`numpy.add.at` rather
        than ``dW[ids] += ...``: fancy-index in-place addition is buffered,
        so an ID that occurs several times in one example would only be
        credited once, whereas the forward pass sums (or averages) every
        occurrence.
        """
        dW = np.zeros_like(self.parameters["W"])
        dLdy = dLdy.reshape(-1, self.n_out)

        if self.pool is None:
            # row k of dLdy belongs to the k-th token ID of the flattened input
            np.add.at(dW, np.asarray(X).ravel(), dLdy)
        elif self.pool == "sum":
            for ix, v_ids in enumerate(X):
                # every occurrence of an ID contributed W[id] to the sum
                np.add.at(dW, v_ids, dLdy[ix])
        elif self.pool == "mean":
            for ix, v_ids in enumerate(X):
                # every occurrence contributed W[id] / len(v_ids) to the mean
                np.add.at(dW, v_ids, dLdy[ix] / len(v_ids))
        return dW


class FullyConnected(LayerBase):
    def __init__(self, n_out, act_fn=None, init="glorot_uniform", optimizer=None):
        r"""
        A dense (fully-connected) layer.

        Notes
        -----
        For a minibatch of inputs **X** the layer evaluates

        .. math::

            \mathbf{Y} = f( \mathbf{XW} + \mathbf{b} )

        where `f` is the activation nonlinearity and **W** and **b** are the
        layer's weight matrix and bias. The input dimensionality is inferred
        from the first minibatch passed to :meth:`forward`.

        Parameters
        ----------
        n_out : int
            The dimensionality of the layer output
        act_fn : str, :doc:`Activation <numpy_ml.neural_nets.activations>` object, or None
            The element-wise nonlinearity applied to the affine output. If
            None, the identity function :math:`f(X) = X` is used. Default is
            None.
        init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}
            The weight initialization strategy. Default is `'glorot_uniform'`.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy used for the gradient updates performed
            in the :meth:`update` method. If None, use the :class:`SGD
            <numpy_ml.neural_nets.optimizers.SGD>` optimizer with
            default parameters. Default is None.

        Attributes
        ----------
        X : list
            Running list of inputs to the :meth:`forward <numpy_ml.neural_nets.LayerBase.forward>` method since the last call to :meth:`update <numpy_ml.neural_nets.LayerBase.update>`. Only updated if the `retain_derived` argument was set to True.
        gradients : dict
            Dictionary of loss gradients with regard to the layer parameters
        parameters : dict
            Dictionary of layer parameters
        hyperparameters : dict
            Dictionary of layer hyperparameters
        derived_variables : dict
            Dictionary of any intermediate values computed during
            forward/backward propagation.
        """  # noqa: E501
        super().__init__(optimizer)

        self.n_out = n_out
        self.n_in = None  # inferred from the first input seen by `forward`
        self.init = init
        self.act_fn = ActivationInitializer(act_fn)()
        self.is_initialized = False
        self.parameters = {"W": None, "b": None}

    def _init_params(self):
        """Allocate `W`, `b`, their gradient buffers, and the `Z` cache."""
        make_weights = WeightInitializer(str(self.act_fn), mode=self.init)

        weights = make_weights((self.n_in, self.n_out))
        bias = np.zeros((1, self.n_out))

        self.gradients = {"W": np.zeros_like(weights), "b": np.zeros_like(bias)}
        self.parameters = {"W": weights, "b": bias}
        self.derived_variables = {"Z": []}
        self.is_initialized = True

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        summary = {
            "layer": "FullyConnected",
            "init": self.init,
            "n_in": self.n_in,
            "n_out": self.n_out,
            "act_fn": str(self.act_fn),
        }
        summary["optimizer"] = {
            "cache": self.optimizer.cache,
            "hyperparameters": self.optimizer.hyperparameters,
        }
        return summary

    def forward(self, X, retain_derived=True):
        """
        Compute the layer output on a single minibatch.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)`
            Layer input: `n_in`-dimensional features for each of the `n_ex`
            examples in the minibatch.
        retain_derived : bool
            If True, store `X` and the pre-activation `Z` so they are
            available during backprop. Pass False when the layer will not be
            asked to backprop through this input. Default is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_out)`
            Layer output for each of the `n_ex` examples.
        """
        if not self.is_initialized:
            # lazily size the parameters from the first minibatch
            self.n_in = X.shape[1]
            self._init_params()

        out, pre_act = self._fwd(X)

        if retain_derived:
            self.X.append(X)
            self.derived_variables["Z"].append(pre_act)

        return out

    def _fwd(self, X):
        """Evaluate the affine map and activation; return `(Y, Z)`."""
        pre_act = X @ self.parameters["W"] + self.parameters["b"]
        return self.act_fn(pre_act), pre_act

    def backward(self, dLdy, retain_grads=True):
        """
        Backprop from layer outputs to inputs.

        Parameters
        ----------
        dLdy : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_out)` or list of arrays
            The gradient(s) of the loss wrt. the layer output(s). A single
            array is treated as a one-element list; gradients are paired, in
            order, with the inputs stored in ``self.X``.
        retain_grads : bool
            If True, accumulate the parameter gradients computed here into
            ``self.gradients`` so that they take part in the next parameter
            update. Default is True.

        Returns
        -------
        dLdX : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)` or list of arrays
            The gradient of the loss wrt. the layer input(s) `X`. A bare array
            is returned when exactly one input is stored in ``self.X``.
        """  # noqa: E501
        assert self.trainable, "Layer is frozen"
        upstream = dLdy if isinstance(dLdy, list) else [dLdy]

        inputs = self.X
        grads_in = []
        for grad_out, x in zip(upstream, inputs):
            grad_x, grad_w, grad_b = self._bwd(grad_out, x)
            grads_in.append(grad_x)

            if retain_grads:
                self.gradients["W"] += grad_w
                self.gradients["b"] += grad_b

        if len(inputs) == 1:
            return grads_in[0]
        return grads_in

    def _bwd(self, dLdy, X):
        """Compute the loss gradients wrt. `X`, `W`, and `b` for one input."""
        W, b = self.parameters["W"], self.parameters["b"]

        # chain rule through the activation; the pre-activation is recomputed
        # from X rather than read from the cache
        grad_pre = dLdy * self.act_fn.grad(X @ W + b)

        grad_b = grad_pre.sum(axis=0, keepdims=True)
        grad_W = X.T @ grad_pre
        grad_X = grad_pre @ W.T
        return grad_X, grad_W, grad_b

    def _bwd2(self, dLdy, X, dLdy_bwd):
        """Compute second derivatives / deriv. of loss wrt. dX, dW, and db"""
        W, b = self.parameters["W"], self.parameters["b"]

        act_d1 = self.act_fn.grad(X @ W + b)
        act_d2 = self.act_fn.grad2(X @ W + b)

        projected = dLdy @ W
        ddX = projected * act_d1
        ddW = dLdy.T @ (dLdy_bwd * act_d1)
        ddB = np.sum(projected * dLdy_bwd * act_d2, axis=0, keepdims=True)
        return ddX, ddW, ddB


class Softmax(LayerBase):
    def __init__(self, dim=-1, optimizer=None):
        r"""
        A softmax nonlinearity layer.

        Notes
        -----
        This is implemented as a layer rather than an activation primarily
        because it requires retaining the layer input in order to compute the
        softmax gradients properly. In other words, in contrast to other
        simple activations, the softmax function and its gradient are not
        computed elementwise, and thus are more easily expressed as a layer.

        The softmax function computes:

        .. math::

            y_i = \frac{e^{x_i}}{\sum_j e^{x_j}}

        where :math:`x_i` is the `i` th element of input example **x** along
        the axis `dim`.

        Parameters
        ----------
        dim: int
            The dimension in `X` along which the softmax will be computed.
            Default is -1.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method.  If None, use the :class:`SGD
            <numpy_ml.neural_nets.optimizers.SGD>` optimizer with
            default parameters. Default is None. Unused for this layer.

        Attributes
        ----------
        X : list
            Running list of inputs to the :meth:`forward <numpy_ml.neural_nets.LayerBase.forward>` method since the last call to :meth:`update <numpy_ml.neural_nets.LayerBase.update>`. Only updated if the `retain_derived` argument was set to True.
        gradients : dict
            Dictionary of loss gradients with regard to the layer parameters
        parameters : dict
            Dictionary of layer parameters
        hyperparameters : dict
            Dictionary of layer hyperparameters
        derived_variables : dict
            Dictionary of any intermediate values computed during
            forward/backward propagation.
        """  # noqa: E501
        super().__init__(optimizer)

        self.dim = dim
        self.n_in = None
        self.is_initialized = False

    def _init_params(self):
        """Create the (empty) containers; the layer has no trainable weights."""
        self.gradients = {}
        self.parameters = {}
        self.derived_variables = {}
        self.is_initialized = True

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        return {
            "layer": "SoftmaxLayer",
            # `dim` determines the layer's behaviour, so it is reported
            # alongside the other hyperparameters
            "dim": self.dim,
            "n_in": self.n_in,
            "n_out": self.n_in,
            "optimizer": {
                "cache": self.optimizer.cache,
                "hyperparameters": self.optimizer.hyperparameters,
            },
        }

    def forward(self, X, retain_derived=True):
        """
        Compute the layer output on a single minibatch.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)`
            Layer input, representing the `n_in`-dimensional features for a
            minibatch of `n_ex` examples.
        retain_derived : bool
            Whether to retain the variables calculated during the forward pass
            for use later during backprop. If False, this suggests the layer
            will not be expected to backprop through wrt. this input. Default
            is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_out)`
            Layer output for each of the `n_ex` examples.
        """
        if not self.is_initialized:
            self.n_in = X.shape[1]
            self._init_params()

        Y = self._fwd(X)

        if retain_derived:
            self.X.append(X)

        return Y

    def _fwd(self, X):
        """Actual computation of softmax forward pass"""
        # subtract the max along `dim` before exponentiating to avoid overflow;
        # this does not change the result of the softmax
        e_X = np.exp(X - np.max(X, axis=self.dim, keepdims=True))
        return e_X / e_X.sum(axis=self.dim, keepdims=True)

    def backward(self, dLdy, retain_grads=True):
        """
        Backprop from layer outputs to inputs.

        Parameters
        ----------
        dLdy : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_out)` or list of arrays
            The gradient(s) of the loss wrt. the layer output(s). A single
            array is treated as a one-element list; gradients are paired, in
            order, with the inputs stored in ``self.X``.
        retain_grads : bool
            Whether to include the intermediate parameter gradients computed
            during the backward pass in the final parameter update. Default is
            True. Has no effect here, as the layer has no parameters.

        Returns
        -------
        dLdX : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)` or list of arrays
            The gradient of the loss wrt. the layer input(s) `X`. A bare array
            is returned when exactly one input is stored in ``self.X``.
        """  # noqa: E501
        assert self.trainable, "Layer is frozen"
        if not isinstance(dLdy, list):
            dLdy = [dLdy]

        dX = []
        X = self.X
        for dy, x in zip(dLdy, X):
            dx = self._bwd(dy, x)
            dX.append(dx)

        return dX[0] if len(X) == 1 else dX

    def _bwd(self, dLdy, X):
        """
        Actual computation of the gradient of the loss wrt. the input X.

        The Jacobian, J, of the softmax y = softmax(x) along `dim` is::

            J[i, j] = y_i * (1 - y_j)   if i = j
                      -y_i * y_j        if i != j

        so the vector-Jacobian product needed for backprop has the closed
        form::

            dLdX = y * (dLdy - sum(dLdy * y, axis=dim))

        Using this identity avoids materialising an `n x n` Jacobian for every
        example and differentiates along the same axis (``self.dim``) that the
        forward pass normalises over.
        """
        Y = self._fwd(X)
        weighted = np.sum(dLdy * Y, axis=self.dim, keepdims=True)
        return Y * (dLdy - weighted)


class SparseEvolution(LayerBase):
    def __init__(
        self,
        n_out,
        zeta=0.3,
        epsilon=20,
        act_fn=None,
        init="glorot_uniform",
        optimizer=None,
    ):
        r"""
        A sparse Erdos-Renyi layer with evolutionary rewiring via the sparse
        evolutionary training (SET) algorithm.

        Notes
        -----
        .. math::

            Y = f( \mathbf{X} (\mathbf{W} \odot \mathbf{W}_{mask}) + \mathbf{b} )

        where :math:`\odot` is the elementwise multiplication operation, `f` is
        the layer activation function, and :math:`\mathbf{W}_{mask}` is an
        evolved binary mask. The mask is stored in ``parameters["W_mask"]``
        and is re-evolved after every call to :meth:`update`.

        Parameters
        ----------
        n_out : int
            The dimensionality of the layer output
        zeta : float
            Proportion of the positive and negative weights closest to zero to
            drop after each training update. Default is 0.3.
        epsilon : float
            Layer sparsity parameter. Each connection is initially kept with
            probability ``epsilon * (n_in + n_out) / (n_in * n_out)`` (capped
            at 1). Default is 20.
        act_fn : str, :doc:`Activation <numpy_ml.neural_nets.activations>` object, or None
            The element-wise output nonlinearity used in computing `Y`. If None,
            use the identity function :math:`f(X) = X`. Default is None.
        init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}
            The weight initialization strategy. Default is `'glorot_uniform'`.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method.  If None, use the :class:`SGD
            <numpy_ml.neural_nets.optimizers.SGD>` optimizer with default
            parameters. Default is None.

        Attributes
        ----------
        X : list
            Running list of inputs to the :meth:`forward <numpy_ml.neural_nets.LayerBase.forward>` method since the last call to :meth:`update <numpy_ml.neural_nets.LayerBase.update>`. Only updated if the `retain_derived` argument was set to True.
        gradients : dict
            Dictionary of loss gradients with regard to the layer parameters
        parameters : dict
            Dictionary of layer parameters
        hyperparameters : dict
            Dictionary of layer hyperparameters
        derived_variables : dict
            Dictionary of any intermediate values computed during
            forward/backward propagation.
        """  # noqa: E501
        super().__init__(optimizer)

        self.init = init
        self.n_in = None
        self.zeta = zeta
        self.n_out = n_out
        self.epsilon = epsilon
        self.act_fn = ActivationInitializer(act_fn)()
        self.parameters = {"W": None, "b": None}
        self.is_initialized = False

    def _init_params(self):
        """Allocate `W`, `b`, and sample the initial binary connection mask."""
        init_weights = WeightInitializer(str(self.act_fn), mode=self.init)

        b = np.zeros((1, self.n_out))
        W = init_weights((self.n_in, self.n_out))

        # convert a fully connected base layer into a sparse layer
        n_in, n_out = W.shape
        # for small layers the raw ratio can exceed 1, which is not a valid
        # Bernoulli probability, so cap it
        p = min(1.0, (self.epsilon * (n_in + n_out)) / (n_in * n_out))
        mask = np.random.binomial(1, p, size=W.shape)

        self.derived_variables = {"Z": []}
        self.parameters = {"W": W, "b": b, "W_mask": mask}
        self.gradients = {"W": np.zeros_like(W), "b": np.zeros_like(b)}
        self.is_initialized = True

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        return {
            "layer": "SparseEvolutionary",
            "init": self.init,
            "zeta": self.zeta,
            "n_in": self.n_in,
            "n_out": self.n_out,
            "epsilon": self.epsilon,
            "act_fn": str(self.act_fn),
            "optimizer": {
                "cache": self.optimizer.cache,
                "hyperparameters": self.optimizer.hyperparameters,
            },
        }

    def forward(self, X, retain_derived=True):
        """
        Compute the layer output on a single minibatch.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)`
            Layer input, representing the `n_in`-dimensional features for a
            minibatch of `n_ex` examples.
        retain_derived : bool
            Whether to retain the variables calculated during the forward pass
            for use later during backprop. If False, this suggests the layer
            will not be expected to backprop through wrt. this input. Default
            is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_out)`
            Layer output for each of the `n_ex` examples.
        """
        if not self.is_initialized:
            self.n_in = X.shape[1]
            self._init_params()

        Y, Z = self._fwd(X)

        if retain_derived:
            self.X.append(X)
            self.derived_variables["Z"].append(Z)

        return Y

    def _fwd(self, X):
        """Actual computation of forward pass"""
        W = self.parameters["W"]
        b = self.parameters["b"]
        W_mask = self.parameters["W_mask"]

        # only connections with a nonzero mask entry contribute to the output
        Z = X @ (W * W_mask) + b
        Y = self.act_fn(Z)
        return Y, Z

    def backward(self, dLdy, retain_grads=True):
        """
        Backprop from layer outputs to inputs

        Parameters
        ----------
        dLdy : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_out)` or list of arrays
            The gradient(s) of the loss wrt. the layer output(s). A single
            array is treated as a one-element list; gradients are paired, in
            order, with the inputs stored in ``self.X``.
        retain_grads : bool
            Whether to include the intermediate parameter gradients computed
            during the backward pass in the final parameter update. Default is
            True.

        Returns
        -------
        dLdX : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)` or list of arrays
            The gradient of the loss wrt. the layer input(s) `X`. A bare array
            is returned when exactly one input is stored in ``self.X``.
        """  # noqa: E501
        assert self.trainable, "Layer is frozen"
        if not isinstance(dLdy, list):
            dLdy = [dLdy]

        dX = []
        X = self.X
        for dy, x in zip(dLdy, X):
            dx, dw, db = self._bwd(dy, x)
            dX.append(dx)

            if retain_grads:
                self.gradients["W"] += dw
                self.gradients["b"] += db

        return dX[0] if len(X) == 1 else dX

    def _bwd(self, dLdy, X):
        """Actual computation of gradient of the loss wrt. X, W, and b"""
        W = self.parameters["W"]
        b = self.parameters["b"]
        W_mask = self.parameters["W_mask"]
        W_sparse = W * W_mask

        Z = X @ W_sparse + b
        dZ = dLdy * self.act_fn.grad(Z)

        dX = dZ @ W_sparse.T
        # the forward pass only sees `W * W_mask`, so the loss has zero
        # gradient wrt. every masked-out entry of W
        dW = (X.T @ dZ) * W_mask
        dB = dZ.sum(axis=0, keepdims=True)
        return dX, dW, dB

    def _bwd2(self, dLdy, X, dLdy_bwd):
        """Compute second derivatives / deriv. of loss wrt. dX, dW, and db"""
        W = self.parameters["W"]
        b = self.parameters["b"]
        W_sparse = W * self.parameters["W_mask"]

        dZ = self.act_fn.grad(X @ W_sparse + b)
        ddZ = self.act_fn.grad2(X @ W_sparse + b)

        # use the effective (masked) weights throughout, consistent with the
        # forward pass and with `ddB` below
        ddX = dLdy @ W_sparse * dZ
        ddW = dLdy.T @ (dLdy_bwd * dZ)
        ddB = np.sum(dLdy @ W_sparse * dLdy_bwd * ddZ, axis=0, keepdims=True)
        return ddX, ddW, ddB

    def update(self):
        """
        Update parameters using current gradients and evolve network
        connections via SET.
        """
        assert self.trainable, "Layer is frozen"
        for k, v in self.gradients.items():
            if k in self.parameters:
                self.parameters[k] = self.optimizer(self.parameters[k], v, k)
        self.flush_gradients()
        self._evolve_connections()

    def _evolve_connections(self):
        """
        Rewire the layer: prune the weakest active connections and activate an
        equal number of randomly chosen inactive ones.

        A fraction `zeta` of the active positive weights closest to zero and a
        fraction `zeta` of the active negative weights closest to zero are
        masked out. The same number of connections is then switched on at
        positions drawn uniformly at random from all currently inactive
        positions (which include the ones just pruned). The result is written
        back to ``parameters["W_mask"]``; the values in `W` are left untouched.
        """
        assert self.trainable, "Layer is frozen"
        W = self.parameters["W"]
        W_mask = self.parameters["W_mask"]
        W_flat = (W * W_mask).reshape(-1)

        # start from the current connectivity so inactive connections stay
        # inactive unless they are explicitly regrown below
        new_mask = np.array(W_mask).reshape(-1).copy()

        (p_ix,) = np.where(W_flat > 0)
        (n_ix,) = np.where(W_flat < 0)

        # number of positive / negative connections to drop
        k_p = int(len(p_ix) * self.zeta)
        k_n = int(len(n_ix) * self.zeta)

        # remove the k largest negative and k smallest positive weights, ie.,
        # those closest to zero. The explicit start index avoids the `[-0:]`
        # pitfall, which would select *every* negative weight when k_n == 0.
        k_smallest_p = p_ix[np.argsort(W_flat[p_ix])][:k_p]
        k_largest_n = n_ix[np.argsort(W_flat[n_ix])][len(n_ix) - k_n :]
        n_rewired = len(k_smallest_p) + len(k_largest_n)

        new_mask[k_largest_n] = 0
        new_mask[k_smallest_p] = 0

        # resample new connections and update mask
        (zero_ixs,) = np.where(new_mask == 0)
        regrown = np.random.permutation(zero_ixs)[:n_rewired]
        new_mask[regrown] = 1

        self.parameters["W_mask"] = new_mask.reshape(*W.shape)


#######################################################################
#                        Convolutional Layers                         #
#######################################################################


class Conv1D(LayerBase):
    def __init__(
        self,
        out_ch,
        kernel_width,
        pad=0,
        stride=1,
        dilation=0,
        act_fn=None,
        init="glorot_uniform",
        optimizer=None,
    ):
        """
        Apply a one-dimensional convolution kernel over an input volume.

        Notes
        -----
        Equations::

            out = act_fn(pad(X) * W + b)
            out_dim = floor(1 + (n_rows_in + pad_left + pad_right - kernel_width) / stride)

        where '`*`' denotes the cross-correlation operation with stride `s` and dilation `d`.

        The number of input channels is not passed to the constructor; it is
        read from the last axis of the first input given to :meth:`forward`,
        at which point the layer parameters are allocated.

        Parameters
        ----------
        out_ch : int
            The number of filters/kernels to compute in the current layer
        kernel_width : int
            The width of a single 1D filter/kernel in the current layer
        act_fn : str, :doc:`Activation <numpy_ml.neural_nets.activations>` object, or None
            The activation function for computing ``Y[t]``. If None, use the
            identity function :math:`f(x) = x` by default. Default is None.
        pad : int, tuple, or {'same', 'causal'}
            The number of rows/columns to zero-pad the input with. If `'same'`,
            calculate padding to ensure the output length matches in the input
            length. If `'causal'` compute padding such that the output both has
            the same length as the input AND ``output[t]`` does not depend on
            ``input[t + 1:]``. Default is 0.
        stride : int
            The stride/hop of the convolution kernels as they move over the
            input volume. Default is 1.
        dilation : int
            Number of elements inserted between kernel elements. Effective
            kernel width after dilation is: ``kernel_width * (d + 1) - d``.
            Default is 0.
        init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}
            The weight initialization strategy. Default is `'glorot_uniform'`.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method.  If None, use the :class:`SGD
            <numpy_ml.neural_nets.optimizers.SGD>` optimizer with
            default parameters. Default is None.

        Attributes
        ----------
        X : list
            Running list of inputs to the :meth:`forward <numpy_ml.neural_nets.LayerBase.forward>` method since the last call to :meth:`update <numpy_ml.neural_nets.LayerBase.update>`. Only updated if the `retain_derived` argument was set to True.
        gradients : dict
            Dictionary of loss gradients with regard to the layer parameters
        parameters : dict
            Dictionary of layer parameters
        hyperparameters : dict
            Dictionary of layer hyperparameters
        derived_variables : dict
            Dictionary of any intermediate values computed during
            forward/backward propagation.
        """  # noqa: E501
        super().__init__(optimizer)

        self.pad = pad
        self.init = init
        self.in_ch = None
        self.out_ch = out_ch
        self.stride = stride
        self.dilation = dilation
        self.kernel_width = kernel_width
        self.act_fn = ActivationInitializer(act_fn)()
        self.parameters = {"W": None, "b": None}
        self.is_initialized = False

    def _init_params(self):
        """
        Allocate the kernel `W` of shape `(kernel_width, in_ch, out_ch)`, the
        bias `b` of shape `(1, 1, out_ch)`, zeroed gradient buffers of the same
        shapes, and empty caches for the derived variables.
        """
        init_weights = WeightInitializer(str(self.act_fn), mode=self.init)

        W = init_weights((self.kernel_width, self.in_ch, self.out_ch))
        b = np.zeros((1, 1, self.out_ch))

        self.parameters = {"W": W, "b": b}
        self.gradients = {"W": np.zeros_like(W), "b": np.zeros_like(b)}
        self.derived_variables = {"Z": [], "out_rows": [], "out_cols": []}
        self.is_initialized = True

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        return {
            "layer": "Conv1D",
            "pad": self.pad,
            "init": self.init,
            "in_ch": self.in_ch,
            "out_ch": self.out_ch,
            "stride": self.stride,
            "dilation": self.dilation,
            "act_fn": str(self.act_fn),
            "kernel_width": self.kernel_width,
            "optimizer": {
                "cache": self.optimizer.cache,
                "hyperparameters": self.optimizer.hyperparameters,
            },
        }

    def forward(self, X, retain_derived=True):
        """
        Compute the layer output given input volume `X`.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, l_in, in_ch)`
            The input volume consisting of `n_ex` examples, each of length
            `l_in` and with `in_ch` input channels
        retain_derived : bool
            Whether to retain the variables calculated during the forward pass
            for use later during backprop. If False, this suggests the layer
            will not be expected to backprop through wrt. this input. Default
            is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, l_out, out_ch)`
            The layer output.
        """
        if not self.is_initialized:
            # the channel count is taken from the first input seen
            self.in_ch = X.shape[2]
            self._init_params()

        W = self.parameters["W"]
        b = self.parameters["b"]

        # the unpacked names are not used below, but the unpacking itself
        # requires `X` to have exactly three dimensions
        n_ex, l_in, in_ch = X.shape
        s, p, d = self.stride, self.pad, self.dilation

        # pad the input and perform the forward convolution
        Z = conv1D(X, W, s, p, d) + b
        Y = self.act_fn(Z)

        if retain_derived:
            self.X.append(X)
            self.derived_variables["Z"].append(Z)
            # "out_rows" / "out_cols" record axes 1 and 2 of the pre-activation
            self.derived_variables["out_rows"].append(Z.shape[1])
            self.derived_variables["out_cols"].append(Z.shape[2])

        return Y

    def backward(self, dLdy, retain_grads=True):
        """
        Compute the gradient of the loss with respect to the layer parameters.

        Notes
        -----
        Relies on :meth:`~numpy_ml.neural_nets.utils.im2col` and
        :meth:`~numpy_ml.neural_nets.utils.col2im` to vectorize the
        gradient calculation.  See the private method :meth:`_backward_naive`
        for a more straightforward implementation.

        Parameters
        ----------
        dLdy : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, l_out, out_ch)` or list of arrays
            The gradient(s) of the loss with respect to the layer output(s). A
            single array is treated as a one-element list; gradients are
            paired, in order, with the stored inputs and pre-activations.
        retain_grads : bool
            Whether to include the intermediate parameter gradients computed
            during the backward pass in the final parameter update. Default is
            True.

        Returns
        -------
        dX : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, l_in, in_ch)` or list of arrays
            The gradient of the loss with respect to the layer input volume. A
            bare array is returned when exactly one input is stored in
            ``self.X``.
        """  # noqa: E501
        assert self.trainable, "Layer is frozen"
        if not isinstance(dLdy, list):
            dLdy = [dLdy]

        X = self.X
        Z = self.derived_variables["Z"]

        dX = []
        for dy, x, z in zip(dLdy, X, Z):
            dx, dw, db = self._bwd(dy, x, z)
            dX.append(dx)

            if retain_grads:
                self.gradients["W"] += dw
                self.gradients["b"] += db

        return dX[0] if len(X) == 1 else dX

    def _bwd(self, dLdy, X, Z):
        """
        Actual computation of gradient of the loss wrt. X, W, and b

        The 1D problem is lifted to 2D by giving `X`, `W`, and the
        pre-activation gradient a singleton "row" axis, solved with the 2D
        `im2col` / `col2im` helpers, and the singleton axis is squeezed out of
        the results again before returning.
        """
        W = self.parameters["W"]

        # add a row dimension to X, W, and dZ to permit us to use im2col/col2im
        X2D = np.expand_dims(X, axis=1)
        W2D = np.expand_dims(W, axis=0)
        dLdZ = np.expand_dims(dLdy * self.act_fn.grad(Z), axis=1)

        d = self.dilation
        fr, fc, in_ch, out_ch = W2D.shape
        n_ex, l_out, out_ch = dLdy.shape
        # the lifted kernel has a single row and `kernel_width` columns
        fr, fc, s = 1, self.kernel_width, self.stride

        # use pad1D here in order to correctly handle self.pad = 'causal',
        # which isn't defined for pad2D
        _, p = pad1D(X, self.pad, self.kernel_width, s, d)
        # no padding on the first two entries (presumably the singleton row
        # axis -- confirm against im2col); the 1D padding goes in the last two
        p2D = (0, 0, p[0], p[1])

        # columnize W, X, and dLdy
        dLdZ_col = dLdZ.transpose(3, 1, 2, 0).reshape(out_ch, -1)
        W_col = W2D.transpose(3, 2, 0, 1).reshape(out_ch, -1).T
        X_col, _ = im2col(X2D, W2D.shape, p2D, s, d)

        # compute gradients via matrix multiplication and reshape
        dB = dLdZ_col.sum(axis=1).reshape(1, 1, -1)
        dW = (dLdZ_col @ X_col.T).reshape(out_ch, in_ch, fr, fc).transpose(2, 3, 1, 0)

        # reshape columnized dX back into the same format as the input volume
        dX_col = W_col @ dLdZ_col
        dX = col2im(dX_col, X2D.shape, W2D.shape, p2D, s, d).transpose(0, 2, 3, 1)

        # drop the singleton row axis that was added to X and W above
        return np.squeeze(dX, axis=1), np.squeeze(dW, axis=0), dB

    def _backward_naive(self, dLdy, retain_grads=True):
        """
        A slower (ie., non-vectorized) but more straightforward implementation
        of the gradient computations for a 1D conv layer.

        Parameters
        ----------
        dLdy : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, l_out, out_ch)` or list of arrays
            The gradient(s) of the loss with respect to the layer output(s).
        retain_grads : bool
            Whether to include the intermediate parameter gradients computed
            during the backward pass in the final parameter update. Default is
            True.

        Returns
        -------
        dX : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, l_in, in_ch)` or list of arrays
            The gradient of the loss with respect to the layer input volume. A
            bare array is returned when exactly one input is stored in
            ``self.X``.
        """  # noqa: E501
        assert self.trainable, "Layer is frozen"
        if not isinstance(dLdy, list):
            dLdy = [dLdy]

        W = self.parameters["W"]
        b = self.parameters["b"]
        Zs = self.derived_variables["Z"]

        Xs, d = self.X, self.dilation
        fw, s, p = self.kernel_width, self.stride, self.pad

        dXs = []
        for X, Z, dy in zip(Xs, Zs, dLdy):
            n_ex, l_out, out_ch = dy.shape
            X_pad, (pr1, pr2) = pad1D(X, p, self.kernel_width, s, d)

            # gradients are accumulated on the padded volume and cropped back
            # to the unpadded extent at the end of the iteration
            dX = np.zeros_like(X_pad)
            dZ = dy * self.act_fn.grad(Z)

            dW, dB = np.zeros_like(W), np.zeros_like(b)
            for m in range(n_ex):
                for i in range(l_out):
                    for c in range(out_ch):
                        # compute window boundaries w. stride and dilation
                        i0, i1 = i * s, (i * s) + fw * (d + 1) - d

                        wc = W[:, :, c]
                        kernel = dZ[m, i, c]
                        # step of (d + 1) picks out the dilated kernel taps
                        window = X_pad[m, i0 : i1 : (d + 1), :]

                        dB[:, :, c] += kernel
                        dW[:, :, c] += window * kernel
                        dX[m, i0 : i1 : (d + 1), :] += wc * kernel

            if retain_grads:
                self.gradients["W"] += dW
                self.gradients["b"] += dB

            # a right pad of 0 must become `None`: slicing with `-0` would
            # yield an empty array
            pr2 = None if pr2 == 0 else -pr2
            dXs.append(dX[:, pr1:pr2, :])
        return dXs[0] if len(Xs) == 1 else dXs


class Conv2D(LayerBase):
    def __init__(
        self,
        out_ch,
        kernel_shape,
        pad=0,
        stride=1,
        dilation=0,
        act_fn=None,
        optimizer=None,
        init="glorot_uniform",
    ):
        """
        Apply a two-dimensional convolution kernel over an input volume.

        Notes
        -----
        Equations::

            out = act_fn(pad(X) * W + b)
            n_rows_out = floor(1 + (n_rows_in + pad_left + pad_right - filter_rows) / stride)
            n_cols_out = floor(1 + (n_cols_in + pad_top + pad_bottom - filter_cols) / stride)

        where `'*'` denotes the cross-correlation operation with stride `s` and
        dilation `d`.

        Parameters
        ----------
        out_ch : int
            The number of filters/kernels to compute in the current layer
        kernel_shape : 2-tuple
            The dimension of a single 2D filter/kernel in the current layer
        act_fn : str, :doc:`Activation <numpy_ml.neural_nets.activations>` object, or None
            The activation function for computing ``Y[t]``. If None, use the
            identity function :math:`f(X) = X` by default. Default is None.
        pad : int, tuple, or 'same'
            The number of rows/columns to zero-pad the input with. Default is
            0.
        stride : int
            The stride/hop of the convolution kernels as they move over the
            input volume. Default is 1.
        dilation : int
            Number of pixels inserted between kernel elements. Effective kernel
            shape after dilation is: ``[kernel_rows * (d + 1) - d, kernel_cols
            * (d + 1) - d]``. Default is 0.
        init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}
            The weight initialization strategy. Default is `'glorot_uniform'`.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method.  If None, use the :class:`SGD
            <numpy_ml.neural_nets.optimizers.SGD>` optimizer with
            default parameters. Default is None.

        Attributes
        ----------
        X : list
            Running list of inputs to the :meth:`forward <numpy_ml.neural_nets.LayerBase.forward>` method since the last call to :meth:`update <numpy_ml.neural_nets.LayerBase.update>`. Only updated if the `retain_derived` argument was set to True.
        gradients : dict
            Dictionary of loss gradients with regard to the layer parameters
        parameters : dict
            Dictionary of layer parameters
        hyperparameters : dict
            Dictionary of layer hyperparameters
        derived_variables : dict
            Dictionary of any intermediate values computed during
            forward/backward propagation.
        """  # noqa: E501
        super().__init__(optimizer)

        self.pad = pad
        self.init = init
        # the number of input channels is inferred from the first call to
        # `forward`
        self.in_ch = None
        self.out_ch = out_ch
        self.stride = stride
        self.dilation = dilation
        self.kernel_shape = kernel_shape
        self.act_fn = ActivationInitializer(act_fn)()
        self.parameters = {"W": None, "b": None}
        self.is_initialized = False

    def _init_params(self):
        """
        Allocate the kernel weights `W` of shape `(fr, fc, in_ch, out_ch)`,
        the zero-valued bias `b` of shape `(1, 1, 1, out_ch)`, zeroed gradient
        buffers, and empty containers for the derived variables.
        """
        init_weights = WeightInitializer(str(self.act_fn), mode=self.init)

        fr, fc = self.kernel_shape
        W = init_weights((fr, fc, self.in_ch, self.out_ch))
        b = np.zeros((1, 1, 1, self.out_ch))

        self.parameters = {"W": W, "b": b}
        self.gradients = {"W": np.zeros_like(W), "b": np.zeros_like(b)}
        self.derived_variables = {"Z": [], "out_rows": [], "out_cols": []}
        self.is_initialized = True

    @property
    def hyperparameters(self):
        """A dictionary containing the layer hyperparameters."""
        return {
            "layer": "Conv2D",
            "pad": self.pad,
            "init": self.init,
            "in_ch": self.in_ch,
            "out_ch": self.out_ch,
            "stride": self.stride,
            "dilation": self.dilation,
            "act_fn": str(self.act_fn),
            "kernel_shape": self.kernel_shape,
            "optimizer": {
                "cache": self.optimizer.cache,
                "hyperparameters": self.optimizer.hyperparameters,
            },
        }

    def forward(self, X, retain_derived=True):
        """
        Compute the layer output given input volume `X`.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
            The input volume consisting of `n_ex` examples, each with dimension
            (`in_rows`, `in_cols`, `in_ch`).
        retain_derived : bool
            Whether to retain the variables calculated during the forward pass
            for use later during backprop. If False, this suggests the layer
            will not be expected to backprop through wrt. this input. Default
            is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, out_rows, out_cols, out_ch)`
            The layer output.
        """  # noqa: E501
        if not self.is_initialized:
            # lazily size the kernels from the channel axis of the first input
            self.in_ch = X.shape[3]
            self._init_params()

        W = self.parameters["W"]
        b = self.parameters["b"]

        # unpacking doubles as a check that `X` is a 4D volume
        n_ex, in_rows, in_cols, in_ch = X.shape
        s, p, d = self.stride, self.pad, self.dilation

        # pad the input and perform the forward convolution
        Z = conv2D(X, W, s, p, d) + b
        Y = self.act_fn(Z)

        if retain_derived:
            self.X.append(X)
            self.derived_variables["Z"].append(Z)
            self.derived_variables["out_rows"].append(Z.shape[1])
            self.derived_variables["out_cols"].append(Z.shape[2])

        return Y

    def backward(self, dLdy, retain_grads=True):
        """
        Compute the gradient of the loss with respect to the layer parameters.

        Notes
        -----
        Relies on :meth:`~numpy_ml.neural_nets.utils.im2col` and
        :meth:`~numpy_ml.neural_nets.utils.col2im` to vectorize the
        gradient calculation.

        See the private method :meth:`_backward_naive` for a more straightforward
        implementation.

        Parameters
        ----------
        dLdy : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, out_rows, out_cols, out_ch)` or list of arrays
            The gradient(s) of the loss with respect to the layer output(s).
        retain_grads : bool
            Whether to include the intermediate parameter gradients computed
            during the backward pass in the final parameter update. Default is
            True.

        Returns
        -------
        dX : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
            The gradient of the loss with respect to the layer input volume.
            A list of such arrays is returned when more than one input has
            been retained.
        """  # noqa: E501
        assert self.trainable, "Layer is frozen"
        if not isinstance(dLdy, list):
            dLdy = [dLdy]

        dX = []
        X = self.X
        Z = self.derived_variables["Z"]

        # pair each output gradient with the input / pre-activation retained
        # by the corresponding forward call
        for dy, x, z in zip(dLdy, X, Z):
            dx, dw, db = self._bwd(dy, x, z)
            dX.append(dx)

            if retain_grads:
                self.gradients["W"] += dw
                self.gradients["b"] += db

        return dX[0] if len(X) == 1 else dX

    def _bwd(self, dLdy, X, Z):
        """
        Actual computation of gradient of the loss wrt. X, W, and b for a
        single retained input `X` with pre-activation `Z`.

        Returns
        -------
        dX, dW, dB : :py:class:`ndarrays <numpy.ndarray>`
            Gradients wrt. the layer input, the kernel weights, and the bias,
            respectively.
        """
        W = self.parameters["W"]

        d = self.dilation
        _, _, in_ch, _ = W.shape
        # requires a 4D gradient volume; the channel count is taken from it
        _, _, _, out_ch = dLdy.shape
        (fr, fc), s, p = self.kernel_shape, self.stride, self.pad

        # columnize W, X, and dLdy
        dLdZ = dLdy * self.act_fn.grad(Z)
        dLdZ_col = dLdZ.transpose(3, 1, 2, 0).reshape(out_ch, -1)
        W_col = W.transpose(3, 2, 0, 1).reshape(out_ch, -1).T
        # `im2col` also returns the padding it used; reuse it for `col2im`
        X_col, p = im2col(X, W.shape, p, s, d)

        # compute gradients via matrix multiplication and reshape
        dB = dLdZ_col.sum(axis=1).reshape(1, 1, 1, -1)
        dW = (dLdZ_col @ X_col.T).reshape(out_ch, in_ch, fr, fc).transpose(2, 3, 1, 0)

        # reshape columnized dX back into the same format as the input volume
        dX_col = W_col @ dLdZ_col
        dX = col2im(dX_col, X.shape, W.shape, p, s, d).transpose(0, 2, 3, 1)

        return dX, dW, dB

    def _backward_naive(self, dLdy, retain_grads=True):
        """
        A slower (ie., non-vectorized) but more straightforward implementation
        of the gradient computations for a 2D conv layer.

        Parameters
        ----------
        dLdy : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, out_rows, out_cols, out_ch)` or list of arrays
            The gradient(s) of the loss with respect to the layer output(s).
        retain_grads : bool
            Whether to include the intermediate parameter gradients computed
            during the backward pass in the final parameter update. Default is
            True.

        Returns
        -------
        dX : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
            The gradient of the loss with respect to the layer input volume.
            A list of such arrays is returned when more than one input has
            been retained.
        """  # noqa: E501
        assert self.trainable, "Layer is frozen"
        if not isinstance(dLdy, list):
            dLdy = [dLdy]

        W = self.parameters["W"]
        b = self.parameters["b"]
        Zs = self.derived_variables["Z"]

        Xs, d = self.X, self.dilation
        (fr, fc), s, p = self.kernel_shape, self.stride, self.pad

        dXs = []
        for X, Z, dy in zip(Xs, Zs, dLdy):
            n_ex, out_rows, out_cols, out_ch = dy.shape
            X_pad, (pr1, pr2, pc1, pc2) = pad2D(X, p, self.kernel_shape, s, d)

            # use the gradient for *this* input (`dy`); multiplying by the
            # enclosing list `dLdy` would add a spurious leading axis
            dZ = dy * self.act_fn.grad(Z)

            dX = np.zeros_like(X_pad)
            dW, dB = np.zeros_like(W), np.zeros_like(b)
            for m in range(n_ex):
                for i in range(out_rows):
                    # row window boundaries w. stride and dilation
                    i0, i1 = i * s, (i * s) + fr * (d + 1) - d
                    for j in range(out_cols):
                        # column window boundaries w. stride and dilation
                        j0, j1 = j * s, (j * s) + fc * (d + 1) - d
                        window = X_pad[m, i0 : i1 : (d + 1), j0 : j1 : (d + 1), :]

                        for c in range(out_ch):
                            wc = W[:, :, :, c]
                            kernel = dZ[m, i, j, c]

                            dB[:, :, :, c] += kernel
                            dW[:, :, :, c] += window * kernel
                            dX[m, i0 : i1 : (d + 1), j0 : j1 : (d + 1), :] += (
                                wc * kernel
                            )

            if retain_grads:
                self.gradients["W"] += dW
                self.gradients["b"] += dB

            # strip the padding; a trailing pad of 0 must become `None`
            # because a slice ending at `-0` would be empty
            pr2 = None if pr2 == 0 else -pr2
            pc2 = None if pc2 == 0 else -pc2
            dXs.append(dX[:, pr1:pr2, pc1:pc2, :])
        return dXs[0] if len(Xs) == 1 else dXs


class Pool2D(LayerBase):
    def __init__(self, kernel_shape, stride=1, pad=0, mode="max", optimizer=None):
        """
        A single two-dimensional pooling layer.

        Parameters
        ----------
        kernel_shape : 2-tuple
            The dimension of a single 2D filter/kernel in the current layer
        stride : int
            The stride/hop of the convolution kernels as they move over the
            input volume. Default is 1.
        pad : int, tuple, or 'same'
            The number of rows/columns of 0's to pad the input. Default is 0.
        mode : {"max", "average"}
            The pooling function to apply.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method.  If None, use the :class:`SGD
            <numpy_ml.neural_nets.optimizers.SGD>` optimizer with
            default parameters. Default is None.
        """  # noqa: E501
        super().__init__(optimizer)

        self.pad = pad
        self.mode = mode
        # channel counts are inferred from the first call to `forward`
        self.in_ch = None
        self.out_ch = None
        self.stride = stride
        self.kernel_shape = kernel_shape
        self.is_initialized = False

    def _init_params(self):
        """Create the containers that record the output size of each forward call."""
        self.derived_variables = {"out_rows": [], "out_cols": []}
        self.is_initialized = True

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        return {
            "layer": "Pool2D",
            "act_fn": None,
            "pad": self.pad,
            "mode": self.mode,
            "in_ch": self.in_ch,
            "out_ch": self.out_ch,
            "stride": self.stride,
            "kernel_shape": self.kernel_shape,
            "optimizer": {
                "cache": self.optimizer.cache,
                "hyperparameters": self.optimizer.hyperparameters,
            },
        }

    def forward(self, X, retain_derived=True):
        """
        Compute the layer output given input volume `X`.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
            The input volume consisting of `n_ex` examples, each with dimension
            (`in_rows`,`in_cols`, `in_ch`)
        retain_derived : bool
            Whether to retain the variables calculated during the forward pass
            for use later during backprop. If False, this suggests the layer
            will not be expected to backprop through wrt. this input. Default
            is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, out_rows, out_cols, out_ch)`
            The layer output.

        Raises
        ------
        ValueError
            If the layer's `mode` is neither ``"max"`` nor ``"average"``.
        """  # noqa: E501
        if not self.is_initialized:
            # pooling acts on each channel independently, so in_ch == out_ch
            self.in_ch = self.out_ch = X.shape[3]
            self._init_params()

        n_ex, in_rows, in_cols, nc_in = X.shape
        (fr, fc), s, p = self.kernel_shape, self.stride, self.pad
        X_pad, (pr1, pr2, pc1, pc2) = pad2D(X, p, self.kernel_shape, s)

        out_rows = np.floor(1 + (in_rows + pr1 + pr2 - fr) / s).astype(int)
        out_cols = np.floor(1 + (in_cols + pc1 + pc2 - fc) / s).astype(int)

        if self.mode == "max":
            pool_fn = np.max
        elif self.mode == "average":
            pool_fn = np.mean
        else:
            # fail with a clear message instead of an unbound-local error
            raise ValueError("Unrecognized pooling mode: {}".format(self.mode))

        Y = np.zeros((n_ex, out_rows, out_cols, self.out_ch))
        for m in range(n_ex):
            for i in range(out_rows):
                # row window boundaries, incorporating stride
                i0, i1 = i * s, (i * s) + fr
                for j in range(out_cols):
                    # column window boundaries, incorporating stride
                    j0, j1 = j * s, (j * s) + fc
                    for c in range(self.out_ch):
                        xi = X_pad[m, i0:i1, j0:j1, c]
                        Y[m, i, j, c] = pool_fn(xi)

        if retain_derived:
            self.X.append(X)
            self.derived_variables["out_rows"].append(out_rows)
            self.derived_variables["out_cols"].append(out_cols)

        return Y

    def backward(self, dLdY, retain_grads=True):
        """
        Backprop from layer outputs to inputs

        Parameters
        ----------
        dLdY : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, out_rows, out_cols, out_ch)` or list of arrays
            The gradient(s) of the loss wrt. the layer output(s) `Y`.
        retain_grads : bool
            Whether to include the intermediate parameter gradients computed
            during the backward pass in the final parameter update. Default is
            True. Unused here: the layer has no trainable parameters.

        Returns
        -------
        dX : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
            The gradient of the loss wrt. the layer input `X`. A list of such
            arrays is returned when more than one input has been retained.
        """  # noqa: E501
        assert self.trainable, "Layer is frozen"
        if not isinstance(dLdY, list):
            dLdY = [dLdY]

        Xs = self.X
        out_rows = self.derived_variables["out_rows"]
        out_cols = self.derived_variables["out_cols"]

        (fr, fc), s, p = self.kernel_shape, self.stride, self.pad

        dXs = []
        for X, dy, out_row, out_col in zip(Xs, dLdY, out_rows, out_cols):
            n_ex, in_rows, in_cols, nc_in = X.shape
            X_pad, (pr1, pr2, pc1, pc2) = pad2D(X, p, self.kernel_shape, s)

            dX = np.zeros_like(X_pad)
            for m in range(n_ex):
                for i in range(out_row):
                    # row window boundaries, incorporating stride
                    i0, i1 = i * s, (i * s) + fr
                    for j in range(out_col):
                        # column window boundaries, incorporating stride
                        j0, j1 = j * s, (j * s) + fc
                        for c in range(self.out_ch):
                            if self.mode == "max":
                                # the window must come from the *padded*
                                # input: the forward pass pooled over `X_pad`
                                # and (i0, j0) index into padded coordinates
                                xi = X_pad[m, i0:i1, j0:j1, c]

                                # route the gradient to a single entry (the
                                # first in row-major order), even if multiple
                                # entries in xi are equal to max(xi)
                                x, y = np.argwhere(xi == np.max(xi))[0]
                                dX[m, i0 + x, j0 + y, c] += dy[m, i, j, c]
                            elif self.mode == "average":
                                # spread the gradient evenly over the window
                                frame = np.ones((fr, fc)) * dy[m, i, j, c]
                                dX[m, i0:i1, j0:j1, c] += frame / np.prod((fr, fc))

            # strip the padding; a trailing pad of 0 must become `None`
            # because a slice ending at `-0` would be empty
            pr2 = None if pr2 == 0 else -pr2
            pc2 = None if pc2 == 0 else -pc2
            dXs.append(dX[:, pr1:pr2, pc1:pc2, :])
        return dXs[0] if len(Xs) == 1 else dXs


class Deconv2D(LayerBase):
    def __init__(
        self,
        out_ch,
        kernel_shape,
        pad=0,
        stride=1,
        act_fn=None,
        optimizer=None,
        init="glorot_uniform",
    ):
        """
        Apply a two-dimensional "deconvolution" to an input volume.

        Notes
        -----
        The term "deconvolution" in this context does not correspond with the
        deconvolution operation in mathematics. More accurately, this layer is
        computing a transposed convolution / fractionally-strided convolution.

        Parameters
        ----------
        out_ch : int
            The number of filters/kernels to compute in the current layer
        kernel_shape : 2-tuple
            The dimension of a single 2D filter/kernel in the current layer
        act_fn : str, :doc:`Activation <numpy_ml.neural_nets.activations>` object, or None
            The activation function for computing ``Y[t]``. If None, use
            :class:`~numpy_ml.neural_nets.activations.Affine`
            activations by default. Default is None.
        pad : int, tuple, or 'same'
            The number of rows/columns to zero-pad the input with. Default is 0.
        stride : int
            The stride/hop of the convolution kernels as they move over the
            input volume. Default is 1.
        init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}
            The weight initialization strategy. Default is `'glorot_uniform'`.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method.  If None, use the :class:`SGD
            <numpy_ml.neural_nets.optimizers.SGD>` optimizer with
            default parameters. Default is None.
        """  # noqa: E501
        super().__init__(optimizer)

        self.pad = pad
        self.init = init
        # the number of input channels is inferred from the first call to
        # `forward`
        self.in_ch = None
        self.stride = stride
        self.out_ch = out_ch
        self.kernel_shape = kernel_shape
        self.act_fn = ActivationInitializer(act_fn)()
        self.parameters = {"W": None, "b": None}
        self.is_initialized = False

    def _init_params(self):
        """
        Allocate the kernel weights `W` of shape `(fr, fc, in_ch, out_ch)`,
        the zero-valued bias `b` of shape `(1, 1, 1, out_ch)`, zeroed gradient
        buffers, and empty containers for the derived variables.
        """
        init_weights = WeightInitializer(str(self.act_fn), mode=self.init)

        fr, fc = self.kernel_shape
        W = init_weights((fr, fc, self.in_ch, self.out_ch))
        b = np.zeros((1, 1, 1, self.out_ch))

        self.parameters = {"W": W, "b": b}
        self.gradients = {"W": np.zeros_like(W), "b": np.zeros_like(b)}
        self.derived_variables = {"Z": [], "out_rows": [], "out_cols": []}
        self.is_initialized = True

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        return {
            "layer": "Deconv2D",
            "pad": self.pad,
            "init": self.init,
            "in_ch": self.in_ch,
            "out_ch": self.out_ch,
            "stride": self.stride,
            "act_fn": str(self.act_fn),
            "kernel_shape": self.kernel_shape,
            "optimizer": {
                "cache": self.optimizer.cache,
                "hyperparameters": self.optimizer.hyperparameters,
            },
        }

    def forward(self, X, retain_derived=True):
        """
        Compute the layer output given input volume `X`.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
            The input volume consisting of `n_ex` examples, each with dimension
            (`in_rows`, `in_cols`, `in_ch`).
        retain_derived : bool
            Whether to retain the variables calculated during the forward pass
            for use later during backprop. If False, this suggests the layer
            will not be expected to backprop through wrt. this input. Default
            is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, out_rows, out_cols, out_ch)`
            The layer output.
        """  # noqa: E501
        if not self.is_initialized:
            # lazily size the kernels from the channel axis of the first input
            self.in_ch = X.shape[3]
            self._init_params()

        W = self.parameters["W"]
        b = self.parameters["b"]

        s, p = self.stride, self.pad
        # unpacking doubles as a check that `X` is a 4D volume; the unpacked
        # names are not used below
        n_ex, in_rows, in_cols, in_ch = X.shape

        # pad the input and perform the forward deconvolution
        # NOTE(review): the trailing 0 is presumably the dilation argument of
        # `deconv2D_naive` -- confirm against that helper's signature
        Z = deconv2D_naive(X, W, s, p, 0) + b
        Y = self.act_fn(Z)

        if retain_derived:
            self.X.append(X)
            self.derived_variables["Z"].append(Z)
            self.derived_variables["out_rows"].append(Z.shape[1])
            self.derived_variables["out_cols"].append(Z.shape[2])

        return Y

    def backward(self, dLdY, retain_grads=True):
        """
        Compute the gradient of the loss with respect to the layer parameters.

        Notes
        -----
        Relies on :meth:`~numpy_ml.neural_nets.utils.im2col` and
        :meth:`~numpy_ml.neural_nets.utils.col2im` to vectorize the
        gradient calculations.

        Parameters
        ----------
        dLdY : :py:class:`ndarray <numpy.ndarray>` of shape (`n_ex, out_rows, out_cols, out_ch`)
            The gradient of the loss with respect to the layer output. A list
            of such arrays (one per retained input) is also accepted.
        retain_grads : bool
            Whether to include the intermediate parameter gradients computed
            during the backward pass in the final parameter update. Default is
            True.

        Returns
        -------
        dX : :py:class:`ndarray <numpy.ndarray>` of shape (`n_ex, in_rows, in_cols, in_ch`)
            The gradient of the loss with respect to the layer input volume.
            A list of such arrays is returned when more than one input has
            been retained.
        """  # noqa: E501
        assert self.trainable, "Layer is frozen"
        if not isinstance(dLdY, list):
            dLdY = [dLdY]

        dX = []
        X, Z = self.X, self.derived_variables["Z"]

        # pair each output gradient with the input / pre-activation retained
        # by the corresponding forward call
        for dy, x, z in zip(dLdY, X, Z):
            dx, dw, db = self._bwd(dy, x, z)
            dX.append(dx)

            if retain_grads:
                self.gradients["W"] += dw
                self.gradients["b"] += db

        return dX[0] if len(X) == 1 else dX

    def _bwd(self, dLdY, X, Z):
        """
        Actual computation of gradient of the loss wrt. X, W, and b for a
        single retained input `X` with pre-activation `Z`.

        Returns
        -------
        dX, dW, dB : :py:class:`ndarrays <numpy.ndarray>`
            Gradients wrt. the layer input, the kernel weights, and the bias,
            respectively.
        """
        # rotate the kernels by 180 degrees in the plane of the first two
        # (row, column) axes
        W = np.rot90(self.parameters["W"], 2)

        s = self.stride
        if self.stride > 1:
            # NOTE(review): `dilate` presumably inserts `s - 1` zeros between
            # adjacent input entries so the remainder can proceed with unit
            # stride -- confirm against the helper
            X = dilate(X, s - 1)
            s = 1

        fr, fc, in_ch, out_ch = W.shape
        (fr, fc), p = self.kernel_shape, self.pad
        # only `out_ch` from this unpacking survives; `n_ex`, `out_rows` and
        # `out_cols` are all reassigned below
        n_ex, out_rows, out_cols, out_ch = dLdY.shape

        # pad X the first time
        # (`p` is rebound to the padding tuple returned by `pad2D`)
        X_pad, p = pad2D(X, p, W.shape[:2], s)
        n_ex, in_rows, in_cols, in_ch = X_pad.shape
        pr1, pr2, pc1, pc2 = p

        # compute additional padding to produce the deconvolution
        out_rows = s * (in_rows - 1) - pr1 - pr2 + fr
        out_cols = s * (in_cols - 1) - pc1 - pc2 + fc
        out_dim = (out_rows, out_cols)

        # add additional "deconvolution" padding
        _p = calc_pad_dims_2D(X_pad.shape, out_dim, W.shape[:2], s, 0)
        X_pad, _ = pad2D(X_pad, _p, W.shape[:2], s)

        # columnize W, X, and dLdY
        dLdZ = dLdY * self.act_fn.grad(Z)
        # pad the gradient with the same tuple `p` that was applied to X
        dLdZ, _ = pad2D(dLdZ, p, W.shape[:2], s)

        dLdZ_col = dLdZ.transpose(3, 1, 2, 0).reshape(out_ch, -1)
        W_col = W.transpose(3, 2, 0, 1).reshape(out_ch, -1)
        # X_pad is already fully padded, so no extra padding is requested here
        X_col, _ = im2col(X_pad, W.shape, 0, s, 0)

        # compute gradients via matrix multiplication and reshape
        dB = dLdZ_col.sum(axis=1).reshape(1, 1, 1, -1)
        dW = (dLdZ_col @ X_col.T).reshape(out_ch, in_ch, fr, fc).transpose(2, 3, 1, 0)
        # undo the 180 degree rotation applied to W at the top of the method
        dW = np.rot90(dW, 2)

        # reshape columnized dX back into the same format as the input volume
        dX_col = W_col.T @ dLdZ_col

        # combined (first + "deconvolution") padding, summed per side
        total_pad = tuple(i + j for i, j in zip(p, _p))
        dX = col2im(dX_col, X.shape, W.shape, total_pad, s, 0).transpose(0, 2, 3, 1)
        # keep every `stride`-th row/column, i.e. the positions of the
        # original (pre-dilation) input entries
        dX = dX[:, :: self.stride, :: self.stride, :]

        return dX, dW, dB


#######################################################################
#                          Recurrent Layers                           #
#######################################################################


class RNNCell(LayerBase):
    def __init__(self, n_out, act_fn="Tanh", init="glorot_uniform", optimizer=None):
        r"""
        A single step of a vanilla (Elman) RNN.

        Notes
        -----
        At timestep `t`, the vanilla RNN cell computes

        .. math::

            \mathbf{Z}^{(t)}  &=
                \mathbf{W}_{ax} \mathbf{X}^{(t)} + \mathbf{b}_{ax} +
                    \mathbf{W}_{aa} \mathbf{A}^{(t-1)} + \mathbf{b}_{aa} \\
            \mathbf{A}^{(t)}  &=  f(\mathbf{Z}^{(t)})

        where

        - :math:`\mathbf{X}^{(t)}` is the input at time `t`
        - :math:`\mathbf{A}^{(t)}` is the hidden state at timestep `t`
        - `f` is the layer activation function
        - :math:`\mathbf{W}_{ax}` and :math:`\mathbf{b}_{ax}` are the weights
          and bias for the input to hidden layer
        - :math:`\mathbf{W}_{aa}` and :math:`\mathbf{b}_{aa}` are the weights
          and biases for the hidden to hidden layer

        Parameters
        ----------
        n_out : int
            The dimension of a single hidden state / output on a given timestep
        act_fn : str, :doc:`Activation <numpy_ml.neural_nets.activations>` object, or None
            The activation function for computing ``A[t]``. Default is `'Tanh'`.
        init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}
            The weight initialization strategy. Default is `'glorot_uniform'`.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method.  If None, use the :class:`SGD
            <numpy_ml.neural_nets.optimizers.SGD>` optimizer with default
            parameters. Default is None.
        """  # noqa: E501
        super().__init__(optimizer)

        self.init = init
        # the input dimension is inferred from the first call to `forward`
        self.n_in = None
        self.n_out = n_out
        self.n_timesteps = None
        self.act_fn = ActivationInitializer(act_fn)()
        self.parameters = {"Waa": None, "Wax": None, "ba": None, "bx": None}
        self.is_initialized = False

    def _init_params(self):
        """
        Allocate the weights (`Wax`: `(n_in, n_out)`, `Waa`: `(n_out,
        n_out)`), the zero-valued biases (`ba`, `bx`: `(n_out, 1)`), zeroed
        gradient buffers, and the per-sequence bookkeeping variables.
        """
        self.X = []
        init_weights = WeightInitializer(str(self.act_fn), mode=self.init)

        Wax = init_weights((self.n_in, self.n_out))
        Waa = init_weights((self.n_out, self.n_out))
        ba = np.zeros((self.n_out, 1))
        bx = np.zeros((self.n_out, 1))

        self.parameters = {"Waa": Waa, "Wax": Wax, "ba": ba, "bx": bx}

        self.gradients = {
            "Waa": np.zeros_like(Waa),
            "Wax": np.zeros_like(Wax),
            "ba": np.zeros_like(ba),
            "bx": np.zeros_like(bx),
        }

        self.derived_variables = {
            "A": [],
            "Z": [],
            "n_timesteps": 0,
            "current_step": 0,
            "dLdA_accumulator": None,
        }

        self.is_initialized = True

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        return {
            "layer": "RNNCell",
            "init": self.init,
            "n_in": self.n_in,
            "n_out": self.n_out,
            "act_fn": str(self.act_fn),
            "optimizer": {
                "cache": self.optimizer.cache,
                "hyperparameters": self.optimizer.hyperparameters,
            },
        }

    def forward(self, Xt):
        """
        Compute the network output for a single timestep.

        Parameters
        ----------
        Xt : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)`
            Input at timestep `t` consisting of `n_ex` examples each of
            dimensionality `n_in`.

        Returns
        -------
        At: :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_out)`
            The value of the hidden state at timestep `t` for each of the
            `n_ex` examples.
        """
        if not self.is_initialized:
            self.n_in = Xt.shape[1]
            self._init_params()

        # increment timestep
        self.derived_variables["n_timesteps"] += 1
        self.derived_variables["current_step"] += 1

        # Retrieve parameters
        ba = self.parameters["ba"]
        bx = self.parameters["bx"]
        Wax = self.parameters["Wax"]
        Waa = self.parameters["Waa"]

        # initialize the hidden state to zero
        As = self.derived_variables["A"]
        if len(As) == 0:
            n_ex, n_in = Xt.shape
            A0 = np.zeros((n_ex, self.n_out))
            As.append(A0)

        # compute next hidden state
        Zt = As[-1] @ Waa + ba.T + Xt @ Wax + bx.T
        At = self.act_fn(Zt)

        self.derived_variables["Z"].append(Zt)
        self.derived_variables["A"].append(At)

        # store intermediate variables
        self.X.append(Xt)
        return At

    def backward(self, dLdAt):
        """
        Backprop for a single timestep.

        Notes
        -----
        Each call moves one step backwards through the retained sequence, so
        it should be invoked once per preceding :meth:`forward` call, starting
        from the last timestep.

        Parameters
        ----------
        dLdAt : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_out)`
            The gradient of the loss wrt. the layer outputs (ie., hidden
            states) at timestep `t`.

        Returns
        -------
        dLdXt : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)`
            The gradient of the loss wrt. the layer inputs at timestep `t`.
        """
        assert self.trainable, "Layer is frozen"

        #  decrement current step
        self.derived_variables["current_step"] -= 1

        # extract context variables
        Zs = self.derived_variables["Z"]
        As = self.derived_variables["A"]
        t = self.derived_variables["current_step"]
        dA_acc = self.derived_variables["dLdA_accumulator"]

        # initialize accumulator
        if dA_acc is None:
            dA_acc = np.zeros_like(As[0])

        # get network weights for gradient calcs
        Wax = self.parameters["Wax"]
        Waa = self.parameters["Waa"]

        # compute gradient components at timestep t
        dA = dLdAt + dA_acc
        dZ = self.act_fn.grad(Zs[t]) * dA
        dXt = dZ @ Wax.T

        # update parameter gradients with signal from current step.
        # `As` holds the initial state at index 0, so `As[t]` is the hidden
        # state that was fed *into* step t
        self.gradients["Waa"] += As[t].T @ dZ
        self.gradients["Wax"] += self.X[t].T @ dZ
        self.gradients["ba"] += dZ.sum(axis=0, keepdims=True).T
        self.gradients["bx"] += dZ.sum(axis=0, keepdims=True).T

        # update accumulator variable for hidden state
        self.derived_variables["dLdA_accumulator"] = dZ @ Waa.T
        return dXt

    def flush_gradients(self):
        """Erase all the layer's derived variables and gradients."""
        assert self.trainable, "Layer is frozen"

        self.X = []
        for k in self.derived_variables:
            self.derived_variables[k] = []

        self.derived_variables["n_timesteps"] = 0
        self.derived_variables["current_step"] = 0
        # the accumulator must go back to `None` (not `[]`): `backward` uses
        # `is None` to detect the first step of a new backward pass
        self.derived_variables["dLdA_accumulator"] = None

        # reset parameter gradients to 0
        for k, v in self.parameters.items():
            self.gradients[k] = np.zeros_like(v)


class LSTMCell(LayerBase):
    def __init__(
        self,
        n_out,
        act_fn="Tanh",
        gate_fn="Sigmoid",
        init="glorot_uniform",
        optimizer=None,
    ):
        """
        A single step of a long short-term memory (LSTM) RNN.

        Notes
        -----
        Notation:

        - ``Z[t]``  is the input to each of the gates at timestep `t`
        - ``A[t]``  is the value of the hidden state at timestep `t`
        - ``Cc[t]`` is the value of the *candidate* cell/memory state at timestep `t`
        - ``C[t]``  is the value of the *final* cell/memory state at timestep `t`
        - ``Gf[t]`` is the output of the forget gate at timestep `t`
        - ``Gu[t]`` is the output of the update gate at timestep `t`
        - ``Go[t]`` is the output of the output gate at timestep `t`

        Equations (rows index examples)::

            Z[t]  = hstack([A[t-1], X[t]])
            Gf[t] = gate_fn(Z[t] @ Wf + bf)
            Gu[t] = gate_fn(Z[t] @ Wu + bu)
            Go[t] = gate_fn(Z[t] @ Wo + bo)
            Cc[t] = act_fn(Z[t] @ Wc + bc)
            C[t]  = Gf[t] * C[t-1] + Gu[t] * Cc[t]
            A[t]  = Go[t] * act_fn(C[t])

        where `@` indicates dot/matrix product, and '*' indicates elementwise
        multiplication. ``A[0]`` and ``C[0]`` are initialized to zeros.

        Parameters
        ----------
        n_out : int
            The dimension of a single hidden state / output on a given timestep.
        act_fn : str, :doc:`Activation <numpy_ml.neural_nets.activations>` object, or None
            The activation function for computing ``A[t]``. Default is
            `'Tanh'`.
        gate_fn : str, :doc:`Activation <numpy_ml.neural_nets.activations>` object, or None
            The gate function for computing the update, forget, and output
            gates. Default is `'Sigmoid'`.
        init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}
            The weight initialization strategy. Default is `'glorot_uniform'`.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method.  If None, use the :class:`SGD
            <numpy_ml.neural_nets.optimizers.SGD>` optimizer with default
            parameters. Default is None.
        """  # noqa: E501
        super().__init__(optimizer)

        self.init = init
        # the input dimension is inferred from the first batch seen by `forward`
        self.n_in = None
        self.n_out = n_out
        self.n_timesteps = None
        self.act_fn = ActivationInitializer(act_fn)()
        self.gate_fn = ActivationInitializer(gate_fn)()
        self.parameters = {
            "Wf": None,
            "Wu": None,
            "Wc": None,
            "Wo": None,
            "bf": None,
            "bu": None,
            "bc": None,
            "bo": None,
        }
        self.is_initialized = False

    def _init_params(self):
        """
        Allocate the weights, gradients, and bookkeeping containers.

        Requires ``self.n_in`` to be set. Each weight matrix has shape
        `(n_in + n_out, n_out)` and each bias has shape `(1, n_out)`.
        """
        self.X = []
        init_weights_gate = WeightInitializer(str(self.gate_fn), mode=self.init)
        init_weights_act = WeightInitializer(str(self.act_fn), mode=self.init)

        # the draw order (Wf, Wu, Wc, Wo) is kept fixed so that seeded
        # initializations remain reproducible
        w_shape = (self.n_in + self.n_out, self.n_out)
        Wf = init_weights_gate(w_shape)
        Wu = init_weights_gate(w_shape)
        Wc = init_weights_act(w_shape)
        Wo = init_weights_gate(w_shape)

        b_shape = (1, self.n_out)
        bf = np.zeros(b_shape)
        bu = np.zeros(b_shape)
        bc = np.zeros(b_shape)
        bo = np.zeros(b_shape)

        self.parameters = {
            "Wf": Wf,
            "Wu": Wu,
            "Wc": Wc,
            "Wo": Wo,
            "bf": bf,
            "bu": bu,
            "bc": bc,
            "bo": bo,
        }

        self.gradients = {k: np.zeros_like(v) for k, v in self.parameters.items()}

        self.derived_variables = {
            "C": [],
            "A": [],
            "Gf": [],
            "Gu": [],
            "Go": [],
            "Gc": [],
            "Cc": [],
            "n_timesteps": 0,
            "current_step": 0,
            "dLdA_accumulator": None,
            "dLdC_accumulator": None,
        }

        self.is_initialized = True

    def _get_params(self):
        """Return the parameters as the tuple (Wf, Wu, Wc, Wo, bf, bu, bc, bo)."""
        names = ("Wf", "Wu", "Wc", "Wo", "bf", "bu", "bc", "bo")
        return tuple(self.parameters[name] for name in names)

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        return {
            "layer": "LSTMCell",
            "init": self.init,
            "n_in": self.n_in,
            "n_out": self.n_out,
            "act_fn": str(self.act_fn),
            "gate_fn": str(self.gate_fn),
            "optimizer": {
                "cache": self.optimizer.cache,
                "hyperparameters": self.optimizer.hyperparameters,
            },
        }

    def forward(self, Xt):
        """
        Compute the layer output for a single timestep.

        Parameters
        ----------
        Xt : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)`
            Input at timestep t consisting of `n_ex` examples each of
            dimensionality `n_in`.

        Returns
        -------
        At: :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_out)`
            The value of the hidden state at timestep `t` for each of the `n_ex`
            examples.
        Ct: :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_out)`
            The value of the cell/memory state at timestep `t` for each of the
            `n_ex` examples.
        """
        if not self.is_initialized:
            self.n_in = Xt.shape[1]
            self._init_params()

        Wf, Wu, Wc, Wo, bf, bu, bc, bo = self._get_params()

        self.derived_variables["n_timesteps"] += 1
        self.derived_variables["current_step"] += 1

        # on the first step of a sequence, seed the hidden and cell states
        # with zeros
        if len(self.derived_variables["A"]) == 0:
            n_ex = Xt.shape[0]
            self.derived_variables["A"].append(np.zeros((n_ex, self.n_out)))
            self.derived_variables["C"].append(np.zeros((n_ex, self.n_out)))

        A_prev = self.derived_variables["A"][-1]
        C_prev = self.derived_variables["C"][-1]

        # concatenate A_prev and Xt to create Zt
        Zt = np.hstack([A_prev, Xt])

        Gft = self.gate_fn(Zt @ Wf + bf)
        Gut = self.gate_fn(Zt @ Wu + bu)
        Got = self.gate_fn(Zt @ Wo + bo)
        Cct = self.act_fn(Zt @ Wc + bc)
        Ct = Gft * C_prev + Gut * Cct
        At = Got * self.act_fn(Ct)

        # bookkeeping: cache everything `backward` needs for this timestep
        self.X.append(Xt)
        self.derived_variables["A"].append(At)
        self.derived_variables["C"].append(Ct)
        self.derived_variables["Gf"].append(Gft)
        self.derived_variables["Gu"].append(Gut)
        self.derived_variables["Go"].append(Got)
        self.derived_variables["Cc"].append(Cct)
        return At, Ct

    def backward(self, dLdAt):
        """
        Backprop for a single timestep.

        Notes
        -----
        Each call decrements the ``current_step`` counter and processes the
        timestep it then points to, so calls walk backwards through the
        timesteps cached by :meth:`forward`. Parameter gradients are
        accumulated (added) across calls.

        Parameters
        ----------
        dLdAt : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_out)`
            The gradient of the loss wrt. the layer outputs (ie., hidden
            states) at timestep `t`.

        Returns
        -------
        dLdXt : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)`
            The gradient of the loss wrt. the layer inputs at timestep `t`.
        """
        assert self.trainable, "Layer is frozen"

        Wf, Wu, Wc, Wo, bf, bu, bc, bo = self._get_params()

        self.derived_variables["current_step"] -= 1
        t = self.derived_variables["current_step"]

        Got = self.derived_variables["Go"][t]
        Gft = self.derived_variables["Gf"][t]
        Gut = self.derived_variables["Gu"][t]
        Cct = self.derived_variables["Cc"][t]
        # "A" and "C" carry an extra leading entry (the zero initial state),
        # hence the offset of one relative to the gate lists
        At = self.derived_variables["A"][t + 1]
        Ct = self.derived_variables["C"][t + 1]
        C_prev = self.derived_variables["C"][t]
        A_prev = self.derived_variables["A"][t]

        Xt = self.X[t]
        Zt = np.hstack([A_prev, Xt])

        dA_acc = self.derived_variables["dLdA_accumulator"]
        dC_acc = self.derived_variables["dLdC_accumulator"]

        # initialize accumulators
        if dA_acc is None:
            dA_acc = np.zeros_like(At)

        if dC_acc is None:
            dC_acc = np.zeros_like(Ct)

        # Gradient calculations
        # ---------------------

        # total gradient reaching A[t] and C[t]: the external signal plus
        # whatever flowed back from timestep t + 1
        dA = dLdAt + dA_acc
        dC = dC_acc + dA * Got * self.act_fn.grad(Ct)

        # compute the input to the gate functions at timestep t
        _Go = Zt @ Wo + bo
        _Gf = Zt @ Wf + bf
        _Gu = Zt @ Wu + bu
        _Gc = Zt @ Wc + bc

        # compute gradients wrt the *input* to each gate
        dGot = dA * self.act_fn(Ct) * self.gate_fn.grad(_Go)
        dCct = dC * Gut * self.act_fn.grad(_Gc)
        dGut = dC * Cct * self.gate_fn.grad(_Gu)
        dGft = dC * C_prev * self.gate_fn.grad(_Gf)

        # Z[t] = [A[t-1], X[t]], so the first n_out columns of dZ belong to
        # the previous hidden state and the remainder to the input
        dZ = dGft @ Wf.T + dGut @ Wu.T + dCct @ Wc.T + dGot @ Wo.T
        dXt = dZ[:, self.n_out :]

        self.gradients["Wc"] += Zt.T @ dCct
        self.gradients["Wu"] += Zt.T @ dGut
        self.gradients["Wf"] += Zt.T @ dGft
        self.gradients["Wo"] += Zt.T @ dGot
        self.gradients["bo"] += dGot.sum(axis=0, keepdims=True)
        self.gradients["bu"] += dGut.sum(axis=0, keepdims=True)
        self.gradients["bf"] += dGft.sum(axis=0, keepdims=True)
        self.gradients["bc"] += dCct.sum(axis=0, keepdims=True)

        # signal handed to the next (earlier) timestep
        self.derived_variables["dLdA_accumulator"] = dZ[:, : self.n_out]
        self.derived_variables["dLdC_accumulator"] = Gft * dC
        return dXt

    def flush_gradients(self):
        """
        Erase all the layer's derived variables and gradients.

        Notes
        -----
        Every entry of ``derived_variables`` is emptied, the step counters are
        reset to 0, and both backward-pass accumulators are reset to None so
        that :meth:`backward` re-creates them with zeros on the next pass.
        """
        assert self.trainable, "Layer is frozen"

        self.X = []
        for k in self.derived_variables:
            self.derived_variables[k] = []

        self.derived_variables["n_timesteps"] = 0
        self.derived_variables["current_step"] = 0

        # `backward` only zero-initializes an accumulator when it is None; an
        # empty list would instead be added to (and fail to broadcast against)
        # the incoming gradient
        self.derived_variables["dLdA_accumulator"] = None
        self.derived_variables["dLdC_accumulator"] = None

        # reset parameter gradients to 0
        for k, v in self.parameters.items():
            self.gradients[k] = np.zeros_like(v)


class RNN(LayerBase):
    def __init__(self, n_out, act_fn="Tanh", init="glorot_uniform", optimizer=None):
        """
        A single vanilla (Elman)-RNN layer.

        Notes
        -----
        The layer wraps a single recurrent cell (created lazily on the first
        call to :meth:`forward`) and applies it to each timestep in turn.

        Parameters
        ----------
        n_out : int
            The dimension of a single hidden state / output on a given
            timestep.
        act_fn : str, :doc:`Activation <numpy_ml.neural_nets.activations>` object, or None
            The activation function for computing ``A[t]``. Default is
            `'Tanh'`.
        init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}
            The weight initialization strategy. Default is `'glorot_uniform'`.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method.  If None, use the :class:`SGD
            <numpy_ml.neural_nets.optimizers.SGD>` optimizer with default
            parameters. Default is None.
        """  # noqa: E501
        super().__init__(optimizer)

        self.init = init
        # the input dimension is inferred from the first batch seen by `forward`
        self.n_in = None
        self.n_out = n_out
        self.n_timesteps = None
        self.act_fn = ActivationInitializer(act_fn)()
        self.is_initialized = False

    def _init_params(self):
        """Create the wrapped recurrent cell and mark the layer initialized."""
        # NOTE(review): RNNCell's constructor is defined elsewhere in the
        # file; confirm that it accepts the `n_in` keyword
        self.cell = RNNCell(
            n_in=self.n_in,
            n_out=self.n_out,
            act_fn=self.act_fn,
            init=self.init,
            optimizer=self.optimizer,
        )
        self.is_initialized = True

    @property
    def hyperparameters(self):
        """
        Return a dictionary containing the layer hyperparameters.

        The optimizer entry is read from the wrapped cell, so the cell must
        already exist (ie., :meth:`forward` has been called at least once).
        """
        return {
            "layer": "RNN",
            "init": self.init,
            "n_in": self.n_in,
            "n_out": self.n_out,
            "act_fn": str(self.act_fn),
            "optimizer": self.cell.hyperparameters["optimizer"],
        }

    def forward(self, X):
        """
        Run a forward pass across all timesteps in the input.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in, n_t)`
            Input consisting of `n_ex` examples each of dimensionality `n_in`
            and extending for `n_t` timesteps.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_out, n_t)`
            The value of the hidden state for each of the `n_ex` examples
            across each of the `n_t` timesteps.
        """
        if not self.is_initialized:
            self.n_in = X.shape[1]
            self._init_params()

        _, _, n_t = X.shape
        # feed the cell one timestep at a time, in chronological order
        Y = [self.cell.forward(X[:, :, t]) for t in range(n_t)]
        return np.dstack(Y)

    def backward(self, dLdA):
        """
        Run a backward pass across all timesteps in the input.

        Parameters
        ----------
        dLdA : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_out, n_t)`
            The gradient of the loss with respect to the layer output for each
            of the `n_ex` examples across all `n_t` timesteps.

        Returns
        -------
        dLdX : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in, n_t)`
            The gradient of the loss with respect to the layer input for each
            of the `n_ex` examples across each of the `n_t` timesteps.
        """
        assert self.cell.trainable, "Layer is frozen"
        _, _, n_t = dLdA.shape

        # the cell is visited from the last timestep to the first; collect the
        # per-step gradients in that order and flip once at the end rather
        # than inserting at the front of the list on every step
        dLdX = [self.cell.backward(dLdA[:, :, t]) for t in reversed(range(n_t))]
        dLdX.reverse()
        return np.dstack(dLdX)

    @property
    def derived_variables(self):
        """
        Return a dictionary containing any intermediate variables computed
        during the forward / backward passes.
        """
        return self.cell.derived_variables

    @property
    def gradients(self):
        """
        Return a dictionary of the gradients computed during the backward
        pass
        """
        return self.cell.gradients

    @property
    def parameters(self):
        """Return a dictionary of the current layer parameters"""
        return self.cell.parameters

    def set_params(self, summary_dict):
        """
        Set the layer parameters from a dictionary of values.

        Parameters
        ----------
        summary_dict : dict
            A dictionary of layer parameters and hyperparameters. If a required
            parameter or hyperparameter is not included within `summary_dict`,
            this method will use the value in the current layer's
            :meth:`summary` method.

        Returns
        -------
        layer : :doc:`Layer <numpy_ml.neural_nets.layers>` object
            The newly-initialized layer.
        """
        layer = super().set_params(summary_dict)
        # NOTE(review): `set_parameters` is not defined in the visible part of
        # the file; confirm the cell actually exposes it
        return layer.cell.set_parameters(summary_dict)

    def freeze(self):
        """
        Freeze the layer parameters at their current values so they can no
        longer be updated.
        """
        self.cell.freeze()

    def unfreeze(self):
        """Unfreeze the layer parameters so they can be updated."""
        self.cell.unfreeze()

    def flush_gradients(self):
        """Erase all the layer's derived variables and gradients."""
        self.cell.flush_gradients()

    def update(self):
        """
        Update the layer parameters using the accrued gradients and layer
        optimizer. Flush all gradients once the update is complete.
        """
        self.cell.update()
        self.flush_gradients()


class LSTM(LayerBase):
    def __init__(
        self,
        n_out,
        act_fn="Tanh",
        gate_fn="Sigmoid",
        init="glorot_uniform",
        optimizer=None,
    ):
        """
        A single long short-term memory (LSTM) RNN layer.

        Notes
        -----
        The layer wraps a single :class:`LSTMCell` (created lazily on the first
        call to :meth:`forward`) and applies it to each timestep in turn.

        Parameters
        ----------
        n_out : int
            The dimension of a single hidden state / output on a given timestep.
        act_fn : str, :doc:`Activation <numpy_ml.neural_nets.activations>` object, or None
            The activation function for computing ``A[t]``. Default is `'Tanh'`.
        gate_fn : str, :doc:`Activation <numpy_ml.neural_nets.activations>` object, or None
            The gate function for computing the update, forget, and output
            gates. Default is `'Sigmoid'`.
        init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}
            The weight initialization strategy. Default is `'glorot_uniform'`.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method.  If None, use the :class:`SGD
            <numpy_ml.neural_nets.optimizers.SGD>` optimizer with
            default parameters. Default is None.
        """  # noqa: E501
        super().__init__(optimizer)

        self.init = init
        # the input dimension is inferred from the first batch seen by `forward`
        self.n_in = None
        self.n_out = n_out
        self.n_timesteps = None
        self.act_fn = ActivationInitializer(act_fn)()
        self.gate_fn = ActivationInitializer(gate_fn)()
        self.is_initialized = False

    def _init_params(self):
        """Create the wrapped :class:`LSTMCell` and mark the layer initialized."""
        # LSTMCell takes no `n_in` argument: it infers the input dimension
        # from the first batch passed to its own `forward`. The layer's
        # optimizer is forwarded so the cell does not fall back to its default.
        self.cell = LSTMCell(
            n_out=self.n_out,
            act_fn=self.act_fn,
            gate_fn=self.gate_fn,
            init=self.init,
            optimizer=self.optimizer,
        )
        self.is_initialized = True

    @property
    def hyperparameters(self):
        """
        Return a dictionary containing the layer hyperparameters.

        The optimizer entry is read from the wrapped cell, so the cell must
        already exist (ie., :meth:`forward` has been called at least once).
        """
        return {
            "layer": "LSTM",
            "init": self.init,
            "n_in": self.n_in,
            "n_out": self.n_out,
            "act_fn": str(self.act_fn),
            "gate_fn": str(self.gate_fn),
            "optimizer": self.cell.hyperparameters["optimizer"],
        }

    def forward(self, X):
        """
        Run a forward pass across all timesteps in the input.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in, n_t)`
            Input consisting of `n_ex` examples each of dimensionality `n_in`
            and extending for `n_t` timesteps.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_out, n_t)`
            The value of the hidden state for each of the `n_ex` examples
            across each of the `n_t` timesteps.
        """
        if not self.is_initialized:
            self.n_in = X.shape[1]
            self._init_params()

        _, _, n_t = X.shape
        Y = []
        for t in range(n_t):
            # the cell also returns the memory state, which is not part of
            # the layer output
            At, _ = self.cell.forward(X[:, :, t])
            Y.append(At)
        return np.dstack(Y)

    def backward(self, dLdA):
        """
        Run a backward pass across all timesteps in the input.

        Parameters
        ----------
        dLdA : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_out, n_t)`
            The gradient of the loss with respect to the layer output for each
            of the `n_ex` examples across all `n_t` timesteps.

        Returns
        -------
        dLdX : :py:class:`ndarray <numpy.ndarray>` of shape (`n_ex`, `n_in`, `n_t`)
            The gradient of the loss with respect to the layer input for each
            of the `n_ex` examples across each of the `n_t` timesteps.
        """  # noqa: E501
        assert self.cell.trainable, "Layer is frozen"
        _, _, n_t = dLdA.shape

        # the cell is visited from the last timestep to the first. Its
        # `backward` returns a single array (the input gradient), so the
        # result must not be tuple-unpacked.
        dLdX = [self.cell.backward(dLdA[:, :, t]) for t in reversed(range(n_t))]
        dLdX.reverse()
        return np.dstack(dLdX)

    @property
    def derived_variables(self):
        """
        Return a dictionary containing any intermediate variables computed
        during the forward / backward passes.
        """
        return self.cell.derived_variables

    @property
    def gradients(self):
        """
        Return a dictionary of the gradients computed during the backward
        pass
        """
        return self.cell.gradients

    @property
    def parameters(self):
        """Return a dictionary of the current layer parameters"""
        return self.cell.parameters

    def freeze(self):
        """
        Freeze the layer parameters at their current values so they can no
        longer be updated.
        """
        self.cell.freeze()

    def unfreeze(self):
        """Unfreeze the layer parameters so they can be updated."""
        self.cell.unfreeze()

    def set_params(self, summary_dict):
        """
        Set the layer parameters from a dictionary of values.

        Parameters
        ----------
        summary_dict : dict
            A dictionary of layer parameters and hyperparameters. If a required
            parameter or hyperparameter is not included within `summary_dict`,
            this method will use the value in the current layer's
            :meth:`summary` method.

        Returns
        -------
        layer : :doc:`Layer <numpy_ml.neural_nets.layers>` object
            The newly-initialized layer.
        """
        layer = super().set_params(summary_dict)
        # NOTE(review): `set_parameters` is not defined on LSTMCell itself;
        # confirm that the base layer class provides it
        return layer.cell.set_parameters(summary_dict)

    def flush_gradients(self):
        """Erase all the layer's derived variables and gradients."""
        self.cell.flush_gradients()

    def update(self):
        """
        Update the layer parameters using the accrued gradients and layer
        optimizer. Flush all gradients once the update is complete.
        """
        self.cell.update()
        self.flush_gradients()


================================================
FILE: numpy_ml/neural_nets/losses/README.md
================================================
# Losses

The `losses.py` module implements several common loss functions, including:

- Squared error
- Cross-entropy
- Variational lower-bound for binary VAE ([Kingma & Welling, 2014](https://arxiv.org/abs/1312.6114))
- WGAN-GP loss for generator and critic ([Gulrajani et al., 2017](https://arxiv.org/pdf/1704.00028.pdf))
- Noise contrastive estimation (NCE) loss ([Gutmann &
  Hyv&auml;rinen, 2010](https://www.cs.helsinki.fi/u/ahyvarin/papers/Gutmann10AISTATS.pdf); [Mnih & Teh, 2012](https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf))


================================================
FILE: numpy_ml/neural_nets/losses/__init__.py
================================================
"""
Common neural network loss functions.

This module implements loss objects that can be used during neural network
training.
"""

from .losses import *


================================================
FILE: numpy_ml/neural_nets/losses/losses.py
================================================
from abc import ABC, abstractmethod

import numpy as np

from ...utils.testing import is_binary, is_stochastic
from ..initializers import (
    WeightInitializer,
    ActivationInitializer,
    OptimizerInitializer,
)


class ObjectiveBase(ABC):
    """Abstract interface that the loss classes in this module implement."""

    def __init__(self):
        super().__init__()

    @abstractmethod
    def loss(self, y_true, y_pred):
        """Evaluate the objective for targets `y_true` and predictions `y_pred`."""

    @abstractmethod
    def grad(self, y_true, y_pred, **kwargs):
        """Evaluate the gradient of the objective; subclasses define the details."""


class SquaredError(ObjectiveBase):
    def __init__(self):
        r"""
        A squared-error / `L2` loss.

        Notes
        -----
        For real-valued target **y** and predictions :math:`\hat{\mathbf{y}}`, the
        squared error is

        .. math::
                \mathcal{L}(\mathbf{y}, \hat{\mathbf{y}})
                    = 0.5 ||\hat{\mathbf{y}} - \mathbf{y}||_2^2
        """
        super().__init__()

    def __call__(self, y, y_pred):
        """Shorthand for :meth:`loss`."""
        return self.loss(y, y_pred)

    def __str__(self):
        return "SquaredError"

    @staticmethod
    def loss(y, y_pred):
        """
        Compute the squared error between `y` and `y_pred`.

        Parameters
        ----------
        y : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            Ground truth values for each of `n` examples
        y_pred : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            Predictions for the `n` examples in the batch.

        Returns
        -------
        loss : float
            The sum of the squared error across dimensions and examples.
        """
        # with no `ord`/`axis`, the norm is taken over all entries of the
        # residual, so squaring it yields the sum of squared errors
        return 0.5 * np.linalg.norm(y_pred - y) ** 2

    @staticmethod
    def grad(y, y_pred, z, act_fn):
        r"""
        Gradient of the squared error loss with respect to the pre-nonlinearity
        input, `z`.

        Notes
        -----
        The current method computes the gradient :math:`\frac{\partial
        \mathcal{L}}{\partial \mathbf{z}}`, where

        .. math::

            \mathcal{L}(\mathbf{z})
                &=  \text{squared_error}(\mathbf{y}, g(\mathbf{z})) \\
            g(\mathbf{z})
                &=  \text{act_fn}(\mathbf{z})

        The gradient with respect to :math:`\mathbf{z}` is then

        .. math::

            \frac{\partial \mathcal{L}}{\partial \mathbf{z}}
                = (g(\mathbf{z}) - \mathbf{y}) \left(
                    \frac{\partial g}{\partial \mathbf{z}} \right)

        Parameters
        ----------
        y : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            Ground truth values for each of `n` examples.
        y_pred : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            Predictions for the `n` examples in the batch.
        z : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            The pre-nonlinearity input at which ``act_fn.grad`` is evaluated.
        act_fn : :doc:`Activation <numpy_ml.neural_nets.activations>` object
            The activation function for the output layer of the network.

        Returns
        -------
        grad : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            The gradient of the squared error loss with respect to `z`.
        """
        return (y_pred - y) * act_fn.grad(z)


class CrossEntropy(ObjectiveBase):
    def __init__(self):
        r"""
        A cross-entropy loss.

        Notes
        -----
        For a one-hot target **y** and predicted class probabilities
        :math:`\hat{\mathbf{y}}`, the cross entropy is

        .. math::
                \mathcal{L}(\mathbf{y}, \hat{\mathbf{y}})
                    = -\sum_i y_i \log \hat{y}_i
        """
        super().__init__()

    def __call__(self, y, y_pred):
        """Shorthand for :meth:`loss`."""
        return self.loss(y, y_pred)

    def __str__(self):
        return "CrossEntropy"

    @staticmethod
    def loss(y, y_pred):
        """
        Compute the cross-entropy (log) loss.

        Notes
        -----
        This method returns the sum (not the average!) of the losses for each
        sample.

        Parameters
        ----------
        y : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            Class labels (one-hot with `m` possible classes) for each of `n`
            examples.
        y_pred : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            Probabilities of each of `m` classes for the `n` examples in the
            batch.

        Returns
        -------
        loss : float
            The sum of the cross-entropy across classes and examples.
        """
        is_binary(y)
        is_stochastic(y_pred)

        # prevent taking the log of 0
        eps = np.finfo(float).eps

        # each example is associated with a single class; sum the negative log
        # probability of the correct label over all samples in the batch.
        # observe that we are taking advantage of the fact that y is one-hot
        # encoded
        cross_entropy = -np.sum(y * np.log(y_pred + eps))
        return cross_entropy

    @staticmethod
    def grad(y, y_pred):
        r"""
        Compute the gradient of the cross entropy loss with regard to the
        softmax input, `z`.

        Notes
        -----
        The gradient for this method goes through both the cross-entropy loss
        AND the softmax non-linearity to return :math:`\frac{\partial
        \mathcal{L}}{\partial \mathbf{z}}` (rather than :math:`\frac{\partial
        \mathcal{L}}{\partial \text{softmax}(\mathbf{z})}`).

        In particular, let:

        .. math::

            \mathcal{L}(\mathbf{z})
                = \text{cross_entropy}(\text{softmax}(\mathbf{z})).

        The current method computes:

        .. math::

            \frac{\partial \mathcal{L}}{\partial \mathbf{z}}
                &= \text{softmax}(\mathbf{z}) - \mathbf{y} \\
                &=  \hat{\mathbf{y}} - \mathbf{y}

        Parameters
        ----------
        y : :py:class:`ndarray <numpy.ndarray>` of shape `(n, m)`
            A one-hot encoding of the true class labels. Each row constitutes a
            training example, and each column is a different class.
        y_pred: :py:class:`ndarray <numpy.ndarray>` of shape `(n, m)`
            The network predictions for the probability of each of `m` class
            labels on each of `n` examples in a batch.

        Returns
        -------
        grad : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            The gradient of the cross-entropy loss with respect to the *input*
            to the softmax function.
        """
        is_binary(y)
        is_stochastic(y_pred)

        # derivative of xe wrt z is y_pred - y_true, hence we can just
        # subtract 1 from the probability of the correct class labels
        grad = y_pred - y

        # the gradient is deliberately left unscaled: like `loss`, it is
        # summed (not averaged) over the examples in the batch
        return grad


class VAELoss(ObjectiveBase):
    def __init__(self):
        r"""
        The variational lower bound for a variational autoencoder with Bernoulli
        units.

        Notes
        -----
        The VLB is the sum of the binary cross entropy between the true input and
        the predicted output (the "reconstruction loss") and the KL divergence
        between the learned variational distribution :math:`q` and the prior,
        :math:`p`, assumed to be a unit Gaussian.

        .. math::

            \text{VAELoss} =
                \text{cross_entropy}(\mathbf{y}, \hat{\mathbf{y}})
                    + \mathbb{KL}[q \ || \ p]

        where :math:`\mathbb{KL}[q \ || \ p]` is the Kullback-Leibler
        divergence between the distributions :math:`q` and :math:`p`.

        References
        ----------
        .. [1] Kingma, D. P. & Welling, M. (2014). "Auto-encoding variational Bayes".
           *arXiv preprint arXiv:1312.6114.* https://arxiv.org/pdf/1312.6114.pdf
        """
        super().__init__()

    def __call__(self, y, y_pred, t_mean, t_log_var):
        """Shorthand for :meth:`loss`."""
        return self.loss(y, y_pred, t_mean, t_log_var)

    def __str__(self):
        return "VAELoss"

    @staticmethod
    def loss(y, y_pred, t_mean, t_log_var):
        r"""
        Variational lower bound for a Bernoulli VAE.

        Parameters
        ----------
        y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, N)`
            The original images.
        y_pred : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, N)`
            The VAE reconstruction of the images.
        t_mean: :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, T)`
            Mean of the variational distribution :math:`q(t \mid x)`.
        t_log_var: :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, T)`
            Log of the variance vector of the variational distribution
            :math:`q(t \mid x)`.

        Returns
        -------
        loss : float
            The VLB, averaged across the batch.
        """
        # prevent nan on log(0)
        eps = np.finfo(float).eps
        y_pred = np.clip(y_pred, eps, 1 - eps)

        # reconstruction loss: binary cross-entropy, summed per example
        rec_loss = -np.sum(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred), axis=1)

        # KL divergence between the variational distribution q and the prior p,
        # a unit gaussian
        kl_loss = -0.5 * np.sum(1 + t_log_var - t_mean ** 2 - np.exp(t_log_var), axis=1)
        loss = np.mean(kl_loss + rec_loss)
        return loss

    @staticmethod
    def grad(y, y_pred, t_mean, t_log_var):
        """
        Compute the gradient of the VLB with regard to the network parameters.

        Notes
        -----
        Each gradient is scaled by the reciprocal of the batch size, matching
        the batch average taken in :meth:`loss`.

        Parameters
        ----------
        y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, N)`
            The original images.
        y_pred : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, N)`
            The VAE reconstruction of the images.
        t_mean: :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, T)`
            Mean of the variational distribution :math:`q(t | x)`.
        t_log_var: :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, T)`
            Log of the variance vector of the variational distribution
            :math:`q(t | x)`.

        Returns
        -------
        dY_pred : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, N)`
            The gradient of the VLB with regard to `y_pred`.
        dLogVar : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, T)`
            The gradient of the VLB with regard to `t_log_var`.
        dMean : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, T)`
            The gradient of the VLB with regard to `t_mean`.
        """
        # number of examples in the batch
        N = y.shape[0]
        # same clipping as in `loss`, so the divisions below never hit 0
        eps = np.finfo(float).eps
        y_pred = np.clip(y_pred, eps, 1 - eps)

        dY_pred = -y / (N * y_pred) - (y - 1) / (N - N * y_pred)
        dLogVar = (np.exp(t_log_var) - 1) / (2 * N)
        dMean = t_mean / N
        return dY_pred, dLogVar, dMean


class WGAN_GPLoss(ObjectiveBase):
    def __init__(self, lambda_=10):
        """
        The loss function for a Wasserstein GAN [*]_ [*]_ with gradient penalty.

        Notes
        -----
        Assuming an optimal critic, minimizing this quantity wrt. the generator
        parameters corresponds to minimizing the Wasserstein-1 (earth-mover)
        distance between the fake and real data distributions.

        The formula for the WGAN-GP critic loss, as computed by :meth:`loss`, is

        .. math::

            \\text{WGANLoss}
                &=  \\sum_{x' \\in X_{fake}} p(x') D(x')
                    - \\sum_{x \\in X_{real}} p(x) D(x) \\\\
            \\text{WGANLossGP}
                &=  \\text{WGANLoss} + \\lambda
                    (||\\nabla_{X_{interp}} D(X_{interp})||_2 - 1)^2

        where

        .. math::

            X_{fake}  &=   \\text{Generator}(\\mathbf{z}) \\\\
            X_{interp}   &=   \\alpha X_{real} + (1 - \\alpha) X_{fake} \\\\

        and

        .. math::

            \\mathbf{z}  &\\sim  \\mathcal{N}(0, \\mathbb{1}) \\\\
            \\alpha  &\\sim  \\text{Uniform}(0, 1)

        The generator loss is the negated mean critic score on the fake data.

        References
        ----------
        .. [*] Gulrajani, I., Ahmed, F., Arjovsky, M., Dumoulin, V., &
           Courville, A. (2017) "Improved training of Wasserstein GANs"
           *Advances in Neural Information Processing Systems, 30*: 5769-5779.
        .. [*] Goodfellow, I. J., Pouget-Abadie, J., Mirza, M., Xu, B.,
           Warde-Farley, D., Ozair, S., Courville, A., & Bengio, Y. (2014)
           "Generative adversarial nets" *Advances in Neural Information
           Processing Systems, 27*: 2672-2680.

        Parameters
        ----------
        lambda_ : float
            The gradient penalty coefficient. Default is 10.
        """
        self.lambda_ = lambda_
        super().__init__()

    def __call__(self, Y_fake, module, Y_real=None, gradInterp=None):
        """
        Computes the generator and critic loss using the WGAN-GP value
        function. Thin wrapper around :meth:`loss`.

        Parameters
        ----------
        Y_fake : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex,)`
            The output of the critic for `X_fake`.
        module : {'C', 'G'}
            Whether to calculate the loss for the critic ('C') or the generator
            ('G'). If calculating loss for the critic, `Y_real` and
            `gradInterp` must not be None.
        Y_real : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex,)`, or None
            The output of the critic for `X_real`. Default is None.
        gradInterp : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_feats)`, or None
            The gradient of the critic output for `X_interp` wrt. `X_interp`.
            Default is None.

        Returns
        -------
        loss : float
            Depending on the setting for `module`, either the critic or
            generator loss, averaged over examples in the minibatch.

        Raises
        ------
        ValueError
            If `module` is not 'C' or 'G', or if `module` is 'C' and either
            `Y_real` or `gradInterp` is None.
        """
        return self.loss(Y_fake, module, Y_real=Y_real, gradInterp=gradInterp)

    def __str__(self):
        return "WGANLossGP(lambda_={})".format(self.lambda_)

    @staticmethod
    def _check_critic_inputs(Y_real, gradInterp):
        """Raise a ValueError if a critic-only argument was left as None."""
        if Y_real is None or gradInterp is None:
            raise ValueError(
                "`Y_real` and `gradInterp` must not be None when module is 'C'"
            )

    def loss(self, Y_fake, module, Y_real=None, gradInterp=None):
        """
        Computes the generator and critic loss using the WGAN-GP value
        function.

        Parameters
        ----------
        Y_fake : :py:class:`ndarray <numpy.ndarray>` of shape (n_ex,)
            The output of the critic for `X_fake`.
        module : {'C', 'G'}
            Whether to calculate the loss for the critic ('C') or the generator
            ('G'). If calculating loss for the critic, `Y_real` and
            `gradInterp` must not be None.
        Y_real : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex,)` or None
            The output of the critic for `X_real`. Default is None.
        gradInterp : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_feats)` or None
            The gradient of the critic output for `X_interp` wrt. `X_interp`.
            Default is None.

        Returns
        -------
        loss : float
            Depending on the setting for `module`, either the critic or
            generator loss, averaged over examples in the minibatch.

        Raises
        ------
        ValueError
            If `module` is not 'C' or 'G', or if `module` is 'C' and either
            `Y_real` or `gradInterp` is None.
        """
        # calc critic loss including gradient penalty
        if module == "C":
            self._check_critic_inputs(Y_real, gradInterp)

            # per-example L2 norm of the critic gradient at the interpolates
            X_interp_norm = np.linalg.norm(gradInterp, axis=1, keepdims=True)
            gradient_penalty = (X_interp_norm - 1) ** 2
            loss = (
                Y_fake.mean() - Y_real.mean() + self.lambda_ * gradient_penalty.mean()
            )

        # calc generator loss
        elif module == "G":
            loss = -Y_fake.mean()

        else:
            raise ValueError("Unrecognized module: {}".format(module))

        return loss

    def grad(self, Y_fake, module, Y_real=None, gradInterp=None):
        """
        Computes the gradient of the generator or critic loss with regard to
        its inputs.

        Parameters
        ----------
        Y_fake : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex,)`
            The output of the critic for `X_fake`.
        module : {'C', 'G'}
            Whether to calculate the gradient for the critic loss ('C') or the
            generator loss ('G'). If calculating grads for the critic, `Y_real`
            and `gradInterp` must not be None.
        Y_real : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex,)` or None
            The output of the critic for `X_real`. Default is None.
        gradInterp : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_feats)` or None
            The gradient of the critic output on `X_interp` wrt. `X_interp`.
            Default is None.

        Returns
        -------
        grads : tuple
            If `module` == 'C', returns the 3-tuple
            (`dY_fake`, `dY_real`, `dGradInterp`), where `dY_fake` is filled
            with ``-1 / n_ex_fake``, `dY_real` is filled with
            ``1 / n_ex_real``, and `dGradInterp` is the gradient of the
            weighted gradient penalty with regard to `gradInterp`.
            If `module` == 'G', returns the gradient of the generator loss
            with regard to `Y_fake`.

        Raises
        ------
        ValueError
            If `module` is not 'C' or 'G', or if `module` is 'C' and either
            `Y_real` or `gradInterp` is None.
        """
        eps = np.finfo(float).eps
        n_ex_fake = Y_fake.shape[0]

        # calc gradient of the critic loss
        if module == "C":
            self._check_critic_inputs(Y_real, gradInterp)
            n_ex_real = Y_real.shape[0]

            # NOTE(review): `loss` computes mean(Y_fake) - mean(Y_real) + GP,
            # whose analytic derivatives are +1/n wrt. Y_fake and -1/n wrt.
            # Y_real. The values below have the opposite sign. They are kept
            # unchanged because callers outside this class may already
            # compensate for it -- confirm against the training code before
            # flipping them.
            dY_fake = -1 / n_ex_fake * np.ones_like(Y_fake)
            dY_real = 1 / n_ex_real * np.ones_like(Y_real)

            # differentiate through gradient penalty; eps guards against a
            # zero-norm row in the division below
            X_interp_norm = np.linalg.norm(gradInterp, axis=1, keepdims=True) + eps

            # the penalty in `loss` is averaged over the rows of `gradInterp`,
            # so normalize by that row count
            n_ex_interp = X_interp_norm.shape[0]

            dGradInterp = (
                (2 / n_ex_interp)
                * self.lambda_
                * (X_interp_norm - 1)
                * (gradInterp / X_interp_norm)
            )
            grad = (dY_fake, dY_real, dGradInterp)

        # calc gradient of the generator loss
        elif module == "G":
            grad = -1 / n_ex_fake * np.ones_like(Y_fake)

        else:
            raise ValueError("Unrecognized module: {}".format(module))
        return grad


class NCELoss(ObjectiveBase):
    """
    A trainable noise contrastive estimation (NCE) loss.

    The loss keeps its own per-class weight matrix `W`, bias `b`, and
    optimizer. The full description, formula, references, and parameter
    documentation live on ``__init__``.
    """

    def __init__(
        self,
        n_classes,
        noise_sampler,
        num_negative_samples,
        optimizer=None,
        init="glorot_uniform",
        subtract_log_label_prob=True,
    ):
        """
        A noise contrastive estimation (NCE) loss function.

        Notes
        -----
        Noise contrastive estimation is a candidate sampling method often
        used to reduce the computational challenge of training a softmax
        layer on problems with a large number of output classes. It proceeds by
        training a logistic regression model to discriminate between samples
        from the true data distribution and samples from an artificial noise
        distribution.

        It can be shown that as the ratio of negative samples to data samples
        goes to infinity, the gradient of the NCE loss converges to the
        original softmax gradient.

        For input data **X**, target labels `targets`, loss parameters **W** and
        **b**, and noise samples `noise` sampled from the noise distribution `Q`,
        the NCE loss is

        .. math::

            \\text{NCE}(X, targets) =
                \\text{cross_entropy}(\\mathbf{y}_{targets}, \\hat{\\mathbf{y}}_{targets}) +
                \\text{cross_entropy}(\\mathbf{y}_{noise}, \\hat{\\mathbf{y}}_{noise})

        where

        .. math::

            \\hat{\\mathbf{y}}_{targets}
                &=  \\sigma(\\mathbf{W}[targets] \\mathbf{X} + \\mathbf{b}[targets] - \\log Q(targets)) \\\\
            \\hat{\\mathbf{y}}_{noise}
                &=  \\sigma(\\mathbf{W}[noise] \\mathbf{X} + \\mathbf{b}[noise] - \\log Q(noise))

        In the above equations, :math:`\\sigma` is the logistic sigmoid
        function, and :math:`Q(x)` corresponds to the probability of the values
        in `x` under `Q`.

        References
        ----------
        .. [1] Gutmann, M. & Hyvarinen, A. (2010). Noise-contrastive
           estimation: A new estimation principle for unnormalized statistical
           models. *AISTATS, 13*: 297-304.
        .. [2] Mnih, A. & Teh, Y. W. (2012). A fast and simple algorithm for
           training neural probabilistic language models. *ICML, 29*: 1751-1758.

        Parameters
        ----------
        n_classes : int
            The total number of output classes in the model.
        noise_sampler : :class:`~numpy_ml.utils.data_structures.DiscreteSampler` instance
            The negative sampler. Defines a distribution over all classes in
            the dataset.
        num_negative_samples : int
            The number of negative samples to draw for each target / batch of
            targets.
        init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}
            The weight initialization strategy. Default is 'glorot_uniform'.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method.  If None, use the :class:`SGD
            <numpy_ml.neural_nets.optimizers.SGD>` optimizer with
            default parameters. Default is None.
        subtract_log_label_prob : bool
            Whether to subtract the log of the probability of each label under
            the noise distribution from its respective logit. Set to False for
            negative sampling, True for NCE. Default is True.

        Attributes
        ----------
        gradients : dict
            The accumulated parameter gradients.
        parameters: dict
            The loss parameter values.
        hyperparameters: dict
            The loss hyperparameter values.
        derived_variables: dict
            Useful intermediate values computed during the loss computation.
        """
        super().__init__()

        self.init = init
        # the input dimensionality is inferred from the first batch passed to
        # `loss`, so parameters cannot be created until then
        self.n_in = None
        self.trainable = True
        self.n_classes = n_classes
        self.noise_sampler = noise_sampler
        self.num_negative_samples = num_negative_samples
        self.act_fn = ActivationInitializer("Sigmoid")()
        self.optimizer = OptimizerInitializer(optimizer)()
        self.subtract_log_label_prob = subtract_log_label_prob

        self.is_initialized = False

    def _init_params(self):
        """
        Create the weight matrix `W` of shape `(n_classes, n_in)`, the zero
        bias `b` of shape `(1, n_classes)`, zeroed gradient buffers, and empty
        caches for the inputs and derived variables. Requires ``self.n_in`` to
        have been set.
        """
        init_weights = WeightInitializer(str(self.act_fn), mode=self.init)

        # one cached input per call to `loss` with `retain_derived=True`
        self.X = []
        b = np.zeros((1, self.n_classes))
        W = init_weights((self.n_classes, self.n_in))

        self.parameters = {"W": W, "b": b}

        self.gradients = {"W": np.zeros_like(W), "b": np.zeros_like(b)}

        # each entry is a list that is appended to in lockstep with `self.X`
        self.derived_variables = {
            "y_pred": [],
            "target": [],
            "true_w": [],
            "true_b": [],
            "sampled_b": [],
            "sampled_w": [],
            "out_labels": [],
            "target_logits": [],
            "noise_samples": [],
            "noise_logits": [],
        }

        self.is_initialized = True

    @property
    def hyperparameters(self):
        """
        A dict of the loss hyperparameters, including the cache and
        hyperparameters of the associated optimizer.
        """
        return {
            "id": "NCELoss",
            "n_in": self.n_in,
            "init": self.init,
            "n_classes": self.n_classes,
            "noise_sampler": self.noise_sampler,
            "num_negative_samples": self.num_negative_samples,
            "subtract_log_label_prob": self.subtract_log_label_prob,
            "optimizer": {
                "cache": self.optimizer.cache,
                "hyperparameters": self.optimizer.hyperparameters,
            },
        }

    def __call__(self, X, target, neg_samples=None, retain_derived=True):
        """Compute the NCE loss. Thin wrapper around :meth:`loss`."""
        return self.loss(X, target, neg_samples, retain_derived)

    def __str__(self):
        """
        Render the loss as ``NCELoss(key=value, ...)`` using every
        hyperparameter except `id`, with the optimizer listed last.
        """
        keys = [
            "{}={}".format(k, v)
            for k, v in self.hyperparameters.items()
            if k not in ["id", "optimizer"]
        ] + ["optimizer={}".format(self.optimizer)]
        return "NCELoss({})".format(", ".join(keys))

    def freeze(self):
        """
        Freeze the loss parameters at their current values so they can no
        longer be updated.
        """
        self.trainable = False

    def unfreeze(self):
        """Unfreeze the loss parameters so they can be updated."""
        self.trainable = True

    def flush_gradients(self):
        """
        Erase the cached inputs, all derived variables, and the accumulated
        gradients. Must not be called while the loss is frozen.
        """
        assert self.trainable, "NCELoss is frozen"
        self.X = []
        for k, v in self.derived_variables.items():
            self.derived_variables[k] = []

        for k, v in self.gradients.items():
            self.gradients[k] = np.zeros_like(v)

    def update(self, cur_loss=None):
        """
        Update the loss parameters using the accrued gradients and optimizer.
        Flush all gradients once the update is complete.

        Parameters
        ----------
        cur_loss : float or None
            Forwarded unchanged to the optimizer for each parameter update.
            Default is None.
        """
        assert self.trainable, "NCELoss is frozen"
        self.optimizer.step()
        for k, v in self.gradients.items():
            if k in self.parameters:
                self.parameters[k] = self.optimizer(self.parameters[k], v, k, cur_loss)
        self.flush_gradients()

    def loss(self, X, target, neg_samples=None, retain_derived=True):
        """
        Compute the NCE loss for a collection of inputs and associated targets.

        Notes
        -----
        The loss parameters are created lazily on the first call, using the
        size of the last dimension of `X` as `n_in`.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_c, n_in)`
            Layer input. A minibatch of `n_ex` examples, where each example is
            an `n_c` by `n_in` matrix (e.g., the matrix of `n_c` context
            embeddings, each of dimensionality `n_in`, for a CBOW model).
        target : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex,)`
            Integer indices of the target class(es) for each example in the
            minibatch (e.g., the target word id for an example in a CBOW model).
        neg_samples : :py:class:`ndarray <numpy.ndarray>` of shape (`num_negative_samples`,) or None
            An optional array of negative samples to use during the loss
            calculation. These will be used instead of samples drawn from
            ``self.noise_sampler``. Default is None.
        retain_derived : bool
            Whether to retain the variables calculated during the forward pass
            for use later during backprop. If False, this suggests the layer
            will not be expected to backprop through with regard to this input.
            Default is True.

        Returns
        -------
        loss : float
            The NCE loss summed over the minibatch and samples.
        y_pred : :py:class:`ndarray <numpy.ndarray>` of shape (`n_ex`, `n_c`)
            The network predictions for the conditional probability of each
            target given each context: entry (`i`, `j`) gives the predicted
            probability of target `i` under context vector `j`.
        """
        if not self.is_initialized:
            self.n_in = X.shape[-1]
            self._init_params()

        loss, Z_target, Z_neg, y_pred, y_true, noise_samples = self._loss(
            X, target, neg_samples
        )

        # cache derived variables for gradient calculation
        if retain_derived:
            self.X.append(X)

            self.derived_variables["y_pred"].append(y_pred)
            self.derived_variables["target"].append(target)
            self.derived_variables["out_labels"].append(y_true)
            self.derived_variables["target_logits"].append(Z_target)
            self.derived_variables["noise_samples"].append(noise_samples)
            self.derived_variables["noise_logits"].append(Z_neg)

        # the target prediction is stored in the first column of `y_pred`;
        # return only that column, without the trailing singleton axis
        return loss, np.squeeze(y_pred[..., :1], -1)

    def _loss(self, X, target, neg_samples):
        """
        Actual computation of NCE loss.

        Returns the 6-tuple (`loss`, `Z_target`, `Z_neg`, `y_pred`, `y_true`,
        `noise_samples`), where `noise_samples` is itself the 3-tuple
        (`neg_samples`, `p_target`, `p_neg_samples`).
        """
        fstr = "X must have shape (n_ex, n_c, n_in), but got {} dims instead"
        assert X.ndim == 3, fstr.format(X.ndim)

        W = self.parameters["W"]
        b = self.parameters["b"]

        # sample negative samples from the noise distribution
        if neg_samples is None:
            neg_samples = self.noise_sampler(self.num_negative_samples)
        assert len(neg_samples) == self.num_negative_samples

        # get the probability of the negative sample class and the target
        # class under the noise distribution
        p_neg_samples = self.noise_sampler.probs[neg_samples]
        p_target = np.atleast_2d(self.noise_sampler.probs[target])

        # save the noise samples for debugging
        noise_samples = (neg_samples, p_target, p_neg_samples)

        # compute the logit for the negative samples and target
        # (assuming `target` has shape (n_ex,), `Z_target` holds the logit of
        # every target class for every example: (n_ex, n_c, n_ex))
        Z_target = X @ W[target].T + b[0, target]
        Z_neg = X @ W[neg_samples].T + b[0, neg_samples]

        # subtract the log probability of each label under the noise dist
        # (the log-probabilities broadcast along the last axis)
        if self.subtract_log_label_prob:
            n, m = Z_target.shape[0], Z_neg.shape[0]
            Z_target[range(n), ...] -= np.log(p_target)
            Z_neg[range(m), ...] -= np.log(p_neg_samples)

        # only retain the probability of the target under its associated
        # minibatch example
        # (pairs example i with target column i, then restores a trailing
        # singleton axis)
        aa, _, cc = Z_target.shape
        Z_target = Z_target[range(aa), :, range(cc)][..., None]

        # p_target = (n_ex, n_c, 1)
        # p_neg = (n_ex, n_c, n_samples)
        pred_p_target = self.act_fn(Z_target)
        pred_p_neg = self.act_fn(Z_neg)

        # if we're in evaluation mode, ignore the negative samples - just
        # return the binary cross entropy on the targets
        y_pred = pred_p_target
        if self.trainable:
            # (n_ex, n_c, 1 + n_samples) (target is first column)
            y_pred = np.concatenate((y_pred, pred_p_neg), axis=-1)

        # labels: 1 for the target column, 0 for every negative sample
        n_targets = 1
        y_true = np.zeros_like(y_pred)
        y_true[..., :n_targets] = 1

        # binary cross entropy
        # (`y_pred` is clipped in place -- the 4th positional argument of
        # `np.clip` is `out` -- to prevent log(0))
        eps = np.finfo(float).eps
        np.clip(y_pred, eps, 1 - eps, y_pred)
        loss = -np.sum(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
        return loss, Z_target, Z_neg, y_pred, y_true, noise_samples

    def grad(self, retain_grads=True, update_params=True):
        """
        Compute the gradient of the NCE loss with regard to the inputs,
        weights, and biases.

        Notes
        -----
        Gradients are computed for every input cached by a previous call to
        :meth:`loss` with `retain_derived=True`. When a parameter update is
        performed, those caches are flushed afterwards.

        Parameters
        ----------
        retain_grads : bool
            Whether to include the intermediate parameter gradients computed
            during the backward pass in the final parameter update. Default is
            True.
        update_params : bool
            Whether to perform a single step of gradient descent on the layer
            weights and bias using the calculated gradients. If `retain_grads`
            is False, this option is ignored and the parameter gradients are
            not updated. Default is True.

        Returns
        -------
        dLdX : :py:class:`ndarray <numpy.ndarray>` of shape (`n_ex`, `n_c`, `n_in`) or list of arrays
            The gradient of the loss with regard to the layer input(s) `X`. A
            single array if exactly one input is cached, otherwise a list with
            one array per cached input.
        """
        assert self.trainable, "NCE loss is frozen"

        dX = []
        for input_idx, x in enumerate(self.X):
            dx, dw, db = self._grad(x, input_idx)
            dX.append(dx)

            if retain_grads:
                self.gradients["W"] += dw
                self.gradients["b"] += db

        dX = dX[0] if len(self.X) == 1 else dX

        if retain_grads and update_params:
            self.update()

        return dX

    def _grad(self, X, input_idx):
        """
        Actual computation of gradient wrt. loss weights + input.

        `input_idx` selects which cached forward pass (entry of each list in
        ``self.derived_variables``) the gradient is computed for. Returns the
        3-tuple (`dX`, `dW`, `dB`).
        """
        W, b = self.parameters["W"], self.parameters["b"]

        y_pred = self.derived_variables["y_pred"][input_idx]
        target = self.derived_variables["target"][input_idx]
        y_true = self.derived_variables["out_labels"][input_idx]
        Z_neg = self.derived_variables["noise_logits"][input_idx]
        Z_target = self.derived_variables["target_logits"][input_idx]
        neg_samples = self.derived_variables["noise_samples"][input_idx][0]

        # the number of target classes per minibatch example
        n_targets = 1

        # calculate the grad of the binary cross entropy wrt. the network
        # predictions
        preds, classes = y_pred.flatten(), y_true.flatten()

        dLdp_real = ((1 - classes) / (1 - preds)) - (classes / preds)
        dLdp_real = dLdp_real.reshape(*y_pred.shape)

        # partition the gradients into target and negative sample portions
        dLdy_pred_target = dLdp_real[..., :n_targets]
        dLdy_pred_neg = dLdp_real[..., n_targets:]

        # compute gradients of the loss wrt the data and noise logits
        dLdZ_target = dLdy_pred_target * self.act_fn.grad(Z_target)
        dLdZ_neg = dLdy_pred_neg * self.act_fn.grad(Z_neg)

        # compute param gradients on target + negative samples
        # (negative-sample gradients are summed over examples and contexts;
        # target gradients keep one entry / row per minibatch example)
        dB_neg = dLdZ_neg.sum(axis=(0, 1))
        dB_target = dLdZ_target.sum(axis=(1, 2))

        dW_neg = (dLdZ_neg.transpose(0, 2, 1) @ X).sum(axis=0)
        dW_target = (dLdZ_target.transpose(0, 2, 1) @ X).sum(axis=1)

        # TODO: can this be done with np.einsum instead?
        dX_target = np.vstack(
            [dLdZ_target[[ix]] @ W[[t]] for ix, t in enumerate(target)]
        )
        dX_neg = dLdZ_neg @ W[neg_samples]

        # an "accidental hit" is a target class that was also drawn as a
        # negative sample
        hits = list(set(target).intersection(set(neg_samples)))
        hit_ixs = [np.where(target == h)[0] for h in hits]

        # adjust param gradients if there's an accidental hit
        # (the target-side parameter gradients for those examples are
        # dropped; the input gradient `dX` is left untouched)
        if len(hits) != 0:
            hit_ixs = np.concatenate(hit_ixs)
            target = np.delete(target, hit_ixs)
            dB_target = np.delete(dB_target, hit_ixs)
            dW_target = np.delete(dW_target, hit_ixs, 0)

        dX = dX_target + dX_neg

        # use np.add.at to ensure that repeated indices in the target (or
        # possibly in neg_samples if sampling is done with replacement) are
        # properly accounted for
        dB = np.zeros_like(b).flatten()
        np.add.at(dB, target, dB_target)
        np.add.at(dB, neg_samples, dB_neg)
        dB = dB.reshape(*b.shape)

        dW = np.zeros_like(W)
        np.add.at(dW, target, dW_target)
        np.add.at(dW, neg_samples, dW_neg)

        return dX, dW, dB


================================================
FILE: numpy_ml/neural_nets/models/README.md
================================================
# Models

The models module implements popular full neural networks. It includes:

- `vae.py`: A Bernoulli variational autoencoder ([Kingma & Welling, 2014](https://arxiv.org/abs/1312.6114))
- `wgan_gp.py`: A Wasserstein generative adversarial network with gradient
  penalty ([Gulrajani et al., 2017](https://arxiv.org/pdf/1704.00028.pdf);
  [Goodfellow et al., 2014](https://papers.nips.cc/paper/5423-generative-adversarial-nets.pdf))
- `w2v.py`: A word2vec model with CBOW and skip-gram architectures and
  training via noise contrastive estimation ([Mikolov et al., 2013](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf))


================================================
FILE: numpy_ml/neural_nets/models/__init__.py
================================================
from .vae import *
from .wgan_gp import *
from .w2v import *


================================================
FILE: numpy_ml/neural_nets/models/vae.py
================================================
from time import time
from collections import OrderedDict

import numpy as np

from ..losses import VAELoss
from ..utils import minibatch
from ..activations import ReLU, Affine, Sigmoid
from ..layers import Conv2D, Pool2D, Flatten, FullyConnected


class BernoulliVAE(object):
    def __init__(
        self,
        T=5,
        latent_dim=256,
        enc_conv1_pad=0,
        enc_conv2_pad=0,
        enc_conv1_out_ch=32,
        enc_conv2_out_ch=64,
        enc_conv1_stride=1,
        enc_pool1_stride=2,
        enc_conv2_stride=1,
        enc_pool2_stride=1,
        enc_conv1_kernel_shape=(5, 5),
        enc_pool1_kernel_shape=(2, 2),
        enc_conv2_kernel_shape=(5, 5),
        enc_pool2_kernel_shape=(2, 2),
        optimizer="RMSProp(lr=0.0001)",
        init="glorot_uniform",
    ):
        """
        A variational autoencoder (VAE) with 2D convolutional encoder and Bernoulli
        input and output units.

        Notes
        -----
        The VAE architecture is

        .. code-block:: text

                            |-- t_mean ----|
            X -> [Encoder] -|              |--> [Sampler] -> [Decoder] -> X_recon
                            |-- t_log_var -|

        where ``[Encoder]`` is

        .. code-block:: text

            Conv1 -> ReLU -> MaxPool1 -> Conv2 -> ReLU ->
                MaxPool2 -> Flatten -> FC1 -> ReLU -> FC2

        ``[Decoder]`` is

        .. code-block:: text

            FC1 -> ReLU -> FC2 -> Sigmoid

        and ``[Sampler]`` draws a sample from the distribution

        .. math::

            \\mathcal{N}(\\text{t_mean}, \\exp \\left\\{\\text{t_log_var}\\right\\} I)

        using the reparameterization trick.

        Parameters
        ----------
        T : int
            The dimension of the variational parameter `t`. Default is 5.
        enc_conv1_pad : int
            The padding for the first convolutional layer of the encoder. Default is 0.
        enc_conv1_stride : int
            The stride for the first convolutional layer of the encoder. Default is 1.
        enc_conv1_out_ch : int
            The number of output channels for the first convolutional layer of
            the encoder. Default is 32.
        enc_conv1_kernel_shape : tuple
            The number of rows and columns in each filter of the first
            convolutional layer of the encoder. Default is (5, 5).
        enc_pool1_kernel_shape : tuple
            The number of rows and columns in the receptive field of the first
            max pool layer of the encoder. Default is (2, 2).
        enc_pool1_stride : int
            The stride for the first MaxPool layer of the encoder. Default is
            2.
        enc_conv2_pad : int
            The padding for the second convolutional layer of the encoder.
            Default is 0.
        enc_conv2_out_ch : int
            The number of output channels for the second convolutional layer of
            the encoder. Default is 64.
        enc_conv2_kernel_shape : tuple
            The number of rows and columns in each filter of the second
            convolutional layer of the encoder. Default is (5, 5).
        enc_conv2_stride : int
            The stride for the second convolutional layer of the encoder.
            Default is 1.
        enc_pool2_stride : int
            The stride for the second MaxPool layer of the encoder. Default is
            1.
        enc_pool2_kernel_shape : tuple
            The number of rows and columns in the receptive field of the second
            max pool layer of the encoder. Default is (2, 2).
        latent_dim : int
            The dimension of the output for the first FC layer of the encoder
            and of the first FC layer of the decoder. Default is 256.
        optimizer : str or :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object or None
            The optimization strategy to use when performing gradient updates.
            If None, use the :class:`~numpy_ml.neural_nets.optimizers.SGD`
            optimizer with default parameters. Default is "RMSProp(lr=0.0001)".
        init : str
            The weight initialization strategy. Valid entries are
            {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform',
            'std_normal', 'trunc_normal'}. Default is 'glorot_uniform'.
        """
        self.T = T
        self.init = init
        self.loss = VAELoss()
        self.optimizer = optimizer
        self.latent_dim = latent_dim
        self.enc_conv1_pad = enc_conv1_pad
        self.enc_conv2_pad = enc_conv2_pad
        self.enc_conv1_stride = enc_conv1_stride
        self.enc_conv1_out_ch = enc_conv1_out_ch
        self.enc_pool1_stride = enc_pool1_stride
        self.enc_conv2_out_ch = enc_conv2_out_ch
        self.enc_conv2_stride = enc_conv2_stride
        self.enc_pool2_stride = enc_pool2_stride
        self.enc_conv2_kernel_shape = enc_conv2_kernel_shape
        self.enc_pool2_kernel_shape = enc_pool2_kernel_shape
        self.enc_conv1_kernel_shape = enc_conv1_kernel_shape
        self.enc_pool1_kernel_shape = enc_pool1_kernel_shape

        # the layers read the attributes set above, so build them last
        self._init_params()

    def _init_params(self):
        self._dv = {}
        self._build_encoder()
        self._build_decoder()

    def _build_encoder(self):
        """
        Build the CNN encoder and store its layers, in forward order, in the
        ordered dict ``self.encoder``.

        The layer sequence is

            Conv1 -> ReLU -> MaxPool1 -> Conv2 -> ReLU -> MaxPool2 ->
                Flatten -> FC1 -> ReLU -> FC2

        The two fully-connected layers are stored under the keys ``"FC4"`` and
        ``"FC5"``. ``"FC5"`` is linear with ``2 * T`` outputs (presumably the
        concatenated `t_mean` and `t_log_var`; the split happens outside this
        method).
        """
        self.encoder = OrderedDict()
        self.encoder["Conv1"] = Conv2D(
            act_fn=ReLU(),
            init=self.init,
            pad=self.enc_conv1_pad,
            optimizer=self.optimizer,
            out_ch=self.enc_conv1_out_ch,
            stride=self.enc_conv1_stride,
            kernel_shape=self.enc_conv1_kernel_shape,
        )
        self.encoder["Pool1"] = Pool2D(
            mode="max",
            optimizer=self.optimizer,
            stride=self.enc_pool1_stride,
            kernel_shape=self.enc_pool1_kernel_shape,
        )
        self.encoder["Conv2"] = Conv2D(
            act_fn=ReLU(),
            init=self.init,
            pad=self.enc_conv2_pad,
            optimizer=self.optimizer,
            out_ch=self.enc_conv2_out_ch,
            stride=self.enc_conv2_stride,
            kernel_shape=self.enc_conv2_kernel_shape,
        )
        self.encoder["Pool2"] = Pool2D(
            mode="max",
            optimizer=self.optimizer,
            stride=self.enc_pool2_stride,
            kernel_shape=self.enc_pool2_kernel_shape,
        )
        self.encoder["Flatten3"] = Flatten(optimizer=self.optimizer)
        # pass the model-wide init strategy here as well, so that every
        # parameterized layer honors the user's `init` setting
        self.encoder["FC4"] = FullyConnected(
            n_out=self.latent_dim,
            act_fn=ReLU(),
            optimizer=self.optimizer,
            init=self.init,
        )
        self.encoder["FC5"] = FullyConnected(
            n_out=self.T * 2,
            optimizer=self.optimizer,
            act_fn=Affine(slope=1, intercept=0),
            init=self.init,
        )

    def _build_decoder(self):
        """
        Build the MLP decoder and store its layers, in forward order, in the
        ordered dict ``self.decoder``.

        The layer sequence is

            FC1 -> ReLU -> FC2 -> Sigmoid
        """
        layers = OrderedDict()
        self.decoder = layers

        # settings shared by both fully-connected layers
        shared = dict(optimizer=self.optimizer, init=self.init)

        layers["FC1"] = FullyConnected(
            n_out=self.latent_dim, act_fn=ReLU(), **shared
        )

        # NB. the output size of the last layer depends on the dimensionality
        # of X, so `n_out` is left as a placeholder here and is expected to be
        # filled in within the `forward` method
        layers["FC2"] = FullyConnected(n_out=None, act_fn=Sigmoid(), **shared)

    @property
    def parameters(self):
        return {
            "components": {
                "encoder": {k: v.parameters for k, v in self.encoder.items()},
                "decoder": {k: v.parameters for k, v in self.decoder.items()},
            }
        }

    @property
    def hyperparameters(self):
        """
        Dict of the model hyperparameters, the IDs of the encoder / decoder
        layers, and (under ``"components"``) the `hyperparameters` of each
        individual layer.
        """
        hp = {
            "layer": "BernoulliVAE",
            "T": self.T,
            "init": self.init,
            "loss": str(self.loss),
            "optimizer": self.optimizer,
            "latent_dim": self.latent_dim,
        }

        # encoder settings are stored on the instance under the same name as
        # the key they are reported under
        encoder_settings = (
            "enc_conv1_pad",
            "enc_conv2_pad",
            "enc_conv1_in_ch",
            "enc_conv1_stride",
            "enc_conv1_out_ch",
            "enc_pool1_stride",
            "enc_conv2_out_ch",
            "enc_conv2_stride",
            "enc_pool2_stride",
            "enc_conv2_kernel_shape",
            "enc_pool2_kernel_shape",
            "enc_conv1_kernel_shape",
            "enc_pool1_kernel_shape",
        )
        for setting in encoder_settings:
            hp[setting] = getattr(self, setting)

        hp["encoder_ids"] = list(self.encoder)
        hp["decoder_ids"] = list(self.decoder)
        hp["components"] = {
            "encoder": {
                layer_id: layer.hyperparameters
                for layer_id, layer in self.encoder.items()
            },
            "decoder": {
                layer_id: layer.hyperparameters
                for layer_id, layer in self.decoder.items()
            },
        }
        return hp

    @property
    def derived_variables(self):
        """
        Dict of the intermediate values cached during the forward / backward
        passes.

        Every known key is present with a default of `None`; any value stored
        in ``self._dv`` overrides its default. The `derived_variables` of the
        individual layers are reported under ``"components"``.
        """
        cached_names = (
            "noise",
            "t_mean",
            "t_log_var",
            "dDecoder_FC1_in",
            "dDecoder_t_mean",
            "dEncoder_FC5_out",
            "dDecoder_FC1_out",
            "dEncoder_FC4_out",
            "dEncoder_Pool2_out",
            "dEncoder_Conv2_out",
            "dEncoder_Pool1_out",
            "dEncoder_Conv1_out",
            "dDecoder_t_log_var",
            "dEncoder_Flatten3_out",
        )
        dv = dict.fromkeys(cached_names)
        dv["components"] = {
            "encoder": {
                layer_id: layer.derived_variables
                for layer_id, layer in self.encoder.items()
            },
            "decoder": {
                layer_id: layer.derived_variables
                for layer_id, layer in self.decoder.items()
            },
        }
        dv.update(self._dv)
        return dv

    @property
    def gradients(self):
        """
        Nested dict of the `gradients` of every encoder and decoder layer,
        keyed first by sub-network name and then by layer ID.
        """
        encoder_grads = {}
        for layer_id, layer in self.encoder.items():
            encoder_grads[layer_id] = layer.gradients

        decoder_grads = {}
        for layer_id, layer in self.decoder.items():
            decoder_grads[layer_id] = layer.gradients

        return {"components": {"encoder": encoder_grads, "decoder": decoder_grads}}

    def _sample(self, t_mean, t_log_var):
        """
        Returns a sample from the distribution

            q(t | x) = N(t_mean, diag(exp(t_log_var)))

        using the reparameterization trick.

        Parameters
        ----------
        t_mean : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, latent_dim)`
            Mean of the desired distribution.
        t_log_var : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, latent_dim)`
            Log variance vector of the desired distribution.

        Returns
        -------
        samples: :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, latent_dim)`
        """
        noise = np.random.normal(loc=0.0, scale=1.0, size=t_mean.shape)
        samples = noise * np.exp(t_log_var) + t_mean
        # save sampled noise for backward pass
        self._dv["noise"] = noise
        return samples

    def forward(self, X_train):
        """
        VAE forward pass.

        Encodes `X_train`, samples a latent value via ``self._sample``, and
        decodes the sample into a reconstruction. The encoder outputs are
        cached in ``self._dv`` under ``"t_mean"`` and ``"t_log_var"``.

        Parameters
        ----------
        X_train : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
            The minibatch of input volumes.

        Returns
        -------
        X_recon : :py:class:`ndarray <numpy.ndarray>`
            The output of the final decoder layer for the sampled latents.

        Raises
        ------
        ValueError
            If `X_train` is not 4-dimensional.
        """
        if len(X_train.shape) != 4:
            raise ValueError(
                "Expected X_train to have 4 dimensions "
                "(n_ex, in_rows, in_cols, in_ch), but got shape {}".format(
                    X_train.shape
                )
            )

        # The decoder output width is a placeholder until the input
        # dimensionality is known. Derive it from the batch itself rather than
        # from `self.N` (only set by `fit`) so that `forward` also works when
        # it is called before `fit`.
        fc2 = self.decoder["FC2"]
        if fc2.n_out is None:
            n_features = int(np.prod(X_train.shape[1:]))
            self.decoder["FC2"] = fc2.set_params({"n_out": n_features})

        # encode the training batch to estimate the mean and variance of the
        # variational distribution
        out = X_train
        for layer in self.encoder.values():
            out = layer.forward(out)

        # the first `T` encoder outputs are the mean and the remaining columns
        # the log variance of the variational distribution q(t | x)
        t_mean = out[:, : self.T]
        t_log_var = out[:, self.T :]

        # sample t from q(t | x) using the reparameterization trick
        t = self._sample(t_mean, t_log_var)

        # pass the sampled latent value, t, through the decoder to generate
        # the average reconstruction
        X_recon = t
        for layer in self.decoder.values():
            X_recon = layer.forward(X_recon)

        # cache the encoder outputs for the loss and the backward pass
        self._dv["t_mean"] = t_mean
        self._dv["t_log_var"] = t_log_var
        return X_recon

    def backward(self, X_train, X_recon):
        """
        VAE backward pass.

        Backpropagates the gradient of ``self.loss`` through the decoder, the
        sampling step, and the encoder, caching each intermediate gradient in
        ``self._dv``.

        Parameters
        ----------
        X_train : :py:class:`ndarray <numpy.ndarray>`
            The input batch; it is flattened to `(n_ex, -1)` before being
            passed to the loss gradient.
        X_recon : :py:class:`ndarray <numpy.ndarray>`
            The reconstruction returned by `forward` for `X_train`.

        Returns
        -------
        dX : :py:class:`ndarray <numpy.ndarray>`
            The value returned by the `backward` method of the first encoder
            layer (``"Conv1"``).
        """
        batch_size = X_train.shape[0]
        cached = self.derived_variables
        eps, mu, log_var = cached["noise"], cached["t_mean"], cached["t_log_var"]

        # gradients of the VAE loss wrt. the reconstruction and the
        # parameters of the variational distribution
        grad_recon, grad_loss_log_var, grad_loss_mean = self.loss.grad(
            X_train.reshape(batch_size, -1), X_recon, mu, log_var
        )

        # decoder: FC2 then FC1
        grad_fc1_out = self.decoder["FC2"].backward(grad_recon)
        grad_fc1_in = self.decoder["FC1"].backward(grad_fc1_out)

        # sampler: t = mu + eps * exp(log_var)
        grad_sample_log_var = grad_fc1_in * (eps * np.exp(log_var))
        grad_sample_mean = grad_fc1_in

        # the encoder output is [mean | log variance], so the incoming
        # gradients are stacked column-wise in the same order
        grad_fc5_out = np.hstack(
            [grad_sample_mean + grad_loss_mean, grad_sample_log_var + grad_loss_log_var]
        )

        # encoder: walk the layers from output to input
        upstream = grad_fc5_out
        encoder_grads = []
        for layer_id in ("FC5", "FC4", "Flatten3", "Pool2", "Conv2", "Pool1", "Conv1"):
            upstream = self.encoder[layer_id].backward(upstream)
            encoder_grads.append(upstream)

        (
            grad_fc4_out,
            grad_flatten3_out,
            grad_pool2_out,
            grad_conv2_out,
            grad_pool1_out,
            grad_conv1_out,
            dX,
        ) = encoder_grads

        self._dv.update(
            {
                "dDecoder_t_mean": grad_sample_mean,
                "dDecoder_FC1_in": grad_fc1_in,
                "dDecoder_FC1_out": grad_fc1_out,
                "dEncoder_FC5_out": grad_fc5_out,
                "dEncoder_FC4_out": grad_fc4_out,
                "dDecoder_t_log_var": grad_sample_log_var,
                "dEncoder_Pool2_out": grad_pool2_out,
                "dEncoder_Conv2_out": grad_conv2_out,
                "dEncoder_Pool1_out": grad_pool1_out,
                "dEncoder_Conv1_out": grad_conv1_out,
                "dEncoder_Flatten3_out": grad_flatten3_out,
            }
        )
        return dX

    def update(self, cur_loss=None):
        """
        Perform gradient updates.

        Each decoder layer and then each encoder layer is updated in reverse
        insertion order, after which all parameter gradients are flushed.

        Parameters
        ----------
        cur_loss : float or None
            Forwarded unchanged to the `update` method of every layer.
            Default is None.
        """
        for network in (self.decoder, self.encoder):
            for layer in reversed(list(network.values())):
                layer.update(cur_loss)
        self.flush_gradients()

    def flush_gradients(self):
        """
        Reset parameter gradients after update.

        Calls `flush_gradients` on every decoder layer and then on every
        encoder layer.
        """
        for network in (self.decoder, self.encoder):
            for layer in network.values():
                layer.flush_gradients()

    def fit(self, X_train, n_epochs=20, batchsize=128, verbose=True):
        """
        Fit the VAE to a training dataset.

        Records the training settings and the input dimensions on the
        instance, then runs `n_epochs` passes of minibatch training. A summary
        line is printed after every epoch; per-batch lines are printed only
        when `verbose` is True.

        Parameters
        ----------
        X_train : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
            The input volume
        n_epochs : int
            The maximum number of training epochs to run. Default is 20.
        batchsize : int
            The desired number of examples in each training batch. Default is 128.
        verbose : bool
            Print batch information during training. Default is True.
        """
        self.verbose = verbose
        self.n_epochs = n_epochs
        self.batchsize = batchsize

        # dimensions of a single example, and its size once flattened
        self.in_rows, self.in_cols, self.in_ch = X_train.shape[1:]
        self.N = self.in_rows * self.in_cols * self.in_ch

        batch_fmt = "\t[Batch {}/{}] Train loss: {:.3f} ({:.1f}s/batch)"
        epoch_fmt = "[Epoch {}] Avg. loss: {:.3f}  Delta: {:.3f} ({:.2f}m/epoch)"

        last_epoch_loss = np.inf
        for epoch in range(1, n_epochs + 1):
            epoch_loss = 0.0
            epoch_start = time()
            batches, n_batches = minibatch(X_train, batchsize, shuffle=True)

            # TODO: parallelize inner loop
            for batch_num, batch_ix in enumerate(batches, start=1):
                n_in_batch = len(batch_ix)
                batch_start = time()

                X_batch = X_train[batch_ix]
                # flattened copy of the batch, used as the target for the loss
                X_batch_flat = X_train[batch_ix].reshape(n_in_batch, -1)

                X_recon = self.forward(X_batch)
                latent_stats = self.derived_variables
                t_mean = latent_stats["t_mean"]
                t_log_var = latent_stats["t_log_var"]

                self.backward(X_batch, X_recon)
                batch_loss = self.loss(X_batch_flat, X_recon, t_mean, t_log_var)
                epoch_loss += batch_loss

                self.update(batch_loss)

                if self.verbose:
                    elapsed = time() - batch_start
                    print(batch_fmt.format(batch_num, n_batches, batch_loss, elapsed))

            epoch_loss /= n_batches
            minutes = (time() - epoch_start) / 60.0
            print(
                epoch_fmt.format(
                    epoch, epoch_loss, last_epoch_loss - epoch_loss, minutes
                )
            )
            last_epoch_loss = epoch_loss


================================================
FILE: numpy_ml/neural_nets/models/w2v.py
================================================
from time import time

import numpy as np

from ..layers import Embedding
from ..losses import NCELoss

from ...preprocessing.nlp import Vocabulary, tokenize_words
from ...utils.data_structures import DiscreteSampler


class Word2Vec(object):
    def __init__(
        self,
        context_len=5,
        min_count=None,
        skip_gram=False,
        max_tokens=None,
        embedding_dim=300,
        filter_stopwords=True,
        noise_dist_power=0.75,
        init="glorot_uniform",
        num_negative_samples=64,
        optimizer="SGD(lr=0.1)",
    ):
        """
        A word2vec model supporting both continuous bag of words (CBOW) and
        skip-gram architectures, with training via noise contrastive
        estimation.

        Parameters
        ----------
        context_len : int
            The number of words to the left and right of the current word to
            use as context during training. Larger values result in more
            training examples and thus can lead to higher accuracy at the
            expense of additional training time. Default is 5.
        min_count : int or None
            Minimum number of times a token must occur in order to be included
            in vocab. If None, include all tokens from `corpus_fp` in vocab.
            Default is None.
        skip_gram : bool
            Whether to train the skip-gram or CBOW model. If True (skip-gram),
            each word `i` is used as the input to predict each of the words in
            its surrounding context. If False (CBOW), the mean of the
            embeddings of the surrounding context, ``words[i - context:i]``
            and ``words[i + 1:i + 1 + context]``, is used to predict the
            target word `i`. Default is False.
        max_tokens : int or None
            Only add the first `max_tokens` most frequent tokens that occur
            more than `min_count` to the vocabulary.  If None, add all tokens
            that occur more than than `min_count`. Default is None.
        embedding_dim : int
            The number of dimensions in the final word embeddings. Default is
            300.
        filter_stopwords : bool
            Whether to remove stopwords before encoding the words in the
            corpus. Default is True.
        noise_dist_power : float
            The power the unigram count is raised to when computing the noise
            distribution for negative sampling. A value of 0 corresponds to a
            uniform distribution over tokens, and a value of 1 corresponds to a
            distribution proportional to the token unigram counts. Default is
            0.75.
        init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}
            The weight initialization strategy. Default is 'glorot_uniform'.
        num_negative_samples: int
            The number of negative samples to draw from the noise distribution
            for each positive training sample. Default is 64.
            NOTE(review): earlier documentation stated that a value of 0
            selects a hierarchical softmax formulation; this class only ever
            constructs an `NCELoss`, so that behavior should be confirmed.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient updates
            within the `update` method.  If None, use the
            :class:`~numpy_ml.neural_nets.optimizers.SGD` optimizer with
            default parameters. Default is ``"SGD(lr=0.1)"``.

        Attributes
        ----------
        parameters : dict
        hyperparameters : dict
        derived_variables : dict
        gradients : dict

        Notes
        -----
        The word2vec model is outlined in [1].

        CBOW architecture::

            w_{t-R}   ----|
            w_{t-R+1} ----|
            ...            --> Average --> Embedding layer --> [NCE Layer / HSoftmax] --> P(w_{t} | w_{...})
            w_{t+R-1} ----|
            w_{t+R}   ----|

        Skip-gram architecture::

                                                                   |-->  P(w_{t-R} | w_{t})
                                                                   |-->  P(w_{t-R+1} | w_{t})
            w_{t} --> Embedding layer --> [NCE Layer / HSoftmax] --|     ...
                                                                   |-->  P(w_{t+R-1} | w_{t})
                                                                   |-->  P(w_{t+R} | w_{t})

        where :math:`w_{i}` is the one-hot representation of the word at position
        `i` within a sentence in the corpus and `R` is the length of the context
        window on either side of the target word.

        References
        ----------
        .. [1] Mikolov et al. (2013). "Distributed representations of words
           and phrases and their compositionality," Proceedings of the 26th
           International Conference on Neural Information Processing Systems.
           https://arxiv.org/pdf/1310.4546.pdf
        """
        self.init = init
        self.optimizer = optimizer
        self.skip_gram = skip_gram
        self.min_count = min_count
        self.max_tokens = max_tokens
        self.context_len = context_len
        self.embedding_dim = embedding_dim
        self.filter_stopwords = filter_stopwords
        self.noise_dist_power = noise_dist_power
        self.num_negative_samples = num_negative_samples
        self.special_chars = set(["<unk>", "<eol>", "<bol>"])

        # initialized here (and reset in `_init_params`) so that the
        # `derived_variables` property is usable before `fit` is called, like
        # the `parameters`, `hyperparameters`, and `gradients` properties
        self._dv = {}

    def _init_params(self):
        """
        Build the noise sampler, the embedding layer, and the NCE loss.

        Requires ``self.vocab`` and ``self.vocab_size`` to have been set (see
        `fit`).
        """
        self._dv = {}
        self._build_noise_distribution()

        self.embeddings = Embedding(
            init=self.init,
            vocab_size=self.vocab_size,
            n_out=self.embedding_dim,
            optimizer=self.optimizer,
            # CBOW averages the context embeddings; skip-gram has a single
            # input word per example and so needs no pooling
            pool=None if self.skip_gram else "mean",
        )

        self.loss = NCELoss(
            init=self.init,
            optimizer=self.optimizer,
            n_classes=self.vocab_size,
            subtract_log_label_prob=False,
            noise_sampler=self._noise_sampler,
            num_negative_samples=self.num_negative_samples,
        )

    @property
    def parameters(self):
        """Model parameters"""
        param = {"components": {"embeddings": {}, "loss": {}}}
        if hasattr(self, "embeddings"):
            param["components"] = {
                "embeddings": self.embeddings.parameters,
                "loss": self.loss.parameters,
            }
        return param

    @property
    def hyperparameters(self):
        """Model hyperparameters"""
        hp = {
            "layer": "Word2Vec",
            "init": self.init,
            "skip_gram": self.skip_gram,
            "optimizer": self.optimizer,
            "min_count": self.min_count,
            "max_tokens": self.max_tokens,
            "context_len": self.context_len,
            "embedding_dim": self.embedding_dim,
            "noise_dist_power": self.noise_dist_power,
            "filter_stopwords": self.filter_stopwords,
            "num_negative_samples": self.num_negative_samples,
            "vocab_size": self.vocab_size if hasattr(self, "vocab_size") else None,
            "components": {"embeddings": {}, "loss": {}},
        }

        if hasattr(self, "embeddings"):
            hp["components"] = {
                "embeddings": self.embeddings.hyperparameters,
                "loss": self.loss.hyperparameters,
            }
        return hp

    @property
    def derived_variables(self):
        """Variables computed during model operation"""
        dv = {"components": {"embeddings": {}, "loss": {}}}
        dv.update(self._dv)

        if hasattr(self, "embeddings"):
            dv["components"] = {
                "embeddings": self.embeddings.derived_variables,
                "loss": self.loss.derived_variables,
            }
        return dv

    @property
    def gradients(self):
        """Model parameter gradients"""
        grad = {"components": {"embeddings": {}, "loss": {}}}
        if hasattr(self, "embeddings"):
            grad["components"] = {
                "embeddings": self.embeddings.gradients,
                "loss": self.loss.gradients,
            }
        return grad

    def forward(self, X, targets, retain_derived=True):
        """
        Evaluate the network on a single minibatch.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)`
            Layer input, representing a minibatch of `n_ex` examples, each
            consisting of `n_in` integer word indices
        targets : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex,)`
            Target word index for each example in the minibatch.
        retain_derived : bool
            Whether to retain the variables calculated during the forward pass
            for use later during backprop. If `False`, this suggests the layer
            will not be expected to backprop through wrt. this input. Default
            True.

        Returns
        -------
        loss : float
            The loss associated with the current minibatch
        y_pred : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex,)`
            The conditional probabilities of the words in `targets` given the
            corresponding example / context in `X`.
        """
        # honor the caller's `retain_derived` choice in both the embedding
        # layer and the loss
        X_emb = self.embeddings.forward(X, retain_derived=retain_derived)
        loss, y_pred = self.loss.loss(
            X_emb, targets.flatten(), retain_derived=retain_derived
        )
        return loss, y_pred

    def backward(self):
        """
        Compute the gradient of the loss wrt the current network parameters.
        """
        dX_emb = self.loss.grad(retain_grads=True, update_params=False)
        self.embeddings.backward(dX_emb)

    def update(self, cur_loss=None):
        """Perform gradient updates"""
        self.loss.update(cur_loss)
        self.embeddings.update(cur_loss)
        self.flush_gradients()

    def flush_gradients(self):
        """Reset parameter gradients after update"""
        self.loss.flush_gradients()
        self.embeddings.flush_gradients()

    def get_embedding(self, word_ids):
        """
        Retrieve the embeddings for a collection of word IDs.

        Parameters
        ----------
        word_ids : :py:class:`ndarray <numpy.ndarray>` of shape `(M,)`
            An array of word IDs to retrieve embeddings for.

        Returns
        -------
        embeddings : :py:class:`ndarray <numpy.ndarray>` of shape `(M, n_out)`
            The embedding vectors for each of the `M` word IDs.
        """
        if isinstance(word_ids, list):
            word_ids = np.array(word_ids)
        return self.embeddings.lookup(word_ids)

    def _build_noise_distribution(self):
        """
        Construct the noise distribution for use during negative sampling.

        For a word ``w`` in the corpus, the noise distribution is::

            P_n(w) = Count(w) ** noise_dist_power / Z

        where ``Z`` is a normalizing constant, and `noise_dist_power` is a
        hyperparameter of the model. Mikolov et al. report best performance
        using a `noise_dist_power` of 0.75.

        Raises
        ------
        ValueError
            If no vocabulary has been fit yet.
        """
        if not hasattr(self, "vocab"):
            raise ValueError("Must call `fit` before constructing noise distribution")

        power = self.noise_dist_power
        counts = np.array([token.count for token in self.vocab], dtype=float)
        probs = counts ** power
        probs /= np.sum(probs)
        self._noise_sampler = DiscreteSampler(probs, log=False, with_replacement=False)

    def _train_epoch(self, corpus_fps, encoding):
        """
        Run a single pass over the corpus and return the mean minibatch loss.

        Raises
        ------
        ValueError
            If the corpus does not yield a single training minibatch.
        """
        total_loss, n_batches, smooth_loss = 0.0, 0, None
        fstr = "[Batch {}] Loss: {:.5f} | Smoothed Loss: {:.5f}"

        for X, target in self.minibatcher(corpus_fps, encoding):
            loss = self._train_batch(X, target)
            total_loss += loss
            n_batches += 1

            if self.verbose:
                # exponential moving average, seeded with the first batch loss
                if smooth_loss is None:
                    smooth_loss = loss
                else:
                    smooth_loss = 0.99 * smooth_loss + 0.01 * loss
                print(fstr.format(n_batches, loss, smooth_loss))

        if n_batches == 0:
            raise ValueError(
                "No training examples could be generated from the corpus"
            )
        return total_loss / n_batches

    def _train_batch(self, X, target):
        """Run forward, backward, and update on one minibatch; return its loss."""
        loss, _ = self.forward(X, target)
        self.backward()
        self.update(loss)
        return loss

    def _package_minibatch(self, X_mb, target_mb):
        """Convert accumulated example lists into the arrays that are yielded."""
        if self.skip_gram:
            # every skip-gram example is a single word ID -> shape (batchsize, 1)
            X_mb = np.array(X_mb)[:, None]
        return X_mb, np.array(target_mb)[:, None]

    def minibatcher(self, corpus_fps, encoding):
        """
        A minibatch generator for skip-gram and CBOW models.

        Parameters
        ----------
        corpus_fps : str or list of strs
            The filepath / list of filepaths to the document(s) to be encoded.
            Each document is expected to be encoded as newline-separated
            string of text, with adjacent tokens separated by a whitespace
            character.
        encoding : str
            Specifies the text encoding for corpus. This value is passed
            directly to Python's `open` builtin. Common entries are either
            'utf-8' (no header byte), or 'utf-8-sig' (header byte).

        Yields
        ------
        X : list of length `batchsize` or :py:class:`ndarray <numpy.ndarray>` of shape (`batchsize`, `n_in`)
            The input IDs for a minibatch of examples. If ``self.skip_gram``
            is False, `X` will be a ragged list of `batchsize` variable-length
            arrays of context IDs. If ``self.skip_gram`` is True, each example
            is a single word ID and `X` will be returned as a
            :py:class:`ndarray <numpy.ndarray>` with one column.
        target : :py:class:`ndarray <numpy.ndarray>` of shape (`batchsize`, 1)
            The target IDs associated with each example in `X`
        """
        # a lone filepath would otherwise be iterated character-by-character
        if isinstance(corpus_fps, str):
            corpus_fps = [corpus_fps]

        batchsize = self.batchsize
        X_mb, target_mb = [], []

        for doc_fp in corpus_fps:
            with open(doc_fp, "r", encoding=encoding) as doc:
                for line in doc:
                    words = tokenize_words(
                        line, lowercase=True, filter_stopwords=self.filter_stopwords
                    )
                    word_ixs = self.vocab.words_to_indices(
                        self.vocab.filter(words, unk=False)
                    )
                    for word_loc, word in enumerate(word_ixs):
                        # since more distant words are usually less related to
                        # the target word, we downweight them by sampling from
                        # them less frequently during training. The upper
                        # bound of `randint` is exclusive, hence the `+ 1` to
                        # allow the full `context_len` window.
                        R = np.random.randint(1, self.context_len + 1)
                        left = word_ixs[max(word_loc - R, 0) : word_loc]
                        right = word_ixs[word_loc + 1 : word_loc + 1 + R]
                        context = left + right

                        if len(context) == 0:
                            continue

                        if self.skip_gram:
                            # skip-gram: `word` is the input and each of its
                            # context words is a separate target
                            X_mb.extend([word] * len(context))
                            target_mb.extend(context)
                        else:
                            # CBOW: the (averaged) context is the input and
                            # `word` is the target; X_mb is a ragged array
                            X_mb.append(np.array(context))
                            target_mb.append(word)

                        if len(target_mb) >= batchsize:
                            yield self._package_minibatch(X_mb, target_mb)
                            X_mb, target_mb = [], []

        # if we've reached the end of our final document and there are
        # remaining examples, yield the stragglers as a partial minibatch
        if len(X_mb) > 0:
            yield self._package_minibatch(X_mb, target_mb)

    def fit(
        self, corpus_fps, encoding="utf-8-sig", n_epochs=20, batchsize=128, verbose=True
    ):
        """
        Learn word2vec embeddings for the documents in `corpus_fps`.

        Parameters
        ----------
        corpus_fps : str or list of strs
            The filepath / list of filepaths to the document(s) to be encoded.
            Each document is expected to be encoded as newline-separated
            string of text, with adjacent tokens separated by a whitespace
            character.
        encoding : str
            Specifies the text encoding for corpus. Common entries are either
            'utf-8' (no header byte), or 'utf-8-sig' (header byte).  Default
            value is 'utf-8-sig'.
        n_epochs : int
            The maximum number of training epochs to run. Default is 20.
        batchsize : int
            The desired number of examples in each training batch. Default is
            128.
        verbose : bool
            Print batch information during training. Default is True.
        """
        self.verbose = verbose
        self.n_epochs = n_epochs
        self.batchsize = batchsize

        self.vocab = Vocabulary(
            lowercase=True,
            min_count=self.min_count,
            max_tokens=self.max_tokens,
            filter_stopwords=self.filter_stopwords,
        )
        self.vocab.fit(corpus_fps, encoding=encoding)
        self.vocab_size = len(self.vocab)

        # ignore special characters when training the model
        # NOTE(review): the noise distribution reads `token.count` rather than
        # `vocab.counts` - confirm that zeroing `counts` here affects it
        for sp in self.special_chars:
            self.vocab.counts[sp] = 0

        # now that we know our vocabulary size, we can initialize the embeddings
        self._init_params()

        prev_loss = np.inf
        fstr = "[Epoch {}] Avg. loss: {:.3f}  Delta: {:.3f} ({:.2f}m/epoch)"
        for i in range(n_epochs):
            estart = time()
            loss = self._train_epoch(corpus_fps, encoding)
            print(fstr.format(i + 1, loss, prev_loss - loss, (time() - estart) / 60.0))
            prev_loss = loss


================================================
FILE: numpy_ml/neural_nets/models/wgan_gp.py
================================================
from time import time
from collections import OrderedDict

import numpy as np

from ..utils import minibatch
from ..layers import FullyConnected
from ..losses import WGAN_GPLoss


class WGAN_GP(object):
    """
    A Wasserstein generative adversarial network (WGAN) architecture with
    gradient penalty (GP).

    Notes
    -----
    In contrast to a regular WGAN, WGAN-GP uses gradient penalty on the
    critic rather than weight clipping to encourage the 1-Lipschitz
    constraint:

    .. math::

        | \\text{Critic}(\mathbf{x}_1) - \\text{Critic}(\mathbf{x}_2) |
            \leq |\mathbf{x}_1 - \mathbf{x}_2 | \ \ \ \ \\forall \mathbf{x}_1, \mathbf{x}_2

    In other words, the critic must have input gradients with a norm of at
    most 1 under the :math:`\mathbf{X}_{real}` and :math:`\mathbf{X}_{fake}`
    data distributions.

    To enforce this constraint, WGAN-GP penalizes the model if the critic
    gradient norm moves away from a target norm of 1. See
    :class:`~numpy_ml.neural_nets.losses.WGAN_GPLoss` for more details.

    In contrast to a standard WGAN, WGAN-GP avoids using BatchNorm in the
    critic, as correlation between samples in a batch can impact the stability
    of the gradient penalty.

    WGAN-GP architecture:

    .. code-block:: text

        X_real ------------------------|
                                        >---> [Critic] --> Y_out
        Z --> [Generator] --> X_fake --|

    where ``[Generator]`` is

    .. code-block:: text

        FC1 -> ReLU -> FC2 -> ReLU -> FC3 -> ReLU -> FC4

    and ``[Critic]`` is

    .. code-block:: text

        FC1 -> ReLU -> FC2 -> ReLU -> FC3 -> ReLU -> FC4

    and

    .. math::

        Z \sim \mathcal{N}(0, 1)
    """

    def __init__(
        self,
        g_hidden=512,
        init="he_uniform",
        optimizer="RMSProp(lr=0.0001)",
        debug=False,
    ):
        """
        Wasserstein generative adversarial network with gradient penalty.

        Parameters
        ----------
        g_hidden : int
            The number of units in the critic and generator hidden layers.
            Default is 512.
        init : str
            The weight initialization strategy. Valid entries are
            {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform',
            'std_normal', 'trunc_normal'}. Default is "he_uniform".
        optimizer : str or :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object or None
            The optimization strategy to use when performing gradient updates.
            If None, use the :class:`~numpy_ml.neural_nets.optimizers.SGD`
            optimizer with default parameters. Default is "RMSProp(lr=0.0001)".
        debug : bool
            Whether to store additional intermediate output within
            ``self.derived_variables``. Default is False.
        """
        self.init = init
        self.debug = debug
        self.g_hidden = g_hidden
        self.optimizer = optimizer

        self.lambda_ = None
        self.n_steps = None
        self.batchsize = None

        self.is_initialized = False

    def _init_params(self):
        self._dv = {}
        self._gr = {}
        self._build_critic()
        self._build_generator()
        self.is_initialized = True

    def _build_generator(self):
        """
        Assemble the generator as an ordered stack of fully-connected layers.

        .. code-block:: text

            FC1 -> ReLU -> FC2 -> ReLU -> FC3 -> ReLU -> FC4

        The three hidden layers have ``g_hidden`` units each; the output layer
        has ``n_feats`` units and an identity (affine) activation.
        """
        layers = self.generator = OrderedDict()

        # hidden layers: identical width, activation, optimizer and init
        for idx in (1, 2, 3):
            layers["FC{}".format(idx)] = FullyConnected(
                self.g_hidden, act_fn="ReLU", optimizer=self.optimizer, init=self.init
            )

        # linear output layer sized to the data dimensionality
        layers["FC4"] = FullyConnected(
            self.n_feats,
            act_fn="Affine(slope=1, intercept=0)",
            optimizer=self.optimizer,
            init=self.init,
        )

    def _build_critic(self):
        """
        Assemble the critic as an ordered stack of fully-connected layers.

        .. code-block:: text

            FC1 -> ReLU -> FC2 -> ReLU -> FC3 -> ReLU -> FC4

        The three hidden layers have ``g_hidden`` units each; the output layer
        has a single unit with an identity (affine) activation.
        """
        layers = self.critic = OrderedDict()

        # hidden layers: identical width, activation, optimizer and init
        for idx in (1, 2, 3):
            layers["FC{}".format(idx)] = FullyConnected(
                self.g_hidden, act_fn="ReLU", optimizer=self.optimizer, init=self.init
            )

        # scalar, unbounded score for each example
        layers["FC4"] = FullyConnected(
            1,
            act_fn="Affine(slope=1, intercept=0)",
            optimizer=self.optimizer,
            init=self.init,
        )

    @property
    def hyperparameters(self):
        return {
            "init": self.init,
            "lambda_": self.lambda_,
            "g_hidden": self.g_hidden,
            "n_steps": self.n_steps,
            "optimizer": self.optimizer,
            "batchsize": self.batchsize,
            "c_updates_per_epoch": self.c_updates_per_epoch,
            "components": {
                "critic": {k: v.hyperparameters for k, v in self.critic.items()},
                "generator": {k: v.hyperparameters for k, v in self.generator.items()},
            },
        }

    @property
    def parameters(self):
        return {
            "components": {
                "critic": {k: v.parameters for k, v in self.critic.items()},
                "generator": {k: v.parameters for k, v in self.generator.items()},
            }
        }

    @property
    def derived_variables(self):
        C = self.critic.items()
        G = self.generator.items()
        dv = {
            "components": {
                "critic": {k: v.derived_variables for k, v in C},
                "generator": {k: v.derived_variables for k, v in G},
            }
        }
        dv.update(self._dv)
        return dv

    @property
    def gradients(self):
        grads = {
            "dC_Y_fake": None,
            "dC_Y_real": None,
            "dG_Y_fake": None,
            "dC_gradInterp": None,
            "components": {
                "critic": {k: v.gradients for k, v in self.critic.items()},
                "generator": {k: v.gradients for k, v in self.generator.items()},
            },
        }
        grads.update(self._gr)
        return grads

    def forward(self, X, module, retain_derived=True):
        """
        Perform the forward pass for either the generator or the critic.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(batchsize, \*)`
            Input data
        module : {'C' or 'G'}
            Whether to perform the forward pass for the critic ('C') or for the
            generator ('G').
        retain_derived : bool
            Whether to retain the variables calculated during the forward pass
            for use later during backprop. If False, this suggests the layer
            will not be expected to backprop through wrt. this input. Default
            is True.

        Returns
        -------
        out : :py:class:`ndarray <numpy.ndarray>` of shape `(batchsize, \*)`
            The output of the final layer of the module.
        Xs : dict
            A dictionary with layer ids as keys and values corresponding to the
            input to each intermediate layer during the forward pass. Useful
            during debugging.
        """
        if module == "G":
            mod = self.generator
        elif module == "C":
            mod = self.critic
        else:
            raise ValueError("Unrecognized module name: {}".format(module))

        Xs = {}
        out, rd = X, retain_derived
        for k, v in mod.items():
            Xs[k] = out
            out = v.forward(out, retain_derived=rd)
        return out, Xs

    def backward(self, grad, module, retain_grads=True):
        """
        Perform the backward pass for either the generator or the critic.

        Parameters
        ----------
        grad : :py:class:`ndarray <numpy.ndarray>` of shape `(batchsize, \*)` or list of arrays
            Gradient of the loss with respect to module output(s).
        module : {'C' or 'G'}
            Whether to perform the backward pass for the critic ('C') or for the
            generator ('G').
        retain_grads : bool
            Whether to include the intermediate parameter gradients computed
            during the backward pass in the final parameter update. Default is True.

        Returns
        -------
        out : :py:class:`ndarray <numpy.ndarray>` of shape `(batchsize, \*)`
            The gradient of the loss with respect to the module input.
        dXs : dict
            A dictionary with layer ids as keys and values corresponding to the
            input to each intermediate layer during the backward pass. Useful
            during debugging.
        """
        if module == "G":
            mod = self.generator
        elif module == "C":
            mod = self.critic
        else:
            raise ValueError("Unrecognized module name: {}".format(module))

        dXs = {}
        out, rg = grad, retain_grads
        for k, v in reversed(list(mod.items())):
            dXs[k] = out
            out = v.backward(out, retain_grads=rg)
        return out, dXs

    def _dGradInterp(self, dLdGradInterp, dYi_outs):
        """
        Compute the gradient penalty's contribution to the critic loss and
        update the parameter gradients accordingly.

        Parameters
        ----------
        dLdGradInterp : :py:class:`ndarray <numpy.ndarray>` of shape `(batchsize, critic_in_dim)`
            Gradient of `Y_interp` with respect to `X_interp`.
        dYi_outs : dict
            The intermediate outputs generated during the backward pass when
            computing `dLdGradInterp`.
        """
        dy = dLdGradInterp
        for k, v in self.critic.items():
            X = v.X[-1]  # layer input during forward pass
            dy, dW, dB = v._bwd2(dy, X, dYi_outs[k][2])
            self.critic[k].gradients["W"] += dW
            self.critic[k].gradients["b"] += dB

    def update_critic(self, X_real):
        """
        Compute parameter gradients for the critic on a single minibatch.

        Notes
        -----
        This method only accumulates gradients in the critic layers; it does
        not apply them (see :meth:`update`). Three batches are pushed through
        the critic in a fixed order -- real, fake, interpolated -- and the
        later backward calls address them by position (``[real, fake,
        interp]``), so the order of the forward calls must not be changed.
        Presumably each layer caches one input per forward call -- confirm
        against the layer implementation.

        The interpolation weights and the critic scores on the fake batch are
        always cached in ``self._dv`` (as ``"alpha"`` and ``"Y_fake"``); when
        ``self.debug`` is True many more intermediate values are stored there.

        Parameters
        ----------
        X_real : :py:class:`ndarray <numpy.ndarray>` of shape `(batchsize, n_feats)`
            Input data.

        Returns
        -------
        C_loss : float
            The critic loss on the current data.
        """
        # start from zeroed critic gradients
        self.flush_gradients("C")

        n_ex = X_real.shape[0]
        # the generator input noise has the same shape as the real batch
        noise = np.random.randn(*X_real.shape)

        # generate and score the real and fake data
        X_fake, Xf_outs = self.forward(noise, "G")
        Y_real, Yr_outs = self.forward(X_real, "C")
        Y_fake, Yf_outs = self.forward(X_fake, "C")

        # sample a random point on the linear interpolation between real and
        # fake data and compute its score
        alpha = np.random.rand(n_ex, 1)  # one mixing weight per example
        X_interp = alpha * X_real + (1 - alpha) * X_fake
        Y_interp, Yi_outs = self.forward(X_interp, "C")

        # compute the gradient of Y_interp wrt. X_interp
        # Note that we don't save intermediate gradients here since this is not
        # the real backward pass
        # (the zeros mask out the real and fake batches; only the third,
        # interpolated, slot receives a non-zero upstream gradient)
        dLdy = [0, 0, np.ones_like(Y_interp)]
        (_, _, gradInterp), dYi_outs = self.backward(dLdy, "C", retain_grads=False)

        # calculate critic loss and differentiate with respect to each term
        C_loss = self.loss(Y_fake, "C", Y_real, gradInterp)
        dY_real, dY_fake, dGrad_interp = self.loss.grad(Y_fake, "C", Y_real, gradInterp)

        # compute `dY_real` and `dY_fake` contributions to critic loss, update
        # param gradients accordingly
        # (the interpolated slot gets 0 here; its contribution is added below)
        self.backward([dY_real, dY_fake, 0], "C")

        # compute `gradInterp`'s contribution to the critic loss, updating
        # param gradients accordingly
        self._dGradInterp(dGrad_interp, dYi_outs)

        # cache intermediate vars for the generator update
        self._dv["alpha"] = alpha
        self._dv["Y_fake"] = Y_fake

        # log additional intermediate values for debugging
        if self.debug:
            self._dv["G_fwd_X_fake"] = {}
            self._dv["C_fwd_Y_real"] = {}
            self._dv["C_fwd_Y_fake"] = {}
            self._dv["C_fwd_Y_interp"] = {}

            N = len(self.critic.keys())
            N2 = len(self.generator.keys())

            # the debug dicts are 0-indexed: entry "FC<i>" holds the *input*
            # to layer "FC<i+1>"
            for i in range(N2):
                self._dv["G_fwd_X_fake"]["FC" + str(i)] = Xf_outs["FC" + str(i + 1)]

            for i in range(N):
                self._dv["C_fwd_Y_real"]["FC" + str(i)] = Yr_outs["FC" + str(i + 1)]
                self._dv["C_fwd_Y_fake"]["FC" + str(i)] = Yf_outs["FC" + str(i + 1)]
                self._dv["C_fwd_Y_interp"]["FC" + str(i)] = Yi_outs["FC" + str(i + 1)]

            # the final entry of each debug dict is the network output itself
            self._dv["C_fwd_Y_real"]["FC" + str(N)] = Y_real
            self._dv["C_fwd_Y_fake"]["FC" + str(N)] = Y_fake
            self._dv["G_fwd_X_fake"]["FC" + str(N2)] = X_fake
            self._dv["C_fwd_Y_interp"]["FC" + str(N)] = Y_interp
            # keep only the interpolated-batch slot of each backward record
            self._dv["C_dY_interp_wrt"] = {k: v[2] for k, v in dYi_outs.items()}

            self._dv["noise"] = noise
            self._dv["X_fake"] = X_fake
            self._dv["X_real"] = X_real
            self._dv["Y_real"] = Y_real
            self._dv["Y_fake"] = Y_fake
            self._dv["C_loss"] = C_loss
            self._dv["dY_real"] = dY_real
            self._dv["dC_Y_fake"] = dY_fake
            self._dv["X_interp"] = X_interp
            self._dv["Y_interp"] = Y_interp
            self._dv["gradInterp"] = gradInterp
            self._dv["dGrad_interp"] = dGrad_interp

        return C_loss

    def update_generator(self, X_shape):
        """
        Compute parameter gradients for the generator on a single minibatch.

        Parameters
        ----------
        X_shape : tuple of `(batchsize, n_feats)`
            Shape for the input batch.

        Returns
        -------
        G_loss : float
            The generator loss on the fake data (generated during the critic
            update)
        """
        self.flush_gradients("G")
        Y_fake = self.derived_variables["Y_fake"]

        n_ex, _ = Y_fake.shape
        G_loss = -Y_fake.mean()
        dG_loss = -np.ones_like(Y_fake) / n_ex
        self.backward(dG_loss, "G")

        if self.debug:
            self._dv["G_loss"] = G_loss
            self._dv["dG_Y_fake"] = dG_loss

        return G_loss

    def flush_gradients(self, module):
        """Reset parameter gradients to 0 after an update."""
        if module == "G":
            mod = self.generator
        elif module == "C":
            mod = self.critic
        else:
            raise ValueError("Unrecognized module name: {}".format(module))

        for k, v in mod.items():
            v.flush_gradients()

    def update(self, module, module_loss=None):
        """Perform gradient updates and flush gradients upon completion"""
        if module == "G":
            mod = self.generator
        elif module == "C":
            mod = self.critic
        else:
            raise ValueError("Unrecognized module name: {}".format(module))

        for k, v in reversed(list(mod.items())):
            v.update(module_loss)
        self.flush_gradients(module)

    def fit(
        self,
        X_real,
        lambda_,
        n_steps=1000,
        batchsize=128,
        c_updates_per_epoch=5,
        verbose=True,
    ):
        """
        Fit WGAN_GP on a training dataset.

        Each of the `n_steps` epochs performs up to `c_updates_per_epoch`
        critic updates followed by a single generator update. When
        ``self.debug`` is True the gradients are computed but never applied,
        so they can be inspected.

        Parameters
        ----------
        X_real : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_feats)`
            Training dataset
        lambda_ : float
            Gradient penalty coefficient for the critic loss
        n_steps : int
            The maximum number of generator updates to perform. Default is
            1000.
        batchsize : int
            Number of examples to use in each training minibatch. Default is
            128.
        c_updates_per_epoch : int
            The number of critic updates to perform at each generator update.
            Default is 5.
        verbose : bool
            Print loss values after each update. If False, only print loss
            every 100 steps. Default is True.

        Raises
        ------
        ValueError
            If an epoch yields no critic batch (empty `X_real` or
            `c_updates_per_epoch` < 1), since the generator update needs the
            values cached by the critic update.
        """
        self.lambda_ = lambda_
        self.verbose = verbose
        self.n_steps = n_steps
        self.batchsize = batchsize
        self.c_updates_per_epoch = c_updates_per_epoch

        # adjust output of the generator to match the dimensionality of X
        if not self.is_initialized:
            self.n_feats = X_real.shape[1]
            self._init_params()

        # (re-)initialize loss
        prev_C, prev_G = np.inf, np.inf
        self.loss = WGAN_GPLoss(lambda_=self.lambda_)

        # training loop
        NC, NG = self.c_updates_per_epoch, self.n_steps
        for i in range(NG):
            estart = time()
            C_loss, X_batch = None, None

            # NOTE(review): with `shuffle=False` the batch generator is
            # recreated each epoch, so presumably every epoch sees the same
            # leading `NC` batches of `X_real` -- confirm this is intended.
            batch_generator, _ = minibatch(X_real, batchsize, shuffle=False)

            # `zip` stops at whichever is shorter: NC or the number of batches
            for j, b_ix in zip(range(NC), batch_generator):
                bstart = time()
                X_batch = X_real[b_ix]
                C_loss = self.update_critic(X_batch)

                # for testing, don't perform gradient update so we can inspect each grad
                if not self.debug:
                    self.update("C", C_loss)

                if self.verbose:
                    fstr = "\t[Critic batch {}] Critic loss: {:.3f} {:.3f}∆ ({:.1f}s/batch)"
                    print(fstr.format(j + 1, C_loss, prev_C - C_loss, time() - bstart))
                    prev_C = C_loss

            # fail with a clear message instead of an UnboundLocalError
            if X_batch is None:
                raise ValueError(
                    "No critic update was performed: `X_real` must be non-empty "
                    "and `c_updates_per_epoch` must be at least 1"
                )

            # generator update
            G_loss = self.update_generator(X_batch.shape)

            # for testing, don't perform gradient update so we can inspect each grad
            if not self.debug:
                self.update("G", G_loss)

            # summary line on every 100th step, as documented for `verbose`
            if i % 100 == 0:
                fstr = "[Epoch {}] Gen. loss: {:.3f}  Critic loss: {:.3f}"
                print(fstr.format(i + 1, G_loss, C_loss))

            elif self.verbose:
                fstr = "[Epoch {}] Gen. loss: {:.3f}  {:.3f}∆ ({:.1f}s/epoch)"
                print(fstr.format(i + 1, G_loss, prev_G - G_loss, time() - estart))

            # always track the last loss so the printed delta spans one epoch
            prev_G = G_loss


================================================
FILE: numpy_ml/neural_nets/modules/README.md
================================================
# Modules

The `modules.py` module implements common multi-layer blocks that appear across
many modern deep networks. It includes:

- Bidirectional LSTMs ([Schuster & Paliwal, 1997](https://pdfs.semanticscholar.org/4b80/89bc9b49f84de43acc2eb8900035f7d492b2.pdf))
- ResNet-style "identity" (i.e., `same`-convolution) residual blocks ([He et al., 2015](https://arxiv.org/pdf/1512.03385.pdf))
- ResNet-style "convolutional" (i.e., parametric) residual blocks ([He et al., 2015](https://arxiv.org/pdf/1512.03385.pdf))
- WaveNet-style residual block with dilated causal convolutions ([van den Oord et al., 2016](https://arxiv.org/pdf/1609.03499.pdf))
- Transformer-style multi-headed dot-product attention ([Vaswani et al., 2017](https://arxiv.org/pdf/1706.03762.pdf))


================================================
FILE: numpy_ml/neural_nets/modules/__init__.py
================================================
from .modules import *


================================================
FILE: numpy_ml/neural_nets/modules/modules.py
================================================
from abc import ABC, abstractmethod

import re
import numpy as np

from ..wrappers import Dropout
from ..utils import calc_pad_dims_2D
from ..activations import Tanh, Sigmoid, ReLU, LeakyReLU, Affine
from ..layers import (
    DotProductAttention,
    FullyConnected,
    BatchNorm2D,
    Conv1D,
    Conv2D,
    Multiply,
    LSTMCell,
    Add,
)


class ModuleBase(ABC):
    """
    Abstract base class for modules assembled from several component layers.

    The concrete methods below rely on the subclass exposing
    ``hyperparameters`` (a mapping that includes ``"layer"`` and a
    ``"component_ids"`` list naming the attributes that hold the components)
    and ``parameters`` (a mapping).
    """

    def __init__(self):
        self.X = None
        self.trainable = True

        super().__init__()

    @abstractmethod
    def _init_params(self, **kwargs):
        """Create the module's components. Must be overridden."""
        raise NotImplementedError

    @abstractmethod
    def forward(self, z, **kwargs):
        """Compute the module output. Must be overridden."""
        raise NotImplementedError

    @abstractmethod
    def backward(self, out, **kwargs):
        """Backpropagate through the module. Must be overridden."""
        raise NotImplementedError

    @property
    def components(self):
        """A list of the component objects named in
        ``hyperparameters["component_ids"]`` that exist as attributes."""
        component_ids = self.hyperparameters["component_ids"]
        return [getattr(self, cid) for cid in component_ids if hasattr(self, cid)]

    def freeze(self):
        """Mark the module and each of its components as non-trainable."""
        self.trainable = False
        for c in self.components:
            c.freeze()

    def unfreeze(self):
        """Mark the module and each of its components as trainable."""
        self.trainable = True
        for c in self.components:
            c.unfreeze()

    def update(self, cur_loss=None):
        """
        Update every component, then flush the module's gradients.

        Parameters
        ----------
        cur_loss : float or None
            Passed unchanged to each component's ``update``. Default is None.
        """
        assert self.trainable, "Layer is frozen"
        for c in self.components:
            c.update(cur_loss)
        self.flush_gradients()

    def flush_gradients(self):
        """Clear the cached inputs and derived variables, and zero the
        gradients of every component."""
        assert self.trainable, "Layer is frozen"

        self.X = []
        self._dv = {}
        for c in self.components:
            for k, v in c.derived_variables.items():
                c.derived_variables[k] = None

            for k, v in c.gradients.items():
                c.gradients[k] = np.zeros_like(v)

    @staticmethod
    def _act_fn_from_str(desc):
        """Return the activation object described by the string `desc`, or
        `desc` itself when the description is not recognized."""
        if desc == "ReLU":
            return ReLU()
        if desc == "Sigmoid":
            return Sigmoid()
        if desc == "Tanh":
            return Tanh()

        affine = re.match(r"Affine\(slope=(.*), intercept=(.*)\)", desc)
        if affine is not None:
            slope, intercept = affine.groups()
            return Affine(float(slope), float(intercept))

        leaky = re.match(r"Leaky ReLU\(alpha=(.*)\)", desc)
        if leaky is not None:
            return LeakyReLU(float(leaky.groups()[0]))

        return desc

    def set_params(self, summary_dict):
        """
        Set the module parameters and hyperparameters from a summary
        dictionary.

        Parameters
        ----------
        summary_dict : dict
            A dictionary with ``"parameters"`` and ``"hyperparameters"`` keys,
            laid out like the output of :meth:`summary`. Entries under a
            nested ``"components"`` key are forwarded to the ``set_params``
            method of the component with the matching id; ids not listed in
            ``hyperparameters["component_ids"]`` are ignored.

        Notes
        -----
        An ``"act_fn"`` hyperparameter given as a string ("ReLU", "Sigmoid",
        "Tanh", "Affine(slope=..., intercept=...)" or "Leaky ReLU(alpha=...)")
        is converted to the corresponding activation object; any other value
        is stored as is.
        """
        cids = self.hyperparameters["component_ids"]

        for k, v in summary_dict["parameters"].items():
            if k == "components":
                for c, cd in v.items():
                    if c in cids:
                        getattr(self, c).set_params(cd)

            elif k in self.parameters:
                self.parameters[k] = v

        for k, v in summary_dict["hyperparameters"].items():
            if k == "components":
                for c, cd in v.items():
                    if c in cids:
                        getattr(self, c).set_params(cd)

            if k in self.hyperparameters:
                # only string descriptions need decoding; activation objects
                # (or None) are stored untouched
                if k == "act_fn" and isinstance(v, str):
                    v = self._act_fn_from_str(v)
                self.hyperparameters[k] = v

    def summary(self):
        """Return a dictionary with the module's parameters, layer name and
        hyperparameters."""
        return {
            "parameters": self.parameters,
            "layer": self.hyperparameters["layer"],
            "hyperparameters": self.hyperparameters,
        }


class WavenetResidualModule(ModuleBase):
    def __init__(
        self,
        ch_residual,
        ch_dilation,
        dilation,
        kernel_width,
        optimizer=None,
        init="glorot_uniform",
    ):
        """
        A WaveNet-like residual block with causal dilated convolutions.

        .. code-block:: text

            *Skip path in* >-------------------------------------------> + ---> *Skip path out*
                              Causal      |--> Tanh --|                  |
            *Main    |--> Dilated Conv1D -|           * --> 1x1 Conv1D --|
             path >--|                    |--> Sigm --|                  |
             in*     |-------------------------------------------------> + ---> *Main path out*
                                         *Residual path*

        On the final block, the output of the skip path is further processed to
        produce the network predictions.

        References
        ----------
        .. [1] van den Oord et al. (2016). "Wavenet: a generative model for raw
           audio". https://arxiv.org/pdf/1609.03499.pdf

        Parameters
        ----------
        ch_residual : int
            The number of output channels for the 1x1
            :class:`~numpy_ml.neural_nets.layers.Conv1D` layer in the main path.
        ch_dilation : int
            The number of output channels for the causal dilated
            :class:`~numpy_ml.neural_nets.layers.Conv1D` layer in the main path.
        dilation : int
            The dilation rate for the causal dilated
            :class:`~numpy_ml.neural_nets.layers.Conv1D` layer in the main path.
        kernel_width : int
            The requested width of the causal dilated
            :class:`~numpy_ml.neural_nets.layers.Conv1D` kernel in the main
            path. Stored on the instance as ``kernel_width``.
        init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}
            The weight initialization strategy. Default is 'glorot_uniform'.
        optimizer : str or :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method.  If None, use the
            :class:`~numpy_ml.neural_nets.optimizers.SGD` optimizer with default
            parameters. Default is None.
        """
        super().__init__()

        # convolution geometry
        self.kernel_width = kernel_width
        self.dilation = dilation
        self.ch_dilation = ch_dilation
        self.ch_residual = ch_residual

        # settings handed to every trainable component
        self.optimizer = optimizer
        self.init = init

        # all settings are stored, so the component layers can be created
        self._init_params()

    def _init_params(self):
        """
        Create the component layers and activation functions of the block.

        Notes
        -----
        The causal dilated convolution uses ``self.kernel_width`` as its
        kernel width, so the `kernel_width` constructor argument (which is
        also reported in ``hyperparameters``) is honoured rather than being
        silently replaced by a fixed width of 2.
        """
        self._dv = {}

        # main path: causal dilated convolution feeding the two gates
        self.conv_dilation = Conv1D(
            stride=1,
            pad="causal",
            init=self.init,
            kernel_width=self.kernel_width,
            dilation=self.dilation,
            out_ch=self.ch_dilation,
            optimizer=self.optimizer,
            act_fn=Affine(slope=1, intercept=0),
        )

        # gating unit: the tanh and sigmoid branches are multiplied together
        self.tanh = Tanh()
        self.sigm = Sigmoid()
        self.multiply_gate = Multiply(act_fn=Affine(slope=1, intercept=0))

        # 1x1 convolution mapping the gated signal to `ch_residual` channels
        self.conv_1x1 = Conv1D(
            stride=1,
            pad="same",
            dilation=0,
            init=self.init,
            kernel_width=1,
            out_ch=self.ch_residual,
            optimizer=self.optimizer,
            act_fn=Affine(slope=1, intercept=0),
        )

        # element-wise sums for the residual and skip connections
        self.add_residual = Add(act_fn=Affine(slope=1, intercept=0))
        self.add_skip = Add(act_fn=Affine(slope=1, intercept=0))

    @property
    def parameters(self):
        """A dictionary of the module parameters, keyed by component id under
        ``"components"``."""
        component_ids = (
            "conv_1x1",
            "add_skip",
            "add_residual",
            "conv_dilation",
            "multiply_gate",
        )
        return {"components": {cid: getattr(self, cid).parameters for cid in component_ids}}

    @property
    def hyperparameters(self):
        """A dictionary of the module hyperparameters, including the ids and
        the hyperparameters of each component."""
        component_ids = [
            "conv_1x1",
            "add_skip",
            "add_residual",
            "conv_dilation",
            "multiply_gate",
        ]

        hparams = {
            "layer": "WavenetResidualModule",
            "init": self.init,
            "dilation": self.dilation,
            "optimizer": self.optimizer,
            "ch_residual": self.ch_residual,
            "ch_dilation": self.ch_dilation,
            "kernel_width": self.kernel_width,
            "component_ids": component_ids,
        }
        hparams["components"] = {
            cid: getattr(self, cid).hyperparameters for cid in component_ids
        }
        return hparams

    @property
    def derived_variables(self):
        """A dictionary of intermediate values computed during the
        forward/backward passes. Entries cached in ``self._dv`` override the
        ``None`` placeholders."""
        component_ids = (
            "conv_1x1",
            "add_skip",
            "add_residual",
            "conv_dilation",
            "multiply_gate",
        )

        # placeholders for values that `forward` fills in
        derived = dict.fromkeys(
            ("conv_1x1_out", "conv_dilation_out", "multiply_gate_out")
        )
        derived["components"] = {
            cid: getattr(self, cid).derived_variables for cid in component_ids
        }

        derived.update(self._dv)
        return derived

    @property
    def gradients(self):
        """A dictionary of the module parameter gradients, keyed by component
        id under ``"components"``."""
        component_ids = (
            "conv_1x1",
            "add_skip",
            "add_residual",
            "conv_dilation",
            "multiply_gate",
        )
        return {"components": {cid: getattr(self, cid).gradients for cid in component_ids}}

    def forward(self, X_main, X_skip=None):
        """
        Compute the module output on a single minibatch.

        Notes
        -----
        When `X_skip` is None (first block in a network) the incoming skip
        sum is replaced by zeros shaped like the 1x1 convolution output, and
        that zero array -- not None -- is what is added to the skip path.

        `X_main` is added element-wise to the 1x1 convolution output, so it
        presumably needs `ch_residual` channels -- confirm against the
        ``Add`` layer.

        Parameters
        ----------
        X_main : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
            The input volume consisting of `n_ex` examples, each with dimension
            (`in_rows`, `in_cols`, `in_ch`).
        X_skip : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`, or None
            The output of the preceding skip-connection if this is not the
            first module in the network.

        Returns
        -------
        Y_main : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, out_rows, out_cols, out_ch)`
            The output of the main pathway.
        Y_skip : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, out_rows, out_cols, out_ch)`
            The output of the skip-connection pathway.
        """
        self.X_main, self.X_skip = X_main, X_skip
        conv_dilation_out = self.conv_dilation.forward(X_main)

        # gated activation unit: tanh(.) * sigmoid(.) of the same conv output
        tanh_gate = self.tanh.fn(conv_dilation_out)
        sigm_gate = self.sigm.fn(conv_dilation_out)

        multiply_gate_out = self.multiply_gate.forward([tanh_gate, sigm_gate])
        conv_1x1_out = self.conv_1x1.forward(multiply_gate_out)

        # if this is the first wavenet block, initialize the "previous" skip
        # connection sum to 0
        if X_skip is None:
            X_skip = np.zeros_like(conv_1x1_out)
        self.X_skip = X_skip

        Y_skip = self.add_skip.forward([X_skip, conv_1x1_out])
        Y_main = self.add_residual.forward([X_main, conv_1x1_out])

        # cache intermediate values for the backward pass and for debugging
        self._dv["tanh_out"] = tanh_gate
        self._dv["sigm_out"] = sigm_gate
        self._dv["conv_dilation_out"] = conv_dilation_out
        self._dv["multiply_gate_out"] = multiply_gate_out
        self._dv["conv_1x1_out"] = conv_1x1_out
        return Y_main, Y_skip

    def backward(self, dY_skip, dY_main=None):
        """
        Backpropagate through the module, from its outputs to its inputs.

        The gradient is passed in turn through the skip-path sum, the residual
        sum (only when `dY_main` is given), the 1x1 convolution, the
        multiplicative gate, the tanh / sigmoid branches and finally the
        causal dilated convolution. Parameter gradients are presumably
        accumulated inside the components' own ``backward`` methods -- confirm
        against the layer implementations.

        Several intermediate gradients are cached in ``self._dv`` under the
        keys ``"dLdTanh"``, ``"dLdSigmoid"``, ``"dLdConv_1x1"``,
        ``"dLdMultiply"`` and ``"dLdConv_dilation"``.

        Parameters
        ----------
        dY_skip : :py:class:`ndarray <numpy.ndarray>`
            The gradient of the loss with respect to the skip-path output.
        dY_main : :py:class:`ndarray <numpy.ndarray>` or None
            The gradient of the loss with respect to the main-path output, or
            None if this is the last block in the network. Default is None.

        Returns
        -------
        dX_main : :py:class:`ndarray <numpy.ndarray>`
            The gradient of the loss with respect to the main-path input.
        dX_skip : :py:class:`ndarray <numpy.ndarray>`
            The gradient of the loss with respect to the skip-path input.
        """
        dX_skip, dConv_1x1_out = self.add_skip.backward(dY_skip)

        # if this is the last wavenet block, dY_main will be None. if not,
        # calculate the error contribution from dY_main and add it to the
        # contribution from the skip path
        dX_main = np.zeros_like(self.X_main)
        if dY_main is not None:
            dX_main, dConv_1x1_main = self.add_residual.backward(dY_main)
            # NOTE: in-place add on the array returned by `add_skip.backward`
            dConv_1x1_out += dConv_1x1_main

        dMultiply_out = self.conv_1x1.backward(dConv_1x1_out)
        dTanh_out, dSigm_out = self.multiply_gate.backward(dMultiply_out)

        # both gates were fed the same conv output during the forward pass, so
        # their input gradients are summed
        conv_dilation_out = self.derived_variables["conv_dilation_out"]
        dTanh_in = dTanh_out * self.tanh.grad(conv_dilation_out)
        dSigm_in = dSigm_out * self.sigm.grad(conv_dilation_out)
        dDilation_out = dTanh_in + dSigm_in

        # the main input feeds both the residual sum and the dilated conv
        conv_back = self.conv_dilation.backward(dDilation_out)
        dX_main += conv_back

        self._dv["dLdTanh"] = dTanh_out
        self._dv["dLdSigmoid"] = dSigm_out
        self._dv["dLdConv_1x1"] = dConv_1x1_out
        self._dv["dLdMultiply"] = dMultiply_out
        self._dv["dLdConv_dilation"] = dDilation_out
        return dX_main, dX_skip


class SkipConnectionIdentityModule(ModuleBase):
    def __init__(
        self,
        out_ch,
        kernel_shape1,
        kernel_shape2,
        stride1=1,
        stride2=1,
        act_fn=None,
        epsilon=1e-5,
        momentum=0.9,
        optimizer=None,
        init="glorot_uniform",
    ):
        r"""
        A ResNet-like "identity" shortcut module.

        Notes
        -----
        The identity module enforces `same` padding during each convolution to
        ensure module output has same dims as its input.

        .. code-block:: text

            X -> Conv2D -> Act_fn -> BatchNorm2D -> Conv2D -> BatchNorm2D -> + -> Act_fn
             \______________________________________________________________/

        The second convolution is created lazily on the first call to
        :meth:`forward`, because its number of output channels must equal the
        number of channels in `X`.

        References
        ----------
        .. [1] He et al. (2015). "Deep residual learning for image
           recognition." https://arxiv.org/pdf/1512.03385.pdf

        Parameters
        ----------
        out_ch : int
            The number of filters/kernels to compute in the first convolutional
            layer.
        kernel_shape1 : 2-tuple
            The dimension of a single 2D filter/kernel in the first
            convolutional layer.
        kernel_shape2 : 2-tuple
            The dimension of a single 2D filter/kernel in the second
            convolutional layer.
        stride1 : int
            The stride/hop of the convolution kernels in the first
            convolutional layer. Default is 1.
        stride2 : int
            The stride/hop of the convolution kernels in the second
            convolutional layer. Default is 1.
        act_fn : :doc:`Activation <numpy_ml.neural_nets.activations>` object or None
            The activation function for computing Y[t]. If None, use the
            identity :math:`f(x) = x` by default. Default is None.
        epsilon : float
            A small smoothing constant to use during
            :class:`~numpy_ml.neural_nets.layers.BatchNorm2D` computation to
            avoid divide-by-zero errors. Default is 1e-5.
        momentum : float
            The momentum term for the running mean/running std calculations in
            the :class:`~numpy_ml.neural_nets.layers.BatchNorm2D` layers.  The
            closer this is to 1, the less weight will be given to the mean/std
            of the current batch (i.e., higher smoothing). Default is 0.9.
        optimizer : str or :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method.  If None, use the
            :class:`~numpy_ml.neural_nets.optimizers.SGD` optimizer with
            default parameters. Default is None.
        init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}
            The weight initialization strategy. Default is 'glorot_uniform'.
        """
        super().__init__()

        self.init = init
        self.in_ch = None
        self.out_ch = out_ch
        self.epsilon = epsilon
        self.stride1 = stride1
        self.stride2 = stride2
        self.optimizer = optimizer
        self.momentum = momentum
        self.kernel_shape1 = kernel_shape1
        self.kernel_shape2 = kernel_shape2
        self.act_fn = Affine(slope=1, intercept=0) if act_fn is None else act_fn

        self._init_params()

    def _init_params(self):
        """Build every sub-layer that does not depend on the input shape."""
        self._dv = {}

        self.conv1 = Conv2D(
            pad="same",
            init=self.init,
            out_ch=self.out_ch,
            act_fn=self.act_fn,
            stride=self.stride1,
            optimizer=self.optimizer,
            kernel_shape=self.kernel_shape1,
        )
        # we can't initialize `conv2` without X's dimensions; see `forward`
        # for further details
        self.batchnorm1 = BatchNorm2D(epsilon=self.epsilon, momentum=self.momentum)
        self.batchnorm2 = BatchNorm2D(epsilon=self.epsilon, momentum=self.momentum)
        self.add3 = Add(self.act_fn)

    def _init_conv2(self):
        """Build the second convolution once `self.in_ch` is known."""
        self.conv2 = Conv2D(
            pad="same",
            init=self.init,
            out_ch=self.in_ch,
            stride=self.stride2,
            optimizer=self.optimizer,
            kernel_shape=self.kernel_shape2,
            act_fn=Affine(slope=1, intercept=0),
        )

    def _conv2_attr(self, attr):
        """
        Return attribute `attr` of the lazily-built `conv2` layer, or None if
        `conv2` does not exist yet (i.e., `forward` has not been called).
        """
        return getattr(self.conv2, attr) if hasattr(self, "conv2") else None

    @property
    def parameters(self):
        """A dictionary of the module parameters. The `conv2` entry is None
        until the first call to :meth:`forward`."""
        return {
            "components": {
                "add3": self.add3.parameters,
                "conv1": self.conv1.parameters,
                "conv2": self._conv2_attr("parameters"),
                "batchnorm1": self.batchnorm1.parameters,
                "batchnorm2": self.batchnorm2.parameters,
            }
        }

    @property
    def hyperparameters(self):
        """A dictionary of the module hyperparameters. The `conv2` entry is
        None until the first call to :meth:`forward`."""
        return {
            "layer": "SkipConnectionIdentityModule",
            "init": self.init,
            "in_ch": self.in_ch,
            "out_ch": self.out_ch,
            "epsilon": self.epsilon,
            "stride1": self.stride1,
            "stride2": self.stride2,
            "momentum": self.momentum,
            "optimizer": self.optimizer,
            "act_fn": str(self.act_fn),
            "kernel_shape1": self.kernel_shape1,
            "kernel_shape2": self.kernel_shape2,
            "component_ids": ["conv1", "batchnorm1", "conv2", "batchnorm2", "add3"],
            "components": {
                "add3": self.add3.hyperparameters,
                "conv1": self.conv1.hyperparameters,
                "conv2": self._conv2_attr("hyperparameters"),
                "batchnorm1": self.batchnorm1.hyperparameters,
                "batchnorm2": self.batchnorm2.hyperparameters,
            },
        }

    @property
    def derived_variables(self):
        """A dictionary of intermediate values computed during the
        forward/backward passes. The `conv2` entry is None until the first
        call to :meth:`forward`."""
        dv = {
            "conv1_out": None,
            "conv2_out": None,
            "batchnorm1_out": None,
            "batchnorm2_out": None,
            "components": {
                "add3": self.add3.derived_variables,
                "conv1": self.conv1.derived_variables,
                "conv2": self._conv2_attr("derived_variables"),
                "batchnorm1": self.batchnorm1.derived_variables,
                "batchnorm2": self.batchnorm2.derived_variables,
            },
        }
        dv.update(self._dv)
        return dv

    @property
    def gradients(self):
        """A dictionary of the accumulated module parameter gradients. The
        `conv2` entry is None until the first call to :meth:`forward`."""
        return {
            "components": {
                "add3": self.add3.gradients,
                "conv1": self.conv1.gradients,
                "conv2": self._conv2_attr("gradients"),
                "batchnorm1": self.batchnorm1.gradients,
                "batchnorm2": self.batchnorm2.gradients,
            }
        }

    def forward(self, X, retain_derived=True):
        """
        Compute the module output given input volume `X`.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape (n_ex, in_rows, in_cols, in_ch)
            The input volume consisting of `n_ex` examples, each with dimension
            (`in_rows`, `in_cols`, `in_ch`).
        retain_derived : bool
            Whether to retain the variables calculated during the forward pass
            for use later during backprop. If False, this suggests the layer
            will not be expected to backprop through wrt. this input. Default
            is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape (n_ex, out_rows, out_cols, out_ch)
            The module output volume.
        """
        # `conv2` must emit as many channels as `X` has so that its output
        # can be added to `X`; build it the first time we see an input
        if not hasattr(self, "conv2"):
            self.in_ch = X.shape[3]
            self._init_conv2()

        conv1_out = self.conv1.forward(X, retain_derived)
        bn1_out = self.batchnorm1.forward(conv1_out, retain_derived)
        conv2_out = self.conv2.forward(bn1_out, retain_derived)
        bn2_out = self.batchnorm2.forward(conv2_out, retain_derived)
        Y = self.add3.forward([X, bn2_out], retain_derived)

        if retain_derived:
            self._dv["conv1_out"] = conv1_out
            self._dv["conv2_out"] = conv2_out
            self._dv["batchnorm1_out"] = bn1_out
            self._dv["batchnorm2_out"] = bn2_out
        return Y

    def backward(self, dLdY, retain_grads=True):
        """
        Compute the gradient of the loss with respect to the layer parameters.

        Parameters
        ----------
        dLdY : :py:class:`ndarray <numpy.ndarray>` of shape (`n_ex, out_rows, out_cols, out_ch`) or list of arrays
            The gradient(s) of the loss with respect to the module output(s).
        retain_grads : bool
            Whether to include the intermediate parameter gradients computed
            during the backward pass in the final parameter update. Default is
            True.

        Returns
        -------
        dX : :py:class:`ndarray <numpy.ndarray>` of shape (n_ex, in_rows, in_cols, in_ch)
            The gradient of the loss with respect to the module input volume.
        """
        # `add3` yields one gradient per summand: the shortcut (`X`) and the
        # output of the main convolutional path
        dX, dBn2_out = self.add3.backward(dLdY, retain_grads)
        dConv2_out = self.batchnorm2.backward(dBn2_out, retain_grads)
        dBn1_out = self.conv2.backward(dConv2_out, retain_grads)
        dConv1_out = self.batchnorm1.backward(dBn1_out, retain_grads)

        # `X` feeds both the shortcut and `conv1`, so the two gradients add
        dX += self.conv1.backward(dConv1_out, retain_grads)

        self._dv["dLdAdd3_X"] = dX
        self._dv["dLdBn2"] = dBn2_out
        self._dv["dLdBn1"] = dBn1_out
        self._dv["dLdConv2"] = dConv2_out
        self._dv["dLdConv1"] = dConv1_out
        return dX


class SkipConnectionConvModule(ModuleBase):
    def __init__(
        self,
        out_ch1,
        out_ch2,
        kernel_shape1,
        kernel_shape2,
        kernel_shape_skip,
        pad1=0,
        pad2=0,
        stride1=1,
        stride2=1,
        act_fn=None,
        epsilon=1e-5,
        momentum=0.9,
        stride_skip=1,
        optimizer=None,
        init="glorot_uniform",
    ):
        r"""
        A ResNet-like "convolution" shortcut module.

        Notes
        -----
        In contrast to :class:`SkipConnectionIdentityModule`, the additional
        `conv_skip` and `batchnorm_skip` layers in the shortcut path allow
        adjusting the dimensions of `X` to match the output of the main set of
        convolutions.

        .. code-block:: text

            X -> Conv2D -> Act_fn -> BatchNorm2D -> Conv2D -> BatchNorm2D -> + -> Act_fn
             \_____________________ Conv2D -> Batchnorm2D __________________/

        The `conv_skip` layer is created lazily on the first call to
        :meth:`forward`, because its padding depends on the spatial dimensions
        of `X`.

        References
        ----------
        .. [1] He et al. (2015). "Deep residual learning for image
           recognition." https://arxiv.org/pdf/1512.03385.pdf

        Parameters
        ----------
        out_ch1 : int
            The number of filters/kernels to compute in the first convolutional
            layer.
        out_ch2 : int
            The number of filters/kernels to compute in the second
            convolutional layer.
        kernel_shape1 : 2-tuple
            The dimension of a single 2D filter/kernel in the first
            convolutional layer.
        kernel_shape2 : 2-tuple
            The dimension of a single 2D filter/kernel in the second
            convolutional layer.
        kernel_shape_skip : 2-tuple
            The dimension of a single 2D filter/kernel in the "skip"
            convolutional layer.
        stride1 : int
            The stride/hop of the convolution kernels in the first
            convolutional layer. Default is 1.
        stride2 : int
            The stride/hop of the convolution kernels in the second
            convolutional layer. Default is 1.
        stride_skip : int
            The stride/hop of the convolution kernels in the "skip"
            convolutional layer. Default is 1.
        pad1 : int, tuple, or 'same'
            The number of rows/columns of 0's to pad the input to the first
            convolutional layer with. Default is 0.
        pad2 : int, tuple, or 'same'
            The number of rows/columns of 0's to pad the input to the second
            convolutional layer with. Default is 0.
        act_fn : :doc:`Activation <numpy_ml.neural_nets.activations>` object or None
            The activation function for computing ``Y[t]``. If None, use the
            identity :math:`f(x) = x` by default. Default is None.
        epsilon : float
            A small smoothing constant to use during
            :class:`~numpy_ml.neural_nets.layers.BatchNorm2D` computation to
            avoid divide-by-zero errors. Default is 1e-5.
        momentum : float
            The momentum term for the running mean/running std calculations in
            the :class:`~numpy_ml.neural_nets.layers.BatchNorm2D` layers.  The
            closer this is to 1, the less weight will be given to the mean/std
            of the current batch (i.e., higher smoothing). Default is 0.9.
        init : str
            The weight initialization strategy. Valid entries are
            {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}.
        optimizer : str or :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method.  If None, use the
            :class:`~numpy_ml.neural_nets.optimizers.SGD` optimizer with
            default parameters. Default is None.
        """
        super().__init__()

        self.init = init
        self.pad1 = pad1
        self.pad2 = pad2
        self.in_ch = None
        self.out_ch1 = out_ch1
        self.out_ch2 = out_ch2
        self.epsilon = epsilon
        self.stride1 = stride1
        self.stride2 = stride2
        self.momentum = momentum
        self.optimizer = optimizer
        self.stride_skip = stride_skip
        self.kernel_shape1 = kernel_shape1
        self.kernel_shape2 = kernel_shape2
        self.kernel_shape_skip = kernel_shape_skip
        self.act_fn = Affine(slope=1, intercept=0) if act_fn is None else act_fn

        self._init_params()

    def _init_params(self, X=None):
        """Build every sub-layer that does not depend on the input shape.
        `X` is accepted for interface compatibility and is not used."""
        self._dv = {}
        self.conv1 = Conv2D(
            pad=self.pad1,
            init=self.init,
            act_fn=self.act_fn,
            out_ch=self.out_ch1,
            stride=self.stride1,
            optimizer=self.optimizer,
            kernel_shape=self.kernel_shape1,
        )
        self.conv2 = Conv2D(
            pad=self.pad2,
            init=self.init,
            out_ch=self.out_ch2,
            stride=self.stride2,
            optimizer=self.optimizer,
            kernel_shape=self.kernel_shape2,
            act_fn=Affine(slope=1, intercept=0),
        )
        # we can't initialize `conv_skip` without X's dimensions; see `forward`
        # for further details
        self.batchnorm1 = BatchNorm2D(epsilon=self.epsilon, momentum=self.momentum)
        self.batchnorm2 = BatchNorm2D(epsilon=self.epsilon, momentum=self.momentum)
        self.batchnorm_skip = BatchNorm2D(epsilon=self.epsilon, momentum=self.momentum)
        self.add3 = Add(self.act_fn)

    @staticmethod
    def _normalize_pad(pad):
        """
        Convert a padding spec into either the string 'same' or a 4-tuple
        `(row_before, row_after, col_before, col_after)`.

        Raises
        ------
        ValueError
            If `pad` is not an int, a 2-tuple, a 4-tuple, or 'same'.
        """
        if isinstance(pad, str) and pad == "same":
            return pad
        if isinstance(pad, (int, np.integer)):
            return (pad, pad, pad, pad)
        if isinstance(pad, tuple) and len(pad) == 2:
            return (pad[0], pad[0], pad[1], pad[1])
        if isinstance(pad, tuple) and len(pad) == 4:
            return pad
        raise ValueError(
            "Padding must be an int, a 2-tuple, a 4-tuple, or 'same', "
            "but got: {!r}".format(pad)
        )

    @staticmethod
    def _conv_out_dims(in_dims, pad, kernel_shape, stride):
        """
        Return the `(rows, cols)` produced by a convolution over an input with
        spatial dims `in_dims`, given a normalized `pad` (see
        :meth:`_normalize_pad`), a `kernel_shape`, and a `stride`.
        """
        # 'same' padding is chosen so that the output keeps the input's
        # spatial dimensions
        if isinstance(pad, str):
            return tuple(in_dims)

        in_rows, in_cols = in_dims
        fr, fc = kernel_shape
        pr1, pr2, pc1, pc2 = pad

        # floor(1 + (n + pad - k) / s), computed with integer arithmetic
        out_rows = 1 + (in_rows + pr1 + pr2 - fr) // stride
        out_cols = 1 + (in_cols + pc1 + pc2 - fc) // stride
        return int(out_rows), int(out_cols)

    def _calc_skip_padding(self, X):
        """
        Compute `self.pad_skip`, the padding that makes the skip convolution's
        output match the spatial dimensions of the main path's output.

        As a side effect, `self.pad1` and `self.pad2` are replaced by their
        normalized form (a 4-tuple, or 'same').
        """
        self.pad1 = self._normalize_pad(self.pad1)
        self.pad2 = self._normalize_pad(self.pad2)

        _, in_rows, in_cols, _ = X.shape

        # propagate the spatial dims through the two main-path convolutions
        dims1 = self._conv_out_dims(
            (in_rows, in_cols), self.pad1, self.kernel_shape1, self.stride1
        )
        dims2 = self._conv_out_dims(
            dims1, self.pad2, self.kernel_shape2, self.stride2
        )

        # finally, compute the appropriate padding dims for the skip convolution
        desired_dims = (dims2[0], dims2[1])
        self.pad_skip = calc_pad_dims_2D(
            X.shape,
            desired_dims,
            stride=self.stride_skip,
            kernel_shape=self.kernel_shape_skip,
        )

    def _init_conv_skip(self, X):
        """Build the shortcut convolution using padding derived from `X`."""
        self._calc_skip_padding(X)
        self.conv_skip = Conv2D(
            init=self.init,
            pad=self.pad_skip,
            out_ch=self.out_ch2,
            stride=self.stride_skip,
            kernel_shape=self.kernel_shape_skip,
            act_fn=Affine(slope=1, intercept=0),
            optimizer=self.optimizer,
        )

    def _conv_skip_attr(self, attr):
        """
        Return attribute `attr` of the lazily-built `conv_skip` layer, or None
        if `conv_skip` does not exist yet (i.e., `forward` has not been
        called).
        """
        return getattr(self.conv_skip, attr) if hasattr(self, "conv_skip") else None

    @property
    def parameters(self):
        """A dictionary of the module parameters. The `conv_skip` entry is
        None until the first call to :meth:`forward`."""
        return {
            "components": {
                "add3": self.add3.parameters,
                "conv1": self.conv1.parameters,
                "conv2": self.conv2.parameters,
                "conv_skip": self._conv_skip_attr("parameters"),
                "batchnorm1": self.batchnorm1.parameters,
                "batchnorm2": self.batchnorm2.parameters,
                "batchnorm_skip": self.batchnorm_skip.parameters,
            }
        }

    @property
    def hyperparameters(self):
        """A dictionary of the module hyperparameters. The `conv_skip` and
        `pad_skip` entries are None until the first call to :meth:`forward`."""
        return {
            "layer": "SkipConnectionConvModule",
            "init": self.init,
            "pad1": self.pad1,
            "pad2": self.pad2,
            "in_ch": self.in_ch,
            "out_ch1": self.out_ch1,
            "out_ch2": self.out_ch2,
            "epsilon": self.epsilon,
            "stride1": self.stride1,
            "stride2": self.stride2,
            "momentum": self.momentum,
            "optimizer": self.optimizer,
            "act_fn": str(self.act_fn),
            "stride_skip": self.stride_skip,
            "kernel_shape1": self.kernel_shape1,
            "kernel_shape2": self.kernel_shape2,
            "kernel_shape_skip": self.kernel_shape_skip,
            "pad_skip": self.pad_skip if hasattr(self, "pad_skip") else None,
            "component_ids": [
                "add3",
                "conv1",
                "conv2",
                "conv_skip",
                "batchnorm1",
                "batchnorm2",
                "batchnorm_skip",
            ],
            "components": {
                "add3": self.add3.hyperparameters,
                "conv1": self.conv1.hyperparameters,
                "conv2": self.conv2.hyperparameters,
                "conv_skip": self._conv_skip_attr("hyperparameters"),
                "batchnorm1": self.batchnorm1.hyperparameters,
                "batchnorm2": self.batchnorm2.hyperparameters,
                "batchnorm_skip": self.batchnorm_skip.hyperparameters,
            },
        }

    @property
    def derived_variables(self):
        """A dictionary of intermediate values computed during the
        forward/backward passes. The `conv_skip` entry is None until the first
        call to :meth:`forward`."""
        dv = {
            "conv1_out": None,
            "conv2_out": None,
            "conv_skip_out": None,
            "batchnorm1_out": None,
            "batchnorm2_out": None,
            "batchnorm_skip_out": None,
            "components": {
                "add3": self.add3.derived_variables,
                "conv1": self.conv1.derived_variables,
                "conv2": self.conv2.derived_variables,
                "conv_skip": self._conv_skip_attr("derived_variables"),
                "batchnorm1": self.batchnorm1.derived_variables,
                "batchnorm2": self.batchnorm2.derived_variables,
                "batchnorm_skip": self.batchnorm_skip.derived_variables,
            },
        }
        dv.update(self._dv)
        return dv

    @property
    def gradients(self):
        """A dictionary of the accumulated module parameter gradients. The
        `conv_skip` entry is None until the first call to :meth:`forward`."""
        return {
            "components": {
                "add3": self.add3.gradients,
                "conv1": self.conv1.gradients,
                "conv2": self.conv2.gradients,
                "conv_skip": self._conv_skip_attr("gradients"),
                "batchnorm1": self.batchnorm1.gradients,
                "batchnorm2": self.batchnorm2.gradients,
                "batchnorm_skip": self.batchnorm_skip.gradients,
            }
        }

    def forward(self, X, retain_derived=True):
        """
        Compute the layer output given input volume `X`.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
            The input volume consisting of `n_ex` examples, each with dimension
            (`in_rows`, `in_cols`, `in_ch`).
        retain_derived : bool
            Whether to retain the variables calculated during the forward pass
            for use later during backprop. If False, this suggests the layer
            will not be expected to backprop through wrt. this input. Default
            is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, out_rows, out_cols, out_ch)`
            The module output volume.
        """
        # now that we have the input dims for X we can initialize the proper
        # padding in the `conv_skip` layer
        if not hasattr(self, "conv_skip"):
            self._init_conv_skip(X)
            self.in_ch = X.shape[3]

        conv1_out = self.conv1.forward(X, retain_derived)
        bn1_out = self.batchnorm1.forward(conv1_out, retain_derived)
        conv2_out = self.conv2.forward(bn1_out, retain_derived)
        bn2_out = self.batchnorm2.forward(conv2_out, retain_derived)
        conv_skip_out = self.conv_skip.forward(X, retain_derived)
        bn_skip_out = self.batchnorm_skip.forward(conv_skip_out, retain_derived)
        Y = self.add3.forward([bn_skip_out, bn2_out], retain_derived)

        if retain_derived:
            self._dv["conv1_out"] = conv1_out
            self._dv["conv2_out"] = conv2_out
            self._dv["batchnorm1_out"] = bn1_out
            self._dv["batchnorm2_out"] = bn2_out
            self._dv["conv_skip_out"] = conv_skip_out
            self._dv["batchnorm_skip_out"] = bn_skip_out
        return Y

    def backward(self, dLdY, retain_grads=True):
        """
        Compute the gradient of the loss with respect to the module parameters.

        Parameters
        ----------
        dLdY : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, out_rows, out_cols, out_ch)` or list of arrays
            The gradient(s) of the loss with respect to the module output(s).
        retain_grads : bool
            Whether to include the intermediate parameter gradients computed
            during the backward pass in the final parameter update. Default is
            True.

        Returns
        -------
        dX : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
            The gradient of the loss with respect to the module input volume.
        """
        # `retain_grads` is forwarded to every sub-layer so that a call with
        # `retain_grads=False` does not accumulate parameter gradients
        dBnskip_out, dBn2_out = self.add3.backward(dLdY, retain_grads)

        # shortcut path
        dConvskip_out = self.batchnorm_skip.backward(dBnskip_out, retain_grads)
        dX = self.conv_skip.backward(dConvskip_out, retain_grads)

        # main path
        dConv2_out = self.batchnorm2.backward(dBn2_out, retain_grads)
        dBn1_out = self.conv2.backward(dConv2_out, retain_grads)
        dConv1_out = self.batchnorm1.backward(dBn1_out, retain_grads)

        # `X` feeds both `conv_skip` and `conv1`, so the two gradients add
        dX += self.conv1.backward(dConv1_out, retain_grads)

        if retain_grads:
            self._dv["dLdAdd3_X"] = dX
            self._dv["dLdBn1"] = dBn1_out
            self._dv["dLdBn2"] = dBn2_out
            self._dv["dLdConv1"] = dConv1_out
            self._dv["dLdConv2"] = dConv2_out
            self._dv["dLdBnSkip"] = dBnskip_out
            self._dv["dLdConvSkip"] = dConvskip_out
        return dX


class BidirectionalLSTM(ModuleBase):
    # the merge strategies understood by `forward` and `backward`
    _MERGE_MODES = ("sum", "multiply", "concat", "average")

    def __init__(
        self,
        n_out,
        act_fn=None,
        gate_fn=None,
        merge_mode="concat",
        init="glorot_uniform",
        optimizer=None,
    ):
        """
        A single bidirectional long short-term memory (LSTM) layer.

        Parameters
        ----------
        n_out : int
            The dimension of a single hidden state / output on a given timestep
        act_fn : :doc:`Activation <numpy_ml.neural_nets.activations>` object or None
            The activation function for computing ``A[t]``. If not specified,
            use :class:`~numpy_ml.neural_nets.activations.Tanh` by default.
        gate_fn : :doc:`Activation <numpy_ml.neural_nets.activations>` object or None
            The gate function for computing the update, forget, and output
            gates. If not specified, use
            :class:`~numpy_ml.neural_nets.activations.Sigmoid` by default.
        merge_mode : {"sum", "multiply", "concat", "average"}
            Mode by which outputs of the forward and backward LSTMs will be
            combined. Default is 'concat'.
        optimizer : str or :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object or None
            The optimization strategy to use when performing gradient updates
            within the `update` method.  If None, use the
            :class:`~numpy_ml.neural_nets.optimizers.SGD` optimizer with
            default parameters. Default is None.
        init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}
            The weight initialization strategy. Default is 'glorot_uniform'.

        Raises
        ------
        ValueError
            If `merge_mode` is not one of the supported modes.
        """
        super().__init__()

        if merge_mode not in self._MERGE_MODES:
            raise ValueError(self._bad_mode_msg(merge_mode))

        self.init = init
        self.n_in = None
        self.n_out = n_out
        self.optimizer = optimizer
        self.merge_mode = merge_mode
        self.act_fn = Tanh() if act_fn is None else act_fn
        self.gate_fn = Sigmoid() if gate_fn is None else gate_fn
        self._init_params()

    def _init_params(self):
        """Build the forward-in-time and backward-in-time LSTM cells."""
        self.cell_fwd = LSTMCell(
            init=self.init,
            n_out=self.n_out,
            act_fn=self.act_fn,
            gate_fn=self.gate_fn,
            optimizer=self.optimizer,
        )
        self.cell_bwd = LSTMCell(
            init=self.init,
            n_out=self.n_out,
            act_fn=self.act_fn,
            gate_fn=self.gate_fn,
            optimizer=self.optimizer,
        )

    def _bad_mode_msg(self, mode):
        """Format the error message for an unsupported merge mode."""
        return "Unrecognized merge_mode {!r}; expected one of {}".format(
            mode, self._MERGE_MODES
        )

    def _merge(self, y_fwd, y_bwd):
        """Combine one timestep's forward and backward cell outputs according
        to `self.merge_mode`."""
        mode = self.merge_mode
        if mode == "concat":
            return np.concatenate([y_fwd, y_bwd], axis=1)
        if mode == "sum":
            return y_fwd + y_bwd
        if mode == "average":
            return (y_fwd + y_bwd) / 2
        if mode == "multiply":
            return y_fwd * y_bwd
        raise ValueError(self._bad_mode_msg(mode))

    def _branch_grad(self, dLdAt, t, is_fwd):
        """
        Return the gradient of the loss with respect to the output of one cell
        at timestep `t`, given the gradient `dLdAt` with respect to the merged
        output at that timestep.

        `is_fwd` selects the forward cell (True) or the backward cell (False).
        """
        mode = self.merge_mode
        if mode == "concat":
            # the forward cell owns the first `n_out` columns of the merged
            # output, the backward cell owns the remainder
            return dLdAt[:, : self.n_out] if is_fwd else dLdAt[:, self.n_out :]
        if mode == "sum":
            return dLdAt
        if mode == "average":
            return dLdAt * 0.5
        if mode == "multiply":
            # product rule: scale by the *other* cell's output
            other = self.Y_bwd[t] if is_fwd else self.Y_fwd[t]
            return dLdAt * other
        raise ValueError(self._bad_mode_msg(mode))

    def forward(self, X):
        """
        Run a forward pass across all timesteps in the input.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in, n_t)`
            Input consisting of `n_ex` examples each of dimensionality `n_in`
            and extending for `n_t` timesteps.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_out, n_t)`
            The merged hidden states for each of the `n_ex` examples across
            each of the `n_t` timesteps. When `merge_mode` is 'concat' the
            second dimension is `2 * n_out` rather than `n_out`.
        """
        n_ex, self.n_in, n_t = X.shape

        # forward LSTM: consume the sequence first-to-last
        Y_fwd = []
        for t in range(n_t):
            yt, _ = self.cell_fwd.forward(X[:, :, t])
            Y_fwd.append(yt)

        # backward LSTM: consume the sequence last-to-first, then flip the
        # collected outputs so that Y_bwd[t] lines up with timestep t
        Y_bwd = []
        for t in reversed(range(n_t)):
            yt, _ = self.cell_bwd.forward(X[:, :, t])
            Y_bwd.append(yt)
        Y_bwd.reverse()

        # merge forward and backward states
        Y = [self._merge(yf, yb) for yf, yb in zip(Y_fwd, Y_bwd)]

        self.Y_fwd, self.Y_bwd = Y_fwd, Y_bwd
        return np.dstack(Y)

    def backward(self, dLdA):
        """
        Run a backward pass across all timesteps in the input.

        Parameters
        ----------
        dLdA : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_out, n_t)`
            The gradient of the loss with respect to the layer output for each
            of the `n_ex` examples across all `n_t` timesteps. When
            `merge_mode` is 'concat' the second dimension is `2 * n_out`.

        Returns
        -------
        dLdX : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in, n_t)`
            The gradient of the loss with respect to the layer input for each
            of the `n_ex` examples across each of the `n_t` timesteps.
        """
        assert self.trainable, "Layer is frozen"

        n_ex, n_out, n_t = dLdA.shape

        # forward LSTM: its last processed timestep was n_t - 1, so unroll
        # its backward pass from the end of the sequence
        dLdX_f = [None] * n_t
        for t in reversed(range(n_t)):
            dLdAt = self._branch_grad(dLdA[:, :, t], t, True)
            dLdX_f[t] = self.cell_fwd.backward(dLdAt)

        # backward LSTM: its last processed timestep was 0, so unroll its
        # backward pass from the start of the sequence
        dLdX_b = []
        for t in range(n_t):
            dLdAt = self._branch_grad(dLdA[:, :, t], t, False)
            dLdX_b.append(self.cell_bwd.backward(dLdAt))

        # both cells read the same input, so their input gradients add
        dLdX = [df + db for df, db in zip(dLdX_f, dLdX_b)]
        return np.dstack(dLdX)

    @property
    def derived_variables(self):
        """A dictionary of intermediate values computed during the
        forward/backward passes."""
        return {
            "components": {
                "cell_fwd": self.cell_fwd.derived_variables,
                "cell_bwd": self.cell_bwd.derived_variables,
            }
        }

    @property
    def gradients(self):
        """A dictionary of the accumulated module parameter gradients."""
        return {
            "components": {
                "cell_fwd": self.cell_fwd.gradients,
                "cell_bwd": self.cell_bwd.gradients,
            }
        }

    @property
    def parameters(self):
        """A dictionary of the module parameters."""
        return {
            "components": {
                "cell_fwd": self.cell_fwd.parameters,
                "cell_bwd": self.cell_bwd.parameters,
            }
        }

    @property
    def hyperparameters(self):
        """A dictionary of the module hyperparameters."""
        return {
            "layer": "BidirectionalLSTM",
            "init": self.init,
            "n_in": self.n_in,
            "n_out": self.n_out,
            "act_fn": str(self.act_fn),
            "gate_fn": str(self.gate_fn),
            "optimizer": self.optimizer,
            "merge_mode": self.merge_mode,
            "component_ids": ["cell_fwd", "cell_bwd"],
            "components": {
                "cell_fwd": self.cell_fwd.hyperparameters,
                "cell_bwd": self.cell_bwd.hyperparameters,
            },
        }


class MultiHeadedAttentionModule(ModuleBase):
    def __init__(self, n_heads=8, dropout_p=0, init="glorot_uniform", optimizer=None):
        r"""
        A multi-headed attention module.

        Notes
        -----
        Multi-head attention allows a model to jointly attend to information from
        different representation subspaces at different positions. With a
        single head, this information would get averaged away when the
        attention weights are combined with the values.

        .. math::

            \text{MultiHead}(\mathbf{Q}, \mathbf{K}, \mathbf{V})
                = [\text{head}_1; ...; \text{head}_h] \mathbf{W}^{(O)}

        where

        .. math::

            \text{head}_i = \text{SDP_attention}(
                \mathbf{Q W}_i^{(Q)}, \mathbf{K W}_i^{(K)}, \mathbf{V W}_i^{(V)})

        and the projection weights are parameter matrices:

        .. math::

            \mathbf{W}_i^{(Q)}  &\in
                \mathbb{R}^{(\text{kqv_dim} \ \times \ \text{latent_dim})} \\
            \mathbf{W}_i^{(K)}  &\in
                \mathbb{R}^{(\text{kqv_dim} \ \times \ \text{latent_dim})} \\
            \mathbf{W}_i^{(V)}  &\in
                \mathbb{R}^{(\text{kqv_dim} \ \times \ \text{latent_dim})} \\
            \mathbf{W}^{(O)}  &\in
                \mathbb{R}^{(\text{n_heads} \cdot \text{latent_dim} \ \times \ \text{kqv_dim})}

        Importantly, the current module explicitly assumes that

        .. math::

            \text{kqv_dim} = \text{dim(query)} = \text{dim(keys)} = \text{dim(values)}

        and that

        .. math::

            \text{latent_dim} = \text{kqv_dim / n_heads}

        **[MH Attention Head h]**:

        .. code-block:: text

            K --> W_h^(K) ------\
            V --> W_h^(V) ------- > DP_Attention --> head_h
            Q --> W_h^(Q) ------/

        The full **[MultiHeadedAttentionModule]** then becomes

        .. code-block:: text

                  -----------------
            K --> | [Attn Head 1] | --> head_1 --\
            V --> | [Attn Head 2] | --> head_2 --\
            Q --> |      ...      |      ...       --> Concat --> W^(O) --> MH_out
                  | [Attn Head Z] | --> head_Z --/
                  -----------------

        Due to the reduced dimension of each head, the total computational cost
        is similar to that of a single attention head with full (i.e., kqv_dim)
        dimensionality.

        Parameters
        ----------
        n_heads : int
            The number of simultaneous attention heads to use. Note that the
            larger `n_heads`, the smaller the dimensionality of any single
            head, since ``latent_dim = kqv_dim / n_heads``. Default is 8.
        dropout_p : float in [0, 1)
            The dropout probability during training, applied to the output of
            the softmax in each dot-product attention head. If 0, no dropout is
            applied. Default is 0.
        init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}
            The weight initialization strategy. Default is 'glorot_uniform'.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method.  If None, use the
            :class:`~numpy_ml.neural_nets.optimizers.SGD` optimizer with default
            parameters. Default is None.
        """
        # NOTE(review): this constructor does not call `super().__init__()`;
        # confirm that `ModuleBase` requires no setup of its own.
        self.init = init
        self.kqv_dim = None
        # `latent_dim` is derived from `kqv_dim` on the first forward pass;
        # define it (and `_dv`) up front so the summary properties can be read
        # on a module that has not seen any data yet
        self.latent_dim = None
        self._dv = {}
        self.projections = {}
        self.n_heads = n_heads
        self.optimizer = optimizer
        self.dropout_p = dropout_p
        self.is_initialized = False

    def _init_params(self):
        """Create the attention layer and the Q/K/V/O projection layers.

        Requires `self.kqv_dim` to have been set (done by :meth:`forward`) and
        to be divisible by `n_heads`.
        """
        self._dv = {}

        # assume dim(keys) = dim(query) = dim(values)
        assert (
            self.kqv_dim % self.n_heads == 0
        ), "kqv_dim must be divisible by n_heads"
        self.latent_dim = self.kqv_dim // self.n_heads

        self.attention = DotProductAttention(scale=True, dropout_p=self.dropout_p)
        self.projections = {
            k: Dropout(
                FullyConnected(
                    init=self.init,
                    n_out=self.kqv_dim,
                    optimizer=self.optimizer,
                    act_fn="Affine(slope=1, intercept=0)",
                ),
                self.dropout_p,
            )
            for k in ["Q", "K", "V", "O"]
        }

        self.is_initialized = True

    def forward(self, Q, K, V):
        """
        Run multi-headed attention over the queries, keys, and values.

        Notes
        -----
        On the first call, `kqv_dim` is read from the last axis of `Q` and the
        sub-layers are created. The concatenated head outputs are reshaped to
        ``(n_ex, kqv_dim)`` before the output projection, which only succeeds
        when each example contributes exactly `kqv_dim` values.

        Parameters
        ----------
        Q : :py:class:`ndarray <numpy.ndarray>`
            The queries. The first axis indexes examples and the last axis has
            size `kqv_dim`.
        K : :py:class:`ndarray <numpy.ndarray>`
            The keys; presumably the same layout as `Q` (TODO confirm).
        V : :py:class:`ndarray <numpy.ndarray>`
            The values; presumably the same layout as `Q` (TODO confirm).

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, -1, kqv_dim)`
            The output of the final projection layer.
        """
        if not self.is_initialized:
            self.kqv_dim = Q.shape[-1]
            self._init_params()

        # project queries, keys, and values, then split the projection into
        # `n_heads` chunks of size `latent_dim`
        n_ex = Q.shape[0]
        for k, x in zip(["Q", "K", "V"], [Q, K, V]):
            proj = self.projections[k].forward(x)
            proj = proj.reshape(n_ex, -1, self.n_heads, self.latent_dim).swapaxes(1, 2)
            self._dv["{}_proj".format(k)] = proj

        dv = self.derived_variables
        Q_proj, K_proj, V_proj = dv["Q_proj"], dv["K_proj"], dv["V_proj"]

        # apply scaled dot-product attention to the projected vectors
        attn = self.attention
        attn_out = attn.forward(Q_proj, K_proj, V_proj)
        self._dv["attention_weights"] = attn.derived_variables["attention_weights"]

        # concatenate the different heads using `reshape` to create an
        # `kqv_dim`-dim vector
        attn_out = attn_out.swapaxes(1, 2).reshape(n_ex, self.kqv_dim)
        self._dv["attention_out"] = attn_out.reshape(n_ex, -1, self.kqv_dim)

        # apply the final output projection
        Y = self.projections["O"].forward(attn_out)
        Y = Y.reshape(n_ex, -1, self.kqv_dim)
        return Y

    def backward(self, dLdy):
        """
        Backpropagate the loss gradient through the module.

        Parameters
        ----------
        dLdy : :py:class:`ndarray <numpy.ndarray>`
            The gradient of the loss with respect to the module output. It is
            reshaped to ``(n_ex, kqv_dim)``, where `n_ex` is the size of its
            first axis.

        Returns
        -------
        dLdQ, dLdK, dLdV
            The values returned by the backward passes of the query, key, and
            value projection layers, respectively.
        """
        n_ex = dLdy.shape[0]
        dLdy = dLdy.reshape(n_ex, self.kqv_dim)
        dLdX = self.projections["O"].backward(dLdy)
        dLdX = dLdX.reshape(n_ex, self.n_heads, -1, self.latent_dim)

        dLdQ_proj, dLdK_proj, dLdV_proj = self.attention.backward(dLdX)

        self._dv["dQ_proj"] = dLdQ_proj
        self._dv["dK_proj"] = dLdK_proj
        self._dv["dV_proj"] = dLdV_proj

        dLdQ_proj = dLdQ_proj.reshape(n_ex, self.kqv_dim)
        dLdK_proj = dLdK_proj.reshape(n_ex, self.kqv_dim)
        dLdV_proj = dLdV_proj.reshape(n_ex, self.kqv_dim)

        dLdQ = self.projections["Q"].backward(dLdQ_proj)
        dLdK = self.projections["K"].backward(dLdK_proj)
        dLdV = self.projections["V"].backward(dLdV_proj)
        return dLdQ, dLdK, dLdV

    def _collect_from_components(self, attr):
        """Return ``{component_id: component.<attr>}`` for every sub-layer.

        The sub-layers only exist after the first forward pass, so an empty
        dict is returned for an uninitialized module.
        """
        if not self.is_initialized:
            return {}

        collected = {
            k: getattr(self.projections[k], attr) for k in ["Q", "K", "V", "O"]
        }
        collected["attention"] = getattr(self.attention, attr)
        return collected

    @property
    def derived_variables(self):
        """A dictionary of intermediate values computed during the
        forward/backward passes."""
        dv = {
            "Q_proj": None,
            "K_proj": None,
            "V_proj": None,
            "components": self._collect_from_components("derived_variables"),
        }
        dv.update(self._dv)
        return dv

    @property
    def gradients(self):
        """A dictionary of the accumulated module parameter gradients."""
        return {"components": self._collect_from_components("gradients")}

    @property
    def parameters(self):
        """A dictionary of the module parameters."""
        return {"components": self._collect_from_components("parameters")}

    @property
    def hyperparameters(self):
        """A dictionary of the module hyperparameters.

        `kqv_dim` and `latent_dim` are None, and `components` is empty, until
        the module has been initialized by its first forward pass.
        """
        return {
            "layer": "MultiHeadedAttentionModule",
            "init": self.init,
            "kqv_dim": self.kqv_dim,
            "latent_dim": self.latent_dim,
            "n_heads": self.n_heads,
            "dropout_p": self.dropout_p,
            "component_ids": ["attention", "Q", "K", "V", "O"],
            "components": self._collect_from_components("hyperparameters"),
        }


================================================
FILE: numpy_ml/neural_nets/optimizers/README.md
================================================
# Optimizers

The `optimizers.py` module implements common modifications to stochastic gradient descent. It includes:

- SGD with momentum ([Rumelhart, Hinton, & Williams, 1986](https://www.cs.princeton.edu/courses/archive/spring18/cos495/res/backprop_old.pdf))
- AdaGrad ([Duchi, Hazan, & Singer, 2011](http://jmlr.org/papers/volume12/duchi11a/duchi11a.pdf))
- RMSProp ([Tieleman & Hinton, 2012](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf))
- Adam ([Kingma & Ba, 2015](https://arxiv.org/pdf/1412.6980v8.pdf))


================================================
FILE: numpy_ml/neural_nets/optimizers/__init__.py
================================================
from .optimizers import *


================================================
FILE: numpy_ml/neural_nets/optimizers/optimizers.py
================================================
from copy import deepcopy
from abc import ABC, abstractmethod

import numpy as np
from numpy.linalg import norm


class OptimizerBase(ABC):
    def __init__(self, lr, scheduler=None):
        """
        Common state and helpers shared by every optimizer.

        Subclasses must implement :meth:`update`; this class is not meant to
        be instantiated on its own.

        Parameters
        ----------
        lr : float
            Learning rate handed to the scheduler initializer.
        scheduler : object or None
            Scheduler specification handed to the scheduler initializer.
            Default is None.
        """
        from ..initializers import SchedulerInitializer

        make_scheduler = SchedulerInitializer(scheduler, lr=lr)
        self.lr_scheduler = make_scheduler()
        self.hyperparameters = {}
        self.cur_step = 0
        self.cache = {}

    def __call__(self, param, param_grad, param_name, cur_loss=None):
        """Shorthand for :meth:`update`."""
        return self.update(param, param_grad, param_name, cur_loss)

    def step(self):
        """Advance the step counter by one."""
        self.cur_step = self.cur_step + 1

    def reset_step(self):
        """Set the step counter back to zero."""
        self.cur_step = 0

    def copy(self):
        """Return a deep copy of this optimizer."""
        return deepcopy(self)

    def set_params(self, hparam_dict=None, cache_dict=None):
        """
        Overwrite existing hyperparameters and cache entries.

        Keys that are not already present in `self.hyperparameters` (resp.
        `self.cache`) are ignored. Setting ``"lr_scheduler"`` also rebuilds
        the scheduler object from the new value.

        Parameters
        ----------
        hparam_dict : dict or None
            New hyperparameter values, keyed by name. Default is None.
        cache_dict : dict or None
            New cache values, keyed by name. Default is None.
        """
        from ..initializers import SchedulerInitializer

        if hparam_dict is not None:
            for name, value in hparam_dict.items():
                if name not in self.hyperparameters:
                    continue
                self.hyperparameters[name] = value
                if name == "lr_scheduler":
                    self.lr_scheduler = SchedulerInitializer(value, lr=None)()

        # NOTE(review): only keys already in the cache are overwritten, so a
        # fresh optimizer (empty cache) ignores `cache_dict` entirely; confirm
        # this is intended
        if cache_dict is not None:
            for name, value in cache_dict.items():
                if name not in self.cache:
                    continue
                self.cache[name] = value

    @abstractmethod
    def update(self, param, param_grad, param_name, cur_loss=None):
        """Return the updated value of `param`; implemented by subclasses."""
        raise NotImplementedError


class SGD(OptimizerBase):
    def __init__(
        self, lr=0.01, momentum=0.0, clip_norm=None, lr_scheduler=None, **kwargs
    ):
        r"""
        A stochastic gradient descent optimizer.

        Notes
        -----
        For model parameters :math:`\theta`, averaged parameter gradients
        :math:`\nabla_{\theta} \mathcal{L}`, and learning rate :math:`\eta`,
        the SGD update at timestep `t` is

        .. math::

            \text{update}^{(t)}
                &=  \text{momentum} \cdot \text{update}^{(t-1)} + \eta^{(t)} \nabla_{\theta} \mathcal{L}\\
            \theta^{(t+1)}
                &\leftarrow  \theta^{(t)} - \text{update}^{(t)}

        Parameters
        ----------
        lr : float
            Learning rate for SGD. If scheduler is not None, this is used as
            the starting learning rate. Default is 0.01.
        momentum : float in range [0, 1]
            The fraction of the previous update to add to the current update.
            If 0, no momentum is applied. Default is 0.
        clip_norm : float
            If not None, all param gradients are scaled to have maximum l2 norm of
            `clip_norm` before computing update. Default is None.
        lr_scheduler : str, :doc:`Scheduler <numpy_ml.neural_nets.schedulers>` object, or None
            The learning rate scheduler. If None, use a constant learning
            rate equal to `lr`. Default is None.
        """
        super().__init__(lr, lr_scheduler)

        self.hyperparameters = {
            "id": "SGD",
            "lr": lr,
            "momentum": momentum,
            "clip_norm": clip_norm,
            "lr_scheduler": str(self.lr_scheduler),
        }

    def __str__(self):
        """Return a constructor-like summary of the hyperparameters."""
        H = self.hyperparameters
        lr, mm, cn, sc = H["lr"], H["momentum"], H["clip_norm"], H["lr_scheduler"]
        return "SGD(lr={}, momentum={}, clip_norm={}, lr_scheduler={})".format(
            lr, mm, cn, sc
        )

    def update(self, param, param_grad, param_name, cur_loss=None):
        """
        Compute the SGD update for a given parameter

        Parameters
        ----------
        param : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            The value of the parameter to be updated.
        param_grad : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            The gradient of the loss function with respect to `param_name`.
        param_name : str
            The name of the parameter.
        cur_loss : float
            The training or validation loss for the current minibatch. Used for
            learning rate scheduling e.g., by
            :class:`~numpy_ml.neural_nets.schedulers.KingScheduler`.
            Default is None.

        Returns
        -------
        updated_params : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            The value of `param` after applying the momentum update.
        """
        C = self.cache
        H = self.hyperparameters
        momentum, clip_norm = H["momentum"], H["clip_norm"]
        lr = self.lr_scheduler(self.cur_step, cur_loss)

        # the cache holds the previous update for each parameter
        if param_name not in C:
            C[param_name] = np.zeros_like(param_grad)

        # scale gradient to avoid explosion; the norm is computed only once
        if clip_norm is not None:
            grad_norm = norm(param_grad)
            if grad_norm > clip_norm:
                param_grad = param_grad * clip_norm / grad_norm

        update = momentum * C[param_name] + lr * param_grad
        C[param_name] = update
        return param - update


#######################################################################
#                      Adaptive Gradient Methods                      #
#######################################################################


class AdaGrad(OptimizerBase):
    def __init__(self, lr=0.01, eps=1e-7, clip_norm=None, lr_scheduler=None, **kwargs):
        """
        The AdaGrad optimizer.

        Notes
        -----
        AdaGrad divides each weight's step by the root of the running sum of
        its squared gradients, so weights with large accumulated gradients
        take smaller steps than weights with small or infrequent gradients.

        Equations::

            cache[t] = cache[t-1] + grad[t] ** 2
            update[t] = lr * grad[t] / (np.sqrt(cache[t]) + eps)
            param[t+1] = param[t] - update[t]

        where ``**`` and ``/`` act elementwise.

        "A downside of Adagrad ... is that the monotonic learning rate usually
        proves too aggressive and stops learning too early." [1]

        References
        ----------
        .. [1] Karpathy, A. "CS231n: Convolutional neural networks for visual
           recognition" https://cs231n.github.io/neural-networks-3/

        Parameters
        ----------
        lr : float
            Global learning rate. Default is 0.01.
        eps : float
            Smoothing term added to the denominator of the update to avoid
            division by zero. Default is 1e-7.
        clip_norm : float or None
            If not None, gradients whose `L2` norm exceeds `clip_norm` are
            rescaled to have norm `clip_norm` before the update is computed.
            Default is None.
        lr_scheduler : str or :doc:`Scheduler <numpy_ml.neural_nets.schedulers>` object or None
            The learning rate scheduler. If None, use a constant learning
            rate equal to `lr`. Default is None.
        """
        super().__init__(lr, lr_scheduler)

        self.cache = {}
        self.hyperparameters = dict(
            [
                ("id", "AdaGrad"),
                ("lr", lr),
                ("eps", eps),
                ("clip_norm", clip_norm),
                ("lr_scheduler", str(self.lr_scheduler)),
            ]
        )

    def __str__(self):
        """Return a constructor-like summary of the hyperparameters."""
        hp = self.hyperparameters
        template = "AdaGrad(lr={}, eps={}, clip_norm={}, lr_scheduler={})"
        return template.format(
            hp["lr"], hp["eps"], hp["clip_norm"], hp["lr_scheduler"]
        )

    def update(self, param, param_grad, param_name, cur_loss=None):
        """
        Apply one AdaGrad step to a single parameter.

        Parameters
        ----------
        param : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            The current value of the parameter.
        param_grad : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            The gradient of the loss with respect to `param_name`.
        param_name : str
            The name of the parameter; used as the key of its cache entry.
        cur_loss : float or None
            The training or validation loss for the current minibatch,
            forwarded to the learning rate scheduler (used e.g., by
            :class:`~numpy_ml.neural_nets.schedulers.KingScheduler`).
            Default is None.

        Returns
        -------
        updated_params : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            The value of `param` after the AdaGrad step.
        """
        cache = self.cache
        hp = self.hyperparameters
        eps = hp["eps"]
        max_norm = hp["clip_norm"]
        lr = self.lr_scheduler(self.cur_step, cur_loss)

        # running sum of squared gradients for this parameter
        if param_name not in cache:
            cache[param_name] = np.zeros_like(param_grad)

        # rescale overly large gradients to norm `max_norm`
        if max_norm is not None:
            grad_norm = norm(param_grad)
            if grad_norm > max_norm:
                param_grad = param_grad * max_norm / grad_norm

        cache[param_name] += param_grad ** 2
        delta = lr * param_grad / (np.sqrt(cache[param_name]) + eps)
        return param - delta


class RMSProp(OptimizerBase):
    def __init__(
        self, lr=0.001, decay=0.9, eps=1e-7, clip_norm=None, lr_scheduler=None, **kwargs
    ):
        """
        The RMSProp optimizer.

        Notes
        -----
        RMSProp refines :class:`AdaGrad` by replacing the ever-growing sum of
        squared gradients with an exponentially *decaying average* of them,
        which keeps the effective learning rate from shrinking monotonically.

        Equations::

            cache[t] = decay * cache[t-1] + (1 - decay) * grad[t] ** 2
            update[t] = lr * grad[t] / (np.sqrt(cache[t]) + eps)
            param[t+1] = param[t] - update[t]

        where ``**`` and ``/`` act elementwise.

        Parameters
        ----------
        lr : float
            Learning rate for update. Default is 0.001.
        decay : float in [0, 1]
            Decay rate of the moving average of squared gradients. Typical
            values are [0.9, 0.99, 0.999]. Default is 0.9.
        eps : float
            Constant added to the denominator of the update to avoid division
            by zero. Default is 1e-7.
        clip_norm : float or None
            If not None, gradients whose l2 norm exceeds `clip_norm` are
            rescaled to have norm `clip_norm` before the update is computed.
            Default is None.
        lr_scheduler : str or :doc:`Scheduler <numpy_ml.neural_nets.schedulers>` object or None
            The learning rate scheduler. If None, use a constant learning
            rate equal to `lr`. Default is None.
        """
        super().__init__(lr, lr_scheduler)

        self.cache = {}
        self.hyperparameters = dict(
            [
                ("id", "RMSProp"),
                ("lr", lr),
                ("eps", eps),
                ("decay", decay),
                ("clip_norm", clip_norm),
                ("lr_scheduler", str(self.lr_scheduler)),
            ]
        )

    def __str__(self):
        """Return a constructor-like summary of the hyperparameters."""
        hp = self.hyperparameters
        template = "RMSProp(lr={}, eps={}, decay={}, clip_norm={}, lr_scheduler={})"
        return template.format(
            hp["lr"], hp["eps"], hp["decay"], hp["clip_norm"], hp["lr_scheduler"]
        )

    def update(self, param, param_grad, param_name, cur_loss=None):
        """
        Apply one RMSProp step to a single parameter.

        Parameters
        ----------
        param : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            The current value of the parameter.
        param_grad : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            The gradient of the loss with respect to `param_name`.
        param_name : str
            The name of the parameter; used as the key of its cache entry.
        cur_loss : float or None
            The training or validation loss for the current minibatch,
            forwarded to the learning rate scheduler (used e.g., by
            :class:`~numpy_ml.neural_nets.schedulers.KingScheduler`).
            Default is None.

        Returns
        -------
        updated_params : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            The value of `param` after the RMSProp step.
        """
        cache = self.cache
        hp = self.hyperparameters
        eps = hp["eps"]
        decay = hp["decay"]
        max_norm = hp["clip_norm"]
        lr = self.lr_scheduler(self.cur_step, cur_loss)

        # decaying average of squared gradients for this parameter
        if param_name not in cache:
            cache[param_name] = np.zeros_like(param_grad)

        # rescale overly large gradients to norm `max_norm`
        if max_norm is not None:
            grad_norm = norm(param_grad)
            if grad_norm > max_norm:
                param_grad = param_grad * max_norm / grad_norm

        sq_avg = decay * cache[param_name] + (1 - decay) * param_grad ** 2
        cache[param_name] = sq_avg
        delta = lr * param_grad / (np.sqrt(sq_avg) + eps)
        return param - delta


class Adam(OptimizerBase):
    def __init__(
        self,
        lr=0.001,
        decay1=0.9,
        decay2=0.999,
        eps=1e-7,
        clip_norm=None,
        lr_scheduler=None,
        **kwargs
    ):
        """
        The Adam (adaptive moment estimation) optimizer.

        Notes
        -----
        Adam aims to combine the strengths of :class:`AdaGrad` (sparse
        gradients) and :class:`RMSProp` (online and non-stationary settings)
        by tracking bias-corrected running estimates of the first and second
        moments of the gradient.

        Parameters
        ----------
        lr : float
            Learning rate for update. This parameter is ignored if using
            :class:`~numpy_ml.neural_nets.schedulers.NoamScheduler`.
            Default is 0.001.
        decay1 : float
            Decay rate of the running estimate of the first moment (mean) of
            the gradient. Default is 0.9.
        decay2 : float
            Decay rate of the running estimate of the second moment
            (variance) of the gradient. Default is 0.999.
        eps : float
            Constant added to the denominator of the update to avoid division
            by zero. Default is 1e-7.
        clip_norm : float
            If not None, gradients whose l2 norm exceeds `clip_norm` are
            rescaled to have norm `clip_norm` before the update is computed.
            Default is None.
        lr_scheduler : str, or :doc:`Scheduler <numpy_ml.neural_nets.schedulers>` object, or None
            The learning rate scheduler. If None, use a constant learning rate
            equal to `lr`. Default is None.
        """
        super().__init__(lr, lr_scheduler)

        self.cache = {}
        self.hyperparameters = dict(
            [
                ("id", "Adam"),
                ("lr", lr),
                ("eps", eps),
                ("decay1", decay1),
                ("decay2", decay2),
                ("clip_norm", clip_norm),
                ("lr_scheduler", str(self.lr_scheduler)),
            ]
        )

    def __str__(self):
        """Return a constructor-like summary of the hyperparameters."""
        hp = self.hyperparameters
        fields = ("lr", "decay1", "decay2", "eps", "clip_norm", "lr_scheduler")
        return "Adam(lr={}, decay1={}, decay2={}, eps={}, clip_norm={}, lr_scheduler={})".format(
            *[hp[name] for name in fields]
        )

    def update(self, param, param_grad, param_name, cur_loss=None):
        """
        Apply one Adam step to a single parameter.

        Parameters
        ----------
        param : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            The current value of the parameter.
        param_grad : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            The gradient of the loss with respect to `param_name`.
        param_name : str
            The name of the parameter; used as the key of its cache entry.
        cur_loss : float
            The training or validation loss for the current minibatch,
            forwarded to the learning rate scheduler (used e.g., by
            :class:`~numpy_ml.neural_nets.schedulers.KingScheduler`).
            Default is None.

        Returns
        -------
        updated_params : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            The value of `param` after the Adam step.
        """
        cache = self.cache
        hp = self.hyperparameters
        beta1, beta2 = hp["decay1"], hp["decay2"]
        eps = hp["eps"]
        max_norm = hp["clip_norm"]
        lr = self.lr_scheduler(self.cur_step, cur_loss)

        # per-parameter state: update count plus running moment estimates
        if param_name not in cache:
            cache[param_name] = {
                "t": 0,
                "mean": np.zeros_like(param_grad),
                "var": np.zeros_like(param_grad),
            }

        # rescale overly large gradients to norm `max_norm`
        if max_norm is not None:
            grad_norm = norm(param_grad)
            if grad_norm > max_norm:
                param_grad = param_grad * max_norm / grad_norm

        state = cache[param_name]
        n_updates = state["t"] + 1
        state["t"] = n_updates
        state["var"] = beta2 * state["var"] + (1 - beta2) * param_grad ** 2
        state["mean"] = beta1 * state["mean"] + (1 - beta1) * param_grad

        # correct the bias toward zero caused by the zero initialization
        second_moment = state["var"] / (1 - beta2 ** n_updates)
        first_moment = state["mean"] / (1 - beta1 ** n_updates)
        delta = lr * first_moment / (np.sqrt(second_moment) + eps)
        return param - delta


================================================
FILE: numpy_ml/neural_nets/schedulers/README.md
================================================
# Learning Rate Schedulers
The `schedulers` module implements several common strategies for learning rate
decay:

- Constant
- Exponential decay
- Noam/Transformer decay ([Vaswani et al., 2017](https://arxiv.org/pdf/1706.03762.pdf))
- Davis King/Dlib decay ([King, 2018](http://blog.dlib.net/2018/02/automatic-learning-rate-scheduling-that.html))

## Plots
<p align="center">
<img src="img/plot.png" align='center' height="550" />
</p>


================================================
FILE: numpy_ml/neural_nets/schedulers/__init__.py
================================================
from .schedulers import *


================================================
FILE: numpy_ml/neural_nets/schedulers/schedulers.py
================================================
from copy import deepcopy
from abc import ABC, abstractmethod

import numpy as np

from math import erf


def gaussian_cdf(x, mean, var):
    """
    Evaluate the CDF of a 1D Gaussian at `x`.

    Parameters
    ----------
    x : float
        The point at which to evaluate the CDF.
    mean : float
        The mean of the Gaussian.
    var : float
        The variance of the Gaussian. Machine epsilon is added to it before
        the square root is taken, so a variance of zero does not cause a
        division by zero.

    Returns
    -------
    p : float
        The probability that a draw from the Gaussian is less than or equal
        to `x`.
    """
    std = np.sqrt(var + np.finfo(float).eps)
    z_score = (x - mean) / std
    return (1 + erf(z_score / np.sqrt(2))) / 2


class SchedulerBase(ABC):
    def __init__(self):
        """Abstract base class for all Scheduler objects."""
        self.hyperparameters = {}

    def __call__(self, step=None, cur_loss=None):
        """Shorthand for :meth:`learning_rate`."""
        return self.learning_rate(step=step, cur_loss=cur_loss)

    def copy(self):
        """Return a copy of the current object."""
        return deepcopy(self)

    def set_params(self, hparam_dict):
        """
        Set the scheduler hyperparameters from a dictionary.

        Keys that are not already present in `self.hyperparameters` are
        ignored. When the scheduler also stores a hyperparameter as an
        instance attribute of the same name, that attribute is updated too, so
        the new value is actually used by :meth:`learning_rate`.

        Parameters
        ----------
        hparam_dict : dict or None
            New hyperparameter values, keyed by name. If None, nothing is
            changed.
        """
        if hparam_dict is None:
            return

        for k, v in hparam_dict.items():
            if k not in self.hyperparameters:
                continue
            self.hyperparameters[k] = v
            # keep the attribute mirror of the hyperparameter in sync;
            # entries without one (e.g. "id") only live in the dict
            if hasattr(self, k):
                setattr(self, k, v)

    @abstractmethod
    def learning_rate(self, step=None, cur_loss=None):
        """Return the learning rate; implemented by subclasses.

        `__call__` passes both `step` and `cur_loss` as keyword arguments.
        """
        raise NotImplementedError


class ConstantScheduler(SchedulerBase):
    def __init__(self, lr=0.01, **kwargs):
        """
        A scheduler that always returns the same learning rate.

        Parameters
        ----------
        lr : float
            The learning rate. Default is 0.01
        """
        super().__init__()
        self.lr = lr
        self.hyperparameters = dict([("id", "ConstantScheduler"), ("lr", lr)])

    def __str__(self):
        """Return a constructor-like summary of the scheduler."""
        template = "ConstantScheduler(lr={})"
        return template.format(self.lr)

    def learning_rate(self, **kwargs):
        """
        Return the fixed learning rate; all keyword arguments are ignored.

        Returns
        -------
        lr : float
            The learning rate
        """
        fixed_lr = self.lr
        return fixed_lr


class ExponentialScheduler(SchedulerBase):
    def __init__(
        self, initial_lr=0.01, stage_length=500, staircase=False, decay=0.1, **kwargs
    ):
        """
        An exponential learning rate scheduler.

        Notes
        -----
        The exponential scheduler decays the learning rate by `decay` every
        `stage_length` steps, starting from `initial_lr`::

            learning_rate = initial_lr * decay ** curr_stage

        where::

            curr_stage = step / stage_length          if staircase = False
            curr_stage = floor(step / stage_length)   if staircase = True

        Parameters
        ----------
        initial_lr : float
            The learning rate at the first step. Default is 0.01.
        stage_length : int
            The length of each stage, in steps. Default is 500.
        staircase : bool
            If True, only adjusts the learning rate at the stage transitions,
            producing a step-like decay schedule. If False, adjusts the
            learning rate after each step, creating a smooth decay schedule.
            Default is False.
        decay : float
            The amount to decay the learning rate at each new stage. Default is
            0.1.
        """
        super().__init__()
        self.decay = decay
        self.staircase = staircase
        self.initial_lr = initial_lr
        self.stage_length = stage_length
        self.hyperparameters = {
            # the id matches the class name, as for every other scheduler and
            # optimizer (it previously read "StepScheduler")
            "id": "ExponentialScheduler",
            "decay": self.decay,
            "staircase": self.staircase,
            "initial_lr": self.initial_lr,
            "stage_length": self.stage_length,
        }

    def __str__(self):
        """Return a constructor-like summary of the scheduler."""
        return "ExponentialScheduler(initial_lr={}, stage_length={}, staircase={}, decay={})".format(
            self.initial_lr, self.stage_length, self.staircase, self.decay
        )

    def learning_rate(self, step, **kwargs):
        """
        Return the current learning rate as a function of `step`.

        Parameters
        ----------
        step : int
            The current step number.

        Returns
        -------
        lr : float
            The learning rate for the current step.
        """
        cur_stage = step / self.stage_length
        if self.staircase:
            # hold the rate constant within a stage
            cur_stage = np.floor(cur_stage)
        return self.initial_lr * self.decay ** cur_stage


class NoamScheduler(SchedulerBase):
    def __init__(self, model_dim=512, scale_factor=1, warmup_steps=4000, **kwargs):
        """
        The Noam learning rate scheduler, originally used in conjunction with
        the Adam optimizer in [1].

        Notes
        -----
        The Noam scheduler increases the learning rate linearly for the first
        `warmup_steps` steps, and decreases it thereafter proportionally to the
        inverse square root of the step number::

            lr = scale_factor * ( (model_dim ** (-0.5)) * adj_step )
            adj_step = min(step_num ** (-0.5), step_num * warmup_steps ** (-1.5))

        References
        ----------
        .. [1] Vaswani et al. (2017) "Attention is all you need". *31st
           Conference on Neural Information Processing Systems*,
           https://arxiv.org/pdf/1706.03762.pdf

        Parameters
        ----------
        model_dim : int
            The number of units in the layer output. Default is 512.
        scale_factor : float
            A fixed coefficient for rescaling the final learning rate. Default
            is 1.
        warmup_steps : int
            The number of steps in the warmup stage of training. Default is
            4000.
        """
        super().__init__()
        self.model_dim = model_dim
        self.scale_factor = scale_factor
        self.warmup_steps = warmup_steps
        self.hyperparameters = {
            "id": "NoamScheduler",
            "model_dim": self.model_dim,
            "scale_factor": self.scale_factor,
            "warmup_steps": self.warmup_steps,
        }

    def __str__(self):
        """Return a one-line summary of the scheduler's configuration."""
        return "NoamScheduler(model_dim={}, scale_factor={}, warmup_steps={})".format(
            self.model_dim, self.scale_factor, self.warmup_steps
        )

    def learning_rate(self, step, **kwargs):
        """
        Return the learning rate for training step `step`.

        Parameters
        ----------
        step : int
            The current step number. A value of 0 yields a learning rate of
            0, the value of the linear warmup ramp at that step.

        Returns
        -------
        lr : float
            ``scale_factor * model_dim ** (-0.5) * min(step ** (-0.5),
            step * warmup_steps ** (-1.5))``.
        """
        warmup, d_model = self.warmup_steps, self.model_dim
        if step == 0:
            # ``0 ** (-0.5)`` raises ZeroDivisionError for Python numbers. The
            # warmup term ``step * warmup ** (-1.5)`` is 0 at this step, so the
            # minimum of the two terms is 0.
            adj_step = 0.0
        else:
            adj_step = min(step ** (-0.5), step * warmup ** (-1.5))
        new_lr = d_model ** (-0.5) * adj_step
        return self.scale_factor * new_lr


class KingScheduler(SchedulerBase):
    def __init__(self, initial_lr=0.01, patience=1000, decay=0.99, **kwargs):
        """
        The Davis King / DLib learning rate scheduler.

        Notes
        -----
        The KingScheduler computes the probability that the slope of the OLS
        fit to the loss history is negative. If the probability that it is
        negative is less than 51% over the last `patience` steps, the scheduler
        exponentially decreases the current learning rate by `decay`.

        References
        ----------
        .. [1] King, D. (2018). "Automatic learning rate scheduling that really
           works". http://blog.dlib.net/2018/02/automatic-learning-rate-scheduling-that.html

        Parameters
        ----------
        initial_lr : float
            The learning rate to begin at. Default is 0.01.
        patience : int
            Amount of time to maintain the current learning rate without a
            decrease in loss before adjustment. Default is 1000.
        decay : float
            The amount to decay the learning rate at each new stage. Default is
            0.99.
        """
        super().__init__()
        self.decay = decay
        self.patience = patience
        self.initial_lr = initial_lr
        self.current_lr = initial_lr
        # number of most-recent loss values retained in `loss_history`
        self.max_history = np.ceil(1.1 * (patience + 1)).astype(int)

        self.loss_history = []
        self.hyperparameters = {
            "id": "KingScheduler",
            "decay": self.decay,
            "patience": self.patience,
            "initial_lr": self.initial_lr,
        }

    def __str__(self):
        """Return a one-line summary of the scheduler's configuration."""
        return "KingScheduler(initial_lr={}, patience={}, decay={})".format(
            self.initial_lr, self.patience, self.decay
        )

    def _steps_without_decrease(self, robust=False, check_all=False):
        """
        Returns the maximum number of timesteps for which `P(loss is decreasing)
        < 0.51`.

        Parameters
        ----------
        robust : bool
            If `robust=True`, first filter out the largest 10% of the loss
            values to remove transient spikes in the loss due to, e.g., a few
            bad minibatches. The returned count is still expressed in
            timesteps of the unfiltered history, so it is directly comparable
            to ``self.patience``. Default is False.
        check_all : bool
            If True, test every starting point in the history and return the
            maximum number of timesteps for which P(loss is decreasing) <
            0.51. If False, only test the window covering the most recent
            ``self.patience + 1`` timesteps. The former provides more
            information but is significantly more computationally expensive.
            Default is False.

        Returns
        -------
        steps_without_decrease: int
            The maximum number of steps back in loss_history for which P(loss
            is decreasing) < 0.51, or 0 if no tested window satisfies this.
        """
        lh = np.array(self.loss_history)
        n_steps = len(lh)

        # timestep (position in the unfiltered history) of each retained loss
        timesteps = np.arange(n_steps)

        # drop top 10% of loss values to filter out large loss spikes
        if robust:
            keep = lh <= np.quantile(lh, 0.9)
            lh, timesteps = lh[keep], timesteps[keep]

        n_kept = len(lh)
        steps_without_decrease = 0
        if check_all:
            # every window tested here contains at least 3 retained points
            for i in reversed(range(n_kept - 2)):
                if self._p_decreasing(lh, i) < 0.51:
                    steps_without_decrease = int(n_steps - timesteps[i])
        else:
            start = max(0, n_steps - self.patience - 1)
            # index of the first retained loss recorded at or after `start`
            i = int(np.searchsorted(timesteps, start))
            # the slope-variance estimate divides by (n_points - 2), so at
            # least 3 points are needed to say anything about the trend
            if n_kept - i >= 3 and self._p_decreasing(lh, i) < 0.51:
                steps_without_decrease = n_steps - start
        return steps_without_decrease

    def _p_decreasing(self, loss_history, i):
        """
        Compute the probability that the slope of the OLS fit to the loss
        history is negative.

        Parameters
        ----------
        loss_history : numpy array of shape (N,)
            The sequence of loss values for the previous `N` minibatches.
        i : int
            Compute P(Slope < 0) beginning at index i in `loss_history`. At
            least 3 values must remain from index `i` onwards.

        Returns
        -------
        p_decreasing : float
            The probability that the slope of the OLS fit to loss_history is
            less than or equal to 0.
        """
        loss = loss_history[i:]
        N = len(loss)

        # perform OLS on the loss entries to calc the slope mean
        X = np.c_[np.ones(N), np.arange(i, len(loss_history))]
        intercept, s_mean = np.linalg.inv(X.T @ X) @ X.T @ loss
        loss_pred = s_mean * X[:, 1] + intercept

        # compute the variance of our loss predictions and use this to compute
        # the (unbiased) estimate of the slope variance
        loss_var = 1 / (N - 2) * np.sum((loss - loss_pred) ** 2)
        s_var = (12 * loss_var) / (N ** 3 - N)

        # compute the probability that a random sample from a Gaussian
        # parameterized by s_mean and s_var is less than or equal to 0
        p_decreasing = gaussian_cdf(0, s_mean, s_var)
        return p_decreasing

    def learning_rate(self, step, cur_loss):
        """
        Compute the updated learning rate for the current step and loss.

        Parameters
        ----------
        step : int
            The current step number. Unused.
        cur_loss : float
            The loss at the current step.

        Returns
        -------
        lr : float
            The learning rate for the current step.

        Raises
        ------
        ValueError
            If `cur_loss` is None.
        """
        if cur_loss is None:
            raise ValueError("cur_loss must be a float, but got {}".format(cur_loss))

        # this happens if we initialize the scheduler from a string / dict
        if not hasattr(self, "max_history"):
            self.max_history = np.ceil(1.1 * (self.patience + 1)).astype(int)
        patience, max_history = self.patience, self.max_history

        self.loss_history.append(cur_loss)
        if len(self.loss_history) < patience:
            return self.current_lr
        # only keep the `max_history` most recent loss values
        self.loss_history = self.loss_history[-max_history:]

        # if the loss has not decreased for `patience` timesteps, drop the
        # learning rate; the spike-filtered check must agree so that a few
        # outlier losses cannot trigger a decay on their own
        if (
            self._steps_without_decrease() > patience
            and self._steps_without_decrease(robust=True) > patience
        ):
            self.current_lr *= self.decay

        return self.current_lr


================================================
FILE: numpy_ml/neural_nets/utils/README.md
================================================
# Utilities

The `utils.py` module implements common, neural network-specific helper
functions, primarily for dealing with CNNs. It includes:

- `im2col`
- `col2im`
- `conv1D`
- `conv2D`
- `dilate`
- `deconv2D`
- `minibatch`
- Various weight initialization utilities
- Various padding and convolution arithmetic utilities


================================================
FILE: numpy_ml/neural_nets/utils/__init__.py
================================================
"""
Common neural network-specific helper functions.

The ``neural_nets.utils`` module contains neural network-specific helper
functions, primarily for dealing with CNNs.
"""

from .utils import *


================================================
FILE: numpy_ml/neural_nets/utils/utils.py
================================================
import numpy as np

#######################################################################
#                           Training Utils                            #
#######################################################################


def minibatch(X, batchsize=256, shuffle=True):
    """
    Split the example indices of a training dataset into minibatches.

    Parameters
    ----------
    X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, \\*)`
        The dataset to divide into minibatches. Only its first dimension,
        taken to be the number of training examples, is used.
    batchsize : int
        The desired size of each minibatch. If ``X.shape[0] % batchsize > 0``
        the final batch contains fewer than `batchsize` entries. Default is
        256.
    shuffle : bool
        Whether to shuffle the example indices (in place, via
        ``np.random.shuffle``) before dividing them into minibatches. Default
        is True.

    Returns
    -------
    mb_generator : generator
        A generator which yields the indices into `X` for each batch.
    n_batches: int
        The number of batches.
    """
    n_examples = X.shape[0]
    n_batches = int(np.ceil(n_examples / batchsize))

    order = np.arange(n_examples)
    if shuffle:
        np.random.shuffle(order)

    # lazily slice consecutive runs of `batchsize` indices out of `order`
    batches = (
        order[b * batchsize : (b + 1) * batchsize] for b in range(n_batches)
    )
    return batches, n_batches


#######################################################################
#                            Padding Utils                            #
#######################################################################


def calc_pad_dims_2D(X_shape, out_dim, kernel_shape, stride, dilation=0):
    """
    Work out how much zero-padding a 2D convolution needs so that its output
    has dimensions `out_dim`.

    Parameters
    ----------
    X_shape : tuple of `(n_ex, in_rows, in_cols, in_ch)`
        Dimensions of the input volume. Padding is applied to `in_rows` and
        `in_cols`.
    out_dim : tuple of `(out_rows, out_cols)`
        The desired dimension of an output example after applying the
        convolution.
    kernel_shape : 2-tuple
        The dimension of the 2D convolution kernel.
    stride : int
        The stride for the convolution kernel.
    dilation : int
        Number of pixels inserted between kernel elements. Default is 0.

    Returns
    -------
    padding_dims : 4-tuple
        Padding dims for `X`, organized as (rows before, rows after, columns
        before, columns after). When the required padding along an axis is
        odd, the extra pixel goes on the "after" side.

    Raises
    ------
    ValueError
        If an argument has the wrong type or any computed padding is negative.
    AssertionError
        If no padding of this form produces the requested output size.
    """
    expected_types = (
        ("X_shape", X_shape, tuple),
        ("out_dim", out_dim, tuple),
        ("kernel_shape", kernel_shape, tuple),
        ("stride", stride, int),
    )
    for arg_name, arg, arg_type in expected_types:
        if not isinstance(arg, arg_type):
            raise ValueError(
                "`{}` must be of type {}".format(arg_name, arg_type.__name__)
            )

    fr, fc = kernel_shape
    out_rows, out_cols = out_dim
    n_ex, in_rows, in_cols, in_ch = X_shape

    def pad_along(n_in, n_out, f):
        # effective filter extent once the dilation gaps are included
        extent = f * (dilation + 1) - dilation
        before = int((stride * (n_out - 1) + extent - n_in) / 2)

        # output size obtained with symmetric padding of `before` per side
        n_out_sym = int(1 + (n_in + 2 * before - extent) / stride)
        if n_out_sym == n_out - 1:
            # one short: add a single asymmetric pixel on the "after" side
            return before, before + 1
        if n_out_sym != n_out:
            raise AssertionError
        return before, before

    pr1, pr2 = pad_along(in_rows, out_rows, fr)
    pc1, pc2 = pad_along(in_cols, out_cols, fc)

    padding = (pr1, pr2, pc1, pc2)
    if any(np.array(padding) < 0):
        raise ValueError("Padding cannot be less than 0. Got: {}".format(padding))
    return padding


def calc_pad_dims_1D(X_shape, l_out, kernel_width, stride, dilation=0, causal=False):
    """
    Work out how much zero-padding a 1D convolution with a kernel of width
    `kernel_width` and stride `stride` needs so that its output has length
    `l_out`.

    Parameters
    ----------
    X_shape : tuple of `(n_ex, l_in, in_ch)`
        Dimensions of the input volume. Padding is applied on either side of
        `l_in`.
    l_out : int
        The desired length an output example after applying the convolution.
    kernel_width : int
        The width of the 1D convolution kernel.
    stride : int
        The stride for the convolution kernel.
    dilation : int
        Number of pixels inserted between kernel elements. Default is 0.
    causal : bool
        Whether to compute the padding dims for a regular or causal
        convolution. If causal, padding is added only to the left side of the
        sequence. Default is False.

    Returns
    -------
    padding_dims : 2-tuple
        Padding dims for X. Organized as (left, right)

    Raises
    ------
    ValueError
        If an argument has the wrong type or a computed padding is negative.
    AssertionError
        If the computed padding does not produce an output of length `l_out`.
    """
    expected_types = (
        ("X_shape", X_shape, tuple),
        ("l_out", l_out, int),
        ("kernel_width", kernel_width, int),
        ("stride", stride, int),
    )
    for arg_name, arg, arg_type in expected_types:
        if not isinstance(arg, arg_type):
            raise ValueError(
                "`{}` must be of type {}".format(arg_name, arg_type.__name__)
            )

    n_ex, l_in, in_ch = X_shape

    # effective filter width once the dilation gaps are included
    extent = kernel_width * (dilation + 1) - dilation
    total_pad = int(stride * (l_out - 1) + extent - l_in)

    if causal:
        # a causal convolution only pads the left side of the sequence
        left, right = total_pad, 0
        l_out1 = int(1 + (l_in + total_pad - extent) / stride)
        assert l_out1 == l_out
    else:
        left = right = total_pad // 2
        l_out1 = int(1 + (l_in + 2 * left - extent) / stride)

        # one short: add a single asymmetric padding pixel on the right
        if l_out1 == l_out - 1:
            right += 1
        elif l_out1 != l_out:
            raise AssertionError

    if any(np.array([left, right]) < 0):
        raise ValueError("Padding cannot be less than 0. Got: {}".format((left, right)))
    return (left, right)


def pad1D(X, pad, kernel_width=None, stride=None, dilation=0):
    """
    Zero-pad a 3D input volume `X` along the second dimension.

    Parameters
    ----------
    X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, l_in, in_ch)`
        Input volume. Padding is applied to `l_in`.
    pad : tuple, int, or {'same', 'causal'}
        The padding amount. If 'same', add padding to ensure that the output
        length of a 1D convolution with a kernel of width `kernel_width` and
        stride `stride` is the same as the input length.  If 'causal' compute
        padding such that the output both has the same length as the input AND
        ``output[t]`` does not depend on ``input[t + 1:]``. If 2-tuple,
        specifies the number of padding columns to add on each side of the
        sequence. If int, that many columns are added on both sides.
    kernel_width : int
        The width of the 1D convolution kernel. Required if pad='same' or
        'causal', ignored otherwise. Default is None.
    stride : int
        The stride for the convolution kernel. Required if pad='same' or
        'causal', ignored otherwise. Default is None.
    dilation : int
        The dilation of the convolution kernel. Only relevant if pad='same' or
        'causal'. Default is 0.

    Returns
    -------
    X_pad : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, padded_seq, in_channels)`
        The padded output volume
    p : 2-tuple
        The number of 0-padded columns added to the (left, right) of the sequences
        in `X`.

    Raises
    ------
    ValueError
        If `pad` is 'same' or 'causal' but `kernel_width` or `stride` is not
        given, or if `pad` is not an int, a tuple, 'same', or 'causal'.
    """
    p = pad
    if isinstance(p, int):
        p = (p, p)

    if isinstance(p, tuple):
        X_pad = np.pad(
            X,
            pad_width=((0, 0), (p[0], p[1]), (0, 0)),
            mode="constant",
            constant_values=0,
        )
        return X_pad, p

    # compute the correct padding dims for a 'same' or 'causal' convolution
    if isinstance(p, str) and p in ("same", "causal"):
        if not (kernel_width and stride):
            raise ValueError(
                "`kernel_width` and `stride` must be specified when "
                "pad='{}'".format(p)
            )
        p = calc_pad_dims_1D(
            X.shape,
            X.shape[1],
            kernel_width,
            stride,
            causal=(p == "causal"),
            dilation=dilation,
        )
        return pad1D(X, p)

    # fail loudly instead of falling through with nothing to return
    raise ValueError("Unrecognized padding: {!r}".format(pad))


def pad2D(X, pad, kernel_shape=None, stride=None, dilation=0):
    """
    Zero-pad a 4D input volume `X` along the second and third dimensions.

    Parameters
    ----------
    X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
        Input volume. Padding is applied to `in_rows` and `in_cols`.
    pad : tuple, int, or 'same'
        The padding amount. If 'same', add padding to ensure that the output of
        a 2D convolution with a kernel of `kernel_shape` and stride `stride`
        has the same dimensions as the input.  If 2-tuple, specifies the number
        of padding rows and colums to add *on both sides* of the rows/columns
        in `X`. If 4-tuple, specifies the number of rows/columns to add to the
        top, bottom, left, and right of the input volume. If int, that amount
        is added on all four sides.
    kernel_shape : 2-tuple
        The dimension of the 2D convolution kernel. Required if pad='same',
        ignored otherwise. Default is None.
    stride : int
        The stride for the convolution kernel. Required if pad='same', ignored
        otherwise. Default is None.
    dilation : int
        The dilation of the convolution kernel. Only relevant if pad='same'.
        Default is 0.

    Returns
    -------
    X_pad : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, padded_in_rows, padded_in_cols, in_channels)`
        The padded output volume.
    p : 4-tuple
        The number of 0-padded rows added to the (top, bottom, left, right) of
        `X`.

    Raises
    ------
    ValueError
        If `pad` is 'same' but `kernel_shape` or `stride` is not given, or if
        `pad` is not an int, a tuple, or 'same'.
    """
    p = pad
    if isinstance(p, int):
        p = (p, p, p, p)

    if isinstance(p, tuple):
        # expand (rows, cols) into (top, bottom, left, right)
        if len(p) == 2:
            p = (p[0], p[0], p[1], p[1])

        X_pad = np.pad(
            X,
            pad_width=((0, 0), (p[0], p[1]), (p[2], p[3]), (0, 0)),
            mode="constant",
            constant_values=0,
        )
        return X_pad, p

    # compute the correct padding dims for a 'same' convolution
    if isinstance(p, str) and p == "same":
        if not kernel_shape or stride is None:
            raise ValueError(
                "`kernel_shape` and `stride` must be specified when pad='same'"
            )
        p = calc_pad_dims_2D(
            X.shape, X.shape[1:3], kernel_shape, stride, dilation=dilation
        )
        return pad2D(X, p)

    # fail loudly instead of falling through with nothing to return
    raise ValueError("Unrecognized padding: {!r}".format(pad))


def dilate(X, d):
    """
    Spread out the rows and columns of the 4D volume `X` by inserting `d`
    all-zero rows/columns between each adjacent pair.

    Notes
    -----
    For a visual depiction of a dilated convolution, see [1].

    References
    ----------
    .. [1] Dumoulin & Visin (2016). "A guide to convolution arithmetic for deep
       learning." https://arxiv.org/pdf/1603.07285v1.pdf

    Parameters
    ----------
    X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
        Input volume.
    d : int
        The number of 0-rows to insert between each adjacent row + column in `X`.

    Returns
    -------
    Xd : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, out_rows, out_cols, in_ch)`
        The dilated array where

        .. math::

            \\text{out_rows}  &=  \\text{in_rows} + d(\\text{in_rows} - 1) \\\\
            \\text{out_cols}  &=  \\text{in_cols} + d (\\text{in_cols} - 1)
    """
    _, n_rows, n_cols, _ = X.shape

    # np.insert places a new slice *before* each listed index, so listing
    # every interior index `d` times puts `d` zero slices in each gap
    row_gaps = np.arange(1, n_rows).repeat(d)
    col_gaps = np.arange(1, n_cols).repeat(d)

    rows_dilated = np.insert(X, row_gaps, 0, axis=1)
    return np.insert(rows_dilated, col_gaps, 0, axis=2)


#######################################################################
#                     Convolution Arithmetic                          #
#######################################################################


def calc_fan(weight_shape):
    """
    Return the fan-in and fan-out implied by the shape of a weight
    matrix/volume.

    Parameters
    ----------
    weight_shape : tuple
        The dimensions of the weight matrix/volume. Must have 2, 3, or 4
        entries, the final 2 of which are `in_ch`, `out_ch`.

    Returns
    -------
    fan_in : int
        The number of input units in the weight tensor
    fan_out : int
        The number of output units in the weight tensor

    Raises
    ------
    ValueError
        If `weight_shape` does not have 2, 3, or 4 entries.
    """
    n_dims = len(weight_shape)

    if n_dims == 2:
        fan_in, fan_out = weight_shape
        return fan_in, fan_out

    if n_dims not in (3, 4):
        raise ValueError("Unrecognized weight dimension: {}".format(weight_shape))

    # leading entries are the kernel's spatial dims; each channel sees the
    # product of them
    in_ch, out_ch = weight_shape[-2:]
    receptive_field = np.prod(weight_shape[:-2])
    return in_ch * receptive_field, out_ch * receptive_field


def calc_conv_out_dims(X_shape, W_shape, stride=1, pad=0, dilation=0):
    """
    Compute the dimension of the output volume for the specified convolution.

    Parameters
    ----------
    X_shape : 3-tuple or 4-tuple
        The dimensions of the input volume to the convolution. If 3-tuple,
        entries are expected to be (`n_ex`, `in_length`, `in_ch`). If 4-tuple,
        entries are expected to be (`n_ex`, `in_rows`, `in_cols`, `in_ch`).
    W_shape : 3-tuple or 4-tuple
        The dimensions of the weight volume for the convolution. If 3-tuple,
        entries are expected to be (`f_len`, `in_ch`, `out_ch`). If 4-tuple,
        entries are expected to be (`fr`, `fc`, `in_ch`, `out_ch`).
    stride : int
        The stride for the convolution kernel. Default is 1.
    pad : tuple, int, or {'same', 'causal'}
        The padding amount, interpreted by :func:`pad1D` for a 3D input and by
        :func:`pad2D` for a 4D input. String values are resolved to explicit
        padding amounts using the kernel size, `stride`, and `dilation`.
        Default is 0.
    dilation : int
        The dilation of the convolution kernel. Default is 0.

    Returns
    -------
    out_dims : 3-tuple or 4-tuple
        The dimensions of the output volume. If 3-tuple, entries are (`n_ex`,
        `out_length`, `out_ch`). If 4-tuple, entries are (`n_ex`, `out_rows`,
        `out_cols`, `out_ch`).

    Raises
    ------
    ValueError
        If `X_shape` does not have 3 or 4 entries.
    """
    # stand-in input volume; the padding helpers only need something of the
    # right shape to resolve `pad` into explicit amounts
    dummy = np.zeros(X_shape)
    s, p, d = stride, pad, dilation
    if len(X_shape) == 3:
        fw, in_ch, out_ch = W_shape
        n_ex, in_length, in_ch = X_shape

        # the kernel width, stride and dilation are needed to resolve
        # 'same' / 'causal' padding
        _, p = pad1D(dummy, p, fw, s, dilation=d)
        pw1, pw2 = p

        # adjust effective filter size to account for dilation
        _fw = fw * (d + 1) - d
        out_length = (in_length + pw1 + pw2 - _fw) // s + 1
        out_dims = (n_ex, out_length, out_ch)

    elif len(X_shape) == 4:
        fr, fc, in_ch, out_ch = W_shape
        n_ex, in_rows, in_cols, in_ch = X_shape

        # the kernel shape, stride and dilation are needed to resolve 'same'
        # padding
        _, p = pad2D(dummy, p, (fr, fc), s, dilation=d)
        pr1, pr2, pc1, pc2 = p

        # adjust effective filter size to account for dilation
        _fr, _fc = fr * (d + 1) - d, fc * (d + 1) - d
        out_rows = (in_rows + pr1 + pr2 - _fr) // s + 1
        out_cols = (in_cols + pc1 + pc2 - _fc) // s + 1
        out_dims = (n_ex, out_rows, out_cols, out_ch)
    else:
        raise ValueError("Unrecognized number of input dims: {}".format(len(X_shape)))
    return out_dims


#######################################################################
#                   Convolution Vectorization Utils                   #
#######################################################################


def _im2col_indices(X_shape, fr, fc, p, s, d=0):
    """
    Helper function that computes indices into X in prep for columnization in
    :func:`im2col`.

    Code extended from Andrej Karpathy's `im2col.py`
    """
    pr1, pr2, pc1, pc2 = p
    n_ex, n_in, in_rows, in_cols = X_shape

    # adjust effective filter size to account for dilation
    _fr, _fc = fr * (d + 1) - d, fc * (d + 1) - d

    out_rows = (in_rows + pr1 + pr2 - _fr) // s + 1
    out_cols = (in_cols + pc1 + pc2 - _fc) // s + 1

    if any([out_rows <= 0, out_cols <= 0]):
        raise ValueError(
            "Dimension mismatch during convolution: "
            "out_rows = {}, out_cols = {}".format(out_rows, out_cols)
        )

    # i1/j1 : row/col templates
    # i0/j0 : n. copies (len) and offsets (values) for row/col templates
    i0 = np.repeat(np.arange(fr), fc)
    i0 = np.tile(i0, n_in) * (d + 1)
    i1 = s * np.repeat(np.arange(out_rows), out_cols)
    j0 = np.tile(np.arange(fc), fr * n_in) * (d + 1)
    j1 = s * np.tile(np.arange(out_cols), out_rows)

    # i.shape = (fr * fc * n_in, out_height * out_width)
    # j.shape = (fr * fc * n_in, out_height * out_width)
    # k.shape = (fr * fc * n_in, 1)
    i = i0.reshape(-1, 1) + i1.reshape(1, -1)
    j = j0.reshape(-1, 1) + j1.reshape(1, -1)
    k = np.repeat(np.arange(n_in), fr * fc).reshape(-1, 1)
    return k, i, j


def im2col(X, W_shape, pad, stride, dilation=0):
    """
    Zero-pad the input volume and lay out each (overlapping) kernel window as
    a column of a 2D matrix `X_col`.

    Notes
    -----
    A NumPy reimagining of MATLAB's ``im2col`` 'sliding' function.

    Code extended from Andrej Karpathy's ``im2col.py``.

    Parameters
    ----------
    X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
        Input volume (not padded).
    W_shape: 4-tuple containing `(kernel_rows, kernel_cols, in_ch, out_ch)`
        The dimensions of the weights/kernels in the present convolutional
        layer.
    pad : tuple, int, or 'same'
        The padding amount, passed to :func:`pad2D`. If 'same', add padding to
        ensure that the output of a 2D convolution with a kernel of
        `kernel_shape` and stride `stride` produces an output volume of the
        same dimensions as the input.  If 2-tuple, specifies the number of
        padding rows and colums to add *on both sides* of the rows/columns in
        X. If 4-tuple, specifies the number of rows/columns to add to the top,
        bottom, left, and right of the input volume.
    stride : int
        The stride of each convolution kernel
    dilation : int
        Number of pixels inserted between kernel elements. Default is 0.

    Returns
    -------
    X_col : :py:class:`ndarray <numpy.ndarray>` of shape (Q, Z)
        The reshaped input volume where:

        .. math::

            Q  &=  \\text{kernel_rows} \\times \\text{kernel_cols} \\times \\text{n_in} \\\\
            Z  &=  \\text{n_ex} \\times \\text{out_rows} \\times \\text{out_cols}
    p : 4-tuple
        The padding applied to `X`, as returned by :func:`pad2D`.
    """
    fr, fc, _, _ = W_shape
    # the channel count is taken from the input itself
    n_ex, in_rows, in_cols, n_in = X.shape

    # zero-pad the input, resolving `pad` into explicit amounts
    X_pad, p = pad2D(X, pad, W_shape[:2], stride=stride, dilation=dilation)

    # the gather indices assume channels come right after the batch dim
    channels_first = X_pad.transpose(0, 3, 1, 2)
    k, i, j = _im2col_indices(
        (n_ex, n_in, in_rows, in_cols), fr, fc, p, stride, dilation
    )

    # windows has shape (n_ex, fr * fc * n_in, n_output_positions)
    windows = channels_first[:, k, i, j]
    X_col = windows.transpose(1, 2, 0).reshape(fr * fc * n_in, -1)
    return X_col, p


def col2im(X_col, X_shape, W_shape, pad, stride, dilation=0):
    """
    Scatter the columns of a 2D matrix back into the blocks/windows of a 4D
    image volume, summing entries that land on the same pixel.

    Notes
    -----
    A NumPy reimagining of MATLAB's ``col2im`` 'sliding' function.

    Code extended from Andrej Karpathy's ``im2col.py``.

    Parameters
    ----------
    X_col : :py:class:`ndarray <numpy.ndarray>` of shape `(Q, Z)`
        The columnized version of `X` (assumed to include padding)
    X_shape : 4-tuple containing `(n_ex, in_rows, in_cols, in_ch)`
        The original dimensions of `X` (not including padding)
    W_shape: 4-tuple containing `(kernel_rows, kernel_cols, in_ch, out_ch)`
        The dimensions of the weights in the present convolutional layer
    pad : 4-tuple
        Number of zero-padding rows/cols that were added to `X`. The first two
        entries apply to the rows and the last two to the columns.
    stride : int
        The stride of each convolution kernel
    dilation : int
        Number of pixels inserted between kernel elements. Default is 0.

    Returns
    -------
    img : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_ch, in_rows, in_cols)`
        The reshaped `X_col` input matrix with the padding removed. Note that
        the channel dimension comes second.

    Raises
    ------
    TypeError
        If `pad` is not a 4-tuple.
    """
    if not isinstance(pad, tuple) or len(pad) != 4:
        raise TypeError("pad must be a 4-tuple, but got: {}".format(pad))

    pr1, pr2, pc1, pc2 = pad
    fr, fc, _, _ = W_shape
    n_ex, in_rows, in_cols, n_in = X_shape

    # accumulate into a padded, channels-first canvas
    canvas_shape = (n_ex, n_in, in_rows + pr1 + pr2, in_cols + pc1 + pc2)
    X_pad = np.zeros(canvas_shape)
    k, i, j = _im2col_indices(
        (n_ex, n_in, in_rows, in_cols), fr, fc, pad, stride, dilation
    )

    # (Q, Z) -> (n_ex, Q, n_output_positions) to line up with the indices
    windows = X_col.reshape(n_in * fr * fc, -1, n_ex).transpose(2, 0, 1)

    # unbuffered add, so overlapping windows sum rather than overwrite
    np.add.at(X_pad, (slice(None), k, i, j), windows)

    # strip the padding; a stop of -0 would produce an empty slice
    row_stop = -pr2 if pr2 != 0 else None
    col_stop = -pc2 if pc2 != 0 else None
    return X_pad[:, :, pr1:row_stop, pc1:col_stop]


#######################################################################
#                             Convolution                             #
#######################################################################


def conv2D(X, W, stride, pad, dilation=0):
    """
    A faster (but more memory intensive) implementation of the 2D "convolution"
    (technically, cross-correlation) of input `X` with a collection of kernels in
    `W`.

    Notes
    -----
    Relies on the :func:`im2col` function to perform the convolution as a single
    matrix multiplication.

    For a helpful diagram, see Pete Warden's 2015 blogpost [1].

    References
    ----------
    .. [1] Warden (2015). "Why GEMM is at the heart of deep learning,"
       https://petewarden.com/2015/04/20/why-gemm-is-at-the-heart-of-deep-learning/

    Parameters
    ----------
    X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
        Input volume (unpadded).
    W: :py:class:`ndarray <numpy.ndarray>` of shape `(kernel_rows, kernel_cols, in_ch, out_ch)`
        A volume of convolution weights/kernels for a given layer.
    stride : int
        The stride of each convolution kernel.
    pad : tuple, int, or 'same'
        The padding amount. If 'same', add padding to ensure that the output of
        a 2D convolution with a kernel of `kernel_shape` and stride `stride`
        produces an output volume of the same dimensions as the input.  If
        2-tuple, specifies the number of padding rows and colums to add *on both
        sides* of the rows/columns in `X`. If 4-tuple, specifies the number of
        rows/columns to add to the top, bottom, left, and right of the input
        volume.
    dilation : int
        Number of pixels inserted between kernel elements. Default is 0.

    Returns
    -------
    Z : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, out_rows, out_cols, out_ch)`
        The convolution of `X` with `W`.
    """
    s, d = stride, dilation
    fr, fc, in_ch, out_ch = W.shape
    n_ex, in_rows, in_cols, in_ch = X.shape

    # im2col pads X (resolving `pad` into explicit amounts) and reports the
    # padding it applied, so the input only needs to be padded once
    X_col, p = im2col(X, W.shape, pad, s, d)
    pr1, pr2, pc1, pc2 = p

    # update effective filter shape based on dilation factor
    _fr, _fc = fr * (d + 1) - d, fc * (d + 1) - d

    # compute the dimensions of the convolution output
    out_rows = int((in_rows + pr1 + pr2 - _fr) / s + 1)
    out_cols = int((in_cols + pc1 + pc2 - _fc) / s + 1)

    # flatten each kernel into a row ordered (in_ch, kernel_rows, kernel_cols)
    # to match the row ordering of X_col, then take the product
    W_col = W.transpose(3, 2, 0, 1).reshape(out_ch, -1)

    Z = (W_col @ X_col).reshape(out_ch, out_rows, out_cols, n_ex).transpose(3, 1, 2, 0)

    return Z


def conv1D(X, W, stride, pad, dilation=0):
    """
    Vectorized 1D "convolution" (strictly speaking, a cross-correlation) of
    the input `X` with the bank of kernels in `W`. Faster than a loop-based
    implementation, at the price of a larger memory footprint.

    Notes
    -----
    The 1D problem is lifted to 2D by giving both `X` and `W` a dummy row
    axis of length 1 and delegating to :func:`conv2D`, which in turn relies
    on :func:`im2col` to express the convolution as a single matrix
    multiplication.

    For a helpful diagram, see Pete Warden's 2015 blogpost [1].

    References
    ----------
    .. [1] Warden (2015). "Why GEMM is at the heart of deep learning,"
       https://petewarden.com/2015/04/20/why-gemm-is-at-the-heart-of-deep-learning/

    Parameters
    ----------
    X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, l_in, in_ch)`
        Input volume (unpadded)
    W: :py:class:`ndarray <numpy.ndarray>` of shape `(kernel_width, in_ch, out_ch)`
        A volume of convolution weights/kernels for a given layer
    stride : int
        The stride of each convolution kernel
    pad : tuple, int, or 'same'
        The padding amount. If 'same', add padding to ensure that the output of
        a 1D convolution with a kernel of `kernel_shape` and stride `stride`
        produces an output volume of the same dimensions as the input.  If
        2-tuple, specifies the number of padding columns to add *on both sides*
        of the columns in X.
    dilation : int
        Number of pixels inserted between kernel elements. Default is 0.

    Returns
    -------
    Z : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, l_out, out_ch)`
        The convolution of X with W.
    """
    # only the resolved padding amounts are needed here; the padded copy of
    # X returned alongside them is discarded
    _unused, col_pad = pad1D(X, pad, W.shape[0], stride, dilation=dilation)

    # no padding along the dummy row axis, resolved padding along the columns
    pad_2d = (0, 0, col_pad[0], col_pad[1])

    # lift both operands to 2D by inserting a length-1 row axis
    out_2d = conv2D(
        np.expand_dims(X, axis=1),
        np.expand_dims(W, axis=0),
        stride,
        pad_2d,
        dilation,
    )

    # remove the dummy row axis again
    return np.squeeze(out_2d, axis=1)


def deconv2D_naive(X, W, stride, pad, dilation=0):
    """
    Perform a "deconvolution" (more accurately, a transposed convolution) of an
    input volume `X` with a weight kernel `W`, incorporating stride, pad, and
    dilation.

    Notes
    -----
    Rather than using the transpose of the convolution matrix, this approach
    uses a direct convolution with zero padding, which, while conceptually
    straightforward, is computationally inefficient.

    The computation proceeds in four steps: (1) for strides greater than 1 the
    input is passed through :func:`dilate` and the stride is reset to 1,
    (2) the input is padded twice, first with the requested `pad` and then
    with the amount returned by :func:`calc_pad_dims_2D` for the target output
    size, (3) a regular :func:`conv2D` is run with the kernel rotated by 180
    degrees in its first two axes, and (4) the requested padding is cropped
    from the borders of the result.

    For further explanation, see [1].

    References
    ----------
    .. [1] Dumoulin & Visin (2016). "A guide to convolution arithmetic for deep
       learning." https://arxiv.org/pdf/1603.07285v1.pdf

    Parameters
    ----------
    X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
        Input volume (not padded)
    W: :py:class:`ndarray <numpy.ndarray>` of shape `(kernel_rows, kernel_cols, in_ch, out_ch)`
        A volume of convolution weights/kernels for a given layer
    stride : int
        The stride of each convolution kernel
    pad : tuple, int, or 'same'
        The padding amount. If 'same', add padding to ensure that the output of
        a 2D convolution with a kernel of `kernel_shape` and stride `stride`
        produces an output volume of the same dimensions as the input.  If
        2-tuple, specifies the number of padding rows and columns to add *on both
        sides* of the rows/columns in `X`. If 4-tuple, specifies the number of
        rows/columns to add to the top, bottom, left, and right of the input
        volume.
    dilation : int
        Number of pixels inserted between kernel elements. Default is 0.

    Returns
    -------
    Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, out_rows, out_cols, n_out)`
        The deconvolution of (padded) input volume `X` with `W` using stride `s` and
        dilation `d`.
    """
    # a stride > 1 is handled by dilating the input by `stride - 1` and then
    # running everything below with a unit stride
    # (presumably `dilate` inserts zeros between input elements -- confirm
    # against its definition)
    if stride > 1:
        X = dilate(X, stride - 1)
        stride = 1

    # pad the input
    X_pad, p = pad2D(X, pad, W.shape[:2], stride=stride, dilation=dilation)

    # NB: `in_rows` / `in_cols` are the dimensions of the *padded* input
    n_ex, in_rows, in_cols, n_in = X_pad.shape
    fr, fc, n_in, n_out = W.shape
    s, d = stride, dilation
    pr1, pr2, pc1, pc2 = p

    # update effective filter shape based on dilation factor
    _fr, _fc = fr * (d + 1) - d, fc * (d + 1) - d

    # compute deconvolution output dims
    out_rows = s * (in_rows - 1) - pr1 - pr2 + _fr
    out_cols = s * (in_cols - 1) - pc1 - pc2 + _fc
    out_dim = (out_rows, out_cols)

    # add additional padding to achieve the target output dim
    # (the second value returned by this `pad2D` call rebinds `pad` but is
    # not read again below)
    _p = calc_pad_dims_2D(X_pad.shape, out_dim, W.shape[:2], s, d)
    X_pad, pad = pad2D(X_pad, _p, W.shape[:2], stride=s, dilation=dilation)

    # perform the forward convolution using the flipped weight matrix (note
    # we set pad to 0, since we've already added padding)
    Z = conv2D(X_pad, np.rot90(W, 2), s, 0, d)

    # crop the originally requested padding from the output borders; a
    # trailing pad of 0 must become `None`, since a slice end of `-0` would
    # select an empty range
    pr2 = None if pr2 == 0 else -pr2
    pc2 = None if pc2 == 0 else -pc2
    return Z[:, pr1:pr2, pc1:pc2, :]


def conv2D_naive(X, W, stride, pad, dilation=0):
    """
    Reference implementation of a 2D "convolution" (technically, a
    cross-correlation) of input `X` with a collection of kernels `W`. Slow,
    but easy to follow.

    Notes
    -----
    Every output element is computed with explicit ``for`` loops and direct
    indexing into the padded input. As a result, this function is slower than
    the vectorized :func:`conv2D` function that relies on the :func:`col2im`
    and :func:`im2col` transformations.

    Parameters
    ----------
    X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
        Input volume.
    W: :py:class:`ndarray <numpy.ndarray>` of shape `(kernel_rows, kernel_cols, in_ch, out_ch)`
        The volume of convolution weights/kernels.
    stride : int
        The stride of each convolution kernel.
    pad : tuple, int, or 'same'
        The padding amount. If 'same', add padding to ensure that the output of
        a 2D convolution with a kernel of `kernel_shape` and stride `stride`
        produces an output volume of the same dimensions as the input.  If
        2-tuple, specifies the number of padding rows and columns to add *on both
        sides* of the rows/columns in `X`. If 4-tuple, specifies the number of
        rows/columns to add to the top, bottom, left, and right of the input
        volume.
    dilation : int
        Number of pixels inserted between kernel elements. Default is 0.

    Returns
    -------
    Z : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, out_rows, out_cols, out_ch)`
        The convolution of `X` with `W`.
    """
    X_padded, pad_dims = pad2D(X, pad, W.shape[:2], stride=stride, dilation=dilation)
    pad_top, pad_bottom, pad_left, pad_right = pad_dims

    k_rows, k_cols, _, n_filters = W.shape
    n_samples, x_rows, x_cols, _ = X.shape

    # spatial extent covered by the kernel once the dilation gaps are included
    span_r = k_rows * (dilation + 1) - dilation
    span_c = k_cols * (dilation + 1) - dilation

    n_out_r = int((x_rows + pad_top + pad_bottom - span_r) / stride + 1)
    n_out_c = int((x_cols + pad_left + pad_right - span_c) / stride + 1)

    # distance between consecutive kernel taps in the padded input
    tap_step = dilation + 1

    Z = np.zeros((n_samples, n_out_r, n_out_c, n_filters))
    for m in range(n_samples):
        for i in range(n_out_r):
            row_sel = slice(i * stride, (i * stride) + span_r, tap_step)
            for j in range(n_out_c):
                col_sel = slice(j * stride, (j * stride) + span_c, tap_step)

                # the receptive field is shared by all output channels
                window = X_padded[m, row_sel, col_sel, :]
                for c in range(n_filters):
                    Z[m, i, j, c] = np.sum(window * W[:, :, :, c])
    return Z


#######################################################################
#                        Weight Initialization                        #
#######################################################################


def he_uniform(weight_shape):
    r"""
    Initialize network weights `W` using the He uniform initialization
    strategy.

    Notes
    -----
    The He uniform initialization strategy initializes the weights in `W`
    using draws from Uniform(-b, b) where

    .. math::

        b = \sqrt{\frac{6}{\text{fan_in}}}

    Developed for deep networks with ReLU nonlinearities.

    Parameters
    ----------
    weight_shape : tuple
        The dimensions of the weight matrix/volume.

    Returns
    -------
    W : :py:class:`ndarray <numpy.ndarray>` of shape `weight_shape`
        The initialized weights.
    """
    # NB: the docstring is a raw string so that LaTeX commands such as
    # `\sqrt` are not parsed as (invalid) string escape sequences

    # only the fan-in enters the He bound; the fan-out is not needed
    fan_in, _ = calc_fan(weight_shape)
    b = np.sqrt(6 / fan_in)
    return np.random.uniform(-b, b, size=weight_shape)


def he_normal(weight_shape):
    r"""
    Draw initial network weights `W` according to the He normal
    initialization strategy.

    Notes
    -----
    Under He normal initialization, each weight in `W` is a draw from
    TruncatedNormal(0, b), where the variance `b` is

    .. math::

        b = \frac{2}{\text{fan_in}}

    The scheme was originally proposed for deep networks with
    :class:`~numpy_ml.neural_nets.activations.ReLU` nonlinearities.

    Parameters
    ----------
    weight_shape : tuple
        The dimensions of the weight matrix/volume.

    Returns
    -------
    W : :py:class:`ndarray <numpy.ndarray>` of shape `weight_shape`
        The initialized weights.
    """
    n_in, _n_out = calc_fan(weight_shape)

    # the sampler is parameterized by a standard deviation, i.e. sqrt(b)
    return truncated_normal(0, np.sqrt(2 / n_in), weight_shape)


def glorot_uniform(weight_shape, gain=1.0):
    r"""
    Initialize network weights `W` using the Glorot uniform initialization
    strategy.

    Notes
    -----
    The Glorot uniform initialization strategy initializes weights using draws
    from ``Uniform(-b, b)`` where:

    .. math::

        b = \text{gain} \sqrt{\frac{6}{\text{fan_in} + \text{fan_out}}}

    The motivation for Glorot uniform initialization is to choose weights to
    ensure that the variance of the layer outputs are approximately equal to
    the variance of its inputs.

    This initialization strategy was primarily developed for deep networks with
    tanh and logistic sigmoid nonlinearities.

    Parameters
    ----------
    weight_shape : tuple
        The dimensions of the weight matrix/volume.
    gain : float
        Multiplicative factor applied to the bound `b`. Default is 1.0.

    Returns
    -------
    W : :py:class:`ndarray <numpy.ndarray>` of shape `weight_shape`
        The initialized weights.
    """
    # NB: the docstring is a raw string so that LaTeX commands such as
    # `\sqrt` are not parsed as (invalid) string escape sequences
    fan_in, fan_out = calc_fan(weight_shape)
    b = gain * np.sqrt(6 / (fan_in + fan_out))
    return np.random.uniform(-b, b, size=weight_shape)


def glorot_normal(weight_shape, gain=1.0):
    r"""
    Draw initial network weights `W` according to the Glorot normal
    initialization strategy.

    Notes
    -----
    Under Glorot normal initialization, each weight is a draw from
    TruncatedNormal(0, b), where the variance `b` is

    .. math::

        b = \frac{2 \text{gain}^2}{\text{fan_in} + \text{fan_out}}

    The idea behind the scheme is to pick weights such that the variance of
    a layer's outputs is approximately equal to the variance of its inputs.

    This initialization strategy was primarily developed for deep networks with
    :class:`~numpy_ml.neural_nets.activations.Tanh` and
    :class:`~numpy_ml.neural_nets.activations.Sigmoid` nonlinearities.

    Parameters
    ----------
    weight_shape : tuple
        The dimensions of the weight matrix/volume.
    gain : float
        Multiplicative factor applied to the standard deviation of the
        sampling distribution. Default is 1.0.

    Returns
    -------
    W : :py:class:`ndarray <numpy.ndarray>` of shape `weight_shape`
        The initialized weights.
    """
    n_in, n_out = calc_fan(weight_shape)

    # the sampler is parameterized by a standard deviation, i.e. sqrt(b)
    sigma = gain * np.sqrt(2 / (n_in + n_out))
    return truncated_normal(0, sigma, weight_shape)


def truncated_normal(mean, std, out_shape):
    """
    Generate draws from a truncated normal distribution via rejection sampling.

    Notes
    -----
    The rejection sampling regimen draws samples from a normal distribution
    with mean `mean` and standard deviation `std`, and resamples any values
    more than two standard deviations from `mean`. Values lying exactly two
    standard deviations from `mean` are kept, so a degenerate distribution
    with ``std == 0`` returns `mean` rather than resampling forever.

    Parameters
    ----------
    mean : float or array_like of floats
        The mean/center of the distribution. Array-valued means must be
        broadcastable to `out_shape`.
    std : float or array_like of floats
        Standard deviation (spread or "width") of the distribution.
        Array-valued standard deviations must be broadcastable to
        `out_shape`.
    out_shape : int or tuple of ints
        Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
        ``m * n * k`` samples are drawn.

    Returns
    -------
    samples : :py:class:`ndarray <numpy.ndarray>` of shape `out_shape`
        Samples from the truncated normal distribution parameterized by `mean`
        and `std`.
    """
    samples = np.random.normal(loc=mean, scale=std, size=out_shape)

    # Expand the parameters to one value per sample so that each rejected
    # entry is redrawn from its own distribution. Drawing `reject.sum()`
    # values directly from array-valued `mean` / `std` fails to broadcast.
    loc = np.broadcast_to(mean, np.shape(samples))
    scale = np.broadcast_to(std, np.shape(samples))

    def _out_of_bounds(x):
        # strict inequalities: with `>=` / `<=` a zero standard deviation
        # rejects every draw (all of which equal `mean`) and never terminates
        return np.logical_or(x > loc + 2 * scale, x < loc - 2 * scale)

    reject = _out_of_bounds(samples)
    while reject.any():
        samples[reject] = np.random.normal(loc=loc[reject], scale=scale[reject])
        reject = _out_of_bounds(samples)
    return samples


================================================
FILE: numpy_ml/neural_nets/wrappers/README.md
================================================
# Wrappers

The `wrappers.py` module implements wrappers for the layers in `layers.py`. It
includes
- Dropout ([Srivastava, et al., 2014](http://www.jmlr.org/papers/volume15/srivastava14a/srivastava14a.pdf))


================================================
FILE: numpy_ml/neural_nets/wrappers/__init__.py
================================================
from .wrappers import *


================================================
FILE: numpy_ml/neural_nets/wrappers/wrappers.py
================================================
"""
A collection of objects that can wrap / otherwise modify arbitrary neural
network layers.
"""

from abc import ABC, abstractmethod

import numpy as np


class WrapperBase(ABC):
    def __init__(self, wrapped_layer):
        """
        An abstract base class for all Wrapper instances.

        Parameters
        ----------
        wrapped_layer : layer or wrapper instance
            The object to wrap. If it exposes a `_base_layer` attribute (as
            wrappers do), that attribute is used as the base layer of the new
            wrapper; otherwise `wrapped_layer` itself is the base layer.
        """
        self._base_layer = getattr(wrapped_layer, "_base_layer", wrapped_layer)
        super().__init__()

    @staticmethod
    def _append_once(entries, entry):
        """
        Return a *new* list holding the items of `entries` (which may be
        None) followed by `entry`, unless that very object is already listed.
        """
        merged = list(entries) if entries else []
        # identity, not equality: two distinct wrappers may legitimately have
        # equal hyperparameter dictionaries
        if not any(item is entry for item in merged):
            merged.append(entry)
        return merged

    @abstractmethod
    def _init_wrapper_params(self):
        """
        Create the `_wrapper_hyperparameters` and `_wrapper_derived_variables`
        dictionaries. Overwritten by inherited class.
        """
        raise NotImplementedError

    @abstractmethod
    def forward(self, z, **kwargs):
        """Overwritten by inherited class"""
        raise NotImplementedError

    @abstractmethod
    def backward(self, out, **kwargs):
        """Overwritten by inherited class"""
        raise NotImplementedError

    @property
    def trainable(self):
        """Whether the base layer is trainable (i.e., not frozen)"""
        return self._base_layer.trainable

    @property
    def parameters(self):
        """A dictionary of the base layer parameters"""
        return self._base_layer.parameters

    @property
    def hyperparameters(self):
        """
        A dictionary of the base layer's hyperparameters, with the wrapper
        hyperparameters listed under the ``"wrappers"`` key.

        A new dictionary is built on each access. The base layer's own
        dictionary is never modified, so reading this property repeatedly
        does not accumulate duplicate wrapper entries.
        """
        hp = dict(self._base_layer.hyperparameters)
        hp["wrappers"] = self._append_once(
            hp.get("wrappers"), self._wrapper_hyperparameters
        )
        return hp

    @property
    def derived_variables(self):
        """
        A dictionary of the intermediate values computed during layer
        training, with the wrapper's derived variables listed under the
        ``"wrappers"`` key.

        As with :attr:`hyperparameters`, the base layer's dictionary (and any
        list stored in it) is left untouched.
        """
        dv = self._base_layer.derived_variables.copy()
        dv["wrappers"] = self._append_once(
            dv.get("wrappers"), self._wrapper_derived_variables
        )
        return dv

    @property
    def gradients(self):
        """A dictionary of the current layer parameter gradients."""
        return self._base_layer.gradients

    @property
    def act_fn(self):
        """The activation function for the base layer."""
        return self._base_layer.act_fn

    @property
    def X(self):
        """The collection of layer inputs."""
        return self._base_layer.X

    def _init_params(self):
        """
        Register the wrapper hyperparameters under the ``"wrappers"`` key of
        the base layer's hyperparameter dictionary (at most once).
        """
        hp = self._wrapper_hyperparameters
        registered = self._base_layer.hyperparameters.setdefault("wrappers", [])
        if not any(item is hp for item in registered):
            registered.append(hp)

    def freeze(self):
        """
        Freeze the base layer's parameters at their current values so they can
        no longer be updated.
        """
        self._base_layer.freeze()

    def unfreeze(self):
        """Unfreeze the base layer's parameters so they can be updated."""
        self._base_layer.unfreeze()

    def flush_gradients(self):
        """Erase all the wrapper and base layer's derived variables and gradients."""
        assert self.trainable, "Layer is frozen"
        self._base_layer.flush_gradients()

        # only values of existing keys are replaced, so mutating the dict
        # while iterating over it is safe
        for key in self._wrapper_derived_variables:
            self._wrapper_derived_variables[key] = []

    def update(self, lr):
        """
        Update the base layer's parameters using the accrued gradients and
        layer optimizer. Flush all gradients once the update is complete.

        Parameters
        ----------
        lr : float
            The learning rate, forwarded to the base layer's `update` method.
        """
        assert self.trainable, "Layer is frozen"
        self._base_layer.update(lr)
        self.flush_gradients()

    def _set_wrapper_params(self, pdict):
        """
        Overwrite the wrapper hyperparameters with the matching entries of
        `pdict`; keys unknown to the wrapper are ignored. Returns `self`.
        """
        for k, v in pdict.items():
            if k in self._wrapper_hyperparameters:
                self._wrapper_hyperparameters[k] = v
        return self

    def set_params(self, summary_dict):
        """
        Set the base layer parameters from a dictionary of values.

        Parameters
        ----------
        summary_dict : dict
            A dictionary of layer parameters and hyperparameters. If a required
            parameter or hyperparameter is not included within `summary_dict`,
            this method will use the value in the current layer's
            :meth:`summary` method.

        Returns
        -------
        layer : :doc:`Layer <numpy_ml.neural_nets.layers>` object
            The newly-initialized layer, as returned by the base layer's
            `set_params` method.
        """
        return self._base_layer.set_params(summary_dict)

    def summary(self):
        """Return a dict of the layer parameters, hyperparameters, and ID."""
        hp = self.hyperparameters
        return {
            "layer": hp["layer"],
            "layer_wrappers": [w["wrapper"] for w in hp["wrappers"]],
            "parameters": self.parameters,
            "hyperparameters": hp,
        }


class Dropout(WrapperBase):
    def __init__(self, wrapped_layer, p):
        """
        A dropout regularization wrapper.

        Notes
        -----
        During training, a dropout layer zeroes each element of the layer input
        with probability `p` and scales the activation by `1 / (1 - p)` (to reflect
        the fact that on average only `(1 - p) * N` units are active on any
        training pass). At test time, does not adjust elements of the input at
        all (ie., simply computes the identity function).

        "Training" vs. "test" time is determined by whether the base layer is
        trainable (see :meth:`forward`).

        Parameters
        ----------
        wrapped_layer : :doc:`Layer <numpy_ml.neural_nets.layers>` instance
            The layer to apply dropout to.
        p : float in [0, 1)
            The dropout probability during training
        """
        super().__init__(wrapped_layer)
        self.p = p
        self._init_wrapper_params()
        self._init_params()

    def _init_wrapper_params(self):
        """Create the wrapper's derived-variable and hyperparameter dicts."""
        self._wrapper_derived_variables = {"dropout_mask": []}
        self._wrapper_hyperparameters = {"wrapper": "Dropout", "p": self.p}

    def _set_wrapper_params(self, pdict):
        """
        Overwrite the wrapper hyperparameters with the matching entries of
        `pdict` and keep `self.p` in sync with the stored ``"p"`` value.
        Returns `self`.
        """
        super()._set_wrapper_params(pdict)
        # `forward` / `backward` read `self.p`, not the hyperparameter dict,
        # so a changed probability has to be mirrored onto the attribute
        self.p = self._wrapper_hyperparameters["p"]
        return self

    def forward(self, X, retain_derived=True):
        """
        Compute the layer output with dropout for a single minibatch.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)`
            Layer input, representing the `n_in`-dimensional features for a
            minibatch of `n_ex` examples.
        retain_derived : bool
            Whether to retain the variables calculated during the forward pass
            for use later during backprop. If False, this suggests the layer
            will not be expected to backprop through wrt. this input. Default
            is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_out)`
            Layer output for each of the `n_ex` examples.
        """
        # identity behaviour (keep everything, no rescaling) unless the base
        # layer is trainable
        scaler, mask = 1.0, np.ones(X.shape).astype(bool)
        if self.trainable:
            scaler = 1.0 / (1.0 - self.p)
            # keep each input element with probability 1 - p
            mask = np.random.rand(*X.shape) >= self.p
            X = mask * X

        if retain_derived:
            self._wrapper_derived_variables["dropout_mask"].append(mask)

        return scaler * self._base_layer.forward(X, retain_derived)

    def backward(self, dLdy, retain_grads=True):
        """
        Backprop from the base layer's outputs to inputs.

        Parameters
        ----------
        dLdy : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_out)` or list of arrays
            The gradient(s) of the loss wrt. the layer output(s). The
            argument is not modified.
        retain_grads : bool
            Whether to include the intermediate parameter gradients computed
            during the backward pass in the final parameter update. Default is
            True.

        Returns
        -------
        dLdX : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)` or list of arrays
            The gradient of the loss wrt. the layer input(s) `X`, as returned
            by the base layer.
        """  # noqa: E501
        assert self.trainable, "Layer is frozen"
        scaler = 1.0 / (1.0 - self.p)

        # Scale into fresh arrays: an in-place `*=` would silently modify the
        # caller's gradient and raises a TypeError for a list of arrays.
        if isinstance(dLdy, (list, tuple)):
            dLdy = [grad * scaler for grad in dLdy]
        else:
            dLdy = dLdy * scaler

        # NOTE(review): the stored dropout masks are not applied to the
        # gradient returned by the base layer -- confirm whether callers
        # expect the gradient wrt. the unmasked input.
        return self._base_layer.backward(dLdy, retain_grads)


def init_wrappers(layer, wrappers_list):
    """
    Initialize the layer wrappers in `wrappers_list` and return a wrapped
    `layer` object.

    Parameters
    ----------
    layer : :doc:`Layer <numpy_ml.neural_nets.layers>` instance
        The base layer object to apply the wrappers to.
    wrappers_list : list of dicts
        A list of parameter dictionaries for the wrapper objects. Each
        dictionary must contain a ``"wrapper"`` key naming the wrapper type.
        The wrappers are initialized and applied to the layer sequentially.

    Returns
    -------
    wrapped_layer : :class:`WrapperBase` instance
        The wrapped layer object. If `wrappers_list` is empty, `layer` is
        returned unchanged.

    Raises
    ------
    NotImplementedError
        If a dictionary names a wrapper type other than ``"Dropout"``.
    """
    for wr in wrappers_list:
        if wr["wrapper"] == "Dropout":
            # Build the wrapper with the requested dropout probability.
            # `_set_wrapper_params` only rewrites the hyperparameter dict, so
            # constructing with a placeholder would leave the instance's `p`
            # attribute at that placeholder. The historical placeholder of 1
            # is kept only as a fallback for dicts that carry no "p" entry.
            p = wr["p"] if "p" in wr else 1
            layer = Dropout(layer, p)._set_wrapper_params(wr)
        else:
            raise NotImplementedError("{}".format(wr["wrapper"]))
    return layer


================================================
FILE: numpy_ml/ngram/README.md
================================================
# N-Gram Sequence Models
The `ngram.py` module implements [n-gram models](https://en.wikipedia.org/wiki/N-gram) with different smoothing techniques:

- Maximum likelihood (no smoothing)
- [Additive smoothing](https://en.wikipedia.org/wiki/Additive_smoothing) (incl.
  Laplace smoothing, expected likelihood estimation, etc.)
- Simple [Good-Turing smoothing](https://en.wikipedia.org/wiki/Good%E2%80%93Turing_frequency_estimation) ([Gale, 1995](https://www.csie.ntu.edu.tw/~b92b02053/print/good-turing-smoothing-without.pdf))

## Plots
<p align="center">
  <img src="img/rank_probs.png" height="500" />
</p>
<p align="center">
  <img src="img/add_smooth.png" height="550" />
</p>


================================================
FILE: numpy_ml/ngram/__init__.py
================================================
from .ngram import *


================================================
FILE: numpy_ml/ngram/ngram.py
================================================
"""A module for different N-gram smoothing models"""
import textwrap
from abc import ABC, abstractmethod
from collections import Counter

import numpy as np

from numpy_ml.linear_models import LinearRegression
from numpy_ml.preprocessing.nlp import tokenize_words, ngrams, strip_punctuation


class NGramBase(ABC):
    def __init__(self, N, unk=True, filter_stopwords=True, filter_punctuation=True):
        """
        A simple word-level N-gram language model.

        Notes
        -----
        This is not optimized code and will be slow for large corpora. To see
        how industry-scale NGram models are handled, see the SRILM-format:

            http://www.speech.sri.com/projects/srilm/

        Parameters
        ----------
        N : int
            The maximum gram-size. Counts are compiled for all n-grams from
            1, ..., `N`.
        unk : bool
            Forwarded to the vocabulary filter during training to control
            the handling of out-of-vocabulary words. Default is True.
        filter_stopwords : bool
            Whether to remove stopwords before training. Default is True.
        filter_punctuation : bool
            Whether to remove punctuation before training. Default is True.
        """
        self.N = N
        self.unk = unk
        self.filter_stopwords = filter_stopwords
        self.filter_punctuation = filter_punctuation

        self.hyperparameters = {
            "N": N,
            "unk": unk,
            "filter_stopwords": filter_stopwords,
            "filter_punctuation": filter_punctuation,
        }

        super().__init__()

    def train(self, corpus_fp, vocab=None, encoding=None):
        """
        Compile the n-gram counts for the text(s) in `corpus_fp`.

        Notes
        -----
        After running `train`, the ``self.counts`` attribute will store
        dictionaries of the `N`, `N-1`, ..., 1-gram counts.

        Parameters
        ----------
        corpus_fp : str
            The path to a newline-separated text corpus file.
        vocab : :class:`~numpy_ml.preprocessing.nlp.Vocabulary` instance or None
            If not None, only the words in `vocab` will be used to construct
            the language model; all out-of-vocabulary words will either be
            mapped to ``<unk>`` (if ``self.unk = True``) or removed (if
            ``self.unk = False``). Default is None.
        encoding : str or None
            Specifies the text encoding for corpus. Common entries are 'utf-8',
            'utf-8-sig', 'utf-16'. Default is None.
        """
        return self._train(corpus_fp, vocab=vocab, encoding=encoding)

    def _train(self, corpus_fp, vocab=None, encoding=None):
        """Actual N-gram training logic"""
        H = self.hyperparameters
        gram_sizes = range(1, self.N + 1)
        counts = {N: Counter() for N in gram_sizes}
        filter_stop, filter_punc = H["filter_stopwords"], H["filter_punctuation"]

        _n_words = 0
        tokens = {"<unk>"}
        bol, eol = ["<bol>"], ["<eol>"]

        with open(corpus_fp, "r", encoding=encoding) as text:
            for line in text:
                line = strip_punctuation(line) if filter_punc else line
                words = tokenize_words(line, filter_stopwords=filter_stop)

                if vocab is not None:
                    words = vocab.filter(words, H["unk"])

                if len(words) == 0:
                    continue

                _n_words += len(words)
                tokens.update(words)

                # calculate n, n-1, ... 1-grams; the counters are updated line
                # by line so the full list of n-grams is never held in memory
                for N in gram_sizes:
                    n_pad = max(1, N - 1)
                    words_padded = bol * n_pad + words + eol * n_pad
                    counts[N].update(ngrams(words_padded, N))

        n_words = {N: np.sum(list(counts[N].values())) for N in gram_sizes}
        # the unigram total counts corpus words only, not <bol> / <eol> padding
        n_words[1] = _n_words

        n_tokens = {N: len(counts[N]) for N in range(2, self.N + 1)}
        n_tokens[1] = len(vocab) if vocab is not None else len(tokens)

        self.counts = counts
        self.n_words = n_words
        self.n_tokens = n_tokens

        # drop count-of-count values cached for a previously trained corpus
        # (see `_num_grams_with_count`)
        self._NC = {N: {} for N in gram_sizes}

    def completions(self, words, N):
        """
        Return the distribution over proposed next words under the `N`-gram
        language model.

        Parameters
        ----------
        words : list or tuple of strings
            The initial sequence of words
        N : int
            The gram-size of the language model to use to generate completions.
            If `words` holds fewer than `N - 1` items, the gram-size is reduced
            to ``len(words) + 1``.

        Returns
        -------
        probs : list of (word, log_prob) tuples
            The list of possible next words and their log probabilities under
            the `N`-gram language model (unsorted)
        """
        N = min(N, len(words) + 1)
        assert N in self.counts, "You do not have counts for {}-grams".format(N)
        assert len(words) >= N - 1, "`words` must have at least {} words".format(N - 1)

        # The context consists of the final N - 1 words. The start index is
        # computed explicitly because `words[-N + 1:]` turns into `words[0:]`
        # (i.e. the whole sequence) when N == 1, where the context must be
        # empty.
        context_start = len(words) - (N - 1)
        base = tuple(w.lower() for w in words[context_start:])

        probs = []
        for k in self.counts[N]:
            if k[:-1] == base:
                c_prob = self._log_ngram_prob(base + k[-1:])
                probs.append((k[-1], c_prob))
        return probs

    def generate(self, N, seed_words=("<bol>",), n_sentences=5):
        """
        Use the `N`-gram language model to generate sentences.

        Each completed sentence is also printed to stdout (without its
        ``<bol>`` tokens, wrapped at 90 characters).

        Parameters
        ----------
        N : int
            The gram-size of the model to generate from
        seed_words : list or tuple of strs
            The seed words used to condition the generation of every
            sentence. The argument is not modified. Default is
            ``("<bol>",)``.
        n_sentences : int
            The number of sentences to generate from the `N`-gram model.
            Default is 5.

        Returns
        -------
        sentences : list of lists of strs
            One word list per generated sentence, starting with the seed
            words and ending with ``"<eol>"``.

        Raises
        ------
        ValueError
            If the model offers no completion for the current context.
        """
        counter = 0
        sentences = []
        words = list(seed_words)
        while counter < n_sentences:
            candidates = self.completions(words, N)
            if not candidates:
                msg = "No {}-gram completions available for context: {}"
                raise ValueError(msg.format(N, words))

            nextw, log_probs = zip(*candidates)
            weights = np.exp(log_probs)
            probs = weights / weights.sum()  # renormalize probs if smoothed
            next_word = np.random.choice(nextw, p=probs)

            # if we reach the end of a sentence, save it and start a new one
            if next_word == "<eol>":
                S = " ".join([w for w in words if w != "<bol>"])
                S = textwrap.fill(S, 90, initial_indent="", subsequent_indent="   ")
                print(S)
                words.append(next_word)
                sentences.append(words)
                words = list(seed_words)
                counter += 1
                continue

            words.append(next_word)
        return sentences

    def perplexity(self, words, N):
        r"""
        Calculate the model perplexity on a sequence of words.

        Notes
        -----
        Perplexity, `PP`, is defined as

        .. math::

            PP(W)  =  \left( \frac{1}{p(W)} \right)^{1 / n}

        or simply

        .. math::

            PP(W)  &=  \exp(-\log p(W) / n) \\
                   &=  \exp(H(W))

        where :math:`W = [w_1, \ldots, w_k]` is a sequence of words, `H(w)` is
        the cross-entropy of `W` under the current model, and `n` is the number
        of `N`-grams in `W`.

        Minimizing perplexity is equivalent to maximizing the probability of
        `words` under the `N`-gram model. It may also be interpreted as the
        average branching factor when predicting the next word under the
        language model.

        Parameters
        ----------
        N : int
            The gram-size of the model to calculate perplexity with.
        words : list or tuple of strings
            The sequence of words to compute perplexity on.

        Returns
        -------
        perplexity : float
            The model perplexity for the words in `words`.
        """
        return np.exp(self.cross_entropy(words, N))

    def cross_entropy(self, words, N):
        r"""
        Calculate the model cross-entropy on a sequence of words against the
        empirical distribution of words in a sample.

        Notes
        -----
        Model cross-entropy, `H`, is defined as

        .. math::

            H(W) = -\frac{\log p(W)}{n}

        where :math:`W = [w_1, \ldots, w_k]` is a sequence of words, and `n` is
        the number of `N`-grams in `W`.

        The model cross-entropy is proportional (not equal, since we use base
        `e`) to the average number of bits necessary to encode `W` under the
        model distribution.

        Parameters
        ----------
        N : int
            The gram-size of the model to calculate cross-entropy on.
        words : list or tuple of strings
            The sequence of words to compute cross-entropy on.

        Returns
        -------
        H : float
            The model cross-entropy for the words in `words`.
        """
        n_ngrams = len(ngrams(words, N))
        return -(1 / n_ngrams) * self.log_prob(words, N)

    def _log_prob(self, words, N):
        """
        Calculate the log probability of a sequence of words under the
        `N`-gram model, i.e. the sum of the log probabilities of the `N`-grams
        in `words`. Raises a ValueError if `words` is shorter than `N`.
        """
        assert N in self.counts, "You do not have counts for {}-grams".format(N)

        if N > len(words):
            err = "Not enough words for a gram-size of {}: {}".format(N, len(words))
            raise ValueError(err)

        total_prob = 0
        for ngram in ngrams(words, N):
            total_prob += self._log_ngram_prob(ngram)
        return total_prob

    def _n_completions(self, words, N):
        """
        Return the number of unique word tokens that could follow the sequence
        `words` under the *unsmoothed* `N`-gram language model.
        """
        assert N in self.counts, "You do not have counts for {}-grams".format(N)
        # NOTE(review): this condition admits *at most* N - 1 words, whereas
        # its message asks for more words; left unchanged here -- confirm the
        # intended direction against the callers.
        assert len(words) <= N - 1, "Need > {} words to use {}-grams".format(N - 2, N)

        if isinstance(words, list):
            words = tuple(words)

        base = words[-N + 1 :]
        return len([k[-1] for k in self.counts[N].keys() if k[:-1] == base])

    def _num_grams_with_count(self, C, N):
        """
        Return the number of unique `N`-gram tokens that occur exactly `C`
        times
        """
        assert C > 0
        assert N in self.counts, "You do not have counts for {}-grams".format(N)
        # cache count values for future calls
        if not hasattr(self, "_NC"):
            self._NC = {N: {} for N in range(1, self.N + 1)}
        if C not in self._NC[N]:
            self._NC[N][C] = sum(1 for v in self.counts[N].values() if v == C)
        return self._NC[N][C]

    @abstractmethod
    def log_prob(self, words, N):
        """
        Compute the log probability of a sequence of words under the
        `N`-gram language model. Overwritten by inherited class.
        """
        raise NotImplementedError

    @abstractmethod
    def _log_ngram_prob(self, ngram):
        """
        Return the log probability of a single ngram. Overwritten by inherited
        class.
        """
        raise NotImplementedError


class MLENGram(NGramBase):
    def __init__(self, N, unk=True, filter_stopwords=True, filter_punctuation=True):
        """
        An N-gram language model using raw maximum-likelihood estimates (no
        smoothing).

        Parameters
        ----------
        N : int
            The largest context window (in words) supported by the language
            model. All gram-sizes 1, ..., `N` are computed.
        unk : bool
            If True, the ``<unk>`` (unknown) token is included in the LM.
            Default is True.
        filter_stopwords : bool
            If True, stopwords are removed before training. Default is True.
        filter_punctuation : bool
            If True, punctuation is removed before training. Default is True.
        """
        super().__init__(N, unk, filter_stopwords, filter_punctuation)

        self.hyperparameters["id"] = "MLENGram"

    def log_prob(self, words, N):
        """
        Score a word sequence with the unsmoothed, maximum-likelihood `N`-gram
        language model.

        Parameters
        ----------
        words : list of strings
            The sequence of words to score.
        N : int
            The gram-size of the language model used to compute the log
            probabilities of the sequence.

        Returns
        -------
        total_prob : float
            The summed log-probability of `words` under the `N`-gram language
            model.
        """
        return self._log_prob(words, N)

    def _log_ngram_prob(self, ngram):
        """
        Return the unsmoothed log probability of `ngram`: the log of its count
        divided by the count of its context (or by the unigram word total when
        `ngram` is a unigram). Returns ``-np.inf`` if either count is not
        positive.
        """
        order = len(ngram)
        gram_count = self.counts[order][ngram]

        if order > 1:
            context_count = self.counts[order - 1][ngram[:-1]]
        else:
            context_count = self.n_words[1]

        if context_count > 0 and gram_count > 0:
            return np.log(gram_count) - np.log(context_count)
        return -np.inf


class AdditiveNGram(NGramBase):
    def __init__(
        self, N, K=1, unk=True, filter_stopwords=True, filter_punctuation=True,
    ):
        """
        An N-gram language model whose probabilities are smoothed with
        additive (Lidstone) smoothing.

        Notes
        -----
        The smoothed estimates equal the expected value of the posterior,
        `p(ngram_prob | counts)`, under a symmetric Dirichlet prior on counts
        with parameter `K`.

        Parameters
        ----------
        N : int
            The largest context window (in words) supported by the language
            model. All gram-sizes 1, ..., `N` are computed.
        K : float
            The pseudocount added to each observation. Larger values shift
            more probability toward unseen events. `K` = 1 is known as Laplace
            smoothing; `K` = 0.5 is known as expected likelihood estimation
            (ELE) or the Jeffreys-Perks law. Default is 1.
        unk : bool
            If True, the ``<unk>`` (unknown) token is included in the LM.
            Default is True.
        filter_stopwords : bool
            If True, stopwords are removed before training. Default is True.
        filter_punctuation : bool
            If True, punctuation is removed before training. Default is True.
        """
        super().__init__(N, unk, filter_stopwords, filter_punctuation)

        self.hyperparameters["id"] = "AdditiveNGram"
        self.hyperparameters["K"] = K

    def log_prob(self, words, N):
        r"""
        Score a word sequence with the additively smoothed `N`-gram language
        model.

        Notes
        -----
        For a bigram, the smoothed estimate is

        .. math::

            P(w_i \mid w_{i-1}) = \frac{A + K}{B + KV}

        where, as implemented in ``_log_ngram_prob``, :math:`A` is the count
        of the bigram :math:`(w_{i-1}, w_i)`, :math:`B` is the count of the
        context :math:`w_{i-1}`, and :math:`V` is ``self.n_tokens[1]``.

        This amounts to pretending that every possible `N`-gram sequence has
        been seen at least `K` times.

        Known weaknesses of additive smoothing:
            - Every predicted word is treated in the same way
            - Unseen `N`-grams can receive too much probability mass

        Parameters
        ----------
        words : list of strings
            The sequence of words to score.
        N : int
            The gram-size of the language model used to compute the log
            probabilities of the sequence.

        Returns
        -------
        total_prob : float
            The summed log-probability of `words` under the `N`-gram language
            model.
        """
        return self._log_prob(words, N)

    def _log_ngram_prob(self, ngram):
        """
        Return the additively smoothed log probability of `ngram`, or
        ``-np.inf`` when the smoothed denominator is zero.
        """
        pseudocount = self.hyperparameters["K"]
        total_words, vocab_size = self.n_words[1], self.n_tokens[1]
        order = len(ngram)

        smoothed_num = self.counts[order][ngram] + pseudocount

        # unigrams have no context, so normalise by the unigram word total
        if order > 1:
            context_count = self.counts[order - 1][ngram[:-1]]
        else:
            context_count = total_words

        smoothed_den = context_count + pseudocount * vocab_size
        if smoothed_den == 0:
            return -np.inf
        return np.log(smoothed_num / smoothed_den)


class GoodTuringNGram(NGramBase):
    def __init__(
        self, N, conf=1.96, unk=True, filter_stopwords=True, filter_punctuation=True,
    ):
        """
        An N-Gram model with smoothed probabilities calculated with the simple
        Good-Turing estimator from Gale (2001).

        Parameters
        ----------
        N : int
            The maximum length (in words) of the context-window to use in the
            language model. Model will compute all n-grams from 1, ..., N.
        conf: float
            The multiplier of the standard deviation of the empirical smoothed
            count (the default, 1.96, corresponds to a 95% confidence
            interval). Controls how many datapoints are smoothed using the
            log-linear model.
        unk : bool
            Whether to include the ``<unk>`` (unknown) token in the LM. Default
            is True.
        filter_stopwords : bool
            Whether to remove stopwords before training. Default is True.
        filter_punctuation : bool
            Whether to remove punctuation before training. Default is True.
        """
        super().__init__(N, unk, filter_stopwords, filter_punctuation)

        self.hyperparameters["id"] = "GoodTuringNGram"
        self.hyperparameters["conf"] = conf

    def train(self, corpus_fp, vocab=None, encoding=None):
        """
        Compile the n-gram counts for the text(s) in `corpus_fp`, then compute
        the Good-Turing smoothed counts. Upon completion the `self.counts`
        attribute will store dictionaries of the `N`, `N-1`, ..., 1-gram
        counts.

        Parameters
        ----------
        corpus_fp : str
            The path to a newline-separated text corpus file
        vocab : :class:`~numpy_ml.preprocessing.nlp.Vocabulary` instance or None.
            If not None, only the words in `vocab` will be used to construct
            the language model; all out-of-vocabulary words will either be
            mapped to ``<unk>`` (if ``self.unk = True``) or removed (if
            ``self.unk = False``). Default is None.
        encoding : str  or None
            Specifies the text encoding for corpus. Common entries are 'utf-8',
            'utf-8-sig', 'utf-16'. Default is None.
        """
        self._train(corpus_fp, vocab=vocab, encoding=encoding)

        # `_num_grams_with_count` memoizes its answers in `self._NC` and never
        # clears them; drop the cache so that retraining on a new corpus does
        # not reuse count-of-counts from the previous one
        if hasattr(self, "_NC"):
            del self._NC

        self._calc_smoothed_counts()

    def log_prob(self, words, N):
        r"""
        Compute the smoothed log probability of a sequence of words under the
        `N`-gram language model with Good-Turing smoothing.

        Notes
        -----
        For a bigram, Good-Turing smoothing amounts to:

        .. math::

            P(w_i \mid w_{i-1}) = \frac{C^*}{\text{Count}(w_{i-1})}

        where :math:`C^*` is the Good-Turing smoothed estimate of the bigram
        count:

        .. math::

            C^* = \frac{(c + 1) \text{NumCounts}(c + 1, 2)}{\text{NumCounts}(c, 2)}

        where

        .. math::

            c  &=  \text{Count}(w_{i-1}, w_i) \\
            \text{NumCounts}(r, k)  &=
                |\{ k\text{-gram} : \text{Count}(k\text{-gram}) = r \}|

        In words, the probability of an `N`-gram that occurs `r` times in the
        corpus is estimated by dividing up the probability mass occupied by
        N-grams that occur `r+1` times.

        For large values of `r`, NumCounts becomes unreliable. In this case, we
        compute a smoothed version of NumCounts using a power law function:

        .. math::

            \log \text{NumCounts}(r) = b + a \log r

        Under the Good-Turing estimator, the total probability assigned to
        unseen `N`-grams is equal to the relative occurrence of `N`-grams that
        appear only once.

        Parameters
        ----------
        words : list of strings
            A sequence of words.
        N : int
            The gram-size of the language model to use when calculating the log
            probabilities of the sequence.

        Returns
        -------
        total_prob : float
            The total log-probability of the sequence `words` under the
            `N`-gram language model.
        """
        return self._log_prob(words, N)

    def _calc_smoothed_counts(self):
        """
        Compute the smoothed count `C*` for every observed count `C` and every
        gram-size, storing the results in ``self._smooth_counts`` and the
        per-gram-size normalisers in ``self._smooth_totals``. Also sets
        ``self._p0`` (the probability mass reserved for unseen n-grams) and
        fits the log-linear count models via ``_fit_count_models``.
        """
        counts = self.counts
        NC = self._num_grams_with_count
        conf = self.hyperparameters["conf"]

        orders = range(1, self.N + 1)
        totals = {N: 0 for N in orders}
        smooth_counts = {N: {} for N in orders}

        # calculate the probability of all <unk> (i.e., unseen) n-grams
        self._p0 = {n: NC(1, n) / sum(counts[n].values()) for n in orders}

        # fit log-linear models for predicting smoothed counts in absence of
        # real data
        self._fit_count_models()

        LM = self._count_models
        for N in orders:
            # every gram-size has its own count-of-counts distribution, so the
            # "switch to the log-linear estimate" flag must restart for each
            # one; otherwise the switch made for a lower order would force all
            # higher orders onto the interpolated estimate from their very
            # first count
            use_interp = False

            for C in sorted(set(counts[N].values())):
                # estimate the interpolated count using the log-linear model
                c1_lm = np.exp(LM[N].predict(np.c_[np.log(C + 1)])).item()
                c0_lm = np.exp(LM[N].predict(np.c_[np.log(C)])).item()
                count_interp = ((C + 1) * c1_lm) / c0_lm

                c1, c0 = NC(C + 1, N), NC(C, N)

                # the empirical estimate is only a candidate while we have not
                # yet switched to interpolation and some n-gram has count C + 1
                if not use_interp and c1 > 0:
                    count_emp = ((C + 1) * c1) / c0

                    # `conf` standard deviations of the empirical smoothed
                    # count C* given C (approximate variance)
                    t = conf * np.sqrt((C + 1) ** 2 * (c1 / c0 ** 2) * (1 + c1 / c0))

                    # if the empirical and interpolated estimates differ by
                    # more than t, the empirical estimate tends to be more
                    # accurate, so keep it
                    if np.abs(count_interp - count_emp) > t:
                        smooth_counts[N][C] = count_emp
                        totals[N] += c0 * count_emp
                        continue

                # otherwise use the interpolated estimate, for this and every
                # larger count of this gram-size
                use_interp = True
                smooth_counts[N][C] = count_interp
                totals[N] += c0 * count_interp

        self._smooth_totals = totals
        self._smooth_counts = smooth_counts

    def _log_ngram_prob(self, ngram):
        """
        Return the smoothed log probability of the ngram.

        Seen n-grams share the mass ``1 - p0`` in proportion to their smoothed
        counts; an unseen n-gram receives an equal fraction of ``p0``.
        """
        N = len(ngram)
        p0 = self._p0[N]

        if ngram in self.counts[N]:
            C = self.counts[N][ngram]
            sc, T = self._smooth_counts[N], self._smooth_totals[N]
            return np.log(1 - p0) + np.log(sc[C]) - np.log(T)

        # approx. prob of an out-of-vocab ngram (i.e., a fraction of p0)
        # NOTE(review): the number of possible N-grams is taken to be
        # n_tokens[N] ** N -- confirm that n_tokens[N] is the intended base
        n_tokens, n_seen = self.n_tokens[N], len(self.counts[N])
        n_unseen = max((n_tokens ** N) - n_seen, 1)
        return np.log(p0 / n_unseen)

    def _fit_count_models(self):
        """
        Perform the averaging transform proposed by Church and Gale (1991):
        estimate the expected count-of-counts by the *density* of
        count-of-count values.

        For each gram-size, a log-linear model of the transformed
        count-of-counts against the observed counts is fit and stored in
        ``self._count_models``. A warning is printed when the fitted slope is
        greater than -1.
        """
        self._count_models = {}
        NC = self._num_grams_with_count
        for N in range(1, self.N + 1):
            X, Y = [], []
            sorted_counts = sorted(set(self.counts[N].values()))  # r
            last = len(sorted_counts) - 1

            for ix, j in enumerate(sorted_counts):
                # i / k are the nearest observed counts below / above j; use 0
                # below the smallest count and mirror the lower gap above the
                # largest one
                i = 0 if ix == 0 else sorted_counts[ix - 1]
                k = 2 * j - i if ix == last else sorted_counts[ix + 1]
                X.append(j)
                Y.append(2 * NC(j, N) / (k - i))

            # fit log-linear model: log(counts) ~ log(average_transform(counts))
            self._count_models[N] = LinearRegression(fit_intercept=True)
            self._count_models[N].fit(np.log(X), np.log(Y))
            _, a = self._count_models[N].beta

            if a > -1:
                fstr = "[Warning] Log-log averaging transform has slope > -1 for N={}"
                print(fstr.format(N))


================================================
FILE: numpy_ml/nonparametric/README.md
================================================
# Nonparametric Models
The nonparametric module implements several popular nonparametric regression
and classification models.

- `kernel_regression.py` implements Nadaraya-Watson kernel regression
  ([Nadaraya, 1964](https://epubs.siam.org/doi/abs/10.1137/1109020); [Watson,
1964](https://www.jstor.org/stable/pdf/25049340.pdf))
- `knn.py` implements k-nearest neighbors regression and classification
  models using a ball-tree
- `gp.py` implements Gaussian process regression / simple kriging ([Krige, 1951](https://pdfs.semanticscholar.org/e497/8126fe00eca432b896ebcba978ba3f30a475.pdf); [Matheron, 1963](http://cg.ensmp.fr/bibliotheque/public/MATHERON_Publication_02396.pdf); [Williams & Rasmussen, 1996](http://mlg.eng.cam.ac.uk/pub/pdf/WilRas96.pdf))

## Plots
<p align="center">
<strong>k-Nearest Neighbors</strong>
<img src="img/knn_plots.png" align='center' height="550" />
</p>

<p align="center">
<strong>Nadaraya-Watson Kernel Regression</strong>
<img src="img/kr_plots.png" align='center' height="550" />
</p>

<p align="center">
<strong>Gaussian Process Regression</strong>
<img src="img/gp_dist.png" align='center' width="950" />
</p>


================================================
FILE: numpy_ml/nonparametric/__init__.py
================================================
"""
Popular nonparametric regression and classification models.

The nonparametric module contains an assortment of nonparametric models that
don't fit elsewhere in the package. For other nonparametric models, see the
``numpy_ml.trees`` module.
"""

from .gp import *
from .knn import *
from .kernel_regression import *


================================================
FILE: numpy_ml/nonparametric/gp.py
================================================
import warnings
import numpy as np
from numpy.linalg import slogdet, inv

# scipy is optional: without it GPRegression can only report 95% bounds
try:
    from scipy.stats import norm

    # only flag scipy as available once the import has actually succeeded
    _SCIPY = True
except ImportError:
    _SCIPY = False
    warnings.warn(
        "Could not import scipy.stats. Confidence scores "
        "for GPRegression are restricted to 95% bounds"
    )

from ..utils.kernels import KernelInitializer


class GPRegression:
    def __init__(self, kernel="RBFKernel", alpha=1e-10):
        r"""
        A Gaussian Process (GP) regression model.

        .. math::

            y \mid X, f  &\sim  \mathcal{N}( [f(x_1), \ldots, f(x_n)], \alpha I ) \\
            f \mid X     &\sim  \text{GP}(0, K)

        for data :math:`D = \{(x_1, y_1), \ldots, (x_n, y_n) \}` and a
        covariance matrix :math:`K_{ij} = \text{kernel}(x_i, x_j)` for all
        :math:`i, j \in \{1, \ldots, n \}`.

        Parameters
        ----------
        kernel : str
            The kernel to use in fitting the GP prior. Default is 'RBFKernel'.
        alpha : float
            An isotropic noise term for the diagonal in the GP covariance, `K`.
            Larger values correspond to the expectation of greater noise in the
            observed data points. Default is 1e-10.
        """
        self.kernel = KernelInitializer(kernel)()
        self.parameters = {"GP_mean": None, "GP_cov": None, "X": None}
        self.hyperparameters = {"kernel": str(self.kernel), "alpha": alpha}

    def fit(self, X, y):
        """
        Fit the GP prior to the training data.

        Stores the training inputs and targets together with the (noise-free)
        kernel matrix ``kernel(X, X)`` and a zero prior mean.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A training dataset of `N` examples, each with dimensionality `M`.
        y : :py:class:`ndarray <numpy.ndarray>` of shape `(N, O)`
            A collection of real-valued training targets for the
            examples in `X`, each with dimension `O`.
        """
        mu = np.zeros(X.shape[0])
        K = self.kernel(X, X)

        self.parameters["X"] = X
        self.parameters["y"] = y
        self.parameters["GP_cov"] = K
        self.parameters["GP_mean"] = mu

    def predict(self, X, conf_interval=0.95, return_cov=False):
        r"""
        Return the MAP estimate for :math:`y^*`, corresponding the mean/mode of
        the posterior predictive distribution, :math:`p(y^* \mid x^*, X, y)`.

        Notes
        -----
        Under the GP regression model, the posterior predictive distribution is

        .. math::

            y^* \mid x^*, X, y \sim \mathcal{N}(\mu^*, \text{cov}^*)

        where

        .. math::

            \mu^*  &=  K^* (K + \alpha I)^{-1} y \\
            \text{cov}^*  &=  K^{**} - K^* (K + \alpha I)^{-1} K^{*\top}

        and

        .. math::

            K  &=  \text{kernel}(X, X) \\
            K^*  &=  \text{kernel}(X^*, X) \\
            K^{**}  &=  \text{kernel}(X^*, X^*)

        NB. This implementation uses the inefficient but general purpose
        `np.linalg.inv` routine to invert :math:`(K + \alpha I)`. A more
        efficient way is to rely on the fact that `K` (and hence also :math:`K
        + \alpha I`) is symmetric positive (semi-)definite and take the inner
        product of the inverse of its (lower) Cholesky decompositions:

        .. math::

            Q^{-1} = \text{cholesky}(Q)^{-1 \top} \text{cholesky}(Q)^{-1}

        For more details on a production-grade implementation, see Algorithm
        2.1 in Rasmussen & Williams (2006).

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape (N, M)
            The collection of datapoints to generate predictions on
        conf_interval : float in (0, 1)
            The coverage of the two-sided confidence band to return for each
            prediction. If the scipy package is not available, this value is
            always treated as 0.95. Default is 0.95.
        return_cov : bool
            If True, also return the covariance (`cov*`) of the posterior
            predictive distribution for the points in `X`. Default is False.

        Returns
        -------
        y_pred : :py:class:`ndarray <numpy.ndarray>` of shape `(N, O)`
            The predicted values for each point in `X`, each with
            dimensionality `O`.
        conf : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
            The half-width of the two-sided `conf_interval` confidence band for
            each prediction, i.e. the band for the `i`'th prediction is
            ``[y_pred[i] - conf[i], y_pred[i] + conf[i]]``.
        cov : :py:class:`ndarray <numpy.ndarray>` of shape `(N, N)`
            The covariance (`cov*`) of the posterior predictive distribution for
            `X`. Only returned if `return_cov` is True.
        """
        if conf_interval != 0.95 and not _SCIPY:
            fstr = "Cannot compute {}% confidence score without scipy.stats"
            warnings.warn(fstr.format(conf_interval))

        X_star = X
        X = self.parameters["X"]
        y = self.parameters["y"]
        K = self.parameters["GP_cov"]
        alpha = self.hyperparameters["alpha"]

        K_star = self.kernel(X_star, X)
        K_star_star = self.kernel(X_star, X_star)

        sig = np.eye(K.shape[0]) * alpha
        K_y_inv = inv(K + sig)

        pp_mean = K_star @ K_y_inv @ y
        pp_cov = K_star_star - K_star @ K_y_inv @ K_star.T

        # The band is symmetric about the mean, so (1 - conf_interval) / 2 of
        # the mass lies in each tail: the multiplier is the standard-normal
        # quantile at 0.5 + conf_interval / 2 (1.96 for a 95% band, matching
        # the scipy-free fallback).
        # (norm.ppf == inverse CDF for standard normal)
        if _SCIPY:
            percentile = norm.ppf(0.5 + conf_interval / 2)
        else:
            # without scipy, ignore `conf_interval` and return the 95% bound
            percentile = 1.96

        # round-off can push variances that are numerically zero slightly
        # below zero, which would turn the bound into NaN
        pp_var = np.clip(np.diag(pp_cov), 0, None)
        conf = percentile * np.sqrt(pp_var)
        return (pp_mean, conf) if not return_cov else (pp_mean, conf, pp_cov)

    def marginal_log_likelihood(self, kernel_params=None):
        r"""
        Compute the log of the marginal likelihood (i.e., the log model
        evidence), :math:`p(y \mid X, \text{kernel_params})`.

        Notes
        -----
        Under the GP regression model, the marginal likelihood is normally
        distributed:

        .. math::

            y | X, \theta  \sim  \mathcal{N}(0, K + \alpha I)

        Hence,

        .. math::

            \log p(y \mid X, \theta) =
                -0.5 \log \det(K + \alpha I) -
                    0.5 y^\top (K + \alpha I)^{-1} y - \frac{n}{2} \log 2 \pi

        where :math:`K = \text{kernel}(X, X)`, :math:`\theta` is the set of
        kernel parameters, and `n` is the number of dimensions in `K`. For
        multi-dimensional targets the log likelihoods of the individual target
        dimensions are summed.

        Parameters
        ----------
        kernel_params : dict
            Parameters for the kernel function. If None, calculate the
            marginal likelihood under the kernel parameters defined at model
            initialization. Default is None.

        Returns
        -------
        marginal_log_likelihood : float
            The log likelihood of the training targets given the kernel
            parameterized by `kernel_params` and the training inputs,
            marginalized over all functions `f`.
        """
        X = self.parameters["X"]
        y = self.parameters["y"]
        alpha = self.hyperparameters["alpha"]

        K = self.parameters["GP_cov"]
        if kernel_params is not None:
            # create a new kernel with parameters `kernel_params` and recalc
            # the GP covariance matrix
            summary_dict = self.kernel.summary_dict()
            summary_dict["parameters"].update(kernel_params)
            kernel = KernelInitializer(summary_dict)()
            K = kernel(X, X)

        # add isotropic noise to kernel diagonal. Build a *new* matrix: `K`
        # may be the array stored in `self.parameters["GP_cov"]`, and an
        # in-place `+=` would add `alpha` to the fitted covariance on every
        # call
        K_y = K + np.eye(K.shape[0]) * alpha

        Kinv = inv(K_y)
        Klogdet = -0.5 * slogdet(K_y)[1]
        const = K_y.shape[0] / 2 * np.log(2 * np.pi)

        # handle both uni- and multidimensional target values
        if y.ndim == 1:
            y = y[:, np.newaxis]

        # sum over each dimension of y
        marginal_ll = np.sum([Klogdet - 0.5 * _y.T @ Kinv @ _y - const for _y in y.T])
        return marginal_ll

    def sample(self, X, n_samples=1, dist="posterior_predictive"):
        """
        Sample functions from the GP prior or posterior predictive
        distribution.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            The collection of datapoints at which the sampled functions are
            evaluated.
        n_samples: int
            The number of samples to generate. Default is 1.
        dist : {"posterior_predictive", "prior"}
            The distribution to draw samples from. Default is
            "posterior_predictive".

        Returns
        -------
        samples : :py:class:`ndarray <numpy.ndarray>` of shape `(n_samples, O, N)`
            The generated samples for the points in `X`.

        Raises
        ------
        ValueError
            If `dist` is not one of the recognized distribution names.
        """
        mvnorm = np.random.multivariate_normal

        if dist == "prior":
            mu = np.zeros((X.shape[0], 1))
            cov = self.kernel(X, X)
        elif dist == "posterior_predictive":
            mu, _, cov = self.predict(X, return_cov=True)
        else:
            raise ValueError("Unrecognized dist: '{}'".format(dist))

        if mu.ndim == 1:
            mu = mu[:, np.newaxis]

        # one multivariate-normal draw per target dimension (columns of mu)
        samples = np.array([mvnorm(_mu, cov, size=n_samples) for _mu in mu.T])
        return samples.swapaxes(0, 1)


================================================
FILE: numpy_ml/nonparametric/kernel_regression.py
================================================
from ..utils.kernels import KernelInitializer


class KernelRegression:
    def __init__(self, kernel=None):
        r"""
        A Nadaraya-Watson kernel regression model.

        Notes
        -----
        The Nadaraya-Watson regression model is

        .. math::

            f(x) = \sum_i w_i(x) y_i

        where the sample weighting functions, :math:`w_i`, are simply

        .. math::

            w_i(x) = \frac{k(x, x_i)}{\sum_j k(x, x_j)}

        with `k` being the kernel function.

        Observe that `k`-nearest neighbors
        (:class:`~numpy_ml.nonparametric.KNN`) regression is a special case of
        kernel regression where the `k` closest observations have a weight
        `1/k`, and all others have weight 0.

        Parameters
        ----------
        kernel : str, :doc:`Kernel <numpy_ml.utils.kernels>` object, or dict
            The kernel to use. If None, default to
            :class:`~numpy_ml.utils.kernels.LinearKernel`. Default is None.
        """
        self.parameters = {"X": None, "y": None}
        self.hyperparameters = {"kernel": str(kernel)}
        self.kernel = KernelInitializer(kernel)()

    def fit(self, X, y):
        """
        Fit the regression model to the data and targets in `X` and `y`.

        The model is non-parametric: fitting simply stores the training set.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            An array of `N` training examples
        y : :py:class:`ndarray <numpy.ndarray>` of shape `(N, ...)`
            Training targets for the `N` rows in `X`
        """
        self.parameters = {"X": X, "y": y}

    def predict(self, X):
        """
        Generate predictions for the targets associated with the rows in `X`.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N', M')`
            An array of `N'` examples to generate predictions on

        Returns
        -------
        y : :py:class:`ndarray <numpy.ndarray>` of shape `(N', ...)`
            Predicted targets for the `N'` rows in `X`
        """
        K = self.kernel
        P = self.parameters
        y = P["y"]

        # sim[i, j] = similarity between training example i and query j
        sim = K(P["X"], X)

        # append one singleton axis per trailing target dimension so that the
        # weights broadcast against targets of shape (N, ...); for 1-D targets
        # nothing is appended and the arithmetic is the plain
        # (sim * y[:, None]).sum(0) / sim.sum(0)
        trailing = (1,) * (y.ndim - 1)
        sim_b = sim.reshape(sim.shape + trailing)
        norm = sim.sum(axis=0).reshape(sim.shape[1:] + trailing)

        return (sim_b * y[:, None]).sum(axis=0) / norm


================================================
FILE: numpy_ml/nonparametric/knn.py
================================================
"""A k-Nearest Neighbors (KNN) model for both classification and regression."""
from collections import Counter

import numpy as np

from ..utils.data_structures import BallTree


class KNN:
    def __init__(
        self, k=5, leaf_size=40, classifier=True, metric=None, weights="uniform",
    ):
        """
        A `k`-nearest neighbors (kNN) model relying on a ball tree for efficient
        computation.

        Parameters
        ----------
        k : int
            The number of neighbors to use during prediction. Default is 5.
        leaf_size : int
            The maximum number of datapoints at each leaf in the ball tree.
            Default is 40.
        classifier : bool
            Whether to treat the values in Y as class labels (classifier =
            True) or real-valued targets (classifier = False). Default is True.
        metric : :doc:`Distance metric <numpy_ml.utils.distance_metrics>` or None
            The distance metric to use for computing nearest neighbors. If
            None, use the :func:`~numpy_ml.utils.distance_metrics.euclidean`
            metric by default. Default is None.
        weights : {'uniform', 'distance'}
            How to weight the predictions from each neighbors. 'uniform'
            assigns uniform weights to each neighbor, while 'distance' assigns
            weights proportional to the inverse of the distance from the query
            point. Default is 'uniform'.
        """
        self._ball_tree = BallTree(leaf_size=leaf_size, metric=metric)
        self.hyperparameters = {
            "id": "KNN",
            "k": k,
            "leaf_size": leaf_size,
            "classifier": classifier,
            "metric": str(metric),
            "weights": weights,
        }

    def fit(self, X, y):
        r"""
        Fit the model to the data and targets in `X` and `y`

        Parameters
        ----------
        X : numpy array of shape `(N, M)`
            An array of `N` training examples.
        y : numpy array of shape `(N, *)`
            Targets for the `N` rows in `X`.

        Raises
        ------
        Exception
            If `X` is not two-dimensional.
        """
        if X.ndim != 2:
            raise Exception("X must be two-dimensional")
        self._ball_tree.fit(X, y)

    def predict(self, X):
        r"""
        Generate predictions for the targets associated with the rows in `X`.

        Parameters
        ----------
        X : numpy array of shape `(N', M')`
            An array of `N'` examples to generate predictions on.

        Returns
        -------
        y : numpy array of shape `(N', *)`
            Predicted targets for the `N'` rows in `X`.
        """
        predictions = []
        H = self.hyperparameters
        for x in X:
            pred = None
            nearest = self._ball_tree.nearest_neighbors(H["k"], x)
            targets = [n.val for n in nearest]

            if H["classifier"]:
                if H["weights"] == "uniform":
                    # for consistency with sklearn / scipy.stats.mode, return
                    # the smallest class ID in the event of a tie
                    counts = Counter(targets).most_common()
                    pred, _ = sorted(counts, key=lambda x: (-x[1], x[0]))[0]
                elif H["weights"] == "distance":
                    # accumulate the inverse-distance weight of every label
                    # and predict the label with the largest total; ties go
                    # to the smallest class ID, as in the uniform case
                    weights = self._inverse_distance_weights(nearest)
                    scores = {}
                    for label, w in zip(targets, weights):
                        scores[label] = scores.get(label, 0.0) + w
                    pred, _ = sorted(scores.items(), key=lambda x: (-x[1], x[0]))[0]
            else:
                # average over the neighbor axis only, so that
                # multi-dimensional targets keep their trailing dimensions
                if H["weights"] == "uniform":
                    pred = np.mean(targets, axis=0)
                elif H["weights"] == "distance":
                    weights = self._inverse_distance_weights(nearest)
                    pred = np.average(targets, axis=0, weights=weights)
            predictions.append(pred)
        return np.array(predictions)

    @staticmethod
    def _inverse_distance_weights(nearest):
        """
        Return one inverse-distance weight per neighbor in `nearest`.

        If any neighbor lies at distance 0 from the query point, those
        neighbors receive weight 1 and all others weight 0, which avoids
        dividing by zero.
        """
        dists = np.array([n.distance for n in nearest], dtype=float)
        coincident = dists == 0
        if coincident.any():
            return coincident.astype(float)
        return 1.0 / dists


================================================
FILE: numpy_ml/plots/bandit_plots.py
================================================
"""Miscellaneous plots for multi-arm bandit validation"""

from collections import namedtuple

import numpy as np

from numpy_ml.bandits import (
    MultinomialBandit,
    BernoulliBandit,
    ShortestPathBandit,
    ContextualLinearBandit,
)
from numpy_ml.bandits.trainer import BanditTrainer
from numpy_ml.bandits.policies import (
    EpsilonGreedy,
    UCB1,
    ThompsonSamplingBetaBinomial,
    LinUCB,
)
from numpy_ml.utils.graphs import random_DAG, DiGraph, Edge


def random_multinomial_mab(n_arms=10, n_choices_per_arm=5, reward_range=(0, 1)):
    """
    Generate a random multinomial multi-armed bandit environment.

    Parameters
    ----------
    n_arms : int
        The number of arms. Default is 10.
    n_choices_per_arm : int
        The number of possible payoffs for each arm. Default is 5.
    reward_range : sequence of two numbers
        The `(low, high)` bounds of the uniform distribution the payoff values
        are drawn from. Default is (0, 1).

    Returns
    -------
    A ``MultinomialBandit`` built from the sampled payoffs and payoff
    probabilities.
    """
    payoffs = []
    payoff_probs = []
    lo, hi = reward_range
    for _ in range(n_arms):
        # NB: draw the probabilities before the payoffs -- the order of the
        # RNG calls determines the bandit produced under a fixed seed
        p = np.random.uniform(size=n_choices_per_arm)
        p = p / p.sum()  # normalize into a distribution over this arm's payoffs
        r = np.random.uniform(low=lo, high=hi, size=n_choices_per_arm)

        payoffs.append(list(r))
        payoff_probs.append(list(p))

    return MultinomialBandit(payoffs, payoff_probs)


def random_bernoulli_mab(n_arms=10):
    """
    Generate a random Bernoulli multi-armed bandit environment.

    One uniform value is drawn per arm and the vector is rescaled to sum to
    one before being handed to `BernoulliBandit` as the payoff probabilities.
    """
    draws = np.random.uniform(size=n_arms)
    return BernoulliBandit(draws / draws.sum())


def plot_epsilon_greedy_multinomial_payoff():
    """
    Train an epsilon-greedy policy (epsilon = 0.05) on a random multinomial
    bandit via `BanditTrainer.train`.

    The RNG is seeded with 12345, after which the number of arms and the
    number of payoffs per arm are drawn at random.
    """
    np.random.seed(12345)
    # The two draws below must stay in this order to reproduce the seeded run.
    n_arms = np.random.randint(2, 30)
    n_payoffs = np.random.randint(2, 10)

    reward_bounds = [0, 1]
    bandit = random_multinomial_mab(n_arms, n_payoffs, reward_bounds)
    # Prior expected value is the midpoint of the reward range.
    agent = EpsilonGreedy(epsilon=0.05, ev_prior=reward_bounds[1] / 2)

    steps_per_episode, episodes, repeats = 1, 5000, 5
    BanditTrainer().train(agent, bandit, steps_per_episode, episodes, repeats)


def plot_ucb1_multinomial_payoff():
    """
    Train a UCB1 policy (C = 1) on a random multinomial bandit via
    `BanditTrainer.train`.

    The RNG is seeded with 12345, after which the number of arms and the
    number of payoffs per arm are drawn at random.
    """
    np.random.seed(12345)
    # The two draws below must stay in this order to reproduce the seeded run.
    n_arms = np.random.randint(2, 30)
    n_payoffs = np.random.randint(2, 10)

    reward_bounds = [0, 1]
    bandit = random_multinomial_mab(n_arms, n_payoffs, reward_bounds)
    # Prior expected value is the midpoint of the reward range.
    agent = UCB1(C=1, ev_prior=reward_bounds[1] / 2)

    steps_per_episode, episodes, repeats = 1, 5000, 5
    BanditTrainer().train(agent, bandit, steps_per_episode, episodes, repeats)


def plot_thompson_sampling_beta_binomial_payoff():
    """
    Train a ThompsonSamplingBetaBinomial policy (alpha = beta = 1) on a
    random Bernoulli multi-armed bandit via `BanditTrainer.train`.

    The RNG is seeded with 12345 and the number of arms is then drawn at
    random.
    """
    np.random.seed(12345)
    n_arms = np.random.randint(2, 30)

    bandit = random_bernoulli_mab(n_arms)
    agent = ThompsonSamplingBetaBinomial(alpha=1, beta=1)

    steps_per_episode, episodes, repeats = 1, 5000, 5
    BanditTrainer().train(agent, bandit, steps_per_episode, episodes, repeats)


def plot_lin_ucb():
    """
    Train a LinUCB policy (alpha = 1) on a contextual linear bandit via
    `BanditTrainer.train`.

    The RNG is seeded with 12345; the first two arguments passed to
    `ContextualLinearBandit` are then drawn at random.
    """
    np.random.seed(12345)
    # Draw order matters for reproducibility: K first, then D.
    K = np.random.randint(2, 25)
    D = np.random.randint(2, 10)

    bandit = ContextualLinearBandit(K, D, 1)
    agent = LinUCB(alpha=1)

    steps_per_episode, episodes, repeats = 1, 5000, 5
    BanditTrainer().train(agent, bandit, steps_per_episode, episodes, repeats)


def plot_ucb1_gaussian_shortest_path():
    """
    Train the UCB1 policy on a graph shortest-path problem in which each edge
    weight is drawn from an independent univariate Gaussian.

    A random DAG is generated, every edge is given its own Gaussian weight
    distribution (mean and variance each drawn uniformly from [0, 1)), and
    the result is wrapped in a `ShortestPathBandit` between the first and
    last vertex. The policy is then run through `BanditTrainer.train`.
    """
    np.random.seed(12345)

    ep_length = 1
    n_duplicates = 5
    n_episodes = 5000
    p = np.random.rand()
    n_vertices = np.random.randint(5, 15)

    Gaussian = namedtuple("Gaussian", ["mean", "variance", "EV", "sample"])

    def make_sampler(mean, var):
        """Return a zero-argument sampler bound to this edge's parameters."""
        # A factory is used so each sampler captures its OWN mean/variance; a
        # lambda written directly in the loop would see only the values from
        # the final iteration. `np.random.normal` takes a standard deviation
        # as `scale`, hence the square root of the variance.
        std = np.sqrt(var)
        return lambda: np.random.normal(mean, std)

    # create randomly-weighted edges
    print("Building graph")
    E = []
    G = random_DAG(n_vertices, p)
    V = G.vertices
    for e in G.edges:
        mean, var = np.random.uniform(0, 1), np.random.uniform(0, 1)
        rv = Gaussian(mean, var, mean, make_sampler(mean, var))
        E.append(Edge(e.fr, e.to, rv))

    G = DiGraph(V, E)
    # Swap a random vertex into the last position until the first and last
    # vertices are connected.
    # NOTE(review): this loops forever if no such pair exists -- confirm that
    # `random_DAG` rules that out.
    while not G.path_exists(V[0], V[-1]):
        print("Skipping")
        idx = np.random.randint(0, len(V))
        V[idx], V[-1] = V[-1], V[idx]

    mab = ShortestPathBandit(G, V[0], V[-1])
    policy = UCB1(C=1, ev_prior=0.5)
    policy = BanditTrainer().train(policy, mab, ep_length, n_episodes, n_duplicates)


def plot_comparison():
    """
    Compare several policies on one shared Bernoulli bandit using
    `BanditTrainer.compare`.

    The policies compared are epsilon-greedy, UCB1 and Thompson sampling
    with a Beta-Binomial model; the RNG is seeded with 1234.
    """
    np.random.seed(1234)

    bandit = random_bernoulli_mab(n_arms=10)
    contenders = [
        EpsilonGreedy(epsilon=0.05, ev_prior=0.5),
        UCB1(C=1, ev_prior=0.5),
        ThompsonSamplingBetaBinomial(alpha=1, beta=1),
    ]

    steps_per_episode, episodes, repeats = 1, 5000, 5
    BanditTrainer().compare(
        contenders, bandit, steps_per_episode, episodes, repeats,
    )


================================================
FILE: numpy_ml/plots/gmm_plots.py
================================================
# flake8: noqa
import numpy as np
from sklearn.datasets.samples_generator import make_blobs

from scipy.stats import multivariate_normal

import matplotlib.pyplot as plt
import seaborn as sns

# https://seaborn.pydata.org/generated/seaborn.set_context.html
# https://seaborn.pydata.org/generated/seaborn.set_style.html
sns.set_style("white")
sns.set_context("paper", font_scale=1)

from numpy_ml.gmm import GMM

from matplotlib.colors import ListedColormap


def plot_countour(X, x, y, z, ax, xlim, ylim):
    """
    Draw six black contour levels of `z` over the grid (`x`, `y`) on `ax`,
    apply the axis limits, and fix the aspect ratio.

    `X` is accepted but not used by this function. Returns `ax`.
    """
    ax.contour(x, y, z, 6, linewidths=0.5, colors="k")

    ax.set_xlim(*xlim)
    ax.set_ylim(*ylim)

    # Fix the aspect ratio at 1 regardless of axis units by scaling with the
    # ratio of the axis extents currently reported by the axes.
    x_lo, x_hi = ax.get_xlim()
    y_lo, y_hi = ax.get_ylim()
    x_extent = x_hi - x_lo
    y_extent = y_hi - y_lo
    ax.set_aspect(1 * (x_extent / y_extent), adjustable="box")
    return ax


def plot_clusters(model, X, ax):
    """
    Plot the density contours of every mixture component of `model` together
    with the data points, colored by their most likely component.

    Parameters
    ----------
    model : object
        Fitted mixture model. This function reads `model.C` (number of
        components), `model.mu`, `model.sigma` and `model.Q`.
    X : numpy array
        Data points; only columns 0 and 1 are plotted.
    ax : matplotlib axes
        Axes to draw on.

    Returns
    -------
    ax : matplotlib axes
        The axes that were drawn on.
    """
    x_lo, x_hi = X[:, 0].min(), X[:, 0].max()
    y_lo, y_hi = X[:, 1].min(), X[:, 1].max()

    # pad the plotted region by 10% of the data extent on every side
    xmin, xmax = x_lo - 0.1 * (x_hi - x_lo), x_hi + 0.1 * (x_hi - x_lo)
    ymin, ymax = y_lo - 0.1 * (y_hi - y_lo), y_hi + 0.1 * (y_hi - y_lo)

    # The evaluation grid does not depend on the component, so build it once
    # rather than once per component.
    x = np.linspace(xmin, xmax, 500)
    y = np.linspace(ymin, ymax, 500)
    X1, Y1 = np.meshgrid(x, y)
    xy = np.column_stack([X1.flat, Y1.flat])

    for c in range(model.C):
        rv = multivariate_normal(model.mu[c], model.sigma[c], allow_singular=True)

        # density values at the grid points
        Z = rv.pdf(xy).reshape(X1.shape)
        ax = plot_countour(X, X1, Y1, Z, ax=ax, xlim=(xmin, xmax), ylim=(ymin, ymax))
        ax.plot(model.mu[c, 0], model.mu[c, 1], "ro")

    # plot data points, one scatter call per assigned component
    cm = ListedColormap(sns.color_palette().as_hex())
    labels = model.Q.argmax(1)
    for i in set(labels):
        members = labels == i
        # NOTE(review): `i - 1` sends label 0 to the LAST palette color via
        # negative indexing; kept as-is to preserve the existing coloring.
        ax.scatter(X[members, 0], X[members, 1], c=cm.colors[i - 1], s=30)
    return ax


def plot():
    """
    Fit GMMs to 16 random blob datasets and save a 4x4 grid of cluster plots
    to "img/plot.png".

    For every panel, ten GMMs with different seeds are fit and the one with
    the highest `best_elbo` is plotted.
    """
    fig, axes = plt.subplots(4, 4)
    fig.set_size_inches(10, 10)

    n_ex, n_in = 150, 2
    for panel_idx, panel in enumerate(axes.flatten()):
        n_classes = np.random.randint(2, 4)
        X, _ = make_blobs(
            n_samples=n_ex, centers=n_classes, n_features=n_in, random_state=panel_idx
        )
        X -= X.mean(axis=0)  # center the data

        # take best fit over 10 runs
        top_elbo = -np.inf
        for run in range(10):
            candidate = GMM(C=n_classes, seed=run * 3)
            status = candidate.fit(X, max_iter=100, verbose=False)
            # a non-zero return value is treated as a failed fit; try again
            while status != 0:
                print("Components collapsed; Refitting")
                status = candidate.fit(X, max_iter=100, verbose=False)

            if candidate.best_elbo > top_elbo:
                top_elbo = candidate.best_elbo
                G = candidate

        panel = plot_clusters(G, X, panel)
        panel.xaxis.set_ticklabels([])
        panel.yaxis.set_ticklabels([])
        panel.set_title(
            "# Classes: {}; Final VLB: {:.2f}".format(n_classes, G.best_elbo)
        )

    plt.tight_layout()
    plt.savefig("img/plot.png", dpi=300)
    plt.close("all")


================================================
FILE: numpy_ml/plots/hmm_plots.py
================================================
# flake8: noqa
import numpy as np
from matplotlib import pyplot as plt

import seaborn as sns

# https://seaborn.pydata.org/generated/seaborn.set_context.html
# https://seaborn.pydata.org/generated/seaborn.set_style.html
sns.set_style("white")
sns.set_context("notebook", font_scale=0.8)

from hmmlearn.hmm import MultinomialHMM as MHMM
from numpy_ml.hmm import MultinomialHMM


def generate_training_data(params, n_steps=500, n_examples=15):
    """
    Sample `n_examples` observation sequences of length `n_steps` from a
    `MultinomialHMM` built from `params`.

    `params` must provide the keys "A", "B", "pi", "latent_states" and
    "obs_types". Returns the sampled observation sequences stacked into a
    numpy array.
    """
    sampler = MultinomialHMM(A=params["A"], B=params["B"], pi=params["pi"])
    states, symbols = params["latent_states"], params["obs_types"]

    sequences = []
    for _ in range(n_examples):
        latent, obs = sampler.generate(n_steps, states, symbols)
        assert len(latent) == len(obs) == n_steps
        sequences.append(obs)

    return np.array(sequences)


def default_hmm():
    """
    Return the parameters of a small reference HMM as a dict.

    The HMM has two latent states ("H", "C") and four observation types
    (0-3). Keys: "latent_states", "obs_types", "V" (number of observation
    types), "N" (number of latent states), "O" (one example sequence of 3
    observations, shape (1, 3)), "A" (transition matrix), "B" (emission
    matrix) and "pi" (initial state distribution).
    """
    latent_states = ["H", "C"]
    obs_types = [0, 1, 2, 3]

    params = {
        "latent_states": latent_states,
        "obs_types": obs_types,
        # derived sizes
        "V": len(obs_types),
        "N": len(latent_states),
        # a single example sequence with T=3 observations
        "O": np.array([[1, 3, 1]]),
        "A": np.array([[0.9, 0.1], [0.5, 0.5]]),
        "B": np.array([[0.2, 0.7, 0.09, 0.01], [0.1, 0.0, 0.8, 0.1]]),
        "pi": np.array([0.75, 0.25]),
    }
    return params


def plot_matrices(params, best, best_theirs):
    """
    Save a 3x3 grid of annotated heatmaps comparing HMM parameters to
    "img/plot.png".

    Rows are the transition matrix ("A"), emission matrix ("B") and prior
    ("pi"); columns are the true values from `params`, this package's
    estimate and hmmlearn's estimate. `best` and `best_theirs` are
    ``(log_likelihood, parameter_dict)`` pairs.
    """
    ll_mine, best = best
    ll_theirs, best_theirs = best_theirs

    def draw(matrix, axis):
        """Render one annotated heatmap on a fixed [0, 1] color scale."""
        return sns.heatmap(
            matrix,
            vmin=0.0,
            vmax=1.0,
            fmt=".2f",
            cmap="copper",
            cbar=False,
            annot=True,
            ax=axis,
            xticklabels=[],
            yticklabels=[],
            linewidths=0.25,
        )

    fig, axes = plt.subplots(3, 3)
    rows = [("A", "Transition"), ("B", "Emission"), ("pi", "Prior")]
    for row, (key, tt) in enumerate(rows):
        columns = [
            (params[key], "{} (True)"),
            (best[key], "{} (Mine)"),
            (best_theirs[key], "{} (hmmlearn)"),
        ]
        for col, (matrix, title_fmt) in enumerate(columns):
            if key == "pi":
                # show the prior vector as a single column
                matrix = matrix.reshape(-1, 1)
            panel = draw(matrix, axes[row, col])
            panel.set_title(title_fmt.format(tt))

    fig.suptitle("LL (mine): {:.2f}, LL (hmmlearn): {:.2f}".format(ll_mine, ll_theirs))
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.savefig("img/plot.png", dpi=300)
    plt.close()


def test_HMM():
    """
    Fit this package's `MultinomialHMM` and hmmlearn's implementation to the
    same sampled sequences, keep the best of five runs for each, print the
    log likelihoods and plot the recovered parameters with `plot_matrices`.
    """
    np.random.seed(12345)
    np.set_printoptions(precision=5, suppress=True)

    P = default_hmm()
    ls, obs = P["latent_states"], P["obs_types"]

    # sample the training sequences from the reference HMM
    O = generate_training_data(P, n_steps=30, n_examples=25)
    n_seqs, seq_len = O.shape[0], O.shape[1]

    tol, n_runs = 1e-5, 5
    best = (-np.inf, [])
    best_theirs = (-np.inf, [])
    for _ in range(n_runs):
        A_, B_, pi_ = MultinomialHMM().fit(O, ls, obs, tol=tol, verbose=True)

        # hmmlearn takes all sequences concatenated into one column, plus
        # the length of each individual sequence
        O_flat = O.flatten().reshape(-1, 1)
        theirs = MHMM(
            tol=tol,
            verbose=True,
            n_iter=int(1e9),
            transmat_prior=1,
            startprob_prior=1,
            algorithm="viterbi",
            n_components=len(ls),
        ).fit(O_flat, lengths=[seq_len] * n_seqs)

        # score our estimate by summing per-sequence log likelihoods
        scorer = MultinomialHMM(A=A_, B=B_, pi=pi_)
        like = np.sum([scorer.log_likelihood(seq) for seq in O])
        like_theirs = theirs.score(O_flat, lengths=[seq_len] * n_seqs)

        if like > best[0]:
            best = (like, {"A": A_, "B": B_, "pi": pi_})

        if like_theirs > best_theirs[0]:
            their_params = {
                "A": theirs.transmat_,
                "B": theirs.emissionprob_,
                "pi": theirs.startprob_,
            }
            best_theirs = (like_theirs, their_params)

    print("Final log likelihood of sequence: {:.5f}".format(best[0]))
    print("Final log likelihood of sequence (theirs): {:.5f}".format(best_theirs[0]))
    plot_matrices(P, best, best_theirs)


================================================
FILE: numpy_ml/plots/lda_plots.py
================================================
# flake8: noqa
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# https://seaborn.pydata.org/generated/seaborn.set_context.html
# https://seaborn.pydata.org/generated/seaborn.set_style.html
sns.set_style("white")
sns.set_context("paper", font_scale=1)

np.random.seed(12345)

from numpy_ml.lda import LDA


def generate_corpus(D=300):
    """
    Generate a synthetic corpus from an LDA generative model.

    The corpus uses 10 topics and a 30-word vocabulary. Documents fall into
    three groups (first, middle and last third), each with its own Dirichlet
    prior over topics. Random draws are made in a fixed order, so for a
    given seed and `D` the output is reproducible.

    Parameters
    ----------
    D : int
        Number of documents to generate. Default is 300.

    Returns
    -------
    corpus : list of length `D`
        One float array of word indices per document; each document has
        between 150 and 199 words.
    T : int
        The number of topics used to generate the corpus.
    """
    T = 10
    V = 30
    N = np.random.randint(150, 200, size=D)

    # Create a document-topic distribution for 3 different types of documents
    alpha1 = np.array((20, 15, 10, 1, 1, 1, 1, 1, 1, 1))
    alpha2 = np.array((1, 1, 1, 10, 15, 20, 1, 1, 1, 1))
    alpha3 = np.array((1, 1, 1, 1, 1, 1, 10, 12, 15, 18))

    # Arbitrarily choose each topic to have 3 very common, diagnostic words
    # These words are barely shared with any other topic
    beta_probs = (
        np.ones((V, T)) + np.array([np.arange(V) % T == t for t in range(T)]).T * 19
    )
    # one Dirichlet draw per topic; column t is topic t's word distribution
    beta_gen = np.array([np.random.dirichlet(conc) for conc in beta_probs.T]).T

    topic_ids = np.arange(T)
    word_ids = np.arange(V)

    corpus = []
    theta = np.empty((D, T))

    # Generate each document from the LDA model
    for d in range(D):
        # Draw topic distribution for the document
        if d < (D / 3):
            alpha = alpha1
        elif d < 2 * (D / 3):
            alpha = alpha2
        else:
            alpha = alpha3
        theta[d, :] = np.random.dirichlet(alpha, 1)[0]

        # Collect words in a list and convert once at the end; growing an
        # array with np.append copies it on every word (quadratic time).
        words = []
        for _ in range(N[d]):
            # Draw a topic according to the document's topic distribution
            z_n = np.random.choice(topic_ids, p=theta[d, :])

            # Draw a word according to the topic-word distribution
            w_n = np.random.choice(word_ids, p=beta_gen[:, z_n])
            words.append(w_n)

        # float dtype matches what repeated np.append onto an empty array gave
        corpus.append(np.array(words, dtype=float))
    return corpus, T


def plot_unsmoothed():
    """
    Train an LDA model on a synthetic corpus and save side-by-side heatmaps
    of its `beta` and `gamma` attributes to "img/plot_unsmoothed.png".
    """
    corpus, T = generate_corpus()
    L = LDA(T)
    L.train(corpus, verbose=False)

    fig, axes = plt.subplots(1, 2)
    panels = [
        ("beta", "Words", "Recovered topic-word distribution"),
        ("gamma", "Documents", "Recovered document-topic distribution"),
    ]
    for axis, (attr, row_label, title) in zip(axes, panels):
        heat = sns.heatmap(getattr(L, attr), xticklabels=[], yticklabels=[], ax=axis)
        heat.set_xlabel("Topics")
        heat.set_ylabel(row_label)
        heat.set_title(title)

    plt.savefig("img/plot_unsmoothed.png", dpi=300)
    plt.close("all")


================================================
FILE: numpy_ml/plots/lm_plots.py
================================================
# flake8: noqa
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.datasets.samples_generator import make_blobs
from sklearn.linear_model import LogisticRegression as LogisticRegression_sk
from sklearn.datasets import make_regression
from sklearn.metrics import zero_one_loss, r2_score

import matplotlib.pyplot as plt

import seaborn as sns

# https://seaborn.pydata.org/generated/seaborn.set_context.html
# https://seaborn.pydata.org/generated/seaborn.set_style.html
sns.set_style("white")
sns.set_context("paper", font_scale=0.5)


from numpy_ml.linear_models import (
    RidgeRegression,
    LinearRegression,
    BayesianLinearRegressionKnownVariance,
    BayesianLinearRegressionUnknownVariance,
    LogisticRegression,
)

#######################################################################
#                           Data Generators                           #
#######################################################################


def random_binary_tensor(shape, sparsity=0.5):
    """
    Return a float array of the given `shape` containing only 0.0 and 1.0.

    Each entry is 1.0 when an independent uniform draw from [0, 1) is at
    least ``1 - sparsity``, so `sparsity` is the expected fraction of ones.
    """
    threshold = 1 - sparsity
    draws = np.random.rand(*shape)
    return (draws >= threshold).astype(float)


def random_regression_problem(n_ex, n_in, n_out, intercept=0, std=1, seed=0):
    """
    Build a random regression dataset with `make_regression` and split it
    70/30 into train and test sets.

    `intercept` is passed as the generator's bias, `std` as its noise level,
    and `seed` seeds both the generator and the split. Returns
    ``(X_train, y_train, X_test, y_test, coef)``.
    """
    generator_args = {
        "n_samples": n_ex,
        "n_features": n_in,
        "n_targets": n_out,
        "bias": intercept,
        "noise": std,
        "coef": True,
        "random_state": seed,
    }
    features, targets, coef = make_regression(**generator_args)

    parts = train_test_split(features, targets, test_size=0.3, random_state=seed)
    X_train, X_test, y_train, y_test = parts
    return X_train, y_train, X_test, y_test, coef


def random_classification_problem(n_ex, n_classes, n_in, seed=0):
    """
    Build a random classification dataset with `make_blobs` (one blob per
    class) and split it 70/30 into train and test sets.

    `seed` seeds both the generator and the split. Returns
    ``(X_train, y_train, X_test, y_test)``.
    """
    features, labels = make_blobs(
        n_samples=n_ex, centers=n_classes, n_features=n_in, random_state=seed
    )
    parts = train_test_split(features, labels, test_size=0.3, random_state=seed)
    X_train, X_test, y_train, y_test = parts
    return X_train, y_train, X_test, y_test


#######################################################################
#                                Plots                                #
#######################################################################


def plot_logistic():
    """
    Compare this package's `LogisticRegression` with scikit-learn's on 16
    random one-feature, two-class problems and save the 4x4 grid of results
    to "plot_logistic.png".

    Each panel shows the test points (grouped by this package's predicted
    class), both fitted curves and both zero-one losses in percent.
    """
    np.random.seed(12345)

    n_in, n_ex = 1, 150
    fig, axes = plt.subplots(4, 4)
    for seed, panel in enumerate(axes.flatten()):
        X_train, y_train, X_test, y_test = random_classification_problem(
            n_ex, n_classes=2, n_in=n_in, seed=seed
        )

        # this package's model
        LR = LogisticRegression(penalty="l2", gamma=0.2, fit_intercept=True)
        LR.fit(X_train, y_train, lr=0.1, tol=1e-7, max_iter=1e7)
        y_pred = (LR.predict(X_test) >= 0.5) * 1.0
        loss = zero_one_loss(y_test, y_pred) * 100.0

        # scikit-learn reference
        LR_sk = LogisticRegression_sk(
            penalty="l2", tol=0.0001, C=0.8, fit_intercept=True, random_state=seed
        )
        LR_sk.fit(X_train, y_train)
        y_pred_sk = (LR_sk.predict(X_test) >= 0.5) * 1.0
        loss_sk = zero_one_loss(y_test, y_pred_sk) * 100.0

        # evaluate both models on a grid padded 10% beyond the test range
        lo, hi = min(X_test), max(X_test)
        X_plot = np.linspace(lo - 0.1 * (hi - lo), hi + 0.1 * (hi - lo), 100)
        y_plot = LR.predict(X_plot)
        y_plot_sk = LR_sk.predict_proba(X_plot.reshape(-1, 1))[:, 1]

        for cls in (0, 1):
            chosen = y_pred == cls
            panel.scatter(X_test[chosen], y_test[chosen], alpha=0.5)
        panel.plot(X_plot, y_plot, label="mine", alpha=0.75)
        panel.plot(X_plot, y_plot_sk, label="sklearn", alpha=0.75)
        panel.legend()
        panel.set_title("Loss mine: {:.2f} Loss sklearn: {:.2f}".format(loss, loss_sk))

        panel.xaxis.set_ticklabels([])
        panel.yaxis.set_ticklabels([])

    plt.tight_layout()
    plt.savefig("plot_logistic.png", dpi=300)
    plt.close("all")


def plot_bayes():
    """
    Fit four regressors to one small noisy dataset with two injected
    outliers and save a 1x4 comparison figure to "plot_bayes.png".

    The panels show ordinary linear regression, ridge regression (alpha=1)
    and Bayesian linear regression with known and unknown variance. The two
    Bayesian panels also overlay 200 lines sampled from the coefficient
    posterior.
    """
    np.random.seed(12345)
    intercept, std = 10, 15
    X_train, y_train, X_test, y_test, coefs = random_regression_problem(
        20, 1, 1, intercept=intercept, std=std, seed=0
    )

    # add some outliers: two shifted copies of training points whose targets
    # are pushed well off the true line
    x1, x2 = X_train[0] + 0.5, X_train[6] - 0.3
    y1 = np.dot(x1, coefs) + intercept + 25
    y2 = np.dot(x2, coefs) + intercept - 31
    X_train = np.vstack([X_train, np.array([x1, x2])])
    y_train = np.hstack([y_train, [y1[0], y2[0]]])

    def test_mse(pred):
        """Mean squared error of `pred` against the test targets."""
        return np.mean((y_test - pred) ** 2)

    def prior_mean():
        """Fresh prior mean vector: [intercept, true coefficient]."""
        return np.c_[intercept, coefs][0]

    LR = LinearRegression(fit_intercept=True)
    LR.fit(X_train, y_train)
    loss = test_mse(LR.predict(X_test))

    ridge = RidgeRegression(alpha=1, fit_intercept=True)
    ridge.fit(X_train, y_train)
    loss_ridge = test_mse(ridge.predict(X_test))

    LR_var = BayesianLinearRegressionKnownVariance(
        mu=prior_mean(), sigma=np.sqrt(std), V=None, fit_intercept=True,
    )
    LR_var.fit(X_train, y_train)
    loss_var = test_mse(LR_var.predict(X_test))

    LR_novar = BayesianLinearRegressionUnknownVariance(
        alpha=1, beta=2, mu=prior_mean(), V=None, fit_intercept=True
    )
    LR_novar.fit(X_train, y_train)
    loss_novar = test_mse(LR_novar.predict(X_test))

    # plotting grid padded 10% beyond the range of the test inputs
    lo, hi = min(X_test), max(X_test)
    X_plot = np.linspace(lo - 0.1 * (hi - lo), hi + 0.1 * (hi - lo), 100)
    y_plot = LR.predict(X_plot)
    y_plot_ridge = ridge.predict(X_plot)
    y_plot_var = LR_var.predict(X_plot)
    y_plot_novar = LR_novar.predict(X_plot)
    y_true = [np.dot(x, coefs) + intercept for x in X_plot]

    fig, axes = plt.subplots(1, 4)
    axes = axes.flatten()

    def point_estimate_panel(axis, y_fit, title):
        """Test points, fitted line and true function for an MLE-style fit."""
        axis.scatter(X_test, y_test)
        axis.plot(X_plot, y_fit, label="MLE")
        axis.plot(X_plot, y_true, label="True fn")
        axis.set_title(title)
        axis.legend()

    def posterior_panel(axis, y_fit, posterior, title):
        """MAP line plus 200 faint lines sampled from the posterior."""
        axis.plot(X_plot, y_fit, label="MAP")
        mu, cov = posterior.mean, posterior.cov
        for _ in range(200):
            # b_samp[0] is used as the intercept and b_samp[1] as the slope
            b_samp = np.random.multivariate_normal(mu, cov)
            y_samp = [np.dot(x, b_samp[1]) + b_samp[0] for x in X_plot]
            axis.plot(X_plot, y_samp, alpha=0.05)
        axis.scatter(X_test, y_test)
        axis.plot(X_plot, y_true, label="True fn")
        axis.legend()
        axis.set_title(title)

    point_estimate_panel(
        axes[0], y_plot, "Linear Regression\nMLE Test MSE: {:.2f}".format(loss)
    )
    point_estimate_panel(
        axes[1],
        y_plot_ridge,
        "Ridge Regression (alpha=1)\nMLE Test MSE: {:.2f}".format(loss_ridge),
    )
    # The known-variance panel must draw its posterior samples before the
    # unknown-variance panel so the seeded figure is reproduced.
    posterior_panel(
        axes[2],
        y_plot_var,
        LR_var.posterior["b"],
        "Bayesian Regression (known variance)\nMAP Test MSE: {:.2f}".format(loss_var),
    )
    posterior_panel(
        axes[3],
        y_plot_novar,
        LR_novar.posterior["b | sigma**2"],
        "Bayesian Regression (unknown variance)\nMAP Test MSE: {:.2f}".format(
            loss_novar
        ),
    )

    for ax in axes:
        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])

    fig.set_size_inches(7.5, 1.875)
    plt.savefig("plot_bayes.png", dpi=300)
    plt.close("all")


def plot_regression():
    """
    Compare linear regression with the two Bayesian linear regressors on 16
    random one-feature problems and save the 4x4 grid to
    "plot_regression.png".

    Each problem has a randomly drawn noise level and intercept. Each panel
    shows the test points, the three fitted lines and their test MSEs.
    """
    np.random.seed(12345)

    n_in, n_out, n_ex = 1, 1, 50
    fig, axes = plt.subplots(4, 4)
    for seed, panel in enumerate(axes.flatten()):
        # draw order (noise level, scale, offset) is fixed for reproducibility
        std = np.random.randint(0, 100)
        scale = np.random.rand()
        offset = np.random.randint(-300, 300)
        intercept = scale * offset
        X_train, y_train, X_test, y_test, coefs = random_regression_problem(
            n_ex, n_in, n_out, intercept=intercept, std=std, seed=seed
        )

        def evaluate(model):
            """Return (test MSE, test R^2) for a fitted model."""
            pred = model.predict(X_test)
            return np.mean((y_test - pred) ** 2), r2_score(y_test, pred)

        LR = LinearRegression(fit_intercept=True)
        LR.fit(X_train, y_train)
        loss, r2 = evaluate(LR)

        LR_var = BayesianLinearRegressionKnownVariance(
            mu=np.c_[intercept, coefs][0],
            sigma=np.sqrt(std),
            V=None,
            fit_intercept=True,
        )
        LR_var.fit(X_train, y_train)
        loss_var, r2_var = evaluate(LR_var)

        LR_novar = BayesianLinearRegressionUnknownVariance(
            alpha=1, beta=2, mu=np.c_[intercept, coefs][0], V=None, fit_intercept=True,
        )
        LR_novar.fit(X_train, y_train)
        loss_novar, r2_novar = evaluate(LR_novar)

        # plotting grid padded 10% beyond the range of the test inputs
        lo, hi = min(X_test), max(X_test)
        X_plot = np.linspace(lo - 0.1 * (hi - lo), hi + 0.1 * (hi - lo), 100)
        y_plot = LR.predict(X_plot)
        y_plot_var = LR_var.predict(X_plot)
        y_plot_novar = LR_novar.predict(X_plot)

        panel.scatter(X_test, y_test, marker="x", alpha=0.5)
        fitted_lines = [
            (y_plot, "linear regression"),
            (y_plot_var, "Bayes (w var)"),
            (y_plot_novar, "Bayes (no var)"),
        ]
        for y_line, label in fitted_lines:
            panel.plot(X_plot, y_line, label=label, alpha=0.5)
        panel.legend()
        panel.set_title(
            "MSE\nLR: {:.2f} Bayes (w var): {:.2f}\nBayes (no var): {:.2f}".format(
                loss, loss_var, loss_novar
            )
        )

        panel.xaxis.set_ticklabels([])
        panel.yaxis.set_ticklabels([])

    plt.tight_layout()
    plt.savefig("plot_regression.png", dpi=300)
    plt.close("all")


================================================
FILE: numpy_ml/plots/ngram_plots.py
================================================
# flake8: noqa
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# https://seaborn.pydata.org/generated/seaborn.set_context.html
# https://seaborn.pydata.org/generated/seaborn.set_style.html
sns.set_style("white")
sns.set_context("notebook", font_scale=1)

from numpy_ml.ngram import MLENGram, AdditiveNGram, GoodTuringNGram


def plot_count_models(GT, N):
    """
    Plot observed count-of-counts against the fitted count model for
    `N`-grams and save the figure to "test.png".

    `GT` is expected to expose `_num_grams_with_count`, `_count_models` and
    `counts`. The model curve extends ten counts beyond the largest observed
    count, and its predictions are exponentiated before plotting.
    """
    count_of_counts = GT._num_grams_with_count
    model = GT._count_models[N]
    max_n = max(GT.counts[N].values())

    observed = [count_of_counts(r, N) for r in range(1, max_n + 1)]
    fitted = [np.exp(model.predict(np.array([r]))) for r in range(1, max_n + 11)]

    plt.scatter(range(max_n), observed, c="r", label="actual")
    plt.plot(range(max_n + 10), fitted, "-", label="model")
    plt.ylim([-1, 100])
    plt.xlabel("Count ($r$)")
    plt.ylabel("Count-of-counts ($N_r$)")
    plt.legend()
    plt.savefig("test.png")
    plt.close()


def compare_probs(fp, N):
    """
    Plot the log probability of one seen and one unseen n-gram under
    additive smoothing, as a function of the smoothing constant K, and save
    the figure to "img/add_smooth.png".

    K takes 20 evenly spaced values in [0, 10]. The MLE log probability of
    the seen n-gram is drawn as a dashed reference line. Good-Turing values
    are computed but their curves are not drawn.
    """
    seen = ("<bol>", "the")
    unseen = ("<bol>", "asdf")

    MLE = MLENGram(N, unk=False, filter_punctuation=False, filter_stopwords=False)
    MLE.train(fp, encoding="utf-8-sig")

    GTT = GoodTuringNGram(
        N, conf=1.96, unk=False, filter_stopwords=False, filter_punctuation=False
    )
    GTT.train(fp, encoding="utf-8-sig")
    gtt_prob = GTT.log_prob(seen, N)
    gtt_prob_u = GTT.log_prob(unseen, N)

    Ks = np.linspace(0, 10, 20)
    add_y, mle_y, gtt_y = [], [], []
    addu_y, mleu_y, gttu_y = [], [], []
    for K in Ks:
        # a separate additive model is trained for every smoothing constant
        ADD = AdditiveNGram(
            N, K, unk=False, filter_punctuation=False, filter_stopwords=False
        )
        ADD.train(fp, encoding="utf-8-sig")

        add_y.append(ADD.log_prob(seen, N))
        mle_y.append(MLE.log_prob(seen, N))
        gtt_y.append(gtt_prob)

        mle_prob_u = MLE.log_prob(unseen, N)
        add_prob_u = ADD.log_prob(unseen, N)
        addu_y.append(add_prob_u)
        mleu_y.append(mle_prob_u)
        gttu_y.append(gtt_prob_u)

    plt.plot(Ks, add_y, label="Additive (seen ngram)")
    plt.plot(Ks, addu_y, label="Additive (unseen ngram)")
    #  plt.plot(np.linspace(0, 10, 20), gtt_y, label="Good-Turing (seen ngram)")
    #  plt.plot(np.linspace(0, 10, 20), gttu_y, label="Good-Turing (unseen ngram)")
    plt.plot(Ks, mle_y, "--", label="MLE (seen ngram)")
    plt.xlabel("K")
    plt.ylabel("log P(sequence)")
    plt.legend()
    plt.savefig("img/add_smooth.png")
    plt.close("all")


def plot_gt_freqs(fp):
    """
    Plot rank-ordered unigram probabilities under three estimators on
    log-log axes and save the figure to "img/rank_probs.png".

    The three series are the empirical (MLE) frequencies, the Good-Turing
    smoothed probabilities, and additive smoothing with K = 1.

    Parameters
    ----------
    fp : str
        Path to the training corpus; passed to each model's `train` method
        with encoding "utf-8-sig".
    """
    MLE = MLENGram(1, filter_punctuation=False, filter_stopwords=False)
    MLE.train(fp, encoding="utf-8-sig")
    counts = dict(MLE.counts[1])

    GT = GoodTuringNGram(1, filter_stopwords=False, filter_punctuation=False)
    GT.train(fp, encoding="utf-8-sig")

    ADD = AdditiveNGram(1, 1, filter_punctuation=False, filter_stopwords=False)
    ADD.train(fp, encoding="utf-8-sig")

    tot = float(sum(counts.values()))
    freqs = {token: cnt / tot for token, cnt in counts.items()}
    sgt_probs = {tok: np.exp(GT.log_prob(tok, 1)) for tok in counts}
    as_probs = {tok: np.exp(ADD.log_prob(tok, 1)) for tok in counts}

    series = [
        (freqs, "k+", "MLE"),
        (sgt_probs, "r+", "simple Good-Turing"),
        (as_probs, "b+", "Laplace smoothing"),
    ]
    for probs, marker, label in series:
        ranked = sorted(probs.values(), reverse=True)
        # Ranks start at 1: a rank of 0 has no position on a log-scaled axis,
        # so the most probable token would not be drawn correctly.
        ranks = np.arange(1, len(ranked) + 1)
        plt.loglog(ranks, ranked, marker, alpha=0.25, label=label)

    plt.xlabel("Rank")
    plt.ylabel("Probability")
    plt.legend()
    plt.tight_layout()
    plt.savefig("img/rank_probs.png")
    plt.close("all")


================================================
FILE: numpy_ml/plots/nn_activations_plots.py
================================================
# flake8: noqa
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# https://seaborn.pydata.org/generated/seaborn.set_context.html
# https://seaborn.pydata.org/generated/seaborn.set_style.html
sns.set_style("white")
sns.set_context("notebook", font_scale=0.7)

from numpy_ml.neural_nets.activations import (
    Affine,
    ReLU,
    LeakyReLU,
    Tanh,
    Sigmoid,
    ELU,
    Exponential,
    SELU,
    HardSigmoid,
    SoftPlus,
)


def plot_activations():
    """
    Plot ten activation functions with their first and second derivatives
    on [-3, 3] in a 2x5 grid and save the figure to "img/plot.png".
    """
    fig, axes = plt.subplots(2, 5, sharex=True, sharey=True)
    activations = [
        Affine(),
        Tanh(),
        Sigmoid(),
        ReLU(),
        LeakyReLU(),
        ELU(),
        Exponential(),
        SELU(),
        HardSigmoid(),
        SoftPlus(),
    ]

    for panel, act in zip(axes.flatten(), activations):
        X = np.linspace(-3, 3, 100).reshape(100, 1)

        # value, first derivative, second derivative -- evaluated and drawn
        # in that order
        curves = (
            (act, r"$y$"),
            (act.grad, r"$\frac{dy}{dx}$"),
            (act.grad2, r"$\frac{d^2 y}{dx^2}$"),
        )
        for evaluate, label in curves:
            panel.plot(X, evaluate(X), label=label, alpha=1.0)

        # dashed reference lines through the origin
        panel.hlines(0, -3, 3, lw=1, linestyles="dashed", color="k")
        panel.vlines(0, -1.2, 1.2, lw=1, linestyles="dashed", color="k")

        panel.set_ylim(-1.1, 1.1)
        panel.set_xlim(-3, 3)
        panel.set_xticks([])
        panel.set_yticks([-1, 0, 1])
        panel.xaxis.set_visible(False)
        panel.set_title("{}".format(act))
        panel.legend(frameon=False)
        sns.despine(left=True, bottom=True)

    fig.set_size_inches(10, 5)
    plt.tight_layout()
    plt.savefig("img/plot.png", dpi=300)
    plt.close("all")


# Script entry point: when this module is executed directly (rather than
# imported), render and save the activation-function figure.
if __name__ == "__main__":
    plot_activations()


================================================
FILE: numpy_ml/plots/nn_schedulers_plots.py
================================================
# flake8: noqa

import time
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# https://seaborn.pydata.org/generated/seaborn.set_context.html
# https://seaborn.pydata.org/generated/seaborn.set_style.html
sns.set_style("white")
sns.set_context("notebook", font_scale=0.7)

from numpy_ml.neural_nets.schedulers import (
    ConstantScheduler,
    ExponentialScheduler,
    NoamScheduler,
    KingScheduler,
)


def king_loss_fn(x):
    """
    Piecewise loss curve as a function of the step `x`.

    The value falls linearly up to x = 250, stays constant up to x = 600,
    falls linearly again up to x = 700, and is constant afterwards.
    """
    plateau = 20.00372665317208
    floor = 0.003726653172066108

    if x <= 250:
        return -0.25 * x + 82.50372665317208
    if x <= 600:
        return plateau
    if x <= 700:
        return -0.2 * x + 140.00372665317207
    return floor


def plot_schedulers():
    """
    Plot the learning rate produced by each scheduler family over 999 steps.

    A 2 x 2 figure is created with one panel each for the Constant,
    Exponential, Noam, and King schedulers. Every scheduler is called once per
    step with the step index and the (rescaled) synthetic loss from
    `king_loss_fn`. The King panel additionally shows the loss curve itself.
    The figure is written to ``plot.png`` in the current working directory.

    Notes
    -----
    Each legend label is written to match the arguments of the scheduler it
    sits next to; keep the two in sync when editing a configuration.
    """
    fig, axes = plt.subplots(2, 2)
    schedulers = [
        (
            [ConstantScheduler(lr=0.01), "lr=1e-2"],
            [ConstantScheduler(lr=0.008), "lr=8e-3"],
            [ConstantScheduler(lr=0.006), "lr=6e-3"],
            [ConstantScheduler(lr=0.004), "lr=4e-3"],
            [ConstantScheduler(lr=0.002), "lr=2e-3"],
        ),
        (
            [
                ExponentialScheduler(
                    lr=0.01, stage_length=250, staircase=False, decay=0.4
                ),
                "lr=0.01, stage=250, stair=False, decay=0.4",
            ],
            [
                ExponentialScheduler(
                    lr=0.01, stage_length=250, staircase=True, decay=0.4
                ),
                "lr=0.01, stage=250, stair=True, decay=0.4",
            ],
            [
                ExponentialScheduler(
                    lr=0.01, stage_length=125, staircase=True, decay=0.1
                ),
                "lr=0.01, stage=125, stair=True, decay=0.1",
            ],
            [
                ExponentialScheduler(
                    lr=0.001, stage_length=250, staircase=False, decay=0.1
                ),
                "lr=0.001, stage=250, stair=False, decay=0.1",
            ],
            [
                ExponentialScheduler(
                    lr=0.001, stage_length=125, staircase=False, decay=0.8
                ),
                "lr=0.001, stage=125, stair=False, decay=0.8",
            ],
            [
                ExponentialScheduler(
                    lr=0.01, stage_length=250, staircase=False, decay=0.01
                ),
                "lr=0.01, stage=250, stair=False, decay=0.01",
            ],
        ),
        (
            [
                NoamScheduler(model_dim=512, scale_factor=1, warmup_steps=250),
                "dim=512, scale=1, warmup=250",
            ],
            [
                NoamScheduler(model_dim=256, scale_factor=1, warmup_steps=250),
                "dim=256, scale=1, warmup=250",
            ],
            [
                NoamScheduler(model_dim=512, scale_factor=1, warmup_steps=500),
                "dim=512, scale=1, warmup=500",
            ],
            [
                NoamScheduler(model_dim=256, scale_factor=1, warmup_steps=500),
                "dim=256, scale=1, warmup=500",
            ],
            [
                NoamScheduler(model_dim=512, scale_factor=2, warmup_steps=500),
                "dim=512, scale=2, warmup=500",
            ],
            [
                NoamScheduler(model_dim=512, scale_factor=0.5, warmup_steps=500),
                "dim=512, scale=0.5, warmup=500",
            ],
        ),
        (
            [
                KingScheduler(initial_lr=0.009, patience=150, decay=0.995),
                "lr=0.009, patience=150, decay=0.995",
            ],
            [
                KingScheduler(initial_lr=0.008, patience=100, decay=0.995),
                "lr=0.008, patience=100, decay=0.995",
            ],
            [
                KingScheduler(initial_lr=0.007, patience=50, decay=0.995),
                "lr=0.007, patience=50, decay=0.995",
            ],
            [
                KingScheduler(initial_lr=0.005, patience=25, decay=0.9),
                "lr=0.005, patience=25, decay=0.9",
            ],
        ),
    ]

    # The step grid and loss curve are identical for every panel, so build
    # them once rather than once per subplot.
    X = np.arange(1, 1000)
    loss = np.array([king_loss_fn(x) for x in X])

    # scale loss to fit on same axis as lr
    loss *= 0.01 / loss[0]

    titles = ["Constant", "Exponential", "Noam", "King"]
    for ax, schs, title in zip(axes.flatten(), schedulers, titles):
        t0 = time.time()
        print("Running {} scheduler".format(title))

        if title == "King":
            ax.plot(X, loss, ls=":", label="Loss")

        for sc, lg in schs:
            # every scheduler is called with (step, current loss)
            Y = np.array([sc(x, ll) for x, ll in zip(X, loss)])
            ax.plot(X, Y, label=lg, alpha=0.6)

        ax.legend(fontsize=5)
        ax.set_xlabel("Steps")
        ax.set_ylabel("Learning rate")
        ax.set_title("{} scheduler".format(title))
        print(
            "Finished plotting {} runs of {} in {:.2f}s".format(
                len(schs), title, time.time() - t0
            )
        )

    plt.tight_layout()
    plt.savefig("plot.png", dpi=300)
    plt.close("all")


# Script entry point: generate the scheduler comparison figure when this file
# is executed directly.
if __name__ == "__main__":
    plot_schedulers()


================================================
FILE: numpy_ml/plots/nonparametric_plots.py
================================================
# flake8: noqa
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Module-level seaborn styling. These calls run as soon as the module is loaded.
# References for the two calls below:
# https://seaborn.pydata.org/generated/seaborn.set_context.html
# https://seaborn.pydata.org/generated/seaborn.set_style.html
sns.set_style("white")
sns.set_context("paper", font_scale=0.5)

from numpy_ml.nonparametric import GPRegression, KNN, KernelRegression
from numpy_ml.linear_models.lm import LinearRegression

from sklearn.model_selection import train_test_split


def random_regression_problem(n_ex, n_in, n_out, d=3, intercept=0, std=1, seed=0):
    """
    Generate a noisy polynomial regression dataset and split it into train and
    test partitions.

    Parameters
    ----------
    n_ex : int
        Total number of examples to generate (before the train/test split).
    n_in : int
        Number of input columns in `X`.
    n_out : int
        Accepted for signature compatibility; not used by this function.
    d : int
        Number of polynomial coefficients to draw. Default is 3.
    intercept : float
        Value written into the last coefficient, i.e. the constant term
        under :func:`numpy.polyval`'s ordering. Default is 0.
    std : float
        Standard deviation of the Gaussian noise added to each example.
        Default is 1.
    seed : int
        `random_state` forwarded to `train_test_split`. Default is 0.

    Returns
    -------
    X_train, y_train, X_test, y_test : :py:class:`ndarray <numpy.ndarray>`
        The 70% / 30% train / test partitions of the inputs and targets.
    coef : :py:class:`ndarray <numpy.ndarray>` of shape `(d,)`
        The polynomial coefficients used to generate the targets.
    """
    # NB: the order of the draws below fixes how the global NumPy RNG stream
    # is consumed: coefficients first, then inputs, then one noise draw per
    # example.
    coef = np.random.uniform(0, 50, size=d)
    coef[-1] = intercept

    X = np.random.uniform(-100, 100, size=(n_ex, n_in))
    y = np.array([np.polyval(coef, row) + np.random.normal(0, std) for row in X])

    partitions = train_test_split(X, y, test_size=0.3, random_state=seed)
    X_train, X_test, y_train, y_test = partitions
    return X_train, y_train, X_test, y_test, coef


def plot_regression():
    """
    Compare OLS against kernel regression (polynomial and RBF kernels) on 16
    random 1-D regression problems.

    For each panel of a 4 x 4 grid a random problem is generated, a linear
    regression is fit, a grid search over `gamma` and `c0` selects the
    degree-3 polynomial kernel with the lowest test MSE, and an RBF kernel
    regression with ``sigma=1`` is fit. The test points and the three fitted
    curves are drawn, and the figure is saved to ``img/kr_plots.png``.
    """

    def mse(y_true, y_hat):
        # mean squared error between two arrays, ignoring their shapes
        return np.mean((y_true.flatten() - y_hat.flatten()) ** 2)

    np.random.seed(12345)
    fig, axes = plt.subplots(4, 4)

    for i, ax in enumerate(axes.flatten()):
        n_in, n_out = 1, 1
        d = np.random.randint(1, 5)
        n_ex = np.random.randint(5, 500)
        std = np.random.randint(0, 1000)
        intercept = np.random.rand() * np.random.randint(-300, 300)
        X_train, y_train, X_test, y_test, coefs = random_regression_problem(
            n_ex, n_in, n_out, d=d, intercept=intercept, std=std, seed=i
        )

        # ordinary least squares baseline
        LR = LinearRegression(fit_intercept=True)
        LR.fit(X_train, y_train)
        loss = mse(y_test, LR.predict(X_test))

        # exhaustive search for the polynomial kernel with the lowest test
        # MSE; ties go to the most recently evaluated kernel
        d = 3
        best_loss = np.inf
        gammas = np.linspace(1e-10, 1, 100)
        offsets = np.linspace(-1, 1000, 100)
        for gamma in gammas:
            for c0 in offsets:
                kernel = "PolynomialKernel(d={}, gamma={}, c0={})".format(d, gamma, c0)
                candidate = KernelRegression(kernel=kernel)
                candidate.fit(X_train, y_train)
                loss_poly = mse(y_test, candidate.predict(X_test))
                if loss_poly <= best_loss:
                    best_kernel, best_loss = kernel, loss_poly

        print("Best kernel: {} || loss: {:.4f}".format(best_kernel, best_loss))
        KR_poly = KernelRegression(kernel=best_kernel)
        KR_poly.fit(X_train, y_train)

        KR_rbf = KernelRegression(kernel="RBFKernel(sigma=1)")
        KR_rbf.fit(X_train, y_train)
        loss_rbf = mse(y_test, KR_rbf.predict(X_test))

        # evaluate every model on a grid extending 10% beyond the test range
        lo, hi = min(X_test), max(X_test)
        margin = 0.1 * (hi - lo)
        X_plot = np.linspace(lo - margin, hi + margin, 100)
        y_plot = LR.predict(X_plot)
        y_plot_poly = KR_poly.predict(X_plot)
        y_plot_rbf = KR_rbf.predict(X_plot)

        ax.scatter(X_test, y_test, alpha=0.5)
        ax.plot(X_plot, y_plot, label="OLS", alpha=0.5)
        ax.plot(
            X_plot, y_plot_poly, label="KR (poly kernel, d={})".format(d), alpha=0.5
        )
        ax.plot(X_plot, y_plot_rbf, label="KR (rbf kernel)", alpha=0.5)
        ax.legend()

        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])

    plt.tight_layout()
    plt.savefig("img/kr_plots.png", dpi=300)
    plt.close("all")


def plot_knn():
    """
    Compare OLS against k-nearest-neighbour regression (k = 1, 5, 10) on 16
    random 1-D regression problems.

    For each panel of a 4 x 4 grid a random problem is generated, the four
    models are fit on the training split, and the test points and the fitted
    curves are drawn. The figure is saved to ``img/knn_plots.png``.
    """

    def mse(y_true, y_hat):
        # mean squared error between two arrays, ignoring their shapes
        return np.mean((y_true.flatten() - y_hat.flatten()) ** 2)

    np.random.seed(12345)
    fig, axes = plt.subplots(4, 4)

    for i, ax in enumerate(axes.flatten()):
        n_in, n_out = 1, 1
        d = np.random.randint(1, 5)
        n_ex = np.random.randint(5, 500)
        std = np.random.randint(0, 1000)
        intercept = np.random.rand() * np.random.randint(-300, 300)
        X_train, y_train, X_test, y_test, coefs = random_regression_problem(
            n_ex, n_in, n_out, d=d, intercept=intercept, std=std, seed=i
        )

        # ordinary least squares baseline
        LR = LinearRegression(fit_intercept=True)
        LR.fit(X_train, y_train)
        loss = mse(y_test, LR.predict(X_test))

        # fit one KNN regressor per neighbourhood size, in increasing k
        neighbor_counts = (1, 5, 10)
        knn_models, knn_losses = [], []
        for k in neighbor_counts:
            model = KNN(k=k, classifier=False, leaf_size=10, weights="uniform")
            model.fit(X_train, y_train)
            knn_losses.append(mse(y_test, model.predict(X_test)))
            knn_models.append(model)

        # evaluate every model on a grid extending 10% beyond the test range
        lo, hi = min(X_test), max(X_test)
        margin = 0.1 * (hi - lo)
        X_plot = np.linspace(lo - margin, hi + margin, 100)
        y_plot = LR.predict(X_plot)
        knn_curves = [model.predict(X_plot) for model in knn_models]

        ax.scatter(X_test, y_test, alpha=0.5)
        ax.plot(X_plot, y_plot, label="OLS", alpha=0.5)
        for k, curve in zip(neighbor_counts, knn_curves):
            ax.plot(X_plot, curve, label="KNN (k={})".format(k), alpha=0.5)
        ax.legend()

        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])

    plt.tight_layout()
    plt.savefig("img/knn_plots.png", dpi=300)
    plt.close("all")


def plot_gp():
    """
    Show how the `alpha` argument of `GPRegression` changes the fit to five
    observations of ``sin(x)``.

    One panel is drawn per value in ``[0, 1e-10, 1e-5, 1]``. Each panel shows
    the observations, the true function, the GP prediction, and a shaded band
    of ``prediction +/- conf`` (the second value returned by `predict`). The
    figure is saved to ``img/gp_alpha.png``.
    """
    np.random.seed(12345)
    sns.set_context("paper", font_scale=0.65)

    X_train = np.array([-3, 0, 7, 1, -9])
    y_train = np.sin(X_train)
    X_test = np.linspace(-10, 10, 100)

    fig, axes = plt.subplots(2, 2)
    for ax, alpha in zip(axes.flatten(), [0, 1e-10, 1e-5, 1]):
        gp = GPRegression(kernel="RBFKernel", alpha=alpha)
        gp.fit(X_train, y_train)
        y_pred, conf = gp.predict(X_test)
        upper, lower = y_pred + conf, y_pred - conf

        ax.plot(X_train, y_train, "rx", label="observed")
        ax.plot(X_test, np.sin(X_test), label="true fn")
        ax.plot(X_test, y_pred, "--", label="MAP (alpha={})".format(alpha))
        ax.fill_between(X_test, upper, lower, alpha=0.1)
        ax.set_xticks([])
        ax.set_yticks([])
        sns.despine()

        ax.legend()

    plt.tight_layout()
    plt.savefig("img/gp_alpha.png", dpi=300)
    plt.close("all")


def plot_gp_dist():
    """
    Draw a three-panel summary of a GP regression fit to five observations of
    ``sin(x)``.

    The panels show, from left to right: three samples requested with the
    ``"prior"`` mode, three samples requested with the
    ``"posterior_predictive"`` mode (with the observations overlaid), and the
    prediction with a shaded ``+/- conf`` band next to the true function. The
    figure is saved to ``img/gp_dist.png``.
    """
    np.random.seed(12345)
    sns.set_context("paper", font_scale=0.95)

    X_train = np.array([-3, 0, 7, 1, -9])
    y_train = np.sin(X_train)
    X_test = np.linspace(-10, 10, 100)

    fig, axes = plt.subplots(1, 3)
    prior_ax, post_ax, mean_ax = axes[0], axes[1], axes[2]

    gp = GPRegression(kernel="RBFKernel", alpha=0)
    gp.fit(X_train, y_train)

    # Draw the prior samples before the posterior ones so the RNG stream is
    # consumed in a fixed order.
    prior_draws = gp.sample(X_test, 3, "prior")
    posterior_draws = gp.sample(X_test, 3, "posterior_predictive")

    # panel 1: samples from the prior
    for draw in prior_draws:
        prior_ax.plot(X_test, draw.ravel(), lw=1)
    prior_ax.set_title("Prior samples")
    prior_ax.set_xticks([])
    prior_ax.set_yticks([])

    # panel 2: samples from the posterior predictive, plus the observations
    for draw in posterior_draws:
        post_ax.plot(X_test, draw.ravel(), lw=1)
    post_ax.plot(X_train, y_train, "ko", ms=1.2)
    post_ax.set_title("Posterior samples")
    post_ax.set_xticks([])
    post_ax.set_yticks([])

    # panel 3: prediction and confidence band against the true function
    y_pred, conf = gp.predict(X_test)
    upper, lower = y_pred + conf, y_pred - conf

    mean_ax.plot(X_test, np.sin(X_test), lw=1, label="true function")
    mean_ax.plot(X_test, y_pred, lw=1, label="MAP estimate")
    mean_ax.fill_between(X_test, upper, lower, alpha=0.1)
    mean_ax.plot(X_train, y_train, "ko", ms=1.2, label="observed")
    mean_ax.legend(fontsize="x-small")
    mean_ax.set_title("Posterior mean")
    mean_ax.set_xticks([])
    mean_ax.set_yticks([])

    fig.set_size_inches(6, 2)
    plt.tight_layout()
    plt.savefig("img/gp_dist.png", dpi=300)
    plt.close("all")


================================================
FILE: numpy_ml/plots/rl_plots.py
================================================
# flake8: noqa
import gym

from numpy_ml.rl_models.trainer import Trainer
from numpy_ml.rl_models.agents import (
    CrossEntropyAgent,
    MonteCarloAgent,
    TemporalDifferenceAgent,
    DynaAgent,
)


def test_cross_entropy_agent():
    """
    Run a `CrossEntropyAgent` through `Trainer.train` on the
    ``LunarLander-v2`` gym environment with plotting enabled.
    """
    env = gym.make("LunarLander-v2")

    # agent hyperparameters
    n_samples_per_episode = 500
    retain_prcnt = 0.2
    agent = CrossEntropyAgent(env, n_samples_per_episode, retain_prcnt)

    # training schedule
    n_episodes, max_steps = 50, 300
    train_opts = dict(seed=12345, plot=True, verbose=True, render_every=None)
    Trainer(agent, env).train(n_episodes, max_steps, **train_opts)


def test_monte_carlo_agent():
    """
    Run an off-policy `MonteCarloAgent` through `Trainer.train` on the
    ``Copy-v0`` gym environment with plotting enabled.
    """
    env = gym.make("Copy-v0")

    # agent hyperparameters (passed positionally, in this order)
    off_policy = True
    temporal_discount = 0.95
    epsilon = 0.05
    agent = MonteCarloAgent(env, off_policy, temporal_discount, epsilon)

    # training schedule
    n_episodes, max_steps = 10000, 300
    train_opts = dict(
        seed=12345,
        plot=True,
        verbose=True,
        render_every=None,
        smooth_factor=0.001,
    )
    Trainer(agent, env).train(n_episodes, max_steps, **train_opts)


def test_temporal_difference_agent():
    """
    Run an off-policy `TemporalDifferenceAgent` through `Trainer.train` on
    the ``LunarLander-v2`` gym environment with plotting enabled.
    """
    env = gym.make("LunarLander-v2")

    agent_opts = dict(
        lr=0.4,
        obs_max=1,
        obs_min=-1,
        epsilon=0.10,
        n_tilings=10,
        grid_dims=[100, 100],
        off_policy=True,
        temporal_discount=0.999,
    )
    agent = TemporalDifferenceAgent(env, **agent_opts)

    # training schedule
    n_episodes, max_steps = 5000, 200
    train_opts = dict(
        seed=12345,
        plot=True,
        verbose=True,
        render_every=None,
        smooth_factor=0.005,
    )
    Trainer(agent, env).train(n_episodes, max_steps, **train_opts)


def test_dyna_agent():
    """
    Run a `DynaAgent` (with ``q_plus`` disabled) through `Trainer.train` on
    the ``Taxi-v2`` gym environment with plotting enabled.
    """
    env = gym.make("Taxi-v2")

    agent_opts = dict(
        lr=0.4,
        q_plus=False,
        obs_max=1,
        obs_min=-1,
        epsilon=0.10,
        n_tilings=10,
        grid_dims=[10, 10],
        explore_weight=0.05,
        temporal_discount=0.99,
        n_simulated_actions=25,
    )
    agent = DynaAgent(env, **agent_opts)

    # training schedule
    n_episodes, max_steps = 150, 200
    train_opts = dict(
        seed=12345,
        plot=True,
        verbose=True,
        render_every=None,
        smooth_factor=0.01,
    )
    Trainer(agent, env).train(n_episodes, max_steps, **train_opts)


================================================
FILE: numpy_ml/plots/trees_plots.py
================================================
# flake8: noqa
import numpy as np

from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.datasets import make_blobs, make_regression
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

# Module-level seaborn styling. These calls run as soon as the module is loaded.
# References for the two calls below:
# https://seaborn.pydata.org/generated/seaborn.set_context.html
# https://seaborn.pydata.org/generated/seaborn.set_style.html
import seaborn as sns

sns.set_style("white")
sns.set_context("paper", font_scale=0.9)

from numpy_ml.trees import GradientBoostedDecisionTree, DecisionTree, RandomForest


def plot():
    """
    Compare a random forest, a single decision tree, and a gradient boosted
    decision tree on 16 random problems.

    Each panel of a 4 x 4 grid is randomly assigned a classification problem
    (2-D blobs) or a 1-D regression problem. All three models are fit on a
    70% training split. Classification panels show the test points grouped by
    the predictions of one randomly chosen model, with its accuracy in the
    title. Regression panels show the test points, all three fitted curves,
    and the three test MSEs in the title. The figure is saved to ``plot.png``
    in the current working directory.
    """
    fig, axes = plt.subplots(4, 4)
    fig.set_size_inches(10, 10)
    for ax in axes.flatten():
        n_ex = 100
        n_trees = 50
        n_feats = np.random.randint(2, 100)
        max_depth_d = np.random.randint(1, 100)
        max_depth_r = np.random.randint(1, 10)

        classifier = np.random.choice([True, False])
        if classifier:
            # create classification problem
            n_classes = np.random.randint(2, 10)
            X, Y = make_blobs(n_samples=n_ex, centers=n_classes, n_features=2)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)
            n_feats = min(n_feats, X.shape[1])

            loss = accuracy_score
            criterion = np.random.choice(["entropy", "gini"])
            gb_loss, gb_step_size = "crossentropy", "constant"
        else:
            # create regression problem
            X, Y = make_regression(n_samples=n_ex, n_features=1)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)
            n_feats = min(n_feats, X.shape[1])

            loss = mean_squared_error
            criterion = "mse"
            gb_loss, gb_step_size = "mse", "adaptive"

        # initialize models; only the boosting loss and step size depend on
        # the problem type
        mine = RandomForest(
            classifier=classifier,
            n_feats=n_feats,
            n_trees=n_trees,
            criterion=criterion,
            max_depth=max_depth_r,
        )
        mine_d = DecisionTree(
            criterion=criterion, max_depth=max_depth_d, classifier=classifier
        )
        mine_g = GradientBoostedDecisionTree(
            n_trees=n_trees,
            max_depth=max_depth_d,
            classifier=classifier,
            learning_rate=1,
            loss=gb_loss,
            step_size=gb_step_size,
            split_criterion=criterion,
        )

        # fit 'em
        mine.fit(X, Y)
        mine_d.fit(X, Y)
        mine_g.fit(X, Y)

        # get preds on test set
        y_pred_mine_test = mine.predict(X_test)
        y_pred_mine_test_d = mine_d.predict(X_test)
        y_pred_mine_test_g = mine_g.predict(X_test)

        loss_mine_test = loss(y_pred_mine_test, Y_test)
        loss_mine_test_d = loss(y_pred_mine_test_d, Y_test)
        loss_mine_test_g = loss(y_pred_mine_test_g, Y_test)

        if classifier:
            # show the predictions of a single, randomly selected model
            entries = [
                ("RF", loss_mine_test, y_pred_mine_test),
                ("DT", loss_mine_test_d, y_pred_mine_test_d),
                ("GB", loss_mine_test_g, y_pred_mine_test_g),
            ]
            (lbl, test_loss, preds) = entries[np.random.randint(3)]
            ax.set_title("{} Accuracy: {:.2f}%".format(lbl, test_loss * 100))
            for i in np.unique(Y_test):
                ax.scatter(
                    X_test[preds == i, 0].flatten(),
                    X_test[preds == i, 1].flatten(),
                )
        else:
            # re-predict on a dense grid so the fitted curves are smooth
            X_ax = np.linspace(
                np.min(X_test.flatten()) - 1, np.max(X_test.flatten()) + 1, 100
            ).reshape(-1, 1)
            y_pred_mine_test = mine.predict(X_ax)
            y_pred_mine_test_d = mine_d.predict(X_ax)
            y_pred_mine_test_g = mine_g.predict(X_ax)

            ax.scatter(X_test.flatten(), Y_test.flatten(), c="b", alpha=0.5)
            ax.plot(
                X_ax.flatten(),
                y_pred_mine_test_g.flatten(),
                label="GB",
                color="red",
            )
            ax.plot(
                X_ax.flatten(),
                y_pred_mine_test.flatten(),
                label="RF",
                color="cornflowerblue",
            )
            ax.plot(
                X_ax.flatten(),
                y_pred_mine_test_d.flatten(),
                label="DT",
                color="yellowgreen",
            )
            ax.set_title(
                "GB: {:.1f} / RF: {:.1f} / DT: {:.1f} ".format(
                    loss_mine_test_g, loss_mine_test, loss_mine_test_d
                )
            )
            ax.legend()
        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])
    plt.savefig("plot.png", dpi=300)
    plt.close("all")


================================================
FILE: numpy_ml/preprocessing/README.md
================================================
# Preprocessing
The preprocessing module implements common data preprocessing routines.

- `nlp.py`: Routines and objects for handling text data.
    - n-gram generators
    - Word and character tokenization
    - Punctuation and stop-word removal
    - Vocabulary / unigram count objects
    - Byte-pair encoding ([Gage, 1994](http://www.pennelynn.com/Documents/CUJ/HTML/94HTML/19940045.HTM); [Sennrich, Haddow, & Birch, 2015](https://arxiv.org/pdf/1508.07909.pdf))
    - [Huffman tree](https://en.wikipedia.org/wiki/Huffman_coding) encoding / decoding
    - Term frequency-inverse document frequency ([tf-idf](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)) encoding

- `dsp.py`: Routines for handling audio and image data.
    - Signal windowing
    - Signal autocorrelation
    - Discrete Fourier transform
    - Discrete cosine transform (type II)
    - Signal resampling via (bi-)linear interpolation and nearest neighbor
    - Mel-frequency cepstral coefficients (MFCCs) ([Mermelstein, 1976](https://files.eric.ed.gov/fulltext/ED128870.pdf#page=93); [Davis & Mermelstein, 1980](https://pdfs.semanticscholar.org/24b8/7a58511919cc867a71f0b58328694dd494b3.pdf))

- `general.py`: General data preprocessing objects and functions.
    - Feature hashing ([Moody, 1989](http://papers.nips.cc/paper/175-fast-learning-in-multi-resolution-hierarchies.pdf))
    - Mini-batch generators
    - One-hot encoding / decoding
    - Feature standardization


================================================
FILE: numpy_ml/preprocessing/__init__.py
================================================
from . import general
from . import nlp
from . import dsp


================================================
FILE: numpy_ml/preprocessing/dsp.py
================================================
import numpy as np
from numpy.lib.stride_tricks import as_strided

from ..utils.windows import WindowInitializer

#######################################################################
#                          Signal Resampling                          #
#######################################################################


def batch_resample(X, new_dim, mode="bilinear"):
    """
    Resample each image (or similar grid-based 2D signal) in a batch to
    `new_dim` using the specified resampling strategy.

    Parameters
    ----------
    X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_channels)`
        An input image volume
    new_dim : 2-tuple of `(out_rows, out_cols)`
        The dimension to resample each image to
    mode : {'bilinear', 'neighbor'}
        The resampling strategy to employ. Default is 'bilinear'.

    Returns
    -------
    resampled : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, out_rows, out_cols, in_channels)`
        The resampled image volume.

    Raises
    ------
    NotImplementedError
        If `mode` is not one of the supported resampling strategies.
    """
    if mode not in ("bilinear", "neighbor"):
        raise NotImplementedError("Unrecognized resampling mode: {}".format(mode))

    out_rows, out_cols = new_dim
    n_ex, in_rows, in_cols, n_in = X.shape

    # an empty batch has nothing to interpolate (and nothing to stack)
    if n_ex == 0:
        return np.empty((0, out_rows, out_cols, n_in), dtype=X.dtype)

    interpolate = bilinear_interpolate if mode == "bilinear" else nn_interpolate_2D

    # compute coordinates to resample
    # NOTE(review): the grid ends at index `in_cols - 2` / `in_rows - 2`, so
    # the last input row and column are never sampled. Confirm whether that
    # is intentional before widening the range.
    x = np.tile(np.linspace(0, in_cols - 2, out_cols), out_rows)
    y = np.repeat(np.linspace(0, in_rows - 2, out_rows), out_cols)

    # resample each image
    resampled = [
        interpolate(X[i, ...], x, y).reshape(out_rows, out_cols, n_in)
        for i in range(n_ex)
    ]

    # stack along a new leading axis so the batch dimension is preserved;
    # `np.dstack` would instead fold the examples into the channel axis
    return np.stack(resampled, axis=0)


def nn_interpolate_2D(X, x, y):
    """
    Estimate the pixel values at the coordinates (x, y) in `X` by copying the
    value of the nearest grid point.

    Notes
    -----
    Assumes the current entries in `X` reflect equally-spaced samples from a 2D
    integer grid. Coordinates are rounded with :func:`numpy.around` and then
    clipped to the valid index range, so points outside the grid take the
    value of the closest edge pixel.

    Parameters
    ----------
    X : :py:class:`ndarray <numpy.ndarray>` of shape `(in_rows, in_cols, in_channels)`
        An input image sampled along a grid of `in_rows` by `in_cols`.
    x : list of length `k`
        A list of x-coordinates (column positions) for the samples we wish to
        generate
    y : list of length `k`
        A list of y-coordinates (row positions) for the samples we wish to
        generate

    Returns
    -------
    samples : :py:class:`ndarray <numpy.ndarray>` of shape `(k, in_channels)`
        The samples for each (x,y) coordinate computed via nearest neighbor
        interpolation
    """
    n_rows, n_cols = X.shape[0], X.shape[1]
    col_ix = np.clip(np.around(x), 0, n_cols - 1).astype(int)
    row_ix = np.clip(np.around(y), 0, n_rows - 1).astype(int)
    return X[row_ix, col_ix, :]


def nn_interpolate_1D(X, t):
    """
    Estimate the signal values at the positions `t` in `X` by copying the
    value of the nearest sample.

    Notes
    -----
    Positions are rounded with :func:`numpy.around` and then clipped to the
    valid index range, so positions outside the signal take the value of the
    first or last sample.

    Parameters
    ----------
    X : :py:class:`ndarray <numpy.ndarray>` of shape `(in_length, in_channels)`
        An input signal sampled along an integer grid of length `in_length`
    t : list of length `k`
        A list of coordinates for the samples we wish to generate

    Returns
    -------
    samples : :py:class:`ndarray <numpy.ndarray>` of shape `(k, in_channels)`
        The samples for each coordinate in `t` computed via nearest neighbor
        interpolation
    """
    last_ix = X.shape[0] - 1
    nearest = np.around(t)
    nearest = np.clip(nearest, 0, last_ix).astype(int)
    return X[nearest, :]


def bilinear_interpolate(X, x, y):
    """
    Estimates of the pixel values at the coordinates (x, y) in `X` via bilinear
    interpolation.

    Notes
    -----
    Assumes the current entries in X reflect equally-spaced
    samples from a 2D integer grid.

    The interpolation weights are computed from the *unclipped* neighbouring
    grid coordinates, and only the array indices are clipped to the image
    bounds. The four weights therefore always sum to 1: a query on the last
    row / column returns that pixel exactly, and a query outside the grid
    returns the value of the nearest edge.

    Modified from https://bit.ly/2NMb1Dr

    Parameters
    ----------
    X : :py:class:`ndarray <numpy.ndarray>` of shape `(in_rows, in_cols, in_channels)`
        An input image sampled along a grid of `in_rows` by `in_cols`.
    x : list of length `k`
        A list of x-coordinates for the samples we wish to generate
    y : list of length `k`
        A list of y-coordinates for the samples we wish to generate

    Returns
    -------
    samples : :py:class:`ndarray <numpy.ndarray>` of shape `(k, in_channels)`
        The samples for each (x,y) coordinate computed via bilinear
        interpolation
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    # integer grid coordinates of the cell surrounding each query point
    x0 = np.floor(x).astype(int)
    y0 = np.floor(y).astype(int)
    x1 = x0 + 1
    y1 = y0 + 1

    # Weights must use the unclipped corners. After clipping, x0 == x1 (or
    # y0 == y1) at the border, which would zero every weight.
    wa = (x1 - x) * (y1 - y)
    wb = (x1 - x) * (y - y0)
    wc = (x - x0) * (y1 - y)
    wd = (x - x0) * (y - y0)

    # clip only the indices used to read pixel values
    max_col, max_row = X.shape[1] - 1, X.shape[0] - 1
    x0 = np.clip(x0, 0, max_col)
    x1 = np.clip(x1, 0, max_col)
    y0 = np.clip(y0, 0, max_row)
    y1 = np.clip(y1, 0, max_row)

    # transpose to (in_channels, k) so the length-k weights broadcast
    Ia = X[y0, x0, :].T
    Ib = X[y1, x0, :].T
    Ic = X[y0, x1, :].T
    Id = X[y1, x1, :].T

    return (Ia * wa).T + (Ib * wb).T + (Ic * wc).T + (Id * wd).T


#######################################################################
#                        Fourier Decomposition                        #
#######################################################################


def DCT(frame, orthonormal=True):
    r"""
    A naive :math:`O(N^2)` implementation of the 1D discrete cosine transform-II
    (DCT-II).

    Notes
    -----
    For a signal :math:`\mathbf{x} = [x_1, \ldots, x_N]` consisting of `N`
    samples, the `k` th DCT coefficient, :math:`c_k`, is

    .. math::

        c_k = 2 \sum_{n=0}^{N-1} x_n \cos(\pi k (2 n + 1) / (2 N))

    where `k` ranges from :math:`0, \ldots, N-1`.

    The DCT is highly similar to the DFT -- whereas in a DFT the basis
    functions are sinusoids, in a DCT they are restricted solely to cosines. A
    signal's DCT representation tends to have more of its energy concentrated
    in a smaller number of coefficients when compared to the DFT, and is thus
    commonly used for signal compression. [1]

    .. [1] Smoother signals can be accurately approximated using fewer DFT / DCT
       coefficients, resulting in a higher compression ratio. The DCT naturally
       yields a continuous extension at the signal boundaries due to its use of
       even basis functions (cosine). This in turn produces a smoother
       extension in comparison to DFT or DST approximations, resulting in a
       higher compression.

    Parameters
    ----------
    frame : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        A signal frame consisting of N samples
    orthonormal : bool
        Scale to ensure the coefficient vector is orthonormal. Default is True.

    Returns
    -------
    dct : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        The discrete cosine transform of the samples in `frame`. Integer and
        boolean inputs are promoted to float; floating-point and complex
        inputs keep their dtype.
    """
    frame = np.asarray(frame)
    # The coefficients are real-valued even for integer samples. Accumulating
    # them in an integer buffer would silently truncate each one.
    if not np.issubdtype(frame.dtype, np.inexact):
        frame = frame.astype(float)

    N = len(frame)
    out = np.zeros_like(frame)
    for k in range(N):
        for (n, xn) in enumerate(frame):
            out[k] += xn * np.cos(np.pi * k * (2 * n + 1) / (2 * N))
        # the k = 0 basis vector has a different norm than the others
        scale = np.sqrt(1 / (4 * N)) if k == 0 else np.sqrt(1 / (2 * N))
        out[k] *= 2 * scale if orthonormal else 2
    return out


def __DCT2(frame):
    r"""
    Vectorized, unnormalized 1D discrete cosine transform-II (DCT-II).

    Notes
    -----
    Builds the full :math:`N \times N` cosine basis and applies it with a
    single matrix-vector product. The `k` th coefficient is

    .. math::

        c_k = 2 \sum_{n=0}^{N-1} x_n \cos(\pi k (2 n + 1) / (2 N))

    Parameters
    ----------
    frame : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        A signal frame consisting of N samples

    Returns
    -------
    dct : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        The unnormalized DCT-II coefficients of the samples in `frame`.
    """
    frame = np.asarray(frame)
    N = len(frame)  # window length

    k = np.arange(N, dtype=float).reshape(-1, 1)  # coefficient index (rows)
    n = np.arange(N, dtype=float).reshape(1, -1)  # sample index (columns)

    # basis[k, n] = cos(pi * k * (2n + 1) / (2N)); the phase term k / (2N)
    # must be present for every entry, including the n = 0 column
    basis = np.cos(np.pi * k * (2 * n + 1) / (2 * N))
    return 2 * (basis @ frame)


def DFT(frame, positive_only=True):
    r"""
    A naive :math:`O(N^2)` implementation of the 1D discrete Fourier transform (DFT).

    Notes
    -----
    The Fourier transform decomposes a signal into a linear combination of
    sinusoids (i.e., basis elements in the space of continuous periodic
    functions). For a sequence :math:`\mathbf{x} = [x_0, \ldots, x_{N-1}]` of
    `N` evenly spaced samples, the `k` th DFT coefficient is given by:

    .. math::

        c_k = \sum_{n=0}^{N-1} x_n \exp(-2 \pi i k n / N)

    where `i` is the imaginary unit, `k` is an index ranging from `0, ..., N-1`,
    and :math:`c_k` is the complex coefficient associated with the `k` th
    sinusoid in the DFT spectrum (its modulus is the amplitude and its
    argument is the phase of that sinusoid). The frequency of the `k` th
    sinusoid is :math:`(k 2 \pi / N)` radians per sample.

    When applied to a real-valued input, the negative frequency terms are the
    complex conjugates of the positive-frequency terms and the overall spectrum
    is symmetric (excluding the first index, which contains the zero-frequency
    / intercept term).

    Parameters
    ----------
    frame : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        A signal frame consisting of N samples
    positive_only : bool
        Whether to only return the coefficients for the positive frequency
        terms. Default is True.

    Returns
    -------
    spectrum : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)` or `(N // 2 + 1,)` if `positive_only`
        The coefficients of the frequency spectrum for `frame`, including
        imaginary components.
    """
    frame = np.asarray(frame)
    N = len(frame)  # window length

    # F[k, n] = exp(-2 pi i k n / N): coefficient for basis vector k, timestep n
    F = np.arange(N).reshape(-1, 1) * np.arange(N).reshape(1, -1)
    F = np.exp(F * (-1j * 2 * np.pi / N))

    # A plain matrix-vector product is required here. `np.vdot` conjugates its
    # first argument, which would flip the sign of the exponent and return
    # the conjugate of the spectrum defined above.
    spectrum = F @ frame
    return spectrum[: (N // 2) + 1] if positive_only else spectrum


def dft_bins(N, fs=44000, positive_only=True):
    """
    Calc the frequency bin centers for a DFT with `N` coefficients.

    Notes
    -----
    The `k` th DFT coefficient corresponds to a frequency of ``k * fs / N``
    Hz. When `N` is odd the spectrum contains no bin at exactly ``fs / 2``;
    the largest positive frequency is ``(N - 1) / 2 * fs / N``.

    Parameters
    ----------
    N : int
        The number of frequency bins in the DFT
    fs : int
        The sample rate/frequency of the signal (in Hz). Default is 44000.
    positive_only : bool
        Whether to only return the bins for the positive frequency
        terms. Default is True.

    Returns
    -------
    bins : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)` or `(N // 2 + 1,)` if `positive_only`
        The frequency bin centers associated with each coefficient in the
        DFT spectrum. If `positive_only` is False, the non-negative
        frequencies come first, followed by the negative frequencies in
        increasing order.
    """
    if positive_only:
        # bins k = 0, ..., N // 2; spacing is always fs / N, so the final
        # bin only lands on fs / 2 when N is even
        freq_bins = np.arange(1 + N // 2) * fs / N
    else:
        # non-negative bins 0, ..., ceil(N / 2) - 1 followed by the negative
        # bins -(N // 2), ..., -1
        n_nonneg = (N + 1) // 2
        freq_bins = np.r_[np.arange(n_nonneg), np.arange(-(N // 2), 0)] * fs / N
    return freq_bins


def magnitude_spectrum(frames):
    """
    Compute the magnitude spectrum of every frame in `frames`, i.e., the
    absolute value of each frame's DFT coefficients. Frames are assumed to be
    real-valued.

    Parameters
    ----------
    frames : :py:class:`ndarray <numpy.ndarray>` of shape `(M, N)`
        A sequence of `M` frames each consisting of `N` samples

    Returns
    -------
    magnitude_spec : :py:class:`ndarray <numpy.ndarray>` of shape `(M, N // 2 + 1)`
        The magnitude spectrum for each frame in `frames`. Only the
        coefficients for the positive spectrum frequencies are included.
    """
    magnitudes = []
    for frame in frames:
        # keep only the positive-frequency half of the spectrum
        coefs = DFT(frame, positive_only=True)
        magnitudes.append(np.abs(coefs))
    return np.vstack(magnitudes)


def power_spectrum(frames, scale=False):
    """
    Compute the power spectrum of a signal that has been split into frames.
    Frames are assumed to be real-valued.

    The power spectrum is the squared magnitude spectrum, optionally divided
    by the number of (positive-frequency) DFT bins. It describes how the
    energy of the signal is distributed over the frequency domain.

    Parameters
    ----------
    frames : :py:class:`ndarray <numpy.ndarray>` of shape `(M, N)`
        A sequence of `M` frames each consisting of `N` samples
    scale : bool
        Whether to divide the result by the number of DFT bins,
        ``N // 2 + 1``. Default is False.

    Returns
    -------
    power_spec : :py:class:`ndarray <numpy.ndarray>` of shape `(M, N // 2 + 1)`
        The power spectrum for each frame in `frames`. Only the coefficients
        for the positive spectrum frequencies are included.
    """
    if scale:
        n_bins = frames.shape[1] // 2 + 1
    else:
        n_bins = 1

    magnitudes = magnitude_spectrum(frames)
    return (1 / n_bins) * magnitudes ** 2


#######################################################################
#                       Preprocessing Utils                           #
#######################################################################


def to_frames(x, frame_width, stride, writeable=False):
    """
    Convert a 1D signal x into overlapping windows of width `frame_width` using
    a hop length of `stride`.

    Notes
    -----
    If ``(len(x) - frame_width) % stride != 0`` then some number of the samples
    in x will be dropped. Specifically::

        n_dropped_samples = len(x) - frame_width - stride * (n_frames - 1)

    where::

        n_frames = (len(x) - frame_width) // stride + 1

    This method uses low-level stride manipulation to avoid creating an
    additional copy of `x`: the returned frames are overlapping views onto
    the memory of `x`. The downside is that if ``writeable=True``, modifying
    the `frame` output can result in unexpected behavior:

        >>> out = to_frames(np.arange(6), 5, 1, writeable=True)
        >>> out
        array([[0, 1, 2, 3, 4],
               [1, 2, 3, 4, 5]])
        >>> out[0, 1] = 99
        >>> out
        array([[ 0, 99,  2,  3,  4],
               [99,  2,  3,  4,  5]])

    Parameters
    ----------
    x : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        A 1D signal consisting of N samples
    frame_width : int
        The width of a single frame window in samples
    stride : int
        The hop size / number of samples advanced between consecutive frames
    writeable : bool
        If set to False, the returned array will be readonly. Otherwise it will
        be writable if `x` was. It is advisable to set this to False whenever
        possible to avoid the aliasing behavior shown in the Notes. Default is
        False.

    Returns
    -------
    frame: :py:class:`ndarray <numpy.ndarray>` of shape `(n_frames, frame_width)`
        The collection of overlapping frames stacked into a matrix
    """
    assert x.ndim == 1
    assert stride >= 1
    assert len(x) >= frame_width

    # number of bytes between consecutive samples of `x`. Use the array's
    # actual stride rather than its itemsize so that non-contiguous inputs
    # (e.g., `signal[::2]`) are framed correctly.
    sample_step = x.strides[0]
    n_frames = (len(x) - frame_width) // stride + 1
    return as_strided(
        x,
        shape=(n_frames, frame_width),
        strides=(sample_step * stride, sample_step),
        writeable=writeable,
    )


def autocorrelate1D(x):
    r"""
    Compute the autocorrelation of a 1D signal `x`.

    Notes
    -----
    The `k` th term in the 1 dimensional autocorrelation is

    .. math::

        a_k = \sum_n x_{n + k} x_n

    where the sum runs over all `n` for which both samples exist.

    NB. This is a naive :math:`O(N^2)` implementation.  For a faster
    :math:`O(N \log N)` approach using the FFT, see [1].

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Autocorrelation#Efficient_computation

    Parameters
    ----------
    x : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        A 1D signal consisting of N samples

    Returns
    -------
    auto : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        The autocorrelation of `x` with itself
    """
    n_samples = len(x)
    auto = np.zeros(n_samples)
    for lag in range(n_samples):
        # pair every sample with the one `lag` steps ahead of it; zip stops
        # once the shifted copy runs out of samples
        for ahead, current in zip(x[lag:], x):
            auto[lag] += ahead * current
    return auto


#######################################################################
#                               Filters                               #
#######################################################################


def preemphasis(x, alpha):
    r"""
    Boost the amplitude of the high frequency bands of a signal while
    attenuating the lower bands.

    Notes
    -----
    Preemphasis filtering is (was?) a common transform in speech processing,
    where higher frequencies tend to be more useful during signal
    disambiguation.

    .. math::

        \text{preemphasis}( x_t ) = x_t - \alpha x_{t-1}

    The first sample has no predecessor and is passed through unchanged.

    Parameters
    ----------
    x : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        A 1D signal consisting of `N` samples
    alpha : float in [0, 1)
        The preemphasis coefficient. A value of 0 corresponds to no
        filtering

    Returns
    -------
    out : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        The filtered signal
    """
    first_sample = x[:1]
    filtered_rest = x[1:] - alpha * x[:-1]
    return np.concatenate((first_sample, filtered_rest))


def cepstral_lifter(mfccs, D):
    r"""
    A simple sinusoidal filter applied in the Mel-frequency domain.

    Notes
    -----
    Cepstral lifting reweights each cepstral coefficient by a sinusoid of its
    index. The filter function applied to the coefficient at column `n` is:

    .. math::

        \text{lifter}( x_n ) = x_n \left(1 + \frac{D \sin(\pi n / D)}{2}\right)

    Parameters
    ----------
    mfccs : :py:class:`ndarray <numpy.ndarray>` of shape `(G, C)`
        Matrix of Mel cepstral coefficients. Rows correspond to frames, columns
        to cepstral coefficients
    D : int in :math:`[0, +\infty]`
        The filter coefficient. 0 corresponds to no filtering (the input is
        returned as-is), larger values correspond to greater amounts of
        smoothing

    Returns
    -------
    out : :py:class:`ndarray <numpy.ndarray>` of shape `(G, C)`
        The lifter'd MFCC coefficients
    """
    if D != 0:
        # one weight per cepstral coefficient (column), shared by all frames
        coef_index = np.arange(mfccs.shape[1])
        weights = 1 + (D / 2) * np.sin(np.pi * coef_index / D)
        return mfccs * weights
    return mfccs


def mel_spectrogram(
    x,
    window_duration=0.025,
    stride_duration=0.01,
    mean_normalize=True,
    window="hamming",
    n_filters=20,
    center=True,
    alpha=0.95,
    fs=44000,
):
    """
    Apply the Mel-filterbank to the power spectrum for a signal `x`.

    Notes
    -----
    The Mel spectrogram is the projection of the power spectrum of the framed
    and windowed signal onto the basis set provided by the Mel filterbank.

    Parameters
    ----------
    x : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        A 1D signal consisting of N samples
    window_duration : float
        The duration of each frame / window (in seconds). Default is 0.025.
    stride_duration : float
        The duration of the hop between consecutive windows (in seconds).
        Default is 0.01.
    mean_normalize : bool
        Whether to subtract the coefficient means from the final filter values
        to improve the signal-to-noise ratio. Default is True.
    window : {'hamming', 'hann', 'blackman_harris'}
        The windowing function to apply to the signal before FFT. Default is
        'hamming'.
    n_filters : int
        The number of mel filters to include in the filterbank. Default is 20.
    center : bool
        Whether the `k` th frame of the signal should *begin* at index ``x[k *
        stride_len]`` (center = False) or be *centered* at ``x[k * stride_len]``
        (center = True). Centering is implemented by reflect-padding the
        signal with half a frame on each side. Default is True.
    alpha : float in [0, 1)
        The coefficient for the preemphasis filter. A value of 0 corresponds to
        no filtering. Default is 0.95.
    fs : int
        The sample rate/frequency for the signal. Default is 44000.

    Returns
    -------
    filter_energies : :py:class:`ndarray <numpy.ndarray>` of shape `(G, n_filters)`
        The (possibly mean_normalized) power for each filter in the Mel
        filterbank (i.e., the Mel spectrogram). Rows correspond to frames,
        columns to filters
    energy_per_frame : :py:class:`ndarray <numpy.ndarray>` of shape `(G,)`
        The total energy in each frame of the signal
    """
    eps = np.finfo(float).eps
    window_fn = WindowInitializer()(window)

    # convert the durations (in seconds) into sample counts
    stride = round(stride_duration * fs)
    frame_width = round(window_duration * fs)
    N = frame_width

    # add a preemphasis filter to the raw signal
    x = preemphasis(x, alpha)

    # convert signal to overlapping frames and apply a window function
    x = np.pad(x, N // 2, "reflect") if center else x

    # NB: `fs` must not be forwarded here -- the fourth positional parameter
    # of `to_frames` is `writeable`, and we want the default read-only view
    frames = to_frames(x, frame_width, stride)

    # broadcasting applies the same window to every frame (row) and yields a
    # fresh array rather than a view onto `x`
    frames = frames * window_fn(frame_width)

    # compute the power spectrum
    power_spec = power_spectrum(frames)
    energy_per_frame = np.sum(power_spec, axis=1)
    # replace exact zeros so that callers can safely take logarithms
    energy_per_frame[energy_per_frame == 0] = eps

    # compute the power at each filter in the Mel filterbank
    fbank = mel_filterbank(N, n_filters=n_filters, fs=fs)
    filter_energies = power_spec @ fbank.T
    filter_energies -= np.mean(filter_energies, axis=0) if mean_normalize else 0
    filter_energies[filter_energies == 0] = eps
    return filter_energies, energy_per_frame


#######################################################################
#                       Mel-Frequency Features                        #
#######################################################################


def mfcc(
    x,
    fs=44000,
    n_mfccs=13,
    alpha=0.95,
    center=True,
    n_filters=20,
    window="hann",
    normalize=True,
    lifter_coef=22,
    stride_duration=0.01,
    window_duration=0.025,
    replace_intercept=True,
):
    r"""
    Compute the Mel-frequency cepstral coefficients (MFCC) for a signal.

    Notes
    -----
    Computing MFCC features proceeds in the following stages:

        1. Convert the signal into overlapping frames and apply a window fn
        2. Compute the power spectrum at each frame
        3. Apply the mel filterbank to the power spectra to get mel filterbank powers
        4. Take the logarithm of the mel filterbank powers at each frame
        5. Take the discrete cosine transform (DCT) of the log filterbank
           energies and retain only the first k coefficients to further reduce
           the dimensionality

    MFCCs were developed in the context of HMM-GMM automatic speech recognition
    (ASR) systems and can be used to provide a somewhat speaker/pitch
    invariant representation of phonemes.

    Parameters
    ----------
    x : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        A 1D signal consisting of N samples
    fs : int
        The sample rate/frequency for the signal. Default is 44000.
    n_mfccs : int
        The number of cepstral coefficients to return (including the intercept
        coefficient). Default is 13.
    alpha : float in [0, 1)
        The preemphasis coefficient. A value of 0 corresponds to no
        filtering. Default is 0.95.
    center : bool
        Whether the `k` th frame of the signal should *begin* at index ``x[k *
        stride_len]`` (center = False) or be *centered* at ``x[k * stride_len]``
        (center = True). Default is True.
    n_filters : int
        The number of filters to include in the Mel filterbank. Default is 20.
    normalize : bool
        Whether to mean-normalize the MFCC values. Default is True.
    lifter_coef : int in :math:`[0, +\infty]`
        The cepstral filter coefficient. 0 corresponds to no filtering, larger
        values correspond to greater amounts of smoothing. Default is 22.
    window : {'hamming', 'hann', 'blackman_harris'}
        The windowing function to apply to the signal before taking the DFT.
        Default is 'hann'.
    stride_duration : float
        The duration of the hop between consecutive windows (in seconds).
        Default is 0.01.
    window_duration : float
        The duration of each frame / window (in seconds). Default is 0.025.
    replace_intercept : bool
        Replace the first MFCC coefficient (the intercept term) with the
        log of the total frame energy instead. Default is True.

    Returns
    -------
    mfccs : :py:class:`ndarray <numpy.ndarray>` of shape `(G, C)`
        Matrix of Mel-frequency cepstral coefficients. Rows correspond to
        frames, columns to cepstral coefficients
    """
    # stages 1-3: frame + window the signal and project its power spectrum
    # onto the mel filterbank. Mean normalization is deferred until after the
    # DCT, so it is disabled here.
    mel_powers, frame_energies = mel_spectrogram(
        x=x,
        window_duration=window_duration,
        stride_duration=stride_duration,
        mean_normalize=False,
        window=window,
        n_filters=n_filters,
        center=center,
        alpha=alpha,
        fs=fs,
    )

    # stage 4: log-compress the filterbank powers (in decibels)
    log_mel = 10 * np.log10(mel_powers)

    # stage 5: decorrelate each frame with a DCT. The leading coefficients
    # carry most of the information, so everything past `n_mfccs` is dropped.
    cepstra = np.array([DCT(row) for row in log_mel])
    cepstra = cepstra[:, :n_mfccs]

    cepstra = cepstral_lifter(cepstra, D=lifter_coef)
    if normalize:
        cepstra -= np.mean(cepstra, axis=0)

    if replace_intercept:
        # the 0th coefficient says nothing about the shape of the spectrum, so
        # swap in the (natural) log of the frame energy, which is more
        # informative
        cepstra[:, 0] = np.log(frame_energies)
    return cepstra


def mel2hz(mel, formula="htk"):
    r"""
    Convert frequencies expressed on the mel scale into Hz.

    Parameters
    ----------
    mel : :py:class:`ndarray <numpy.ndarray>` of shape `(N, \*)`
        An array of mel frequencies to convert
    formula : {"htk", "slaney"}
        The Mel formula to use. "htk" uses the formula used by the Hidden
        Markov Model Toolkit, and described in O'Shaughnessy (1987). "slaney"
        uses the formula used in the MATLAB auditory toolbox (Slaney, 1998)
        and is currently not implemented. Default is 'htk'

    Returns
    -------
    hz : :py:class:`ndarray <numpy.ndarray>` of shape `(N, \*)`
        The frequencies of the items in `mel`, in Hz

    Raises
    ------
    NotImplementedError
        If `formula` is "slaney".
    """
    fstr = "formula must be either 'htk' or 'slaney' but got '{}'"
    assert formula in ["htk", "slaney"], fstr.format(formula)

    if formula != "htk":
        raise NotImplementedError("slaney")

    # inverse of mel = 2595 * log10(1 + hz / 700)
    return 700 * (10 ** (mel / 2595) - 1)


def hz2mel(hz, formula="htk"):
    r"""
    Convert frequencies expressed in Hz into the mel scale.

    Parameters
    ----------
    hz : :py:class:`ndarray <numpy.ndarray>` of shape `(N, \*)`
        An array of frequencies, in Hz, to convert
    formula : {"htk", "slaney"}
        The Mel formula to use. "htk" uses the formula used by the Hidden
        Markov Model Toolkit, and described in O'Shaughnessy (1987). "slaney"
        uses the formula used in the MATLAB auditory toolbox (Slaney, 1998)
        and is currently not implemented. Default is 'htk'.

    Returns
    -------
    mel : :py:class:`ndarray <numpy.ndarray>` of shape `(N, \*)`
        The mel-scale values of the items in `hz`.

    Raises
    ------
    NotImplementedError
        If `formula` is "slaney".
    """
    fstr = "formula must be either 'htk' or 'slaney' but got '{}'"
    assert formula in ["htk", "slaney"], fstr.format(formula)

    if formula != "htk":
        raise NotImplementedError("slaney")

    return 2595 * np.log10(1 + hz / 700)


def mel_filterbank(
    N, n_filters=20, fs=44000, min_freq=0, max_freq=None, normalize=True
):
    """
    Build the filters of a Mel filterbank and return them as a transformation
    matrix.

    Notes
    -----
    The Mel scale is a perceptual scale designed to simulate the way the human
    ear works. Pitches judged by listeners to be equal in perceptual /
    psychological distance have equal distance on the Mel scale.  Practically,
    this corresponds to a scale with higher resolution at low frequencies and
    lower resolution at higher (> 500 Hz) frequencies.

    Each filter in the Mel filterbank is triangular: before normalization its
    response is 1 at its center frequency and decays linearly on both sides,
    reaching zero at the center frequencies of the adjacent filters.

    This implementation is based on code in the (superb) LibROSA package [1].

    References
    ----------
    .. [1] McFee et al. (2015). "librosa: Audio and music signal analysis in
       Python", *Proceedings of the 14th Python in Science Conference*
       https://librosa.github.io

    Parameters
    ----------
    N : int
        The number of DFT bins
    n_filters : int
        The number of mel filters to include in the filterbank. Default is 20.
    min_freq : int
        Minimum filter frequency (in Hz). Default is 0.
    max_freq : int
        Maximum filter frequency (in Hz). Default is None, in which case
        ``fs / 2`` is used.
    fs : int
        The sample rate/frequency for the signal. Default is 44000.
    normalize : bool
        If True, scale each filter by ``2 / (upper_edge - lower_edge)``, where
        the edges are the filter's band edges in Hz. Default is True.

    Returns
    -------
    fbank : :py:class:`ndarray <numpy.ndarray>` of shape `(n_filters, N // 2 + 1)`
        The mel-filterbank transformation matrix. Rows correspond to filters,
        columns to DFT bins.
    """
    if max_freq is None:
        max_freq = fs / 2

    lo_mel = hz2mel(min_freq)
    hi_mel = hz2mel(max_freq)

    fbank = np.zeros((n_filters, N // 2 + 1))

    # band edges: evenly spaced on the mel scale, then mapped back into Hz.
    # Filter i spans edges_hz[i] .. edges_hz[i + 2] and peaks at edges_hz[i + 1]
    edges_hz = mel2hz(np.linspace(lo_mel, hi_mel, n_filters + 2))
    edge_gaps = np.diff(edges_hz)

    # the centers of the frequency bins for the DFT
    bin_hz = dft_bins(N, fs)

    # offsets[i, j] = edges_hz[i] - bin_hz[j]
    offsets = edges_hz.reshape(-1, 1) - bin_hz.reshape(1, -1)

    # rising[i] climbs from 0 at the lower edge of filter i to 1 at its center;
    # falling[i] drops from 1 at the center to 0 at the upper edge
    rising = -offsets[:n_filters] / edge_gaps[:n_filters, np.newaxis]
    falling = offsets[2 : n_filters + 2] / edge_gaps[1 : n_filters + 1, np.newaxis]

    # the triangle is the lower of the two ramps, clipped at zero outside
    # the band
    fbank[...] = np.maximum(0, np.minimum(rising, falling))

    if normalize:
        band_widths = edges_hz[2 : n_filters + 2] - edges_hz[:n_filters]
        fbank *= (2.0 / band_widths)[:, np.newaxis]

    return fbank


================================================
FILE: numpy_ml/preprocessing/general.py
================================================
import json
import hashlib
import warnings

import numpy as np

try:
    from scipy.sparse import csr_matrix

    _SCIPY = True
except ImportError:
    warnings.warn("Scipy not installed. FeatureHasher can only create dense matrices")
    _SCIPY = False


def minibatch(X, batchsize=256, shuffle=True):
    r"""
    Compute the minibatch indices for a training dataset.

    Notes
    -----
    If `shuffle` is True the index order is randomized once, when this
    function is called (not lazily when the generator is first consumed).

    Parameters
    ----------
    X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, \*)`
        The dataset to divide into minibatches. Assumes the first dimension
        represents the number of training examples.
    batchsize : int
        The desired size of each minibatch. Note, however, that if ``X.shape[0] %
        batchsize > 0`` then the final batch will contain fewer than batchsize
        entries. Default is 256.
    shuffle : bool
        Whether to shuffle the entries in the dataset before dividing into
        minibatches. Default is True.

    Returns
    -------
    mb_generator : generator
        A generator which yields the indices into `X` for each batch.
    n_batches: int
        The number of batches.
    """
    n_examples = X.shape[0]
    order = np.arange(n_examples)
    n_batches = int(np.ceil(n_examples / batchsize))

    if shuffle:
        np.random.shuffle(order)

    # lazily slice consecutive runs of `batchsize` indices out of `order`
    batches = (
        order[b * batchsize : (b + 1) * batchsize] for b in range(n_batches)
    )
    return batches, n_batches


class OneHotEncoder:
    def __init__(self):
        """
        Convert between category labels and their one-hot vector
        representations.

        Notes
        -----
        The constructor takes no arguments. The label <-> column mappings are
        created either by calling :meth:`fit` explicitly, or implicitly on the
        first call to :meth:`transform`.
        """
        self._is_fit = False
        self.hyperparameters = {}
        self.parameters = {"categories": None}

        # label -> column index and column index -> label; populated by `fit`
        self.cat2idx = {}
        self.idx2cat = {}

    def __call__(self, labels):
        """Shorthand for :meth:`transform` using the default arguments."""
        return self.transform(labels)

    @staticmethod
    def _infer_categories(labels):
        """Return the unique items in `labels` in a reproducible order."""
        try:
            # iterating a bare `set` gives an order that can change between
            # interpreter runs (e.g., for strings); sorting pins the columns
            return sorted(set(labels))
        except TypeError:
            # labels that cannot be ordered: fall back to first-seen order
            return list(dict.fromkeys(labels))

    def fit(self, categories):
        """
        Create mappings between columns and category labels.

        Parameters
        ----------
        categories : list of length `C`
            List of the unique category labels for the items to encode. The
            position of a label in `categories` is the column it is encoded
            to.
        """
        # materialize once so that one-shot iterables are not exhausted
        # between building the two mappings
        categories = list(categories)

        self.parameters["categories"] = categories
        self.cat2idx = {c: i for i, c in enumerate(categories)}
        self.idx2cat = {i: c for i, c in enumerate(categories)}
        self._is_fit = True

    def transform(self, labels, categories=None):
        """
        Convert a list of labels into a one-hot encoding.

        Parameters
        ----------
        labels : list of length `N`
            A list of category labels.
        categories : list of length `C`
            List of the unique category labels for the items to encode. Only
            used if the encoder has not been fit yet; if None in that case,
            the categories are inferred from the unique items in `labels`
            (sorted when the labels are orderable). Default is None.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(N, C)`
            The one-hot encoded labels. Each row corresponds to an example,
            with a single 1 in the column corresponding to the respective
            label.
        """
        if not self._is_fit:
            if categories is None:
                categories = self._infer_categories(labels)
            self.fit(categories)

        unknown = list(set(labels) - set(self.cat2idx.keys()))
        assert len(unknown) == 0, "Unrecognized label(s): {}".format(unknown)

        N, C = len(labels), len(self.cat2idx)
        cols = np.array([self.cat2idx[c] for c in labels])

        Y = np.zeros((N, C))
        Y[np.arange(N), cols] = 1
        return Y

    def inverse_transform(self, Y):
        """
        Convert a one-hot encoding back into the corresponding labels

        Parameters
        ----------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(N, C)`
            One-hot encoded labels. Each row corresponds to an example, with a
            single 1 in the column associated with the label for that example

        Returns
        -------
        labels : list of length `N`
            The list of category labels corresponding to the nonzero columns in
            `Y`
        """
        assert self._is_fit, "Must fit `OneHotEncoder` before calling inverse_transform"

        C = len(self.cat2idx)
        assert Y.ndim == 2, "Y must be 2D, but has shape {}".format(Y.shape)
        assert Y.shape[1] == C, "Y must have {} columns, got {}".format(C, Y.shape[1])

        # `nonzero` returns (row indices, column indices) in row-major order,
        # so the column indices line up with the examples when every row
        # holds exactly one nonzero entry
        return [self.idx2cat[ix] for ix in Y.nonzero()[1]]


class Standardizer:
    def __init__(self, with_mean=True, with_std=True):
        """
        Feature-wise standardization for vector inputs.

        Notes
        -----
        Due to the sensitivity of empirical mean and standard deviation
        calculations to extreme values, `Standardizer` cannot guarantee
        balanced feature scales in the presence of outliers. In particular,
        note that because outliers for each feature can have different
        magnitudes, the spread of the transformed data on each feature can be
        very different.

        Similar to sklearn, `Standardizer` uses a biased estimator for the
        standard deviation: ``numpy.std(x, ddof=0)``.

        Parameters
        ----------
        with_mean : bool
            Whether to scale samples to have 0 mean during transformation.
            Default is True.
        with_std : bool
            Whether to scale samples to have unit variance during
            transformation. Default is True.
        """
        self.with_mean = with_mean
        self.with_std = with_std
        self._is_fit = False

        # feature-wise statistics; populated by `fit`
        self._mean = None
        self._std = None

    @property
    def hyperparameters(self):
        """A dict containing the `with_mean` and `with_std` settings."""
        H = {"with_mean": self.with_mean, "with_std": self.with_std}
        return H

    @property
    def parameters(self):
        """
        A dict with the fitted feature-wise "mean" and "std" arrays. Both
        entries are None until `fit` has been called.
        """
        # NB: the statistics live on the private `_mean` / `_std` attributes
        params = {
            "mean": getattr(self, "_mean", None),
            "std": getattr(self, "_std", None),
        }
        return params

    def __call__(self, X):
        """Shorthand for :meth:`transform`."""
        return self.transform(X)

    def fit(self, X):
        """
        Store the feature-wise mean and standard deviation across the samples
        in `X` for future scaling.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, C)`
            An array of N samples, each with dimensionality `C`

        Raises
        ------
        ValueError
            If `X` contains fewer than 2 samples.
        """
        if not isinstance(X, np.ndarray):
            X = np.array(X)

        if X.shape[0] < 2:
            raise ValueError("`X` must contain at least 2 samples")

        # identity statistics are used for whichever operation is disabled
        std = np.ones(X.shape[1])
        mean = np.zeros(X.shape[1])

        if self.with_mean:
            mean = np.mean(X, axis=0)

        if self.with_std:
            std = np.std(X, axis=0, ddof=0)

        self._mean = mean
        self._std = std
        self._is_fit = True

    def transform(self, X):
        """
        Standardize features by removing the mean and scaling to unit variance.

        For a sample `x`, the standardized score is calculated as:

        .. math::

            z = (x - u) / s

        where `u` is the mean of the training samples or zero if `with_mean` is
        False, and `s` is the standard deviation of the training samples or 1
        if `with_std` is False.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, C)`
            An array of N samples, each with dimensionality `C`.

        Returns
        -------
        Z : :py:class:`ndarray <numpy.ndarray>` of shape `(N, C)`
            The feature-wise standardized version of `X`.

        Raises
        ------
        Exception
            If `fit` has not been called yet.
        """
        if not self._is_fit:
            raise Exception("Must call `fit` before using the `transform` method")
        return (X - self._mean) / self._std

    def inverse_transform(self, Z):
        """
        Convert a collection of standardized features back into the original
        feature space.

        For a standardized sample `z`, the unstandardized score is calculated as:

        .. math::

            x = z s + u

        where `u` is the mean of the training samples or zero if `with_mean` is
        False, and `s` is the standard deviation of the training samples or 1
        if `with_std` is False.

        Parameters
        ----------
        Z : :py:class:`ndarray <numpy.ndarray>` of shape `(N, C)`
            An array of `N` standardized samples, each with dimensionality `C`.

        Returns
        -------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, C)`
            The unstandardized samples from `Z`.
        """
        assert self._is_fit, "Must fit `Standardizer` before calling inverse_transform"
        P = self.parameters
        mean, std = P["mean"], P["std"]
        return Z * std + mean


class FeatureHasher:
    def __init__(self, n_dim=256, sparse=True):
        """
        Convert a collection of features to a fixed-dimensional matrix using
        the hashing trick.

        Notes
        -----
        Uses the md5 hash.

        Parameters
        ----------
        n_dim : int
            The dimensionality of each example in the output feature matrix.
            Small numbers of features are likely to cause hash collisions, but
            large numbers will cause larger overall parameter dimensions for
            any (linear) learning agent. Default is 256.
        sparse : bool
            Whether the resulting feature matrix should be a sparse
            :py:class:`csr_matrix <scipy.sparse.csr_matrix>` or dense
            :py:class:`ndarray <numpy.ndarray>`. Default is True.
        """
        self.n_dim = n_dim
        self.hash = hashlib.md5
        # fall back to a dense matrix when the module-level `_SCIPY` flag is
        # falsy
        self.sparse = sparse and _SCIPY

    def encode(self, examples):
        """
        Encode a collection of multi-featured examples into a
        `n_dim`-dimensional feature matrix via feature hashing.

        Notes
        -----
        Feature hashing works by applying a hash function to the features of an
        example and using the hash values as column indices in the resulting
        feature matrix. The entries at each hashed feature column correspond to
        the values for that example and feature. For example, given the
        following two input examples:

            >>> examples = [
                {"furry": 1, "quadruped": 1, "domesticated": 1},
                {"nocturnal": 1, "quadruped": 1},
            ]

        and a hypothetical hash function `H` mapping strings to [0, 127], we have:

            >>> feat_mat = zeros((2, 128))
            >>> ex1_cols = [H("furry"), H("quadruped"), H("domesticated")]
            >>> ex2_cols = [H("nocturnal"), H("quadruped")]
            >>> feat_mat[0, ex1_cols] = 1
            >>> feat_mat[1, ex2_cols] = 1

        To better handle hash collisions, it is common to multiply the feature
        value by the sign of the digest for the corresponding feature name.
        Values of features that collide within one example are summed.

        Parameters
        ----------
        examples : dict or list of dicts
            A collection of `N` examples, each represented as a dict where keys
            correspond to the feature name and values correspond to the feature
            value.

        Returns
        -------
        table : :py:class:`ndarray <numpy.ndarray>` or :py:class:`csr_matrix <scipy.sparse.csr_matrix>` of shape `(N, n_dim)`
            The encoded feature matrix
        """
        if isinstance(examples, dict):
            examples = [examples]

        if self.sparse:
            return self._encode_sparse(examples)
        return self._encode_dense(examples)

    def _hash_feature(self, f_id):
        """
        Hash a feature id, returning its `(column, sign)` pair.

        Strings are UTF-8 encoded and tuples / dicts / lists are serialized
        with :mod:`json` first. Any other id is handed to the hash function
        unchanged, so it must already be compatible with the buffer API
        (required by hashlib).
        """
        if isinstance(f_id, str):
            f_id = f_id.encode("utf-8")
        elif isinstance(f_id, (tuple, dict, list)):
            # use json module to convert the feature id into a unique
            # string compatible with the buffer API (required by hashlib)
            f_id = json.dumps(f_id, sort_keys=True).encode("utf-8")

        h = int(self.hash(f_id).hexdigest(), base=16)
        # NOTE: `h` is parsed from an unsigned hex digest, so it is never
        # negative and the sign is +1 (0 only for an all-zero digest). It is
        # computed in plain Python because the digest does not fit in a
        # fixed-width integer.
        sign = (h > 0) - (h < 0)
        return h % self.n_dim, sign

    def _encode_dense(self, examples):
        """Build the `(N, n_dim)` feature matrix as a dense ndarray"""
        N = len(examples)
        # the shape must be passed as a single tuple: a second positional
        # argument to `np.zeros` is interpreted as the dtype
        table = np.zeros((N, self.n_dim))

        for row, feat_dict in enumerate(examples):
            for f_id, val in feat_dict.items():
                col, sign = self._hash_feature(f_id)
                table[row, col] += sign * val

        return table

    def _encode_sparse(self, examples):
        """Build the `(N, n_dim)` feature matrix as a scipy csr_matrix"""
        N = len(examples)
        rows, cols, data = [], [], []

        for row, feat_dict in enumerate(examples):
            for f_id, val in feat_dict.items():
                col, sign = self._hash_feature(f_id)
                rows.append(row)
                cols.append(col)
                data.append(sign * val)

        # explicit integer index arrays (rather than a one-shot `zip`
        # iterator) keep the constructor valid even when there are no entries
        index = (np.asarray(rows, dtype=int), np.asarray(cols, dtype=int))
        return csr_matrix((data, index), shape=(N, self.n_dim))


================================================
FILE: numpy_ml/preprocessing/nlp.py
================================================
"""Common preprocessing utilities for working with text data"""
import re
import heapq
import os.path as op
from collections import Counter, OrderedDict, defaultdict

import numpy as np


# This list of English stop words is taken from the "Glasgow Information
# Retrieval Group". The original list can be found at
# http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words
#
# The words are written as one space-separated string and split into a set.
# Every entry is lowercase, which is why `remove_stop_words` lowercases each
# word before testing membership.
_STOP_WORDS = set(
    (
        "a about above across after afterwards again against all almost alone "
        "along already also although always am among amongst amoungst amount an "
        "and another any anyhow anyone anything anyway anywhere are around as at "
        "back be became because become becomes becoming been before beforehand "
        "behind being below beside besides between beyond bill both bottom but by "
        "call can cannot cant co con could couldnt cry de describe detail do done "
        "down due during each eg eight either eleven else elsewhere empty enough "
        "etc even ever every everyone everything everywhere except few fifteen "
        "fifty fill find fire first five for former formerly forty found four from "
        "front full further get give go had has hasnt have he hence her here "
        "hereafter hereby herein hereupon hers herself him himself his how however "
        "hundred i ie if in inc indeed interest into is it its itself keep last "
        "latter latterly least less ltd made many may me meanwhile might mill mine "
        "more moreover most mostly move much must my myself name namely neither "
        "never nevertheless next nine no nobody none noone nor not nothing now "
        "nowhere of off often on once one only onto or other others otherwise our "
        "ours ourselves out over own part per perhaps please put rather re same see "
        "seem seemed seeming seems serious several she should show side since "
        "sincere six sixty so some somehow someone something sometime sometimes "
        "somewhere still such system take ten than that the their them themselves "
        "then thence there thereafter thereby therefore therein thereupon these "
        "they thick thin third this those though three through throughout thru thus "
        "to together too top toward towards twelve twenty two un under until up "
        "upon us very via was we well were what whatever when whence whenever where "
        "whereafter whereas whereby wherein whereupon wherever whether which while "
        "whither who whoever whole whom whose why will with within without would "
        "yet you your yours yourself yourselves"
    ).split(" "),
)

# words of two or more word characters; punctuation is never matched
_WORD_REGEX = re.compile(r"(?u)\b\w\w+\b")  # sklearn default
# runs of word characters, plus each punctuation character as its own token
_WORD_REGEX_W_PUNC = re.compile(r"(?u)\w+|[^a-zA-Z0-9\s]")
# as above, but each token may carry one adjacent whitespace character on
# either side. The leading `\s?` of the word branch was previously a literal
# `s?`, which could never capture whitespace in front of a word.
_WORD_REGEX_W_PUNC_AND_WHITESPACE = re.compile(r"(?u)\s?\w+\s?|\s?[^a-zA-Z0-9\s]\s?")

# Matches the decimal codes of the ASCII punctuation characters listed in
# `_PUNCTUATION` inside a string of space-separated byte values. The digit
# lookarounds make sure only whole byte values match: without them the code
# "33" would also match inside "133", splitting non-ASCII bytes apart.
_PUNC_BYTE_REGEX = re.compile(
    r"(?<!\d)(33|34|35|36|37|38|39|40|41|42|43|44|45|"
    r"46|47|58|59|60|61|62|63|64|91|92|93|94|"
    r"95|96|123|124|125|126)(?!\d)",
)
_PUNCTUATION = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
# translation table that deletes every punctuation character
_PUNC_TABLE = str.maketrans("", "", _PUNCTUATION)


def ngrams(sequence, N):
    """
    Return every run of `N` consecutive elements of `sequence` as a list of
    tuples, in order of appearance. A sequence shorter than `N` yields an
    empty list.
    """
    assert N >= 1
    n_windows = len(sequence) - N + 1
    return [tuple(sequence[start : start + N]) for start in range(n_windows)]


def tokenize_whitespace(
    line, lowercase=True, filter_stopwords=True, filter_punctuation=True, **kwargs,
):
    """
    Split a string at any whitespace characters, optionally removing
    punctuation and stop-words in the process.

    Parameters
    ----------
    line : str
        The string to tokenize.
    lowercase : bool
        Whether to lowercase `line` before splitting. Default is True.
    filter_stopwords : bool
        Whether to drop stop-words from the result. Default is True.
    filter_punctuation : bool
        Whether to strip punctuation characters from each token. Tokens that
        consist only of punctuation are dropped. Default is True.
    **kwargs
        Ignored; accepted so all tokenizers share one call signature.

    Returns
    -------
    words : list of str
        The tokens of `line`.
    """
    line = line.lower() if lowercase else line
    words = line.split()
    if filter_punctuation:
        # the stripped tokens must replace `words` (they were previously
        # assigned to `line` and then discarded); tokens left empty after
        # stripping carry no information and are removed
        stripped = (strip_punctuation(w) for w in words)
        words = [w for w in stripped if w]
    return remove_stop_words(words) if filter_stopwords else words


def tokenize_words(
    line, lowercase=True, filter_stopwords=True, filter_punctuation=True, **kwargs,
):
    """
    Extract the word tokens of `line` with a regular expression.

    When `filter_punctuation` is True only words are returned; otherwise each
    punctuation character is returned as a token of its own. The text is
    lowercased first if `lowercase` is True, and stop-words are dropped from
    the result if `filter_stopwords` is True. Extra keyword arguments are
    ignored.
    """
    text = line.lower() if lowercase else line

    if filter_punctuation:
        pattern = _WORD_REGEX
    else:
        pattern = _WORD_REGEX_W_PUNC

    tokens = pattern.findall(text)
    if filter_stopwords:
        return remove_stop_words(tokens)
    return tokens


def tokenize_words_bytes(
    line,
    lowercase=True,
    filter_stopwords=True,
    filter_punctuation=True,
    encoding="utf-8",
    **kwargs,
):
    """
    Tokenize `line` with :func:`tokenize_words` and convert every resulting
    word into a string of space-separated decimal byte values under
    `encoding`.
    """
    tokens = tokenize_words(
        line,
        lowercase=lowercase,
        filter_stopwords=filter_stopwords,
        filter_punctuation=filter_punctuation,
        **kwargs,
    )
    byte_strs = []
    for tok in tokens:
        # iterating a `bytes` object yields its integer byte values
        byte_strs.append(" ".join(map(str, tok.encode(encoding))))
    return byte_strs


def tokenize_bytes_raw(line, encoding="utf-8", splitter=None, **kwargs):
    """
    Convert the characters in `line` to a collection of bytes. Each byte is
    represented in decimal as an integer between 0 and 255.

    Parameters
    ----------
    line : str
        The string to tokenize.
    encoding : str
        The encoding scheme for the characters in `line`. Default is `'utf-8'`.
    splitter : {'punctuation', None}
        If `'punctuation'`, split the string at any punctuation character
        before encoding into bytes. If None, do not split `line` at all.
        Default is None.
    **kwargs
        Ignored; accepted so all tokenizers share one call signature.

    Returns
    -------
    bytes : list
        A list of the byte-encoded characters in `line`. Each item in the list
        is a string of space-separated integers between 0 and 255 representing
        the bytes encoding the characters in `line`. With
        `splitter='punctuation'` every punctuation byte is an item of its own
        and the runs of bytes between them form the remaining items; items
        never contain leading / trailing spaces and are never empty.
    """
    byte_vals = line.encode(encoding)
    if splitter != "punctuation":
        return [" ".join(str(b) for b in byte_vals)]

    # Decimal codes of the ASCII punctuation characters
    # !"#$%&'()*+,-./  :;<=>?@  [\]^_`  {|}~
    # Splitting on the byte values themselves (instead of on their decimal
    # text) guarantees that a code such as 33 is never found inside 133.
    punc_codes = {*range(33, 48), *range(58, 65), *range(91, 97), *range(123, 127)}

    chunks, current = [], []
    for b in byte_vals:
        if b in punc_codes:
            if current:
                chunks.append(" ".join(current))
                current = []
            chunks.append(str(b))
        else:
            current.append(str(b))
    if current:
        chunks.append(" ".join(current))
    return chunks


def bytes_to_chars(byte_list, encoding="utf-8"):
    """
    Decode bytes (represented as an integer between 0 and 255) to characters in
    the specified encoding.

    Parameters
    ----------
    byte_list : iterable of ints
        The byte values to decode, each between 0 and 255.
    encoding : str
        The encoding used to interpret the bytes. Default is `'utf-8'`.

    Returns
    -------
    text : str
        The decoded string.

    Raises
    ------
    ValueError
        If a value lies outside of [0, 255]. (This includes
        :class:`UnicodeDecodeError`, raised when the bytes are not valid under
        `encoding`.)
    """
    # `bytes` validates the range of every value, whereas a round trip through
    # hex text would silently reinterpret e.g. 4096 as the two bytes 0x10 0x00.
    # Materializing the list first rejects a bare int, which `bytes` would
    # otherwise treat as a length.
    return bytes(list(byte_list)).decode(encoding)


def tokenize_chars(line, lowercase=True, filter_punctuation=True, **kwargs):
    """
    Split a string into a list of its individual characters.

    The string is optionally lowercased and stripped of punctuation first.
    Runs of two or more spaces are then collapsed into a single space and
    leading / trailing whitespace is removed. Extra keyword arguments are
    ignored.
    """
    if lowercase:
        line = line.lower()
    if filter_punctuation:
        line = strip_punctuation(line)

    collapsed = re.sub(" {2,}", " ", line)
    return list(collapsed.strip())


def remove_stop_words(words):
    """
    Return the entries of `words` whose lowercased form is not a stop word,
    preserving their order and original casing.
    """
    kept = []
    for word in words:
        if word.lower() in _STOP_WORDS:
            continue
        kept.append(word)
    return kept


def strip_punctuation(line):
    """
    Delete every punctuation character from `line`, then trim leading and
    trailing whitespace from what remains.
    """
    without_punc = line.translate(_PUNC_TABLE)
    return without_punc.strip()


#######################################################################
#                          Byte-Pair Encoder                          #
#######################################################################


class BytePairEncoder(object):
    def __init__(self, max_merges=3000, encoding="utf-8"):
        """
        A byte-pair encoder for sub-word embeddings.

        Notes
        -----
        Byte-pair encoding [1][2] is a compression algorithm that iteratively
        replaces the most frequently occurring byte pairs in a set of documents
        with a new, single token. It has gained popularity as a preprocessing
        step for many NLP tasks due to its simplicity and expressiveness: using
        a base codebook of just 256 unique tokens (bytes), any string can be
        encoded.

        References
        ----------
        .. [1] Gage, P. (1994). A new algorithm for data compression. *C
           Users Journal, 12(2)*, 23–38.
        .. [2] Sennrich, R., Haddow, B., & Birch, A. (2015). Neural machine
           translation of rare words with subword units, *Proceedings of the
           54th Annual Meeting of the Association for Computational
           Linguistics,* 1715-1725.

        Parameters
        ----------
        max_merges : int
            The maximum number of byte pair merges to perform during the
            :meth:`fit` operation. Default is 3000.
        encoding : str
            The encoding scheme for the documents used to train the encoder.
            Default is `'utf-8'`.
        """
        self.parameters = {
            "max_merges": max_merges,
            "encoding": encoding,
        }
        self._reset_codebook()

    def _reset_codebook(self):
        """Restore the byte <-> token dictionaries to the 256 single bytes"""
        # bytes are represented in decimal as integers between 0 and 255.
        # there is a 1:1 correspondence between token and byte representations
        # up to 255.
        self.byte2token = OrderedDict({i: i for i in range(256)})
        self.token2byte = OrderedDict({v: k for k, v in self.byte2token.items()})

    def fit(self, corpus_fps, encoding="utf-8"):
        """
        Train a byte pair codebook on a set of documents.

        Any codewords learned by an earlier call to `fit` are discarded.

        Parameters
        ----------
        corpus_fps : str or list of strs
            The filepath / list of filepaths for the document(s) to be used to
            learn the byte pair codebook.
        encoding : str
            The text encoding for documents. Common entries are either 'utf-8'
            (no header byte), or 'utf-8-sig' (header byte). Default is
            'utf-8'.

        Returns
        -------
        self : :class:`BytePairEncoder` instance
        """
        vocab = (
            Vocabulary(
                lowercase=False,
                min_count=None,
                max_tokens=None,
                filter_stopwords=False,
                filter_punctuation=False,
                tokenizer="bytes",
            )
            .fit(corpus_fps, encoding=encoding)
            .counts
        )
        self._learn_codebook(vocab)
        return self

    def _learn_codebook(self, vocab):
        """
        Run the merge loop on `vocab` (a mapping from strings of
        space-separated byte values to counts) and register the merged
        codewords that survive in the final vocabulary.
        """
        # iteratively merge the most common byte bigram across the documents
        for _ in range(self.parameters["max_merges"]):
            pair_counts = self._get_counts(vocab)
            if not pair_counts:
                # every word is a single symbol: nothing is left to merge
                break
            most_common_bigram = max(pair_counts, key=pair_counts.get)
            vocab = self._merge(most_common_bigram, vocab)

        # merged symbols are the ones joined by "-"
        merged = set()
        for word in vocab:
            merged.update(w for w in word.split(" ") if "-" in w)

        # sort so the token IDs do not depend on set iteration order, and start
        # from a clean codebook so a refit cannot leave stale entries behind
        byte_tuples = sorted(tuple(int(j) for j in t.split("-")) for t in merged)
        self._reset_codebook()
        for i, byte_tuple in enumerate(byte_tuples):
            self.token2byte[256 + i] = byte_tuple
            self.byte2token[byte_tuple] = 256 + i

    def _get_counts(self, vocab):
        """Collect bigram counts for the tokens in vocab"""
        pair_counts = defaultdict(int)
        for word, count in vocab.items():
            symbols = word.split(" ")
            for pair in zip(symbols, symbols[1:]):
                pair_counts[pair] += count
        return pair_counts

    def _merge(self, bigram, vocab):
        """Replace `bigram` with a single token and update vocab accordingly"""
        v_out = {}
        bg = re.escape(" ".join(bigram))
        # the lookarounds only accept matches delimited by whitespace or the
        # ends of the word, i.e. matches made of whole symbols
        bigram_regex = re.compile(r"(?<!\S)" + bg + r"(?!\S)")
        merged_symbol = "-".join(bigram)
        for word, count in vocab.items():
            # bigram "a b" becomes "a-b"
            v_out[bigram_regex.sub(merged_symbol, word)] = count
        return v_out

    def transform(self, text):
        """
        Transform the words in `text` into their byte pair encoded token IDs.

        Parameters
        ----------
        text: str or list of `N` strings
            The list of strings to encode

        Returns
        -------
        codes : list of `N` lists
            A list of byte pair token IDs for each of the `N` strings in
            `text`.

        Examples
        --------
        >>> B = BytePairEncoder(max_merges=100).fit("./example.txt")
        >>> encoded_tokens = B.transform("Hello! How are you 😁 ?")
        >>> encoded_tokens
        [[72, 879, 474, ...]]
        """
        if isinstance(text, str):
            text = [text]
        return [self._transform(string) for string in text]

    def _transform(self, text):
        """Transform a single text string to a list of byte-pair IDs"""
        byte_vals = list(text.encode(self.parameters["encoding"]))
        n_bytes = len(byte_vals)

        # no candidate longer than the longest codeword can be in the codebook
        max_len = max(
            (len(k) for k in self.byte2token if isinstance(k, tuple)), default=1
        )

        encoded = []
        start = 0
        while start < n_bytes:
            # greedily look for the longest codeword beginning at `start`,
            # shrinking the window by one byte after every miss
            stop = min(n_bytes, start + max_len)
            while (
                stop - start > 1
                and tuple(byte_vals[start:stop]) not in self.byte2token
            ):
                stop -= 1

            if stop - start > 1:
                encoded.append(self.byte2token[tuple(byte_vals[start:stop])])
            else:
                # a single byte is its own token
                encoded.append(byte_vals[start])
            start = stop
        return encoded

    def inverse_transform(self, codes):
        """
        Transform an encoded sequence of byte pair codeword IDs back into
        human-readable text.

        Parameters
        ----------
        codes : list of `N` lists
            A list of `N` lists. Each sublist is a collection of integer
            byte-pair token IDs representing a particular text string. A
            single flat list of token IDs is treated as one string.

        Returns
        -------
        text: list of `N` strings
            The decoded strings corresponding to the `N` sublists in `codes`.

        Examples
        --------
        >>> B = BytePairEncoder(max_merges=100).fit("./example.txt")
        >>> encoded_tokens = B.transform("Hello! How are you 😁 ?")
        >>> encoded_tokens
        [[72, 879, 474, ...]]
        >>> B.inverse_transform(encoded_tokens)
        ["Hello! How are you 😁 ?"]
        """
        if len(codes) == 0:
            return []
        if isinstance(codes[0], (int, np.integer)):
            codes = [codes]

        encoding = self.parameters["encoding"]
        decoded = []
        for code in codes:
            byte_vals = []
            for t in code:
                if t > 255:
                    byte_vals.extend(self.token2byte[t])
                else:
                    byte_vals.append(t)
            decoded.append(bytes(byte_vals).decode(encoding))
        return decoded

    @property
    def codebook(self):
        """
        A list of the learned byte pair codewords, decoded into human-readable
        format
        """
        return [
            self.inverse_transform(t)[0]
            for t in self.byte2token.keys()
            if isinstance(t, tuple)
        ]

    @property
    def tokens(self):
        """A list of the byte pair codeword IDs"""
        return list(self.token2byte.keys())


#######################################################################
#                            Huffman Tree                             #
#######################################################################


class Node(object):
    def __init__(self, key, val):
        """
        A node of the Huffman tree.

        `key` is the token stored at a leaf (None for internal nodes) and
        `val` is the frequency the node is ordered by.
        """
        self.key = key
        self.val = val
        self.left = None
        self.right = None

    # Comparisons against anything that is not a `Node` return
    # `NotImplemented` so Python can try the reflected operation or raise a
    # `TypeError`. (Returning -1, a truthy value, made every such comparison
    # appear to succeed.)

    def __gt__(self, other):
        """Greater than"""
        if not isinstance(other, Node):
            return NotImplemented
        return self.val > other.val

    def __ge__(self, other):
        """Greater than or equal to"""
        if not isinstance(other, Node):
            return NotImplemented
        return self.val >= other.val

    def __lt__(self, other):
        """Less than"""
        if not isinstance(other, Node):
            return NotImplemented
        return self.val < other.val

    def __le__(self, other):
        """Less than or equal to"""
        if not isinstance(other, Node):
            return NotImplemented
        return self.val <= other.val


class HuffmanEncoder(object):
    def fit(self, text):
        """
        Build a Huffman tree for the tokens in `text` and compute each token's
        binary encoding.

        Notes
        -----
        In a Huffman code, tokens that occur more frequently are (generally)
        represented using fewer bits. Huffman codes produce the minimum expected
        codeword length among all methods for encoding tokens individually.

        Huffman codes correspond to paths through a binary tree, with 1
        corresponding to "move right" and 0 corresponding to "move left". In
        contrast to standard binary trees, the Huffman tree is constructed from the
        bottom up. Construction begins by initializing a min-heap priority queue
        consisting of each token in the corpus, with priority corresponding to the
        token frequency. At each step, the two most infrequent tokens in the corpus
        are removed and become the children of a parent pseudotoken whose
        "frequency" is the sum of the frequencies of its children. This new parent
        pseudotoken is added to the priority queue and the process is repeated
        recursively until no tokens remain.

        If `text` contains a single unique token, that token is assigned the
        code "0". If `text` is empty, the resulting code tables are empty.

        Parameters
        ----------
        text: list of strs or :class:`Vocabulary` instance
            The tokenized text or a pretrained :class:`Vocabulary` object to use for
            building the Huffman code.

        Returns
        -------
        self : :class:`HuffmanEncoder` instance
        """
        self._build_tree(text)
        self._generate_codes()
        return self

    def transform(self, text):
        """
        Transform the words in `text` into their Huffman-code representations.

        Parameters
        ----------
        text: list of `N` strings
            The list of words to encode

        Returns
        -------
        codes : list of `N` binary strings
            The encoded words in `text`. Words that are not in the Huffman
            tree trigger a warning and are encoded as None.
        """
        import warnings

        if isinstance(text, str):
            text = [text]
        # `dict.fromkeys` deduplicates while keeping first-seen order, so
        # warnings are emitted in a deterministic order
        for token in dict.fromkeys(text):
            if token not in self._item2code:
                # warn (rather than raise) so that the lookup below can skip
                # the unknown token as the message promises
                warnings.warn(
                    "Token '{}' not in Huffman tree. Skipping".format(token),
                    stacklevel=2,
                )
        return [self._item2code.get(t, None) for t in text]

    def inverse_transform(self, codes):
        """
        Transform an encoded sequence of bit-strings back into words.

        Parameters
        ----------
        codes : list of `N` binary strings
            A list of encoded bit-strings, represented as strings.

        Returns
        -------
        text: list of `N` strings
            The decoded text. Bit-strings that are not codes of the Huffman
            tree trigger a warning and are decoded as None.
        """
        import warnings

        if isinstance(codes, str):
            codes = [codes]
        for code in dict.fromkeys(codes):
            if code not in self._code2item:
                warnings.warn(
                    "Code '{}' not in Huffman tree. Skipping".format(code),
                    stacklevel=2,
                )
        return [self._code2item.get(c, None) for c in codes]

    @property
    def tokens(self):
        """A list the unique tokens in `text`"""
        return list(self._item2code.keys())

    @property
    def codes(self):
        """A list with the Huffman code for each unique token in `text`"""
        return list(self._code2item.keys())

    def _counter(self, text):
        """Count the occurrences of each item in `text`"""
        return dict(Counter(text))

    def _token_counts(self, text):
        """Return a token -> frequency mapping for `text`"""
        # plain token sequences are the common case and are counted directly
        if isinstance(text, (list, tuple)):
            return self._counter(text)
        if isinstance(text, Vocabulary):
            return text.counts
        return self._counter(text)

    def _build_tree(self, text):
        """Construct Huffman Tree"""
        counts = self._token_counts(text)

        # create a priority queue with priority = item frequency
        PQ = [Node(k, c) for (k, c) in counts.items()]
        heapq.heapify(PQ)

        while len(PQ) > 1:
            node1 = heapq.heappop(PQ)  # item with smallest frequency
            node2 = heapq.heappop(PQ)  # item with second smallest frequency

            parent = Node(None, node1.val + node2.val)
            parent.left = node1
            parent.right = node2

            heapq.heappush(PQ, parent)

        # an empty corpus has no tree at all
        self._root = heapq.heappop(PQ) if PQ else None

    def _generate_codes(self):
        """Walk the tree and fill the token <-> code lookup tables"""
        self._item2code = {}
        self._code2item = {}

        root = self._root
        if root is not None and root.key is not None:
            # the tree is a single leaf: give the lone token a one-bit code
            # instead of the empty string
            self._build_code(root, "0")
        else:
            self._build_code(root, "")

    def _build_code(self, root, current_code):
        """Record the code of every leaf below `root`, prefixed by `current_code`"""
        if root is None:
            return

        if root.key is not None:
            self._item2code[root.key] = current_code
            self._code2item[current_code] = root.key
            return

        # 0 = move left, 1 = move right
        self._build_code(root.left, current_code + "0")
        self._build_code(root.right, current_code + "1")


#######################################################################
#                             Containers                              #
#######################################################################


class Token:
    """A word together with a running count of its occurrences."""

    def __init__(self, word):
        self.word = word
        # counts start at zero
        self.count = 0

    def __repr__(self):
        """A string representation of the token"""
        template = "Token(word='{}', count={})"
        return template.format(self.word, self.count)


class TFIDFEncoder:
    def __init__(
        self,
        vocab=None,
        lowercase=True,
        min_count=0,
        smooth_idf=True,
        max_tokens=None,
        input_type="files",
        filter_stopwords=True,
        filter_punctuation=True,
        tokenizer="words",
    ):
        r"""
        An object for compiling and encoding the term-frequency
        inverse-document-frequency (TF-IDF) representation of the tokens in a
        text corpus.

        Notes
        -----
        TF-IDF is intended to reflect how important a word is to a document in
        a collection or corpus. For a word token `w` in a document `d`, and a
        corpus, :math:`D = \{d_1, \ldots, d_N\}`, we have:

        .. math::
            \text{TF}(w, d)  &=  \text{num. occurrences of }w \text{ in document }d \\
            \text{IDF}(w, D)  &=  \log \frac{|D|}{|\{ d \in D: w \in d \}|}

        The IDF actually stored by this class additionally adds 1 to the
        logarithm (see :meth:`_calc_idf`).

        Parameters
        ----------
        vocab : :class:`Vocabulary` object or list-like
            An existing vocabulary to filter the tokens in the corpus against.
            A list, tuple, set, frozenset, or numpy array of words is converted
            to a set, and every token outside it is recoded as ``<unk>``.
            Default is None (no filtering).
        lowercase : bool
            Whether to convert each string to lowercase before tokenization.
            Default is True.
        min_count : int
            Minimum number of times a token must occur in order to be included
            in vocab. Default is 0.
        smooth_idf : bool
            Whether to add 1 to both the number of documents and the
            per-token document count in the IDF calculation, which avoids
            divide-by-zero errors. Default is True.
        max_tokens : int
            Only keep the `max_tokens` most frequent tokens, coding everything
            else as ``<unk>``. If None, keep all tokens. Default is None.
        input_type : {'files', 'strings'}
            If 'files', the sequence input to `fit` is expected to be a list
            of filepaths. If 'strings', the input is expected to be a list of
            strings, each holding the raw text of a single document in the
            corpus. Default is 'files'.
        filter_stopwords : bool
            Whether to remove stopwords before encoding the words in the
            corpus. Default is True.
        filter_punctuation : bool
            Whether to remove punctuation before encoding the words in the
            corpus. Default is True.
        tokenizer : {'whitespace', 'words', 'characters', 'bytes'}
            Strategy to follow when mapping strings to tokens. The
            `'whitespace'` tokenizer splits strings at whitespace characters.
            The `'words'` tokenizer splits strings using a "word" regex. The
            `'characters'` tokenizer splits strings into individual characters.
            The `'bytes'` tokenizer splits strings into a collection of
            individual bytes.

        Raises
        ------
        ValueError
            If `input_type` is neither 'files' nor 'strings'.
        """
        # create a function to filter against words in the vocab; the default
        # passes every token through untouched
        self._filter_vocab = lambda words: words
        if isinstance(vocab, Vocabulary):
            self._filter_vocab = vocab.filter
        elif isinstance(vocab, (list, tuple, np.ndarray, set, frozenset)):
            # tuples and frozensets are list-like too; previously they fell
            # through and were silently ignored
            vocab = set(vocab)
            self._filter_vocab = lambda words: [
                w if w in vocab else "<unk>" for w in words
            ]

        if input_type not in ["files", "strings"]:
            fstr = "`input_type` must be either 'files' or 'strings', but got {}"
            raise ValueError(fstr.format(input_type))

        # populated by `fit`
        self._tokens = None
        self._idx2doc = None
        self.term_freq = None
        self.idx2token = None
        self.token2idx = None
        self.inv_doc_freq = None

        # when a Vocabulary is supplied, its tokenization settings take
        # precedence over the ones passed to this constructor
        self.hyperparameters = {
            "id": "TFIDFEncoder",
            "encoding": None,
            "vocab": vocab
            if not isinstance(vocab, Vocabulary)
            else vocab.hyperparameters,
            "lowercase": lowercase,
            "min_count": min_count,
            "input_type": input_type,
            "max_tokens": max_tokens,
            "smooth_idf": smooth_idf,
            "tokenizer": tokenizer
            if not isinstance(vocab, Vocabulary)
            else vocab.hyperparameters["tokenizer"],
            "filter_stopwords": filter_stopwords
            if not isinstance(vocab, Vocabulary)
            else vocab.hyperparameters["filter_stopwords"],
            "filter_punctuation": filter_punctuation
            if not isinstance(vocab, Vocabulary)
            else vocab.hyperparameters["filter_punctuation"],
        }

    def fit(self, corpus_seq, encoding="utf-8-sig"):
        """
        Compute term-frequencies and inverse document frequencies on a
        collection of documents.

        Parameters
        ----------
        corpus_seq : str or iterable of strs
            The filepath / list of filepaths / raw string contents of the
            document(s) to be encoded, in accordance with the `input_type`
            parameter passed to the :meth:`__init__` method. A single string
            is treated as a one-document corpus. Each document is split into
            lines at newline characters before tokenization.
        encoding : str
            Specifies the text encoding for corpus if `input_type` is `files`.
            Common entries are either 'utf-8' (no header byte), or 'utf-8-sig'
            (header byte). Default is 'utf-8-sig'.

        Returns
        -------
        self
        """
        H = self.hyperparameters

        # the corpus is traversed twice (path validation, then encoding), so
        # materialize it in case a one-shot iterable was passed
        if isinstance(corpus_seq, str):
            corpus_seq = [corpus_seq]
        else:
            corpus_seq = list(corpus_seq)

        if H["input_type"] == "files":
            for corpus_fp in corpus_seq:
                assert op.isfile(corpus_fp), "{} does not exist".format(corpus_fp)

        tokens = []
        idx2token, token2idx = {}, {}

        # encode special tokens
        for tt in ["<bol>", "<eol>", "<unk>"]:
            token2idx[tt] = len(tokens)
            idx2token[len(tokens)] = tt
            tokens.append(Token(tt))

        min_count = H["min_count"]
        max_tokens = H["max_tokens"]
        H["encoding"] = encoding

        bol_ix = token2idx["<bol>"]
        eol_ix = token2idx["<eol>"]
        idx2doc, term_freq = {}, {}

        # encode the text in `corpus_seq` without any filtering ...
        for d_ix, doc in enumerate(corpus_seq):
            doc_count = {}
            idx2doc[d_ix] = doc if H["input_type"] == "files" else None
            token2idx, idx2token, tokens, doc_count = self._encode_document(
                doc, token2idx, idx2token, tokens, doc_count, bol_ix, eol_ix,
            )
            term_freq[d_ix] = doc_count

        self._tokens = tokens
        self._idx2doc = idx2doc
        self.token2idx = token2idx
        self.idx2token = idx2token
        self.term_freq = term_freq

        # ... retain only the top `max_tokens` most frequent tokens, coding
        # everything else as <unk> ...
        if max_tokens is not None and len(tokens) > max_tokens:
            self._keep_top_n_tokens()

        # ... replace all words occurring less than `min_count` by <unk>;
        # `min_count=None` means "no threshold", as in `Vocabulary` ...
        if min_count is not None:
            rarest = min(self._tokens, key=lambda t: t.count)
            if rarest.count < min_count:
                self._drop_low_freq_tokens()

        # ... sort tokens alphabetically and reindex ...
        self._sort_tokens()

        # ... finally, calculate inverse document frequency
        self._calc_idf()
        return self

    def _encode_document(
        self, doc, word2idx, idx2word, tokens, doc_count, bol_ix, eol_ix,
    ):
        """
        Perform tokenization and compute token counts for a single document.

        Parameters
        ----------
        doc : str
            A filepath if the `input_type` hyperparameter is 'files' (the file
            is read in full), otherwise the raw document text.
        word2idx, idx2word : dict
            Word <-> index tables; extended in place with unseen words.
        tokens : list
            Token objects indexed consistently with `word2idx`; corpus-wide
            counts are incremented in place and new tokens appended.
        doc_count : dict
            Per-document counts keyed by token index; updated in place.
        bol_ix, eol_ix : int
            Indices of the ``<bol>`` and ``<eol>`` tokens in `tokens`.

        Returns
        -------
        word2idx, idx2word, tokens, doc_count
            The same (mutated) objects that were passed in.
        """
        H = self.hyperparameters
        lowercase = H["lowercase"]
        filter_stop = H["filter_stopwords"]
        filter_punc = H["filter_punctuation"]

        if H["input_type"] == "files":
            with open(doc, "r", encoding=H["encoding"]) as handle:
                doc = handle.read()

        tokenizer_dict = {
            "words": tokenize_words,
            "characters": tokenize_chars,
            "whitespace": tokenize_whitespace,
            "bytes": tokenize_bytes_raw,
        }
        tokenizer = tokenizer_dict[H["tokenizer"]]

        for line in doc.split("\n"):
            words = tokenizer(
                line,
                lowercase=lowercase,
                filter_stopwords=filter_stop,
                filter_punctuation=filter_punc,
                encoding=H["encoding"],
            )

            for ww in self._filter_vocab(words):
                if ww not in word2idx:
                    word2idx[ww] = len(tokens)
                    idx2word[len(tokens)] = ww
                    tokens.append(Token(ww))

                t_idx = word2idx[ww]
                tokens[t_idx].count += 1
                doc_count[t_idx] = doc_count.get(t_idx, 0) + 1

            # wrap line in <bol> and <eol> tags (counted once per line,
            # including empty lines)
            tokens[bol_ix].count += 1
            tokens[eol_ix].count += 1

            doc_count[bol_ix] = doc_count.get(bol_ix, 0) + 1
            doc_count[eol_ix] = doc_count.get(eol_ix, 0) + 1
        return word2idx, idx2word, tokens, doc_count

    def _keep_top_n_tokens(self):
        N = self.hyperparameters["max_tokens"]
        doc_counts, word2idx, idx2word = {}, {}, {}
        tokens = sorted(self._tokens, key=lambda x: x.count, reverse=True)

        # reindex the top-N tokens...
        unk_ix = None
        for idx, tt in enumerate(tokens[:N]):
            word2idx[tt.word] = idx
            idx2word[idx] = tt.word

            if tt.word == "<unk>":
                unk_ix = idx

        # ... if <unk> isn't in the top-N, add it, replacing the Nth
        # most-frequent word and adjust the <unk> count accordingly ...
        if unk_ix is None:
            unk_ix = self.token2idx["<unk>"]
            old_count = tokens[N - 1].count
            tokens[N - 1] = self._tokens[unk_ix]
            tokens[N - 1].count += old_count
            word2idx["<unk>"] = N - 1
            idx2word[N - 1] = "<unk>"

        # ... and recode all dropped tokens as "<unk>"
        for tt in tokens[N:]:
            tokens[unk_ix].count += tt.count

        # ... finally, reindex the word counts for each document
        doc_counts = {}
        for d_ix in self.term_freq.keys():
            doc_counts[d_ix] = {}
            for old_ix, d_count in self.term_freq[d_ix].items():
                word = self.idx2token[old_ix]
                new_ix = word2idx.get(word, unk_ix)
                doc_counts[d_ix][new_ix] = doc_counts[d_ix].get(new_ix, 0) + d_count

        self._tokens = tokens[:N]
        self.token2idx = word2idx
        self.idx2token = idx2word
        self.term_freq = doc_counts

        assert len(self._tokens) <= N

    def _drop_low_freq_tokens(self):
        """
        Replace all tokens that occur less than `min_count` with the `<unk>`
        token.
        """
        H = self.hyperparameters
        unk_token = self._tokens[self.token2idx["<unk>"]]
        eol_token = self._tokens[self.token2idx["<eol>"]]
        bol_token = self._tokens[self.token2idx["<bol>"]]
        tokens = [unk_token, eol_token, bol_token]

        unk_idx = 0
        word2idx = {"<unk>": 0, "<eol>": 1, "<bol>": 2}
        idx2word = {0: "<unk>", 1: "<eol>", 2: "<bol>"}
        special = {"<eol>", "<bol>", "<unk>"}

        for tt in self._tokens:
            if tt.word not in special:
                if tt.count < H["min_count"]:
                    tokens[unk_idx].count += tt.count
                else:
                    word2idx[tt.word] = len(tokens)
                    idx2word[len(tokens)] = tt.word
                    tokens.append(tt)

        # reindex document counts
        doc_counts = {}
        for d_idx in self.term_freq.keys():
            doc_counts[d_idx] = {}
            for old_idx, d_count in self.term_freq[d_idx].items():
                word = self.idx2token[old_idx]
                new_idx = word2idx.get(word, unk_idx)
                doc_counts[d_idx][new_idx] = doc_counts[d_idx].get(new_idx, 0) + d_count

        self._tokens = tokens
        self.token2idx = word2idx
        self.idx2token = idx2word
        self.term_freq = doc_counts

    def _sort_tokens(self):
        # sort tokens alphabetically and recode
        ix = 0
        token2idx, idx2token, = (
            {},
            {},
        )
        special = ["<eol>", "<bol>", "<unk>"]
        words = sorted(self.token2idx.keys())
        term_freq = {d: {} for d in self.term_freq.keys()}

        for w in words:
            if w not in special:
                old_ix = self.token2idx[w]
                token2idx[w], idx2token[ix] = ix, w
                for d in self.term_freq.keys():
                    if old_ix in self.term_freq[d]:
                        count = self.term_freq[d][old_ix]
                        term_freq[d][ix] = count
                ix += 1

        for w in special:
            token2idx[w] = len(token2idx)
            idx2token[len(idx2token)] = w

        self.token2idx = token2idx
        self.idx2token = idx2token
        self.term_freq = term_freq
        self.vocab_counts = Counter({t.word: t.count for t in self._tokens})

    def _calc_idf(self):
        """
        Compute the (smoothed-) inverse-document frequency for each token in
        the corpus.

        For a word token `w`, the IDF is simply

            IDF(w) = log ( |D| / |{ d in D: w in d }| ) + 1

        where D is the set of all documents in the corpus,

            D = {d1, d2, ..., dD}

        If `smooth_idf` is True, we perform additive smoothing on the number of
        documents containing a given word, equivalent to pretending that there
        exists a final D+1st document that contains every word in the corpus:

            SmoothedIDF(w) = log ( |D| + 1 / [1 + |{ d in D: w in d }|] ) + 1
        """
        inv_doc_freq = {}
        smooth_idf = self.hyperparameters["smooth_idf"]
        tf, doc_idxs = self.term_freq, self._idx2doc.keys()

        D = len(self._idx2doc) + int(smooth_idf)
        for word, w_ix in self.token2idx.items():
            d_count = int(smooth_idf)
            d_count += np.sum([1 if w_ix in tf[d_ix] else 0 for d_ix in doc_idxs])
            inv_doc_freq[w_ix] = 1 if d_count == 0 else np.log(D / d_count) + 1
        self.inv_doc_freq = inv_doc_freq

    def transform(self, ignore_special_chars=True):
        """
        Generate the term-frequency inverse-document-frequency encoding of a
        text corpus.

        Parameters
        ----------
        ignore_special_chars : bool
            Whether to drop columns corresponding to "<eol>", "<bol>", and
            "<unk>" tokens from the final tfidf encoding. Default is True.

        Returns
        -------
        tfidf : numpy array of shape `(D, M [- 3])`
            The encoded corpus, with each row corresponding to a single
            document, and each column corresponding to a token id. The mapping
            between column numbers and tokens is stored in the `idx2token`
            attribute IFF `ignore_special_chars` is False. Otherwise, the
            mappings are not accurate.
        """
        D, N = len(self._idx2doc), len(self._tokens)
        tf = np.zeros((D, N))
        idf = np.zeros((D, N))

        for d_ix in self._idx2doc.keys():
            words, counts = zip(*self.term_freq[d_ix].items())
            docs = np.ones(len(words), dtype=int) * d_ix
            tf[docs, words] = counts

        words = sorted(self.idx2token.keys())
        idf = np.tile(np.array([self.inv_doc_freq[w] for w in words]), (D, 1))
        tfidf = tf * idf

        if ignore_special_chars:
            idxs = [
                self.token2idx["<unk>"],
                self.token2idx["<eol>"],
                self.token2idx["<bol>"],
            ]
            tfidf = np.delete(tfidf, idxs, 1)

        return tfidf


class Vocabulary:
    def __init__(
        self,
        lowercase=True,
        min_count=None,
        max_tokens=None,
        filter_stopwords=True,
        filter_punctuation=True,
        tokenizer="words",
    ):
        """
        An object for compiling and encoding the unique tokens in a text corpus.

        Parameters
        ----------
        lowercase : bool
            Whether to convert each string to lowercase before tokenization.
            Default is True.
        min_count : int
            Minimum number of times a token must occur in order to be included
            in vocab. If `None`, include all tokens from `corpus_fp` in vocab.
            Default is None.
        max_tokens : int
            Only add the `max_tokens` most frequent tokens that occur more
            than `min_count` to the vocabulary.  If None, add all tokens
            that occur more than than `min_count`. Default is None.
        filter_stopwords : bool
            Whether to remove stopwords before encoding the words in the
            corpus. Default is True.
        filter_punctuation : bool
            Whether to remove punctuation before encoding the words in the
            corpus. Default is True.
        tokenizer : {'whitespace', 'words', 'characters', 'bytes'}
            Strategy to follow when mapping strings to tokens. The
            `'whitespace'` tokenizer splits strings at whitespace characters.
            The `'words'` tokenizer splits strings using a "word" regex. The
            `'characters'` tokenizer splits strings into individual characters.
            The `'bytes'` tokenizer splits strings into a collection of
            individual bytes.
        """
        self.hyperparameters = {
            "id": "Vocabulary",
            "encoding": None,
            "corpus_fps": None,
            "lowercase": lowercase,
            "min_count": min_count,
            "max_tokens": max_tokens,
            "filter_stopwords": filter_stopwords,
            "filter_punctuation": filter_punctuation,
            "tokenizer": tokenizer,
        }

    def __len__(self):
        """Return the number of tokens in the vocabulary"""
        return len(self._tokens)

    def __iter__(self):
        """Return an iterator over the tokens in the vocabulary"""
        return iter(self._tokens)

    def __contains__(self, word):
        """Assert whether `word` is a token in the vocabulary"""
        return word in self.token2idx

    def __getitem__(self, key):
        """
        Return the token (if key is an integer) or the index (if key is a string)
        for the key in the vocabulary, if it exists.
        """
        if isinstance(key, str):
            return self._tokens[self.token2idx[key]]
        if isinstance(key, int):
            return self._tokens[key]

    @property
    def n_tokens(self):
        """The number of unique word tokens in the vocabulary"""
        return len(self.token2idx)

    @property
    def n_words(self):
        """The total number of words in the corpus"""
        return sum(self.counts.values())

    @property
    def shape(self):
        """The number of unique word tokens in the vocabulary"""
        return self._tokens.shape

    def most_common(self, n=5):
        """Return the top `n` most common tokens in the corpus"""
        return self.counts.most_common()[:n]

    def words_with_count(self, k):
        """Return all tokens that occur `k` times in the corpus"""
        return [w for w, c in self.counts.items() if c == k]

    def filter(self, words, unk=True):  # noqa: A003
        """
        Filter (or replace) any word in `words` that is not present in
        `Vocabulary`.

        Parameters
        ----------
        words : list of strs
            A list of words to filter
        unk : bool
            Whether to replace any out of vocabulary words in `words` with the
            ``<unk>`` token (True) or skip them entirely (False).  Default is
            True.

        Returns
        -------
        filtered : list of strs
            The list of words filtered against the words in Vocabulary.
        """
        if unk:
            return [w if w in self else "<unk>" for w in words]
        return [w for w in words if w in self]

    def words_to_indices(self, words):
        """
        Convert the words in `words` to their token indices. If a word is not
        in the vocabulary, return the index for the ``<unk>`` token

        Parameters
        ----------
        words : list of strs
            A list of words to filter

        Returns
        -------
        indices : list of ints
            The token indices for each word in `words`
        """
        unk_ix = self.token2idx["<unk>"]
        lowercase = self.hyperparameters["lowercase"]
        words = [w.lower() for w in words] if lowercase else words
        return [self.token2idx[w] if w in self else unk_ix for w in words]

    def indices_to_words(self, indices):
        """
        Convert the indices in `indices` to their word values. If an index is
        not in the vocabulary, return the ``<unk>`` token.

        Parameters
        ----------
        indices : list of ints
            The token indices for each word in `words`

        Returns
        -------
        words : list of strs
            The word strings corresponding to each token index in `indices`
        """
        unk = "<unk>"
        return [self.idx2token[i] if i in self.idx2token else unk for i in indices]

    def fit(self, corpus_fps, encoding="utf-8-sig"):
        """
        Compute the vocabulary across a collection of documents.

        Parameters
        ----------
        corpus_fps : str or list of strs
            The filepath / list of filepaths for the document(s) to be encoded.
            Each file is read line by line and every line is passed to the
            configured tokenizer.
        encoding : str
            Specifies the text encoding for corpus. Common entries are either
            'utf-8' (no header byte), or 'utf-8-sig' (header byte). Default is
            'utf-8-sig'.

        Returns
        -------
        self
        """
        if isinstance(corpus_fps, str):
            corpus_fps = [corpus_fps]

        for corpus_fp in corpus_fps:
            assert op.isfile(corpus_fp), "{} does not exist".format(corpus_fp)

        params = self.hyperparameters
        tokens, word2idx, idx2word = [], {}, {}

        tokenize = {
            "words": tokenize_words,
            "characters": tokenize_chars,
            "whitespace": tokenize_whitespace,
            "bytes": tokenize_bytes_raw,
        }[params["tokenizer"]]
        tokenize_kwargs = {
            "lowercase": params["lowercase"],
            "filter_stopwords": params["filter_stopwords"],
            "filter_punctuation": params["filter_punctuation"],
            "encoding": encoding,
        }

        min_count = params["min_count"]
        max_tokens = params["max_tokens"]

        params["encoding"] = encoding
        params["corpus_fps"] = corpus_fps

        def index_of(word):
            # look up `word`, registering a zero-count Token on first sight
            if word not in word2idx:
                word2idx[word] = len(tokens)
                idx2word[len(tokens)] = word
                tokens.append(Token(word))
            return word2idx[word]

        # encode special tokens
        for special in ["<bol>", "<eol>", "<unk>"]:
            index_of(special)

        bol_token = tokens[word2idx["<bol>"]]
        eol_token = tokens[word2idx["<eol>"]]

        for doc_fp in corpus_fps:
            with open(doc_fp, "r", encoding=params["encoding"]) as doc:
                for line in doc:
                    for ww in tokenize(line, **tokenize_kwargs):
                        tokens[index_of(ww)].count += 1

                    # wrap line in <bol> and <eol> tags
                    bol_token.count += 1
                    eol_token.count += 1

        self._tokens = tokens
        self.token2idx = word2idx
        self.idx2token = idx2word

        # replace all words occurring less than `min_count` by <unk>
        if min_count is not None:
            self._drop_low_freq_tokens()

        # retain only the top `max_tokens` most frequent tokens, coding
        # everything else as <unk>; note the size compared here is that of
        # the token list built above, before any low-frequency filtering
        if max_tokens is not None and len(tokens) > max_tokens:
            self._keep_top_n_tokens()

        word_counts = {}
        for w, ix in self.token2idx.items():
            word_counts[w] = self._tokens[ix].count
        self.counts = Counter(word_counts)
        self._tokens = np.array(self._tokens)
        return self

    def _keep_top_n_tokens(self):
        word2idx, idx2word = {}, {}
        N = self.hyperparameters["max_tokens"]
        tokens = sorted(self._tokens, key=lambda x: x.count, reverse=True)

        # reindex the top-N tokens...
        unk_ix = None
        for idx, tt in enumerate(tokens[:N]):
            word2idx[tt.word] = idx
            idx2word[idx] = tt.word

            if tt.word == "<unk>":
                unk_ix = idx

        # ... if <unk> isn't in the top-N, add it, replacing the Nth
        # most-frequent word and adjusting the <unk> count accordingly ...
        if unk_ix is None:
            unk_ix = self.token2idx["<unk>"]
            old_count = tokens[N - 1].count
            tokens[N - 1] = self._tokens[unk_ix]
            tokens[N - 1].count += old_count
            word2idx["<unk>"] = N - 1
            idx2word[N - 1] = "<unk>"

        # ... and recode all dropped tokens as "<unk>"
        for tt in tokens[N:]:
            tokens[unk_ix].count += tt.count

        self._tokens = tokens[:N]
        self.token2idx = word2idx
        self.idx2token = idx2word

        assert len(self._tokens) <= N

    def _drop_low_freq_tokens(self):
        """
        Replace all tokens that occur less than `min_count` with the `<unk>`
        token.
        """
        unk_idx = 0
        unk_token = self._tokens[self.token2idx["<unk>"]]
        eol_token = self._tokens[self.token2idx["<eol>"]]
        bol_token = self._tokens[self.token2idx["<bol>"]]

        H = self.hyperparameters
        tokens = [unk_token, eol_token, bol_token]
        word2idx = {"<unk>": 0, "<eol>": 1, "<bol>": 2}
        idx2word = {0: "<unk>", 1: "<eol>", 2: "<bol>"}
        special = {"<eol>", "<bol>", "<unk>"}

        for tt in self._tokens:
            if tt.word not in special:
                if tt.count < H["min_count"]:
                    tokens[unk_idx].count += tt.count
                else:
                    word2idx[tt.word] = len(tokens)
                    idx2word[len(tokens)] = tt.word
                    tokens.append(tt)

        self._tokens = tokens
        self.token2idx = word2idx
        self.idx2token = idx2word


================================================
FILE: numpy_ml/rl_models/README.md
================================================
# RL Models
The `agents.py` module implements a number of standard reinforcement learning (RL) agents that
can be run on [OpenAI gym](https://gym.openai.com/) environments.

1. **Monte Carlo Methods**
    - First-visit Monte Carlo updates (on-policy)
    - Incremental weighted importance sampling (off-policy)
    - Cross-entropy method ([Mannor, Rubinstein, & Gat, 2003](https://www.aaai.org/Papers/ICML/2003/ICML03-068.pdf))

2. **Temporal-Difference Methods**
    - SARSA (on-policy) ([Rummery & Niranjan, 1994](http://mi.eng.cam.ac.uk/reports/svr-ftp/auto-pdf/rummery_tr166.pdf))
    - Q-learning (off-policy) ([Watkins, 1989](https://www.researchgate.net/profile/Christopher_Watkins2/publication/33784417_Learning_From_Delayed_Rewards/links/53fe12e10cf21edafd142e03.pdf))

3. **Model-Based Methods**
    - Dyna-Q/Dyna-Q+ with prioritized sweeping ([Sutton, 1990](http://papersdb.cs.ualberta.ca/~papersdb/uploaded_files/505/paper_sutton-90.pdf); [Moore & Atkeson, 1993](https://link.springer.com/content/pdf/10.1007/BF00993104.pdf))

## Plots
<p align="center">
<img src="img/MonteCarloAgent-Copy-v0.png" align='center' height="400" />

<img src="img/TemporalDifferenceAgent-Taxi-v2.png" align='center' height="400" />

<img src="img/CrossEntropyAgent-LunarLander-v2.png" align='center' height="400" />

<img src="img/DynaAgent-Taxi-v2.png" align='center' height="400" />
</p>


================================================
FILE: numpy_ml/rl_models/__init__.py
================================================
from . import rl_utils
from . import agents
from . import trainer
from . import tiles


================================================
FILE: numpy_ml/rl_models/agents.py
================================================
"""Reinforcement learning agents that can be run on OpenAI gym environs"""

from abc import ABC, abstractmethod
from collections import defaultdict

import numpy as np

from .rl_utils import EnvModel, env_stats, tile_state_space
from ..utils.data_structures import Dict


class AgentBase(ABC):
    """Abstract base class for the agents defined in this module."""

    def __init__(self, env):
        """
        Store `env` and initialise empty parameter containers.

        Parameters
        ----------
        env : environment instance
            Kept on `self.env`; also passed to `env_stats`, whose result is
            stored on `self.env_info`.
        """
        super().__init__()
        self.env = env
        self.parameters, self.hyperparameters, self.derived_variables = {}, {}, {}
        self.env_info = env_stats(env)

    def _create_2num_dicts(self, obs_encoder=None, act_encoder=None):
        """
        Build the lookup tables between actions / observations and integer
        ids (`_action2num`, `_num2action`, `_obs2num`, `_num2obs`).

        When the product of the per-dimension action (observation) counts in
        `self.env_info` is finite, the tables enumerate the ids listed in
        `env_info`. Otherwise `Dict` containers are used, with `act_encoder`
        (`obs_encoder`) passed to the item -> number container.
        """
        info = self.env_info
        n_states = np.prod(info["n_obs_per_dim"])
        n_actions = np.prod(info["n_actions_per_dim"])

        # action <-> scalar tables
        self._num2action = Dict()
        self._action2num = Dict(act_encoder)
        if n_actions != np.inf:
            action_numbers = {}
            for num, action in enumerate(info["action_ids"]):
                action_numbers[action] = num
            self._action2num = action_numbers
            self._num2action = {num: action for action, num in action_numbers.items()}

        # observation <-> scalar tables
        self._num2obs = Dict()
        self._obs2num = Dict(obs_encoder)
        if n_states != np.inf:
            obs_numbers = {}
            for num, obs in enumerate(info["obs_ids"]):
                obs_numbers[obs] = num
            self._obs2num = obs_numbers
            self._num2obs = {num: obs for obs, num in obs_numbers.items()}

    def flush_history(self):
        """Reset every entry of `self.episode_history` to an empty list"""
        history = self.episode_history
        for key in history:
            history[key] = []

    @abstractmethod
    def act(self, obs):
        """Generate an action given the current observation `obs`"""
        raise NotImplementedError

    @abstractmethod
    def greedy_policy(self, **kwargs):
        """
        Take a greedy action.

        Returns
        -------
        total_reward : float
            The total reward on the episode.
        n_steps : float
            The total number of steps taken on the episode.
        """
        raise NotImplementedError

    @abstractmethod
    def run_episode(self, max_steps, render=False):
        """
        Run the agent on a single episode.

        Parameters
        ----------
        max_steps : int
            The maximum number of steps to run an episode.
        render : bool
            Whether to render the episode during training.

        Returns
        -------
        reward : float
            The total reward on the episode (a subclass may report an
            average, e.g. over several parameter samples).
        steps : float
            The total number of steps taken on the episode (likewise possibly
            an average).
        """
        raise NotImplementedError

    @abstractmethod
    def update(self):
        r"""
        Update the agent parameters according to the rewards accrued on the
        current episode.

        Returns
        -------
        avg_reward : float
            An average reward for the current episode; which samples or steps
            enter the average is defined by the subclass.
        """
        raise NotImplementedError


class CrossEntropyAgent(AgentBase):
    def __init__(self, env, n_samples_per_episode=500, retain_prcnt=0.2):
        r"""
        A cross-entropy method agent.

        Notes
        -----
        The cross-entropy method [1]_ [2]_ agent only operates on ``envs`` with
        discrete action spaces.

        On each episode the agent draws `n_samples_per_episode` candidate
        parameter vectors (:math:`\theta`) for its behavior policy. The `i`'th
        sample on episode `t` is:

        .. math::

            \theta_i  &=  \{\mathbf{W}_i^{(t)}, \mathbf{b}_i^{(t)} \} \\
            \theta_i  &\sim  \mathcal{N}(\mu^{(t)}, \Sigma^{(t)})

        The weights (:math:`\mathbf{W}_i`) and bias (:math:`\mathbf{b}_i`)
        parameterize a softmax policy from which actions are sampled:

        .. math::

            \mathbf{z}_i  &=  \text{obs} \cdot \mathbf{W}_i + \mathbf{b}_i \\
            p(a_i^{(t + 1)})  &=  \frac{e^{\mathbf{z}_i}}{\sum_j e^{z_{ij}}} \\
            a^{(t + 1)}  &\sim  p(a^{(t + 1)})

        Once every sample has been evaluated, the top `retain_prcnt`
        highest-scoring :math:`\theta` samples are used to re-estimate the
        mean and (diagonal) variance of the sampling distribution for the next
        episode:

        .. math::

            \mu^{(t+1)}  &=  \text{avg}(\texttt{best_thetas}^{(t)}) \\
            \Sigma^{(t+1)}  &=  \text{var}(\texttt{best_thetas}^{(t)})

        References
        ----------
        .. [1] Mannor, S., Rubinstein, R., & Gat, Y. (2003). The cross entropy
           method for fast policy search. In *Proceedings of the 20th Annual
           ICML, 20*.
        .. [2] Rubinstein, R. (1997). Optimization of computer simulation
           models with rare events, *European Journal of Operational Research,
           99*, 89–112.

        Parameters
        ----------
        env : :meth:`gym.wrappers` or :meth:`gym.envs` instance
            The environment to run the agent on.
        n_samples_per_episode : int
            The number of theta samples to evaluate on each episode. Default is
            500.
        retain_prcnt : float
            The fraction of the `n_samples_per_episode` samples to keep when
            computing the parameter update at the end of the episode. Default
            is 0.2.
        """
        super().__init__(env)

        # hyperparameters must be in place before `_init_params` reads them
        self.n_samples_per_episode = n_samples_per_episode
        self.retain_prcnt = retain_prcnt

        self._init_params()

    def _init_params(self):
        E = self.env_info
        assert not E["continuous_actions"], "Action space must be discrete"

        self._create_2num_dicts()
        b_len = np.prod(E["n_actions_per_dim"])
        W_len = b_len * np.prod(E["obs_dim"])
        theta_dim = b_len + W_len

        # init mean and variance for mv gaussian with dimensions theta_dim
        theta_mean = np.random.rand(theta_dim)
        theta_var = np.ones(theta_dim)

        self.parameters = {"theta_mean": theta_mean, "theta_var": theta_var}
        self.derived_variables = {
            "b_len": b_len,
            "W_len": W_len,
            "W_samples": [],
            "b_samples": [],
            "episode_num": 0,
            "cumulative_rewards": [],
        }

        self.hyperparameters = {
            "agent": "CrossEntropyAgent",
            "retain_prcnt": self.retain_prcnt,
            "n_samples_per_episode": self.n_samples_per_episode,
        }

        self.episode_history = {"rewards": [], "state_actions": []}

    def act(self, obs):
        r"""
        Generate actions according to a softmax policy.

        Notes
        -----
        The softmax policy assumes that the pmf over actions in state :math:`x_t` is
        given by:

        .. math::

            \pi(a | x^{(t)}) = \text{softmax}(
                \text{obs}^{(t)} \cdot \mathbf{W}_i^{(t)} + \mathbf{b}_i^{(t)} )

        where :math:`\mathbf{W}` is a learned weight matrix, `obs` is the observation
        at timestep `t`, and **b** is a learned bias vector.

        Parameters
        ----------
        obs : int or :py:class:`ndarray <numpy.ndarray>`
            An observation from the environment.

        Returns
        -------
        action : int, float, or :py:class:`ndarray <numpy.ndarray>`
            An action sampled from the distribution over actions defined by the
            softmax policy.
        """
        E, P = self.env_info, self.parameters
        W, b = P["W"], P["b"]

        s = self._obs2num[obs]
        s = np.array([s]) if E["obs_dim"] == 1 else s

        # compute softmax
        Z = s.T @ W + b
        e_Z = np.exp(Z - np.max(Z, axis=-1, keepdims=True))
        action_probs = e_Z / e_Z.sum(axis=-1, keepdims=True)

        # sample action
        a = np.random.multinomial(1, action_probs).argmax()
        return self._num2action[a]

    def run_episode(self, max_steps, render=False):
        """
        Run the agent on a single episode.

        Parameters
        ----------
        max_steps : int
            The maximum number of steps to run an episode
        render : bool
            Whether to render the episode during training

        Returns
        -------
        reward : float
            The total reward on the episode, averaged over the theta samples.
        steps : float
            The total number of steps taken on the episode, averaged over the
            theta samples.
        """
        self._sample_thetas()

        E, D = self.env_info, self.derived_variables
        n_actions = np.prod(E["n_actions_per_dim"])
        W_len, obs_dim = D["W_len"], E["obs_dim"]
        steps, rewards = [], []

        for theta in D["theta_samples"]:
            W = theta[:W_len].reshape(obs_dim, n_actions)
            b = theta[W_len:]

            total_rwd, n_steps = self._episode(W, b, max_steps, render)
            rewards.append(total_rwd)
            steps.append(n_steps)

        # return the average reward and average number of steps across all
        # samples on the current episode
        D["episode_num"] += 1
        D["cumulative_rewards"] = rewards
        return np.mean(D["cumulative_rewards"]), np.mean(steps)

    def _episode(self, W, b, max_steps, render):
        """
        Run the agent for an episode.

        Parameters
        ----------
        W : :py:class:`ndarray <numpy.ndarray>` of shape `(obs_dim, n_actions)`
            The weights for the softmax policy.
        b : :py:class:`ndarray <numpy.ndarray>` of shape `(bias_len, )`
            The bias for the softmax policy.
        max_steps : int
            The maximum number of steps to run the episode.
        render : bool
            Whether to render the episode during training.

        Returns
        -------
        reward : float
            The total reward on the episode.
        steps : float
            The total number of steps taken on the episode.
        """
        rwds, sa = [], []
        H = self.episode_history
        total_reward, n_steps = 0.0, 1
        obs = self.env.reset()

        self.parameters["W"] = W
        self.parameters["b"] = b

        for i in range(max_steps):
            if render:
                self.env.render()

            n_steps += 1
            action = self.act(obs)
            s, a = self._obs2num[obs], self._action2num[action]
            sa.append((s, a))

            obs, reward, done, _ = self.env.step(action)
            rwds.append(reward)
            total_reward += reward

            if done:
                break

        H["rewards"].append(rwds)
        H["state_actions"].append(sa)
        return total_reward, n_steps

    def update(self):
        r"""
        Update :math:`\mu` and :math:`\Sigma` according to the rewards accrued on
        the current episode.

        Returns
        -------
        avg_reward : float
            The average reward earned by the best `retain_prcnt` theta samples
            on the current episode.
        """
        D, P = self.derived_variables, self.parameters
        n_retain = int(self.retain_prcnt * self.n_samples_per_episode)

        # sort the cumulative rewards for each theta sample from greatest to least
        sorted_y_val_idxs = np.argsort(D["cumulative_rewards"])[::-1]
        top_idxs = sorted_y_val_idxs[:n_retain]

        # update theta_mean and theta_var with the best theta value
        P["theta_mean"] = np.mean(D["theta_samples"][top_idxs], axis=0)
        P["theta_var"] = np.var(D["theta_samples"][top_idxs], axis=0)

    def _sample_thetas(self):
        """
        Sample `n_samples_per_episode` thetas from a multivariate Gaussian with
        mean `theta_mean` and covariance `diag(theta_var)`
        """
        P, N = self.parameters, self.n_samples_per_episode
        Mu, Sigma = P["theta_mean"], np.diag(P["theta_var"])
        samples = np.random.multivariate_normal(Mu, Sigma, N)
        self.derived_variables["theta_samples"] = samples

    def greedy_policy(self, max_steps, render=True):
        """
        Execute a greedy policy using the current agent parameters.

        Parameters
        ----------
        max_steps : int
            The maximum number of steps to run the episode.
        render : bool
            Whether to render the episode during execution.

        Returns
        -------
        total_reward : float
            The total reward on the episode.
        n_steps : float
            The total number of steps taken on the episode.
        """
        E, D, P = self.env_info, self.derived_variables, self.parameters
        Mu, Sigma = P["theta_mean"], np.diag(P["theta_var"])
        sample = np.random.multivariate_normal(Mu, Sigma, 1)

        W_len, obs_dim = D["W_len"], E["obs_dim"]
        n_actions = np.prod(E["n_actions_per_dim"])

        W = sample[0, :W_len].reshape(obs_dim, n_actions)
        b = sample[0, W_len:]
        total_reward, n_steps = self._episode(W, b, max_steps, render)
        return total_reward, n_steps


class MonteCarloAgent(AgentBase):
    def __init__(self, env, off_policy=False, temporal_discount=0.9, epsilon=0.1):
        """
        A Monte-Carlo learning agent.

        The agent is trained with first-visit Monte Carlo updates when
        `off_policy` is False, and with incremental weighted importance
        sampling when `off_policy` is True.

        Parameters
        ----------
        env : :class:`gym.wrappers` or :class:`gym.envs` instance
            The environment to run the agent on.
        off_policy : bool
            Whether to use a behavior policy separate from the target policy
            during training. If False, use the same epsilon-soft policy for
            both behavior and target policies. Default is False.
        temporal_discount : float between [0, 1]
            The discount factor used for downweighting future rewards. Smaller
            values result in greater discounting of future rewards. Default is
            0.9.
        epsilon : float between [0, 1]
            The epsilon value in the epsilon-soft policy. Larger values
            encourage greater exploration during training. Default is 0.1.
        """
        super().__init__(env)

        # hyperparameters must be in place before `_init_params` reads them
        self.off_policy = off_policy
        self.temporal_discount = temporal_discount
        self.epsilon = epsilon

        self._init_params()

    def _init_params(self):
        E = self.env_info
        assert not E["continuous_actions"], "Action space must be discrete"
        assert not E["continuous_observations"], "Observation space must be discrete"

        n_states = np.prod(E["n_obs_per_dim"])
        n_actions = np.prod(E["n_actions_per_dim"])

        self._create_2num_dicts()

        # behavior policy is stochastic, epsilon-soft policy
        self.behavior_policy = self.target_policy = self._epsilon_soft_policy
        if self.off_policy:
            self.parameters["C"] = np.zeros((n_states, n_actions))

            # target policy is deterministic, greedy policy
            self.target_policy = self._greedy

        # initialize Q function
        self.parameters["Q"] = np.random.rand(n_states, n_actions)

        # initialize returns object for each state-action pair
        self.derived_variables = {
            "returns": {(s, a): [] for s in range(n_states) for a in range(n_actions)},
            "episode_num": 0,
        }

        self.hyperparameters = {
            "agent": "MonteCarloAgent",
            "epsilon": self.epsilon,
            "off_policy": self.off_policy,
            "temporal_discount": self.temporal_discount,
        }

        self.episode_history = {"state_actions": [], "rewards": []}

    def _epsilon_soft_policy(self, s, a=None):
        r"""
        Epsilon-soft exploration policy.

        Notes
        -----
        Soft policies are necessary for first-visit Monte Carlo methods, as
        they require continual exploration (i.e., each state-action pair must
        have nonzero probability of occurring).

        In epsilon-soft policies, :math:`\pi(a \mid s) > 0` for all :math:`s
        \in S` and all :math:`a \in A(s)` at the start of training. As learning
        progresses, :math:`pi` gradually shifts closer and closer to a
        deterministic optimal policy.

        In particular, we have:

        .. math::

            \pi(a \mid s)  &=
                1 - \epsilon + \frac{\epsilon}{|A(s)|}  &&\text{if} a = a^*
            \pi(a \mid s)  &=
                \frac{\epsilon}{|A(s)|}                 &&\text{if} a \neq a^*

        where :math:`|A(s)|` is the number of actions available in state `s`
        and :math:`a^* \in A(s)` is the greedy action in state `s` (i.e.,
        :math:`a^* = \arg \max_a Q(s, a)`).

        Note that epsilon-greedy policies are instances of epsilon-soft
        policies, defined as policies for which :math:`\pi(a|s) \geq \epsilon / |A(s)|`
        for all states and actions.

        Parameters
        ----------
        s : int, float, or tuple
            The state number for the current observation, as returned by
            ``_obs2num[obs]``.
        a : int, float, tuple, or None
            The action number in the current state, as returned by
            ``self._action2num[obs]``. If None, sample an action from the
            action probabilities in state `s`, otherwise, return the
            probability of action `a` under the epsilon-soft policy. Default is
            None.

        Returns
        -------
        action : int, float, or :py:class:`ndarray <numpy.ndarray>`
            If `a` is None, this is an action sampled from the distribution
            over actions defined by the epsilon-soft policy. If `a` is not
            None, this is the probability of `a` under the epsilon-soft policy.
        """
        E, P = self.env_info, self.parameters

        # TODO: this assumes all actions are available in every state
        n_actions = np.prod(E["n_actions_per_dim"])

        a_star = P["Q"][s, :].argmax()
        p_a_star = 1.0 - self.epsilon + (self.epsilon / n_actions)
        p_a = self.epsilon / n_actions

        action_probs = np.ones(n_actions) * p_a
        action_probs[a_star] = p_a_star
        np.testing.assert_allclose(np.sum(action_probs), 1)

        if a is not None:
            return action_probs[a]

        # sample action
        a = np.random.multinomial(1, action_probs).argmax()
        return self._num2action[a]

    def _greedy(self, s, a=None):
        """
        A greedy behavior policy.

        Notes
        -----
        Only used when off-policy is True.

        Parameters
        ----------
        s : int, float, or tuple
            The state number for the current observation, as returned by
            ``self._obs2num[obs]``.
        a : int, float, or tuple
            The action number in the current state, as returned by
            ``self._action2num[obs]``. If None, sample an action from the action
            probabilities in state `s`, otherwise, return the probability of
            action `a` under the greedy policy. Default is None.

        Returns
        -------
        action : int, float, or :py:class:`ndarray <numpy.ndarray>`
            If `a` is None, this is an action sampled from the distribution
            over actions defined by the greedy policy. If `a` is not
            None, this is the probability of `a` under the greedy policy.
        """
        a_star = self.parameters["Q"][s, :].argmax()
        if a is None:
            out = self._num2action[a_star]
        else:
            out = 1 if a == a_star else 0
        return out

    def _on_policy_update(self):
        r"""
        Update the `Q` function using an on-policy first-visit Monte Carlo
        update.

        Notes
        -----
        The on-policy first-visit Monte Carlo update is

        .. math::

            Q'(s, a) \leftarrow
                \text{avg}(\text{reward following first visit to } (s, a)
                \text{ across all episodes})

        RL agents seek to learn action values conditional on subsequent optimal
        behavior, but they need to behave non-optimally in order to explore all
        actions (to find the optimal actions).

        The on-policy approach is a compromise -- it learns action values not
        for the optimal policy, but for a *near*-optimal policy that still
        explores (the epsilon-soft policy).
        """
        D, P, HS = self.derived_variables, self.parameters, self.episode_history

        ep_rewards = HS["rewards"]
        sa_tuples = set(HS["state_actions"])

        locs = [HS["state_actions"].index(sa) for sa in sa_tuples]
        cumulative_returns = [np.sum(ep_rewards[i:]) for i in locs]

        # update Q value with the average of the first-visit return across
        # episodes
        for (s, a), cr in zip(sa_tuples, cumulative_returns):
            D["returns"][(s, a)].append(cr)
            P["Q"][s, a] = np.mean(D["returns"][(s, a)])

    def _off_policy_update(self):
        """
        Update `Q` using weighted importance sampling.

        Notes
        -----
        In importance sampling updates, we account for the fact that we are
        updating a different policy from the one we used to generate behavior
        by weighting the accumulated rewards by the ratio of the probability of
        the trajectory under the target policy versus its probability under
        the behavior policies. This is known as the importance sampling weight.

        In weighted importance sampling, we scale the accumulated rewards for a
        trajectory by their importance sampling weight, then take the
        *weighted* average using the importance sampling weight. This weighted
        average then becomes the value for the trajectory.

            W   = importance sampling weight
            G_t = total discounted reward from time t until episode end
            C_n = sum of importance weights for the first n rewards

        This algorithm converges to Q* in the limit.
        """
        P = self.parameters
        HS = self.episode_history
        ep_rewards = HS["rewards"]
        T = len(ep_rewards)

        G, W = 0.0, 1.0
        for t in reversed(range(T)):
            s, a = HS["state_actions"][t]
            G = self.temporal_discount * G + ep_rewards[t]
            P["C"][s, a] += W

            # update Q(s, a) using weighted importance sampling
            P["Q"][s, a] += (W / P["C"][s, a]) * (G - P["Q"][s, a])

            # multiply the importance sampling ratio by the current weight
            W *= self.target_policy(s, a) / self.behavior_policy(s, a)

            if W == 0.0:
                break

    def act(self, obs):
        r"""
        Execute the behavior policy--an :math:`\epsilon`-soft policy used to
        generate actions during training.

        Parameters
        ----------
        obs : int, float, or :py:class:`ndarray <numpy.ndarray>` as returned by ``env.step(action)``
            An observation from the environment.

        Returns
        -------
        action : int, float, or :py:class:`ndarray <numpy.ndarray>`
            An action sampled from the distribution over actions defined by the
            epsilon-soft policy.
        """  # noqa: E501
        s = self._obs2num[obs]
        return self.behavior_policy(s)

    def run_episode(self, max_steps, render=False):
        """
        Run the agent on a single episode.

        Parameters
        ----------
        max_steps : int
            The maximum number of steps to run an episode.
        render : bool
            Whether to render the episode during training.

        Returns
        -------
        reward : float
            The total reward on the episode.
        steps : float
            The number of steps taken on the episode.
        """
        D = self.derived_variables
        total_rwd, n_steps = self._episode(max_steps, render)

        D["episode_num"] += 1
        return total_rwd, n_steps

    def _episode(self, max_steps, render):
        """
        Execute agent on an episode.

        Parameters
        ----------
        max_steps : int
            The maximum number of steps to run the episode.
        render : bool
            Whether to render the episode during training.

        Returns
        -------
        reward : float
            The total reward on the episode.
        steps : float
            The number of steps taken on the episode.
        """
        obs = self.env.reset()
        HS = self.episode_history
        total_reward, n_steps = 0.0, 0

        for i in range(max_steps):
            if render:
                self.env.render()

            n_steps += 1
            action = self.act(obs)

            s = self._obs2num[obs]
            a = self._action2num[action]

            # store (state, action) tuple
            HS["state_actions"].append((s, a))

            # take action
            obs, reward, done, info = self.env.step(action)

            # record rewards
            HS["rewards"].append(reward)
            total_reward += reward

            if done:
                break

        return total_reward, n_steps

    def update(self):
        """
        Update the parameters of the model following the completion of an
        episode. Flush the episode history after the update is complete.
        """
        H = self.hyperparameters
        if H["off_policy"]:
            self._off_policy_update()
        else:
            self._on_policy_update()

        self.flush_history()

    def greedy_policy(self, max_steps, render=True):
        """
        Execute a greedy policy using the current agent parameters.

        Parameters
        ----------
        max_steps : int
            The maximum number of steps to run the episode.
        render : bool
            Whether to render the episode during execution.

        Returns
        -------
        total_reward : float
            The total reward on the episode.
        n_steps : float
            The total number of steps taken on the episode.
        """
        H = self.episode_history
        obs = self.env.reset()

        total_reward, n_steps = 0.0, 0
        for i in range(max_steps):
            if render:
                self.env.render()

            n_steps += 1
            action = self._greedy(obs)

            s = self._obs2num[obs]
            a = self._action2num[action]

            # store (state, action) tuple
            H["state_actions"].append((s, a))

            # take action
            obs, reward, done, info = self.env.step(action)

            # record rewards
            H["rewards"].append(reward)
            total_reward += reward

            if done:
                break

        return total_reward, n_steps


class TemporalDifferenceAgent(AgentBase):
    def __init__(
        self,
        env,
        lr=0.4,
        epsilon=0.1,
        n_tilings=8,
        obs_max=None,
        obs_min=None,
        grid_dims=None,
        off_policy=False,
        temporal_discount=0.99,
    ):
        r"""
        A temporal difference learning agent with expected SARSA (on-policy) [3]_ or
        TD(0) `Q`-learning (off-policy) [4]_ updates.

        Notes
        -----
        The expected SARSA on-policy TD(0) update is:

        .. math::

            Q(s, a) \leftarrow Q(s, a) + \eta \left(
                r + \gamma \mathbb{E}_\pi[Q(s', a') \mid s'] - Q(s, a)
            \right)

        and the TD(0) off-policy Q-learning update is:

        .. math::

            Q(s, a) \leftarrow Q(s, a) + \eta (
                r + \gamma \max_a \left\{ Q(s', a) \right\} - Q(s, a)
            )

        where in each case we have taken action `a` in state `s`, received
        reward `r`, and transitioned into state :math:`s'`. In the above
        equations, :math:`\eta` is a learning rate parameter, :math:`\gamma` is
        a temporal discount factor, and :math:`\mathbb{E}_\pi[ Q[s', a'] \mid
        s']` is the expected value under the current policy :math:`\pi` of the
        Q function conditioned that we are in state :math:`s'`.

        Observe that the expected SARSA update can be used for both on- and
        off-policy methods. In an off-policy context, if the target policy is
        greedy and the expectation is taken wrt. the target policy then the
        expected SARSA update is exactly Q-learning.

        NB. For this implementation the agent requires a discrete action
        space, but will try to discretize the observation space via tiling if
        it is continuous.

        References
        ----------
        .. [3] Rummery, G. & Niranjan, M. (1994). *On-Line Q-learning Using
           Connectionist Systems*. Tech Report 166. Cambridge University
           Department of Engineering.
        .. [4] Watkins, C. (1989). Learning from delayed rewards. *PhD thesis,
           Cambridge University*.

        Parameters
        ----------
        env : gym.wrappers or gym.envs instance
            The environment to run the agent on.
        lr : float
            Learning rate for the Q function updates. Default is 0.4.
        epsilon : float between [0, 1]
            The epsilon value in the epsilon-soft policy. Larger values
            encourage greater exploration during training. Default is 0.1.
        n_tilings : int
            The number of overlapping tilings to use if the ``env`` observation
            space is continuous. Unused if observation space is discrete.
            Default is 8.
        obs_max : float or :py:class:`ndarray <numpy.ndarray>`
            The value to treat as the max value of the observation space when
            calculating the grid widths if the observation space is continuous.
            If None, use ``env.observation_space.high``. Unused if observation
            space is discrete. Default is None.
        obs_min : float or :py:class:`ndarray <numpy.ndarray>`
            The value to treat as the min value of the observation space when
            calculating grid widths if the observation space is continuous. If
            None, use ``env.observation_space.low``. Unused if observation
            space is discrete. Default is None.
        grid_dims : list or None
           The number of rows and columns in each tiling grid if the env
           observation space is continuous. Unused if observation space is
           discrete. If None, use ``[8, 8]``. Default is None.
        off_policy : bool
            Whether to use a behavior policy separate from the target policy
            during training. If False, use the same epsilon-soft policy for
            both behavior and target policies. Default is False.
        temporal_discount : float between [0, 1]
            The discount factor used for downweighting future rewards. Smaller
            values result in greater discounting of future rewards. Default is
            0.99.
        """
        super().__init__(env)

        self.lr = lr
        self.obs_max = obs_max
        self.obs_min = obs_min
        self.epsilon = epsilon
        self.n_tilings = n_tilings
        # Build the default per instance: a list literal in the signature
        # would be a single object shared by every agent created without an
        # explicit `grid_dims`.
        self.grid_dims = [8, 8] if grid_dims is None else grid_dims
        self.off_policy = off_policy
        self.temporal_discount = temporal_discount

        self._init_params()

    def _init_params(self):
        E = self.env_info
        assert not E["continuous_actions"], "Action space must be discrete"

        obs_encoder = None
        if E["continuous_observations"]:
            obs_encoder, _ = tile_state_space(
                self.env,
                self.env_info,
                self.n_tilings,
                state_action=False,
                obs_max=self.obs_max,
                obs_min=self.obs_min,
                grid_size=self.grid_dims,
            )

        self._create_2num_dicts(obs_encoder=obs_encoder)

        # behavior policy is stochastic, epsilon-soft policy
        self.behavior_policy = self.target_policy = self._epsilon_soft_policy
        if self.off_policy:
            # target policy is deterministic, greedy policy
            self.target_policy = self._greedy

        # initialize Q function
        self.parameters["Q"] = defaultdict(np.random.rand)

        # initialize returns object for each state-action pair
        self.derived_variables = {"episode_num": 0}

        self.hyperparameters = {
            "agent": "TemporalDifferenceAgent",
            "lr": self.lr,
            "obs_max": self.obs_max,
            "obs_min": self.obs_min,
            "epsilon": self.epsilon,
            "n_tilings": self.n_tilings,
            "grid_dims": self.grid_dims,
            "off_policy": self.off_policy,
            "temporal_discount": self.temporal_discount,
        }

        self.episode_history = {"state_actions": [], "rewards": []}

    def run_episode(self, max_steps, render=False):
        """
        Run the agent on a single episode without updating the priority queue
        or performing backups.

        Parameters
        ----------
        max_steps : int
            The maximum number of steps to run an episode
        render : bool
            Whether to render the episode during training

        Returns
        -------
        reward : float
            The total reward on the episode, averaged over the theta samples.
        steps : float
            The total number of steps taken on the episode, averaged over the
            theta samples.
        """
        return self._episode(max_steps, render, update=False)

    def train_episode(self, max_steps, render=False):
        """
        Train the agent on a single episode.

        Parameters
        ----------
        max_steps : int
            The maximum number of steps to run an episode.
        render : bool
            Whether to render the episode during training.

        Returns
        -------
        reward : float
            The total reward on the episode.
        steps : float
            The number of steps taken on the episode.
        """
        D = self.derived_variables
        total_rwd, n_steps = self._episode(max_steps, render, update=True)

        D["episode_num"] += 1

        return total_rwd, n_steps

    def _episode(self, max_steps, render, update=True):
        """
        Run or train the agent on an episode.

        Parameters
        ----------
        max_steps : int
            The maximum number of steps to run the episode.
        render : bool
            Whether to render the episode during training.
        update : bool
            Whether to perform the Q function backups after each step. Default
            is True.

        Returns
        -------
        reward : float
            The total reward on the episode.
        steps : float
            The number of steps taken on the episode.
        """
        self.flush_history()

        obs = self.env.reset()
        HS = self.episode_history

        action = self.act(obs)
        s = self._obs2num[obs]
        a = self._action2num[action]

        # store initial (state, action) tuple
        HS["state_actions"].append((s, a))

        total_reward, n_steps = 0.0, 0
        for i in range(max_steps):
            if render:
                self.env.render()

            # take action
            obs, reward, done, info = self.env.step(action)
            n_steps += 1

            # record rewards
            HS["rewards"].append(reward)
            total_reward += reward

            # generate next state and action
            action = self.act(obs)
            s_ = self._obs2num[obs] if not done else None
            a_ = self._action2num[action]

            # store next (state, action) tuple
            HS["state_actions"].append((s_, a_))

            if update:
                self.update()

            if done:
                break

        return total_reward, n_steps

    def _epsilon_soft_policy(self, s, a=None):
        r"""
        Epsilon-soft exploration policy.

        In epsilon-soft policies, :math:`\pi(a|s) > 0` for all s ∈ S and all a
        ∈ A(s) at the start of training. As learning progresses, :math:`\pi`
        gradually shifts closer and closer to a deterministic optimal policy.

        In particular, we have:

            pi(a|s) = 1 - epsilon + (epsilon / |A(s)|) IFF a == a*
            pi(a|s) = epsilon / |A(s)|                 IFF a != a*

        where

            |A(s)| is the number of actions available in state s
            a* ∈ A(s) is the greedy action in state s (i.e., a* = argmax_a Q(s, a))

        Note that epsilon-greedy policies are instances of epsilon-soft
        policies, defined as policies for which pi(a|s) >= epsilon / |A(s)| for
        all states and actions.

        Parameters
        ----------
        s : int, float, or tuple
            The state number for the current observation, as returned by
            ``self._obs2num[obs]``
        a : int, float, or tuple
            The action number in the current state, as returned by
            self._action2num[obs]. If None, sample an action from the action
            probabilities in state s, otherwise, return the probability of
            action `a` under the epsilon-soft policy. Default is None.

        Returns
        -------
        If `a` is None:
        action : int, float, or :py:class:`ndarray <numpy.ndarray>` as returned by `self._num2action`
            If `a` is None, returns an action sampled from the distribution
            over actions defined by the epsilon-soft policy.

        If `a` is not None:
        action_prob : float in range [0, 1]
            If `a` is not None, returns the probability of `a` under the
            epsilon-soft policy.
        """  # noqa: E501
        E, P = self.env_info, self.parameters

        # TODO: this assumes all actions are available in every state
        n_actions = np.prod(E["n_actions_per_dim"])

        a_star = np.argmax([P["Q"][(s, aa)] for aa in range(n_actions)])
        p_a_star = 1.0 - self.epsilon + (self.epsilon / n_actions)
        p_a = self.epsilon / n_actions

        action_probs = np.ones(n_actions) * p_a
        action_probs[a_star] = p_a_star
        np.testing.assert_allclose(np.sum(action_probs), 1)

        if a is not None:
            return action_probs[a]

        # sample action
        a = np.random.multinomial(1, action_probs).argmax()
        return self._num2action[a]

    def _greedy(self, s, a=None):
        """
        A greedy behavior policy. Only used when off-policy is true.

        Parameters
        ----------
        s : int, float, or tuple
            The state number for the current observation, as returned by
            ``self._obs2num[obs]``
        a : int, float, or tuple
            The action number in the current state, as returned by
            ``self._action2num[obs]``. If None, sample an action from the
            action probabilities in state `s`, otherwise, return the
            probability of action `a` under the greedy policy. Default is None.

        Returns
        -------
        If `a` is None:
        action : int, float, or :py:class:`ndarray <numpy.ndarray>` as returned by ``self._num2action``
            If `a` is None, returns an action sampled from the distribution
            over actions defined by the greedy policy.

        If `a` is not None:
        action_prob : float in range [0, 1]
            If `a` is not None, returns the probability of `a` under the
            greedy policy.
        """  # noqa: E501
        P, E = self.parameters, self.env_info
        n_actions = np.prod(E["n_actions_per_dim"])
        a_star = np.argmax([P["Q"][(s, aa)] for aa in range(n_actions)])
        if a is None:
            out = self._num2action[a_star]
        else:
            out = 1 if a == a_star else 0
        return out

    def _on_policy_update(self, s, a, r, s_, a_):
        """
        Update the Q function using the expected SARSA on-policy TD(0) update:

            Q[s, a] <- Q[s, a] + lr * [
                r + temporal_discount * E[Q[s', a'] | s'] - Q[s, a]
            ]

        where

            E[ Q[s', a'] | s'] is the expected value of the Q function over all
            a_ given that we're in state s' under the current policy

        NB. the expected SARSA update can be used for both on- and off-policy
        methods. In an off-policy context, if the target policy is greedy and
        the expectation is taken wrt. the target policy then the expected SARSA
        update is exactly Q-learning.

        Parameters
        ----------
        s : int as returned by `self._obs2num`
            The id for the state/observation at timestep t-1
        a : int as returned by `self._action2num`
            The id for the action taken at timestep t-1
        r : float
            The reward after taking action `a` in state `s` at timestep t-1
        s_ : int as returned by `self._obs2num`
            The id for the state/observation at timestep t
        a_ : int as returned by `self._action2num`
            The id for the action taken at timestep t
        """
        Q, E, pi = self.parameters["Q"], self.env_info, self.behavior_policy

        # TODO: this assumes that all actions are available in each state
        n_actions = np.prod(E["n_actions_per_dim"])

        # compute the expected value of Q(s', a') given that we are in state s'
        E_Q = np.sum([pi(s_, aa) * Q[(s_, aa)] for aa in range(n_actions)]) if s_ else 0

        # perform the expected SARSA TD(0) update
        qsa = Q[(s, a)]
        Q[(s, a)] = qsa + self.lr * (r + self.temporal_discount * E_Q - qsa)

    def _off_policy_update(self, s, a, r, s_):
        """
        Update the `Q` function using the TD(0) Q-learning update:

            Q[s, a] <- Q[s, a] + lr * (
                r + temporal_discount * max_a { Q[s', a] } - Q[s, a]
            )

        Parameters
        ----------
        s : int as returned by `self._obs2num`
            The id for the state/observation at timestep `t-1`
        a : int as returned by `self._action2num`
            The id for the action taken at timestep `t-1`
        r : float
            The reward after taking action `a` in state `s` at timestep `t-1`
        s_ : int as returned by `self._obs2num`
            The id for the state/observation at timestep `t`
        """
        Q, E = self.parameters["Q"], self.env_info
        n_actions = np.prod(E["n_actions_per_dim"])

        qsa = Q[(s, a)]
        Qs_ = [Q[(s_, aa)] for aa in range(n_actions)] if s_ else [0]
        Q[(s, a)] = qsa + self.lr * (r + self.temporal_discount * np.max(Qs_) - qsa)

    def update(self):
        """
        Back up the Q function for the most recent transition.

        The two most recent (state, action) pairs and the latest reward are
        read from the episode history and passed to the off-policy
        (Q-learning) or on-policy (expected SARSA) update, depending on the
        ``off_policy`` hyperparameter.
        """
        hyper, history = self.hyperparameters, self.episode_history

        prev_state, prev_action = history["state_actions"][-2]
        next_state, next_action = history["state_actions"][-1]
        reward = history["rewards"][-1]

        if not hyper["off_policy"]:
            self._on_policy_update(
                prev_state, prev_action, reward, next_state, next_action
            )
        else:
            self._off_policy_update(prev_state, prev_action, reward, next_state)

    def act(self, obs):
        r"""
        Choose an action for `obs` using the agent's behavior policy.

        Parameters
        ----------
        obs : int, float, or :py:class:`ndarray <numpy.ndarray>` as returned by ``env.step(action)``
            An observation from the environment.

        Returns
        -------
        action : int, float, or :py:class:`ndarray <numpy.ndarray>`
            The value returned by ``self.behavior_policy`` for the state
            number that `obs` maps to.
        """  # noqa: E501
        state_id = self._obs2num[obs]
        chosen = self.behavior_policy(state_id)
        return chosen

    def greedy_policy(self, max_steps, render=True):
        """
        Play one episode, always taking the action with the largest Q-value.

        No Q-function updates are performed; the visited (state, action)
        pairs and received rewards are written to the episode history.

        Parameters
        ----------
        max_steps : int
            Upper bound on the number of environment steps in the episode.
        render : bool
            If True, call ``env.render()`` before every step. Default is True.

        Returns
        -------
        total_reward : float
            The total reward on the episode.
        n_steps : float
            The total number of steps taken on the episode.
        """
        self.flush_history()

        history = self.episode_history
        obs = self.env.reset()

        total_reward = 0.0
        n_steps = 0
        for _ in range(max_steps):
            if render:
                self.env.render()

            # greedy action for the current state
            state = self._obs2num[obs]
            action = self._greedy(state)
            history["state_actions"].append((state, self._action2num[action]))

            # apply it and log the outcome
            obs, reward, done, _info = self.env.step(action)
            n_steps += 1
            history["rewards"].append(reward)
            total_reward += reward

            if done:
                break

        return total_reward, n_steps


class DynaAgent(AgentBase):
    def __init__(
        self,
        env,
        lr=0.4,
        epsilon=0.1,
        n_tilings=8,
        obs_max=None,
        obs_min=None,
        q_plus=False,
        grid_dims=None,
        explore_weight=0.05,
        temporal_discount=0.9,
        n_simulated_actions=50,
    ):
        r"""
        A Dyna-`Q` / Dyna-`Q+` agent [5]_ with full TD(0) `Q`-learning updates via
        prioritized-sweeping [6]_ .

        Notes
        -----
        This approach consists of three components: a planning method involving
        simulated actions, a direct RL method where the agent directly interacts
        with the environment, and a model-learning method where the agent
        learns to better represent the environment during planning.

        During planning, the agent performs random-sample one-step tabular
        Q-planning with prioritized sweeping. This entails using a priority
        queue to retrieve the state-action pairs from the agent's history which
        would stand to have the largest change to their Q-values if backed up.
        Specifically, for state action pair `(s, a)` the priority value is:

        .. math::

            P = \sum_{s'} p(s') | r + \gamma \max_a \{Q(s', a) \} - Q(s, a) |

        which corresponds to the absolute magnitude of the TD(0) Q-learning
        backup for the pair.

        When the first pair in the queue is backed up, the effect on each of
        its predecessor pairs is computed. If the predecessor's priority is
        greater than a small threshold the pair is added to the queue and the
        process is repeated until either the queue is empty or we have exceeded
        `n_simulated_actions` updates. These backups occur without the agent
        taking any action in the environment and thus constitute simulations
        based on the agent's current model of the environment (i.e., its
        tabular state-action history).

        During the direct RL phase, the agent takes an action based on its
        current behavior policy and Q function and receives a reward from the
        environment. The agent logs this state-action-reward-new state tuple in
        its interaction table (i.e., environment model) and updates its Q
        function using a full-backup version of the Q-learning update:

        .. math::

            Q(s, a) \leftarrow Q(s, a) + \eta \sum_{r, s'} p(r, s' \mid s, a)
                \left(r + \gamma \max_a \{ Q(s', a) \} - Q(s, a) \right)

        References
        ----------
        .. [5] Sutton, R. (1990). Integrated architectures for learning,
           planning, and reacting based on approximating dynamic programming.
           In *Proceedings of the 7th Annual ICML*, 216-224.
        .. [6] Moore, A. & Atkeson, C. (1993). Prioritized sweeping:
           Reinforcement learning with less data and less time. *Machine
           Learning, 13(1)*, 103-130.

        Parameters
        ----------
        env : :class:`gym.wrappers` or :class:`gym.envs` instance
            The environment to run the agent on
        lr : float
            Learning rate for the `Q` function updates. Default is 0.4.
        epsilon : float between [0, 1]
            The epsilon value in the epsilon-soft policy. Larger values
            encourage greater exploration during training. Default is 0.1.
        n_tilings : int
            The number of overlapping tilings to use if the env observation
            space is continuous. Unused if observation space is discrete.
            Default is 8.
        obs_max : float or :py:class:`ndarray <numpy.ndarray>` or None
            The value to treat as the max value of the observation space when
            calculating the grid widths if the observation space is continuous.
            If None, use :meth:`env.observation_space.high`. Unused if observation
            space is discrete. Default is None.
        obs_min : float or :py:class:`ndarray <numpy.ndarray>` or None
            The value to treat as the min value of the observation space when
            calculating grid widths if the observation space is continuous. If
            None, use :meth:`env.observation_space.low`. Unused if observation
            space is discrete. Default is None.
        grid_dims : list or None
            The number of rows and columns in each tiling grid if the env
            observation space is continuous. Unused if observation space is
            discrete. If None, `[8, 8]` is used. Default is None.
        q_plus : bool
            Whether to add incentives for visiting states that the agent hasn't
            encountered recently. Default is False.
        explore_weight : float
            Amount to incentivize exploring states that the agent hasn't
            recently visited. Only used if `q_plus` is True. Default is 0.05.
        temporal_discount : float between [0, 1]
            The discount factor used for downweighting future rewards. Smaller
            values result in greater discounting of future rewards. Default is
            0.9.
        n_simulated_actions : int
            The number of simulated actions to perform for each "real" action.
            Default is 50.
        """
        super().__init__(env)

        # build the default grid per instance so that agents never share (and
        # cannot accidentally mutate) a single default list object
        if grid_dims is None:
            grid_dims = [8, 8]

        self.lr = lr
        self.q_plus = q_plus
        self.obs_max = obs_max
        self.obs_min = obs_min
        self.epsilon = epsilon
        self.n_tilings = n_tilings
        self.grid_dims = grid_dims
        self.explore_weight = explore_weight
        self.temporal_discount = temporal_discount
        self.n_simulated_actions = n_simulated_actions

        self._init_params()

    def _init_params(self):
        """
        Build the agent's lookup tables, Q function, environment model and
        bookkeeping dictionaries.

        Only discrete action spaces are supported. When the observation space
        is continuous, an observation encoder produced by ``tile_state_space``
        is passed to ``_create_2num_dicts``.
        """
        env_info = self.env_info
        assert not env_info["continuous_actions"], "Action space must be discrete"

        if env_info["continuous_observations"]:
            obs_encoder, _ = tile_state_space(
                self.env,
                self.env_info,
                self.n_tilings,
                state_action=False,
                obs_max=self.obs_max,
                obs_min=self.obs_min,
                grid_size=self.grid_dims,
            )
        else:
            obs_encoder = None

        self._create_2num_dicts(obs_encoder=obs_encoder)

        # behavior and target policy are the same epsilon-soft policy
        policy = self._epsilon_soft_policy
        self.behavior_policy = policy
        self.target_policy = policy

        # Q-values start at random; the environment model starts empty
        self.parameters["Q"] = defaultdict(np.random.rand)
        self.parameters["model"] = EnvModel()

        # with q_plus enabled, unseen (state, action) pairs get a random
        # initial "steps since last visit" value instead of 0
        if self.q_plus:
            visit_counter = defaultdict(
                np.random.rand,
            )
        else:
            visit_counter = defaultdict(lambda: 0)

        self.derived_variables = {
            "episode_num": 0,
            "sweep_queue": {},
            "visited": set(),
            "steps_since_last_visit": visit_counter,
        }

        self.hyperparameters = {
            "agent": "DynaAgent",
            "lr": self.lr,
            "q_plus": self.q_plus,
            "obs_max": self.obs_max,
            "obs_min": self.obs_min,
            "epsilon": self.epsilon,
            "n_tilings": self.n_tilings,
            "grid_dims": self.grid_dims,
            "explore_weight": self.explore_weight,
            "temporal_discount": self.temporal_discount,
            "n_simulated_actions": self.n_simulated_actions,
        }

        self.episode_history = {"state_actions": [], "rewards": []}

    def act(self, obs):
        r"""
        Execute the behavior policy--an :math:`\epsilon`-soft policy used to
        generate actions during training.

        Parameters
        ----------
        obs : int, float, or :py:class:`ndarray <numpy.ndarray>` as returned by ``env.step(action)``
            An observation from the environment.

        Returns
        -------
        action : int, float, or :py:class:`ndarray <numpy.ndarray>`
            An action sampled from the distribution over actions defined by the
            epsilon-soft policy.
        """  # noqa: E501
        s = self._obs2num[obs]
        return self.behavior_policy(s)

    def _epsilon_soft_policy(self, s, a=None):
        """
        Epsilon-soft exploration policy.

        In epsilon-soft policies, pi(a|s) > 0 for all s ∈ S and all a ∈ A(s) at
        the start of training. As learning progresses, pi gradually shifts
        closer and closer to a deterministic optimal policy.

        In particular, we have:

            pi(a|s) = 1 - epsilon + (epsilon / |A(s)|) IFF a == a*
            pi(a|s) = epsilon / |A(s)|                 IFF a != a*

        where

            |A(s)| is the number of actions available in state s
            a* ∈ A(s) is the greedy action in state s (i.e., a* = argmax_a Q(s, a))

        Note that epsilon-greedy policies are instances of epsilon-soft
        policies, defined as policies for which pi(a|s) >= epsilon / |A(s)| for
        all states and actions.

        Parameters
        ----------
        s : int, float, or tuple
            The state number for the current observation, as returned by
            self._obs2num[obs]
        a : int, float, or tuple
            The action number in the current state, as returned by
            self._action2num[obs]. If None, sample an action from the action
            probabilities in state s, otherwise, return the probability of
            action `a` under the epsilon-soft policy. Default is None.

        Returns
        -------
        If `a` is None:
        action : int, float, or :py:class:`ndarray <numpy.ndarray>` as returned by :meth:`_num2action`
            If `a` is None, returns an action sampled from the distribution
            over actions defined by the epsilon-soft policy.

        If `a` is not None:
        action_prob : float in range [0, 1]
            If `a` is not None, returns the probability of `a` under the
            epsilon-soft policy.
        """  # noqa: E501
        E, P = self.env_info, self.parameters

        # TODO: this assumes all actions are available in every state
        n_actions = np.prod(E["n_actions_per_dim"])

        a_star = np.argmax([P["Q"][(s, aa)] for aa in range(n_actions)])
        p_a_star = 1.0 - self.epsilon + (self.epsilon / n_actions)
        p_a = self.epsilon / n_actions

        action_probs = np.ones(n_actions) * p_a
        action_probs[a_star] = p_a_star
        np.testing.assert_allclose(np.sum(action_probs), 1)

        if a is not None:
            return action_probs[a]

        # sample action
        a = np.random.multinomial(1, action_probs).argmax()
        return self._num2action[a]

    def _greedy(self, s, a=None):
        """
        A greedy behavior policy.

        Parameters
        ----------
        s : int, float, or tuple
            The state number for the current observation, as returned by
            self._obs2num[obs]
        a : int, float, or tuple
            The action number in the current state, as returned by
            self._action2num[obs]. If None, sample an action from the action
            probabilities in state s, otherwise, return the probability of
            action `a` under the greedy policy. Default is None.

        Returns
        -------
        If `a` is None:
        action : int, float, or :py:class:`ndarray <numpy.ndarray>` as returned by :meth:`_num2action`
            If `a` is None, returns an action sampled from the distribution
            over actions defined by the greedy policy.

        If `a` is not None:
        action_prob : float in range [0, 1]
            If `a` is not None, returns the probability of `a` under the
            greedy policy.
        """  # noqa: E501
        E, Q = self.env_info, self.parameters["Q"]
        n_actions = np.prod(E["n_actions_per_dim"])
        a_star = np.argmax([Q[(s, aa)] for aa in range(n_actions)])
        if a is None:
            out = self._num2action[a_star]
        else:
            out = 1 if a == a_star else 0
        return out

    def update(self):
        """
        Update the priority queue with the most recent (state, action) pair and
        perform random-sample one-step tabular Q-planning.

        Notes
        -----
        The planning algorithm uses a priority queue to retrieve the
        state-action pairs from the agent's history which will result in the
        largest change to its `Q`-value if backed up. When the first pair in
        the queue is backed up, the effect on each of its predecessor pairs is
        computed. If the predecessor's priority is greater than a small
        threshold the pair is added to the queue and the process is repeated
        until either the queue is empty or we exceed `n_simulated_actions`
        updates.
        """
        s, a = self.episode_history["state_actions"][-1]
        self._update_queue(s, a)
        self._simulate_behavior()

    def _update_queue(self, s, a):
        """
        Update the priority queue by calculating the priority for (s, a) and
        inserting it into the queue if it exceeds a fixed (small) threshold.

        Parameters
        ----------
        s : int as returned by `self._obs2num`
            The id for the state/observation
        a : int as returned by `self._action2num`
            The id for the action taken from state `s`
        """
        sweep_queue = self.derived_variables["sweep_queue"]

        # TODO: what's a good threshold here?
        priority = self._calc_priority(s, a)
        if priority >= 0.001:
            if (s, a) in sweep_queue:
                sweep_queue[(s, a)] = max(priority, sweep_queue[(s, a)])
            else:
                sweep_queue[(s, a)] = priority

    def _calc_priority(self, s, a):
        """
        Compute the "priority" for state-action pair (s, a). The priority P is
        defined as:

            P = sum_{s_} p(s_) * abs(r + temporal_discount * max_a {Q[s_, a]} - Q[s, a])

        which corresponds to the absolute magnitude of the TD(0) Q-learning
        backup for (s, a).

        Parameters
        ----------
        s : int as returned by `self._obs2num`
            The id for the state/observation
        a : int as returned by `self._action2num`
            The id for the action taken from state `s`

        Returns
        -------
        priority : float
            The absolute magnitude of the full-backup TD(0) Q-learning update
            for (s, a)
        """
        priority = 0.0
        E = self.env_info
        Q = self.parameters["Q"]
        env_model = self.parameters["model"]
        n_actions = np.prod(E["n_actions_per_dim"])

        outcome_probs = env_model.outcome_probs(s, a)
        for (r, s_), p_rs_ in outcome_probs:
            max_q = np.max([Q[(s_, aa)] for aa in range(n_actions)])
            P = p_rs_ * (r + self.temporal_discount * max_q - Q[(s, a)])
            priority += np.abs(P)
        return priority

    def _simulate_behavior(self):
        """
        Perform random-sample one-step tabular Q-planning with prioritized
        sweeping.

        Notes
        -----
        This approach uses a priority queue to retrieve the state-action pairs
        from the agent's history with largest change to their Q-values if
        backed up. When the first pair in the queue is backed up, the effect on
        each of its predecessor pairs is computed. If the predecessor's
        priority is greater than a small threshold the pair is added to the
        queue and the process is repeated until either the queue is empty or we
        have exceeded a `n_simulated_actions` updates.
        """
        env_model = self.parameters["model"]
        sweep_queue = self.derived_variables["sweep_queue"]
        for _ in range(self.n_simulated_actions):
            if len(sweep_queue) == 0:
                break

            # select (s, a) pair with the largest update (priority)
            sq_items = list(sweep_queue.items())
            (s_sim, a_sim), _ = sorted(sq_items, key=lambda x: x[1], reverse=True)[0]

            # remove entry from queue
            del sweep_queue[(s_sim, a_sim)]

            # update Q function for (s_sim, a_sim) using the full-backup
            # version of the TD(0) Q-learning update
            self._update(s_sim, a_sim)

            # get all (_s, _a) pairs that lead to s_sim (ie., s_sim's predecessors)
            pairs = env_model.state_action_pairs_leading_to_outcome(s_sim)

            # add predecessors to queue if their priority exceeds thresh
            for (_s, _a) in pairs:
                self._update_queue(_s, _a)

    def _update(self, s, a):
        """
        Update Q using a full-backup version of the TD(0) Q-learning update:

            Q(s, a) = Q(s, a) + lr *
                sum_{r, s'} [
                    p(r, s' | s, a) * (r + gamma * max_a { Q(s', a) } - Q(s, a))
                ]

        Parameters
        ----------
        s : int as returned by ``self._obs2num``
            The id for the state/observation
        a : int as returned by ``self._action2num``
            The id for the action taken from state `s`
        """
        update = 0.0
        env_model = self.parameters["model"]
        E, D, Q = self.env_info, self.derived_variables, self.parameters["Q"]
        n_actions = np.prod(E["n_actions_per_dim"])

        # sample rewards from the model
        outcome_probs = env_model.outcome_probs(s, a)
        for (r, s_), p_rs_ in outcome_probs:
            # encourage visiting long-untried actions by adding a "bonus"
            # reward proportional to the sqrt of the time since last visit
            if self.q_plus:
                r += self.explore_weight * np.sqrt(D["steps_since_last_visit"][(s, a)])

            max_q = np.max([Q[(s_, a_)] for a_ in range(n_actions)])
            update += p_rs_ * (r + self.temporal_discount * max_q - Q[(s, a)])

        # update Q value for (s, a) pair
        Q[(s, a)] += self.lr * update

    def run_episode(self, max_steps, render=False):
        """
        Run the agent on a single episode without performing `Q`-function
        backups.

        Parameters
        ----------
        max_steps : int
            The maximum number of steps to run an episode.
        render : bool
            Whether to render the episode during training.

        Returns
        -------
        reward : float
            The total reward on the episode.
        steps : float
            The number of steps taken on the episode.
        """
        return self._episode(max_steps, render, update=False)

    def train_episode(self, max_steps, render=False):
        """
        Train the agent on a single episode.

        Parameters
        ----------
        max_steps : int
            The maximum number of steps to run an episode.
        render : bool
            Whether to render the episode during training.

        Returns
        -------
        reward : float
            The total reward on the episode.
        steps : float
            The number of steps taken on the episode.
        """
        D = self.derived_variables
        total_rwd, n_steps = self._episode(max_steps, render, update=True)
        D["episode_num"] += 1
        return total_rwd, n_steps

    def _episode(self, max_steps, render, update=True):
        """
        Run or train the agent on an episode.

        Parameters
        ----------
        max_steps : int
            The maximum number of steps to run the episode.
        render : bool
            Whether to render the episode during training.
        update : bool
            Whether to perform the `Q` function backups after each step.
            Default is True.

        Returns
        -------
        reward : float
            The total reward on the episode.
        steps : float
            The number of steps taken on the episode.
        """
        self.flush_history()

        obs = self.env.reset()
        env_model = self.parameters["model"]
        HS, D = self.episode_history, self.derived_variables

        action = self.act(obs)
        s = self._obs2num[obs]
        a = self._action2num[action]

        # store initial (state, action) tuple
        HS["state_actions"].append((s, a))

        total_reward, n_steps = 0.0, 0
        for i in range(max_steps):
            if render:
                self.env.render()

            # take action
            obs, reward, done, info = self.env.step(action)
            n_steps += 1

            # record rewards
            HS["rewards"].append(reward)
            total_reward += reward

            # generate next state and action
            action = self.act(obs)
            s_ = self._obs2num[obs] if not done else None
            a_ = self._action2num[action]

            # update model
            env_model[(s, a, reward, s_)] += 1

            # update history counter
            for k in D["steps_since_last_visit"].keys():
                D["steps_since_last_visit"][k] += 1
            D["steps_since_last_visit"][(s, a)] = 0

            if update:
                self.update()

            # store next (state, action) tuple
            HS["state_actions"].append((s_, a_))
            s, a = s_, a_

            if done:
                break

        return total_reward, n_steps

    def greedy_policy(self, max_steps, render=True):
        """
        Execute a deterministic greedy policy using the current agent
        parameters.

        Parameters
        ----------
        max_steps : int
            The maximum number of steps to run the episode.
        render : bool
            Whether to render the episode during execution.

        Returns
        -------
        total_reward : float
            The total reward on the episode.
        n_steps : float
            The total number of steps taken on the episode.
        """
        self.flush_history()

        H = self.episode_history
        obs = self.env.reset()

        total_reward, n_steps = 0.0, 0
        for i in range(max_steps):
            if render:
                self.env.render()

            s = self._obs2num[obs]
            action = self._greedy(s)
            a = self._action2num[action]

            # store (state, action) tuple
            H["state_actions"].append((s, a))

            # take action
            obs, reward, done, info = self.env.step(action)
            n_steps += 1

            # record rewards
            H["rewards"].append(reward)
            total_reward += reward

            if done:
                break

        return total_reward, n_steps


================================================
FILE: numpy_ml/rl_models/rl_utils.py
================================================
"""Utilities for training and evaluating RL models on OpenAI gym environments"""
import warnings
from itertools import product
from collections import defaultdict

import numpy as np

from numpy_ml.utils.testing import DependencyWarning
from numpy_ml.rl_models.tiles.tiles3 import tiles, IHT

# pandas is optional: `NO_PD` records whether the import failed so that
# `get_gym_stats` can fall back to returning a plain list
NO_PD = False
try:
    import pandas as pd
except ModuleNotFoundError:
    NO_PD = True

# a missing gym install only triggers a warning here; the name `gym` is left
# undefined in that case
try:
    import gym
except ModuleNotFoundError:
    fstr = (
        "Agents in `numpy_ml.rl_models` use the OpenAI gym for training. "
        "To install the gym environments, run `pip install gym`. For more"
        " information, see https://github.com/openai/gym."
    )
    warnings.warn(fstr, DependencyWarning)


class EnvModel(object):
    """
    A simple tabular environment model that maintains the counts of each
    reward-outcome pair given the state and action that preceded them. The
    model can be queried with

        >>> M = EnvModel()
        >>> M[(state, action, reward, next_state)] += 1
        >>> M[(state, action, reward, next_state)]
        1
        >>> (state, action, reward, next_state) in M
        True
        >>> M.state_action_pairs()
        [(state, action)]
        >>> M.outcome_probs(state, action)
        [((reward, next_state), 1.0)]

    Notes
    -----
    Counts are stored in nested ``defaultdict`` objects, so reading a count
    with ``M[key]`` (or calling :meth:`reward_outcome_pairs` /
    :meth:`outcome_probs`) for an unseen (state, action) pair creates an
    empty entry for it. Membership tests via ``in`` never insert anything.
    """

    def __init__(self):
        super(EnvModel, self).__init__()
        # maps (s, a) -> {(r, s_): count}
        self._model = defaultdict(lambda: defaultdict(lambda: 0))

    def __setitem__(self, key, value):
        """Set the count for the (s, a, r, s_) tuple `key` to `value`"""
        s, a, r, s_ = key
        self._model[(s, a)][(r, s_)] = value

    def __getitem__(self, key):
        """Return the count for the (s, a, r, s_) tuple `key` (0 if unseen)"""
        s, a, r, s_ = key
        return self._model[(s, a)][(r, s_)]

    def __contains__(self, key):
        """
        True if an entry exists for the (s, a, r, s_) tuple `key`, else False.
        """
        s, a, r, s_ = key
        # `dict.get` / `in` bypass the defaultdict factories, so testing for
        # membership does not add phantom entries to the model
        outcomes = self._model.get((s, a))
        return outcomes is not None and (r, s_) in outcomes

    def state_action_pairs(self):
        """Return all (state, action) pairs in the environment model"""
        return list(self._model.keys())

    def reward_outcome_pairs(self, s, a):
        """
        Return all (reward, next_state) pairs associated with taking action `a`
        in state `s`.
        """
        return list(self._model[(s, a)].keys())

    def outcome_probs(self, s, a):
        """
        Return the probability under the environment model of each
        (reward, next_state) outcome after taking action `a` in state `s`.

        Parameters
        ----------
        s : int as returned by ``self._obs2num``
            The id for the state/observation.
        a : int as returned by ``self._action2num``
            The id for the action taken from state `s`.

        Returns
        -------
        outcome_probs : list of ((reward, next_state), prob) tuples
            A list of each recorded outcome and its associated probability
            under the model. Probabilities are the outcome counts normalized
            by the total count for (`s`, `a`).
        """
        items = list(self._model[(s, a)].items())
        total_count = np.sum([c for (_, c) in items])
        outcome_probs = [c / total_count for (_, c) in items]
        outcomes = [p for (p, _) in items]
        return list(zip(outcomes, outcome_probs))

    def state_action_pairs_leading_to_outcome(self, outcome):
        """
        Return all (state, action) pairs that have an entry for `outcome`
        as a next state under the current model.

        Parameters
        ----------
        outcome : int
            The outcome state.

        Returns
        -------
        pairs : list of (state, action) tuples
            A list of all (state, action) pairs for which `outcome` has been
            recorded as a next state.
        """
        pairs = []
        for sa in self.state_action_pairs():
            outcomes = [o for (r, o) in self.reward_outcome_pairs(*sa)]
            if outcome in outcomes:
                pairs.append(sa)
        return pairs


def tile_state_space(
    env,
    env_stats,
    n_tilings,
    obs_max=None,
    obs_min=None,
    state_action=False,
    grid_size=(4, 4),
):
    """
    Return a function to encode the continuous observations generated by `env`
    in terms of a collection of `n_tilings` overlapping tilings (each with
    dimension `grid_size`) of the state space.

    Parameters
    ----------
    env : ``gym.wrappers.time_limit.TimeLimit`` instance
        An openAI environment.
    env_stats : dict
        Environment statistics. Only consulted when `state_action` is True,
        in which case the ``"tuple_actions"`` entry (or, for backwards
        compatibility, ``"tuple_action"``) indicates whether the action space
        is a tuple space.
    n_tilings : int
        The number of overlapping tilings to use. Should be a power of 2. This
        determines the dimension of the discretized tile-encoded state vector.
    obs_max : float or np.ndarray
        The value to treat as the max value of the observation space when
        calculating the grid widths. If None, use
        ``env.observation_space.high``. Default is None.
    obs_min : float or np.ndarray
        The value to treat as the min value of the observation space when
        calculating the grid widths. If None, use
        ``env.observation_space.low``. Default is None.
    state_action : bool
        Whether to use tile coding to encode state-action values (True) or just
        state values (False). Default is False.
    grid_size : list of length 2
        A list of ints representing the coarseness of the tilings. E.g., a
        `grid_size` of [4, 4] would mean each tiling consisted of a 4x4 tile
        grid. Default is [4, 4].

    Returns
    -------
    encode_obs_as_tile : function
        A function which takes as input a continuous observation vector and
        returns a tuple of the indices of the active tiles in the tile coded
        observation space.
    n_states : int
        An integer reflecting the total number of unique states possible under
        this tile coding regimen.
    """
    obs_max = np.nan_to_num(env.observation_space.high) if obs_max is None else obs_max
    obs_min = np.nan_to_num(env.observation_space.low) if obs_min is None else obs_min

    if state_action:
        # `env_stats()` reports this flag under "tuple_actions"; the singular
        # spelling is still honored for callers that built the dict by hand
        key = "tuple_actions" if "tuple_actions" in env_stats else "tuple_action"
        if env_stats[key]:
            n = [space.n - 1.0 for space in env.action_space.spaces]
        else:
            n = [env.action_space.n]

        # append the action dimension(s) to the observation bounds
        obs_max = np.concatenate([obs_max, n])
        obs_min = np.concatenate([obs_min, np.zeros_like(n)])

    obs_range = obs_max - obs_min
    scale = 1.0 / obs_range

    n_tiles = np.prod(grid_size) * n_tilings
    n_states = np.prod([n_tiles - i for i in range(n_tilings)])
    # a single index hash table is shared by every call to the encoder
    iht = IHT(16384)

    def encode_obs_as_tile(obs):
        # scale the (state-)observation vector before tiling
        obs = obs * scale
        return tuple(tiles(iht, n_tilings, obs))

    return encode_obs_as_tile, n_states


def get_gym_environs():
    """Return the ids of all environments registered with OpenAI ``gym``"""
    registry = gym.envs.registry
    return [spec.id for spec in registry.all()]


def get_gym_stats():
    """
    Collect :func:`env_stats` for every registered ``gym`` environment.

    Each environment id is printed as it is processed. The result is a pandas
    DataFrame restricted to the summary columns below, or the raw list of
    stat dictionaries if pandas could not be imported.
    """
    columns = [
        "id",
        "continuous_actions",
        "continuous_observations",
        "action_dim",
        #  "action_ids",
        "deterministic",
        "multidim_actions",
        "multidim_observations",
        "n_actions_per_dim",
        "n_obs_per_dim",
        "obs_dim",
        #  "obs_ids",
        "seed",
        "tuple_actions",
        "tuple_observations",
    ]

    records = []
    for spec in gym.envs.registry.all():
        print(spec.id)
        records.append(env_stats(gym.make(spec.id)))

    if NO_PD:
        return records
    return pd.DataFrame(records)[columns]


def is_tuple(env):
    """
    Check if the action and observation spaces for `env` are instances of
    ``gym.spaces.Tuple`` or ``gym.spaces.Dict``.

    Notes
    -----
    A tuple space is a tuple of *several* (possibly multidimensional)
    action/observation spaces. For our purposes, a tuple space is necessarily
    multidimensional.

    Parameters
    ----------
    env : ``gym.wrappers`` or ``gym.envs`` instance
        The environment to evaluate.

    Returns
    -------
    tuple_action : bool
        Whether the `env`'s action space is an instance of ``gym.spaces.Tuple``
        or ``gym.spaces.Dict``.
    tuple_obs : bool
        Whether the `env`'s observation space is an instance of
        ``gym.spaces.Tuple`` or ``gym.spaces.Dict``.
    """
    composite_types = (gym.spaces.Tuple, gym.spaces.dict.Dict)
    tuple_action = isinstance(env.action_space, composite_types)
    tuple_obs = isinstance(env.observation_space, composite_types)
    return tuple_action, tuple_obs


def is_multidimensional(env):
    """
    Check if the action and observation spaces for `env` are multidimensional
    or ``Tuple`` spaces.

    Notes
    -----
    A multidimensional space is any space whose actions / observations have
    more than one element in them. This includes ``Tuple`` spaces, but also
    includes single action/observation spaces with several dimensions.

    Parameters
    ----------
    env : ``gym.wrappers`` or ``gym.envs`` instance
        The environment to evaluate.

    Returns
    -------
    md_action : bool
        Whether the `env`'s action space is multidimensional.
    md_obs : bool
        Whether the `env`'s observation space is multidimensional.
    tuple_action : bool
        Whether the `env`'s action space is a ``Tuple`` instance.
    tuple_obs : bool
        Whether the `env`'s observation space is a ``Tuple`` instance.
    """
    tuple_action, tuple_obs = is_tuple(env)
    sequence_types = (list, tuple, np.ndarray)

    # tuple spaces count as multidimensional by definition
    md_action = True
    if not tuple_action:
        sample = env.action_space.sample()
        md_action = isinstance(sample, sequence_types) and len(sample) > 1

    md_obs = True
    if not tuple_obs:
        space = env.observation_space
        # prefer the `low` bound over a sample: sampling causes problems
        example = space.low if "low" in dir(space) else space.sample()
        md_obs = isinstance(example, sequence_types) and len(example) > 1

    return md_action, md_obs, tuple_action, tuple_obs


def is_continuous(env, tuple_action, tuple_obs):
    """
    Check if an `env`'s observation and action spaces are continuous.

    Notes
    -----
    A composite (``Tuple`` / ``Dict``) space is considered continuous only if
    every one of its sub-spaces is a ``gym.spaces.Box``.

    Parameters
    ----------
    env : ``gym.wrappers`` or ``gym.envs`` instance
        The environment to evaluate.
    tuple_action : bool
        Whether the `env`'s action space is an instance of `gym.spaces.Tuple`
        or `gym.spaces.Dict`.
    tuple_obs : bool
        Whether the `env`'s observation space is an instance of `gym.spaces.Tuple`
        or `gym.spaces.Dict`.

    Returns
    -------
    cont_action : bool
        Whether the `env`'s action space is continuous.
    cont_obs : bool
        Whether the `env`'s observation space is continuous.
    """
    Continuous = gym.spaces.box.Box

    def _all_continuous(space, is_composite):
        """True if `space` (or each of its sub-spaces) is a Box"""
        if not is_composite:
            return isinstance(space, Continuous)

        subspaces = space.spaces
        # ``Dict`` spaces keep their sub-spaces in a mapping; iterating it
        # directly would yield the string keys rather than the spaces
        if isinstance(subspaces, dict):
            subspaces = subspaces.values()
        return all(isinstance(s, Continuous) for s in subspaces)

    cont_obs = _all_continuous(env.observation_space, tuple_obs)
    cont_action = _all_continuous(env.action_space, tuple_action)
    return cont_action, cont_obs


def action_stats(env, md_action, cont_action):
    """
    Get information on `env`'s action space.

    Parameters
    ----------
    env : ``gym.wrappers`` or ``gym.envs`` instance
        The environment to evaluate.
    md_action : bool
        Whether the `env`'s action space is multidimensional.
    cont_action : bool
        Whether the `env`'s action space is continuous.

    Returns
    -------
    n_actions_per_dim : list of length (action_dim,)
        The number of possible actions for each dimension of the action space.
        Dimensions without a finite action count are reported as ``np.inf``.
    action_ids : list or None
        A list of all valid actions within the space. If `cont_action` is
        True, or any dimension is not finite, this value will be None.
    action_dim : int or None
        The number of dimensions in a single action.
    """
    if cont_action:
        action_ids = None
        action_dim = env.action_space.shape[0] if md_action else 1
        n_actions_per_dim = [np.inf for _ in range(action_dim)]
    elif md_action:
        subspaces = env.action_space.spaces
        # ``Dict`` spaces keep their sub-spaces in a mapping; iterate over the
        # spaces themselves rather than their string keys
        if isinstance(subspaces, dict):
            subspaces = subspaces.values()

        n_actions_per_dim = [getattr(space, "n", np.inf) for space in subspaces]
        if np.inf in n_actions_per_dim:
            action_ids = None
        else:
            # every combination of per-dimension action indices
            action_ids = list(product(*[range(i) for i in n_actions_per_dim]))
        action_dim = len(n_actions_per_dim)
    else:
        action_dim = 1
        n_actions_per_dim = [env.action_space.n]
        action_ids = list(range(n_actions_per_dim[0]))
    return n_actions_per_dim, action_ids, action_dim


def obs_stats(env, md_obs, cont_obs):
    """
    Get information on the observation space for `env`.

    Parameters
    ----------
    env : ``gym.wrappers`` or ``gym.envs`` instance
        The environment to evaluate.
    md_obs : bool
        Whether the `env`'s observation space is multidimensional.
    cont_obs : bool
        Whether the `env`'s observation space is continuous.

    Returns
    -------
    n_obs_per_dim : list of length (obs_dim,)
        The number of possible observation classes for each dimension of the
        observation space. Dimensions without a finite count are reported as
        ``np.inf``.
    obs_ids : list or None
        A list of all valid observations within the space. If `cont_obs` is
        True, or any dimension is not finite, this value will be None.
    obs_dim : int or None
        The number of dimensions in a single observation.
    """
    if cont_obs:
        obs_ids = None
        obs_dim = env.observation_space.shape[0]
        n_obs_per_dim = [np.inf for _ in range(obs_dim)]
    elif md_obs:
        subspaces = env.observation_space.spaces
        # ``Dict`` spaces keep their sub-spaces in a mapping; iterate over the
        # spaces themselves rather than their string keys
        if isinstance(subspaces, dict):
            subspaces = subspaces.values()

        n_obs_per_dim = [getattr(space, "n", np.inf) for space in subspaces]
        if np.inf in n_obs_per_dim:
            obs_ids = None
        else:
            # every combination of per-dimension observation indices
            obs_ids = list(product(*[range(i) for i in n_obs_per_dim]))
        obs_dim = len(n_obs_per_dim)
    else:
        obs_dim = 1
        n_obs_per_dim = [env.observation_space.n]
        obs_ids = list(range(n_obs_per_dim[0]))

    return n_obs_per_dim, obs_ids, obs_dim


def env_stats(env):
    """
    Compute statistics for the current environment.

    Parameters
    ----------
    env : ``gym.wrappers`` or ``gym.envs`` instance
        The environment to evaluate.

    Returns
    -------
    env_info : dict
        A dictionary containing information about the action and observation
        spaces of `env`, along with the id, seed (None if the spec has no
        ``seed`` attribute), and determinism flag taken from ``env.spec``.
    """
    md_action, md_obs, tuple_action, tuple_obs = is_multidimensional(env)
    cont_action, cont_obs = is_continuous(env, tuple_action, tuple_obs)

    n_actions_per_dim, action_ids, action_dim = action_stats(
        env, md_action, cont_action,
    )
    n_obs_per_dim, obs_ids, obs_dim = obs_stats(env, md_obs, cont_obs)

    env_info = {
        "id": env.spec.id,
        "seed": env.spec.seed if "seed" in dir(env.spec) else None,
        # logical negation: `~` on a Python bool is bitwise (~True == -2,
        # ~False == -1), so it would report every environment as deterministic
        "deterministic": not env.spec.nondeterministic,
        "tuple_actions": tuple_action,
        "tuple_observations": tuple_obs,
        "multidim_actions": md_action,
        "multidim_observations": md_obs,
        "continuous_actions": cont_action,
        "continuous_observations": cont_obs,
        "n_actions_per_dim": n_actions_per_dim,
        "action_dim": action_dim,
        "n_obs_per_dim": n_obs_per_dim,
        "obs_dim": obs_dim,
        "action_ids": action_ids,
        "obs_ids": obs_ids,
    }

    return env_info


================================================
FILE: numpy_ml/rl_models/tiles/__init__.py
================================================
from . import tiles3


================================================
FILE: numpy_ml/rl_models/tiles/tiles3.py
================================================
"""
Tile Coding Software version 3.0beta
by Rich Sutton
based on a program created by Steph Schaeffer and others
External documentation and recommendations on the use of this code is available in the
reinforcement learning textbook by Sutton and Barto, and on the web.
These need to be understood before this code is.

This software is for Python 3 or more.

This is an implementation of grid-style tile codings, based originally on
the UNH CMAC code (see http://www.ece.unh.edu/robots/cmac.htm), but by now highly changed.
Here we provide a function, "tiles", that maps floating and integer
variables to a list of tiles, and a second function, "tileswrap", that does the same while
wrapping some floats to provided widths (the lower wrap value is always 0).

The float variables will be gridded at unit intervals, so generalization
will be by approximately 1 in each direction, and any scaling will have
to be done externally before calling tiles.

Num-tilings should be a power of 2, e.g., 16. To make the offsetting work properly, it should
also be greater than or equal to four times the number of floats.

The first argument is either an index hash table of a given size (created by IHT(size)),
an integer "size" (range of the indices from 0), or None (for testing, indicating that the tile
coordinates are to be returned without being converted to indices).
"""

from math import floor
from itertools import zip_longest


# alias for the builtin `hash`, used as the fallback hash function once an IHT
# is full or when an integer table size is given
basehash = hash


class IHT:
    """
    Index hash table: hands out consecutive integer indices to hashable
    objects until `sizeval` entries exist, after which new objects are hashed
    into the existing index range (allowing collisions).
    """

    def __init__(self, sizeval):
        self.size = sizeval
        self.overfullCount = 0
        self.dictionary = {}

    def __str__(self):
        """Summarize the table's size, overflow count, and number of entries"""
        template = "Collision table: size:{} overfullCount:{} dictionary:{} items"
        return template.format(self.size, self.overfullCount, len(self.dictionary))

    def count(self):
        """Return the number of objects currently stored in the table"""
        return len(self.dictionary)

    def fullp(self):
        """Return True once the table holds at least `size` objects"""
        return self.count() >= self.size

    def getindex(self, obj, readonly=False):
        """
        Return the index assigned to `obj`.

        Unseen objects receive the next free index, unless `readonly` is True
        (in which case None is returned) or the table is full (in which case
        the object's hash modulo the table size is returned and the overflow
        counter is incremented).
        """
        table = self.dictionary
        if obj in table:
            return table[obj]
        if readonly:
            return None

        n_entries = self.count()
        if n_entries < self.size:
            # room left: assign the next consecutive index
            table[obj] = n_entries
            return n_entries

        # table is full: fall back to plain hashing and tolerate collisions
        if self.overfullCount == 0:
            print("IHT full, starting to allow collisions")
        self.overfullCount += 1
        return basehash(obj) % self.size


def hashcoords(coordinates, m, readonly=False):
    """
    Convert a list of tile coordinates into a tile index.

    Parameters
    ----------
    coordinates : list
        The tile coordinates to convert.
    m : IHT, int, or None
        How to perform the conversion. None returns `coordinates` unchanged;
        an int hashes the coordinates into ``range(m)``; an :class:`IHT` looks
        the coordinates up in (or adds them to) the index hash table.
    readonly : bool
        Passed through to :meth:`IHT.getindex`; ignored for other values of
        `m`. Default is False.

    Returns
    -------
    index : int, list, or None
        The tile index, the untouched coordinates if `m` is None, or None if
        `m` is of an unsupported type.
    """
    if m is None:
        return coordinates
    if isinstance(m, int):
        return basehash(tuple(coordinates)) % m
    if isinstance(m, IHT):
        return m.getindex(tuple(coordinates), readonly)
    # unsupported table type: preserve the historical behaviour of yielding None
    return None


def tiles(ihtORsize, numtilings, floats, ints=[], readonly=False):
    """
    Return `numtilings` tile indices corresponding to the floats and ints.

    Each float is quantized to ``floor(f * numtilings)``. Within tiling `t`,
    the quantized value for float dimension `d` is displaced by
    ``t * (1 + 2 * d)`` before being integer-divided by `numtilings`. The
    resulting coordinates (prefixed with the tiling number and followed by
    `ints`) are converted to an index with `hashcoords`.
    """
    quantized = [floor(value * numtilings) for value in floats]
    indices = []
    for tiling in range(numtilings):
        coords = [tiling]
        coords += [
            (q + tiling * (1 + 2 * dim)) // numtilings
            for dim, q in enumerate(quantized)
        ]
        coords.extend(ints)
        indices.append(hashcoords(coords, ihtORsize, readonly))
    return indices


def tileswrap(ihtORsize, numtilings, floats, wrapwidths, ints=[], readonly=False):
    """
    Return `numtilings` tile indices corresponding to the floats and ints,
    wrapping some floats.

    Works like `tiles`, except that the per-dimension displacement is taken
    modulo `numtilings` and, for every float with a truthy entry in
    `wrapwidths`, the tile coordinate is taken modulo that width. Floats whose
    width is missing, None, or 0 are left unwrapped.
    """
    quantized = [floor(value * numtilings) for value in floats]
    indices = []
    for tiling in range(numtilings):
        coords = [tiling]
        for dim, (q, width) in enumerate(zip_longest(quantized, wrapwidths)):
            shift = (tiling * (1 + 2 * dim)) % numtilings
            c = (q + shift) // numtilings
            if width:
                c = c % width
            coords.append(c)
        coords.extend(ints)
        indices.append(hashcoords(coords, ihtORsize, readonly))
    return indices


================================================
FILE: numpy_ml/rl_models/trainer.py
================================================
from time import time
import numpy as np


class Trainer(object):
    def __init__(self, agent, env):
        """
        An object to facilitate agent training and evaluation.

        Parameters
        ----------
        agent : :class:`AgentBase` instance
            The agent to train.
        env : ``gym.wrappers`` or ``gym.envs`` instance
            The environment to run the agent on.
        """
        self.env = env
        self.agent = agent
        # per-episode training statistics, appended to by `train`
        self.rewards = {"total": [], "smooth_total": [], "n_steps": [], "duration": []}

    def _train_episode(self, max_steps, render_every=None):
        """
        Run a single training episode and time it.

        Agents exposing ``train_episode`` are trained online; all others run
        ``run_episode`` followed by a single ``update`` call. `render_every`
        is accepted for interface compatibility but is not used.

        Returns
        -------
        reward : float
            The total reward reported by the agent for the episode.
        duration : float
            Wall-clock duration of the episode in seconds.
        n_steps : int
            The number of steps reported by the agent for the episode.
        """
        t0 = time()
        if "train_episode" in dir(self.agent):
            # online training updates over the course of the episode
            reward, n_steps = self.agent.train_episode(max_steps)
        else:
            # offline training updates upon completion of the episode
            reward, n_steps = self.agent.run_episode(max_steps)
            self.agent.update()
        duration = time() - t0
        return reward, duration, n_steps

    def train(
        self,
        n_episodes,
        max_steps,
        seed=None,
        plot=True,
        verbose=True,
        render_every=None,
        smooth_factor=0.05,
    ):
        """
        Train an agent on an OpenAI gym environment, logging training
        statistics along the way.

        Parameters
        ----------
        n_episodes : int
            The number of episodes to train the agent across.
        max_steps : int
            The maximum number of steps the agent can take on each episode.
        seed : int or None
            A seed for the random number generator. Default is None.
        plot : bool
            Whether to generate a plot of the cumulative reward as a function
            of training episode. Default is True.
        verbose : bool
            Whether to print intermediate run statistics to stdout during
            training. Default is True.
        render_every : int or None
            If not None, execute (and report) the agent's greedy policy after
            every `render_every` training episodes. Default is None.
        smooth_factor : float in [0, 1]
            The amount to smooth the cumulative reward across episodes. Larger
            values correspond to less smoothing.
        """
        # compare against None so that a seed of 0 is honored
        if seed is not None:
            np.random.seed(seed)
            self.env.seed(seed=seed)

        t0 = time()
        render_every = np.inf if render_every is None else render_every
        sf = smooth_factor

        for ep in range(n_episodes):
            tot_rwd, duration, n_steps = self._train_episode(max_steps)
            smooth_tot = tot_rwd if ep == 0 else (1 - sf) * smooth_tot + sf * tot_rwd

            if verbose:
                fstr = "[Ep. {:2}] {:<6.2f} Steps | Total Reward: {:<7.2f}"
                fstr += " | Smoothed Total: {:<7.2f} | Duration: {:<6.2f}s"
                print(fstr.format(ep + 1, n_steps, tot_rwd, smooth_tot, duration))

            if (ep + 1) % render_every == 0:
                fstr = "\tGreedy policy total reward: {:.2f}, n_steps: {:.2f}"
                # keep the greedy run's results in their own names so they do
                # not overwrite the training statistics logged below
                greedy_rwd, greedy_steps = self.agent.greedy_policy(max_steps)
                print(fstr.format(greedy_rwd, greedy_steps))

            self.rewards["total"].append(tot_rwd)
            self.rewards["n_steps"].append(n_steps)
            self.rewards["duration"].append(duration)
            self.rewards["smooth_total"].append(smooth_tot)

        train_time = (time() - t0) / 60
        durations = self.rewards["duration"]
        mean_duration = np.mean(durations) if durations else 0.0
        fstr = "Training took {:.2f} mins [{:.2f}s/episode]"
        print(fstr.format(train_time, mean_duration))

        rwd_greedy, greedy_steps = self.agent.greedy_policy(max_steps, render=False)
        fstr = "Final greedy reward: {:.2f} | n_steps: {:.2f}"
        print(fstr.format(rwd_greedy, greedy_steps))

        if plot:
            self.plot_rewards(rwd_greedy)

    def plot_rewards(self, rwd_greedy):
        """
        Plot the cumulative reward per episode as a function of episode number.

        Notes
        -----
        Saves plot to the file ``./img/<agent>-<env>.png``, creating the
        ``img`` directory if it does not exist.

        Parameters
        ----------
        rwd_greedy : float
            The cumulative reward earned with a final execution of a greedy
            target policy.

        Raises
        ------
        ImportError
            If `matplotlib` or `seaborn` cannot be imported.
        """
        import os

        try:
            import matplotlib.pyplot as plt
            import seaborn as sns
        except ImportError as err:
            fstr = "Error importing `matplotlib` and `seaborn` -- plotting functionality is disabled"
            raise ImportError(fstr) from err

        # https://seaborn.pydata.org/generated/seaborn.set_context.html
        # https://seaborn.pydata.org/generated/seaborn.set_style.html
        sns.set_style("white")
        sns.set_context("notebook", font_scale=1)

        R = self.rewards
        fig, ax = plt.subplots()
        x = np.arange(len(R["total"]))
        y = R["smooth_total"]
        y_raw = R["total"]

        ax.plot(x, y, label="smoothed")
        ax.plot(x, y_raw, alpha=0.5, label="raw")
        # `axhline`'s xmin/xmax are axes fractions in [0, 1], not data
        # coordinates; the defaults already span the full width of the plot
        ax.axhline(y=rwd_greedy, ls=":", label="final greedy")
        ax.legend()
        sns.despine()

        env = self.agent.env_info["id"]
        agent = self.agent.hyperparameters["agent"]

        ax.set_xlabel("Episode")
        ax.set_ylabel("Cumulative reward")
        ax.set_title("{} on '{}'".format(agent, env))
        os.makedirs("img", exist_ok=True)
        plt.savefig("img/{}-{}.png".format(agent, env))
        plt.close("all")


================================================
FILE: numpy_ml/tests/__init__.py
================================================
"""Unit tests for various numpy-ml modules"""


================================================
FILE: numpy_ml/tests/nn_torch_models.py
================================================
# flake8: noqa

import torch
import torch.nn as nn
import torch.nn.functional as F

import tensorflow as tf

import numpy as np

#######################################################################
#       Gold-standard implementations for testing custom layers       #
#                       (Requires Pytorch)                            #
#######################################################################


def torchify(var, requires_grad=True):
    """Wrap `var` in a ``torch.FloatTensor``, optionally tracking gradients"""
    tensor = torch.FloatTensor(var)
    return torch.autograd.Variable(tensor, requires_grad=requires_grad)


def torch_gradient_generator(fn, **kwargs):
    """
    Build a function that returns the gradient of ``fn(z, **kwargs).sum()``
    with respect to `z` as a numpy array.
    """

    def get_grad(z):
        inputs = torch.autograd.Variable(torch.FloatTensor(z), requires_grad=True)
        total = fn(inputs, **kwargs).sum()
        total.backward()
        return inputs.grad.numpy()

    return get_grad


def torch_xe_grad(y, z):
    """
    Return the gradient of the summed cross-entropy loss with respect to the
    logits `z`, where the class labels are the row-wise argmax of `y`.
    """
    logits = torch.autograd.Variable(torch.FloatTensor(z), requires_grad=True)
    labels = torch.LongTensor(y.argmax(axis=1))
    F.cross_entropy(logits, labels, reduction="sum").backward()
    return logits.grad.numpy()


def torch_mse_grad(y, z, act_fn):
    """
    Return the gradient of the summed squared error between ``act_fn(z)`` and
    `y` with respect to the pre-activations `z`.
    """
    pre_act = torch.autograd.Variable(torch.FloatTensor(z), requires_grad=True)
    target = torch.FloatTensor(y)
    loss = F.mse_loss(act_fn(pre_act), target, reduction="sum")
    loss.backward()
    return pre_act.grad.numpy()


class TorchVAELoss(nn.Module):
    """Reference VAE loss (binary cross-entropy + KL divergence) and its gradients"""

    def __init__(self):
        super().__init__()

    def extract_grads(self, X, X_recon, t_mean, t_log_var):
        """
        Compute the batch-averaged VAE loss and return it together with its
        gradients with respect to `X_recon`, `t_mean`, and `t_log_var`.
        """
        eps = np.finfo(float).eps

        target = torchify(X, requires_grad=False)
        # clip the reconstruction away from exactly 0 and 1 before the BCE
        recon = torchify(np.clip(X_recon, eps, 1 - eps))
        mean = torchify(t_mean)
        log_var = torchify(t_log_var)

        # per-example reconstruction error
        bce = torch.sum(F.binary_cross_entropy(recon, target, reduction="none"), dim=1)

        # see Appendix B from VAE paper:
        # Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014
        # https://arxiv.org/abs/1312.6114
        # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
        kld = -0.5 * torch.sum(1 + log_var - mean.pow(2) - log_var.exp(), dim=1)

        loss = torch.mean(bce + kld)
        loss.backward()

        return {
            "loss": loss.detach().numpy(),
            "dX_recon": recon.grad.numpy(),
            "dt_mean": mean.grad.numpy(),
            "dt_log_var": log_var.grad.numpy(),
        }


class TorchWGANGPLoss(nn.Module):
    """Reference WGAN-GP critic / generator losses and their gradients"""

    def __init__(self, lambda_=10):
        super().__init__()
        self.lambda_ = torchify([lambda_])

    def forward(self, Y_real, Y_fake, gradInterp):
        """
        Compute the gradient penalty, critic loss, and generator loss, storing
        each (and the tensors they depend on) as attributes.
        """
        self.Y_real = torchify(Y_real)
        self.Y_fake = torchify(Y_fake)
        # separate copy of the fake scores for the generator loss, so that its
        # gradient is tracked independently of the critic's
        self.GY_fake = torchify(Y_fake.copy())
        self.gradInterp = torchify(gradInterp)

        # calc grad penalty; the manual 2-norm is checked against torch's
        builtin_norm = self.gradInterp.norm(2, dim=1)
        self.norm1 = torch.sqrt(torch.sum(self.gradInterp.pow(2), dim=1))
        assert torch.allclose(builtin_norm, self.norm1)

        sq_deviation = (self.norm1 - 1).pow(2)
        self.gpenalty = self.lambda_ * sq_deviation.mean()
        self.C_loss = self.Y_fake.mean() - self.Y_real.mean() + self.gpenalty
        self.G_loss = -self.GY_fake.mean()

    def extract_grads(self, Y_real, Y_fake, gradInterp):
        """
        Run `forward`, backpropagate both losses, and return the inputs,
        losses, and gradients as numpy arrays.
        """
        self.forward(Y_real, Y_fake, gradInterp)

        self.C_loss.backward()
        self.G_loss.backward()

        return {
            "Y_real": self.Y_real.detach().numpy(),
            "Y_fake": self.Y_fake.detach().numpy(),
            "gradInterp": self.gradInterp.detach().numpy(),
            "GP": self.gpenalty.detach().numpy(),
            "C_loss": self.C_loss.detach().numpy(),
            "G_loss": self.G_loss.detach().numpy(),
            "C_dY_real": self.Y_real.grad.numpy(),
            "C_dGradInterp": self.gradInterp.grad.numpy(),
            "C_dY_fake": self.Y_fake.grad.numpy(),
            "G_dY_fake": self.GY_fake.grad.numpy(),
        }


class TorchLinearActivation(nn.Module):
    """Identity ("linear") activation"""

    def __init__(self):
        super().__init__()

    @staticmethod
    def forward(input):
        """Return `input` unchanged"""
        return input

    @staticmethod
    def backward(grad_output):
        """Return a tensor of ones with the same shape as `grad_output`"""
        return torch.ones_like(grad_output)


class TorchBatchNormLayer(nn.Module):
    """
    Reference batch-norm layer built on ``nn.BatchNorm1d`` / ``nn.BatchNorm2d``.

    Parameters
    ----------
    n_in : int
        The number of features (channels) to normalize.
    params : dict
        Must contain the entries ``"scaler"`` and ``"intercept"``, used to
        initialize the layer's weight and bias, respectively.
    mode : {"1D", "2D"}
        Which torch batch-norm layer to wrap.
    momentum : float
        Momentum for the running statistics. Passed to torch as
        ``1 - momentum``. Default is 0.9.
    epsilon : float
        Passed to torch as ``eps``. Default is 1e-5.

    Raises
    ------
    ValueError
        If `mode` is neither "1D" nor "2D".
    """

    def __init__(self, n_in, params, mode, momentum=0.9, epsilon=1e-5):
        super(TorchBatchNormLayer, self).__init__()

        scaler = params["scaler"]
        intercept = params["intercept"]

        if mode == "1D":
            norm_layer = nn.BatchNorm1d
        elif mode == "2D":
            norm_layer = nn.BatchNorm2d
        else:
            raise ValueError("`mode` must be '1D' or '2D', but got {!r}".format(mode))

        self.layer1 = norm_layer(
            num_features=n_in, momentum=1 - momentum, eps=epsilon, affine=True
        )
        self.layer1.weight = nn.Parameter(torch.FloatTensor(scaler))
        self.layer1.bias = nn.Parameter(torch.FloatTensor(intercept))

    def forward(self, X):
        """
        Apply the layer to `X`, storing the input as ``self.X`` and the output
        (with its gradient retained) as ``self.Y``.
        """
        # (N, H, W, C) -> (N, C, H, W)
        if X.ndim == 4:
            X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3])

        if not isinstance(X, torch.Tensor):
            X = torchify(X)

        self.X = X
        self.Y = self.layer1(self.X)
        self.Y.retain_grad()

    def extract_grads(self, X, Y_true=None):
        """
        Run a forward / backward pass and return the layer's parameters,
        outputs, and gradients as numpy arrays.

        Parameters
        ----------
        X : numpy array or torch tensor
            The layer input. 4D numpy inputs are moved from (N, H, W, C) to
            (N, C, H, W) before being passed to torch.
        Y_true : numpy array or None
            If a numpy array, the loss is half the summed squared error
            between the layer output and `Y_true`; otherwise the loss is the
            sum of the layer output. Default is None.

        Returns
        -------
        grads : dict
            The loss, inputs, hyperparameters, running statistics, outputs,
            and gradients. 4D arrays are moved back to (N, H, W, C).
        """
        self.forward(X)

        has_target = isinstance(Y_true, np.ndarray)
        if has_target:
            # only image-shaped targets need the channels-first transposition;
            # 2D targets (as used in "1D" mode) are passed through unchanged
            if Y_true.ndim == 4:
                Y_true = np.moveaxis(Y_true, [0, 1, 2, 3], [0, -2, -1, -3])
            self.loss1 = (
                0.5 * F.mse_loss(self.Y, torchify(Y_true), reduction="sum").sum()
            )
        else:
            self.loss1 = self.Y.sum()

        self.loss1.backward()

        X_np = self.X.detach().numpy()
        Y_np = self.Y.detach().numpy()
        dX_np = self.X.grad.numpy()
        dY_np = self.Y.grad.numpy()

        if self.X.dim() == 4:
            # (N, C, H, W) -> (N, H, W, C)
            orig, X_swap = [0, 1, 2, 3], [0, -1, -3, -2]
            if has_target and Y_true.ndim == 4:
                Y_true = np.moveaxis(Y_true, orig, X_swap)
            X_np = np.moveaxis(X_np, orig, X_swap)
            Y_np = np.moveaxis(Y_np, orig, X_swap)
            dX_np = np.moveaxis(dX_np, orig, X_swap)
            dY_np = np.moveaxis(dY_np, orig, X_swap)

        grads = {
            "loss": self.loss1.detach().numpy(),
            "X": X_np,
            "momentum": 1 - self.layer1.momentum,
            "epsilon": self.layer1.eps,
            "intercept": self.layer1.bias.detach().numpy(),
            "scaler": self.layer1.weight.detach().numpy(),
            "running_mean": self.layer1.running_mean.detach().numpy(),
            "running_var": self.layer1.running_var.detach().numpy(),
            "y": Y_np,
            "dLdy": dY_np,
            "dLdIntercept": self.layer1.bias.grad.numpy(),
            "dLdScaler": self.layer1.weight.grad.numpy(),
            "dLdX": dX_np,
        }
        if has_target:
            grads["Y_true"] = Y_true
        return grads


class TorchLayerNormLayer(nn.Module):
    """
    PyTorch reference wrapping a single :class:`torch.nn.LayerNorm` whose
    affine parameters are initialized from numpy arrays.

    Parameters
    ----------
    feat_dims : int or sequence of int
        Passed to ``nn.LayerNorm`` as `normalized_shape`.
    params : dict
        Must contain ``"scaler"`` and ``"intercept"`` arrays. After the
        optional rearrangement below they must match the shape of the torch
        layer's weight and bias.
    mode : str
        When equal to ``"2D"`` the parameter arrays are rearranged from
        (H, W, C) to (C, H, W) before being copied into the layer.
    epsilon : float
        Passed to ``nn.LayerNorm`` as `eps`. Default is 1e-5.
    """

    def __init__(self, feat_dims, params, mode, epsilon=1e-5):
        super(TorchLayerNormLayer, self).__init__()

        self.layer1 = nn.LayerNorm(
            normalized_shape=feat_dims, eps=epsilon, elementwise_affine=True
        )

        scaler = params["scaler"]
        intercept = params["intercept"]

        if mode == "2D":
            # (H, W, C) -> (C, H, W)
            scaler = np.moveaxis(scaler, [0, 1, 2], [-2, -1, -3])
            intercept = np.moveaxis(intercept, [0, 1, 2], [-2, -1, -3])

        assert scaler.shape == self.layer1.weight.shape
        assert intercept.shape == self.layer1.bias.shape
        self.layer1.weight = nn.Parameter(torch.FloatTensor(scaler))
        self.layer1.bias = nn.Parameter(torch.FloatTensor(intercept))

    def forward(self, X):
        """
        Run `X` through the layer, caching the input on ``self.X`` and the
        output on ``self.Y``. Nothing is returned.

        A 4-D input is rearranged from (N, H, W, C) to (N, C, H, W) with
        ``np.moveaxis``; anything that is not a tensor is converted with
        `torchify`.
        """
        # (N, H, W, C) -> (N, C, H, W)
        if X.ndim == 4:
            X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3])

        if not isinstance(X, torch.Tensor):
            X = torchify(X)

        self.X = X
        self.Y = self.layer1(self.X)
        self.Y.retain_grad()

    def extract_grads(self, X, Y_true=None):
        """
        Run a forward and backward pass and collect the results as numpy arrays.

        Parameters
        ----------
        X : array-like or :py:class:`torch.Tensor`
            Layer input, passed straight to `forward`.
        Y_true : :py:class:`ndarray <numpy.ndarray>` or None
            Optional regression target. When it is an ndarray the loss is
            ``0.5 * sum((Y - Y_true) ** 2)``; otherwise the loss is the sum of
            the layer output. Default is None.

        Returns
        -------
        grads : dict
            The loss, the layer input/output, the affine parameters, and the
            gradients of the loss with respect to each of them. When the
            cached input is 4-D, activations are moved back from (N, C, H, W)
            to (N, H, W, C) and parameters from (C, H, W) to (H, W, C).
            ``"Y_true"`` is included (in the layout the caller supplied) only
            when a target was given.
        """
        self.forward(X)

        has_target = isinstance(Y_true, np.ndarray)
        if has_target:
            # Only a 4-D target needs the (N, H, W, C) -> (N, C, H, W) swap
            # that `forward` applies to 4-D inputs. A four-axis `np.moveaxis`
            # raises on lower-rank arrays, so it must be guarded.
            target = Y_true
            if Y_true.ndim == 4:
                target = np.moveaxis(Y_true, [0, 1, 2, 3], [0, -2, -1, -3])

            # `reduction="sum"` is the supported spelling of the deprecated
            # `size_average=False`; the result is already a scalar.
            self.loss1 = 0.5 * F.mse_loss(
                self.Y, torchify(target), reduction="sum"
            )
        else:
            self.loss1 = self.Y.sum()

        self.loss1.backward()

        X_np = self.X.detach().numpy()
        Y_np = self.Y.detach().numpy()
        dX_np = self.X.grad.numpy()
        dY_np = self.Y.grad.numpy()
        intercept_np = self.layer1.bias.detach().numpy()
        scaler_np = self.layer1.weight.detach().numpy()
        dIntercept_np = self.layer1.bias.grad.numpy()
        dScaler_np = self.layer1.weight.grad.numpy()

        if self.X.dim() == 4:
            # activations: (N, C, H, W) -> (N, H, W, C)
            orig, X_swap = [0, 1, 2, 3], [0, -1, -3, -2]
            # parameters: (C, H, W) -> (H, W, C)
            orig_p, p_swap = [0, 1, 2], [-1, -3, -2]
            X_np = np.moveaxis(X_np, orig, X_swap)
            Y_np = np.moveaxis(Y_np, orig, X_swap)
            dX_np = np.moveaxis(dX_np, orig, X_swap)
            dY_np = np.moveaxis(dY_np, orig, X_swap)
            scaler_np = np.moveaxis(scaler_np, orig_p, p_swap)
            intercept_np = np.moveaxis(intercept_np, orig_p, p_swap)
            dScaler_np = np.moveaxis(dScaler_np, orig_p, p_swap)
            dIntercept_np = np.moveaxis(dIntercept_np, orig_p, p_swap)

        grads = {
            "loss": self.loss1.detach().numpy(),
            "X": X_np,
            "epsilon": self.layer1.eps,
            "intercept": intercept_np,
            "scaler": scaler_np,
            "y": Y_np,
            "dLdy": dY_np,
            "dLdIntercept": dIntercept_np,
            "dLdScaler": dScaler_np,
            "dLdX": dX_np,
        }
        if has_target:
            # The caller's array is never rearranged in place, so it is
            # already in the layout the caller supplied.
            grads["Y_true"] = Y_true
        return grads


class TorchAddLayer(nn.Module):
    """
    PyTorch reference that sums a sequence of inputs elementwise and applies
    `act_fn` to the result.

    Parameters
    ----------
    act_fn : callable
        Applied to the elementwise sum to produce the layer output.
    **kwargs
        Accepted and ignored.
    """

    def __init__(self, act_fn, **kwargs):
        super(TorchAddLayer, self).__init__()
        self.act_fn = act_fn

    def forward(self, Xs):
        """
        Compute ``act_fn(Xs[0] + Xs[1] + ...)``.

        Parameters
        ----------
        Xs : sequence of array-like or :py:class:`torch.Tensor`
            Inputs to add. Non-tensor entries are converted with `torchify`.
            Tensor entries are used as-is; `extract_grads` reads their
            ``.grad``, so they are expected to require gradients.

        Returns
        -------
        Y : :py:class:`torch.Tensor`
            The activated sum. The converted inputs, the running sum and the
            output are also cached on ``self.Xs``, ``self.sum`` and ``self.Y``.
        """
        self.Xs = []

        first = Xs[0]
        if not isinstance(first, torch.Tensor):
            # Copy only array inputs: tensors have no `.copy()`, and the
            # accumulator below is built from a clone anyway.
            first = torchify(first.copy())

        # Accumulate into a clone so the in-place adds never touch `first`.
        total = first.clone()
        first.retain_grad()
        self.Xs.append(first)

        for x in Xs[1:]:
            if not isinstance(x, torch.Tensor):
                x = torchify(x)

            x.retain_grad()
            self.Xs.append(x)
            total += x

        self.sum = total
        self.sum.retain_grad()
        self.Y = self.act_fn(self.sum)
        self.Y.retain_grad()
        return self.Y

    def extract_grads(self, X):
        """
        Run a forward pass on `X`, backpropagate ``Y.sum()``, and return the
        intermediate values and gradients as numpy arrays.

        Returns
        -------
        grads : dict
            ``"Xs"`` (the argument as given), ``"Sum"``, ``"Y"``, ``"dLdY"``,
            ``"dLdSum"`` and one ``"dLdX<i>"`` entry per input, numbered from 1.
        """
        self.forward(X)
        self.loss = self.Y.sum()
        self.loss.backward()
        grads = {
            "Xs": X,
            "Sum": self.sum.detach().numpy(),
            "Y": self.Y.detach().numpy(),
            "dLdY": self.Y.grad.numpy(),
            "dLdSum": self.sum.grad.numpy(),
        }
        grads.update(
            {"dLdX{}".format(i + 1): xi.grad.numpy() for i, xi in enumerate(self.Xs)}
        )
        return grads


class TorchMultiplyLayer(nn.Module):
    """
    PyTorch reference that multiplies a sequence of inputs elementwise and
    applies `act_fn` to the result.

    Parameters
    ----------
    act_fn : callable
        Applied to the elementwise product to produce the layer output.
    **kwargs
        Accepted and ignored.
    """

    def __init__(self, act_fn, **kwargs):
        super(TorchMultiplyLayer, self).__init__()
        self.act_fn = act_fn

    def forward(self, Xs):
        """
        Compute ``act_fn(Xs[0] * Xs[1] * ...)``.

        Parameters
        ----------
        Xs : sequence of array-like or :py:class:`torch.Tensor`
            Inputs to multiply. Non-tensor entries are converted with
            `torchify`. Tensor entries are used as-is; `extract_grads` reads
            their ``.grad``, so they are expected to require gradients.

        Returns
        -------
        Y : :py:class:`torch.Tensor`
            The activated product. The converted inputs, the running product
            and the output are also cached on ``self.Xs``, ``self.prod`` and
            ``self.Y``.
        """
        self.Xs = []

        first = Xs[0]
        if not isinstance(first, torch.Tensor):
            # Copy only array inputs: tensors have no `.copy()`, and the
            # accumulator below is built from a clone anyway.
            first = torchify(first.copy())

        # Accumulate into a clone so the in-place multiplies never touch `first`.
        running = first.clone()
        first.retain_grad()
        self.Xs.append(first)

        for x in Xs[1:]:
            if not isinstance(x, torch.Tensor):
                x = torchify(x)

            x.retain_grad()
            self.Xs.append(x)
            running *= x

        self.prod = running
        self.prod.retain_grad()
        self.Y = self.act_fn(self.prod)
        self.Y.retain_grad()
        return self.Y

    def extract_grads(self, X):
        """
        Run a forward pass on `X`, backpropagate ``Y.sum()``, and return the
        intermediate values and gradients as numpy arrays.

        Returns
        -------
        grads : dict
            ``"Xs"`` (the argument as given), ``"Prod"``, ``"Y"``, ``"dLdY"``,
            ``"dLdProd"`` and one ``"dLdX<i>"`` entry per input, numbered
            from 1.
        """
        self.forward(X)
        self.loss = self.Y.sum()
        self.loss.backward()
        grads = {
            "Xs": X,
            "Prod": self.prod.detach().numpy(),
            "Y": self.Y.detach().numpy(),
            "dLdY": self.Y.grad.numpy(),
            "dLdProd": self.prod.grad.numpy(),
        }
        grads.update(
            {"dLdX{}".format(i + 1): xi.grad.numpy() for i, xi in enumerate(self.Xs)}
        )
        return grads


class TorchSkipConnectionIdentity(nn.Module):
    """
    PyTorch reference for an identity skip-connection block::

        X -> conv1 -> act_fn -> batchnorm1 -> conv2 -> batchnorm2 -> (+ X) -> act_fn

    Parameters
    ----------
    act_fn : callable
        Activation applied after `conv1` and after the residual addition.
    pad1, pad2 : int or tuple
        Passed as `padding` to the first and second ``nn.Conv2d``.
    params : dict
        ``params["components"]`` holds, for ``"conv1"``/``"conv2"``, a kernel
        ``"W"`` laid out as (f[0], f[1], n_in, n_out) and a bias ``"b"``; and,
        for ``"batchnorm1"``/``"batchnorm2"``, ``"scaler"`` and ``"intercept"``.
    hparams : dict
        Must provide ``"in_ch"``, ``"out_ch"``, ``"kernel_shape1"``,
        ``"kernel_shape2"``, ``"stride1"`` and ``"stride2"``.
    momentum : float
        The torch batchnorm layers are built with ``1 - momentum``.
        Default is 0.9.
    epsilon : float
        Passed to the batchnorm layers as `eps`. Default is 1e-5.
    """

    def __init__(self, act_fn, pad1, pad2, params, hparams, momentum=0.9, epsilon=1e-5):
        super(TorchSkipConnectionIdentity, self).__init__()

        self.conv1 = nn.Conv2d(
            hparams["in_ch"],
            hparams["out_ch"],
            hparams["kernel_shape1"],
            padding=pad1,
            stride=hparams["stride1"],
            bias=True,
        )

        self.act_fn = act_fn

        self.batchnorm1 = nn.BatchNorm2d(
            num_features=hparams["out_ch"],
            momentum=1 - momentum,
            eps=epsilon,
            affine=True,
        )

        self.conv2 = nn.Conv2d(
            hparams["out_ch"],
            hparams["out_ch"],
            hparams["kernel_shape2"],
            padding=pad2,
            stride=hparams["stride2"],
            bias=True,
        )

        self.batchnorm2 = nn.BatchNorm2d(
            num_features=hparams["out_ch"],
            momentum=1 - momentum,
            eps=epsilon,
            affine=True,
        )

        orig, W_swap = [0, 1, 2, 3], [-2, -1, -3, -4]
        # (f[0], f[1], n_in, n_out) -> (n_out, n_in, f[0], f[1])
        W = params["components"]["conv1"]["W"]
        b = params["components"]["conv1"]["b"]
        W = np.moveaxis(W, orig, W_swap)
        assert self.conv1.weight.shape == W.shape
        assert self.conv1.bias.shape == b.flatten().shape
        self.conv1.weight = nn.Parameter(torch.FloatTensor(W))
        self.conv1.bias = nn.Parameter(torch.FloatTensor(b.flatten()))

        scaler = params["components"]["batchnorm1"]["scaler"]
        intercept = params["components"]["batchnorm1"]["intercept"]
        self.batchnorm1.weight = nn.Parameter(torch.FloatTensor(scaler))
        self.batchnorm1.bias = nn.Parameter(torch.FloatTensor(intercept))

        # (f[0], f[1], n_in, n_out) -> (n_out, n_in, f[0], f[1])
        W = params["components"]["conv2"]["W"]
        b = params["components"]["conv2"]["b"]
        W = np.moveaxis(W, orig, W_swap)
        assert self.conv2.weight.shape == W.shape
        assert self.conv2.bias.shape == b.flatten().shape
        self.conv2.weight = nn.Parameter(torch.FloatTensor(W))
        self.conv2.bias = nn.Parameter(torch.FloatTensor(b.flatten()))

        scaler = params["components"]["batchnorm2"]["scaler"]
        intercept = params["components"]["batchnorm2"]["intercept"]
        self.batchnorm2.weight = nn.Parameter(torch.FloatTensor(scaler))
        self.batchnorm2.bias = nn.Parameter(torch.FloatTensor(intercept))

    def forward(self, X):
        """
        Run the block on `X`, caching every intermediate tensor on ``self``.

        A non-tensor `X` is rearranged from (N, H, W, C) to (N, C, H, W) and
        converted with `torchify`; a tensor is used as given. ``retain_grad``
        is called on every intermediate so its gradient can be read after a
        backward pass. Nothing is returned; the output is ``self.Y``.
        """
        if not isinstance(X, torch.Tensor):
            # (N, H, W, C) -> (N, C, H, W)
            X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3])
            X = torchify(X)

        self.X = X
        self.X.retain_grad()

        self.conv1_out = self.conv1(self.X)
        self.conv1_out.retain_grad()

        self.act_fn1_out = self.act_fn(self.conv1_out)
        self.act_fn1_out.retain_grad()

        self.batchnorm1_out = self.batchnorm1(self.act_fn1_out)
        self.batchnorm1_out.retain_grad()

        self.conv2_out = self.conv2(self.batchnorm1_out)
        self.conv2_out.retain_grad()

        self.batchnorm2_out = self.batchnorm2(self.conv2_out)
        self.batchnorm2_out.retain_grad()

        # residual connection: add the block input back in
        self.layer3_in = self.batchnorm2_out + self.X
        self.layer3_in.retain_grad()

        self.Y = self.act_fn(self.layer3_in)
        self.Y.retain_grad()

    def extract_grads(self, X):
        """
        Run a forward pass on `X`, backpropagate ``Y.sum()``, and return the
        parameters, intermediate activations and gradients as numpy arrays.

        Activations and their gradients are returned as (N, H, W, C), conv
        kernels as (f[0], f[1], n_in, n_out), and conv biases reshaped to
        (1, 1, 1, n_out).
        """
        self.forward(X)
        self.loss = self.Y.sum()
        self.loss.backward()

        orig, X_swap, W_swap = [0, 1, 2, 3], [0, -1, -3, -2], [-1, -2, -4, -3]

        def nhwc(t):
            # (N, C, H, W) -> (N, H, W, C)
            return np.moveaxis(t.detach().numpy(), orig, X_swap)

        def kernel(t):
            # (n_out, n_in, f[0], f[1]) -> (f[0], f[1], n_in, n_out)
            return np.moveaxis(t.detach().numpy(), orig, W_swap)

        def bias(t):
            return t.detach().numpy().reshape(1, 1, 1, -1)

        grads = {
            # layer parameters
            "conv1_W": kernel(self.conv1.weight),
            "conv1_b": bias(self.conv1.bias),
            "bn1_intercept": self.batchnorm1.bias.detach().numpy(),
            "bn1_scaler": self.batchnorm1.weight.detach().numpy(),
            "bn1_running_mean": self.batchnorm1.running_mean.detach().numpy(),
            "bn1_running_var": self.batchnorm1.running_var.detach().numpy(),
            "conv2_W": kernel(self.conv2.weight),
            "conv2_b": bias(self.conv2.bias),
            "bn2_intercept": self.batchnorm2.bias.detach().numpy(),
            "bn2_scaler": self.batchnorm2.weight.detach().numpy(),
            "bn2_running_mean": self.batchnorm2.running_mean.detach().numpy(),
            "bn2_running_var": self.batchnorm2.running_var.detach().numpy(),
            # layer inputs/outputs (forward step)
            "X": nhwc(self.X),
            "conv1_out": nhwc(self.conv1_out),
            "act1_out": nhwc(self.act_fn1_out),
            "bn1_out": nhwc(self.batchnorm1_out),
            "conv2_out": nhwc(self.conv2_out),
            "bn2_out": nhwc(self.batchnorm2_out),
            "add_out": nhwc(self.layer3_in),
            "Y": nhwc(self.Y),
            # layer gradients (backward step)
            "dLdY": nhwc(self.Y.grad),
            "dLdAdd": nhwc(self.layer3_in.grad),
            "dLdBn2_out": nhwc(self.batchnorm2_out.grad),
            "dLdConv2_out": nhwc(self.conv2_out.grad),
            "dLdBn1_out": nhwc(self.batchnorm1_out.grad),
            "dLdActFn1_out": nhwc(self.act_fn1_out.grad),
            # gradient w.r.t. the conv1 output itself, i.e. upstream of the
            # activation (not the activation output's gradient)
            "dLdConv1_out": nhwc(self.conv1_out.grad),
            "dLdX": nhwc(self.X.grad),
            # layer parameter gradients (backward step)
            "dLdBn2_intercept": self.batchnorm2.bias.grad.numpy(),
            "dLdBn2_scaler": self.batchnorm2.weight.grad.numpy(),
            "dLdConv2_W": kernel(self.conv2.weight.grad),
            "dLdConv2_b": bias(self.conv2.bias.grad),
            "dLdBn1_intercept": self.batchnorm1.bias.grad.numpy(),
            "dLdBn1_scaler": self.batchnorm1.weight.grad.numpy(),
            "dLdConv1_W": kernel(self.conv1.weight.grad),
            "dLdConv1_b": bias(self.conv1.bias.grad),
        }
        return grads


class TorchCausalConv1d(torch.nn.Conv1d):
    """https://github.com/pytorch/pytorch/issues/1333

    A ``Conv1d`` padded by ``(kernel_size - 1) * dilation`` whose forward pass
    drops that many trailing output positions.

    NB: this only ensures that the convolution output length is the same as
    the input length IFF stride = 1. Otherwise, in/out lengths will differ.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        dilation=1,
        groups=1,
        bias=True,
    ):
        # Remember how much padding was requested so `forward` can trim the
        # same number of positions off the end of the output.
        left_context = dilation * (kernel_size - 1)
        self.__padding = left_context

        conv_kwargs = dict(
            kernel_size=kernel_size,
            stride=stride,
            padding=left_context,
            dilation=dilation,
            groups=groups,
            bias=bias,
        )
        super(TorchCausalConv1d, self).__init__(
            in_channels, out_channels, **conv_kwargs
        )

    def forward(self, input):
        """Apply the padded convolution and slice off the trailing padding."""
        padded_out = super(TorchCausalConv1d, self).forward(input)
        surplus = self.__padding
        if surplus == 0:
            # a `[: -0]` slice would be empty, so return the output untouched
            return padded_out
        return padded_out[:, :, :-surplus]


class TorchWavenetModule(nn.Module):
    """
    PyTorch reference for a gated residual block::

        X_main -> conv_dilation -> tanh * sigmoid -> conv_1x1 -+-> (+ X_main) -> Y_main
                                                               +-> (+ X_skip) -> Y_skip

    Parameters
    ----------
    params : dict
        ``params["components"]`` holds, for ``"conv_dilation"`` and
        ``"conv_1x1"``, a kernel ``"W"`` laid out as (f[0], n_in, n_out) and a
        bias ``"b"``.
    hparams : dict
        ``hparams["components"]`` holds, for the same two keys, ``"in_ch"``,
        ``"out_ch"``, ``"kernel_width"``, ``"stride"`` and ``"dilation"``.
        The torch layers are built with ``dilation + 1``.
    conv_1x1_pad : int or tuple
        Passed as `padding` to the 1x1 convolution.
    """

    def __init__(self, params, hparams, conv_1x1_pad):
        super(TorchWavenetModule, self).__init__()
        self.conv_dilation = TorchCausalConv1d(
            in_channels=hparams["components"]["conv_dilation"]["in_ch"],
            out_channels=hparams["components"]["conv_dilation"]["out_ch"],
            kernel_size=hparams["components"]["conv_dilation"]["kernel_width"],
            stride=hparams["components"]["conv_dilation"]["stride"],
            dilation=hparams["components"]["conv_dilation"]["dilation"] + 1,
            bias=True,
        )

        self.conv_1x1 = nn.Conv1d(
            in_channels=hparams["components"]["conv_1x1"]["in_ch"],
            out_channels=hparams["components"]["conv_1x1"]["out_ch"],
            kernel_size=hparams["components"]["conv_1x1"]["kernel_width"],
            stride=hparams["components"]["conv_1x1"]["stride"],
            padding=conv_1x1_pad,
            dilation=hparams["components"]["conv_1x1"]["dilation"] + 1,
            bias=True,
        )

        W = params["components"]["conv_dilation"]["W"]
        b = params["components"]["conv_dilation"]["b"]
        # (f[0], n_in, n_out) -> (n_out, n_in, f[0])
        W = np.moveaxis(W, [0, 1, 2], [-1, -2, -3])
        # Validate against the layer's own shapes BEFORE overwriting them;
        # checking afterwards would compare the new parameter with itself.
        assert self.conv_dilation.weight.shape == W.shape
        assert self.conv_dilation.bias.shape == b.flatten().shape
        self.conv_dilation.weight = nn.Parameter(torch.FloatTensor(W))
        self.conv_dilation.bias = nn.Parameter(torch.FloatTensor(b.flatten()))

        W = params["components"]["conv_1x1"]["W"]
        b = params["components"]["conv_1x1"]["b"]
        # (f[0], n_in, n_out) -> (n_out, n_in, f[0])
        W = np.moveaxis(W, [0, 1, 2], [-1, -2, -3])
        assert self.conv_1x1.weight.shape == W.shape
        assert self.conv_1x1.bias.shape == b.flatten().shape
        self.conv_1x1.weight = nn.Parameter(torch.FloatTensor(W))
        self.conv_1x1.bias = nn.Parameter(torch.FloatTensor(b.flatten()))

    def forward(self, X_main, X_skip):
        """
        Run the block, caching every intermediate tensor on ``self``.

        Parameters
        ----------
        X_main : array-like
            Main-path input laid out as (N, W, C); it is rearranged to
            (N, C, W) and converted with `torchify`.
        X_skip : array-like or None
            Skip-path input in the same layout. When None, a zero tensor
            shaped like the 1x1 convolution output is used instead.

        Notes
        -----
        Nothing is returned; the outputs are ``self.Y_main`` and
        ``self.Y_skip``.
        """
        # (N, W, C) -> (N, C, W)
        self.X_main = np.moveaxis(X_main, [0, 1, 2], [0, -1, -2])
        self.X_main = torchify(self.X_main)
        self.X_main.retain_grad()

        self.conv_dilation_out = self.conv_dilation(self.X_main)
        self.conv_dilation_out.retain_grad()

        self.tanh_out = torch.tanh(self.conv_dilation_out)
        self.sigm_out = torch.sigmoid(self.conv_dilation_out)

        self.tanh_out.retain_grad()
        self.sigm_out.retain_grad()

        self.multiply_gate_out = self.tanh_out * self.sigm_out
        self.multiply_gate_out.retain_grad()

        self.conv_1x1_out = self.conv_1x1(self.multiply_gate_out)
        self.conv_1x1_out.retain_grad()

        if X_skip is not None:
            self.X_skip = torchify(np.moveaxis(X_skip, [0, 1, 2], [0, -1, -2]))
        else:
            # The placeholder must require grad: `retain_grad` below and the
            # `X_skip.grad` lookup in `extract_grads` both depend on it.
            self.X_skip = torch.zeros_like(self.conv_1x1_out, requires_grad=True)
        self.X_skip.retain_grad()

        self.Y_skip = self.X_skip + self.conv_1x1_out
        self.Y_main = self.X_main + self.conv_1x1_out

        self.Y_skip.retain_grad()
        self.Y_main.retain_grad()

    def extract_grads(self, X_main, X_skip):
        """
        Run a forward pass, backpropagate ``(Y_skip + Y_main).sum()``, and
        return the parameters, intermediate activations and gradients as
        numpy arrays.

        Activations and their gradients are returned as (N, W, C), kernels as
        (f[0], n_in, n_out), and biases reshaped to (1, 1, n_out).
        """
        self.forward(X_main, X_skip)
        self.loss = (self.Y_skip + self.Y_main).sum()
        self.loss.backward()

        # W (theirs): (n_out, n_in, f[0]) -> W (mine): (f[0], n_in, n_out)
        # X (theirs): (N, C, W)              -> X (mine): (N, W, C)
        # Y (theirs): (N, C, W)              -> Y (mine): (N, W, C)
        orig, X_swap, W_swap = [0, 1, 2], [0, -1, -2], [-1, -2, -3]
        grads = {
            "X_main": np.moveaxis(self.X_main.detach().numpy(), orig, X_swap),
            "X_skip": np.moveaxis(self.X_skip.detach().numpy(), orig, X_swap),
            "conv_dilation_W": np.moveaxis(
                self.conv_dilation.weight.detach().numpy(), orig, W_swap
            ),
            "conv_dilation_b": self.conv_dilation.bias.detach()
            .numpy()
            .reshape(1, 1, -1),
            "conv_1x1_W": np.moveaxis(
                self.conv_1x1.weight.detach().numpy(), orig, W_swap
            ),
            "conv_1x1_b": self.conv_1x1.bias.detach().numpy().reshape(1, 1, -1),
            "conv_dilation_out": np.moveaxis(
                self.conv_dilation_out.detach().numpy(), orig, X_swap
            ),
            "tanh_out": np.moveaxis(self.tanh_out.detach().numpy(), orig, X_swap),
            "sigm_out": np.moveaxis(self.sigm_out.detach().numpy(), orig, X_swap),
            "multiply_gate_out": np.moveaxis(
                self.multiply_gate_out.detach().numpy(), orig, X_swap
            ),
            "conv_1x1_out": np.moveaxis(
                self.conv_1x1_out.detach().numpy(), orig, X_swap
            ),
            "Y_main": np.moveaxis(self.Y_main.detach().numpy(), orig, X_swap),
            "Y_skip": np.moveaxis(self.Y_skip.detach().numpy(), orig, X_swap),
            "dLdY_skip": np.moveaxis(self.Y_skip.grad.numpy(), orig, X_swap),
            "dLdY_main": np.moveaxis(self.Y_main.grad.numpy(), orig, X_swap),
            "dLdConv_1x1_out": np.moveaxis(
                self.conv_1x1_out.grad.numpy(), orig, X_swap
            ),
            "dLdConv_1x1_W": np.moveaxis(
                self.conv_1x1.weight.grad.numpy(), orig, W_swap
            ),
            "dLdConv_1x1_b": self.conv_1x1.bias.grad.numpy().reshape(1, 1, -1),
            "dLdMultiply_out": np.moveaxis(
                self.multiply_gate_out.grad.numpy(), orig, X_swap
            ),
            "dLdTanh_out": np.moveaxis(self.tanh_out.grad.numpy(), orig, X_swap),
            "dLdSigm_out": np.moveaxis(self.sigm_out.grad.numpy(), orig, X_swap),
            "dLdConv_dilation_out": np.moveaxis(
                self.conv_dilation_out.grad.numpy(), orig, X_swap
            ),
            "dLdConv_dilation_W": np.moveaxis(
                self.conv_dilation.weight.grad.numpy(), orig, W_swap
            ),
            "dLdConv_dilation_b": self.conv_dilation.bias.grad.numpy().reshape(
                1, 1, -1
            ),
            "dLdX_main": np.moveaxis(self.X_main.grad.numpy(), orig, X_swap),
            "dLdX_skip": np.moveaxis(self.X_skip.grad.numpy(), orig, X_swap),
        }

        return grads


class TorchSkipConnectionConv(nn.Module):
    """
    PyTorch reference for a convolutional skip-connection block::

        X -> conv1 -> act_fn -> batchnorm1 -> conv2 -> batchnorm2 -+
        X -> conv_skip -> batchnorm_skip --------------------------+-> (+) -> act_fn

    Parameters
    ----------
    act_fn : callable
        Activation applied after `conv1` and after the addition.
    pad1, pad2, pad_skip : int or tuple
        Passed as `padding` to `conv1`, `conv2` and `conv_skip`.
    params : dict
        ``params["components"]`` holds, for ``"conv1"``/``"conv2"``/
        ``"conv_skip"``, a kernel ``"W"`` laid out as (f[0], f[1], n_in, n_out)
        and a bias ``"b"``; and, for ``"batchnorm1"``/``"batchnorm2"``/
        ``"batchnorm_skip"``, ``"scaler"`` and ``"intercept"``.
    hparams : dict
        Must provide ``"in_ch"``, ``"out_ch1"``, ``"out_ch2"``,
        ``"kernel_shape1"``, ``"kernel_shape2"``, ``"kernel_shape_skip"``,
        ``"stride1"``, ``"stride2"`` and ``"stride_skip"``.
    momentum : float
        The torch batchnorm layers are built with ``1 - momentum``.
        Default is 0.9.
    epsilon : float
        Passed to the batchnorm layers as `eps`. Default is 1e-5.
    """

    def __init__(
        self, act_fn, pad1, pad2, pad_skip, params, hparams, momentum=0.9, epsilon=1e-5
    ):
        super(TorchSkipConnectionConv, self).__init__()

        self.conv1 = nn.Conv2d(
            hparams["in_ch"],
            hparams["out_ch1"],
            hparams["kernel_shape1"],
            padding=pad1,
            stride=hparams["stride1"],
            bias=True,
        )

        self.act_fn = act_fn

        self.batchnorm1 = nn.BatchNorm2d(
            num_features=hparams["out_ch1"],
            momentum=1 - momentum,
            eps=epsilon,
            affine=True,
        )

        self.conv2 = nn.Conv2d(
            hparams["out_ch1"],
            hparams["out_ch2"],
            hparams["kernel_shape2"],
            padding=pad2,
            stride=hparams["stride2"],
            bias=True,
        )

        self.batchnorm2 = nn.BatchNorm2d(
            num_features=hparams["out_ch2"],
            momentum=1 - momentum,
            eps=epsilon,
            affine=True,
        )

        self.conv_skip = nn.Conv2d(
            hparams["in_ch"],
            hparams["out_ch2"],
            hparams["kernel_shape_skip"],
            padding=pad_skip,
            stride=hparams["stride_skip"],
            bias=True,
        )

        self.batchnorm_skip = nn.BatchNorm2d(
            num_features=hparams["out_ch2"],
            momentum=1 - momentum,
            eps=epsilon,
            affine=True,
        )

        orig, W_swap = [0, 1, 2, 3], [-2, -1, -3, -4]
        # (f[0], f[1], n_in, n_out) -> (n_out, n_in, f[0], f[1])
        W = params["components"]["conv1"]["W"]
        b = params["components"]["conv1"]["b"]
        W = np.moveaxis(W, orig, W_swap)
        assert self.conv1.weight.shape == W.shape
        assert self.conv1.bias.shape == b.flatten().shape
        self.conv1.weight = nn.Parameter(torch.FloatTensor(W))
        self.conv1.bias = nn.Parameter(torch.FloatTensor(b.flatten()))

        scaler = params["components"]["batchnorm1"]["scaler"]
        intercept = params["components"]["batchnorm1"]["intercept"]
        self.batchnorm1.weight = nn.Parameter(torch.FloatTensor(scaler))
        self.batchnorm1.bias = nn.Parameter(torch.FloatTensor(intercept))

        # (f[0], f[1], n_in, n_out) -> (n_out, n_in, f[0], f[1])
        W = params["components"]["conv2"]["W"]
        b = params["components"]["conv2"]["b"]
        W = np.moveaxis(W, orig, W_swap)
        assert self.conv2.weight.shape == W.shape
        assert self.conv2.bias.shape == b.flatten().shape
        self.conv2.weight = nn.Parameter(torch.FloatTensor(W))
        self.conv2.bias = nn.Parameter(torch.FloatTensor(b.flatten()))

        scaler = params["components"]["batchnorm2"]["scaler"]
        intercept = params["components"]["batchnorm2"]["intercept"]
        self.batchnorm2.weight = nn.Parameter(torch.FloatTensor(scaler))
        self.batchnorm2.bias = nn.Parameter(torch.FloatTensor(intercept))

        # (f[0], f[1], n_in, n_out) -> (n_out, n_in, f[0], f[1])
        W = params["components"]["conv_skip"]["W"]
        b = params["components"]["conv_skip"]["b"]
        W = np.moveaxis(W, orig, W_swap)
        assert self.conv_skip.weight.shape == W.shape
        assert self.conv_skip.bias.shape == b.flatten().shape
        self.conv_skip.weight = nn.Parameter(torch.FloatTensor(W))
        self.conv_skip.bias = nn.Parameter(torch.FloatTensor(b.flatten()))

        scaler = params["components"]["batchnorm_skip"]["scaler"]
        intercept = params["components"]["batchnorm_skip"]["intercept"]
        self.batchnorm_skip.weight = nn.Parameter(torch.FloatTensor(scaler))
        self.batchnorm_skip.bias = nn.Parameter(torch.FloatTensor(intercept))

    def forward(self, X):
        """
        Run the block on `X`, caching every intermediate tensor on ``self``.

        A non-tensor `X` is rearranged from (N, H, W, C) to (N, C, H, W) and
        converted with `torchify`; a tensor is used as given. ``retain_grad``
        is called on every intermediate so its gradient can be read after a
        backward pass. Nothing is returned; the output is ``self.Y``.
        """
        if not isinstance(X, torch.Tensor):
            # (N, H, W, C) -> (N, C, H, W)
            X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3])
            X = torchify(X)

        self.X = X
        self.X.retain_grad()

        # main path
        self.conv1_out = self.conv1(self.X)
        self.conv1_out.retain_grad()

        self.act_fn1_out = self.act_fn(self.conv1_out)
        self.act_fn1_out.retain_grad()

        self.batchnorm1_out = self.batchnorm1(self.act_fn1_out)
        self.batchnorm1_out.retain_grad()

        self.conv2_out = self.conv2(self.batchnorm1_out)
        self.conv2_out.retain_grad()

        self.batchnorm2_out = self.batchnorm2(self.conv2_out)
        self.batchnorm2_out.retain_grad()

        # skip path
        self.c_skip_out = self.conv_skip(self.X)
        self.c_skip_out.retain_grad()

        self.bn_skip_out = self.batchnorm_skip(self.c_skip_out)
        self.bn_skip_out.retain_grad()

        self.layer3_in = self.batchnorm2_out + self.bn_skip_out
        self.layer3_in.retain_grad()

        self.Y = self.act_fn(self.layer3_in)
        self.Y.retain_grad()

    def extract_grads(self, X):
        """
        Run a forward pass on `X`, backpropagate ``Y.sum()``, and return the
        parameters, intermediate activations and gradients as numpy arrays.

        Activations and their gradients are returned as (N, H, W, C), conv
        kernels as (f[0], f[1], n_in, n_out), and conv biases reshaped to
        (1, 1, 1, n_out).
        """
        self.forward(X)
        self.loss = self.Y.sum()
        self.loss.backward()

        orig, X_swap, W_swap = [0, 1, 2, 3], [0, -1, -3, -2], [-1, -2, -4, -3]

        def nhwc(t):
            # (N, C, H, W) -> (N, H, W, C)
            return np.moveaxis(t.detach().numpy(), orig, X_swap)

        def kernel(t):
            # (n_out, n_in, f[0], f[1]) -> (f[0], f[1], n_in, n_out)
            return np.moveaxis(t.detach().numpy(), orig, W_swap)

        def bias(t):
            return t.detach().numpy().reshape(1, 1, 1, -1)

        grads = {
            # layer parameters
            "conv1_W": kernel(self.conv1.weight),
            "conv1_b": bias(self.conv1.bias),
            "bn1_intercept": self.batchnorm1.bias.detach().numpy(),
            "bn1_scaler": self.batchnorm1.weight.detach().numpy(),
            "bn1_running_mean": self.batchnorm1.running_mean.detach().numpy(),
            "bn1_running_var": self.batchnorm1.running_var.detach().numpy(),
            "conv2_W": kernel(self.conv2.weight),
            "conv2_b": bias(self.conv2.bias),
            "bn2_intercept": self.batchnorm2.bias.detach().numpy(),
            "bn2_scaler": self.batchnorm2.weight.detach().numpy(),
            "bn2_running_mean": self.batchnorm2.running_mean.detach().numpy(),
            "bn2_running_var": self.batchnorm2.running_var.detach().numpy(),
            "conv_skip_W": kernel(self.conv_skip.weight),
            "conv_skip_b": bias(self.conv_skip.bias),
            "bn_skip_intercept": self.batchnorm_skip.bias.detach().numpy(),
            "bn_skip_scaler": self.batchnorm_skip.weight.detach().numpy(),
            "bn_skip_running_mean": self.batchnorm_skip.running_mean.detach().numpy(),
            "bn_skip_running_var": self.batchnorm_skip.running_var.detach().numpy(),
            # layer inputs/outputs (forward step)
            "X": nhwc(self.X),
            "conv1_out": nhwc(self.conv1_out),
            "act1_out": nhwc(self.act_fn1_out),
            "bn1_out": nhwc(self.batchnorm1_out),
            "conv2_out": nhwc(self.conv2_out),
            "bn2_out": nhwc(self.batchnorm2_out),
            "conv_skip_out": nhwc(self.c_skip_out),
            "bn_skip_out": nhwc(self.bn_skip_out),
            "add_out": nhwc(self.layer3_in),
            "Y": nhwc(self.Y),
            # layer gradients (backward step)
            "dLdY": nhwc(self.Y.grad),
            "dLdAdd": nhwc(self.layer3_in.grad),
            "dLdBnSkip_out": nhwc(self.bn_skip_out.grad),
            "dLdConvSkip_out": nhwc(self.c_skip_out.grad),
            "dLdBn2_out": nhwc(self.batchnorm2_out.grad),
            "dLdConv2_out": nhwc(self.conv2_out.grad),
            "dLdBn1_out": nhwc(self.batchnorm1_out.grad),
            "dLdActFn1_out": nhwc(self.act_fn1_out.grad),
            # gradient w.r.t. the conv1 output itself, i.e. upstream of the
            # activation (not the activation output's gradient)
            "dLdConv1_out": nhwc(self.conv1_out.grad),
            "dLdX": nhwc(self.X.grad),
            # layer parameter gradients (backward step)
            "dLdBnSkip_intercept": self.batchnorm_skip.bias.grad.numpy(),
            "dLdBnSkip_scaler": self.batchnorm_skip.weight.grad.numpy(),
            "dLdConvSkip_W": kernel(self.conv_skip.weight.grad),
            "dLdConvSkip_b": bias(self.conv_skip.bias.grad),
            "dLdBn2_intercept": self.batchnorm2.bias.grad.numpy(),
            "dLdBn2_scaler": self.batchnorm2.weight.grad.numpy(),
            "dLdConv2_W": kernel(self.conv2.weight.grad),
            "dLdConv2_b": bias(self.conv2.bias.grad),
            "dLdBn1_intercept": self.batchnorm1.bias.grad.numpy(),
            "dLdBn1_scaler": self.batchnorm1.weight.grad.numpy(),
            "dLdConv1_W": kernel(self.conv1.weight.grad),
            "dLdConv1_b": bias(self.conv1.bias.grad),
        }
        return grads


class TorchBidirectionalLSTM(nn.Module):
    """
    PyTorch reference wrapper around a single-layer bidirectional ``nn.LSTM``.

    The constructor copies the gate weights stored in `params` into the torch
    layer; `extract_grads` runs one forward/backward pass and returns the
    parameters, outputs and gradients as numpy arrays.

    Parameters
    ----------
    n_in : int
        Passed to ``nn.LSTM`` as `input_size`.
    n_out : int
        Passed to ``nn.LSTM`` as `hidden_size`. It is also the row index at
        which every stacked gate matrix in `params` is split: the first
        `n_out` rows become the hidden-to-hidden weights, the remaining rows
        the input-to-hidden weights.
    params : dict
        Must provide ``params["components"]["cell_fwd"]`` and
        ``params["components"]["cell_bwd"]``, each holding the gate matrices
        "Wu", "Wf", "Wc", "Wo" and the biases "bu", "bf", "bc", "bo".
    """

    def __init__(self, n_in, n_out, params, **kwargs):
        super(TorchBidirectionalLSTM, self).__init__()

        self.layer1 = nn.LSTM(
            input_size=n_in,
            hidden_size=n_out,
            num_layers=1,
            bidirectional=True,
            bias=True,
        )

        # gate order used when stacking rows into the torch parameters
        gates = ("u", "f", "c", "o")
        # (key in `params`, suffix of the matching nn.LSTM attribute)
        directions = (("cell_fwd", ""), ("cell_bwd", "_reverse"))

        # weights: split each gate matrix into its input and hidden halves,
        # transpose, and stack the four gates along the first axis
        for cell_key, suffix in directions:
            cell = params["components"][cell_key]
            W_ih = np.vstack([cell["W" + g][n_out:, :].T for g in gates])
            W_hh = np.vstack([cell["W" + g][:n_out, :].T for g in gates])

            ih_name = "weight_ih_l0" + suffix
            hh_name = "weight_hh_l0" + suffix
            assert getattr(self.layer1, ih_name).shape == W_ih.shape
            assert getattr(self.layer1, hh_name).shape == W_hh.shape

            setattr(self.layer1, ih_name, nn.Parameter(torch.FloatTensor(W_ih)))
            setattr(self.layer1, hh_name, nn.Parameter(torch.FloatTensor(W_hh)))

        # biases: the same flattened vector is loaded into both the `ih` and
        # the `hh` bias of a direction (as two independent parameters)
        for cell_key, suffix in directions:
            cell = params["components"][cell_key]
            bias = np.concatenate([cell["b" + g] for g in gates], axis=-1).flatten()

            ih_name = "bias_ih_l0" + suffix
            hh_name = "bias_hh_l0" + suffix
            assert getattr(self.layer1, ih_name).shape == bias.shape
            assert getattr(self.layer1, hh_name).shape == bias.shape

            setattr(self.layer1, ih_name, nn.Parameter(torch.FloatTensor(bias)))
            setattr(self.layer1, hh_name, nn.Parameter(torch.FloatTensor(bias)))

    def forward(self, X):
        """
        Run the LSTM over `X` and cache the input and output on `self`.

        Parameters
        ----------
        X : array-like with 3 axes
            Laid out as (batch, input_size, seq_len) according to the axis
            permutation below.

        Returns
        -------
        A : torch.Tensor
            The output sequence returned by ``nn.LSTM`` (also stored as
            `self.A`).
        """
        # (batch, input_size, seq_len) -> (seq_len, batch, input_size)
        moved = np.moveaxis(X, [0, 1, 2], [-2, -1, -3])

        # `torchify` is defined elsewhere in this file; presumably it returns
        # a grad-enabled float tensor -- TODO confirm
        self.X = moved if isinstance(moved, torch.Tensor) else torchify(moved)
        self.X.retain_grad()

        # unpacking doubles as a check that the input has exactly three axes
        _seq_len, _n_ex, _n_in = self.X.shape

        # no initial state is passed, so nn.LSTM starts from zeros
        self.A, _final_states = self.layer1(self.X)
        self.A.retain_grad()
        return self.A

    def extract_grads(self, X):
        """
        Run a forward/backward pass with ``loss = A.sum()`` and collect the
        results.

        Returns
        -------
        grads : dict of numpy arrays
            Inputs, per-gate weights/biases (suffix ``_f`` for the forward
            direction, ``_b`` for the reverse direction), outputs, and the
            corresponding ``dLd*`` gradients. Gate matrices are re-stacked with
            the hidden-to-hidden rows first, followed by the input-to-hidden
            rows.
        """
        self.forward(X)
        self.loss = self.A.sum()
        self.loss.backward()

        lstm = self.layer1
        gates = ("u", "f", "c", "o")

        def regroup(w_ih, w_hh):
            # undo the stacking done in __init__: one matrix per gate with the
            # hidden rows on top of the input rows
            return [
                torch.cat([torch.t(hh), torch.t(ih)], dim=0)
                for ih, hh in zip(w_ih.chunk(4, 0), w_hh.chunk(4, 0))
            ]

        W, dW, b, db = {}, {}, {}, {}
        for tag, suffix in (("f", ""), ("b", "_reverse")):
            w_ih = getattr(lstm, "weight_ih_l0" + suffix)
            w_hh = getattr(lstm, "weight_hh_l0" + suffix)
            b_ih = getattr(lstm, "bias_ih_l0" + suffix)

            W[tag] = regroup(w_ih, w_hh)
            dW[tag] = regroup(w_ih.grad, w_hh.grad)
            b[tag] = b_ih.chunk(4, 0)
            db[tag] = b_ih.grad.chunk(4, 0)

        # (seq_len, batch, feat) -> (batch, feat, seq_len)
        orig, X_swap = [0, 1, 2], [-1, -3, -2]

        grads = {"X": np.moveaxis(self.X.detach().numpy(), orig, X_swap)}
        for tag in ("f", "b"):
            for g, w in zip(gates, W[tag]):
                grads["W%s_%s" % (g, tag)] = w.detach().numpy()
            for g, bias in zip(gates, b[tag]):
                grads["b%s_%s" % (g, tag)] = bias.detach().numpy().reshape(-1, 1)

        grads["y"] = np.moveaxis(self.A.detach().numpy(), orig, X_swap)
        # NB: unlike "y", this gradient is returned in torch's axis order
        grads["dLdA"] = self.A.grad.numpy()

        for tag in ("f", "b"):
            for g, dw in zip(gates, dW[tag]):
                grads["dLdW%s_%s" % (g, tag)] = dw.numpy()
            for g, dbias in zip(gates, db[tag]):
                grads["dLdB%s_%s" % (g, tag)] = dbias.numpy().reshape(-1, 1)

        grads["dLdX"] = np.moveaxis(self.X.grad.numpy(), orig, X_swap)
        return grads


class TorchPool2DLayer(nn.Module):
    """
    PyTorch reference for a 2D max / average pooling layer.

    Parameters
    ----------
    in_channels : int
        Unused; accepted so the signature matches the other reference layers.
    hparams : dict
        Must contain "mode" (either "max" or "average"), "kernel_shape",
        "pad" and "stride"; the latter three are forwarded to the torch
        pooling layer.

    Raises
    ------
    ValueError
        If ``hparams["mode"]`` is neither "max" nor "average".
    """

    def __init__(self, in_channels, hparams, **kwargs):
        super(TorchPool2DLayer, self).__init__()

        mode = hparams["mode"]
        if mode == "max":
            self.layer1 = nn.MaxPool2d(
                kernel_size=hparams["kernel_shape"],
                padding=hparams["pad"],
                stride=hparams["stride"],
            )
        elif mode == "average":
            self.layer1 = nn.AvgPool2d(
                kernel_size=hparams["kernel_shape"],
                padding=hparams["pad"],
                stride=hparams["stride"],
            )
        else:
            # fail here rather than with an AttributeError on `layer1` the
            # first time `forward` is called
            raise ValueError(
                "Unrecognized pooling mode {!r}: expected 'max' or 'average'".format(
                    mode
                )
            )

    def forward(self, X):
        """
        Pool `X` and cache the input (`self.X`) and output (`self.Y`).

        `X` is expected in (N, H, W, C) layout and is permuted to torch's
        (N, C, H, W) before pooling. The returned tensor is in torch layout.
        """
        # (N, H, W, C) -> (N, C, H, W)
        self.X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3])
        if not isinstance(self.X, torch.Tensor):
            # `torchify` is defined elsewhere in this file; presumably it
            # returns a grad-enabled float tensor -- TODO confirm
            self.X = torchify(self.X)

        self.X.retain_grad()
        self.Y = self.layer1(self.X)
        self.Y.retain_grad()
        return self.Y

    def extract_grads(self, X):
        """
        Run a forward/backward pass with ``loss = Y.sum()`` and return the
        input, output and their gradients as numpy arrays in (N, H, W, C)
        layout.
        """
        self.forward(X)
        self.loss = self.Y.sum()
        self.loss.backward()

        # X (theirs): (N, C, H, W)              -> X (mine): (N, H, W, C)
        # Y (theirs): (N, C, H, W)              -> Y (mine): (N, H, W, C)
        orig, X_swap = [0, 1, 2, 3], [0, -1, -3, -2]
        grads = {
            "X": np.moveaxis(self.X.detach().numpy(), orig, X_swap),
            "y": np.moveaxis(self.Y.detach().numpy(), orig, X_swap),
            "dLdY": np.moveaxis(self.Y.grad.numpy(), orig, X_swap),
            "dLdX": np.moveaxis(self.X.grad.numpy(), orig, X_swap),
        }
        return grads


class TorchConv2DLayer(nn.Module):
    """
    PyTorch reference for a 2D convolution followed by an activation.

    Parameters
    ----------
    in_channels, out_channels : int
        Forwarded to ``nn.Conv2d``.
    act_fn : callable
        Applied to the convolution output.
    params : dict
        "W" is the kernel in (f[0], f[1], n_in, n_out) layout; "b" is the
        bias (flattened before being loaded).
    hparams : dict
        Must contain "kernel_shape", "pad", "stride" and "dilation". The
        dilation is incremented by one before being handed to torch.
    """

    def __init__(self, in_channels, out_channels, act_fn, params, hparams, **kwargs):
        super(TorchConv2DLayer, self).__init__()

        kernel, bias = params["W"], params["b"]
        self.act_fn = act_fn

        conv = nn.Conv2d(
            in_channels,
            out_channels,
            hparams["kernel_shape"],
            padding=hparams["pad"],
            stride=hparams["stride"],
            dilation=hparams["dilation"] + 1,
            bias=True,
        )
        self.layer1 = conv

        # (f[0], f[1], n_in, n_out) -> (n_out, n_in, f[0], f[1])
        kernel = np.moveaxis(kernel, [0, 1, 2, 3], [-2, -1, -3, -4])
        bias = bias.flatten()
        assert conv.weight.shape == kernel.shape
        assert conv.bias.shape == bias.shape

        conv.weight = nn.Parameter(torch.FloatTensor(kernel))
        conv.bias = nn.Parameter(torch.FloatTensor(bias))

    def forward(self, X):
        """
        Convolve `X` (given in (N, H, W, C) layout), apply the activation,
        and cache the input, pre-activation and output on `self`.
        """
        # (N, H, W, C) -> (N, C, H, W)
        moved = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3])
        # `torchify` lives elsewhere in this file; presumably it returns a
        # grad-enabled float tensor -- TODO confirm
        self.X = moved if isinstance(moved, torch.Tensor) else torchify(moved)
        self.X.retain_grad()

        self.Z = self.layer1(self.X)
        self.Z.retain_grad()

        self.Y = self.act_fn(self.Z)
        self.Y.retain_grad()
        return self.Y

    def extract_grads(self, X):
        """
        Run a forward/backward pass with ``loss = Y.sum()`` and return the
        parameters, outputs and gradients as numpy arrays, permuted back to
        the channels-last layouts described in the comments below.
        """
        self.forward(X)
        self.loss = self.Y.sum()
        self.loss.backward()

        # W (theirs): (n_out, n_in, f[0], f[1]) -> W (mine): (f[0], f[1], n_in, n_out)
        # X (theirs): (N, C, H, W)              -> X (mine): (N, H, W, C)
        # Y (theirs): (N, C, H, W)              -> Y (mine): (N, H, W, C)
        orig, X_swap, W_swap = [0, 1, 2, 3], [0, -1, -3, -2], [-1, -2, -4, -3]

        def act_layout(t):
            return np.moveaxis(t.numpy(), orig, X_swap)

        def kernel_layout(t):
            return np.moveaxis(t.numpy(), orig, W_swap)

        conv = self.layer1
        grads = {
            "X": act_layout(self.X.detach()),
            "W": kernel_layout(conv.weight.detach()),
            "b": conv.bias.detach().numpy().reshape(1, 1, 1, -1),
            "y": act_layout(self.Y.detach()),
            "dLdY": act_layout(self.Y.grad),
            "dLdZ": act_layout(self.Z.grad),
            "dLdW": kernel_layout(conv.weight.grad),
            "dLdB": conv.bias.grad.numpy().reshape(1, 1, 1, -1),
            "dLdX": act_layout(self.X.grad),
        }
        return grads


class TorchConv1DLayer(nn.Module):
    """
    PyTorch reference for a 1D convolution followed by an activation.

    Parameters
    ----------
    in_channels, out_channels : int
        Forwarded to ``nn.Conv1d``.
    act_fn : callable
        Applied to the convolution output.
    params : dict
        "W" is the kernel in (f[0], n_in, n_out) layout; "b" is the bias
        (flattened before being loaded).
    hparams : dict
        Must contain "kernel_width", "pad", "stride" and "dilation". The
        dilation is incremented by one before being handed to torch.
    """

    def __init__(self, in_channels, out_channels, act_fn, params, hparams, **kwargs):
        super(TorchConv1DLayer, self).__init__()

        kernel, bias = params["W"], params["b"]
        self.act_fn = act_fn

        conv = nn.Conv1d(
            in_channels,
            out_channels,
            hparams["kernel_width"],
            padding=hparams["pad"],
            stride=hparams["stride"],
            dilation=hparams["dilation"] + 1,
            bias=True,
        )
        self.layer1 = conv

        # (f[0], n_in, n_out) -> (n_out, n_in, f[0])
        kernel = np.moveaxis(kernel, [0, 1, 2], [-1, -2, -3])
        bias = bias.flatten()
        assert conv.weight.shape == kernel.shape
        assert conv.bias.shape == bias.shape

        conv.weight = nn.Parameter(torch.FloatTensor(kernel))
        conv.bias = nn.Parameter(torch.FloatTensor(bias))

    def forward(self, X):
        """
        Convolve `X` (given in (N, W, C) layout), apply the activation, and
        cache the input, pre-activation and output on `self`.
        """
        # (N, W, C) -> (N, C, W)
        moved = np.moveaxis(X, [0, 1, 2], [0, -1, -2])
        # `torchify` lives elsewhere in this file; presumably it returns a
        # grad-enabled float tensor -- TODO confirm
        self.X = moved if isinstance(moved, torch.Tensor) else torchify(moved)
        self.X.retain_grad()

        self.Z = self.layer1(self.X)
        self.Z.retain_grad()

        self.Y = self.act_fn(self.Z)
        self.Y.retain_grad()
        return self.Y

    def extract_grads(self, X):
        """
        Run a forward/backward pass with ``loss = Y.sum()`` and return the
        parameters, outputs and gradients as numpy arrays, permuted back to
        the channels-last layouts described in the comments below.
        """
        self.forward(X)
        self.loss = self.Y.sum()
        self.loss.backward()

        # W (theirs): (n_out, n_in, f[0]) -> W (mine): (f[0], n_in, n_out)
        # X (theirs): (N, C, W)           -> X (mine): (N, W, C)
        # Y (theirs): (N, C, W)           -> Y (mine): (N, W, C)
        orig, X_swap, W_swap = [0, 1, 2], [0, -1, -2], [-1, -2, -3]

        def act_layout(t):
            return np.moveaxis(t.numpy(), orig, X_swap)

        def kernel_layout(t):
            return np.moveaxis(t.numpy(), orig, W_swap)

        conv = self.layer1
        grads = {
            "X": act_layout(self.X.detach()),
            "W": kernel_layout(conv.weight.detach()),
            "b": conv.bias.detach().numpy().reshape(1, 1, -1),
            "y": act_layout(self.Y.detach()),
            "dLdY": act_layout(self.Y.grad),
            "dLdZ": act_layout(self.Z.grad),
            "dLdW": kernel_layout(conv.weight.grad),
            "dLdB": conv.bias.grad.numpy().reshape(1, 1, -1),
            "dLdX": act_layout(self.X.grad),
        }
        return grads


class TorchDeconv2DLayer(nn.Module):
    """
    PyTorch reference for a 2D transposed convolution followed by an
    activation.

    Parameters
    ----------
    in_channels, out_channels : int
        Forwarded to ``nn.ConvTranspose2d``.
    act_fn : callable
        Applied to the layer output.
    params : dict
        "W" is the kernel in (f[0], f[1], n_in, n_out) layout; "b" is the
        bias (flattened before being loaded).
    hparams : dict
        Must contain "kernel_shape", "pad" and "stride". The torch layer is
        always built with ``dilation=1``.
    """

    def __init__(self, in_channels, out_channels, act_fn, params, hparams, **kwargs):
        super(TorchDeconv2DLayer, self).__init__()

        kernel, bias = params["W"], params["b"]
        self.act_fn = act_fn

        deconv = nn.ConvTranspose2d(
            in_channels,
            out_channels,
            hparams["kernel_shape"],
            padding=hparams["pad"],
            stride=hparams["stride"],
            dilation=1,
            bias=True,
        )
        self.layer1 = deconv

        # (f[0], f[1], n_in, n_out) -> (n_in, n_out, f[0], f[1])
        kernel = np.moveaxis(kernel, [0, 1, 2, 3], [-2, -1, -4, -3])
        bias = bias.flatten()
        assert deconv.weight.shape == kernel.shape
        assert deconv.bias.shape == bias.shape

        deconv.weight = nn.Parameter(torch.FloatTensor(kernel))
        deconv.bias = nn.Parameter(torch.FloatTensor(bias))

    def forward(self, X):
        """
        Apply the transposed convolution to `X` (given in (N, H, W, C)
        layout), then the activation, caching the input, pre-activation and
        output on `self`.
        """
        # (N, H, W, C) -> (N, C, H, W)
        moved = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3])
        # `torchify` lives elsewhere in this file; presumably it returns a
        # grad-enabled float tensor -- TODO confirm
        self.X = moved if isinstance(moved, torch.Tensor) else torchify(moved)
        self.X.retain_grad()

        self.Z = self.layer1(self.X)
        self.Z.retain_grad()

        self.Y = self.act_fn(self.Z)
        self.Y.retain_grad()
        return self.Y

    def extract_grads(self, X):
        """
        Run a forward/backward pass with ``loss = Y.sum()`` and return the
        parameters, outputs and gradients as numpy arrays, permuted back to
        the channels-last layouts described in the comments below.
        """
        self.forward(X)
        self.loss = self.Y.sum()
        self.loss.backward()

        # W (theirs): (n_in, n_out, f[0], f[1]) -> W (mine): (f[0], f[1], n_in, n_out)
        # X (theirs): (N, C, H, W)              -> X (mine): (N, H, W, C)
        # Y (theirs): (N, C, H, W)              -> Y (mine): (N, H, W, C)
        orig, X_swap, W_swap = [0, 1, 2, 3], [0, -1, -3, -2], [-2, -1, -4, -3]

        def act_layout(t):
            return np.moveaxis(t.numpy(), orig, X_swap)

        def kernel_layout(t):
            return np.moveaxis(t.numpy(), orig, W_swap)

        deconv = self.layer1
        grads = {
            "X": act_layout(self.X.detach()),
            "W": kernel_layout(deconv.weight.detach()),
            "b": deconv.bias.detach().numpy().reshape(1, 1, 1, -1),
            "y": act_layout(self.Y.detach()),
            "dLdY": act_layout(self.Y.grad),
            "dLdZ": act_layout(self.Z.grad),
            "dLdW": kernel_layout(deconv.weight.grad),
            "dLdB": deconv.bias.grad.numpy().reshape(1, 1, 1, -1),
            "dLdX": act_layout(self.X.grad),
        }
        return grads


class TorchLSTMCell(nn.Module):
    """
    PyTorch reference for an LSTM cell unrolled manually over time.

    Parameters
    ----------
    n_in : int
        Passed to ``nn.LSTMCell`` as `input_size`.
    n_out : int
        Passed to ``nn.LSTMCell`` as `hidden_size`. It is also the row index
        at which each stacked gate matrix in `params` is split: the first
        `n_out` rows become hidden-to-hidden weights, the rest input-to-hidden
        weights.
    params : dict
        Holds the gate matrices "Wu", "Wf", "Wc", "Wo" and the biases "bu",
        "bf", "bc", "bo".
    """

    def __init__(self, n_in, n_out, params, **kwargs):
        super(TorchLSTMCell, self).__init__()

        # gate order used when stacking rows into the torch parameters
        gates = ("u", "f", "c", "o")
        W_ih = np.vstack([params["W" + g][n_out:, :].T for g in gates])
        W_hh = np.vstack([params["W" + g][:n_out, :].T for g in gates])

        cell = nn.LSTMCell(input_size=n_in, hidden_size=n_out, bias=True)
        self.layer1 = cell

        assert cell.weight_ih.shape == W_ih.shape
        assert cell.weight_hh.shape == W_hh.shape
        cell.weight_ih = nn.Parameter(torch.FloatTensor(W_ih))
        cell.weight_hh = nn.Parameter(torch.FloatTensor(W_hh))

        # the same flattened bias is loaded into both `bias_ih` and `bias_hh`
        bias = np.concatenate([params["b" + g] for g in gates], axis=-1).flatten()
        assert cell.bias_ih.shape == bias.shape
        assert cell.bias_hh.shape == bias.shape
        cell.bias_ih = nn.Parameter(torch.FloatTensor(bias))
        cell.bias_hh = nn.Parameter(torch.FloatTensor(bias))

    def forward(self, X):
        """
        Step the cell over the last axis of `X`, starting from zero states.

        Parameters
        ----------
        X : array-like with 3 axes
            Indexed as ``X[:, :, t]`` per timestep, i.e. laid out as
            (n_ex, n_in, n_timesteps).

        Returns
        -------
        A, C : lists of torch.Tensor
            Hidden and cell states after each timestep (the initial zero
            states are not included).
        """
        # `torchify` lives elsewhere in this file; presumably it returns a
        # grad-enabled float tensor -- TODO confirm
        self.X = X if isinstance(X, torch.Tensor) else torchify(X)
        self.X.retain_grad()

        n_ex, _n_in, n_timesteps = self.X.shape
        n_out = self.layer1.weight_hh.shape[1]

        # both the hidden and the cell state start at zero
        hidden = torchify(np.zeros((n_ex, n_out)))
        memory = torchify(np.zeros((n_ex, n_out)))
        hidden.retain_grad()
        memory.retain_grad()

        # the initial states are deliberately not recorded in the outputs
        hidden_states, cell_states = [], []
        for t in range(n_timesteps):
            hidden, memory = self.layer1(self.X[:, :, t], (hidden, memory))
            hidden.retain_grad()
            memory.retain_grad()
            hidden_states.append(hidden)
            cell_states.append(memory)

        self.A = hidden_states
        self.C = cell_states
        return self.A, self.C

    def extract_grads(self, X):
        """
        Run a forward/backward pass with the loss set to the sum of all
        hidden states, and return parameters, states and gradients as numpy
        arrays. Gate matrices are re-stacked with the hidden-to-hidden rows
        first, followed by the input-to-hidden rows.
        """
        self.forward(X)
        self.loss = torch.stack(self.A).sum()
        self.loss.backward()

        cell = self.layer1

        def regroup(w_ih, w_hh):
            # undo the stacking done in __init__: one matrix per gate
            return [
                torch.cat([torch.t(hh), torch.t(ih)], dim=0)
                for ih, hh in zip(w_ih.chunk(4, 0), w_hh.chunk(4, 0))
            ]

        def column(vec):
            return vec.numpy().reshape(-1, 1)

        Wu, Wf, Wc, Wo = regroup(cell.weight_ih, cell.weight_hh)
        bu, bf, bc, bo = cell.bias_ih.chunk(4, 0)

        dWu, dWf, dWc, dWo = regroup(cell.weight_ih.grad, cell.weight_hh.grad)
        dbu, dbf, dbc, dbo = cell.bias_ih.grad.chunk(4, 0)

        # (n_timesteps, n_ex, n_out) -> (n_ex, n_out, n_timesteps)
        stacked_hidden = torch.stack(self.A).detach().numpy()

        grads = {
            "X": self.X.detach().numpy(),
            "Wu": Wu.detach().numpy(),
            "Wf": Wf.detach().numpy(),
            "Wc": Wc.detach().numpy(),
            "Wo": Wo.detach().numpy(),
            "bu": column(bu.detach()),
            "bf": column(bf.detach()),
            "bc": column(bc.detach()),
            "bo": column(bo.detach()),
            "C": torch.stack(self.C).detach().numpy(),
            "y": stacked_hidden.transpose(1, 2, 0),
            "dLdA": np.array([a.grad.numpy() for a in self.A]),
            "dLdWu": dWu.numpy(),
            "dLdWf": dWf.numpy(),
            "dLdWc": dWc.numpy(),
            "dLdWo": dWo.numpy(),
            "dLdBu": column(dbu),
            "dLdBf": column(dbf),
            "dLdBc": column(dbc),
            "dLdBo": column(dbo),
            "dLdX": self.X.grad.numpy(),
        }
        return grads


class TorchRNNCell(nn.Module):
    """
    PyTorch reference for a tanh RNN cell unrolled manually over time.

    Parameters
    ----------
    n_in, n_hid : int
        Forwarded to ``nn.RNNCell`` as the input and hidden sizes.
    params : dict
        Holds "Wax", "Waa", "bx" and "ba"; each is transposed before being
        loaded into the torch cell.
    """

    def __init__(self, n_in, n_hid, params, **kwargs):
        super(TorchRNNCell, self).__init__()

        cell = nn.RNNCell(n_in, n_hid, bias=True, nonlinearity="tanh")
        self.layer1 = cell

        # Overwrite the randomly initialised parameters.
        # NB: torch receives the *transpose* of every array in `params`, so
        # any quantity derived from the weights has to be compared against
        # the *transpose* of our own output
        cell.weight_ih = nn.Parameter(torch.FloatTensor(params["Wax"].T))
        cell.weight_hh = nn.Parameter(torch.FloatTensor(params["Waa"].T))
        cell.bias_ih = nn.Parameter(torch.FloatTensor(params["bx"].T))
        cell.bias_hh = nn.Parameter(torch.FloatTensor(params["ba"].T))

    def forward(self, X):
        """
        Step the cell over the last axis of `X`, starting from a zero hidden
        state.

        Parameters
        ----------
        X : array-like with 3 axes
            Indexed as ``X[:, :, t]`` per timestep, i.e. laid out as
            (n_ex, n_in, n_timesteps).

        Returns
        -------
        A : list of torch.Tensor
            Hidden state after each timestep (the initial zero state is not
            included).
        """
        # `torchify` lives elsewhere in this file; presumably it returns a
        # grad-enabled float tensor -- TODO confirm
        self.X = X if isinstance(X, torch.Tensor) else torchify(X)
        self.X.retain_grad()

        n_ex, _n_in, n_timesteps = self.X.shape
        n_out = self.layer1.weight_hh.shape[1]

        # the hidden state starts at zero
        hidden = torchify(np.zeros((n_ex, n_out)))
        hidden.retain_grad()

        # the initial state is deliberately not recorded in the outputs
        hidden_states = []
        for t in range(n_timesteps):
            hidden = self.layer1(self.X[:, :, t], hidden)
            hidden.retain_grad()
            hidden_states.append(hidden)

        self.A = hidden_states
        return self.A

    def extract_grads(self, X):
        """
        Run a forward/backward pass with the loss set to the sum of all
        hidden states, and return parameters, states and gradients as numpy
        arrays (weights are returned in torch's layout).
        """
        self.forward(X)
        self.loss = torch.stack(self.A).sum()
        self.loss.backward()

        cell = self.layer1
        per_step_grads = [a.grad.numpy() for a in self.A]

        grads = {
            "X": self.X.detach().numpy(),
            "ba": cell.bias_hh.detach().numpy(),
            "bx": cell.bias_ih.detach().numpy(),
            "Wax": cell.weight_ih.detach().numpy(),
            "Waa": cell.weight_hh.detach().numpy(),
            "y": torch.stack(self.A).detach().numpy(),
            "dLdA": np.array(per_step_grads),
            "dLdWaa": cell.weight_hh.grad.numpy(),
            "dLdWax": cell.weight_ih.grad.numpy(),
            "dLdBa": cell.bias_hh.grad.numpy(),
            "dLdBx": cell.bias_ih.grad.numpy(),
            "dLdX": self.X.grad.numpy(),
        }
        return grads


class TorchFCLayer(nn.Module):
    """
    PyTorch reference for a fully-connected layer followed by an activation.

    Parameters
    ----------
    n_in, n_hid : int
        Forwarded to ``nn.Linear`` as the input and output sizes.
    act_fn : nn.Module
        Activation applied to the affine output (it is also placed inside an
        ``nn.Sequential``, so it has to be a module).
    params : dict
        "W" is loaded transposed as the torch weight; "b" is loaded as the
        bias.
    """

    def __init__(self, n_in, n_hid, act_fn, params, **kwargs):
        super(TorchFCLayer, self).__init__()
        linear = nn.Linear(n_in, n_hid)
        self.layer1 = linear

        # Overwrite the randomly initialised parameters.
        # NB: torch receives the *transpose* of our weight matrix, so any
        # quantity derived from the weights has to be compared against the
        # *transpose* of our own output
        linear.weight = nn.Parameter(torch.FloatTensor(params["W"].T))
        linear.bias = nn.Parameter(torch.FloatTensor(params["b"]))

        self.act_fn = act_fn
        self.model = nn.Sequential(linear, act_fn)

    def forward(self, X):
        """
        Compute ``act_fn(layer1(X))``, caching the input (`self.X`), the
        pre-activation (`self.z1`) and the output (`self.out1`). Nothing is
        returned.
        """
        # `torchify` lives elsewhere in this file; presumably it returns a
        # grad-enabled float tensor -- TODO confirm
        self.X = X if isinstance(X, torch.Tensor) else torchify(X)

        pre_activation = self.layer1(self.X)
        pre_activation.retain_grad()
        self.z1 = pre_activation

        activated = self.act_fn(pre_activation)
        activated.retain_grad()
        self.out1 = activated

    def extract_grads(self, X):
        """
        Run a forward/backward pass with ``loss = out1.sum()`` and return the
        parameters, output and gradients as numpy arrays (weights are returned
        in torch's layout).
        """
        self.forward(X)
        self.loss1 = self.out1.sum()
        self.loss1.backward()

        linear = self.layer1
        grads = {
            "X": self.X.detach().numpy(),
            "b": linear.bias.detach().numpy(),
            "W": linear.weight.detach().numpy(),
            "y": self.out1.detach().numpy(),
            "dLdy": self.out1.grad.numpy(),
            "dLdZ": self.z1.grad.numpy(),
            "dLdB": linear.bias.grad.numpy(),
            "dLdW": linear.weight.grad.numpy(),
            "dLdX": self.X.grad.numpy(),
        }
        return grads


class TorchEmbeddingLayer(nn.Module):
    """
    PyTorch reference for an embedding lookup layer.

    Parameters
    ----------
    vocab_size, n_out : int
        Forwarded to ``nn.Embedding`` as the number of embeddings and the
        embedding dimension.
    params : dict
        "W" is loaded as the embedding table.
    """

    def __init__(self, vocab_size, n_out, params, **kwargs):
        super(TorchEmbeddingLayer, self).__init__()
        table = nn.Embedding(vocab_size, n_out)

        # replace the randomly initialised table with the supplied weights
        table.weight = nn.Parameter(torch.FloatTensor(params["W"]))

        self.layer1 = table
        self.model = nn.Sequential(table)

    def forward(self, X):
        """
        Look up the embeddings for the indices in `X`, caching the indices
        (`self.X`) and the result (`self.out1`). Nothing is returned.
        """
        self.X = X if isinstance(X, torch.Tensor) else torch.from_numpy(X)

        looked_up = self.layer1(self.X)
        looked_up.retain_grad()
        self.out1 = looked_up

    def extract_grads(self, X):
        """
        Run a forward/backward pass with ``loss = out1.sum()`` and return the
        indices, table, output and gradients as numpy arrays.
        """
        self.forward(X)
        self.loss1 = self.out1.sum()
        self.loss1.backward()

        table = self.layer1
        grads = {
            "X": self.X.detach().numpy(),
            "W": table.weight.detach().numpy(),
            "y": self.out1.detach().numpy(),
            "dLdy": self.out1.grad.numpy(),
            "dLdW": table.weight.grad.numpy(),
        }
        return grads


class TorchSDPAttentionLayer(nn.Module):
    """
    PyTorch reference for scaled dot-product attention:
    ``softmax(Q @ K^T / sqrt(d_k)) @ V``, where `d_k` is the size of the
    last axis of `Q`.
    """

    def __init__(self):
        super(TorchSDPAttentionLayer, self).__init__()

    def forward(self, Q, K, V, mask=None):
        """
        Compute the attention output and weights, caching every intermediate
        on `self` with its gradient retained.

        Parameters
        ----------
        Q, K, V : torch.Tensor or array-like
            Queries, keys and values. Non-tensors are passed through
            `torchify` (defined elsewhere in this file).
        mask : torch.Tensor or None
            If given, score entries where ``mask == 0`` are replaced with
            -1e9 before the softmax.

        Returns
        -------
        Y : torch.Tensor
            ``weights @ V``.
        weights : torch.Tensor
            Softmax of the (masked) scores over the last axis.
        """

        def as_tensor(value):
            return value if isinstance(value, torch.Tensor) else torchify(value)

        self.Q, self.K, self.V = as_tensor(Q), as_tensor(K), as_tensor(V)
        for tensor in (self.Q, self.K, self.V):
            tensor.retain_grad()

        self.d_k = self.Q.size(-1)
        similarity = torch.matmul(self.Q, self.K.transpose(-2, -1))
        self.scores = similarity / np.sqrt(self.d_k)
        if mask is not None:
            # masked-out positions get a large negative score
            self.scores = self.scores.masked_fill(mask == 0, -1e9)
        self.scores.retain_grad()

        self.weights = F.softmax(self.scores, dim=-1)
        self.weights.retain_grad()

        self.Y = torch.matmul(self.weights, self.V)
        self.Y.retain_grad()
        return self.Y, self.weights

    def extract_grads(self, Q, K, V, mask=None):
        """
        Run a forward/backward pass with ``loss = Y.sum()`` and return the
        inputs, intermediates and gradients as numpy arrays (`d_k` is
        returned as-is).
        """
        self.forward(Q, K, V, mask=mask)
        self.loss1 = self.Y.sum()
        self.loss1.backward()

        def values(tensor):
            return tensor.detach().numpy()

        grads = {
            "Q": values(self.Q),
            "K": values(self.K),
            "V": values(self.V),
            "d_k": self.d_k,
            "scores": values(self.scores),
            "weights": values(self.weights),
            "Y": values(self.Y),
            "dLdV": self.V.grad.numpy(),
            "dWeights": self.weights.grad.numpy(),
            "dScores": self.scores.grad.numpy(),
            "dLdQ": self.Q.grad.numpy(),
            "dLdK": self.K.grad.numpy(),
        }
        return grads


class TorchMultiHeadedAttentionModule(nn.Module):
    """
    PyTorch reference for multi-headed scaled dot-product attention.

    Parameters
    ----------
    params : dict
        ``params["components"][name]`` for `name` in "Q", "K", "V", "O" must
        hold "W" (loaded transposed as the torch weight) and "b" (loaded as
        the bias) of the corresponding linear projection.
    hparams : dict
        Must contain "kqv_dim", "n_heads" (which has to divide "kqv_dim")
        and "dropout_p".

    Notes
    -----
    The projections are kept in a plain ``dict`` (not an ``nn.ModuleDict``),
    so they are not registered as sub-modules of this module. A dropout layer
    is constructed but is not applied anywhere in `forward`.
    """

    def __init__(self, params, hparams):
        super(TorchMultiHeadedAttentionModule, self).__init__()
        kqv_dim = hparams["kqv_dim"]
        assert kqv_dim % hparams["n_heads"] == 0

        self.n_heads = hparams["n_heads"]
        self.latent_dim = kqv_dim // hparams["n_heads"]
        self.p_dropout = hparams["dropout_p"]

        # one square linear projection per role, overwritten with the
        # supplied weights (transposed into torch's layout) and biases
        self.projections = {}
        for name in ("Q", "K", "V", "O"):
            component = params["components"][name]
            projection = nn.Linear(kqv_dim, kqv_dim)
            projection.weight = nn.Parameter(torch.FloatTensor(component["W"].T))
            projection.bias = nn.Parameter(torch.FloatTensor(component["b"]))
            self.projections[name] = projection

        self.attn = None
        self.dropout = nn.Dropout(p=hparams["dropout_p"])

    def forward(self, Q, K, V, mask=None):
        """
        Project the inputs, run scaled dot-product attention per head, merge
        the heads and apply the output projection.

        All intermediates are cached on `self` with their gradients retained;
        the final result is stored in `self.Y`. Nothing is returned.

        Parameters
        ----------
        Q, K, V : torch.Tensor or array-like
            Non-tensors are passed through `torchify` (defined elsewhere in
            this file). The first axis is treated as the batch axis.
        mask : torch.Tensor or None
            Optional attention mask; it gains a head axis and is shared by
            every head.
        """
        self.Q = Q
        self.K = K
        self.V = V

        if not isinstance(self.Q, torch.Tensor):
            self.Q = torchify(self.Q)
        if not isinstance(self.K, torch.Tensor):
            self.K = torchify(self.K)
        if not isinstance(self.V, torch.Tensor):
            self.V = torchify(self.V)

        self.Q.retain_grad()
        self.K.retain_grad()
        self.V.retain_grad()

        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)
        n_ex = self.Q.size(0)

        def project(name, inputs):
            # 1) linear projection, then split the last axis into heads:
            #    (n_ex, seq, kqv_dim) -> (n_ex, n_heads, seq, latent_dim)
            return (
                self.projections[name](inputs)
                .view(n_ex, -1, self.n_heads, self.latent_dim)
                .transpose(1, 2)
            )

        self.Q_proj = project("Q", self.Q)
        self.K_proj = project("K", self.K)
        self.V_proj = project("V", self.V)

        self.Q_proj.retain_grad()
        self.K_proj.retain_grad()
        self.V_proj.retain_grad()

        # 2) Apply attention on all the projected vectors in batch.
        self.attn_out, self.attn = TorchSDPAttentionLayer().forward(
            self.Q_proj, self.K_proj, self.V_proj, mask=mask
        )
        self.attn.retain_grad()
        self.attn_out.retain_grad()

        # 3) "Concat" using a view and apply a final linear transformation
        self.attn_out_reshaped = (
            self.attn_out.transpose(1, 2)
            .contiguous()
            .view(n_ex, -1, self.n_heads * self.latent_dim)
        )
        self.attn_out_reshaped.retain_grad()
        self.Y = self.projections["O"](self.attn_out_reshaped)
        self.Y.retain_grad()

    def extract_grads(self, Q, K, V, mask=None):
        """
        Run a forward/backward pass with ``loss = Y.sum()`` and return the
        inputs, parameters, intermediates and gradients.

        Projection weights and their gradients are transposed back out of
        torch's layout; everything else is returned in torch's layout.
        """
        self.forward(Q, K, V, mask=mask)
        self.loss1 = self.Y.sum()
        self.loss1.backward()
        grads = {
            "Q": self.Q.detach().numpy(),
            "K": self.K.detach().numpy(),
            "V": self.V.detach().numpy(),
            "O_W": self.projections["O"].weight.detach().numpy().T,
            "V_W": self.projections["V"].weight.detach().numpy().T,
            "K_W": self.projections["K"].weight.detach().numpy().T,
            "Q_W": self.projections["Q"].weight.detach().numpy().T,
            "O_b": self.projections["O"].bias.detach().numpy(),
            "V_b": self.projections["V"].bias.detach().numpy(),
            "K_b": self.projections["K"].bias.detach().numpy(),
            "Q_b": self.projections["Q"].bias.detach().numpy(),
            "latent_dim": self.latent_dim,
            "n_heads": self.n_heads,
            "Q_proj": self.Q_proj.detach().numpy(),
            "K_proj": self.K_proj.detach().numpy(),
            "V_proj": self.V_proj.detach().numpy(),
            "weights": self.attn.detach().numpy(),
            "attn_out": self.attn_out_reshaped.detach().numpy(),
            "Y": self.Y.detach().numpy(),
            "dO_W": self.projections["O"].weight.grad.numpy().T,
            "dV_W": self.projections["V"].weight.grad.numpy().T,
            "dK_W": self.projections["K"].weight.grad.numpy().T,
            "dQ_W": self.projections["Q"].weight.grad.numpy().T,
            "dO_b": self.projections["O"].bias.grad.numpy(),
            "dV_b": self.projections["V"].bias.grad.numpy(),
            "dK_b": self.projections["K"].bias.grad.numpy(),
            "dQ_b": self.projections["Q"].bias.grad.numpy(),
            "dLdy": self.Y.grad.numpy(),
            "dAttn_out": self.attn_out_reshaped.grad.numpy(),
            "dWeights": self.attn.grad.numpy(),
            "dQ_proj": self.Q_proj.grad.numpy(),
            "dK_proj": self.K_proj.grad.numpy(),
            "dV_proj": self.V_proj.grad.numpy(),
            "dQ": self.Q.grad.numpy(),
            "dK": self.K.grad.numpy(),
            "dV": self.V.grad.numpy(),
        }
        return grads


#######################################################################
#              TF WGAN GP Gold Standard Implementation                #
#  adapted from: https://github.com/igul222/improved_wgan_training/   #
#######################################################################

# registry of every variable created through `param`, keyed by name
_params = dict()
# lookup table consulted by `param` to swap a registered variable for another
_param_aliases = dict()


def param(name, *args, **kwargs):
    """
    A wrapper for `tf.Variable` which enables parameter sharing in models.

    Creates and returns a `tf.Variable`, except that if a parameter with the
    same name was created previously, `param(...)` returns the existing
    variable instead of making a new one.

    This constructor also sets a `param` attribute (``True``) on the variables
    it creates, so that you can easily search a graph for all params.

    Parameters
    ----------
    name : str
        Registry key for the parameter; also forwarded to `tf.Variable` as its
        ``name`` keyword.
    *args, **kwargs
        Forwarded to `tf.Variable` the first time `name` is requested. They
        are ignored on later calls with the same name.

    Returns
    -------
    result
        The variable registered under `name`, after following any chain of
        entries in `_param_aliases`.
    """
    if name not in _params:
        kwargs["name"] = name
        variable = tf.Variable(*args, **kwargs)
        variable.param = True
        _params[name] = variable

    # Follow aliases transitively until an un-aliased variable is reached.
    result = _params[name]
    while result in _param_aliases:
        result = _param_aliases[result]
    return result


def params_with_name(name):
    """
    Return every registered parameter whose registry key contains `name`.

    Parameters
    ----------
    name : str
        Substring to look for in the keys of `_params`.

    Returns
    -------
    matches : list
        The matching variables, in the iteration order of `_params`.
    """
    matches = []
    for key, variable in _params.items():
        if name in key:
            matches.append(variable)
    return matches


def ReLULayer(name, n_in, n_out, inputs, w_initialization):
    """
    Fully-connected layer followed by a ReLU: ``relu(inputs @ W + b)``.

    Parameters
    ----------
    name : str
        Prefix for the layer parameters, which are registered through `param`
        as ``name + ".W"`` and ``name + ".b"``.
    n_in : int
        Unused; kept for signature compatibility (the shape of `W` comes from
        `w_initialization`).
    n_out : int
        Length of the bias vector, which is created as zeros.
    inputs : tensor
        Layer input; right-multiplied by `W`.
    w_initialization : :py:class:`ndarray <numpy.ndarray>`
        Initial weight values. Cast to float32 before use.

    Returns
    -------
    output : tensor
        The layer activation.
    W : variable
        The weight variable returned by `param`.

    Raises
    ------
    ValueError
        If `w_initialization` is not a numpy array. Named schemes such as
        ``"he"`` are not implemented here.
    """
    if not isinstance(w_initialization, np.ndarray):
        # Previously this fell through and failed with an UnboundLocalError on
        # `weight_values`; fail with an actionable message instead.
        raise ValueError(
            "w_initialization must be a numpy array of weight values, "
            "got {!r}".format(w_initialization)
        )
    weight_values = w_initialization.astype("float32")

    W = param(name + ".W", weight_values)
    result = tf.matmul(inputs, W)
    output = tf.nn.bias_add(
        result, param(name + ".b", np.zeros((n_out,), dtype="float32"))
    )
    output = tf.nn.relu(output)
    return output, W


def LinearLayer(name, n_in, n_out, inputs, w_initialization):
    """
    Fully-connected layer without a nonlinearity: ``inputs @ W + b``.

    Parameters
    ----------
    name : str
        Prefix for the layer parameters, which are registered through `param`
        as ``name + ".W"`` and ``name + ".b"``.
    n_in : int
        Unused; kept for signature compatibility (the shape of `W` comes from
        `w_initialization`).
    n_out : int
        Length of the bias vector, which is created as zeros.
    inputs : tensor
        Layer input; right-multiplied by `W`.
    w_initialization : :py:class:`ndarray <numpy.ndarray>`
        Initial weight values. Cast to float32 before use.

    Returns
    -------
    output : tensor
        The affine layer output.
    W : variable
        The weight variable returned by `param`.

    Raises
    ------
    ValueError
        If `w_initialization` is not a numpy array. Named schemes such as
        ``"he"`` are not implemented here.
    """
    if not isinstance(w_initialization, np.ndarray):
        # Previously this fell through and failed with an UnboundLocalError on
        # `weight_values`; fail with an actionable message instead.
        raise ValueError(
            "w_initialization must be a numpy array of weight values, "
            "got {!r}".format(w_initialization)
        )
    weight_values = w_initialization.astype("float32")

    W = param(name + ".W", weight_values)
    result = tf.matmul(inputs, W)
    output = tf.nn.bias_add(
        result, param(name + ".b", np.zeros((n_out,), dtype="float32"))
    )
    return output, W


def Generator(n_samples, X_real, params=None):
    """
    Build the 4-layer generator: three ReLU layers followed by a linear layer.

    Parameters
    ----------
    n_samples : int
        Number of rows of the default random-normal noise tensor.
    X_real : tensor
        Unused by this function.
    params : dict or None
        When given, supplies ``"noise"``, the initial weights under
        ``params["generator"]["FC1".."FC4"]["W"]``, the hidden width
        ``"g_hidden"`` and the feature count ``"n_in"``.

    Returns
    -------
    output : tensor
        Output of the final (linear) layer.
    outs : dict
        Output of each layer, keyed ``"FC1"``..``"FC4"``.
    weights : dict
        Weight variable of each layer, keyed ``"FC1"``..``"FC4"``.

    Notes
    -----
    NOTE(review): the hidden width is only assigned when `params` is given, so
    calling with ``params=None`` fails before any layer is built.
    """
    layer_keys = ("FC1", "FC2", "FC3", "FC4")
    n_feats = 2
    init = dict.fromkeys(layer_keys, "he")
    noise = tf.random.normal([n_samples, 2])
    if params is not None:
        noise = tf.convert_to_tensor(params["noise"], dtype="float32")
        init = {key: params["generator"][key]["W"] for key in layer_keys}
        DIM = params["g_hidden"]
        n_feats = params["n_in"]

    outs, weights = {}, {}

    # hidden ReLU layers: n_feats -> DIM -> DIM -> DIM
    hidden, fan_in = noise, n_feats
    for idx in (1, 2, 3):
        key = "FC{}".format(idx)
        hidden, W = ReLULayer(
            "Generator.{}".format(idx), fan_in, DIM, hidden, w_initialization=init[key]
        )
        outs[key] = hidden
        weights[key] = W
        fan_in = DIM

    # linear read-out back to feature space
    output, W = LinearLayer(
        "Generator.4", DIM, n_feats, hidden, w_initialization=init["FC4"]
    )
    outs["FC4"] = output
    weights["FC4"] = W
    return output, outs, weights


def Discriminator(inputs, params=None):
    """
    Build the 4-layer critic: three ReLU layers followed by a 1-unit linear layer.

    Parameters
    ----------
    inputs : tensor
        Input to the first layer.
    params : dict or None
        When given, supplies the initial weights under
        ``params["critic"]["FC1".."FC4"]["W"]``, the hidden width
        ``"g_hidden"`` and the feature count ``"n_in"``.

    Returns
    -------
    scores : tensor
        Final layer output reshaped to ``[-1]``.
    outs : dict
        Output of each layer, keyed ``"FC1"``..``"FC4"``.
    weights : dict
        Weight variable of each layer keyed ``"FC1"``..``"FC4"``, plus any
        bias variables found in the registry keyed ``"FC1_b"``..``"FC4_b"``.

    Notes
    -----
    NOTE(review): the hidden width is read from ``params["g_hidden"]`` and is
    only assigned when `params` is given, so ``params=None`` fails before any
    layer is built.
    """
    layer_keys = ("FC1", "FC2", "FC3", "FC4")
    n_feats = 2
    init = dict.fromkeys(layer_keys, "he")
    if params is not None:
        init = {key: params["critic"][key]["W"] for key in layer_keys}
        DIM = params["g_hidden"]
        n_feats = params["n_in"]

    outs, weights = {}, {}

    # hidden ReLU layers: n_feats -> DIM -> DIM -> DIM
    hidden, fan_in = inputs, n_feats
    for idx in (1, 2, 3):
        key = "FC{}".format(idx)
        hidden, W = ReLULayer(
            "Discriminator.{}".format(idx),
            fan_in,
            DIM,
            hidden,
            w_initialization=init[key],
        )
        outs[key] = hidden
        weights[key] = W
        fan_in = DIM

    # scalar linear read-out
    output, W = LinearLayer(
        "Discriminator.4", DIM, 1, hidden, w_initialization=init["FC4"]
    )
    outs["FC4"] = output
    weights["FC4"] = W

    # get bias: match registered variables by the "<layer>.b:" part of their name
    bias_tags = (
        ("1.b:", "FC1_b"),
        ("2.b:", "FC2_b"),
        ("3.b:", "FC3_b"),
        ("4.b:", "FC4_b"),
    )
    for var in params_with_name("Discriminator"):
        for tag, key in bias_tags:
            if tag in var.name:
                weights[key] = var
                break

    return tf.reshape(output, [-1]), outs, weights


def WGAN_GP_tf(X, lambda_, params, batch_size):
    """
    Build a TF1-style WGAN-GP graph and return its intermediate values.

    Eager execution is disabled, a generator and a critic are constructed from
    `params`, the critic loss (with gradient penalty) and generator loss are
    defined, and the forward values and gradients are evaluated in a session.

    Parameters
    ----------
    X : array
        Real examples; fed to a float32 placeholder of shape
        ``[None, params["n_in"]]``. ``X.shape[0]`` is used as the batch size.
    lambda_ : float
        Coefficient multiplying the gradient penalty added to the critic loss.
    params : dict
        Read here for ``"n_steps"``, ``"c_updates_per_epoch"``, ``"alpha"`` and
        ``"n_in"``; also passed on to `Generator` and `Discriminator`.
    batch_size : int
        NOTE(review): overwritten with ``X.shape[0]`` on the first line that
        uses it, so the passed value is ignored.

    Returns
    -------
    grads : dict
        Inputs, per-layer weights and outputs, losses and gradients fetched
        during the last pass through the loops.

    Notes
    -----
    No optimizer or update op is among the fetches, so each pass through the
    loops only evaluates the graph.

    NOTE(review): `_data` and all fetched values are bound inside the loops; if
    ``n_steps`` or ``c_updates_per_epoch`` is 0 the code after the loops would
    reference unbound names.
    """
    tf.compat.v1.disable_eager_execution()

    # NOTE(review): shadows the `batch_size` argument
    batch_size = X.shape[0]

    # get alpha value
    n_steps = params["n_steps"]
    c_updates_per_epoch = params["c_updates_per_epoch"]
    alpha = tf.convert_to_tensor(params["alpha"], dtype="float32")

    X_real = tf.compat.v1.placeholder(tf.float32, shape=[None, params["n_in"]])
    X_fake, G_out_X_fake, G_weights = Generator(batch_size, X_real, params)

    # `Discriminator` obtains its variables through `param`, which returns the
    # already-registered variable for a repeated name, so each call below
    # reuses the same critic variables.
    Y_real, C_out_Y_real, C_Y_real_weights = Discriminator(X_real, params)
    Y_fake, C_out_Y_fake, C_Y_fake_weights = Discriminator(X_fake, params)

    # WGAN loss
    mean_fake = tf.reduce_mean(Y_fake)
    mean_real = tf.reduce_mean(Y_real)

    C_loss = tf.reduce_mean(Y_fake) - tf.reduce_mean(Y_real)
    G_loss = -tf.reduce_mean(Y_fake)

    # WGAN gradient penalty
    X_interp = alpha * X_real + ((1 - alpha) * X_fake)
    Y_interp, C_out_Y_interp, C_Y_interp_weights = Discriminator(X_interp, params)
    gradInterp = tf.gradients(Y_interp, [X_interp])[0]

    # per-row L2 norm of the critic gradient at the interpolated points
    norm_gradInterp = tf.sqrt(
        tf.compat.v1.reduce_sum(tf.square(gradInterp), reduction_indices=[1])
    )
    gradient_penalty = tf.reduce_mean((norm_gradInterp - 1) ** 2)
    C_loss += lambda_ * gradient_penalty

    # extract gradient of Y_interp wrt. each layer output in critic
    C_bwd_Y_interp = {}
    for k, v in C_out_Y_interp.items():
        C_bwd_Y_interp[k] = tf.gradients(Y_interp, [v])[0]

    # gradient of the full critic loss wrt. every entry of the critic weight
    # dict (weights "FCk" and the biases "FCk_b" collected by `Discriminator`)
    C_bwd_W = {}
    for k, v in C_Y_interp_weights.items():
        C_bwd_W[k] = tf.gradients(C_loss, [v])[0]

    # get gradients
    dC_Y_fake = tf.gradients(C_loss, [Y_fake])[0]
    dC_Y_real = tf.gradients(C_loss, [Y_real])[0]
    dC_gradInterp = tf.gradients(C_loss, [gradInterp])[0]
    dG_Y_fake = tf.gradients(G_loss, [Y_fake])[0]

    with tf.compat.v1.Session() as session:
        session.run(tf.compat.v1.global_variables_initializer())

        for iteration in range(n_steps):
            # Train critic
            for i in range(c_updates_per_epoch):
                _data = X
                # The unpacking targets below must stay in the same order as
                # the fetch list passed to `session.run`.
                (
                    _alpha,
                    _X_interp,
                    _Y_interp,
                    _gradInterp,
                    _norm_gradInterp,
                    _gradient_penalty,
                    _C_loss,
                    _X_fake,
                    _Y_fake,
                    _Y_real,
                    _dC_Y_fake,
                    _dC_Y_real,
                    _dC_gradInterp,
                    _dG_Y_fake,
                    _mean_fake,
                    _mean_real,
                    _G_weights_FC1,
                    _G_weights_FC2,
                    _G_weights_FC3,
                    _G_weights_FC4,
                    _G_fwd_X_fake_FC1,
                    _G_fwd_X_fake_FC2,
                    _G_fwd_X_fake_FC3,
                    _G_fwd_X_fake_FC4,
                    _C_weights_Y_fake_FC1,
                    _C_weights_Y_fake_FC2,
                    _C_weights_Y_fake_FC3,
                    _C_weights_Y_fake_FC4,
                    _C_fwd_Y_fake_FC1,
                    _C_fwd_Y_fake_FC2,
                    _C_fwd_Y_fake_FC3,
                    _C_fwd_Y_fake_FC4,
                    _C_weights_Y_real_FC1,
                    _C_weights_Y_real_FC2,
                    _C_weights_Y_real_FC3,
                    _C_weights_Y_real_FC4,
                    _C_fwd_Y_real_FC1,
                    _C_fwd_Y_real_FC2,
                    _C_fwd_Y_real_FC3,
                    _C_fwd_Y_real_FC4,
                    _C_weights_Y_interp_FC1,
                    _C_weights_Y_interp_FC2,
                    _C_weights_Y_interp_FC3,
                    _C_weights_Y_interp_FC4,
                    _C_dY_interp_wrt_FC1,
                    _C_dY_interp_wrt_FC2,
                    _C_dY_interp_wrt_FC3,
                    _C_dY_interp_wrt_FC4,
                    _C_fwd_Y_interp_FC1,
                    _C_fwd_Y_interp_FC2,
                    _C_fwd_Y_interp_FC3,
                    _C_fwd_Y_interp_FC4,
                    _C_dW_FC1,
                    _C_db_FC1,
                    _C_dW_FC2,
                    _C_db_FC2,
                    _C_dW_FC3,
                    _C_db_FC3,
                    _C_dW_FC4,
                    _C_db_FC4,
                ) = session.run(
                    [
                        alpha,
                        X_interp,
                        Y_interp,
                        gradInterp,
                        norm_gradInterp,
                        gradient_penalty,
                        C_loss,
                        X_fake,
                        Y_fake,
                        Y_real,
                        dC_Y_fake,
                        dC_Y_real,
                        dC_gradInterp,
                        dG_Y_fake,
                        mean_fake,
                        mean_real,
                        G_weights["FC1"],
                        G_weights["FC2"],
                        G_weights["FC3"],
                        G_weights["FC4"],
                        G_out_X_fake["FC1"],
                        G_out_X_fake["FC2"],
                        G_out_X_fake["FC3"],
                        G_out_X_fake["FC4"],
                        C_Y_fake_weights["FC1"],
                        C_Y_fake_weights["FC2"],
                        C_Y_fake_weights["FC3"],
                        C_Y_fake_weights["FC4"],
                        C_out_Y_fake["FC1"],
                        C_out_Y_fake["FC2"],
                        C_out_Y_fake["FC3"],
                        C_out_Y_fake["FC4"],
                        C_Y_real_weights["FC1"],
                        C_Y_real_weights["FC2"],
                        C_Y_real_weights["FC3"],
                        C_Y_real_weights["FC4"],
                        C_out_Y_real["FC1"],
                        C_out_Y_real["FC2"],
                        C_out_Y_real["FC3"],
                        C_out_Y_real["FC4"],
                        C_Y_interp_weights["FC1"],
                        C_Y_interp_weights["FC2"],
                        C_Y_interp_weights["FC3"],
                        C_Y_interp_weights["FC4"],
                        C_bwd_Y_interp["FC1"],
                        C_bwd_Y_interp["FC2"],
                        C_bwd_Y_interp["FC3"],
                        C_bwd_Y_interp["FC4"],
                        C_out_Y_interp["FC1"],
                        C_out_Y_interp["FC2"],
                        C_out_Y_interp["FC3"],
                        C_out_Y_interp["FC4"],
                        C_bwd_W["FC1"],
                        C_bwd_W["FC1_b"],
                        C_bwd_W["FC2"],
                        C_bwd_W["FC2_b"],
                        C_bwd_W["FC3"],
                        C_bwd_W["FC3_b"],
                        C_bwd_W["FC4"],
                        C_bwd_W["FC4_b"],
                    ],
                    feed_dict={X_real: _data},
                )

            # NOTE(review): `_data` is bound only inside the inner loop above
            _G_loss = session.run(G_loss, feed_dict={X_real: _data})

        # Collect the values from the last pass under the keys callers expect.
        grads = {
            "X_real": _data,
            "X_interp": _X_interp,
            "G_weights_FC1": _G_weights_FC1,
            "G_weights_FC2": _G_weights_FC2,
            "G_weights_FC3": _G_weights_FC3,
            "G_weights_FC4": _G_weights_FC4,
            "G_fwd_X_fake_FC1": _G_fwd_X_fake_FC1,
            "G_fwd_X_fake_FC2": _G_fwd_X_fake_FC2,
            "G_fwd_X_fake_FC3": _G_fwd_X_fake_FC3,
            "G_fwd_X_fake_FC4": _G_fwd_X_fake_FC4,
            "X_fake": _X_fake,
            "C_weights_Y_fake_FC1": _C_weights_Y_fake_FC1,
            "C_weights_Y_fake_FC2": _C_weights_Y_fake_FC2,
            "C_weights_Y_fake_FC3": _C_weights_Y_fake_FC3,
            "C_weights_Y_fake_FC4": _C_weights_Y_fake_FC4,
            "C_fwd_Y_fake_FC1": _C_fwd_Y_fake_FC1,
            "C_fwd_Y_fake_FC2": _C_fwd_Y_fake_FC2,
            "C_fwd_Y_fake_FC3": _C_fwd_Y_fake_FC3,
            "C_fwd_Y_fake_FC4": _C_fwd_Y_fake_FC4,
            "Y_fake": _Y_fake,
            "C_weights_Y_real_FC1": _C_weights_Y_real_FC1,
            "C_weights_Y_real_FC2": _C_weights_Y_real_FC2,
            "C_weights_Y_real_FC3": _C_weights_Y_real_FC3,
            "C_weights_Y_real_FC4": _C_weights_Y_real_FC4,
            "C_fwd_Y_real_FC1": _C_fwd_Y_real_FC1,
            "C_fwd_Y_real_FC2": _C_fwd_Y_real_FC2,
            "C_fwd_Y_real_FC3": _C_fwd_Y_real_FC3,
            "C_fwd_Y_real_FC4": _C_fwd_Y_real_FC4,
            "Y_real": _Y_real,
            "C_weights_Y_interp_FC1": _C_weights_Y_interp_FC1,
            "C_weights_Y_interp_FC2": _C_weights_Y_interp_FC2,
            "C_weights_Y_interp_FC3": _C_weights_Y_interp_FC3,
            "C_weights_Y_interp_FC4": _C_weights_Y_interp_FC4,
            "C_fwd_Y_interp_FC1": _C_fwd_Y_interp_FC1,
            "C_fwd_Y_interp_FC2": _C_fwd_Y_interp_FC2,
            "C_fwd_Y_interp_FC3": _C_fwd_Y_interp_FC3,
            "C_fwd_Y_interp_FC4": _C_fwd_Y_interp_FC4,
            "Y_interp": _Y_interp,
            "dY_interp_wrt_FC1": _C_dY_interp_wrt_FC1,
            "dY_interp_wrt_FC2": _C_dY_interp_wrt_FC2,
            "dY_interp_wrt_FC3": _C_dY_interp_wrt_FC3,
            "dY_interp_wrt_FC4": _C_dY_interp_wrt_FC4,
            "gradInterp": _gradInterp,
            "gradInterp_norm": _norm_gradInterp,
            "G_loss": _G_loss,
            "C_loss": _C_loss,
            "dC_loss_dW_FC1": _C_dW_FC1,
            "dC_loss_db_FC1": _C_db_FC1,
            "dC_loss_dW_FC2": _C_dW_FC2,
            "dC_loss_db_FC2": _C_db_FC2,
            "dC_loss_dW_FC3": _C_dW_FC3,
            "dC_loss_db_FC3": _C_db_FC3,
            "dC_loss_dW_FC4": _C_dW_FC4,
            "dC_loss_db_FC4": _C_db_FC4,
            "dC_Y_fake": _dC_Y_fake,
            "dC_Y_real": _dC_Y_real,
            "dC_gradInterp": _dC_gradInterp,
            "dG_Y_fake": _dG_Y_fake,
        }
    return grads


def TFNCELoss(X, target_word, L):
    """
    Evaluate TensorFlow's NCE loss, its gradients and the sampled logits for `L`.

    Parameters
    ----------
    X : array
        Input embeddings fed to ``tf.nn.nce_loss`` as ``inputs``.
    target_word : array
        Target labels fed to ``tf.nn.nce_loss`` as ``labels``.
    L : layer object
        Provides ``parameters["W"]``, ``parameters["b"]``,
        ``derived_variables["noise_samples"]`` (a 3-item sequence used as the
        ``sampled_values`` triple), ``num_negative_samples`` and ``n_classes``.

    Returns
    -------
    results : dict
        Keys ``"final_loss"``, ``"nce_unreduced"``, ``"dLdW"``, ``"dLdb"``,
        ``"dLdX"``, ``"out_logits"``, ``"out_labels"`` and ``"sampled_loss"``.
    """
    from tensorflow.python.ops.nn_impl import _compute_sampled_logits
    from tensorflow.python.ops.nn_impl import sigmoid_cross_entropy_with_logits

    tf.compat.v1.disable_eager_execution()

    placeholder = tf.compat.v1.placeholder
    W_value = L.parameters["W"]
    b_value = L.parameters["b"].flatten()
    noise = L.derived_variables["noise_samples"]

    in_embed = placeholder(tf.float32, shape=X.shape)
    in_bias = placeholder(tf.float32, shape=b_value.shape)
    in_weights = placeholder(tf.float32, shape=W_value.shape)
    in_target_word = placeholder(tf.int64)
    in_neg_samples = placeholder(tf.int32)
    in_target_prob = placeholder(tf.float32)
    in_neg_samp_prob = placeholder(tf.float32)

    feed = {
        in_embed: X,
        in_weights: W_value,
        in_target_word: target_word,
        in_bias: b_value,
        in_neg_samples: noise[0],
        in_target_prob: noise[1],
        in_neg_samp_prob: noise[2],
    }

    # Arguments shared by the loss and by the sampled-logits helper.
    sampling_kwargs = dict(
        weights=in_weights,
        biases=in_bias,
        labels=in_target_word,
        inputs=in_embed,
        sampled_values=(in_neg_samples, in_target_prob, in_neg_samp_prob),
        num_sampled=L.num_negative_samples,
        num_classes=L.n_classes,
    )

    # Compute the NCE loss, using a sample of the negative labels each time.
    nce_unreduced = tf.nn.nce_loss(**sampling_kwargs)

    loss = tf.reduce_sum(nce_unreduced)
    dLdW, dLdb, dLdX = [
        tf.gradients(loss, [wrt])[0] for wrt in (in_weights, in_bias, in_embed)
    ]

    sampled_logits, sampled_labels = _compute_sampled_logits(
        num_true=1, subtract_log_q=True, **sampling_kwargs
    )

    sampled_losses = sigmoid_cross_entropy_with_logits(
        labels=sampled_labels, logits=sampled_logits
    )

    # (result key, tensor) pairs; fetched in this order in a single run
    fetches = [
        ("final_loss", loss),
        ("nce_unreduced", nce_unreduced),
        ("dLdW", dLdW),
        ("dLdb", dLdb),
        ("dLdX", dLdX),
        ("out_logits", sampled_logits),
        ("out_labels", sampled_labels),
        ("sampled_loss", sampled_losses),
    ]

    with tf.compat.v1.Session() as session:
        session.run(tf.compat.v1.global_variables_initializer())
        values = session.run([tensor for _, tensor in fetches], feed_dict=feed)
    tf.compat.v1.reset_default_graph()

    return {key: value for (key, _), value in zip(fetches, values)}


================================================
FILE: numpy_ml/tests/test_glm.py
================================================
# flake8: noqa
import numpy as np

import statsmodels.api as sm
from numpy_ml.linear_models import GeneralizedLinearModel
from numpy_ml.linear_models.glm import _GLM_LINKS
from numpy_ml.utils.testing import random_tensor


def test_glm(N=20):
    """
    Check `GeneralizedLinearModel` against `statsmodels.api.GLM` on random data.

    Parameters
    ----------
    N : int or None
        Number of random trials to run. ``None`` runs indefinitely.
    """
    np.random.seed(12345)
    if N is None:
        N = np.inf

    # statsmodels family paired with each link name
    family_for_link = {
        "identity": sm.families.Gaussian,
        "logit": sm.families.Binomial,
        "log": sm.families.Poisson,
    }

    trial = 0
    while trial < N:
        n_samples = np.random.randint(10, 100)

        # n_feats << n_samples to avoid perfect separation / multiple solutions
        n_feats = np.random.randint(1, 1 + n_samples // 2)
        target_dim = 1

        fit_intercept = np.random.choice([True, False])
        _link = np.random.choice(list(_GLM_LINKS.keys()))

        print(f"Link: {_link}")
        print(f"Fit intercept: {fit_intercept}")

        X = random_tensor((n_samples, n_feats), standardize=True)

        # draw targets appropriate for the chosen link
        y_shape = (n_samples, target_dim)
        if _link == "identity":
            y = random_tensor(y_shape, standardize=True)
        elif _link == "log":
            y = np.random.choice(np.arange(0, 100), size=y_shape)
        elif _link == "logit":
            y = np.random.choice([0.0, 1.0], size=y_shape)
        else:
            raise ValueError(f"Unknown link function {_link}")

        # Fit gold standard model on the entire dataset
        if fit_intercept:
            Xdesign = np.c_[np.ones(X.shape[0]), X]
        else:
            Xdesign = X
        glm_gold = sm.GLM(y, Xdesign, family=family_for_link[_link]()).fit()

        glm_mine = GeneralizedLinearModel(link=_link, fit_intercept=fit_intercept)
        glm_mine.fit(X, y)

        # check that model coefficients match
        np.testing.assert_almost_equal(
            glm_mine.beta.T.ravel(), glm_gold.params, decimal=6
        )
        print("\t1. Overall model coefficients match")

        # check that model predictions match
        mine_preds = glm_mine.predict(X)
        gold_preds = glm_gold.predict(Xdesign)
        np.testing.assert_almost_equal(mine_preds, gold_preds, decimal=5)
        print("\t2. Overall model predictions match")

        print("\tPASSED\n")
        trial += 1


================================================
FILE: numpy_ml/tests/test_linear_regression.py
================================================
# flake8: noqa
import numpy as np

from sklearn.linear_model import LinearRegression as LinearRegressionGold

from numpy_ml.linear_models import LinearRegression
from numpy_ml.utils.testing import random_tensor


def test_linear_regression(N=10):
    """
    Check `LinearRegression` (batch fit and online updates) against scikit-learn.

    Each trial fits both models on a random, optionally weighted dataset and
    compares predictions and coefficients. It then fits our model on a prefix
    of the data, applies `update` with the remaining rows (one at a time or
    all at once), and compares against the same gold-standard fit.

    Parameters
    ----------
    N : int or None
        Number of random trials to run. ``None`` runs indefinitely.
    """
    np.random.seed(12345)
    N = np.inf if N is None else N

    i = 1
    while i < N + 1:
        train_samples = np.random.randint(2, 30)
        update_samples = np.random.randint(1, 30)
        n_samples = train_samples + update_samples

        # ensure n_feats < train_samples, otherwise multiple solutions are
        # possible
        n_feats = np.random.randint(1, train_samples)
        target_dim = np.random.randint(1, 10)

        fit_intercept = np.random.choice([True, False])

        X = random_tensor((n_samples, n_feats), standardize=True)
        y = random_tensor((n_samples, target_dim), standardize=True)

        weighted = np.random.choice([True, False])
        weights = np.random.rand(n_samples) if weighted else np.ones(n_samples)

        X_train, X_update = X[:train_samples], X[train_samples:]
        y_train, y_update = y[:train_samples], y[train_samples:]
        w_train, w_update = weights[:train_samples], weights[train_samples:]

        print(f"Weights: {weighted}")
        print(f"Fit intercept: {fit_intercept}")

        # Fit gold standard model on the entire dataset.
        # `normalize` is intentionally not passed: False was its default, and
        # the keyword was removed from scikit-learn in version 1.2.
        lr_gold = LinearRegressionGold(fit_intercept=fit_intercept)
        lr_gold.fit(X, y, sample_weight=weights)

        lr_mine = LinearRegression(fit_intercept=fit_intercept)
        lr_mine.fit(X, y, weights=weights)

        # check that model predictions match
        np.testing.assert_almost_equal(
            lr_mine.predict(X), lr_gold.predict(X), decimal=5
        )
        print("\t1. Overall model predictions match")

        # check that model coefficients match (drop our intercept column)
        beta = lr_mine.beta.T[:, 1:] if fit_intercept else lr_mine.beta.T
        np.testing.assert_almost_equal(beta, lr_gold.coef_, decimal=6)
        print("\t2. Overall model coefficients match")

        # Fit our model on just (X_train, y_train)...
        lr = LinearRegression(fit_intercept=fit_intercept)
        lr.fit(X_train, y_train, weights=w_train)

        do_single_sample_update = np.random.choice([True, False])

        # ...then update our model on the examples (X_update, y_update)
        if do_single_sample_update:
            for x_new, y_new, w_new in zip(X_update, y_update, w_update):
                lr.update(x_new, y_new, w_new)
        else:
            lr.update(X_update, y_update, w_update)

        # check that model predictions match
        np.testing.assert_almost_equal(lr.predict(X), lr_gold.predict(X), decimal=5)
        print("\t3. Iterative model predictions match")

        # check that model coefficients match (drop our intercept column)
        beta = lr.beta.T[:, 1:] if fit_intercept else lr.beta.T
        np.testing.assert_almost_equal(beta, lr_gold.coef_, decimal=6)
        print("\t4. Iterative model coefficients match")

        print("\tPASSED\n")
        i += 1


================================================
FILE: numpy_ml/tests/test_naive_bayes.py
================================================
# flake8: noqa
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

from sklearn import naive_bayes

from numpy_ml.linear_models import GaussianNBClassifier
from numpy_ml.utils.testing import random_tensor


def test_GaussianNB(N=10):
    """
    Check `GaussianNBClassifier` against scikit-learn's `GaussianNB`.

    Each trial fits both models on random data and compares, per class, the
    log prior and the Gaussian log-likelihood terms, then the fitted
    parameters, the joint log-likelihood and the predictions on a test set.

    Parameters
    ----------
    N : int or None
        Number of random trials to run. ``None`` runs indefinitely.
    """
    np.random.seed(12345)
    N = np.inf if N is None else N

    i = 1
    eps = np.finfo(float).eps
    while i < N + 1:
        n_ex = np.random.randint(1, 300)
        n_feats = np.random.randint(1, 100)
        n_classes = np.random.randint(2, 10)

        X = random_tensor((n_ex, n_feats), standardize=True)
        y = np.random.randint(0, n_classes, size=n_ex)

        X_test = random_tensor((n_ex, n_feats), standardize=True)

        NB = GaussianNBClassifier(eps=1e-09)
        NB.fit(X, y)

        preds = NB.predict(X_test)

        sklearn_NB = naive_bayes.GaussianNB()
        sklearn_NB.fit(X, y)

        sk_preds = sklearn_NB.predict(X_test)

        # scikit-learn renamed `sigma_` to `var_` (deprecated in 1.0, removed
        # in 1.2); prefer the new name and fall back for older releases.
        if hasattr(sklearn_NB, "var_"):
            sk_var = sklearn_NB.var_
        else:
            sk_var = sklearn_NB.sigma_

        # bound before the loop because it is also used after it
        P = NB.parameters

        for j in range(len(NB.labels)):
            # log prior of class j
            jointi = np.log(sklearn_NB.class_prior_[j])
            jointi_mine = np.log(P["prior"][j])

            np.testing.assert_almost_equal(jointi, jointi_mine)

            # normalization term of the per-feature Gaussians
            n_jk = -0.5 * np.sum(np.log(2.0 * np.pi * sk_var[j, :] + eps))
            n_jk_mine = -0.5 * np.sum(np.log(2.0 * np.pi * P["sigma"][j] + eps))

            np.testing.assert_almost_equal(n_jk_mine, n_jk)

            # add the quadratic term for every test example
            n_jk2 = n_jk - 0.5 * np.sum(
                ((X_test - sklearn_NB.theta_[j, :]) ** 2) / (sk_var[j, :]), 1
            )

            n_jk2_mine = n_jk_mine - 0.5 * np.sum(
                ((X_test - P["mean"][j]) ** 2) / (P["sigma"][j]), 1
            )
            np.testing.assert_almost_equal(n_jk2_mine, n_jk2, decimal=4)

            llh = jointi + n_jk2
            llh_mine = jointi_mine + n_jk2_mine

            np.testing.assert_almost_equal(llh_mine, llh, decimal=4)

        np.testing.assert_almost_equal(P["prior"], sklearn_NB.class_prior_)
        np.testing.assert_almost_equal(P["mean"], sklearn_NB.theta_)
        np.testing.assert_almost_equal(P["sigma"], sk_var)
        np.testing.assert_almost_equal(
            sklearn_NB._joint_log_likelihood(X_test),
            NB._log_posterior(X_test),
            decimal=4,
        )
        np.testing.assert_almost_equal(preds, sk_preds)
        print("PASSED")
        i += 1


================================================
FILE: numpy_ml/tests/test_ngram.py
================================================
# flake8: noqa
import tempfile

import nltk
import numpy as np

from ..preprocessing.nlp import tokenize_words
from ..ngram import AdditiveNGram, MLENGram
from ..utils.testing import random_paragraph


class MLEGold:
    """
    Reference N-gram model that delegates probability estimates to
    `nltk.lm.MLE`, with one model per n-gram order from 1 to `N`.
    """

    def __init__(
        self, N, K=1, unk=True, filter_stopwords=True, filter_punctuation=True
    ):
        """Store the model settings as attributes and in `hyperparameters`."""
        self.hyperparameters = {
            "N": N,
            "K": K,
            "unk": unk,
            "filter_stopwords": filter_stopwords,
            "filter_punctuation": filter_punctuation,
        }
        self.N, self.K, self.unk = N, K, unk
        self.filter_stopwords = filter_stopwords
        self.filter_punctuation = filter_punctuation

    def train(self, corpus_fp, vocab=None, encoding=None):
        """
        Count n-grams in the corpus at `corpus_fp` and fit one nltk model per order.

        Parameters
        ----------
        corpus_fp : str
            Path of the text file to read, one line at a time.
        vocab : object or None
            If given, each tokenized line is passed through ``vocab.filter``
            and ``len(vocab)`` is stored as `n_tokens`.
        encoding : str or None
            Encoding passed to `open`.
        """
        hyper = self.hyperparameters
        orders = range(1, self.N + 1)
        per_line = {order: [] for order in orders}  # one n-gram iterator per line
        flat = {order: [] for order in orders}  # every n-gram, for counting
        drop_punc = hyper["filter_punctuation"]
        drop_stop = hyper["filter_stopwords"]
        padding = {
            "pad_left": True,
            "pad_right": True,
            "left_pad_symbol": "<bol>",
            "right_pad_symbol": "<eol>",
        }

        word_total = 0
        seen = set()

        with open(corpus_fp, "r", encoding=encoding) as corpus:
            for line in corpus:
                words = tokenize_words(line, drop_punc, drop_stop)

                if vocab is not None:
                    words = vocab.filter(words, hyper["unk"])

                if len(words) == 0:
                    continue

                word_total += len(words)
                seen.update(words)

                # calculate n, n-1, ... 1-grams
                for order in orders:
                    per_line[order].append(nltk.ngrams(words, order, **padding))
                    flat[order].extend(nltk.ngrams(words, order, **padding))

        fitted, freqs = {}, {}
        for order in orders:
            freqs[order] = nltk.FreqDist(flat[order])
            fitted[order] = nltk.lm.MLE(order=order)
            fitted[order].fit(per_line[order], seen)

        self.counts = freqs
        self.n_words = word_total
        self._models = fitted
        if vocab is not None:
            self.n_tokens = len(vocab)
        else:
            self.n_tokens = len(seen)

    def log_prob(self, words, N):
        """
        Sum the nltk log scores of every `N`-gram in `words`.

        Raises
        ------
        ValueError
            If `words` has fewer than `N` items.
        """
        assert N in self.counts, "You do not have counts for {}-grams".format(N)

        n_given = len(words)
        if N > n_given:
            raise ValueError(
                "Not enough words for a gram-size of {}: {}".format(N, n_given)
            )

        return sum(self._log_ngram_prob(gram) for gram in nltk.ngrams(words, N))

    def _log_ngram_prob(self, ngram):
        """Score the last token of `ngram` given the preceding tokens."""
        *context, target = ngram
        model = self._models[len(ngram)]
        return model.logscore(target, tuple(context))


class AdditiveGold:
    """
    Reference additive-smoothing N-gram model that delegates probability
    estimates to `nltk.lm.Lidstone` (with ``gamma=K``), with one model per
    n-gram order from 1 to `N`.
    """

    def __init__(
        self, N, K=1, unk=True, filter_stopwords=True, filter_punctuation=True
    ):
        """Store the model settings as attributes and in `hyperparameters`."""
        self.hyperparameters = {
            "N": N,
            "K": K,
            "unk": unk,
            "filter_stopwords": filter_stopwords,
            "filter_punctuation": filter_punctuation,
        }
        self.N, self.K, self.unk = N, K, unk
        self.filter_stopwords = filter_stopwords
        self.filter_punctuation = filter_punctuation

    def train(self, corpus_fp, vocab=None, encoding=None):
        """
        Count n-grams in the corpus at `corpus_fp` and fit one nltk model per order.

        Parameters
        ----------
        corpus_fp : str
            Path of the text file to read, one line at a time.
        vocab : object or None
            If given, each tokenized line is passed through ``vocab.filter``
            and ``len(vocab)`` is stored as `n_tokens`.
        encoding : str or None
            Encoding passed to `open`.
        """
        hyper = self.hyperparameters
        orders = range(1, self.N + 1)
        per_line = {order: [] for order in orders}  # one n-gram iterator per line
        flat = {order: [] for order in orders}  # every n-gram, for counting
        drop_punc = hyper["filter_punctuation"]
        drop_stop = hyper["filter_stopwords"]
        padding = {
            "pad_left": True,
            "pad_right": True,
            "left_pad_symbol": "<bol>",
            "right_pad_symbol": "<eol>",
        }

        word_total = 0
        seen = set()

        with open(corpus_fp, "r", encoding=encoding) as corpus:
            for line in corpus:
                words = tokenize_words(line, drop_punc, drop_stop)

                if vocab is not None:
                    words = vocab.filter(words, hyper["unk"])

                if len(words) == 0:
                    continue

                word_total += len(words)
                seen.update(words)

                # calculate n, n-1, ... 1-grams
                for order in orders:
                    per_line[order].append(nltk.ngrams(words, order, **padding))
                    flat[order].extend(nltk.ngrams(words, order, **padding))

        fitted, freqs = {}, {}
        for order in orders:
            freqs[order] = nltk.FreqDist(flat[order])
            fitted[order] = nltk.lm.Lidstone(order=order, gamma=self.K)
            fitted[order].fit(per_line[order], seen)

        self.counts = freqs
        self._models = fitted
        self.n_words = word_total
        if vocab is not None:
            self.n_tokens = len(vocab)
        else:
            self.n_tokens = len(seen)

    def log_prob(self, words, N):
        """
        Sum the nltk log scores of every `N`-gram in `words`.

        Raises
        ------
        ValueError
            If `words` has fewer than `N` items.
        """
        assert N in self.counts, "You do not have counts for {}-grams".format(N)

        n_given = len(words)
        if N > n_given:
            raise ValueError(
                "Not enough words for a gram-size of {}: {}".format(N, n_given)
            )

        return sum(self._log_ngram_prob(gram) for gram in nltk.ngrams(words, N))

    def _log_ngram_prob(self, ngram):
        """Score the last token of `ngram` given the preceding tokens."""
        *context, target = ngram
        model = self._models[len(ngram)]
        return model.logscore(target, tuple(context))


def test_mle():
    """
    Compare `MLENGram` counts and log probabilities against `MLEGold`.

    A random gram-size in ``[2, 4]`` is drawn, both models are trained on the
    same temporary file holding a 1000-item random paragraph, and every
    `N`-gram counted by `MLENGram` is checked against the gold model.
    N-grams whose first two tokens are the same padding symbol
    (``"<bol>"`` or ``"<eol>"``) are skipped.
    """
    N = np.random.randint(2, 5)
    gold = MLEGold(N, unk=True, filter_stopwords=False, filter_punctuation=False)
    mine = MLENGram(N, unk=True, filter_stopwords=False, filter_punctuation=False)

    with tempfile.NamedTemporaryFile() as temp:
        temp.write(bytes(" ".join(random_paragraph(1000)), encoding="utf-8-sig"))
        # Both models re-open the file by name, so the buffered bytes must be
        # pushed to disk first; otherwise they may read an empty or truncated
        # corpus and the comparison below becomes (partly) vacuous.
        temp.flush()
        gold.train(temp.name, encoding="utf-8-sig")
        mine.train(temp.name, encoding="utf-8-sig")

    for k in mine.counts[N].keys():
        if k[0] == k[1] and k[0] in ("<bol>", "<eol>"):
            continue

        err_str = "{}, mine: {}, gold: {}"
        assert mine.counts[N][k] == gold.counts[N][k], err_str.format(
            k, mine.counts[N][k], gold.counts[N][k]
        )

        M = mine.log_prob(k, N)
        G = gold.log_prob(k, N) / np.log2(np.e)  # convert to log base e
        np.testing.assert_allclose(M, G)
        print("PASSED")


def test_additive():
    """
    Compare `AdditiveNGram` counts and log probabilities against `AdditiveGold`.

    A random smoothing value `K` in ``[0, 1)`` and a random gram-size in
    ``[2, 4]`` are drawn, both models are trained on the same temporary file
    holding a 1000-item random paragraph, and every `N`-gram counted by
    `AdditiveNGram` is checked against the gold model. N-grams whose first two
    tokens are the same padding symbol (``"<bol>"`` or ``"<eol>"``) are
    skipped.
    """
    K = np.random.rand()
    N = np.random.randint(2, 5)
    gold = AdditiveGold(
        N, K, unk=True, filter_stopwords=False, filter_punctuation=False
    )
    mine = AdditiveNGram(
        N, K, unk=True, filter_stopwords=False, filter_punctuation=False
    )

    with tempfile.NamedTemporaryFile() as temp:
        temp.write(bytes(" ".join(random_paragraph(1000)), encoding="utf-8-sig"))
        # Both models re-open the file by name, so the buffered bytes must be
        # pushed to disk first; otherwise they may read an empty or truncated
        # corpus and the comparison below becomes (partly) vacuous.
        temp.flush()
        gold.train(temp.name, encoding="utf-8-sig")
        mine.train(temp.name, encoding="utf-8-sig")

    for k in mine.counts[N].keys():
        if k[0] == k[1] and k[0] in ("<bol>", "<eol>"):
            continue

        err_str = "{}, mine: {}, gold: {}"
        assert mine.counts[N][k] == gold.counts[N][k], err_str.format(
            k, mine.counts[N][k], gold.counts[N][k]
        )

        M = mine.log_prob(k, N)
        G = gold.log_prob(k, N) / np.log2(np.e)  # convert to log base e
        np.testing.assert_allclose(M, G)
        print("PASSED")


================================================
FILE: numpy_ml/tests/test_nn.py
================================================
# flake8: noqa
import time
from copy import deepcopy

import numpy as np
from numpy.testing import assert_almost_equal

from sklearn.metrics import log_loss, mean_squared_error

# for testing sigmoid
from scipy.special import expit

import torch
import torch.nn as nn
import torch.nn.functional as F

from numpy_ml.neural_nets.utils import (
    calc_pad_dims_2D,
    conv2D_naive,
    conv2D,
    pad2D,
    pad1D,
)
from numpy_ml.utils.testing import (
    random_one_hot_matrix,
    random_stochastic_matrix,
    random_tensor,
)

from .nn_torch_models import (
    TFNCELoss,
    WGAN_GP_tf,
    torch_xe_grad,
    torch_mse_grad,
    TorchVAELoss,
    TorchFCLayer,
    TorchRNNCell,
    TorchLSTMCell,
    TorchAddLayer,
    TorchWGANGPLoss,
    TorchConv1DLayer,
    TorchConv2DLayer,
    TorchPool2DLayer,
    TorchWavenetModule,
    TorchMultiplyLayer,
    TorchDeconv2DLayer,
    TorchLayerNormLayer,
    TorchBatchNormLayer,
    TorchEmbeddingLayer,
    TorchLinearActivation,
    TorchSDPAttentionLayer,
    TorchBidirectionalLSTM,
    torch_gradient_generator,
    TorchSkipConnectionConv,
    TorchSkipConnectionIdentity,
    TorchMultiHeadedAttentionModule,
)

#######################################################################
#                           Debug Formatter                           #
#######################################################################


def err_fmt(params, golds, ix, warn_str=""):
    """
    Build a debug message contrasting our value with the reference value.

    Parameters
    ----------
    params : sequence of (value, label) pairs
        The values under test, each paired with its label.
    golds : mapping
        Reference values, indexed by label.
    ix : int
        Position in `params` of the entry being reported.
    warn_str : str
        Extra text appended after the current entry. Default is "".

    Returns
    -------
    err_msg : str
        A banner-delimited message showing the previous entry and the
        current entry, each alongside its reference value.
    """
    cur_value, cur_label = params[ix]
    # for ix == 0 there is no predecessor, so the "prev" section repeats
    # the current entry
    last_value, last_label = params[max(ix - 1, 0)]

    header = "-" * 25 + " DEBUG " + "-" * 25 + "\n"
    prev_section = "Mine (prev) [{}]:\n{}\n\nTheirs (prev) [{}]:\n{}".format(
        last_label, last_value, last_label, golds[last_label]
    )
    cur_section = "\n\nMine [{}]:\n{}\n\nTheirs [{}]:\n{}".format(
        cur_label, cur_value, cur_label, golds[cur_label]
    )
    footer = "\n" + "-" * 23 + " END DEBUG " + "-" * 23

    return "".join([header, prev_section, cur_section, warn_str, footer])


#######################################################################
#                         Loss Functions                              #
#######################################################################


def test_squared_error(N=15):
    """
    Compare `SquaredError.loss` with a rescaled sklearn mean squared error.

    Parameters
    ----------
    N : int or None
        Upper bound on the trial counter, which starts at 1: one
        identical-input check is followed by ``N - 1`` random trials.
        ``None`` removes the bound, so the loop never terminates.
    """
    from numpy_ml.neural_nets.losses import SquaredError

    np.random.seed(12345)
    n_trials = np.inf if N is None else N

    loss_fn = SquaredError()

    def reference(targets, preds):
        # undo sklearn's averaging over both axes, then halve
        mse = mean_squared_error(targets, preds)
        return mse * preds.shape[0] * preds.shape[1] * 0.5

    # identical targets and predictions: both sides are fed the same array
    n_cols = np.random.randint(2, 100)
    n_rows = np.random.randint(1, 1000)
    same = random_tensor((n_rows, n_cols))
    assert_almost_equal(loss_fn.loss(same, same), reference(same, same))
    print("PASSED")

    trial = 1
    while trial < n_trials:
        n_cols = np.random.randint(2, 100)
        n_rows = np.random.randint(1, 1000)
        targets = random_tensor((n_rows, n_cols))
        preds = random_tensor((n_rows, n_cols))

        ours = loss_fn.loss(targets, preds)
        theirs = reference(targets, preds)
        assert_almost_equal(ours, theirs, decimal=5)
        print("PASSED")
        trial += 1


def test_cross_entropy(N=15):
    """
    Compare `CrossEntropy.loss` with sklearn's `log_loss`.

    Parameters
    ----------
    N : int or None
        Upper bound on the trial counter, which starts at 1: one
        identical-input check is followed by ``N - 1`` random trials.
        ``None`` removes the bound, so the loop never terminates.
    """
    from numpy_ml.neural_nets.losses import CrossEntropy

    np.random.seed(12345)
    n_trials = np.inf if N is None else N

    loss_fn = CrossEntropy()

    # identical one-hot targets and predictions: both sides get the same array
    n_cls = np.random.randint(2, 100)
    n_rows = np.random.randint(1, 1000)
    one_hot = random_one_hot_matrix(n_rows, n_cls)
    assert_almost_equal(loss_fn.loss(one_hot, one_hot), log_loss(one_hot, one_hot))
    print("PASSED")

    # random one-hot targets against random row-stochastic predictions
    trial = 1
    while trial < n_trials:
        n_cls = np.random.randint(2, 100)
        n_rows = np.random.randint(1, 1000)
        targets = random_one_hot_matrix(n_rows, n_cls)
        probs = random_stochastic_matrix(n_rows, n_cls)

        ours = loss_fn.loss(targets, probs)
        # normalize=False: compare against the summed loss, not the mean
        theirs = log_loss(targets, probs, normalize=False)
        assert_almost_equal(ours, theirs)
        print("PASSED")
        trial += 1


def test_VAE_loss(N=15):
    """
    Check `VAELoss` and its gradients against `TorchVAELoss`.

    Parameters
    ----------
    N : int or None
        Upper bound on the trial counter, which starts at 1, so ``N - 1``
        random trials are run. ``None`` removes the bound, so the loop never
        terminates.
    """
    from numpy_ml.neural_nets.losses import VAELoss

    np.random.seed(12345)
    n_trials = np.inf if N is None else N
    tiny = np.finfo(float).eps

    trial = 1
    while trial < n_trials:
        n_ex = np.random.randint(1, 10)
        latent_dim = np.random.randint(2, 10)
        t_mean = random_tensor([n_ex, latent_dim], standardize=True)
        # shift by machine epsilon before taking the absolute value and the log
        raw_var = random_tensor([n_ex, latent_dim], standardize=True)
        t_log_var = np.log(np.abs(raw_var + tiny))

        im_cols, im_rows = np.random.randint(2, 40), np.random.randint(2, 40)
        n_pixels = im_rows * im_cols
        X = np.random.rand(n_ex, n_pixels)
        X_recon = np.random.rand(n_ex, n_pixels)

        loss_fn = VAELoss()
        loss_val = loss_fn(X, X_recon, t_mean, t_log_var)
        dX_recon, dLogVar, dMean = loss_fn.grad(X, X_recon, t_mean, t_log_var)
        golds = TorchVAELoss().extract_grads(X, X_recon, t_mean, t_log_var)

        params = [
            (loss_val, "loss"),
            (dX_recon, "dX_recon"),
            (dLogVar, "dt_log_var"),
            (dMean, "dt_mean"),
        ]

        print("\nTrial {}".format(trial))
        for ix, (value, name) in enumerate(params):
            np.testing.assert_allclose(
                value,
                golds[name],
                err_msg=err_fmt(params, golds, ix),
                rtol=0.1,
                atol=1e-2,
            )
            print("\tPASSED {}".format(name))
        trial += 1


def test_WGAN_GP_loss(N=5):
    """
    Check `WGAN_GPLoss` (critic and generator modes) against `TorchWGANGPLoss`.

    Parameters
    ----------
    N : int or None
        Upper bound on the trial counter, which starts at 1, so ``N - 1``
        trials are checked. Draws whose reference gradient wrt. `gradInterp`
        contains NaNs are discarded without advancing the counter. ``None``
        removes the bound, so the loop never terminates.
    """
    from numpy_ml.neural_nets.losses import WGAN_GPLoss

    np.random.seed(12345)
    n_trials = np.inf if N is None else N

    trial = 1
    while trial < n_trials:
        lambda_ = np.random.randint(0, 10)
        n_ex = np.random.randint(1, 10)
        n_feats = np.random.randint(2, 10)
        Y_real = random_tensor([n_ex], standardize=True)
        Y_fake = random_tensor([n_ex], standardize=True)
        gradInterp = random_tensor([n_ex, n_feats], standardize=True)

        loss_fn = WGAN_GPLoss(lambda_=lambda_)
        critic_loss = loss_fn(Y_fake, "C", Y_real, gradInterp)
        gen_loss = loss_fn(Y_fake, "G")

        critic_grads = loss_fn.grad(Y_fake, "C", Y_real, gradInterp)
        C_dY_fake, dY_real, dGradInterp = critic_grads
        G_dY_fake = loss_fn.grad(Y_fake, "G")

        golds = TorchWGANGPLoss(lambda_).extract_grads(Y_real, Y_fake, gradInterp)
        if np.isnan(golds["C_dGradInterp"]).any():
            # unusable reference: redraw without counting this attempt
            continue

        # the critic gradients wrt. Y_real / Y_fake are compared after negation
        params = [
            (Y_real, "Y_real"),
            (Y_fake, "Y_fake"),
            (gradInterp, "gradInterp"),
            (critic_loss, "C_loss"),
            (gen_loss, "G_loss"),
            (-dY_real, "C_dY_real"),
            (-C_dY_fake, "C_dY_fake"),
            (dGradInterp, "C_dGradInterp"),
            (G_dY_fake, "G_dY_fake"),
        ]

        print("\nTrial {}".format(trial))
        for ix, (value, name) in enumerate(params):
            np.testing.assert_allclose(
                value,
                golds[name],
                err_msg=err_fmt(params, golds, ix),
                rtol=0.1,
                atol=1e-2,
            )
            print("\tPASSED {}".format(name))
        trial += 1


def test_NCELoss(N=1):
    """
    Compare `NCELoss` (loss and gradients wrt. `W`, `b`, and the inputs)
    against the `TFNCELoss` reference.

    The numpy-ml loss and gradients are computed once on the whole batch. The
    reference values are computed one example at a time and accumulated, then
    the two are compared.

    Parameters
    ----------
    N : int or None
        Number of trials to run (the counter runs from 1 to `N` inclusive).
        ``None`` removes the bound, so the loop never terminates.
    """
    from numpy_ml.neural_nets.losses import NCELoss
    from numpy_ml.utils.data_structures import DiscreteSampler

    np.random.seed(12345)

    N = np.inf if N is None else N

    i = 1
    while i < N + 1:
        n_ex = np.random.randint(1, 10)
        n_c = np.random.randint(1, 10)
        n_out = np.random.randint(1, 300)
        vocab_size = np.random.randint(200, 1000)
        num_negative_samples = np.random.randint(1, 10)

        embeddings = random_tensor((n_ex, n_c, n_out), standardize=True)
        target = np.random.randint(0, vocab_size, (n_ex, 1))

        # random noise distribution over the vocabulary, normalized to sum to 1
        probs = np.random.rand(vocab_size)
        probs /= probs.sum()

        D = DiscreteSampler(probs, log=False, with_replacement=False)
        NCE = NCELoss(vocab_size, D, num_negative_samples)
        # full-batch forward pass; the second return value is not used here
        my_loss, _ = NCE(embeddings, target.flatten())

        # full-batch gradients; these are the values asserted on below
        my_dLdX = NCE.grad(update_params=False)
        my_dLdW = NCE.gradients["W"]
        my_dLdb = NCE.gradients["b"]

        # rebind the gradient entries to fresh zero arrays; the arrays saved
        # in my_dLdW / my_dLdb above are separate objects
        NCE.gradients["W"] = np.zeros_like(NCE.parameters["W"])
        NCE.gradients["b"] = np.zeros_like(NCE.parameters["b"])

        # accumulators for the per-example pass
        # NOTE(review): MY_final_loss and MY_dLdX are accumulated below but are
        # not read by any assertion in this function
        MY_final_loss, TF_final_loss = 0, 0
        MY_dLdX, TF_dLdX = np.zeros_like(embeddings), np.zeros_like(embeddings)
        TF_dLdW, TF_dLdb = (
            np.zeros_like(NCE.parameters["W"]),
            np.zeros_like(NCE.parameters["b"]),
        )

        # XXX: instead of calculating the tf NCE on the entire batch, we
        # calculate it per-example and then sum. this is really lame and should
        # be changed to operate on batches.
        # noise samples recorded by the full-batch forward pass
        nv = NCE.derived_variables["noise_samples"][0]
        for ix, emb in enumerate(embeddings):
            # per-example view of the recorded noise samples: the middle
            # element is sliced down to column `ix`
            # (presumably a per-example quantity -- TODO confirm against NCELoss)
            sv = (nv[0], np.array([nv[1][0, ix]]), nv[2])

            # clear cached inputs, derived variables, and gradients so each
            # example is processed from a clean state
            NCE.X = []
            for k, v in NCE.derived_variables.items():
                NCE.derived_variables[k] = []

            for k, v in NCE.gradients.items():
                NCE.gradients[k] = np.zeros_like(v)

            # single-example forward pass using the first element of `sv`
            # as the negative samples
            my, _ = NCE(emb[None, :, :], target[ix], neg_samples=sv[0])

            # grad() is called with the samples wrapped in a list; afterwards
            # the entry is set to the bare tuple before TFNCELoss is called
            NCE.derived_variables["noise_samples"] = [sv]
            dldx = NCE.grad(update_params=False)
            NCE.derived_variables["noise_samples"] = sv

            MY_final_loss += my
            MY_dLdX[ix, ...] += np.squeeze(dldx, axis=0)

            TF_dict = TFNCELoss(emb, np.array([target[ix]]), NCE)

            TF_loss = TF_dict["final_loss"]
            TF_final_loss += TF_loss
            TF_dLdX[ix, ...] += TF_dict["dLdX"]
            # the reference W/b gradients expose `.indices` and `.values`;
            # add them into the dense accumulators at those indices
            TF_dLdW[TF_dict["dLdW"].indices, :] += TF_dict["dLdW"].values
            TF_dLdb[:, TF_dict["dLdb"].indices] += TF_dict["dLdb"].values

            # NOTE(review): tf_dw and tf_db are built here but never read
            tf_dw = np.zeros_like(NCE.gradients["W"])
            tf_dw[TF_dict["dLdW"].indices, :] += TF_dict["dLdW"].values

            tf_db = np.zeros_like(NCE.gradients["b"])
            tf_db[:, TF_dict["dLdb"].indices] += TF_dict["dLdb"].values

        print("\nTrial {}".format(i))
        # full-batch loss vs. the summed per-example reference loss
        np.testing.assert_almost_equal(my_loss, TF_final_loss, decimal=3)
        print("PASSED: final loss")

        # (label, full-batch numpy-ml gradient, accumulated reference gradient)
        maps = [
            ("dLdW", my_dLdW, TF_dLdW),
            ("dLdb", my_dLdb, TF_dLdb),
            ("dLdX", my_dLdX, TF_dLdX),
        ]
        for (ll, k1, k2) in maps:
            np.testing.assert_almost_equal(k1, k2, decimal=2, err_msg=ll)
            print("PASSED: {}".format(ll))

        i += 1


#######################################################################
#                       Loss Function Gradients                       #
#######################################################################


def test_squared_error_grad(N=15):
    """
    Compare `SquaredError.grad` (through a tanh activation) with half of the
    gradient returned by `torch_mse_grad`.

    Parameters
    ----------
    N : int or None
        Upper bound on the trial counter, which starts at 1, so ``N - 1``
        random trials are run. ``None`` removes the bound, so the loop never
        terminates.
    """
    from numpy_ml.neural_nets.losses import SquaredError
    from numpy_ml.neural_nets.activations import Tanh

    np.random.seed(12345)
    n_trials = np.inf if N is None else N

    loss_fn = SquaredError()
    tanh = Tanh()

    trial = 1
    while trial < n_trials:
        n_cols = np.random.randint(2, 100)
        n_rows = np.random.randint(1, 1000)
        targets = random_tensor((n_rows, n_cols))

        # pre-activation inputs and the tanh outputs derived from them
        pre_act = random_tensor((n_rows, n_cols))
        outputs = tanh.fn(pre_act)

        ours = loss_fn.grad(targets, outputs, pre_act, tanh)
        theirs = 0.5 * torch_mse_grad(targets, pre_act, torch.tanh)
        assert_almost_equal(ours, theirs, decimal=4)
        print("PASSED")
        trial += 1


def test_cross_entropy_grad(N=15):
    """
    Compare `CrossEntropy.grad` with the gradient from `torch_xe_grad`.

    Parameters
    ----------
    N : int or None
        Upper bound on the trial counter, which starts at 1, so ``N - 1``
        random trials are run. ``None`` removes the bound, so the loop never
        terminates.
    """
    from numpy_ml.neural_nets.losses import CrossEntropy
    from numpy_ml.neural_nets.layers import Softmax

    np.random.seed(12345)
    n_trials = np.inf if N is None else N

    loss_fn = CrossEntropy()
    softmax = Softmax()

    trial = 1
    while trial < n_trials:
        n_cls = np.random.randint(2, 100)
        n_rows = np.random.randint(1, 1000)

        targets = random_one_hot_matrix(n_rows, n_cls)

        # the gradient is taken wrt. the logits, not wrt. softmax(logits)
        logits = random_tensor((n_rows, n_cls))
        probs = softmax.forward(logits)

        ours = loss_fn.grad(targets, probs)
        theirs = torch_xe_grad(targets, logits)
        assert_almost_equal(ours, theirs, decimal=5)
        print("PASSED")
        trial += 1


#######################################################################
#                          Activations                                #
#######################################################################


def test_sigmoid_activation(N=15):
    """
    Compare `Sigmoid.fn` with `scipy.special.expit` on random row vectors.

    Parameters
    ----------
    N : int or None
        Number of random trials. ``None`` removes the bound, so the loop
        never terminates.
    """
    from numpy_ml.neural_nets.activations import Sigmoid

    np.random.seed(12345)
    n_trials = np.inf if N is None else N

    sigmoid = Sigmoid()

    done = 0
    while done < n_trials:
        width = np.random.randint(1, 100)
        row = random_tensor((1, width))
        assert_almost_equal(sigmoid.fn(row), expit(row))
        print("PASSED")
        done += 1


def test_elu_activation(N=15):
    """
    Compare `ELU.fn` with `torch.nn.functional.elu` on random row vectors.

    Each trial draws a fresh `alpha` uniformly from ``[0, 10)``.

    Parameters
    ----------
    N : int or None
        Number of random trials. ``None`` removes the bound, so the loop
        never terminates.
    """
    from numpy_ml.neural_nets.activations import ELU

    np.random.seed(12345)

    N = np.inf if N is None else N

    i = 0
    while i < N:
        n_dims = np.random.randint(1, 10)
        z = random_tensor((1, n_dims))

        alpha = np.random.uniform(0, 10)

        mine = ELU(alpha)
        # use the lambda's own `a` argument rather than silently closing over
        # the enclosing `alpha`
        gold = lambda z, a: F.elu(torch.from_numpy(z), a).numpy()

        assert_almost_equal(mine.fn(z), gold(z, alpha))
        print("PASSED")
        i += 1


def test_softmax_activation(N=15):
    """
    Compare `Softmax.forward` with `torch.nn.functional.softmax` (``dim=1``).

    Inputs are single-row matrices drawn with `random_stochastic_matrix`.

    Parameters
    ----------
    N : int or None
        Number of random trials. ``None`` removes the bound, so the loop
        never terminates.
    """
    from numpy_ml.neural_nets.layers import Softmax

    np.random.seed(12345)
    n_trials = np.inf if N is None else N

    layer = Softmax()

    def reference(arr):
        return F.softmax(torch.FloatTensor(arr), dim=1).numpy()

    done = 0
    while done < n_trials:
        width = np.random.randint(1, 100)
        row = random_stochastic_matrix(1, width)
        assert_almost_equal(layer.forward(row), reference(row))
        print("PASSED")
        done += 1


def test_relu_activation(N=15):
    """
    Compare `ReLU.fn` with `torch.nn.functional.relu`.

    Inputs are single-row matrices drawn with `random_stochastic_matrix`.

    Parameters
    ----------
    N : int or None
        Number of random trials. ``None`` removes the bound, so the loop
        never terminates.
    """
    from numpy_ml.neural_nets.activations import ReLU

    np.random.seed(12345)
    n_trials = np.inf if N is None else N

    relu = ReLU()

    def reference(arr):
        return F.relu(torch.FloatTensor(arr)).numpy()

    done = 0
    while done < n_trials:
        width = np.random.randint(1, 100)
        row = random_stochastic_matrix(1, width)
        assert_almost_equal(relu.fn(row), reference(row))
        print("PASSED")
        done += 1


def test_softplus_activation(N=15):
    """
    Compare `SoftPlus.fn` with `torch.nn.functional.softplus`.

    Inputs are single-row matrices drawn with `random_stochastic_matrix`.

    Parameters
    ----------
    N : int or None
        Number of random trials. ``None`` removes the bound, so the loop
        never terminates.
    """
    from numpy_ml.neural_nets.activations import SoftPlus

    np.random.seed(12345)
    n_trials = np.inf if N is None else N

    softplus = SoftPlus()

    def reference(arr):
        return F.softplus(torch.FloatTensor(arr)).numpy()

    done = 0
    while done < n_trials:
        width = np.random.randint(1, 100)
        row = random_stochastic_matrix(1, width)
        assert_almost_equal(softplus.fn(row), reference(row))
        print("PASSED")
        done += 1


#######################################################################
#                      Activation Gradients                           #
#######################################################################


def test_sigmoid_grad(N=15):
    """
    Compare `Sigmoid.grad` with the gradient function built by
    `torch_gradient_generator` from `torch.sigmoid`.

    Parameters
    ----------
    N : int or None
        Number of random trials. ``None`` removes the bound, so the loop
        never terminates.
    """
    from numpy_ml.neural_nets.activations import Sigmoid

    np.random.seed(12345)
    n_trials = np.inf if N is None else N

    sigmoid = Sigmoid()
    reference_grad = torch_gradient_generator(torch.sigmoid)

    done = 0
    while done < n_trials:
        n_rows = np.random.randint(1, 100)
        n_cols = np.random.randint(1, 100)
        inputs = random_tensor((n_rows, n_cols))
        assert_almost_equal(sigmoid.grad(inputs), reference_grad(inputs))
        print("PASSED")
        done += 1


def test_elu_grad(N=15):
    """
    Compare `ELU.grad` with the gradient function built by
    `torch_gradient_generator` from `torch.nn.functional.elu`.

    Each trial draws a fresh `alpha` uniformly from ``[0, 10)``.

    Parameters
    ----------
    N : int or None
        Number of random trials. ``None`` removes the bound, so the loop
        never terminates.
    """
    from numpy_ml.neural_nets.activations import ELU

    np.random.seed(12345)
    n_trials = np.inf if N is None else N

    done = 0
    while done < n_trials:
        n_rows = np.random.randint(1, 10)
        n_cols = np.random.randint(1, 10)
        alpha = np.random.uniform(0, 10)
        inputs = random_tensor((n_rows, n_cols))

        elu = ELU(alpha)
        reference_grad = torch_gradient_generator(F.elu, alpha=alpha)
        assert_almost_equal(elu.grad(inputs), reference_grad(inputs), decimal=5)
        print("PASSED")
        done += 1


def test_tanh_grad(N=15):
    """
    Compare `Tanh.grad` with the gradient function built by
    `torch_gradient_generator` from `torch.tanh`.

    Parameters
    ----------
    N : int or None
        Number of random trials. ``None`` removes the bound, so the loop
        never terminates.
    """
    from numpy_ml.neural_nets.activations import Tanh

    np.random.seed(12345)
    n_trials = np.inf if N is None else N

    tanh = Tanh()
    reference_grad = torch_gradient_generator(torch.tanh)

    done = 0
    while done < n_trials:
        n_rows = np.random.randint(1, 100)
        n_cols = np.random.randint(1, 100)
        inputs = random_tensor((n_rows, n_cols))
        assert_almost_equal(tanh.grad(inputs), reference_grad(inputs))
        print("PASSED")
        done += 1


def test_relu_grad(N=15):
    """
    Compare `ReLU.grad` with the gradient function built by
    `torch_gradient_generator` from `torch.nn.functional.relu`.

    Parameters
    ----------
    N : int or None
        Number of random trials. ``None`` removes the bound, so the loop
        never terminates.
    """
    from numpy_ml.neural_nets.activations import ReLU

    np.random.seed(12345)
    n_trials = np.inf if N is None else N

    relu = ReLU()
    reference_grad = torch_gradient_generator(F.relu)

    done = 0
    while done < n_trials:
        n_rows = np.random.randint(1, 100)
        n_cols = np.random.randint(1, 100)
        inputs = random_tensor((n_rows, n_cols))
        assert_almost_equal(relu.grad(inputs), reference_grad(inputs))
        print("PASSED")
        done += 1


def test_softmax_grad(N=15):
    """
    Compare `Softmax.backward` (with an all-ones upstream gradient) against
    the gradient function built by `torch_gradient_generator` from
    `torch.nn.functional.softmax` with ``dim=1``.

    Parameters
    ----------
    N : int or None
        Number of random trials. ``None`` removes the bound, so the loop
        never terminates.
    """
    from numpy_ml.neural_nets.layers import Softmax
    from functools import partial

    np.random.seed(12345)

    N = np.inf if N is None else N
    p_soft = partial(F.softmax, dim=1)
    gold = torch_gradient_generator(p_soft)

    i = 0
    while i < N:
        # a fresh layer per trial, so nothing carries over between trials
        mine = Softmax()
        n_ex = np.random.randint(1, 3)
        n_dims = np.random.randint(1, 50)
        z = random_tensor((n_ex, n_dims), standardize=True)
        out = mine.forward(z)

        # compute each gradient once and reuse it in the failure message, so
        # the message shows exactly the arrays that were compared
        theirs = gold(z)
        ours = mine.backward(np.ones_like(out))

        assert_almost_equal(
            theirs,
            ours,
            err_msg="Theirs:\n{}\n\nMine:\n{}\n".format(theirs, ours),
            decimal=3,
        )
        print("PASSED")
        i += 1


def test_softplus_grad(N=15):
    """
    Compare `SoftPlus.grad` with the gradient function built by
    `torch_gradient_generator` from `torch.nn.functional.softplus`.

    Parameters
    ----------
    N : int or None
        Number of random trials. ``None`` removes the bound, so the loop
        never terminates.
    """
    from numpy_ml.neural_nets.activations import SoftPlus

    np.random.seed(12345)
    n_trials = np.inf if N is None else N

    softplus = SoftPlus()
    reference_grad = torch_gradient_generator(F.softplus)

    done = 0
    while done < n_trials:
        n_rows = np.random.randint(1, 100)
        n_cols = np.random.randint(1, 100)
        inputs = random_tensor((n_rows, n_cols), standardize=True)
        assert_almost_equal(softplus.grad(inputs), reference_grad(inputs))
        print("PASSED")
        done += 1


#######################################################################
#                          Layers                                     #
#######################################################################


def test_FullyConnected(N=15):
    """
    Check `FullyConnected` forward/backward values against `TorchFCLayer`.

    Each trial picks a random activation, runs a forward pass and a backward
    pass with an all-ones upstream gradient, and compares inputs, outputs,
    parameters and gradients with the reference values.

    Parameters
    ----------
    N : int or None
        Number of random trials. ``None`` removes the bound, so the loop
        never terminates.
    """
    from numpy_ml.neural_nets.layers import FullyConnected
    from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine

    np.random.seed(12345)
    n_trials = np.inf if N is None else N

    # (numpy-ml activation, torch counterpart, display name)
    activation_table = [
        (Tanh(), nn.Tanh(), "Tanh"),
        (Sigmoid(), nn.Sigmoid(), "Sigmoid"),
        (ReLU(), nn.ReLU(), "ReLU"),
        (Affine(), TorchLinearActivation(), "Affine"),
    ]

    trial = 1
    while trial < n_trials + 1:
        n_ex = np.random.randint(1, 100)
        n_in = np.random.randint(1, 100)
        n_out = np.random.randint(1, 100)
        X = random_tensor((n_ex, n_in), standardize=True)

        # draw one activation at random for this trial
        pick = np.random.randint(0, len(activation_table))
        act_fn, torch_fn, act_fn_name = activation_table[pick]

        layer = FullyConnected(n_out=n_out, act_fn=act_fn)

        # forward pass, then backward pass with an all-ones upstream gradient
        y_pred = layer.forward(X)
        dLdy = np.ones_like(y_pred)
        dLdX = layer.backward(dLdy)

        # reference values from a torch layer sharing this layer's parameters
        reference = TorchFCLayer(n_in, n_out, torch_fn, layer.parameters)
        golds = reference.extract_grads(X)

        # the weight matrix and its gradient are compared after transposing
        params = [
            (layer.X[0], "X"),
            (y_pred, "y"),
            (layer.parameters["W"].T, "W"),
            (layer.parameters["b"], "b"),
            (dLdy, "dLdy"),
            (layer.gradients["W"].T, "dLdW"),
            (layer.gradients["b"], "dLdB"),
            (dLdX, "dLdX"),
        ]

        print("\nTrial {}\nact_fn={}".format(trial, act_fn_name))
        for ix, (value, name) in enumerate(params):
            assert_almost_equal(
                value, golds[name], err_msg=err_fmt(params, golds, ix), decimal=3
            )
            print("\tPASSED {}".format(name))
        trial += 1


def test_Embedding(N=15):
    """
    Check `Embedding` forward/backward values against `TorchEmbeddingLayer`.

    The gradient wrt. the integer inputs is not compared; only the inputs,
    outputs, weights and weight gradients are.

    Parameters
    ----------
    N : int or None
        Number of random trials. ``None`` removes the bound, so the loop
        never terminates.
    """
    from numpy_ml.neural_nets.layers import Embedding

    np.random.seed(12345)
    n_trials = np.inf if N is None else N

    trial = 1
    while trial < n_trials + 1:
        vocab_size = np.random.randint(1, 2000)
        n_ex = np.random.randint(1, 100)
        n_in = np.random.randint(1, 100)
        emb_dim = np.random.randint(1, 100)

        # random integer ids in [0, vocab_size)
        X = np.random.randint(0, vocab_size, (n_ex, n_in))

        layer = Embedding(n_out=emb_dim, vocab_size=vocab_size)

        # forward pass, then backward pass with an all-ones upstream gradient;
        # the return value of backward is deliberately discarded
        y_pred = layer.forward(X)
        dLdy = np.ones_like(y_pred)
        layer.backward(dLdy)

        # reference values from a torch layer sharing this layer's parameters
        reference = TorchEmbeddingLayer(vocab_size, emb_dim, layer.parameters)
        golds = reference.extract_grads(X)

        params = [
            (layer.X[0], "X"),
            (y_pred, "y"),
            (layer.parameters["W"], "W"),
            (dLdy, "dLdy"),
            (layer.gradients["W"], "dLdW"),
        ]

        print("\nTrial {}".format(trial))
        for ix, (value, name) in enumerate(params):
            assert_almost_equal(
                value, golds[name], err_msg=err_fmt(params, golds, ix), decimal=3
            )
            print("\tPASSED {}".format(name))
        trial += 1


def test_BatchNorm1D(N=15):
    """
    Check `BatchNorm1D` forward/backward values against `TorchBatchNormLayer`
    in "1D" mode.

    The running variance is not compared, and values are only checked to one
    decimal place.

    Parameters
    ----------
    N : int or None
        Number of random trials. ``None`` removes the bound, so the loop
        never terminates.
    """
    from numpy_ml.neural_nets.layers import BatchNorm1D

    N = np.inf if N is None else N

    # seed once; the original seeded twice in a row with the same value
    np.random.seed(12345)

    i = 1
    while i < N + 1:
        n_ex = np.random.randint(2, 1000)
        n_in = np.random.randint(1, 1000)
        X = random_tensor((n_ex, n_in), standardize=True)

        # initialize BatchNorm1D layer
        L1 = BatchNorm1D()

        # forward prop
        y_pred = L1.forward(X)

        # backprop with an all-ones upstream gradient
        dLdy = np.ones_like(y_pred)
        dLdX = L1.backward(dLdy)

        # get gold standard gradients from a torch layer that shares this
        # layer's parameters, epsilon and momentum
        gold_mod = TorchBatchNormLayer(
            n_in, L1.parameters, "1D", epsilon=L1.epsilon, momentum=L1.momentum
        )
        golds = gold_mod.extract_grads(X)

        # "running_var" is intentionally left out of the comparison
        params = [
            (L1.X[0], "X"),
            (y_pred, "y"),
            (L1.parameters["scaler"].T, "scaler"),
            (L1.parameters["intercept"], "intercept"),
            (L1.parameters["running_mean"], "running_mean"),
            (L1.gradients["scaler"], "dLdScaler"),
            (L1.gradients["intercept"], "dLdIntercept"),
            (dLdX, "dLdX"),
        ]

        print("Trial {}".format(i))
        for ix, (mine, label) in enumerate(params):
            assert_almost_equal(
                mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=1
            )
            print("\tPASSED {}".format(label))
        i += 1


def test_LayerNorm1D(N=15):
    """
    Check `LayerNorm1D` forward/backward values against `TorchLayerNormLayer`
    in "1D" mode.

    Parameters
    ----------
    N : int or None
        Number of random trials. ``None`` removes the bound, so the loop
        never terminates.
    """
    from numpy_ml.neural_nets.layers import LayerNorm1D

    n_trials = np.inf if N is None else N
    np.random.seed(12345)

    trial = 1
    while trial < n_trials + 1:
        n_ex = np.random.randint(2, 1000)
        n_in = np.random.randint(1, 1000)
        X = random_tensor((n_ex, n_in), standardize=True)

        # initialize LayerNorm1D layer
        layer = LayerNorm1D()

        # forward pass, then backward pass with an all-ones upstream gradient
        y_pred = layer.forward(X)
        dLdy = np.ones_like(y_pred)
        dLdX = layer.backward(dLdy)

        # reference values from a torch layer sharing this layer's parameters
        reference = TorchLayerNormLayer(
            n_in, layer.parameters, "1D", epsilon=layer.epsilon
        )
        golds = reference.extract_grads(X)

        params = [
            (layer.X[0], "X"),
            (y_pred, "y"),
            (layer.parameters["scaler"].T, "scaler"),
            (layer.parameters["intercept"], "intercept"),
            (layer.gradients["scaler"], "dLdScaler"),
            (layer.gradients["intercept"], "dLdIntercept"),
            (dLdX, "dLdX"),
        ]

        print("Trial {}".format(trial))
        for ix, (value, name) in enumerate(params):
            assert_almost_equal(
                value, golds[name], err_msg=err_fmt(params, golds, ix), decimal=3
            )
            print("\tPASSED {}".format(name))
        trial += 1


def test_LayerNorm2D(N=15):
    """
    Check `LayerNorm2D` forward/backward values against `TorchLayerNormLayer`
    in "2D" mode.

    Parameters
    ----------
    N : int or None
        Number of random trials. ``None`` removes the bound, so the loop
        never terminates.
    """
    from numpy_ml.neural_nets.layers import LayerNorm2D

    n_trials = np.inf if N is None else N
    np.random.seed(12345)

    trial = 1
    while trial < n_trials + 1:
        n_ex = np.random.randint(2, 10)
        in_rows = np.random.randint(1, 10)
        in_cols = np.random.randint(1, 10)
        n_in = np.random.randint(1, 3)

        # random input laid out as (n_ex, in_rows, in_cols, n_in)
        X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True)
        layer = LayerNorm2D()

        # forward pass, then backward pass with an all-ones upstream gradient
        y_pred = layer.forward(X)
        dLdy = np.ones_like(X)
        dLdX = layer.backward(dLdy)

        # reference values from a torch layer sharing this layer's parameters
        feature_dims = [n_in, in_rows, in_cols]
        reference = TorchLayerNormLayer(
            feature_dims, layer.parameters, mode="2D", epsilon=layer.epsilon
        )
        golds = reference.extract_grads(X, Y_true=None)

        params = [
            (layer.X[0], "X"),
            (layer.hyperparameters["epsilon"], "epsilon"),
            (layer.parameters["scaler"], "scaler"),
            (layer.parameters["intercept"], "intercept"),
            (y_pred, "y"),
            (layer.gradients["scaler"], "dLdScaler"),
            (layer.gradients["intercept"], "dLdIntercept"),
            (dLdX, "dLdX"),
        ]

        print("Trial {}".format(trial))
        for ix, (value, name) in enumerate(params):
            assert_almost_equal(
                value, golds[name], err_msg=err_fmt(params, golds, ix), decimal=3
            )
            print("\tPASSED {}".format(name))

        trial += 1


def test_MultiplyLayer(N=15):
    """
    Check `Multiply` forward/backward values against `TorchMultiplyLayer`.

    Each trial feeds between 2 and 4 random inputs of identical shape through
    the layer with a randomly chosen activation, and compares the inputs, the
    output, and the gradient wrt. each input to one decimal place.

    Parameters
    ----------
    N : int or None
        Number of random trials. ``None`` removes the bound, so the loop
        never terminates.
    """
    from numpy_ml.neural_nets.layers import Multiply
    from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine

    n_trials = np.inf if N is None else N
    np.random.seed(12345)

    # (numpy-ml activation, torch counterpart, display name)
    activation_table = [
        (Tanh(), nn.Tanh(), "Tanh"),
        (Sigmoid(), nn.Sigmoid(), "Sigmoid"),
        (ReLU(), nn.ReLU(), "ReLU"),
        (Affine(), TorchLinearActivation(), "Affine"),
    ]

    trial = 1
    while trial < n_trials + 1:
        n_ex = np.random.randint(1, 100)
        n_in = np.random.randint(1, 100)
        n_entries = np.random.randint(2, 5)
        Xs = [
            random_tensor((n_ex, n_in), standardize=True) for _ in range(n_entries)
        ]

        pick = np.random.randint(0, len(activation_table))
        act_fn, torch_fn, act_fn_name = activation_table[pick]

        # initialize Multiply layer
        layer = Multiply(act_fn)

        # forward pass, then backward pass with an all-ones upstream gradient
        y_pred = layer.forward(Xs)
        dLdy = np.ones_like(y_pred)
        dLdXs = layer.backward(dLdy)

        reference = TorchMultiplyLayer(torch_fn)
        golds = reference.extract_grads(Xs)

        # one "dLdX<k>" entry per input, numbered from 1
        params = [(Xs, "Xs"), (y_pred, "Y")]
        for pos, grad in enumerate(dLdXs):
            params.append((grad, "dLdX{}".format(pos + 1)))

        print("\nTrial {}".format(trial))
        print("n_ex={}, n_in={}".format(n_ex, n_in))
        print("n_entries={}, act_fn={}".format(n_entries, str(act_fn)))
        for ix, (value, name) in enumerate(params):
            assert_almost_equal(
                value, golds[name], err_msg=err_fmt(params, golds, ix), decimal=1
            )
            print("\tPASSED {}".format(name))
        trial += 1


def test_AddLayer(N=15):
    """
    Check `Add` forward/backward values against `TorchAddLayer`.

    Each trial feeds between 2 and 4 random inputs of identical shape through
    the layer with a randomly chosen activation, and compares the inputs, the
    output, and the gradient wrt. each input to one decimal place.

    Parameters
    ----------
    N : int or None
        Number of random trials. ``None`` removes the bound, so the loop
        never terminates.
    """
    from numpy_ml.neural_nets.layers import Add
    from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine

    n_trials = np.inf if N is None else N
    np.random.seed(12345)

    # (numpy-ml activation, torch counterpart, display name)
    activation_table = [
        (Tanh(), nn.Tanh(), "Tanh"),
        (Sigmoid(), nn.Sigmoid(), "Sigmoid"),
        (ReLU(), nn.ReLU(), "ReLU"),
        (Affine(), TorchLinearActivation(), "Affine"),
    ]

    trial = 1
    while trial < n_trials + 1:
        n_ex = np.random.randint(1, 100)
        n_in = np.random.randint(1, 100)
        n_entries = np.random.randint(2, 5)
        Xs = [
            random_tensor((n_ex, n_in), standardize=True) for _ in range(n_entries)
        ]

        pick = np.random.randint(0, len(activation_table))
        act_fn, torch_fn, act_fn_name = activation_table[pick]

        # initialize Add layer
        layer = Add(act_fn)

        # forward pass, then backward pass with an all-ones upstream gradient
        y_pred = layer.forward(Xs)
        dLdy = np.ones_like(y_pred)
        dLdXs = layer.backward(dLdy)

        reference = TorchAddLayer(torch_fn)
        golds = reference.extract_grads(Xs)

        # one "dLdX<k>" entry per input, numbered from 1
        params = [(Xs, "Xs"), (y_pred, "Y")]
        for pos, grad in enumerate(dLdXs):
            params.append((grad, "dLdX{}".format(pos + 1)))

        print("\nTrial {}".format(trial))
        print("n_ex={}, n_in={}".format(n_ex, n_in))
        print("n_entries={}, act_fn={}".format(n_entries, str(act_fn)))
        for ix, (value, name) in enumerate(params):
            assert_almost_equal(
                value, golds[name], err_msg=err_fmt(params, golds, ix), decimal=1
            )
            print("\tPASSED {}".format(name))
        trial += 1


def test_BatchNorm2D(N=15):
    """
    Check `BatchNorm2D` forward/backward values against `TorchBatchNormLayer`
    in "2D" mode.

    The running variance is not compared.

    Parameters
    ----------
    N : int or None
        Number of random trials. ``None`` removes the bound, so the loop
        never terminates.
    """
    from numpy_ml.neural_nets.layers import BatchNorm2D

    n_trials = np.inf if N is None else N
    np.random.seed(12345)

    trial = 1
    while trial < n_trials + 1:
        n_ex = np.random.randint(2, 10)
        in_rows = np.random.randint(1, 10)
        in_cols = np.random.randint(1, 10)
        n_in = np.random.randint(1, 3)

        # random input laid out as (n_ex, in_rows, in_cols, n_in)
        X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True)
        layer = BatchNorm2D()

        # forward pass, then backward pass with an all-ones upstream gradient
        y_pred = layer.forward(X)
        dLdy = np.ones_like(X)
        dLdX = layer.backward(dLdy)

        # reference values from a torch layer sharing this layer's
        # parameters, epsilon and momentum
        reference = TorchBatchNormLayer(
            n_in,
            layer.parameters,
            mode="2D",
            epsilon=layer.epsilon,
            momentum=layer.momentum,
        )
        golds = reference.extract_grads(X, Y_true=None)

        # "running_var" is intentionally left out of the comparison
        params = [
            (layer.X[0], "X"),
            (layer.hyperparameters["momentum"], "momentum"),
            (layer.hyperparameters["epsilon"], "epsilon"),
            (layer.parameters["scaler"].T, "scaler"),
            (layer.parameters["intercept"], "intercept"),
            (layer.parameters["running_mean"], "running_mean"),
            (y_pred, "y"),
            (layer.gradients["scaler"], "dLdScaler"),
            (layer.gradients["intercept"], "dLdIntercept"),
            (dLdX, "dLdX"),
        ]

        print("Trial {}".format(trial))
        for ix, (value, name) in enumerate(params):
            assert_almost_equal(
                value, golds[name], err_msg=err_fmt(params, golds, ix), decimal=3
            )
            print("\tPASSED {}".format(name))

        trial += 1


def test_RNNCell(N=15):
    """
    Compare `RNNCell` with the `TorchRNNCell` reference.

    Each trial unrolls a fresh cell over `n_t` timesteps of a random input,
    backpropagates an all-ones gradient once per timestep, and checks the
    outputs, parameters, and gradients against the reference with
    ``atol = rtol = 1e-3``.

    Parameters
    ----------
    N : int or None
        Number of trials to run. If None, `N` is replaced by infinity and
        the loop only stops when an assertion fails. Default is 15.
    """
    from numpy_ml.neural_nets.layers import RNNCell

    if N is None:
        N = np.inf

    np.random.seed(12345)

    trial = 1
    while trial < N + 1:
        # the order of the random draws determines the seeded test cases
        n_ex = np.random.randint(1, 10)
        n_in = np.random.randint(1, 10)
        n_out = np.random.randint(1, 10)
        n_t = np.random.randint(1, 10)
        X = random_tensor((n_ex, n_in, n_t), standardize=True)

        cell = RNNCell(n_out=n_out)

        # forward pass, one timestep slice at a time
        y_preds = [cell.forward(X[:, :, t]) for t in range(n_t)]

        # backward pass: the same all-ones gradient is fed at every step;
        # results are collected so that the first call ends up last
        dLdAt = np.ones_like(y_preds[-1])
        per_step = [cell.backward(dLdAt) for _ in range(n_t)]
        dLdX = np.dstack(per_step[::-1])

        # reference values from the torch implementation
        reference = TorchRNNCell(n_in, n_out, cell.parameters)
        expected = reference.extract_grads(X)

        weights, grads = cell.parameters, cell.gradients
        checks = [
            (X, "X"),
            (np.array(y_preds), "y"),
            (weights["ba"].T, "ba"),
            (weights["bx"].T, "bx"),
            (weights["Wax"].T, "Wax"),
            (weights["Waa"].T, "Waa"),
            (grads["ba"].T, "dLdBa"),
            (grads["bx"].T, "dLdBx"),
            (grads["Wax"].T, "dLdWax"),
            (grads["Waa"].T, "dLdWaa"),
            (dLdX, "dLdX"),
        ]

        print("Trial {}".format(trial))
        for pos, (observed, name) in enumerate(checks):
            np.testing.assert_allclose(
                observed,
                expected[name],
                rtol=1e-3,
                atol=1e-3,
                err_msg=err_fmt(checks, expected, pos),
            )
            print("\tPASSED {}".format(name))
        trial += 1


def test_Conv2D(N=15):
    """
    Compare `Conv2D` with the `TorchConv2DLayer` reference.

    Each trial draws a random layer configuration (kernel shape, padding,
    stride, dilation, activation) and a random input. Configurations whose
    computed output height or width is not positive are redrawn without
    counting as a trial. Inputs, outputs, parameters, and gradients are
    compared to 4 decimal places.

    Parameters
    ----------
    N : int or None
        Number of trials to run. If None, `N` is replaced by infinity and
        the loop only stops when an assertion fails. Default is 15.
    """
    from numpy_ml.neural_nets.layers import Conv2D
    from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine

    if N is None:
        N = np.inf

    np.random.seed(12345)

    # (numpy-ml activation, torch counterpart, display name)
    acts = [
        (Tanh(), nn.Tanh(), "Tanh"),
        (Sigmoid(), nn.Sigmoid(), "Sigmoid"),
        (ReLU(), nn.ReLU(), "ReLU"),
        (Affine(), TorchLinearActivation(), "Affine"),
    ]

    trial = 1
    while trial < N + 1:
        # the order of the random draws determines the seeded test cases
        n_ex = np.random.randint(1, 10)
        in_rows = np.random.randint(1, 10)
        in_cols = np.random.randint(1, 10)
        n_in = np.random.randint(1, 3)
        n_out = np.random.randint(1, 3)
        k_rows = min(in_rows, np.random.randint(1, 5))
        k_cols = min(in_cols, np.random.randint(1, 5))
        f_shape = (k_rows, k_cols)
        p = np.random.randint(0, 5)
        s = np.random.randint(1, 3)
        d = np.random.randint(0, 5)

        # effective kernel extent once dilation is taken into account
        fr = f_shape[0] * (d + 1) - d
        fc = f_shape[1] * (d + 1) - d
        out_rows = int(1 + (in_rows + 2 * p - fr) / s)
        out_cols = int(1 + (in_cols + 2 * p - fc) / s)

        # redraw when the configuration yields an empty output
        if min(out_rows, out_cols) <= 0:
            continue

        X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True)

        # pick an activation at random
        act_fn, torch_fn, _ = acts[np.random.randint(0, len(acts))]

        layer = Conv2D(
            out_ch=n_out,
            kernel_shape=f_shape,
            act_fn=act_fn,
            pad=p,
            stride=s,
            dilation=d,
        )

        # forward pass, then backprop an all-ones gradient
        y_pred = layer.forward(X)
        dLdX = layer.backward(np.ones_like(y_pred))

        # reference values from the torch implementation
        reference = TorchConv2DLayer(
            n_in, n_out, torch_fn, layer.parameters, layer.hyperparameters
        )
        expected = reference.extract_grads(X)

        checks = [
            (layer.X[0], "X"),
            (y_pred, "y"),
            (layer.parameters["W"], "W"),
            (layer.parameters["b"], "b"),
            (layer.gradients["W"], "dLdW"),
            (layer.gradients["b"], "dLdB"),
            (dLdX, "dLdX"),
        ]

        print("\nTrial {}".format(trial))
        print("pad={}, stride={}, f_shape={}, n_ex={}".format(p, s, f_shape, n_ex))
        print("in_rows={}, in_cols={}, n_in={}".format(in_rows, in_cols, n_in))
        print("out_rows={}, out_cols={}, n_out={}".format(out_rows, out_cols, n_out))
        print("dilation={}".format(d))
        for pos, (observed, name) in enumerate(checks):
            assert_almost_equal(
                observed,
                expected[name],
                decimal=4,
                err_msg=err_fmt(checks, expected, pos),
            )
            print("\tPASSED {}".format(name))
        trial += 1


def test_DPAttention(N=15):
    """
    Compare `DotProductAttention` with the `TorchSDPAttentionLayer` reference.

    Each trial draws random query, key, and value matrices, runs a forward
    pass and a backward pass (all-ones upstream gradient) through a layer
    built with ``scale=True`` and ``dropout_p=0``, and compares the cached
    inputs, the output, and the three input gradients to 4 decimal places.

    Parameters
    ----------
    N : int or None
        Number of trials to run. If None, `N` is replaced by infinity and
        the loop only stops when an assertion fails. Default is 15.
    """
    from numpy_ml.neural_nets.layers import DotProductAttention

    if N is None:
        N = np.inf

    np.random.seed(12345)

    trial = 1
    while trial < N + 1:
        # the order of the random draws determines the seeded test cases
        n_ex = np.random.randint(1, 10)
        d_k = np.random.randint(1, 100)
        d_v = np.random.randint(1, 100)

        Q = random_tensor((n_ex, d_k), standardize=True)
        K = random_tensor((n_ex, d_k), standardize=True)
        V = random_tensor((n_ex, d_v), standardize=True)

        attention = DotProductAttention(scale=True, dropout_p=0)

        # forward pass, then backprop an all-ones gradient
        y_pred = attention.forward(Q, K, V)
        dLdQ, dLdK, dLdV = attention.backward(np.ones_like(y_pred))

        # reference values from the torch implementation
        reference = TorchSDPAttentionLayer()
        expected = reference.extract_grads(Q, K, V)

        cached_Q, cached_K, cached_V = (attention.X[0][j] for j in range(3))
        checks = [
            (cached_Q, "Q"),
            (cached_K, "K"),
            (cached_V, "V"),
            (y_pred, "Y"),
            (dLdV, "dLdV"),
            (dLdK, "dLdK"),
            (dLdQ, "dLdQ"),
        ]

        print("\nTrial {}".format(trial))
        print("n_ex={} d_k={} d_v={}".format(n_ex, d_k, d_v))
        for pos, (observed, name) in enumerate(checks):
            assert_almost_equal(
                observed,
                expected[name],
                decimal=4,
                err_msg=err_fmt(checks, expected, pos),
            )
            print("\tPASSED {}".format(name))
        trial += 1


def test_Conv1D(N=15):
    """
    Compare `Conv1D` with the `TorchConv1DLayer` reference.

    Each trial draws a random layer configuration (kernel width, padding,
    stride, dilation, activation) and a random input. Configurations whose
    computed output length is not positive are redrawn without counting as
    a trial. Inputs, outputs, parameters, and gradients are compared to 4
    decimal places.

    Parameters
    ----------
    N : int or None
        Number of trials to run. If None, `N` is replaced by infinity and
        the loop only stops when an assertion fails. Default is 15.
    """
    from numpy_ml.neural_nets.layers import Conv1D
    from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine

    if N is None:
        N = np.inf

    np.random.seed(12345)

    # (numpy-ml activation, torch counterpart, display name)
    acts = [
        (Tanh(), nn.Tanh(), "Tanh"),
        (Sigmoid(), nn.Sigmoid(), "Sigmoid"),
        (ReLU(), nn.ReLU(), "ReLU"),
        (Affine(), TorchLinearActivation(), "Affine"),
    ]

    trial = 1
    while trial < N + 1:
        # the order of the random draws determines the seeded test cases
        n_ex = np.random.randint(1, 10)
        l_in = np.random.randint(1, 10)
        n_in = np.random.randint(1, 3)
        n_out = np.random.randint(1, 3)
        f_width = min(l_in, np.random.randint(1, 5))
        p = np.random.randint(0, 5)
        s = np.random.randint(1, 3)
        d = np.random.randint(0, 5)

        # effective kernel extent once dilation is taken into account
        fc = f_width * (d + 1) - d
        l_out = int(1 + (l_in + 2 * p - fc) / s)

        # redraw when the configuration yields an empty output
        if not l_out > 0:
            continue

        X = random_tensor((n_ex, l_in, n_in), standardize=True)

        # pick an activation at random
        act_fn, torch_fn, _ = acts[np.random.randint(0, len(acts))]

        # build the Conv1D layer under test
        layer = Conv1D(
            out_ch=n_out,
            kernel_width=f_width,
            act_fn=act_fn,
            pad=p,
            stride=s,
            dilation=d,
        )

        # forward pass, then backprop an all-ones gradient
        y_pred = layer.forward(X)
        dLdX = layer.backward(np.ones_like(y_pred))

        # reference values from the torch implementation
        reference = TorchConv1DLayer(
            n_in, n_out, torch_fn, layer.parameters, layer.hyperparameters
        )
        expected = reference.extract_grads(X)

        checks = [
            (layer.X[0], "X"),
            (y_pred, "y"),
            (layer.parameters["W"], "W"),
            (layer.parameters["b"], "b"),
            (layer.gradients["W"], "dLdW"),
            (layer.gradients["b"], "dLdB"),
            (dLdX, "dLdX"),
        ]

        print("\nTrial {}".format(trial))
        print("pad={}, stride={}, f_width={}, n_ex={}".format(p, s, f_width, n_ex))
        print("l_in={}, n_in={}".format(l_in, n_in))
        print("l_out={}, n_out={}".format(l_out, n_out))
        print("dilation={}".format(d))
        for pos, (observed, name) in enumerate(checks):
            assert_almost_equal(
                observed,
                expected[name],
                decimal=4,
                err_msg=err_fmt(checks, expected, pos),
            )
            print("\tPASSED {}".format(name))
        trial += 1


def test_Deconv2D(N=15):
    """
    Compare `Deconv2D` with the `TorchDeconv2DLayer` reference.

    Each trial draws a random layer configuration (kernel shape, padding,
    stride, activation) and a random input. A configuration is redrawn,
    without counting as a trial, when its computed output height or width
    is not positive or when the layer's forward pass raises `ValueError`.
    Inputs, outputs, parameters, and gradients are compared to 4 decimal
    places.

    Parameters
    ----------
    N : int or None
        Number of trials to run. If None, `N` is replaced by infinity and
        the loop only stops when an assertion fails. Default is 15.
    """
    from numpy_ml.neural_nets.layers import Deconv2D
    from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine

    if N is None:
        N = np.inf

    np.random.seed(12345)

    # (numpy-ml activation, torch counterpart, display name)
    acts = [
        (Tanh(), nn.Tanh(), "Tanh"),
        (Sigmoid(), nn.Sigmoid(), "Sigmoid"),
        (ReLU(), nn.ReLU(), "ReLU"),
        (Affine(), TorchLinearActivation(), "Affine"),
    ]

    trial = 1
    while trial < N + 1:
        # the order of the random draws determines the seeded test cases
        n_ex = np.random.randint(1, 10)
        in_rows = np.random.randint(1, 10)
        in_cols = np.random.randint(1, 10)
        n_in = np.random.randint(1, 3)
        n_out = np.random.randint(1, 3)
        k_rows = min(in_rows, np.random.randint(1, 5))
        k_cols = min(in_cols, np.random.randint(1, 5))
        f_shape = (k_rows, k_cols)
        p = np.random.randint(0, 5)
        s = np.random.randint(1, 3)

        out_rows = s * (in_rows - 1) - 2 * p + f_shape[0]
        out_cols = s * (in_cols - 1) - 2 * p + f_shape[1]

        # redraw when the configuration yields an empty output
        if min(out_rows, out_cols) <= 0:
            continue

        X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True)

        # pick an activation at random
        act_fn, torch_fn, _ = acts[np.random.randint(0, len(acts))]

        layer = Deconv2D(
            out_ch=n_out,
            kernel_shape=f_shape,
            act_fn=act_fn,
            pad=p,
            stride=s,
        )

        # forward pass; a ValueError from the layer means "try another draw"
        try:
            y_pred = layer.forward(X)
        except ValueError:
            print("Improper dimensions; retrying")
            continue

        # backprop an all-ones gradient
        dLdX = layer.backward(np.ones_like(y_pred))

        # reference values from the torch implementation
        reference = TorchDeconv2DLayer(
            n_in, n_out, torch_fn, layer.parameters, layer.hyperparameters
        )
        expected = reference.extract_grads(X)

        checks = [
            (layer.X[0], "X"),
            (layer.parameters["W"], "W"),
            (layer.parameters["b"], "b"),
            (y_pred, "y"),
            (layer.gradients["W"], "dLdW"),
            (layer.gradients["b"], "dLdB"),
            (dLdX, "dLdX"),
        ]

        print("\nTrial {}".format(trial))
        print("pad={}, stride={}, f_shape={}, n_ex={}".format(p, s, f_shape, n_ex))
        print("in_rows={}, in_cols={}, n_in={}".format(in_rows, in_cols, n_in))
        print("out_rows={}, out_cols={}, n_out={}".format(out_rows, out_cols, n_out))
        for pos, (observed, name) in enumerate(checks):
            assert_almost_equal(
                observed,
                expected[name],
                decimal=4,
                err_msg=err_fmt(checks, expected, pos),
            )
            print("\tPASSED {}".format(name))
        trial += 1


def test_Pool2D(N=15):
    """
    Compare `Pool2D` with the `TorchPool2DLayer` reference.

    Each trial draws a random kernel shape, padding, and stride, runs a
    forward and a backward pass (all-ones upstream gradient) on a random
    input, and compares the cached input, the output, and the input
    gradient to 4 decimal places. Only ``mode="average"`` is exercised.

    Parameters
    ----------
    N : int or None
        Number of trials to run. If None, `N` is replaced by infinity and
        the loop only stops when an assertion fails. Default is 15.
    """
    from numpy_ml.neural_nets.layers import Pool2D

    if N is None:
        N = np.inf

    np.random.seed(12345)

    trial = 1
    while trial < N + 1:
        # the order of the random draws determines the seeded test cases
        n_ex = np.random.randint(1, 10)
        in_rows = np.random.randint(1, 10)
        in_cols = np.random.randint(1, 10)
        n_in = np.random.randint(1, 3)
        k_rows = min(in_rows, np.random.randint(1, 5))
        k_cols = min(in_cols, np.random.randint(1, 5))
        f_shape = (k_rows, k_cols)
        # padding is drawn below half of the smaller kernel side
        p = np.random.randint(0, max(1, min(f_shape) // 2))
        s = np.random.randint(1, 3)

        # the "max" mode is not part of this test
        mode = "average"

        out_rows = int(1 + (in_rows + 2 * p - f_shape[0]) / s)
        out_cols = int(1 + (in_cols + 2 * p - f_shape[1]) / s)

        X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True)
        print("\nmode: {}".format(mode))
        print("pad={}, stride={}, f_shape={}, n_ex={}".format(p, s, f_shape, n_ex))
        print("in_rows={}, in_cols={}, n_in={}".format(in_rows, in_cols, n_in))
        print("out_rows={}, out_cols={}, n_out={}".format(out_rows, out_cols, n_in))

        layer = Pool2D(kernel_shape=f_shape, pad=p, stride=s, mode=mode)

        # forward pass, then backprop an all-ones gradient
        y_pred = layer.forward(X)
        dLdX = layer.backward(np.ones_like(y_pred))

        # reference values from the torch implementation
        reference = TorchPool2DLayer(n_in, layer.hyperparameters)
        expected = reference.extract_grads(X)

        checks = [
            (layer.X[0], "X"),
            (y_pred, "y"),
            (dLdX, "dLdX"),
        ]
        for pos, (observed, name) in enumerate(checks):
            assert_almost_equal(
                observed,
                expected[name],
                decimal=4,
                err_msg=err_fmt(checks, expected, pos),
            )
            print("\tPASSED {}".format(name))
        trial += 1


def test_LSTMCell(N=15):
    """
    Compare `LSTMCell` with the `TorchLSTMCell` reference.

    Each trial unrolls a fresh cell over `n_t` timesteps of a random input,
    backpropagates an all-ones gradient once per timestep, and checks the
    cell states, outputs, parameters, and gradients against the reference
    with ``atol = rtol = 1e-4``.

    Parameters
    ----------
    N : int or None
        Number of trials to run. If None, `N` is replaced by infinity and
        the loop only stops when an assertion fails. Default is 15.
    """
    from numpy_ml.neural_nets.layers import LSTMCell

    if N is None:
        N = np.inf

    np.random.seed(12345)

    trial = 1
    while trial < N + 1:
        # the order of the random draws determines the seeded test cases
        n_ex = np.random.randint(1, 10)
        n_in = np.random.randint(1, 10)
        n_out = np.random.randint(1, 10)
        n_t = np.random.randint(1, 10)
        X = random_tensor((n_ex, n_in, n_t), standardize=True)

        cell = LSTMCell(n_out=n_out)

        # forward pass, one timestep slice at a time
        outputs, cell_states = [], []
        for t in range(n_t):
            y_t, C_t = cell.forward(X[:, :, t])
            outputs.append(y_t)
            cell_states.append(C_t)

        # backward pass: the same all-ones gradient is fed at every step;
        # results are collected so that the first call ends up last
        dLdAt = np.ones_like(outputs[-1])
        per_step = [cell.backward(dLdAt) for _ in range(n_t)]
        dLdX = np.dstack(per_step[::-1])
        y_preds = np.dstack(outputs)

        # reference values from the torch implementation
        reference = TorchLSTMCell(n_in, n_out, cell.parameters)
        expected = reference.extract_grads(X)

        weights, grads = cell.parameters, cell.gradients
        checks = [
            (X, "X"),
            (np.array(cell_states), "C"),
            (y_preds, "y"),
            (weights["bo"].T, "bo"),
            (weights["bu"].T, "bu"),
            (weights["bf"].T, "bf"),
            (weights["bc"].T, "bc"),
            (weights["Wo"], "Wo"),
            (weights["Wu"], "Wu"),
            (weights["Wf"], "Wf"),
            (weights["Wc"], "Wc"),
            (grads["bo"].T, "dLdBo"),
            (grads["bu"].T, "dLdBu"),
            (grads["bf"].T, "dLdBf"),
            (grads["bc"].T, "dLdBc"),
            (grads["Wo"], "dLdWo"),
            (grads["Wu"], "dLdWu"),
            (grads["Wf"], "dLdWf"),
            (grads["Wc"], "dLdWc"),
            (dLdX, "dLdX"),
        ]

        print("Case {}".format(trial))
        for pos, (observed, name) in enumerate(checks):
            np.testing.assert_allclose(
                observed,
                expected[name],
                rtol=1e-4,
                atol=1e-4,
                err_msg=err_fmt(checks, expected, pos),
            )
            print("\tPASSED {}".format(name))
        trial += 1


def grad_check_RNN(model, loss_func, param_name, n_t, X, epsilon=1e-7):
    """
    Manual gradient calc for vanilla RNN parameters

    Estimates the gradient of a scalar loss with respect to one model
    parameter using central finite differences: each entry of the parameter
    is perturbed by ``+epsilon`` and ``-epsilon``, the model is unrolled for
    `n_t` timesteps, and the two losses are differenced.

    The model's parameter is restored to its original array before the
    function returns, including when the forward pass or loss raises.

    Parameters
    ----------
    model : object
        Object exposing a `parameters` dict, a `forward` method that is
        called on the timestep slice ``X[:, :, t]``, and a
        `flush_gradients` method.
    loss_func : callable
        Receives the list of per-timestep `forward` outputs and returns the
        scalar loss.
    param_name : str
        Name of the parameter to check. "Ba" and "Bx" are lower-cased
        before the lookup in ``model.parameters``. "X" and "y" are skipped.
    n_t : int
        Number of timesteps to unroll.
    X : numpy array
        Input, indexed as ``X[:, :, t]`` for timestep `t`.
    epsilon : float
        Size of the perturbation. Default is 1e-7.

    Returns
    -------
    grads : numpy array or None
        Transpose of the array of finite-difference estimates (which has
        the same shape as the parameter), or None if `param_name` is "X"
        or "y".
    """
    if param_name in ["Ba", "Bx"]:
        param_name = param_name.lower()
    elif param_name in ["X", "y"]:
        return None

    param_orig = model.parameters[param_name]
    model.flush_gradients()
    grads = np.zeros_like(param_orig)

    def _loss_with(param):
        # unroll the model with `param` installed and return the loss
        model.parameters[param_name] = param
        y_preds = [model.forward(X[:, :, t]) for t in range(n_t)]
        loss = loss_func(y_preds)
        model.flush_gradients()
        return loss

    # a single working copy is perturbed one entry at a time; the original
    # array is never modified
    param = deepcopy(param_orig)
    try:
        for flat_ix, val in enumerate(param_orig.flat):
            md_ix = np.unravel_index(flat_ix, param.shape)

            param[md_ix] = val + epsilon
            loss_plus = _loss_with(param)

            param[md_ix] = val - epsilon
            loss_minus = _loss_with(param)

            # undo the perturbation before moving to the next entry
            param[md_ix] = val

            grads[md_ix] = (loss_plus - loss_minus) / (2 * epsilon)
    finally:
        # do not leave the model holding a perturbed copy of the parameter
        model.parameters[param_name] = param_orig
    return grads.T


#######################################################################
#                               Modules                               #
#######################################################################


def test_MultiHeadedAttentionModule(N=15):
    """
    Compare `MultiHeadedAttentionModule` with the
    `TorchMultiHeadedAttentionModule` reference.

    Each trial draws a number of heads and a per-head latent dimension,
    builds random query, key, and value matrices whose width is their
    product, and runs a forward and a backward pass (all-ones upstream
    gradient) through a module with ``dropout_p=0``. Inputs, parameters,
    intermediate values, outputs, and gradients are compared to 4 decimal
    places.

    Parameters
    ----------
    N : int or None
        Number of trials to run. If None, `N` is replaced by infinity and
        the loop only stops when an assertion fails. Default is 15.
    """
    from numpy_ml.neural_nets.modules import MultiHeadedAttentionModule

    if N is None:
        N = np.inf
    np.random.seed(12345)

    trial = 1
    while trial < N + 1:
        # the order of the random draws determines the seeded test cases
        n_ex = np.random.randint(1, 10)
        latent_dim = np.random.randint(1, 20)
        n_heads = np.random.randint(2, 10)
        d_k = d_v = n_heads * latent_dim

        Q = random_tensor((n_ex, d_k), standardize=True)
        K = random_tensor((n_ex, d_k), standardize=True)
        V = random_tensor((n_ex, d_v), standardize=True)

        module = MultiHeadedAttentionModule(n_heads=n_heads, dropout_p=0)

        # forward pass, then backprop an all-ones gradient
        y_pred = module.forward(Q, K, V)
        dLdy = np.ones_like(y_pred)
        dLdQ, dLdK, dLdV = module.backward(dLdy)

        # reference values from the torch implementation
        reference = TorchMultiHeadedAttentionModule(
            module.parameters, module.hyperparameters
        )
        expected = reference.extract_grads(Q, K, V)

        dv = module.derived_variables
        weights = module.parameters["components"]
        grads = module.gradients["components"]
        checks = [
            (Q, "Q"),
            (K, "K"),
            (V, "V"),
            (module.n_heads, "n_heads"),
            (module.latent_dim, "latent_dim"),
            (weights["O"]["W"], "O_W"),
            (weights["K"]["W"], "K_W"),
            (weights["V"]["W"], "V_W"),
            (weights["Q"]["W"], "Q_W"),
            (weights["O"]["b"], "O_b"),
            (weights["K"]["b"], "K_b"),
            (weights["V"]["b"], "V_b"),
            (weights["Q"]["b"], "Q_b"),
            (dv["Q_proj"], "Q_proj"),
            (dv["K_proj"], "K_proj"),
            (dv["V_proj"], "V_proj"),
            (dv["attention_weights"][0], "weights"),
            (dv["attention_out"], "attn_out"),
            (y_pred, "Y"),
            (dLdy, "dLdy"),
            (dv["dQ_proj"], "dQ_proj"),
            (dv["dK_proj"], "dK_proj"),
            (dv["dV_proj"], "dV_proj"),
            (grads["O"]["W"], "dO_W"),
            (grads["V"]["W"], "dV_W"),
            (grads["K"]["W"], "dK_W"),
            (grads["Q"]["W"], "dQ_W"),
            (grads["O"]["b"], "dO_b"),
            (grads["V"]["b"], "dV_b"),
            (grads["K"]["b"], "dK_b"),
            (grads["Q"]["b"], "dQ_b"),
            (dLdQ, "dQ"),
            (dLdK, "dK"),
            (dLdV, "dV"),
        ]

        print("\nTrial {}".format(trial))
        summary = "n_ex={} d_k=d_v={} latent_dim={} n_heads={}".format(
            n_ex, d_k, latent_dim, n_heads
        )
        print(summary)
        for pos, (observed, name) in enumerate(checks):
            assert_almost_equal(
                observed,
                expected[name],
                decimal=4,
                err_msg=err_fmt(checks, expected, pos),
            )
            print("\tPASSED {}".format(name))
        trial += 1


def test_SkipConnectionIdentityModule(N=15):
    """
    Compare `SkipConnectionIdentityModule` with the
    `TorchSkipConnectionIdentity` reference.

    Each trial draws random kernel shapes, strides, and an activation for
    the two convolutions, plus a random input. The padding for each
    convolution is obtained from `calc_pad_dims_2D`; a draw is discarded
    (without counting as a trial) when that padding differs between the
    two sides of either dimension. Parameters, intermediate outputs, and
    gradients are compared to 2 decimal places.

    Parameters
    ----------
    N : int or None
        Number of trials to run. If None, `N` is replaced by infinity and
        the loop only stops when an assertion fails. Default is 15.
    """
    from numpy_ml.neural_nets.modules import SkipConnectionIdentityModule
    from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine

    if N is None:
        N = np.inf

    np.random.seed(12345)

    # (numpy-ml activation, torch counterpart, display name)
    acts = [
        (Tanh(), nn.Tanh(), "Tanh"),
        (Sigmoid(), nn.Sigmoid(), "Sigmoid"),
        (ReLU(), nn.ReLU(), "ReLU"),
        (Affine(), TorchLinearActivation(), "Affine"),
    ]

    trial = 1
    while trial < N + 1:
        # the order of the random draws determines the seeded test cases
        n_ex = np.random.randint(2, 10)
        in_rows = np.random.randint(2, 25)
        in_cols = np.random.randint(2, 25)
        n_in = np.random.randint(2, 5)
        n_out = n_in
        k1_rows = min(in_rows, np.random.randint(1, 5))
        k1_cols = min(in_cols, np.random.randint(1, 5))
        f_shape1 = (k1_rows, k1_cols)
        k2_rows = min(in_rows, np.random.randint(1, 5))
        k2_cols = min(in_cols, np.random.randint(1, 5))
        f_shape2 = (k2_rows, k2_cols)
        s1 = np.random.randint(1, 5)
        s2 = np.random.randint(1, 5)

        # pick an activation at random
        act_fn, torch_fn, _ = acts[np.random.randint(0, len(acts))]

        X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True)

        # discard draws whose padding is not the same on both sides of a
        # dimension
        pad1 = calc_pad_dims_2D(X.shape, X.shape[1:3], f_shape1, s1)
        if not (pad1[0] == pad1[1] and pad1[2] == pad1[3]):
            continue

        pad2 = calc_pad_dims_2D(X.shape, X.shape[1:3], f_shape2, s2)
        if not (pad2[0] == pad2[1] and pad2[2] == pad2[3]):
            continue

        # keep one entry per dimension
        p1 = (pad1[0], pad1[2])
        p2 = (pad2[0], pad2[2])

        module = SkipConnectionIdentityModule(
            out_ch=n_out,
            kernel_shape1=f_shape1,
            kernel_shape2=f_shape2,
            stride1=s1,
            stride2=s2,
            act_fn=act_fn,
            epsilon=1e-5,
            momentum=0.9,
        )

        # forward pass, then backprop an all-ones gradient
        y_pred = module.forward(X)
        dLdy = np.ones_like(y_pred)
        dLdX = module.backward(dLdy)

        # reference values from the torch implementation
        reference = TorchSkipConnectionIdentity(
            torch_fn,
            p1,
            p2,
            module.parameters,
            module.hyperparameters,
            momentum=module.momentum,
            epsilon=module.epsilon,
        )
        expected = reference.extract_grads(X)

        weights = module.parameters["components"]
        grads = module.gradients["components"]
        # running_var of the batchnorm layers is not part of the comparison
        checks = [
            (X, "X"),
            (weights["conv1"]["W"], "conv1_W"),
            (weights["conv1"]["b"], "conv1_b"),
            (weights["batchnorm1"]["scaler"].T, "bn1_scaler"),
            (weights["batchnorm1"]["intercept"], "bn1_intercept"),
            (weights["batchnorm1"]["running_mean"], "bn1_running_mean"),
            (weights["conv2"]["W"], "conv2_W"),
            (weights["conv2"]["b"], "conv2_b"),
            (weights["batchnorm2"]["scaler"].T, "bn2_scaler"),
            (weights["batchnorm2"]["intercept"], "bn2_intercept"),
            (weights["batchnorm2"]["running_mean"], "bn2_running_mean"),
            (module._dv["conv1_out"], "act1_out"),
            (module._dv["batchnorm1_out"], "bn1_out"),
            (module._dv["conv2_out"], "conv2_out"),
            (module._dv["batchnorm2_out"], "bn2_out"),
            (y_pred, "Y"),
            (dLdy, "dLdY"),
            (module.derived_variables["dLdBn2"], "dLdBn2_out"),
            (module.derived_variables["dLdConv2"], "dLdConv2_out"),
            (module.derived_variables["dLdBn1"], "dLdBn1_out"),
            (module.derived_variables["dLdConv1"], "dLdActFn1_out"),
            (dLdX, "dLdX"),
            (grads["batchnorm2"]["scaler"].T, "dLdBn2_scaler"),
            (grads["batchnorm2"]["intercept"], "dLdBn2_intercept"),
            (grads["conv2"]["W"], "dLdConv2_W"),
            (grads["conv2"]["b"], "dLdConv2_b"),
            (grads["batchnorm1"]["scaler"].T, "dLdBn1_scaler"),
            (grads["batchnorm1"]["intercept"], "dLdBn1_intercept"),
            (grads["conv1"]["W"], "dLdConv1_W"),
            (grads["conv1"]["b"], "dLdConv1_b"),
        ]

        print("\nTrial {}".format(trial))
        print("act_fn={}, n_ex={}".format(act_fn, n_ex))
        print("in_rows={}, in_cols={}, n_in={}".format(in_rows, in_cols, n_in))
        print("pad1={}, stride1={}, f_shape1={}".format(p1, s1, f_shape1))
        print("pad2={}, stride2={}, f_shape2={}".format(p2, s2, f_shape2))
        for pos, (observed, name) in enumerate(checks):
            assert_almost_equal(
                observed,
                expected[name],
                decimal=2,
                err_msg=err_fmt(checks, expected, pos),
            )
            print("\tPASSED {}".format(name))
        trial += 1


def test_SkipConnectionConvModule(N=15):
    """
    Compare `SkipConnectionConvModule` with the `TorchSkipConnectionConv`
    reference.

    Each trial draws random channel counts, kernel shapes, strides,
    paddings, and an activation, plus a random input. A draw is discarded
    (without counting as a trial) when the module's forward pass raises
    `ValueError` or `AssertionError`, or when the module's "pad_skip"
    hyperparameter differs between the two sides of either dimension.
    Parameters, intermediate outputs, and gradients are compared to 2
    decimal places.

    Parameters
    ----------
    N : int or None
        Number of trials to run. If None, `N` is replaced by infinity and
        the loop only stops when an assertion fails. Default is 15.
    """
    from numpy_ml.neural_nets.modules import SkipConnectionConvModule
    from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine

    if N is None:
        N = np.inf

    np.random.seed(12345)

    # (numpy-ml activation, torch counterpart, display name)
    acts = [
        (Tanh(), nn.Tanh(), "Tanh"),
        (Sigmoid(), nn.Sigmoid(), "Sigmoid"),
        (ReLU(), nn.ReLU(), "ReLU"),
        (Affine(), TorchLinearActivation(), "Affine"),
    ]

    # appended to every failure message
    warn_str = (
        "\n[NOTE] The tests in this module can fail sometimes during "
        "backprop due to the ReLU issue: while the difference in the forward pass "
        "between z=-1e-9 and z=1e-9 is miniscule, the difference during the backward "
        "pass is significant due to ReLU's kink about 0."
    )

    trial = 1
    while trial < N + 1:
        # the order of the random draws determines the seeded test cases
        n_ex = np.random.randint(2, 10)
        in_rows = np.random.randint(2, 10)
        in_cols = np.random.randint(2, 10)
        n_in = np.random.randint(2, 5)
        n_out1 = np.random.randint(2, 5)
        n_out2 = np.random.randint(2, 5)
        k1_rows = min(in_rows, np.random.randint(1, 5))
        k1_cols = min(in_cols, np.random.randint(1, 5))
        f_shape1 = (k1_rows, k1_cols)
        k2_rows = min(in_rows, np.random.randint(1, 5))
        k2_cols = min(in_cols, np.random.randint(1, 5))
        f_shape2 = (k2_rows, k2_cols)
        ks_rows = min(in_rows, np.random.randint(1, 5))
        ks_cols = min(in_cols, np.random.randint(1, 5))
        f_shape_skip = (ks_rows, ks_cols)

        s1 = np.random.randint(1, 5)
        s2 = np.random.randint(1, 5)
        s_skip = np.random.randint(1, 5)

        # pick an activation at random
        act_fn, torch_fn, _ = acts[np.random.randint(0, len(acts))]

        X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True)

        p1 = (np.random.randint(1, 5), np.random.randint(1, 5))
        p2 = (np.random.randint(1, 5), np.random.randint(1, 5))

        module = SkipConnectionConvModule(
            out_ch1=n_out1,
            out_ch2=n_out2,
            kernel_shape1=f_shape1,
            kernel_shape2=f_shape2,
            kernel_shape_skip=f_shape_skip,
            stride1=s1,
            stride2=s2,
            stride_skip=s_skip,
            pad1=p1,
            pad2=p2,
            act_fn=act_fn,
            epsilon=1e-5,
            momentum=0.9,
        )

        # forward pass; an error from the module means "try another draw"
        try:
            y_pred = module.forward(X)
        except (ValueError, AssertionError):
            print("Invalid padding; Retrying")
            continue

        # discard draws whose skip padding is not the same on both sides of
        # a dimension, then keep one entry per dimension
        ps = module.hyperparameters["pad_skip"]
        if not (ps[0] == ps[1] and ps[2] == ps[3]):
            continue
        pad_skip = (ps[0], ps[2])

        # backprop an all-ones gradient
        dLdy = np.ones_like(y_pred)
        dLdX = module.backward(dLdy)

        # reference values from the torch implementation
        reference = TorchSkipConnectionConv(
            torch_fn,
            p1,
            p2,
            pad_skip,
            module.parameters,
            module.hyperparameters,
            momentum=module.momentum,
            epsilon=module.epsilon,
        )
        expected = reference.extract_grads(X)

        weights = module.parameters["components"]
        grads = module.gradients["components"]
        # running_var of the batchnorm layers is not part of the comparison
        checks = [
            (X, "X"),
            (weights["conv1"]["W"], "conv1_W"),
            (weights["conv1"]["b"], "conv1_b"),
            (weights["batchnorm1"]["scaler"].T, "bn1_scaler"),
            (weights["batchnorm1"]["intercept"], "bn1_intercept"),
            (weights["batchnorm1"]["running_mean"], "bn1_running_mean"),
            (weights["conv2"]["W"], "conv2_W"),
            (weights["conv2"]["b"], "conv2_b"),
            (weights["batchnorm2"]["scaler"].T, "bn2_scaler"),
            (weights["batchnorm2"]["intercept"], "bn2_intercept"),
            (weights["batchnorm2"]["running_mean"], "bn2_running_mean"),
            (weights["conv_skip"]["W"], "conv_skip_W"),
            (weights["conv_skip"]["b"], "conv_skip_b"),
            (weights["batchnorm_skip"]["scaler"].T, "bn_skip_scaler"),
            (weights["batchnorm_skip"]["intercept"], "bn_skip_intercept"),
            (weights["batchnorm_skip"]["running_mean"], "bn_skip_running_mean"),
            (module._dv["conv1_out"], "act1_out"),
            (module._dv["batchnorm1_out"], "bn1_out"),
            (module._dv["conv2_out"], "conv2_out"),
            (module._dv["batchnorm2_out"], "bn2_out"),
            (module._dv["conv_skip_out"], "conv_skip_out"),
            (module._dv["batchnorm_skip_out"], "bn_skip_out"),
            (y_pred, "Y"),
            (dLdy, "dLdY"),
            (module.derived_variables["dLdBn2"], "dLdBn2_out"),
            (module.derived_variables["dLdConv2"], "dLdConv2_out"),
            (module.derived_variables["dLdBnSkip"], "dLdBnSkip_out"),
            (module.derived_variables["dLdConvSkip"], "dLdConvSkip_out"),
            (module.derived_variables["dLdBn1"], "dLdBn1_out"),
            (module.derived_variables["dLdConv1"], "dLdActFn1_out"),
            (dLdX, "dLdX"),
            (grads["batchnorm_skip"]["scaler"].T, "dLdBnSkip_scaler"),
            (grads["batchnorm_skip"]["intercept"], "dLdBnSkip_intercept"),
            (grads["conv_skip"]["W"], "dLdConvSkip_W"),
            (grads["conv_skip"]["b"], "dLdConvSkip_b"),
            (grads["batchnorm2"]["scaler"].T, "dLdBn2_scaler"),
            (grads["batchnorm2"]["intercept"], "dLdBn2_intercept"),
            (grads["conv2"]["W"], "dLdConv2_W"),
            (grads["conv2"]["b"], "dLdConv2_b"),
            (grads["batchnorm1"]["scaler"].T, "dLdBn1_scaler"),
            (grads["batchnorm1"]["intercept"], "dLdBn1_intercept"),
            (grads["conv1"]["W"], "dLdConv1_W"),
            (grads["conv1"]["b"], "dLdConv1_b"),
        ]

        print("\nTrial {}".format(trial))
        print("act_fn={}, n_ex={}".format(act_fn, n_ex))
        print("in_rows={}, in_cols={}, n_in={}".format(in_rows, in_cols, n_in))
        print("pad1={}, stride1={}, f_shape1={}".format(p1, s1, f_shape1))
        print("pad2={}, stride2={}, f_shape2={}".format(p2, s2, f_shape2))
        print("stride_skip={}, f_shape_skip={}".format(s_skip, f_shape_skip))
        for pos, (observed, name) in enumerate(checks):
            assert_almost_equal(
                observed,
                expected[name],
                decimal=2,
                err_msg=err_fmt(checks, expected, pos, warn_str),
            )
            print("\tPASSED {}".format(name))
        trial += 1


def test_BidirectionalLSTM(N=15):
    """Compare `BidirectionalLSTM` against the `TorchBidirectionalLSTM` reference.

    Every trial draws random sizes, runs a forward and a backward pass through
    the numpy-ml module, and checks the input, output, per-cell parameters,
    per-cell gradients, and input gradient against the values returned by the
    reference model's `extract_grads`.

    Parameters
    ----------
    N : int or None
        Number of random trials to run. If None, the loop never terminates.
    """
    from numpy_ml.neural_nets.modules import BidirectionalLSTM

    n_trials = np.inf if N is None else N

    np.random.seed(12345)

    # gate suffixes and (component key, label suffix) pairs used to build the
    # comparison list in a fixed order
    gate_ids = ("o", "u", "f", "c")
    cells = (("cell_fwd", "f"), ("cell_bwd", "b"))

    trial = 1
    while trial < n_trials + 1:
        n_ex = np.random.randint(1, 10)
        n_in = np.random.randint(1, 10)
        n_out = np.random.randint(1, 10)
        n_t = np.random.randint(1, 10)
        X = random_tensor((n_ex, n_in, n_t), standardize=True)

        # forward and backward pass through the module under test
        lstm = BidirectionalLSTM(n_out=n_out)
        y_pred = lstm.forward(X)
        upstream = np.ones_like(y_pred)
        dLdX = lstm.backward(upstream)

        # reference values from the torch model built on the same parameters
        reference = TorchBidirectionalLSTM(n_in, n_out, lstm.parameters)
        golds = reference.extract_grads(X)

        weights = lstm.parameters["components"]
        grads = lstm.gradients["components"]

        # order matters: `err_fmt` reports the entry preceding a failure
        params = [(X, "X"), (y_pred, "y")]
        for cell, tag in cells:
            for g in gate_ids:
                params.append((weights[cell]["b" + g].T, "b{}_{}".format(g, tag)))
            for g in gate_ids:
                params.append((weights[cell]["W" + g], "W{}_{}".format(g, tag)))
        for cell, tag in cells:
            for g in gate_ids:
                params.append((grads[cell]["b" + g].T, "dLdB{}_{}".format(g, tag)))
            for g in gate_ids:
                params.append((grads[cell]["W" + g], "dLdW{}_{}".format(g, tag)))
        params.append((dLdX, "dLdX"))

        print("Case {}".format(trial))
        for ix, (value, name) in enumerate(params):
            np.testing.assert_allclose(
                value,
                golds[name],
                err_msg=err_fmt(params, golds, ix),
                atol=1e-4,
                rtol=1e-4,
            )

            print("\tPASSED {}".format(name))
        trial += 1


def test_WaveNetModule(N=10):
    """Compare `WavenetResidualModule` against the `TorchWavenetModule` reference.

    Each trial draws random sizes, runs a forward and backward pass through the
    numpy-ml module, and checks inputs, parameters, intermediate activations,
    outputs, and gradients against the values returned by the reference
    model's `extract_grads`.

    Parameters
    ----------
    N : int or None
        Number of successful trials to run. If None, the loop never
        terminates. Skipped trials are not counted.
    """
    from numpy_ml.neural_nets.modules import WavenetResidualModule

    N = np.inf if N is None else N

    np.random.seed(12345)

    i = 1
    while i < N + 1:
        n_ex = np.random.randint(1, 10)
        l_in = np.random.randint(1, 10)
        ch_residual, ch_dilation = np.random.randint(1, 5), np.random.randint(1, 5)
        f_width = min(l_in, np.random.randint(1, 5))
        d = np.random.randint(0, 5)

        # X_main is all zeros except for a single 1.0 at index [0][0][0];
        # `random_tensor` is only used here as a template for `zeros_like`
        X_main = np.zeros_like(
            random_tensor((n_ex, l_in, ch_residual), standardize=True)
        )
        X_main[0][0][0] = 1.0
        # X_skip is all zeros
        X_skip = np.zeros_like(
            random_tensor((n_ex, l_in, ch_residual), standardize=True)
        )

        # initialize the WaveNet residual module
        L1 = WavenetResidualModule(
            ch_residual=ch_residual,
            ch_dilation=ch_dilation,
            kernel_width=f_width,
            dilation=d,
        )

        # forward prop
        Y_main, Y_skip = L1.forward(X_main, X_skip)

        # backprop (note the argument order: skip gradient first, then main)
        dLdY_skip = np.ones_like(Y_skip)
        dLdY_main = np.ones_like(Y_main)
        dLdX_main, dLdX_skip = L1.backward(dLdY_skip, dLdY_main)

        # compute the "same" padding for a width-1, stride-1 kernel; the
        # reference model is given a single pad value, so trials with
        # unequal left/right padding are skipped and redrawn
        _, conv_1x1_pad = pad1D(
            L1._dv["multiply_gate_out"], "same", kernel_width=1, stride=1, dilation=0
        )
        if conv_1x1_pad[0] != conv_1x1_pad[1]:
            print("Skipping")
            continue

        conv_1x1_pad = conv_1x1_pad[0]

        # get gold standard gradients
        gold_mod = TorchWavenetModule(L1.parameters, L1.hyperparameters, conv_1x1_pad)
        golds = gold_mod.extract_grads(X_main, X_skip)

        dv = L1.derived_variables
        pc = L1.parameters["components"]
        gr = L1.gradients["components"]

        # (value from the numpy-ml module, key into `golds`) pairs; the order
        # matters because `err_fmt` reports the entry preceding a failure
        params = [
            (L1.X_main, "X_main"),
            (L1.X_skip, "X_skip"),
            (pc["conv_dilation"]["W"], "conv_dilation_W"),
            (pc["conv_dilation"]["b"], "conv_dilation_b"),
            (pc["conv_1x1"]["W"], "conv_1x1_W"),
            (pc["conv_1x1"]["b"], "conv_1x1_b"),
            (dv["conv_dilation_out"], "conv_dilation_out"),
            (dv["tanh_out"], "tanh_out"),
            (dv["sigm_out"], "sigm_out"),
            (dv["multiply_gate_out"], "multiply_gate_out"),
            (dv["conv_1x1_out"], "conv_1x1_out"),
            (Y_main, "Y_main"),
            (Y_skip, "Y_skip"),
            (dLdY_skip, "dLdY_skip"),
            (dLdY_main, "dLdY_main"),
            (dv["dLdConv_1x1"], "dLdConv_1x1_out"),
            (gr["conv_1x1"]["W"], "dLdConv_1x1_W"),
            (gr["conv_1x1"]["b"], "dLdConv_1x1_b"),
            (dv["dLdMultiply"], "dLdMultiply_out"),
            (dv["dLdTanh"], "dLdTanh_out"),
            (dv["dLdSigmoid"], "dLdSigm_out"),
            (dv["dLdConv_dilation"], "dLdConv_dilation_out"),
            (gr["conv_dilation"]["W"], "dLdConv_dilation_W"),
            (gr["conv_dilation"]["b"], "dLdConv_dilation_b"),
            (dLdX_main, "dLdX_main"),
            (dLdX_skip, "dLdX_skip"),
        ]

        print("\nTrial {}".format(i))
        print("f_width={}, n_ex={}".format(f_width, n_ex))
        print("l_in={}, ch_residual={}".format(l_in, ch_residual))
        print("ch_dilation={} dilation={}".format(ch_dilation, d))
        for ix, (mine, label) in enumerate(params):
            assert_almost_equal(
                mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4
            )
            print("\tPASSED {}".format(label))
        i += 1


#######################################################################
#                                Utils                                #
#######################################################################


def test_pad1D(N=15):
    """Check `pad1D` by convolving manually padded inputs and comparing to torch.

    Each trial pads a random input with `pad1D` ("same" or "causal"), runs it
    through a numpy-ml `Conv1D` layer configured with ``pad=0``, and compares
    the result with a torch convolution that uses the same weights. For
    "causal" padding the reference is `TorchCausalConv1d` fed the unpadded
    input; for "same" padding it is `nn.Conv1d` fed the padded input.

    Trials are redrawn (and not counted) when the numpy-ml forward pass raises
    `ValueError`, or when causal padding is combined with a stride other
    than 1.

    Parameters
    ----------
    N : int or None
        Number of successful trials to run. If None, the loop never
        terminates.
    """
    from numpy_ml.neural_nets.layers import Conv1D
    from .nn_torch_models import TorchCausalConv1d, torchify

    np.random.seed(12345)

    N = np.inf if N is None else N

    i = 1
    while i < N + 1:
        p = np.random.choice(["same", "causal"])
        n_ex = np.random.randint(1, 10)
        l_in = np.random.randint(1, 10)
        n_in, n_out = np.random.randint(1, 3), np.random.randint(1, 3)
        f_width = min(l_in, np.random.randint(1, 5))
        s = np.random.randint(1, 3)
        d = np.random.randint(0, 5)

        X = random_tensor((n_ex, l_in, n_in), standardize=True)
        X_pad, _ = pad1D(X, p, kernel_width=f_width, stride=s, dilation=d)

        # initialize Conv1D layer; padding was already applied by `pad1D`
        L1 = Conv1D(out_ch=n_out, kernel_width=f_width, pad=0, stride=s, dilation=d)

        # forward prop
        try:
            y_pred = L1.forward(X_pad)
        except ValueError:
            continue

        # ignore n. output channels
        print("Trial {}".format(i))
        print("p={} d={} s={} l_in={} f_width={}".format(p, d, s, l_in, f_width))
        print("n_ex={} n_in={} n_out={}".format(n_ex, n_in, n_out))
        # pass the message itself: `print(...)` returns None, which left the
        # AssertionError without any detail
        assert y_pred.shape[:2] == X.shape[:2], "y_pred.shape={} X.shape={}".format(
            y_pred.shape, X.shape
        )

        # torch's `dilation` counts the spacing between taps, hence `d + 1`
        if p == "causal":
            gold = TorchCausalConv1d(
                in_channels=n_in,
                out_channels=n_out,
                kernel_size=f_width,
                stride=s,
                dilation=d + 1,
                bias=True,
            )
            if s != 1:
                print(
                    "TorchCausalConv1D does not do `same` padding for stride > 1. Skipping"
                )
                continue

            XT = torchify(np.moveaxis(X, [0, 1, 2], [0, -1, -2]))
        else:
            gold = nn.Conv1d(
                in_channels=n_in,
                out_channels=n_out,
                kernel_size=f_width,
                padding=0,
                stride=s,
                dilation=d + 1,
                bias=True,
            )
            XT = torchify(np.moveaxis(X_pad, [0, 1, 2], [0, -1, -2]))

        # import weights and biases
        # (f[0], n_in, n_out) -> (n_out, n_in, f[0])
        b = L1.parameters["b"]
        W = np.moveaxis(L1.parameters["W"], [0, 1, 2], [-1, -2, -3])
        assert gold.weight.shape == W.shape
        assert gold.bias.shape == b.flatten().shape

        gold.weight = nn.Parameter(torch.FloatTensor(W))
        gold.bias = nn.Parameter(torch.FloatTensor(b.flatten()))

        outT = gold(XT)
        # fail loudly instead of dropping into an interactive debugger, which
        # would hang unattended test runs
        assert outT.ndimension() == 3, "expected a 3D gold output, got shape {}".format(
            tuple(outT.shape)
        )

        gold_out = np.moveaxis(outT.detach().numpy(), [0, 1, 2], [0, -1, -2])
        assert gold_out.shape[:2] == X.shape[:2]

        np.testing.assert_almost_equal(
            y_pred,
            gold_out,
            err_msg=err_fmt(
                [(y_pred.shape, "out.shape"), (y_pred, "out")],
                {"out.shape": gold_out.shape, "out": gold_out},
                1,
            ),
            decimal=4,
        )
        print("PASSED\n")
        i += 1


def test_conv(N=15):
    """Check `conv2D` against `conv2D_naive` on randomly sized inputs.

    `pad2D` is called only to obtain the padding value that is then forwarded
    to both convolution routines; the padded array it returns is discarded.

    Parameters
    ----------
    N : int or None
        Number of random trials to run. If None, the loop never terminates.
    """
    np.random.seed(12345)
    n_trials = np.inf if N is None else N

    trial = 0
    while trial < n_trials:
        n_ex = np.random.randint(2, 15)
        in_rows = np.random.randint(2, 15)
        in_cols = np.random.randint(2, 15)
        in_ch = np.random.randint(2, 15)
        out_ch = np.random.randint(2, 15)

        # the kernel never exceeds the input along either spatial dimension
        k_rows = min(in_rows, np.random.randint(2, 10))
        k_cols = min(in_cols, np.random.randint(2, 10))

        stride = np.random.randint(1, 3)
        pad = np.random.randint(0, 5)

        X = np.random.rand(n_ex, in_rows, in_cols, in_ch)
        _, pad = pad2D(X, pad)
        W = np.random.randn(k_rows, k_cols, in_ch, out_ch)

        expected = conv2D_naive(X, W, stride, pad)
        actual = conv2D(X, W, stride, pad)

        np.testing.assert_almost_equal(actual, expected)
        print("PASSED")
        trial += 1


#######################################################################
#                               Models                                #
#######################################################################


def fit_VAE():
    """Fit a `BernoulliVAE` for one epoch on the first 128 MNIST training images.

    The images come from keras' `mnist.load_data()`, are cast to float32,
    divided by 255, and given an extra axis at position 3 before fitting.
    Nothing is returned or asserted.
    """
    # for testing
    import tensorflow.keras.datasets.mnist as mnist
    from numpy_ml.neural_nets.models.vae import BernoulliVAE

    np.random.seed(12345)

    def _prepare(images):
        # scale pixel intensities to [0, 1]
        return np.expand_dims(images.astype("float32") / 255.0, 3)

    (train_images, _train_labels), (test_images, _test_labels) = mnist.load_data()

    train_images = _prepare(train_images)
    test_images = _prepare(test_images)  # prepared but not used below

    # 1 batch
    single_batch = train_images[:128]

    vae = BernoulliVAE()
    vae.fit(single_batch, n_epochs=1, verbose=False)


def test_WGAN_GP(N=1):
    """Compare one `WGAN_GP` training step against the `WGAN_GP_tf` reference.

    Each trial draws random sizes, fits the numpy-ml model for a single step
    with a single critic update, and checks weights, intermediate forward
    values, losses, and gradients against the dictionary returned by
    `WGAN_GP_tf`.

    Parameters
    ----------
    N : int or None
        Number of random trials to run. If None, the loop never terminates.
    """
    from numpy_ml.neural_nets.models.wgan_gp import WGAN_GP

    np.random.seed(12345)

    # draw a secondary seed from the fixed one and reseed with it; the value
    # is printed with each trial below
    ss = np.random.randint(0, 1000)
    np.random.seed(ss)

    N = np.inf if N is None else N

    i = 1
    while i < N + 1:
        c_updates_per_epoch, n_steps = 1, 1
        n_ex = np.random.randint(1, 500)
        n_in = np.random.randint(1, 100)
        lambda_ = np.random.randint(0, 20)
        g_hidden = np.random.randint(2, 500)
        X = random_tensor((n_ex, n_in), standardize=True)

        # initialize WGAN_GP model
        L1 = WGAN_GP(g_hidden=g_hidden, debug=True)

        # run a single training step using the whole dataset as one batch
        batchsize = n_ex
        L1.fit(
            X,
            lambda_=lambda_,
            c_updates_per_epoch=c_updates_per_epoch,
            n_steps=n_steps,
            batchsize=batchsize,
        )

        # collect the model's values and add the extra entries that are
        # handed to the reference implementation alongside the weights
        dv = L1.derived_variables
        params = L1.parameters["components"]
        grads = L1.gradients["components"]
        params["noise"] = dv["noise"]
        params["alpha"] = dv["alpha"]
        params["n_in"] = n_in
        params["g_hidden"] = g_hidden
        params["c_updates_per_epoch"] = c_updates_per_epoch
        params["n_steps"] = n_steps

        # get gold standard gradients
        golds = WGAN_GP_tf(X, lambda_=lambda_, batch_size=batchsize, params=params)

        # NOTE: `params` is rebound here from the components dict to the list
        # of (value, gold key) pairs; the right-hand side is evaluated first,
        # so the lookups below still read from the dict. The same critic
        # weights are listed once each under the Y_real, Y_fake, and Y_interp
        # gold keys.
        params = [
            (dv["X_real"], "X_real"),
            (params["generator"]["FC1"]["W"], "G_weights_FC1"),
            (params["generator"]["FC2"]["W"], "G_weights_FC2"),
            (params["generator"]["FC3"]["W"], "G_weights_FC3"),
            (params["generator"]["FC4"]["W"], "G_weights_FC4"),
            (dv["G_fwd_X_fake"]["FC1"], "G_fwd_X_fake_FC1"),
            (dv["G_fwd_X_fake"]["FC2"], "G_fwd_X_fake_FC2"),
            (dv["G_fwd_X_fake"]["FC3"], "G_fwd_X_fake_FC3"),
            (dv["G_fwd_X_fake"]["FC4"], "G_fwd_X_fake_FC4"),
            (dv["X_fake"], "X_fake"),
            (dv["X_interp"], "X_interp"),
            (params["critic"]["FC1"]["W"], "C_weights_Y_real_FC1"),
            (params["critic"]["FC2"]["W"], "C_weights_Y_real_FC2"),
            (params["critic"]["FC3"]["W"], "C_weights_Y_real_FC3"),
            (params["critic"]["FC4"]["W"], "C_weights_Y_real_FC4"),
            (dv["C_fwd_Y_real"]["FC1"], "C_fwd_Y_real_FC1"),
            (dv["C_fwd_Y_real"]["FC2"], "C_fwd_Y_real_FC2"),
            (dv["C_fwd_Y_real"]["FC3"], "C_fwd_Y_real_FC3"),
            (dv["C_fwd_Y_real"]["FC4"], "C_fwd_Y_real_FC4"),
            (dv["Y_real"].flatten(), "Y_real"),
            (params["critic"]["FC1"]["W"], "C_weights_Y_fake_FC1"),
            (params["critic"]["FC2"]["W"], "C_weights_Y_fake_FC2"),
            (params["critic"]["FC3"]["W"], "C_weights_Y_fake_FC3"),
            (params["critic"]["FC4"]["W"], "C_weights_Y_fake_FC4"),
            (dv["C_fwd_Y_fake"]["FC1"], "C_fwd_Y_fake_FC1"),
            (dv["C_fwd_Y_fake"]["FC2"], "C_fwd_Y_fake_FC2"),
            (dv["C_fwd_Y_fake"]["FC3"], "C_fwd_Y_fake_FC3"),
            (dv["C_fwd_Y_fake"]["FC4"], "C_fwd_Y_fake_FC4"),
            (dv["Y_fake"].flatten(), "Y_fake"),
            (params["critic"]["FC1"]["W"], "C_weights_Y_interp_FC1"),
            (params["critic"]["FC2"]["W"], "C_weights_Y_interp_FC2"),
            (params["critic"]["FC3"]["W"], "C_weights_Y_interp_FC3"),
            (params["critic"]["FC4"]["W"], "C_weights_Y_interp_FC4"),
            (dv["C_fwd_Y_interp"]["FC1"], "C_fwd_Y_interp_FC1"),
            (dv["C_fwd_Y_interp"]["FC2"], "C_fwd_Y_interp_FC2"),
            (dv["C_fwd_Y_interp"]["FC3"], "C_fwd_Y_interp_FC3"),
            (dv["C_fwd_Y_interp"]["FC4"], "C_fwd_Y_interp_FC4"),
            (dv["Y_interp"].flatten(), "Y_interp"),
            (dv["C_dY_interp_wrt"]["FC4"], "dY_interp_wrt_FC4"),
            (dv["C_dY_interp_wrt"]["FC3"], "dY_interp_wrt_FC3"),
            (dv["C_dY_interp_wrt"]["FC2"], "dY_interp_wrt_FC2"),
            (dv["C_dY_interp_wrt"]["FC1"], "dY_interp_wrt_FC1"),
            (dv["gradInterp"], "gradInterp"),
            (dv["C_loss"], "C_loss"),
            (dv["G_loss"], "G_loss"),
            (grads["critic"]["FC1"]["W"], "dC_loss_dW_FC1"),
            (grads["critic"]["FC1"]["b"].flatten(), "dC_loss_db_FC1"),
            (grads["critic"]["FC2"]["W"], "dC_loss_dW_FC2"),
            (grads["critic"]["FC2"]["b"].flatten(), "dC_loss_db_FC2"),
            (grads["critic"]["FC3"]["W"], "dC_loss_dW_FC3"),
            (grads["critic"]["FC3"]["b"].flatten(), "dC_loss_db_FC3"),
            (grads["critic"]["FC4"]["W"], "dC_loss_dW_FC4"),
            (grads["critic"]["FC4"]["b"].flatten(), "dC_loss_db_FC4"),
            (dv["dG_Y_fake"].flatten(), "dG_Y_fake"),
            (dv["dY_real"].flatten(), "dC_Y_real"),
            (dv["dC_Y_fake"].flatten(), "dC_Y_fake"),
            (dv["dGrad_interp"], "dC_gradInterp"),
        ]

        print("\nTrial {}".format(i))
        print("Seed: {} g_hidden={}".format(ss, g_hidden))
        print("lambda={} n_ex={} n_in={}".format(lambda_, n_ex, n_in))
        print(
            "c_updates_per_epoch={}, n_steps={} batchsize={}".format(
                c_updates_per_epoch, n_steps, batchsize
            )
        )

        for ix, (mine, label) in enumerate(params):
            np.testing.assert_almost_equal(
                mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=3
            )
            print("\tPASSED {}".format(label))
        i += 1


================================================
FILE: numpy_ml/tests/test_nn_activations.py
================================================
# flake8: noqa
import time
import numpy as np

from numpy.testing import assert_almost_equal
from scipy.special import expit

import torch
import torch.nn.functional as F

from numpy_ml.utils.testing import random_stochastic_matrix, random_tensor


def torch_gradient_generator(fn, **kwargs):
    """Build a function that differentiates ``fn(z, **kwargs).sum()`` w.r.t. `z`.

    Parameters
    ----------
    fn : callable
        A torch function invoked as ``fn(tensor, **kwargs)``.
    **kwargs
        Keyword arguments forwarded to `fn` on every call.

    Returns
    -------
    get_grad : callable
        Takes a floating-point numpy array `z` and returns a numpy array with
        the gradient of ``fn(z, **kwargs).sum()`` with respect to `z`, as
        computed by torch autograd.
    """

    def get_grad(z):
        # `torch.autograd.Variable` is deprecated: tensors track gradients
        # themselves, so enable tracking directly on the tensor
        z1 = torch.from_numpy(z).requires_grad_(True)
        z2 = fn(z1, **kwargs).sum()
        z2.backward()
        return z1.grad.numpy()

    return get_grad


#######################################################################
#                           Debug Formatter                           #
#######################################################################


def err_fmt(params, golds, ix, warn_str=""):
    """Format a debug message for the comparison at position `ix`.

    Parameters
    ----------
    params : sequence of (value, label) pairs
        The computed values and the labels used to look up their references.
    golds : mapping
        Reference values keyed by label.
    ix : int
        Index into `params` of the entry being reported. The entry at
        ``max(ix - 1, 0)`` is reported as the "prev" entry.
    warn_str : str
        Extra text appended just before the closing banner.

    Returns
    -------
    str
        The assembled message: an opening banner, the previous and current
        "Mine"/"Theirs" sections, `warn_str`, and a closing banner.
    """
    cur_value, cur_label = params[ix]
    before_value, before_label = params[max(ix - 1, 0)]

    opening = "-" * 25 + " DEBUG " + "-" * 25 + "\n"
    before_section = "Mine (prev) [{}]:\n{}\n\nTheirs (prev) [{}]:\n{}".format(
        before_label, before_value, before_label, golds[before_label]
    )
    cur_section = "\n\nMine [{}]:\n{}\n\nTheirs [{}]:\n{}".format(
        cur_label, cur_value, cur_label, golds[cur_label]
    )
    closing = "\n" + "-" * 23 + " END DEBUG " + "-" * 23

    return opening + before_section + cur_section + warn_str + closing


#######################################################################
#                            Test Suite                               #
#######################################################################
#
#
#  def test_activations(N=50):
#      print("Testing Sigmoid activation")
#      time.sleep(1)
#      test_sigmoid_activation(N)
#      test_sigmoid_grad(N)
#
#      #  print("Testing Softmax activation")
#      #  time.sleep(1)
#      #  test_softmax_activation(N)
#      #  test_softmax_grad(N)
#
#      print("Testing Tanh activation")
#      time.sleep(1)
#      test_tanh_grad(N)
#
#      print("Testing ReLU activation")
#      time.sleep(1)
#      test_relu_activation(N)
#      test_relu_grad(N)
#
#      print("Testing ELU activation")
#      time.sleep(1)
#      test_elu_activation(N)
#      test_elu_grad(N)
#
#      print("Testing SELU activation")
#      time.sleep(1)
#      test_selu_activation(N)
#      test_selu_grad(N)
#
#      print("Testing LeakyRelu activation")
#      time.sleep(1)
#      test_leakyrelu_activation(N)
#      test_leakyrelu_grad(N)
#
#      print("Testing SoftPlus activation")
#      time.sleep(1)
#      test_softplus_activation(N)
#      test_softplus_grad(N)
#

#######################################################################
#                          Activations                                #
#######################################################################


def test_sigmoid_activation(N=50):
    """Check `Sigmoid.fn` against `scipy.special.expit` on random row vectors.

    Parameters
    ----------
    N : int or None
        Number of random trials to run. If None, the loop never terminates.
    """
    from numpy_ml.neural_nets.activations import Sigmoid

    n_trials = np.inf if N is None else N

    ours = Sigmoid()
    reference = expit

    completed = 0
    while completed < n_trials:
        width = np.random.randint(1, 100)
        z = random_tensor((1, width))
        assert_almost_equal(ours.fn(z), reference(z))
        print("PASSED")
        completed += 1


def test_softplus_activation(N=50):
    """Check `SoftPlus.fn` against `torch.nn.functional.softplus`.

    Inputs are drawn with ``random_stochastic_matrix(1, n_dims)`` and converted
    to a `torch.FloatTensor` for the reference computation.

    Parameters
    ----------
    N : int or None
        Number of random trials to run. If None, the loop never terminates.
    """
    from numpy_ml.neural_nets.activations import SoftPlus

    n_trials = np.inf if N is None else N

    ours = SoftPlus()

    def reference(z):
        return F.softplus(torch.FloatTensor(z)).numpy()

    completed = 0
    while completed < n_trials:
        width = np.random.randint(1, 100)
        z = random_stochastic_matrix(1, width)
        assert_almost_equal(ours.fn(z), reference(z))
        print("PASSED")
        completed += 1


def test_elu_activation(N=50):
    """Check `ELU.fn` against `torch.nn.functional.elu` for random `alpha`.

    Parameters
    ----------
    N : int or None
        Number of random trials to run. If None, the loop never terminates.
    """
    from numpy_ml.neural_nets.activations import ELU

    N = np.inf if N is None else N

    i = 0
    while i < N:
        n_dims = np.random.randint(1, 10)
        z = random_tensor((1, n_dims))

        alpha = np.random.uniform(0, 10)

        mine = ELU(alpha)
        # use the lambda's own `a` argument; it was previously ignored in
        # favor of the enclosing `alpha`
        gold = lambda z, a: F.elu(torch.from_numpy(z), a).numpy()

        assert_almost_equal(mine.fn(z), gold(z, alpha))
        print("PASSED")
        i += 1


def test_relu_activation(N=50):
    """Check `ReLU.fn` against `torch.nn.functional.relu`.

    Inputs are drawn with ``random_stochastic_matrix(1, n_dims)`` and converted
    to a `torch.FloatTensor` for the reference computation.

    Parameters
    ----------
    N : int or None
        Number of random trials to run. If None, the loop never terminates.
    """
    from numpy_ml.neural_nets.activations import ReLU

    n_trials = np.inf if N is None else N

    ours = ReLU()

    def reference(z):
        return F.relu(torch.FloatTensor(z)).numpy()

    completed = 0
    while completed < n_trials:
        width = np.random.randint(1, 100)
        z = random_stochastic_matrix(1, width)
        assert_almost_equal(ours.fn(z), reference(z))
        print("PASSED")
        completed += 1


def test_selu_activation(N=50):
    """Check `SELU.fn` against `torch.nn.functional.selu`.

    Inputs are drawn with ``random_stochastic_matrix(1, n_dims)`` and converted
    to a `torch.FloatTensor` for the reference computation.

    Parameters
    ----------
    N : int or None
        Number of random trials to run. If None, the loop never terminates.
    """
    from numpy_ml.neural_nets.activations import SELU

    n_trials = np.inf if N is None else N

    ours = SELU()

    def reference(z):
        return F.selu(torch.FloatTensor(z)).numpy()

    completed = 0
    while completed < n_trials:
        width = np.random.randint(1, 100)
        z = random_stochastic_matrix(1, width)
        assert_almost_equal(ours.fn(z), reference(z))
        print("PASSED")
        completed += 1


def test_leakyrelu_activation(N=50):
    """Check `LeakyReLU.fn` against `torch.nn.functional.leaky_relu`.

    A fresh `alpha` is drawn uniformly from [0, 10) on every trial and used as
    the negative slope for both implementations.

    Parameters
    ----------
    N : int or None
        Number of random trials to run. If None, the loop never terminates.
    """
    from numpy_ml.neural_nets.activations import LeakyReLU

    n_trials = np.inf if N is None else N

    completed = 0
    while completed < n_trials:
        width = np.random.randint(1, 100)
        z = random_stochastic_matrix(1, width)
        alpha = np.random.uniform(0, 10)

        ours = LeakyReLU(alpha=alpha)

        def reference(z):
            return F.leaky_relu(torch.FloatTensor(z), alpha).numpy()

        assert_almost_equal(ours.fn(z), reference(z))

        print("PASSED")
        completed += 1


def test_gelu_activation(N=50):
    """Check `GELU.fn` against `torch.nn.functional.gelu` and its own approximation.

    The exact (``approximate=False``) output is compared to torch with
    ``rtol=1e-3`` and then to the ``approximate=True`` output.

    Parameters
    ----------
    N : int or None
        Number of random trials to run. If None, the loop never terminates.
    """
    from numpy_ml.neural_nets.activations import GELU

    n_trials = np.inf if N is None else N

    completed = 0
    while completed < n_trials:
        width = np.random.randint(1, 100)
        z = random_stochastic_matrix(1, width)
        # drawn but never used; kept so the random stream is unchanged
        _ = np.random.choice([True, False])

        exact = GELU(approximate=False)
        approximate = GELU(approximate=True)

        def reference(z):
            return F.gelu(torch.FloatTensor(z)).numpy()

        np.testing.assert_allclose(exact.fn(z), reference(z), rtol=1e-3)
        assert_almost_equal(exact.fn(z), approximate.fn(z))

        print("PASSED")
        completed += 1


#######################################################################
#                      Activation Gradients                           #
#######################################################################


def test_sigmoid_grad(N=50):
    """Check `Sigmoid.grad` against the autograd gradient of `torch.sigmoid`.

    Parameters
    ----------
    N : int or None
        Number of random trials to run. If None, the loop never terminates.
    """
    from numpy_ml.neural_nets.activations import Sigmoid

    n_trials = np.inf if N is None else N

    ours = Sigmoid()
    reference_grad = torch_gradient_generator(torch.sigmoid)

    completed = 0
    while completed < n_trials:
        rows = np.random.randint(1, 100)
        cols = np.random.randint(1, 100)
        z = random_tensor((rows, cols))
        assert_almost_equal(ours.grad(z), reference_grad(z))
        print("PASSED")
        completed += 1


def test_elu_grad(N=50):
    """Check `ELU.grad` against the autograd gradient of `F.elu`.

    A fresh `alpha` is drawn uniformly from [0, 10) on every trial; results
    are compared to 6 decimal places.

    Parameters
    ----------
    N : int or None
        Number of random trials to run. If None, the loop never terminates.
    """
    from numpy_ml.neural_nets.activations import ELU

    n_trials = np.inf if N is None else N

    completed = 0
    while completed < n_trials:
        rows = np.random.randint(1, 10)
        cols = np.random.randint(1, 10)
        alpha = np.random.uniform(0, 10)
        z = random_tensor((rows, cols))

        ours = ELU(alpha)
        reference_grad = torch_gradient_generator(F.elu, alpha=alpha)
        assert_almost_equal(ours.grad(z), reference_grad(z), decimal=6)
        print("PASSED")
        completed += 1


def test_tanh_grad(N=50):
    """Check `Tanh.grad` against the autograd gradient of `torch.tanh`.

    Parameters
    ----------
    N : int or None
        Number of random trials to run. If None, the loop never terminates.
    """
    from numpy_ml.neural_nets.activations import Tanh

    n_trials = np.inf if N is None else N

    ours = Tanh()
    reference_grad = torch_gradient_generator(torch.tanh)

    completed = 0
    while completed < n_trials:
        rows = np.random.randint(1, 100)
        cols = np.random.randint(1, 100)
        z = random_tensor((rows, cols))
        assert_almost_equal(ours.grad(z), reference_grad(z))
        print("PASSED")
        completed += 1


def test_relu_grad(N=50):
    """Check `ReLU.grad` against the autograd gradient of `F.relu`.

    Parameters
    ----------
    N : int or None
        Number of random trials to run. If None, the loop never terminates.
    """
    from numpy_ml.neural_nets.activations import ReLU

    n_trials = np.inf if N is None else N

    ours = ReLU()
    reference_grad = torch_gradient_generator(F.relu)

    completed = 0
    while completed < n_trials:
        rows = np.random.randint(1, 100)
        cols = np.random.randint(1, 100)
        z = random_tensor((rows, cols))
        assert_almost_equal(ours.grad(z), reference_grad(z))
        print("PASSED")
        completed += 1


def test_gelu_grad(N=50):
    """Check `GELU.grad` against the autograd gradient of `F.gelu`.

    The exact (``approximate=False``) gradient is compared to torch at 3
    decimal places and then to the ``approximate=True`` gradient.

    Parameters
    ----------
    N : int or None
        Number of random trials to run. If None, the loop never terminates.
    """
    from numpy_ml.neural_nets.activations import GELU

    n_trials = np.inf if N is None else N

    exact = GELU(approximate=False)
    approximate = GELU(approximate=True)
    reference_grad = torch_gradient_generator(F.gelu)

    completed = 0
    while completed < n_trials:
        rows = np.random.randint(1, 100)
        cols = np.random.randint(1, 100)
        z = random_tensor((rows, cols))
        assert_almost_equal(exact.grad(z), reference_grad(z), decimal=3)
        assert_almost_equal(exact.grad(z), approximate.grad(z))
        print("PASSED")
        completed += 1


def test_selu_grad(N=50):
    """Check `SELU.grad` against the autograd gradient of `F.selu`.

    Results are compared to 6 decimal places.

    Parameters
    ----------
    N : int or None
        Number of random trials to run. If None, the loop never terminates.
    """
    from numpy_ml.neural_nets.activations import SELU

    n_trials = np.inf if N is None else N

    ours = SELU()
    reference_grad = torch_gradient_generator(F.selu)

    completed = 0
    while completed < n_trials:
        rows = np.random.randint(1, 100)
        cols = np.random.randint(1, 100)
        z = random_tensor((rows, cols))
        assert_almost_equal(ours.grad(z), reference_grad(z), decimal=6)
        print("PASSED")
        completed += 1


def test_leakyrelu_grad(N=50):
    """Check `LeakyReLU.grad` against the autograd gradient of `F.leaky_relu`.

    A fresh `alpha` is drawn uniformly from [0, 10) on every trial and passed
    to torch as `negative_slope`; results are compared to 6 decimal places.

    Parameters
    ----------
    N : int or None
        Number of random trials to run. If None, the loop never terminates.
    """
    from numpy_ml.neural_nets.activations import LeakyReLU

    n_trials = np.inf if N is None else N

    completed = 0
    while completed < n_trials:
        rows = np.random.randint(1, 10)
        cols = np.random.randint(1, 10)
        alpha = np.random.uniform(0, 10)
        z = random_tensor((rows, cols))

        ours = LeakyReLU(alpha)
        reference_grad = torch_gradient_generator(F.leaky_relu, negative_slope=alpha)
        assert_almost_equal(ours.grad(z), reference_grad(z), decimal=6)
        print("PASSED")
        completed += 1


def test_softplus_grad(N=50):
    """Check `SoftPlus.grad` against the autograd gradient of `F.softplus`.

    Inputs are drawn with ``random_tensor(..., standardize=True)``.

    Parameters
    ----------
    N : int or None
        Number of random trials to run. If None, the loop never terminates.
    """
    from numpy_ml.neural_nets.activations import SoftPlus

    n_trials = np.inf if N is None else N

    ours = SoftPlus()
    reference_grad = torch_gradient_generator(F.softplus)

    completed = 0
    while completed < n_trials:
        rows = np.random.randint(1, 100)
        cols = np.random.randint(1, 100)
        z = random_tensor((rows, cols), standardize=True)
        assert_almost_equal(ours.grad(z), reference_grad(z))
        print("PASSED")
        completed += 1


if __name__ == "__main__":
    # The aggregate `test_activations` runner exists only as commented-out
    # code, so calling it here raised NameError. Run every `test_*` function
    # defined in this module instead. `sorted(...)` snapshots the namespace so
    # the loop variables added to it do not disturb the iteration.
    for _name, _test in sorted(globals().items()):
        if _name.startswith("test_") and callable(_test):
            print("Running {}".format(_name))
            _test(N=50)


================================================
FILE: numpy_ml/tests/test_nonparametric.py
================================================
# flake8: noqa
import numpy as np

from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessRegressor

from numpy_ml.nonparametric.knn import KNN
from numpy_ml.nonparametric.gp import GPRegression
from numpy_ml.utils.distance_metrics import euclidean


def test_knn_regression(N=15):
    """Check `KNN` regression predictions against sklearn's `KNeighborsRegressor`.

    Each trial fits both models on the same random data (Euclidean distance,
    ball tree, random `k`, leaf size, and weighting scheme) and compares their
    predictions on a random test set element by element.

    Parameters
    ----------
    N : int
        Number of random trials to run.
    """
    np.random.seed(12345)

    i = 0
    while i < N:
        # keep the sample count in its own name: reusing `N` here overwrote
        # the trial count and made the number of trials random
        n_ex = np.random.randint(2, 100)
        M = np.random.randint(2, 100)
        k = np.random.randint(1, n_ex)
        ls = np.min([np.random.randint(1, 10), n_ex - 1])
        weights = np.random.choice(["uniform", "distance"])

        X = np.random.rand(n_ex, M)
        X_test = np.random.rand(n_ex, M)
        y = np.random.rand(n_ex)

        knn = KNN(
            k=k, leaf_size=ls, metric=euclidean, classifier=False, weights=weights
        )
        knn.fit(X, y)
        preds = knn.predict(X_test)

        gold = KNeighborsRegressor(
            p=2,
            leaf_size=ls,
            n_neighbors=k,
            weights=weights,
            metric="minkowski",
            algorithm="ball_tree",
        )
        gold.fit(X, y)
        gold_preds = gold.predict(X_test)

        for mine, theirs in zip(preds, gold_preds):
            np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")
        i += 1


def test_knn_clf(N=15):
    """Check `KNN` classification predictions against sklearn's `KNeighborsClassifier`.

    Each trial fits both models on the same random data (Euclidean distance,
    ball tree, uniform weights, random `k` and leaf size) and compares their
    predictions on a random test set element by element.

    Parameters
    ----------
    N : int
        Number of random trials to run.
    """
    np.random.seed(12345)

    i = 0
    while i < N:
        # keep the sample count in its own name: reusing `N` here overwrote
        # the trial count and made the number of trials random
        n_ex = np.random.randint(2, 100)
        M = np.random.randint(2, 100)
        k = np.random.randint(1, n_ex)
        n_classes = np.random.randint(2, 10)
        ls = np.min([np.random.randint(1, 10), n_ex - 1])
        weights = "uniform"

        X = np.random.rand(n_ex, M)
        X_test = np.random.rand(n_ex, M)
        y = np.random.randint(0, n_classes, size=n_ex)

        knn = KNN(k=k, leaf_size=ls, metric=euclidean, classifier=True, weights=weights)
        knn.fit(X, y)
        preds = knn.predict(X_test)

        gold = KNeighborsClassifier(
            p=2,
            metric="minkowski",
            leaf_size=ls,
            n_neighbors=k,
            weights=weights,
            algorithm="ball_tree",
        )
        gold.fit(X, y)
        gold_preds = gold.predict(X_test)

        for mine, theirs in zip(preds, gold_preds):
            np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")
        i += 1


def test_gp_regression(N=15):
    """Check `GPRegression` against sklearn's `GaussianProcessRegressor`.

    Each trial fits both models on the same random data with the same `alpha`
    and compares the test-set predictions and the marginal log likelihood.
    The sklearn model is built with ``kernel=None`` and ``optimizer=None``.

    Parameters
    ----------
    N : int
        Number of random trials to run.
    """
    np.random.seed(12345)

    i = 0
    while i < N:
        alpha = np.random.rand()
        # keep the sample count in its own name: reusing `N` here overwrote
        # the trial count and made the number of trials random
        n_ex = np.random.randint(2, 100)
        M = np.random.randint(2, 100)
        K = np.random.randint(1, n_ex)
        J = np.random.randint(1, 3)

        X = np.random.rand(n_ex, M)
        y = np.random.rand(n_ex, J)
        X_test = np.random.rand(K, M)

        gp = GPRegression(kernel="RBFKernel(sigma=1)", alpha=alpha)
        gold = GaussianProcessRegressor(
            kernel=None, alpha=alpha, optimizer=None, normalize_y=False
        )

        gp.fit(X, y)
        gold.fit(X, y)

        preds, _ = gp.predict(X_test)
        gold_preds = gold.predict(X_test)
        np.testing.assert_almost_equal(preds, gold_preds)

        mll = gp.marginal_log_likelihood()
        gold_mll = gold.log_marginal_likelihood()
        np.testing.assert_almost_equal(mll, gold_mll)

        print("PASSED")
        i += 1


================================================
FILE: numpy_ml/tests/test_preprocessing.py
================================================
# flake8: noqa
from collections import Counter

# gold-standard imports
import huffman
import numpy as np

from scipy.fftpack import dct

from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

try:
    from librosa.core.time_frequency import fft_frequencies
except ImportError:
    # for librosa >= 0.8.0
    from librosa import fft_frequencies
from librosa.feature import mfcc as lr_mfcc
from librosa.util import frame
from librosa.filters import mel

# numpy-ml implementations
from numpy_ml.preprocessing.general import Standardizer
from numpy_ml.preprocessing.nlp import HuffmanEncoder, TFIDFEncoder
from numpy_ml.preprocessing.dsp import (
    DCT,
    DFT,
    mfcc,
    to_frames,
    mel_filterbank,
    dft_bins,
)
from numpy_ml.utils.testing import random_paragraph


def test_huffman(N=15):
    """Check `HuffmanEncoder` codes against the `huffman` package's codebook.

    Each trial fits the encoder on a random paragraph and verifies that every
    key of the reference codebook appears in the encoder's `_item2code`
    mapping with the same code.

    Parameters
    ----------
    N : int
        Number of random trials to run.
    """
    np.random.seed(12345)

    mismatch_fmt = "their_dict['{}'] = {}, but my_dict['{}'] = {}"

    trial = 0
    while trial < N:
        n_words = np.random.randint(1, 100)
        para = random_paragraph(n_words)

        encoder = HuffmanEncoder()
        encoder.fit(para)
        my_dict = encoder._item2code
        their_dict = huffman.codebook(Counter(para).items())

        for token in their_dict:
            code = their_dict[token]
            assert token in my_dict, "key `{}` not in my_dict".format(token)
            assert my_dict[token] == code, mismatch_fmt.format(
                token, code, token, my_dict[token]
            )
        print("PASSED")
        trial += 1


def test_standardizer(N=15):
    """Compare `Standardizer` with scikit-learn's `StandardScaler`.

    Parameters
    ----------
    N : int
        Number of random trials to run. Default is 15.
    """
    np.random.seed(12345)

    # The data dimensions get their own names so that the trial count `N`
    # is never overwritten inside the loop.
    for _ in range(N):
        mean = bool(np.random.randint(2))
        std = bool(np.random.randint(2))
        n_ex = np.random.randint(2, 100)
        n_feats = np.random.randint(2, 100)
        X = np.random.rand(n_ex, n_feats)

        S = Standardizer(with_mean=mean, with_std=std)
        S.fit(X)
        mine = S.transform(X)

        theirs = StandardScaler(with_mean=mean, with_std=std)
        gold = theirs.fit_transform(X)

        np.testing.assert_almost_equal(mine, gold)
        print("PASSED")


def test_tfidf(N=15):
    """Compare `TFIDFEncoder` with scikit-learn's `TfidfVectorizer`.

    Each of the `N` trials builds a small corpus of newline-joined lines
    produced by `random_paragraph`, fits both encoders on it (with a randomly
    chosen `smooth_idf` setting), and checks the resulting matrices agree.
    """
    np.random.seed(12345)

    for _ in range(N):
        n_docs = np.random.randint(1, 10)
        docs = []
        for _doc in range(n_docs):
            n_lines = np.random.randint(1, 1000)
            lines = [random_paragraph(np.random.randint(1, 10)) for _ in range(n_lines)]
            docs.append("\n".join(" ".join(words) for words in lines))

        smooth = bool(np.random.randint(2))

        encoder = TFIDFEncoder(
            lowercase=True,
            min_count=0,
            smooth_idf=smooth,
            max_tokens=None,
            input_type="strings",
            filter_stopwords=False,
        )
        reference = TfidfVectorizer(
            input="content",
            norm=None,
            use_idf=True,
            lowercase=True,
            smooth_idf=smooth,
            sublinear_tf=False,
        )

        encoder.fit(docs)
        ours = encoder.transform(ignore_special_chars=True)
        expected = reference.fit_transform(docs).toarray()

        np.testing.assert_almost_equal(ours, expected)
        print("PASSED")


def test_dct(N=15):
    """Compare `DCT` with `scipy.fftpack.dct` on random signals.

    Parameters
    ----------
    N : int
        Number of random trials to run. Default is 15.
    """
    np.random.seed(12345)

    for _ in range(N):
        # Named separately from `N` so the trial count is not clobbered.
        n_samples = np.random.randint(2, 100)
        signal = np.random.rand(n_samples)
        ortho = bool(np.random.randint(2))
        mine = DCT(signal, orthonormal=ortho)
        theirs = dct(signal, norm="ortho" if ortho else None)

        np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")


def test_dft(N=15):
    """Compare the real part of `DFT` with `np.fft.rfft` on random signals.

    Only the real components are compared.

    Parameters
    ----------
    N : int
        Number of random trials to run. Default is 15.
    """
    np.random.seed(12345)

    for _ in range(N):
        # Named separately from `N` so the trial count is not clobbered.
        n_samples = np.random.randint(2, 100)
        signal = np.random.rand(n_samples)
        mine = DFT(signal)
        theirs = np.fft.rfft(signal)

        np.testing.assert_almost_equal(mine.real, theirs.real)
        print("PASSED")


def test_mfcc(N=1):
    """Compare `mfcc` with librosa's MFCC implementation.

    NOTE: the original docstring of this test read "Broken"; the reason was
    not recorded.

    Parameters
    ----------
    N : int
        Number of random trials to run. Default is 1.
    """
    np.random.seed(12345)

    for _ in range(N):
        # Named separately from `N` so the trial count is not clobbered
        # (previously the default single trial turned into hundreds).
        n_samples = np.random.randint(500, 1000)
        fs = np.random.randint(50, 100)
        n_mfcc = 12
        window_len = 100
        stride_len = 50
        n_filters = 20
        window_dur = window_len / fs
        stride_dur = stride_len / fs
        signal = np.random.rand(n_samples)

        mine = mfcc(
            signal,
            fs=fs,
            window="hann",
            window_duration=window_dur,
            stride_duration=stride_dur,
            lifter_coef=0,
            alpha=0,
            n_mfccs=n_mfcc,
            normalize=False,
            center=True,
            n_filters=n_filters,
            replace_intercept=False,
        )

        # All arguments are passed by keyword: recent librosa releases make
        # them keyword-only, and older releases accept the same names.
        theirs = lr_mfcc(
            y=signal,
            sr=fs,
            n_mels=n_filters,
            n_mfcc=n_mfcc,
            n_fft=window_len,
            hop_length=stride_len,
            htk=True,
        ).T

        np.testing.assert_almost_equal(mine, theirs, decimal=4)
        print("PASSED")


def test_framing(N=15):
    """Compare `to_frames` with `librosa.util.frame` on random signals.

    Parameters
    ----------
    N : int
        Number of random trials to run. Default is 15.
    """
    np.random.seed(12345)

    for _ in range(N):
        # Named separately from `N` so the trial count is not clobbered
        # (previously the loop ran for at least 500 iterations).
        n_samples = np.random.randint(500, 100000)
        window_len = np.random.randint(10, 100)
        stride_len = np.random.randint(1, 50)
        signal = np.random.rand(n_samples)

        mine = to_frames(signal, window_len, stride_len, writeable=False)
        theirs = frame(signal, frame_length=window_len, hop_length=stride_len).T

        assert len(mine) == len(theirs), "len(mine) = {}, len(theirs) = {}".format(
            len(mine), len(theirs)
        )
        np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")


def test_dft_bins(N=15):
    """Compare `dft_bins` with librosa's `fft_frequencies`.

    Parameters
    ----------
    N : int
        Number of random trials to run. Default is 15.
    """
    np.random.seed(12345)

    for _ in range(N):
        # Named separately from `N` so the trial count is not clobbered.
        n_fft = np.random.randint(500, 100000)
        fs = np.random.randint(50, 1000)

        mine = dft_bins(n_fft, fs=fs, positive_only=True)
        # Keyword arguments work on both old and new librosa releases; the
        # positional form is rejected by releases with keyword-only params.
        theirs = fft_frequencies(sr=fs, n_fft=n_fft)
        np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")


def test_mel_filterbank(N=15):
    """Compare `mel_filterbank` with `librosa.filters.mel`.

    Each trial draws a random sampling rate, filter count, and window length,
    and randomly toggles normalization (`normalize=True` on our side is
    compared against `norm="slaney"` on librosa's side).

    Parameters
    ----------
    N : int
        Number of random trials to run. Default is 15.
    """
    np.random.seed(12345)

    for _ in range(N):
        fs = np.random.randint(50, 10000)
        n_filters = np.random.randint(2, 20)
        window_len = np.random.randint(10, 100)
        norm = np.random.randint(2)

        mine = mel_filterbank(
            window_len, n_filters, fs, min_freq=0, max_freq=None, normalize=bool(norm)
        )

        # `sr` is passed by keyword: recent librosa releases make it
        # keyword-only, and older releases accept the same name.
        theirs = mel(
            sr=fs,
            n_fft=window_len,
            n_mels=n_filters,
            htk=True,
            norm="slaney" if norm == 1 else None,
        )

        np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")


================================================
FILE: numpy_ml/tests/test_trees.py
================================================
# flake8: noqa
import numpy as np

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.datasets import make_regression, make_blobs
from sklearn.model_selection import train_test_split

from numpy_ml.trees.gbdt import GradientBoostedDecisionTree
from numpy_ml.trees.dt import DecisionTree, Node, Leaf
from numpy_ml.trees.rf import RandomForest
from numpy_ml.utils.testing import random_tensor


def clone_tree(dtree):
    """Rebuild a fitted scikit-learn tree out of `Node` / `Leaf` objects.

    Parameters
    ----------
    dtree : fitted scikit-learn decision tree
        Must expose `tree_` with `children_left`, `children_right`,
        `feature`, `threshold`, and `value` arrays.

    Returns
    -------
    root : `Node` or `Leaf`
        The root of the cloned tree. A `Leaf` is returned when the fitted
        tree consists of a single node.
    """
    children_left = dtree.tree_.children_left
    children_right = dtree.tree_.children_right
    feature = dtree.tree_.feature
    threshold = dtree.tree_.threshold
    values = dtree.tree_.value

    def grow(node_id):
        l, r = children_left[node_id], children_right[node_id]
        # A node whose two child ids coincide has no real children: leaf.
        if l == r:
            return Leaf(values[node_id].argmax())
        n = Node(None, None, (feature[node_id], threshold[node_id]))
        n.left = grow(l)
        n.right = grow(r)
        return n

    # Start the recursion at the root itself so that a root-only tree is
    # cloned as a `Leaf` instead of recursing on its (nonexistent) children.
    return grow(0)


def compare_trees(mine, gold):
    """Assert that our decision tree has the same structure as a sklearn tree.

    Parameters
    ----------
    mine : object with a `root` attribute
        Our fitted tree; `root` is a `Node` or `Leaf`.
    gold : fitted scikit-learn decision tree
        Reference tree; it is converted with `clone_tree` before comparison.

    Raises
    ------
    AssertionError
        If a pair of internal nodes differs in feature or threshold, or a
        pair of leaves differs in value.
    ValueError
        If a `Node` is paired with a `Leaf` (the trees have different shapes).
    """

    def check(ours, theirs, depth):
        # Walk both trees in lockstep, carrying the depth for error messages.
        if isinstance(theirs, Node) and isinstance(ours, Node):
            assert ours.feature == theirs.feature, "Node {} not equal".format(depth)
            np.testing.assert_allclose(ours.threshold, theirs.threshold)
            check(ours.left, theirs.left, depth + 1)
            check(ours.right, theirs.right, depth + 1)
        elif isinstance(theirs, Leaf) and isinstance(ours, Leaf):
            np.testing.assert_allclose(ours.value, theirs.value)
        else:
            raise ValueError("Nodes at depth {} are not equal".format(depth))

    check(mine.root, clone_tree(gold), 0)


def test_DecisionTree(N=1):
    """Fit `DecisionTree` and the matching scikit-learn tree on random data.

    Each of the `N` trials draws either a classification or a regression
    problem, fits both models on the training split, and prints whether their
    losses agree on the training and test splits. A loss mismatch is printed
    but does not fail the test.
    """
    np.random.seed(12345)
    for trial in range(1, N + 1):
        n_ex = np.random.randint(2, 100)
        n_feats = np.random.randint(2, 100)
        max_depth = np.random.randint(1, 5)

        classifier = np.random.choice([True, False])
        if classifier:
            # classification data
            n_classes = np.random.randint(2, 10)
            X_all, Y_all = make_blobs(
                n_samples=n_ex,
                centers=n_classes,
                n_features=n_feats,
                random_state=trial,
            )
            X_train, X_test, Y_train, Y_test = train_test_split(
                X_all, Y_all, test_size=0.3, random_state=trial
            )

            # misclassification rate
            def loss(pred, target):
                return 1 - accuracy_score(pred, target)

            criterion = np.random.choice(["entropy", "gini"])
            model = DecisionTree(
                classifier=classifier, max_depth=max_depth, criterion=criterion
            )
            reference = DecisionTreeClassifier(
                criterion=criterion,
                max_depth=max_depth,
                splitter="best",
                random_state=trial,
            )
        else:
            # regression data
            X_all, Y_all = make_regression(
                n_samples=n_ex, n_features=n_feats, random_state=trial
            )
            X_train, X_test, Y_train, Y_test = train_test_split(
                X_all, Y_all, test_size=0.3, random_state=trial
            )

            criterion = "mse"
            loss = mean_squared_error
            model = DecisionTree(
                criterion=criterion, max_depth=max_depth, classifier=classifier
            )
            reference = DecisionTreeRegressor(
                criterion=criterion, max_depth=max_depth, splitter="best"
            )

        print("Trial {}".format(trial))
        print("\tClassifier={}, criterion={}".format(classifier, criterion))
        print("\tmax_depth={}, n_feats={}, n_ex={}".format(max_depth, n_feats, n_ex))
        if classifier:
            print("\tn_classes: {}".format(n_classes))

        # our model is fitted first, then the reference
        model.fit(X_train, Y_train)
        reference.fit(X_train, Y_train)

        # training-set losses
        train_loss_mine = loss(model.predict(X_train), Y_train)
        train_loss_gold = loss(reference.predict(X_train), Y_train)

        # test-set losses
        test_loss_mine = loss(model.predict(X_test), Y_test)
        test_loss_gold = loss(reference.predict(X_test), Y_test)

        reports = (
            (
                "\tLoss on training: {}",
                "\tTraining losses not equal:\n{}",
                train_loss_mine,
                train_loss_gold,
            ),
            (
                "\tLoss on test: {}",
                "\tTest losses not equal:\n{}",
                test_loss_mine,
                test_loss_gold,
            ),
        )
        for ok_fmt, fail_fmt, ours, theirs in reports:
            try:
                np.testing.assert_almost_equal(ours, theirs)
                print(ok_fmt.format(ours))
            except AssertionError as err:
                print(fail_fmt.format(err))


def test_RandomForest(N=1):
    """Fit `RandomForest` and the matching scikit-learn forest on random data.

    Each of the `N` trials draws either a classification or a regression
    problem, fits both ensembles on the training split, and prints whether
    their losses agree on the training and test splits. A loss mismatch is
    printed but does not fail the test.
    """
    np.random.seed(12345)
    for trial in range(1, N + 1):
        n_ex = np.random.randint(2, 100)
        n_feats = np.random.randint(2, 100)
        n_trees = np.random.randint(2, 100)
        max_depth = np.random.randint(1, 5)

        classifier = np.random.choice([True, False])
        if classifier:
            # classification data
            n_classes = np.random.randint(2, 10)
            X_all, Y_all = make_blobs(
                n_samples=n_ex,
                centers=n_classes,
                n_features=n_feats,
                random_state=trial,
            )
            X_train, X_test, Y_train, Y_test = train_test_split(
                X_all, Y_all, test_size=0.3, random_state=trial
            )

            # misclassification rate
            def loss(pred, target):
                return 1 - accuracy_score(pred, target)

            criterion = np.random.choice(["entropy", "gini"])
            model = RandomForest(
                classifier=classifier,
                n_feats=n_feats,
                n_trees=n_trees,
                criterion=criterion,
                max_depth=max_depth,
            )
            reference = RandomForestClassifier(
                n_estimators=n_trees,
                max_features=n_feats,
                criterion=criterion,
                max_depth=max_depth,
                bootstrap=True,
            )
        else:
            # regression data
            X_all, Y_all = make_regression(
                n_samples=n_ex, n_features=n_feats, random_state=trial
            )
            X_train, X_test, Y_train, Y_test = train_test_split(
                X_all, Y_all, test_size=0.3, random_state=trial
            )

            criterion = "mse"
            loss = mean_squared_error
            model = RandomForest(
                criterion=criterion,
                n_feats=n_feats,
                n_trees=n_trees,
                max_depth=max_depth,
                classifier=classifier,
            )
            reference = RandomForestRegressor(
                n_estimators=n_trees,
                max_features=n_feats,
                criterion=criterion,
                max_depth=max_depth,
                bootstrap=True,
            )

        print("Trial {}".format(trial))
        print("\tClassifier={}, criterion={}".format(classifier, criterion))
        print("\tmax_depth={}, n_feats={}, n_ex={}".format(max_depth, n_feats, n_ex))
        if classifier:
            print("\tn_classes: {}".format(n_classes))

        # our model is fitted first, then the reference
        model.fit(X_train, Y_train)
        reference.fit(X_train, Y_train)

        # training-set losses
        train_loss_mine = loss(model.predict(X_train), Y_train)
        train_loss_gold = loss(reference.predict(X_train), Y_train)

        # test-set losses
        test_loss_mine = loss(model.predict(X_test), Y_test)
        test_loss_gold = loss(reference.predict(X_test), Y_test)

        reports = (
            (
                "\tLoss on training: {}",
                "\tTraining losses not equal:\n{}",
                train_loss_mine,
                train_loss_gold,
            ),
            (
                "\tLoss on test: {}",
                "\tTest losses not equal:\n{}",
                test_loss_mine,
                test_loss_gold,
            ),
        )
        for ok_fmt, fail_fmt, ours, theirs in reports:
            try:
                np.testing.assert_almost_equal(ours, theirs)
                print(ok_fmt.format(ours))
            except AssertionError as err:
                print(fail_fmt.format(err))

        print("PASSED")


def test_gbdt(N=1):
    """Fit `GradientBoostedDecisionTree` next to a scikit-learn random forest.

    Each of the `N` trials draws either a classification or a regression
    problem, fits our boosted model and a scikit-learn random forest on the
    training split, and prints whether their losses agree on the training and
    test splits. A loss mismatch is printed but does not fail the test.
    """
    np.random.seed(12345)
    for trial in range(1, N + 1):
        n_ex = np.random.randint(2, 100)
        n_feats = np.random.randint(2, 100)
        n_trees = np.random.randint(2, 100)
        max_depth = np.random.randint(1, 5)

        classifier = np.random.choice([True, False])
        if classifier:
            # classification data
            n_classes = np.random.randint(2, 10)
            X_all, Y_all = make_blobs(
                n_samples=n_ex,
                centers=n_classes,
                n_features=n_feats,
                random_state=trial,
            )
            X_train, X_test, Y_train, Y_test = train_test_split(
                X_all, Y_all, test_size=0.3, random_state=trial
            )

            # misclassification rate
            def loss(pred, target):
                return 1 - accuracy_score(pred, target)

            # the drawn criterion is only used by the reference forest
            criterion = np.random.choice(["entropy", "gini"])
            model = GradientBoostedDecisionTree(
                n_iter=n_trees,
                classifier=classifier,
                max_depth=max_depth,
                learning_rate=0.1,
                loss="crossentropy",
                step_size="constant",
            )
            reference = RandomForestClassifier(
                n_estimators=n_trees,
                max_features=n_feats,
                criterion=criterion,
                max_depth=max_depth,
                bootstrap=True,
            )
        else:
            # regression data
            X_all, Y_all = make_regression(
                n_samples=n_ex, n_features=n_feats, random_state=trial
            )
            X_train, X_test, Y_train, Y_test = train_test_split(
                X_all, Y_all, test_size=0.3, random_state=trial
            )

            criterion = "mse"
            loss = mean_squared_error
            model = GradientBoostedDecisionTree(
                n_iter=n_trees,
                max_depth=max_depth,
                classifier=classifier,
                learning_rate=0.1,
                loss="mse",
                step_size="constant",
            )
            reference = RandomForestRegressor(
                n_estimators=n_trees,
                max_features=n_feats,
                criterion=criterion,
                max_depth=max_depth,
                bootstrap=True,
            )

        print("Trial {}".format(trial))
        print("\tClassifier={}, criterion={}".format(classifier, criterion))
        print("\tmax_depth={}, n_feats={}, n_ex={}".format(max_depth, n_feats, n_ex))
        if classifier:
            print("\tn_classes: {}".format(n_classes))

        # our model is fitted first, then the reference
        model.fit(X_train, Y_train)
        reference.fit(X_train, Y_train)

        # training-set losses
        train_loss_mine = loss(model.predict(X_train), Y_train)
        train_loss_gold = loss(reference.predict(X_train), Y_train)

        # test-set losses
        test_loss_mine = loss(model.predict(X_test), Y_test)
        test_loss_gold = loss(reference.predict(X_test), Y_test)

        reports = (
            (
                "\tLoss on training: {}",
                "\tTraining losses not equal:\n{}",
                train_loss_mine,
                train_loss_gold,
            ),
            (
                "\tLoss on test: {}",
                "\tTest losses not equal:\n{}",
                test_loss_mine,
                test_loss_gold,
            ),
        )
        for ok_fmt, fail_fmt, ours, theirs in reports:
            try:
                np.testing.assert_almost_equal(ours, theirs)
                print(ok_fmt.format(ours))
            except AssertionError as err:
                print(fail_fmt.format(err))

        print("PASSED")


================================================
FILE: numpy_ml/tests/test_utils.py
================================================
# flake8: noqa
import numpy as np

import scipy
import networkx as nx

from sklearn.neighbors import BallTree as sk_BallTree
from sklearn.metrics.pairwise import rbf_kernel as sk_rbf
from sklearn.metrics.pairwise import linear_kernel as sk_linear
from sklearn.metrics.pairwise import polynomial_kernel as sk_poly


from numpy_ml.utils.distance_metrics import (
    hamming,
    euclidean,
    chebyshev,
    manhattan,
    minkowski,
)
from numpy_ml.utils.kernels import LinearKernel, PolynomialKernel, RBFKernel
from numpy_ml.utils.data_structures import BallTree
from numpy_ml.utils.graphs import (
    Edge,
    DiGraph,
    UndirectedGraph,
    random_DAG,
    random_unweighted_graph,
)

#######################################################################
#                               Kernels                               #
#######################################################################


def test_linear_kernel(N=1):
    """Compare `LinearKernel` with scikit-learn's `linear_kernel`.

    Parameters
    ----------
    N : int
        Number of random trials to run. Default is 1.
    """
    np.random.seed(12345)
    for _ in range(N):
        # Dimensions get their own names so the trial count `N` is not
        # overwritten inside the loop.
        n_x = np.random.randint(1, 100)
        n_y = np.random.randint(1, 100)
        n_feats = np.random.randint(1, 1000)

        X = np.random.rand(n_x, n_feats)
        Y = np.random.rand(n_y, n_feats)

        mine = LinearKernel()(X, Y)
        gold = sk_linear(X, Y)

        np.testing.assert_almost_equal(mine, gold)
        print("PASSED")


def test_polynomial_kernel(N=1):
    """Compare `PolynomialKernel` with scikit-learn's `polynomial_kernel`.

    Parameters
    ----------
    N : int
        Number of random trials to run. Default is 1.
    """
    np.random.seed(12345)
    for _ in range(N):
        # Dimensions get their own names so the trial count `N` is not
        # overwritten inside the loop.
        n_x = np.random.randint(1, 100)
        n_y = np.random.randint(1, 100)
        n_feats = np.random.randint(1, 1000)
        gamma = np.random.rand()
        d = np.random.randint(1, 5)
        c0 = np.random.rand()

        X = np.random.rand(n_x, n_feats)
        Y = np.random.rand(n_y, n_feats)

        mine = PolynomialKernel(gamma=gamma, d=d, c0=c0)(X, Y)
        gold = sk_poly(X, Y, gamma=gamma, degree=d, coef0=c0)

        np.testing.assert_almost_equal(mine, gold)
        print("PASSED")


def test_radial_basis_kernel(N=1):
    """Compare `RBFKernel` with scikit-learn's `rbf_kernel`.

    Parameters
    ----------
    N : int
        Number of random trials to run. Default is 1.
    """
    np.random.seed(12345)
    for _ in range(N):
        # Dimensions get their own names so the trial count `N` is not
        # overwritten inside the loop.
        n_x = np.random.randint(1, 100)
        n_y = np.random.randint(1, 100)
        n_feats = np.random.randint(1, 1000)
        gamma = np.random.rand()

        X = np.random.rand(n_x, n_feats)
        Y = np.random.rand(n_y, n_feats)

        # sklearn (gamma) <-> mine (sigma) conversion:
        # gamma = 1 / (2 * sigma^2)
        # sigma = np.sqrt(1 / (2 * gamma))

        mine = RBFKernel(sigma=np.sqrt(1 / (2 * gamma)))(X, Y)
        gold = sk_rbf(X, Y, gamma=gamma)

        np.testing.assert_almost_equal(mine, gold)
        print("PASSED")


#######################################################################
#                          Distance Metrics                           #
#######################################################################


def test_euclidean(N=1):
    """Compare `euclidean` with `scipy.spatial.distance.euclidean`.

    Parameters
    ----------
    N : int
        Number of random trials to run. Default is 1.
    """
    np.random.seed(12345)
    for _ in range(N):
        # Named separately from `N` so the trial count is not clobbered.
        n_dims = np.random.randint(1, 100)
        x = np.random.rand(n_dims)
        y = np.random.rand(n_dims)
        mine = euclidean(x, y)
        theirs = scipy.spatial.distance.euclidean(x, y)
        np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")


def test_hamming(N=1):
    """Compare `hamming` with `scipy.spatial.distance.hamming`.

    The inputs are random integer vectors with entries in [0, 100].

    Parameters
    ----------
    N : int
        Number of random trials to run. Default is 1.
    """
    np.random.seed(12345)
    for _ in range(N):
        # Named separately from `N` so the trial count is not clobbered.
        n_dims = np.random.randint(1, 100)
        x = (np.random.rand(n_dims) * 100).round().astype(int)
        y = (np.random.rand(n_dims) * 100).round().astype(int)
        mine = hamming(x, y)
        theirs = scipy.spatial.distance.hamming(x, y)
        np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")


def test_minkowski(N=1):
    """Compare `minkowski` with `scipy.spatial.distance.minkowski`.

    The order `p` is drawn uniformly from [1, 11).

    Parameters
    ----------
    N : int
        Number of random trials to run. Default is 1.
    """
    np.random.seed(12345)
    for _ in range(N):
        # Named separately from `N` so the trial count is not clobbered.
        n_dims = np.random.randint(1, 100)
        p = 1 + np.random.rand() * 10
        x = np.random.rand(n_dims)
        y = np.random.rand(n_dims)
        mine = minkowski(x, y, p)
        theirs = scipy.spatial.distance.minkowski(x, y, p)
        np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")


def test_chebyshev(N=1):
    """Compare `chebyshev` with `scipy.spatial.distance.chebyshev`.

    Parameters
    ----------
    N : int
        Number of random trials to run. Default is 1.
    """
    np.random.seed(12345)
    for _ in range(N):
        # Named separately from `N` so the trial count is not clobbered.
        n_dims = np.random.randint(1, 100)
        x = np.random.rand(n_dims)
        y = np.random.rand(n_dims)
        mine = chebyshev(x, y)
        theirs = scipy.spatial.distance.chebyshev(x, y)
        np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")


def test_manhattan(N=1):
    """Compare `manhattan` with `scipy.spatial.distance.cityblock`.

    Parameters
    ----------
    N : int
        Number of random trials to run. Default is 1.
    """
    np.random.seed(12345)
    for _ in range(N):
        # Named separately from `N` so the trial count is not clobbered.
        n_dims = np.random.randint(1, 100)
        x = np.random.rand(n_dims)
        y = np.random.rand(n_dims)
        mine = manhattan(x, y)
        theirs = scipy.spatial.distance.cityblock(x, y)
        np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")


#######################################################################
#                           Data Structures                           #
#######################################################################


def test_ball_tree(N=1):
    """Compare `BallTree` k-nearest-neighbor queries with scikit-learn's.

    Each trial fits both trees on the same random points and checks that the
    `k` nearest neighbors of a random query, sorted by distance, agree in both
    coordinates and distance.

    Parameters
    ----------
    N : int
        Number of random trials to run. Default is 1.
    """
    np.random.seed(12345)
    for _ in range(N):
        # Dimensions get their own names so the trial count `N` is not
        # overwritten inside the loop.
        n_pts = np.random.randint(2, 100)
        n_dims = np.random.randint(2, 100)
        k = np.random.randint(1, n_pts)
        ls = np.min([np.random.randint(1, 10), n_pts - 1])

        X = np.random.rand(n_pts, n_dims)
        BT = BallTree(leaf_size=ls, metric=euclidean)
        BT.fit(X)

        x = np.random.rand(n_dims)
        mine = BT.nearest_neighbors(k, x)
        assert len(mine) == k

        mine_neighb = np.array([n.key for n in mine])
        mine_dist = np.array([n.distance for n in mine])

        sort_ix = np.argsort(mine_dist)
        mine_dist = mine_dist[sort_ix]
        mine_neighb = mine_neighb[sort_ix]

        sk = sk_BallTree(X, leaf_size=ls)
        theirs_dist, ind = sk.query(x.reshape(1, -1), k=k)
        sort_ix = np.argsort(theirs_dist.flatten())

        theirs_dist = theirs_dist.flatten()[sort_ix]
        theirs_neighb = X[ind.flatten()[sort_ix]]

        for j in range(len(theirs_dist)):
            np.testing.assert_almost_equal(mine_neighb[j], theirs_neighb[j])
            np.testing.assert_almost_equal(mine_dist[j], theirs_dist[j])

        print("PASSED")


#######################################################################
#                               Graphs                                #
#######################################################################


def from_networkx(G_nx):
    """Convert a networkx graph to my graph representation.

    Parameters
    ----------
    G_nx : networkx graph
        The graph to convert. Whether edges carry weights is decided by
        looking for a "weight" attribute on the first edge; a graph with no
        edges is treated as unweighted.

    Returns
    -------
    G : `DiGraph` or `UndirectedGraph`
        A `DiGraph` if `G_nx` is directed, otherwise an `UndirectedGraph`.
    """
    V = list(G_nx.nodes)
    edges = list(G_nx.edges)
    # Guard the first-edge lookup so an edgeless graph does not raise.
    is_weighted = bool(edges) and "weight" in G_nx[edges[0][0]][edges[0][1]]

    E = []
    for e in edges:
        if is_weighted:
            E.append(Edge(e[0], e[1], G_nx[e[0]][e[1]]["weight"]))
        else:
            E.append(Edge(e[0], e[1]))

    return DiGraph(V, E) if nx.is_directed(G_nx) else UndirectedGraph(V, E)


def to_networkx(G):
    """Convert my graph representation to a networkx graph.

    Every vertex in `G._V2I` is added as a node, and every edge stored in
    `G._G` is added with its `_w` attribute as the networkx "weight".
    """
    if G.is_directed:
        nx_graph = nx.DiGraph()
    else:
        nx_graph = nx.Graph()

    vertices = list(G._V2I.keys())
    nx_graph.add_nodes_from(vertices)

    for vertex in vertices:
        # `_V2I` maps a vertex to the index of its edge list in `_G`
        for edge in G._G[G._V2I[vertex]]:
            nx_graph.add_edge(edge.fr, edge.to, weight=edge._w)

    return nx_graph


def test_all_paths(N=1):
    """Compare `all_paths` with networkx's `all_simple_paths`.

    Each trial builds one random 5-vertex graph and checks the paths between
    every ordered pair of distinct vertices.

    Parameters
    ----------
    N : int
        Number of random graphs to test. Default is 1.
    """
    np.random.seed(12345)
    # One loop iteration per graph: the trial counter used to be advanced
    # inside the vertex-pair loops, so it counted pairs rather than graphs.
    for _ in range(N):
        p = np.random.rand()
        directed = np.random.rand() < 0.5
        G = random_unweighted_graph(n_vertices=5, edge_prob=p, directed=directed)

        nodes = G._I2V.keys()
        G_nx = to_networkx(G)

        # for each graph, test all_paths for all pairs of start and end
        # vertices. note that graph is not guaranteed to be connected, so many
        # paths will be empty
        for s_i in nodes:
            for e_i in nodes:
                if s_i == e_i:
                    continue

                paths = sorted(G.all_paths(s_i, e_i))
                paths_nx = sorted(
                    nx.all_simple_paths(G_nx, source=s_i, target=e_i, cutoff=10)
                )

                # zip() below stops at the shorter list, so a missing or
                # extra path would otherwise go unnoticed
                assert len(paths) == len(
                    paths_nx
                ), "found {} paths from {} to {}, but networkx found {}".format(
                    len(paths), s_i, e_i, len(paths_nx)
                )

                for p1, p2 in zip(paths, paths_nx):
                    np.testing.assert_array_equal(p1, p2)

        print("PASSED")


def test_random_DAG(N=1):
    """Check that graphs produced by `random_DAG` are acyclic per networkx.

    Runs `N` trials with a random edge probability and vertex count.
    """
    np.random.seed(12345)
    for _ in range(N):
        edge_prob = np.random.uniform(0.25, 1)
        n_vertices = np.random.randint(5, 50)

        dag = random_DAG(n_vertices, edge_prob)

        assert nx.is_directed_acyclic_graph(to_networkx(dag))
        print("PASSED")


def test_topological_ordering(N=1):
    """Check `topological_ordering` on graphs produced by `random_DAG`.

    Walks the returned ordering and asserts that no neighbor of a vertex has
    already been visited when that vertex is reached. Only graphs that
    networkx confirms are acyclic count toward the `N` trials.
    """
    np.random.seed(12345)
    n_done = 0
    while n_done < N:
        edge_prob = np.random.uniform(0.25, 1)
        n_vertices = np.random.randint(5, 10)

        dag = random_DAG(n_vertices, edge_prob)
        if not nx.is_directed_acyclic_graph(to_networkx(dag)):
            # not a DAG: draw another graph without counting this one
            continue

        ordering = dag.topological_ordering()

        visited = set()
        for vertex in ordering:
            visited.add(vertex)
            assert not any(child in visited for child in dag.get_neighbors(vertex))

        print("PASSED")
        n_done += 1


def test_is_acyclic(N=1):
    """Compare `is_acyclic` with networkx's `is_directed_acyclic_graph`.

    Runs `N` trials on random directed 10-vertex graphs.
    """
    np.random.seed(12345)
    for _ in range(N):
        edge_prob = np.random.rand()
        # this draw is not used below (the graph is always directed); it is
        # retained so the random stream is the same as before
        directed = np.random.rand() < 0.5
        graph = random_unweighted_graph(
            n_vertices=10, edge_prob=edge_prob, directed=True
        )
        nx_graph = to_networkx(graph)

        assert graph.is_acyclic() == nx.is_directed_acyclic_graph(nx_graph)
        print("PASSED")


================================================
FILE: numpy_ml/trees/README.md
================================================
# Tree-Based Models
This module implements:

1. [Decision trees](https://en.wikipedia.org/wiki/Decision_tree_learning) for classification and regression via the CART algorithm ([Breiman, Friedman, Olshen, & Stone, 1984](https://www.amazon.com/Classification-Regression-Wadsworth-Statistics-Probability/dp/0412048418)).
2. [Random forests](https://en.wikipedia.org/wiki/Random_forest) for classification and regression (an example of bootstrap
   aggregating) ([Breiman, 2001](https://www.stat.berkeley.edu/~breiman/randomforest2001.pdf)).
3. [Gradient-boosted decision
   trees](https://en.wikipedia.org/wiki/Gradient_boosting) for classification and regression (an
   example of gradient boosted machines) ([Friedman, 1999/2001](https://projecteuclid.org/download/pdf_1/euclid.aos/1013203451)).

## Plots
<p align="center">
   <img src="img/plot.png" height='800' align='center' />
</p>


================================================
FILE: numpy_ml/trees/__init__.py
================================================
from . import losses
from .dt import *
from .rf import *
from .gbdt import *


================================================
FILE: numpy_ml/trees/dt.py
================================================
import numpy as np


class Node:
    """An internal decision-tree node holding a split rule and two children."""

    def __init__(self, left, right, rule):
        """
        `left` and `right` are the child subtrees; `rule` is an indexable
        whose first two entries are the split feature and split threshold.
        """
        feature, threshold = rule[0], rule[1]
        self.left, self.right = left, right
        self.feature = feature
        self.threshold = threshold


class Leaf:
    """A terminal decision-tree node that stores a prediction value."""

    def __init__(self, value):
        """
        Store `value` on the leaf: an array of class probabilities if
        classifier is True, else the mean of the region.
        """
        self.value = value


class DecisionTree:
    def __init__(
        self,
        classifier=True,
        max_depth=None,
        n_feats=None,
        criterion="entropy",
        seed=None,
    ):
        """
        A decision tree model for regression and classification problems.

        Parameters
        ----------
        classifier : bool
            Whether to treat target values as categorical (classifier =
            True) or continuous (classifier = False). Default is True.
        max_depth: int or None
            The depth at which to stop growing the tree. If None (or any other
            falsy value), grow the tree until all leaves are pure or no
            further split is possible. Default is None.
        n_feats : int
            Specifies the number of features to sample on each split. If None,
            use all features on each split. Default is None.
        criterion : {'mse', 'entropy', 'gini'}
            The error criterion to use when calculating splits. When
            `classifier` is False, valid entries are {'mse'}. When `classifier`
            is True, valid entries are {'entropy', 'gini'}. Default is
            'entropy'.
        seed : int or None
            Seed for the (global) NumPy random number generator. Default is
            None, in which case the generator is left untouched.

        Raises
        ------
        ValueError
            If `criterion` is not a recognized value or is incompatible with
            `classifier`.
        """
        # compare against None so that `seed=0` still seeds the generator
        if seed is not None:
            np.random.seed(seed)

        # fail fast on typos instead of crashing later inside `_impurity_gain`
        if criterion not in ("mse", "entropy", "gini"):
            raise ValueError("Unrecognized criterion: {}".format(criterion))

        self.depth = 0
        self.root = None

        self.n_feats = n_feats
        self.criterion = criterion
        self.classifier = classifier
        self.max_depth = max_depth if max_depth else np.inf

        if not classifier and criterion in ["gini", "entropy"]:
            raise ValueError(
                "{} is a valid criterion only when classifier = True.".format(criterion)
            )
        if classifier and criterion == "mse":
            raise ValueError("`mse` is a valid criterion only when classifier = False.")

    def fit(self, X, Y):
        """
        Fit a binary decision tree to a dataset.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            The training data of `N` examples, each with `M` features
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
            An array of integer class labels for each example in `X` if
            self.classifier = True, otherwise the set of target values for
            each example in `X`.
        """
        # labels are assumed to be the integers 0, ..., n_classes - 1
        self.n_classes = np.max(Y) + 1 if self.classifier else None
        self.n_feats = X.shape[1] if not self.n_feats else min(self.n_feats, X.shape[1])

        # forget the depth of any previously fitted tree
        self.depth = 0
        self.root = self._grow(X, Y)

    def predict(self, X):
        """
        Use the trained decision tree to classify or predict the examples in `X`.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            The data of `N` examples, each with `M` features

        Returns
        -------
        preds : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
            The integer class labels predicted for each example in `X` if
            self.classifier = True, otherwise the predicted target values.
        """
        return np.array([self._traverse(x, self.root) for x in X])

    def predict_class_probs(self, X):
        """
        Use the trained decision tree to return the class probabilities for the
        examples in `X`.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            The data of `N` examples, each with `M` features

        Returns
        -------
        preds : :py:class:`ndarray <numpy.ndarray>` of shape `(N, n_classes)`
            The class probabilities predicted for each example in `X`.
        """
        assert self.classifier, "`predict_class_probs` undefined for classifier = False"
        return np.array([self._traverse(x, self.root, prob=True) for x in X])

    def _leaf_value(self, Y):
        """
        Summarize the targets `Y` that reach a leaf: the empirical class
        distribution for classifiers, the mean target otherwise.
        """
        if self.classifier:
            return np.bincount(Y, minlength=self.n_classes) / len(Y)
        return np.mean(Y, axis=0)

    def _grow(self, X, Y, cur_depth=0):
        """
        Recursively grow the (sub)tree for the examples `X` with targets `Y`,
        where `cur_depth` is the depth of the node being created.
        """
        # if all labels are the same, return a leaf
        if len(set(Y)) == 1:
            if self.classifier:
                prob = np.zeros(self.n_classes)
                prob[Y[0]] = 1.0
            return Leaf(prob) if self.classifier else Leaf(Y[0])

        # if we have reached max_depth, return a leaf
        if cur_depth >= self.max_depth:
            return Leaf(self._leaf_value(Y))

        N, M = X.shape
        feat_idxs = np.random.choice(M, self.n_feats, replace=False)

        # greedily select the best split according to `criterion`
        feat, thresh = self._segment(X, Y, feat_idxs)
        if feat is None:
            # no candidate split could be scored at all
            return Leaf(self._leaf_value(Y))

        l = np.argwhere(X[:, feat] <= thresh).flatten()
        r = np.argwhere(X[:, feat] > thresh).flatten()

        # If the chosen rule does not separate the examples (e.g., rows with
        # identical feature values but different targets), recursing would
        # never make progress: stop here and emit a leaf instead.
        if len(l) == 0 or len(r) == 0:
            return Leaf(self._leaf_value(Y))

        cur_depth += 1
        self.depth = max(self.depth, cur_depth)

        # grow the children that result from the split
        left = self._grow(X[l, :], Y[l], cur_depth)
        right = self._grow(X[r, :], Y[r], cur_depth)
        return Node(left, right, (feat, thresh))

    def _segment(self, X, Y, feat_idxs):
        """
        Find the optimal split rule (feature index and split threshold) for the
        data according to `self.criterion`.

        Candidate thresholds for a feature are the midpoints between its
        consecutive unique values (or the single value itself if the feature
        is constant). Returns ``(None, None)`` if no candidate improves on the
        initial gain of ``-inf``.
        """
        best_gain = -np.inf
        split_idx, split_thresh = None, None
        for i in feat_idxs:
            vals = X[:, i]
            levels = np.unique(vals)
            thresholds = (levels[:-1] + levels[1:]) / 2 if len(levels) > 1 else levels
            gains = np.array([self._impurity_gain(Y, t, vals) for t in thresholds])

            top = gains.argmax()
            if gains[top] > best_gain:
                split_idx = i
                best_gain = gains[top]
                split_thresh = thresholds[top]

        return split_idx, split_thresh

    def _impurity_gain(self, Y, split_thresh, feat_values):
        """
        Compute the impurity gain associated with a given split.

        IG(split) = loss(parent) - weighted_avg[loss(left_child), loss(right_child)]

        A split that leaves one side empty has a gain of 0.
        """
        if self.criterion == "entropy":
            loss = entropy
        elif self.criterion == "gini":
            loss = gini
        elif self.criterion == "mse":
            loss = mse
        else:
            raise ValueError("Unrecognized criterion: {}".format(self.criterion))

        parent_loss = loss(Y)

        # generate split
        left = np.argwhere(feat_values <= split_thresh).flatten()
        right = np.argwhere(feat_values > split_thresh).flatten()

        if len(left) == 0 or len(right) == 0:
            return 0

        # compute the weighted avg. of the loss for the children
        n = len(Y)
        n_l, n_r = len(left), len(right)
        e_l, e_r = loss(Y[left]), loss(Y[right])
        child_loss = (n_l / n) * e_l + (n_r / n) * e_r

        # impurity gain is difference in loss before vs. after split
        return parent_loss - child_loss

    def _traverse(self, X, node, prob=False):
        """
        Route the single example `X` from `node` down to a leaf and return the
        leaf's prediction (the full class-probability vector if `prob` is True
        and the tree is a classifier, its argmax otherwise).
        """
        if isinstance(node, Leaf):
            if self.classifier:
                return node.value if prob else node.value.argmax()
            return node.value
        if X[node.feature] <= node.threshold:
            return self._traverse(X, node.left, prob)
        return self._traverse(X, node.right, prob)


def mse(y):
    """
    Mean squared deviation of the targets `y` from their own mean, i.e., the
    error incurred by predicting the mean for every example.
    """
    residuals = y - np.mean(y)
    return np.mean(residuals ** 2)


def entropy(y):
    """
    Shannon entropy (in bits) of a sequence of non-negative integer labels.
    """
    counts = np.bincount(y)
    probs = counts / np.sum(counts)
    # labels that never occur contribute nothing to the entropy
    observed = probs[probs > 0]
    return -np.sum(observed * np.log2(observed))


def gini(y):
    """
    Gini impurity of a sequence of non-negative integer labels: one minus the
    sum of the squared label frequencies.
    """
    counts = np.bincount(y)
    total = np.sum(counts)
    squared_freqs = ((c / total) ** 2 for c in counts)
    return 1 - sum(squared_freqs)


================================================
FILE: numpy_ml/trees/gbdt.py
================================================
import numpy as np

from .dt import DecisionTree
from .losses import MSELoss, CrossEntropyLoss


def to_one_hot(labels, n_classes=None):
    """
    Convert an array of integer labels into a one-hot matrix.

    Parameters
    ----------
    labels : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        Integer class labels.
    n_classes : int or None
        The number of columns in the output. If None, use the largest label
        plus one. Default is None.

    Returns
    -------
    one_hot : :py:class:`ndarray <numpy.ndarray>` of shape `(N, n_classes)`
        Float matrix with a 1 in column ``labels[i]`` of row `i` and 0
        elsewhere.

    Raises
    ------
    ValueError
        If `labels` has more than one dimension.
    """
    if labels.ndim > 1:
        raise ValueError("labels must have dimension 1, but got {}".format(labels.ndim))

    n_examples = labels.size
    width = n_classes if n_classes is not None else np.max(labels) + 1

    encoded = np.zeros((n_examples, width))
    rows = np.arange(n_examples)
    encoded[rows, labels] = 1.0
    return encoded


class GradientBoostedDecisionTree:
    def __init__(
        self,
        n_iter,
        max_depth=None,
        classifier=True,
        learning_rate=1,
        loss="crossentropy",
        step_size="constant",
    ):
        r"""
        A gradient boosted ensemble of decision trees.

        Notes
        -----
        Gradient boosted machines (GBMs) fit an ensemble of `m` weak learners such that:

        .. math::

            f_m(X) = b(X) + \eta w_1 g_1 + \ldots + \eta w_m g_m

        where `b` is a fixed initial estimate for the targets, :math:`\eta` is
        a learning rate parameter, and :math:`w_{\cdot}` and :math:`g_{\cdot}`
        denote the weights and learner predictions for subsequent fits.

        We fit each `w` and `g` iteratively using a greedy strategy so that at each
        iteration `i`,

        .. math::

            w_i, g_i = \arg \min_{w_i, g_i} L(Y, f_{i-1}(X) + w_i g_i)

        On each iteration we fit a new weak learner to predict the negative
        gradient of the loss with respect to the previous prediction, :math:`f_{i-1}(X)`.
        We then use the element-wise product of the predictions of this weak
        learner, :math:`g_i`, with a weight, :math:`w_i`, to compute the amount to
        adjust the predictions of our model at the previous iteration, :math:`f_{i-1}(X)`:

        .. math::

            f_i(X) := f_{i-1}(X) + w_i g_i

        Parameters
        ----------
        n_iter : int
            The number of iterations / weak estimators to use when fitting each
            dimension / class of `Y`. The first "iteration" is the constant
            base estimator supplied by the loss.
        max_depth : int
            The maximum depth of each decision tree weak estimator. Default is
            None.
        classifier : bool
            Whether `Y` contains class labels or real-valued targets. Default
            is True.
        learning_rate : float
            Value in [0, 1] controlling the amount each weak estimator
            contributes to the overall model prediction. Sometimes known as the
            `shrinkage parameter` in the GBM literature. Default is 1.
        loss : {'crossentropy', 'mse'}
            The loss to optimize for the GBM. Default is 'crossentropy'.
        step_size : {"constant", "adaptive"}
            How to choose the weight for each weak learner. If "constant", use
            a fixed weight of 1 for each learner. If "adaptive", use a step
            size computed via line-search on the current iteration's loss.
            Default is 'constant'.
        """
        self.loss = loss
        self.weights = None
        self.learners = None
        self.out_dims = None
        self.n_iter = n_iter
        self.base_estimator = None
        self.max_depth = max_depth
        self.step_size = step_size
        self.classifier = classifier
        self.learning_rate = learning_rate

    def fit(self, X, Y):
        """
        Fit the gradient boosted decision trees on a dataset.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape (N, M)
            The training data of `N` examples, each with `M` features
        Y : :py:class:`ndarray <numpy.ndarray>` of shape (N,)
            An array of integer class labels for each example in `X` if
            ``self.classifier = True``, otherwise the set of target values for
            each example in `X`.

        Raises
        ------
        ValueError
            If ``self.loss`` is not one of 'mse' or 'crossentropy'.
        """
        if self.loss == "mse":
            loss = MSELoss()
        elif self.loss == "crossentropy":
            loss = CrossEntropyLoss()
        else:
            # previously this fell through and crashed on an unbound `loss`
            raise ValueError("Unrecognized loss: {}".format(self.loss))

        # convert Y to one_hot if not already
        if self.classifier:
            Y = to_one_hot(Y.flatten())
        else:
            Y = Y.reshape(-1, 1) if len(Y.shape) == 1 else Y

        N, M = X.shape
        self.out_dims = Y.shape[1]
        self.learners = np.empty((self.n_iter, self.out_dims), dtype=object)

        # the base estimator (row 0) always has weight 1; every subsequent
        # learner is shrunk by the learning rate
        self.weights = np.ones((self.n_iter, self.out_dims))
        self.weights[1:, :] *= self.learning_rate

        # fit the base estimator
        Y_pred = np.zeros((N, self.out_dims))
        for k in range(self.out_dims):
            t = loss.base_estimator()
            t.fit(X, Y[:, k])
            Y_pred[:, k] += t.predict(X)
            self.learners[0, k] = t

        # incrementally fit each learner on the negative gradient of the loss
        # wrt the previous fit (pseudo-residuals)
        for i in range(1, self.n_iter):
            for k in range(self.out_dims):
                y, y_pred = Y[:, k], Y_pred[:, k]
                neg_grad = -1 * loss.grad(y, y_pred)

                # use MSE as the surrogate loss when fitting to negative gradients
                t = DecisionTree(
                    classifier=False, max_depth=self.max_depth, criterion="mse"
                )

                # fit current learner to negative gradients
                t.fit(X, neg_grad)
                self.learners[i, k] = t

                # compute step size and weight for the current learner
                step = 1.0
                h_pred = t.predict(X)
                if self.step_size == "adaptive":
                    step = loss.line_search(y, y_pred, h_pred)

                # update weights and our overall prediction for Y
                self.weights[i, k] *= step
                Y_pred[:, k] += self.weights[i, k] * h_pred

    def predict(self, X):
        """
        Use the trained model to classify or predict the examples in `X`.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            The data of `N` examples, each with `M` features

        Returns
        -------
        preds : :py:class:`ndarray <numpy.ndarray>`
            If ``self.classifier = True``, an array of shape `(N,)` holding
            the predicted integer class label for each example in `X`.
            Otherwise an array of shape `(N, out_dims)` with the predicted
            target values.
        """
        # the ensemble prediction is the weighted sum of every learner's output
        Y_pred = np.zeros((X.shape[0], self.out_dims))
        for i in range(self.n_iter):
            for k in range(self.out_dims):
                Y_pred[:, k] += self.weights[i, k] * self.learners[i, k].predict(X)

        if self.classifier:
            Y_pred = Y_pred.argmax(axis=1)

        return Y_pred


================================================
FILE: numpy_ml/trees/losses.py
================================================
import numpy as np

#######################################################################
#                           Base Estimators                           #
#######################################################################


class ClassProbEstimator:
    """
    Constant base learner that predicts ``sum(y) / len(y)`` for every example
    (the class frequency when `y` is a 0/1 indicator vector).
    """

    def fit(self, X, y):
        """Record the average of `y`; `X` is ignored."""
        total = y.sum()
        self.class_prob = total / len(y)

    def predict(self, X):
        """Return a float64 vector with one copy of the stored value per row of `X`."""
        n_rows = X.shape[0]
        return np.full(n_rows, self.class_prob, dtype=np.float64)


class MeanBaseEstimator:
    """Constant base learner that predicts the mean of the training targets."""

    def fit(self, X, y):
        """Record the mean of `y`; `X` is ignored."""
        self.avg = np.mean(y)

    def predict(self, X):
        """Return a float64 vector with one copy of the stored mean per row of `X`."""
        n_rows = X.shape[0]
        return np.full(n_rows, self.avg, dtype=np.float64)


#######################################################################
#                           Loss Functions                            #
#######################################################################


class MSELoss:
    """Mean squared error loss together with the helpers used for boosting."""

    def __call__(self, y, y_pred):
        """Return the mean of the squared differences between `y` and `y_pred`."""
        residual = y - y_pred
        return np.mean(residual ** 2)

    def base_estimator(self):
        """Return a fresh constant (mean) estimator to initialize the ensemble."""
        return MeanBaseEstimator()

    def grad(self, y, y_pred):
        """Gradient of the loss with respect to `y_pred`."""
        scale = -2 / len(y)
        return scale * (y - y_pred)

    def line_search(self, y, y_pred, h_pred):
        """
        Step size for adding the weak-learner output `h_pred` to `y_pred`:
        the projection of the current residual onto `h_pred`.
        """
        # TODO: revise this
        numer = np.sum((y - y_pred) * h_pred)
        denom = np.sum(h_pred * h_pred)

        # if we perfectly fit the residuals, use max step size
        if denom == 0:
            return 1
        return numer / denom


class CrossEntropyLoss:
    """Cross-entropy loss together with the helpers used for boosting."""

    def __call__(self, y, y_pred):
        """Return ``-sum(y * log(y_pred + eps))`` with `eps` the float machine epsilon."""
        eps = np.finfo(float).eps
        log_pred = np.log(y_pred + eps)
        return -np.sum(y * log_pred)

    def base_estimator(self):
        """Return a fresh constant estimator to initialize the ensemble."""
        return ClassProbEstimator()

    def grad(self, y, y_pred):
        """Gradient of the loss with respect to `y_pred`."""
        eps = np.finfo(float).eps
        # eps keeps the division finite when a prediction is exactly 0
        return -y / (y_pred + eps)

    def line_search(self, y, y_pred, h_pred):
        """Adaptive step sizes are not available for this loss."""
        raise NotImplementedError


================================================
FILE: numpy_ml/trees/rf.py
================================================
import numpy as np
from .dt import DecisionTree


def bootstrap_sample(X, Y):
    """
    Draw a bootstrap replicate of a dataset: sample as many rows as `X` has,
    with replacement, and return the selected rows of `X` together with the
    matching entries of `Y`.
    """
    n_rows, _ = X.shape
    picks = np.random.choice(n_rows, n_rows, replace=True)
    return X[picks], Y[picks]


class RandomForest:
    def __init__(
        self, n_trees, max_depth, n_feats, classifier=True, criterion="entropy"
    ):
        """
        A bagged ensemble of decision trees in which every split considers
        only a random subset of the input features.

        Parameters
        ----------
        n_trees : int
            How many decision trees to grow.
        max_depth: int or None
            Depth limit passed to every tree. If None, each tree is grown
            until its leaves are pure.
        n_feats : int
            How many features each tree samples per split.
        classifier : bool
            True if `Y` holds class labels, False if it holds real-valued
            targets. Default is True.
        criterion : {'entropy', 'gini', 'mse'}
            Split criterion passed to every tree. Use 'mse' when
            ``classifier = False`` and 'entropy' or 'gini' when
            ``classifier = True``. Default is 'entropy'.
        """
        self.classifier = classifier
        self.criterion = criterion
        self.max_depth = max_depth
        self.n_feats = n_feats
        self.n_trees = n_trees
        self.trees = []

    def fit(self, X, Y):
        """
        Grow `n_trees` decision trees, each on its own bootstrap replicate of
        the training data `(X, Y)`. Any previously fitted trees are discarded.
        """
        self.trees = []
        for _ in range(self.n_trees):
            X_boot, Y_boot = bootstrap_sample(X, Y)
            learner = DecisionTree(
                n_feats=self.n_feats,
                max_depth=self.max_depth,
                criterion=self.criterion,
                classifier=self.classifier,
            )
            learner.fit(X_boot, Y_boot)
            self.trees.append(learner)

    def predict(self, X):
        """
        Predict the target value for each entry in `X`.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            `N` examples, each with `M` features.

        Returns
        -------
        y_pred : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
            The aggregated ensemble prediction for each entry in `X`.
        """
        # one row of predictions per tree, one column per example
        per_tree = [
            [tree._traverse(row, tree.root) for row in X] for tree in self.trees
        ]
        return self._vote(np.array(per_tree))

    def _vote(self, predictions):
        """
        Combine the per-tree predictions into one prediction per example.

        Parameters
        ----------
        predictions : :py:class:`ndarray <numpy.ndarray>` of shape `(n_trees, N)`
            Predictions of every tree for each of the `N` examples.

        Returns
        -------
        y_pred : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
            For classifiers, the most frequently predicted label per example
            (ties go to the smallest label); otherwise the mean prediction
            per example.
        """
        if self.classifier:
            combined = [np.bincount(column).argmax() for column in predictions.T]
            return np.array(combined)
        return np.array([np.mean(column) for column in predictions.T])


================================================
FILE: numpy_ml/utils/README.md
================================================
# Utilities

The utilities module implements a number of useful functions and objects that
power other ML algorithms across the repo.

- `data_structures.py` implements a few useful data structures
    - A max- and min-heap ordered priority queue
    - A [ball tree](https://en.wikipedia.org/wiki/Ball_tree) with the KNS1 algorithm ([Omohundro, 1989](http://ftp.icsi.berkeley.edu/ftp/pub/techreports/1989/tr-89-063.pdf); [Moore & Gray, 2006](http://people.ee.duke.edu/~lcarin/liu06a.pdf))
    - A discrete sampler implementing Vose's algorithm for the [alias method](https://en.wikipedia.org/wiki/Alias_method) ([Walker, 1977](https://dl.acm.org/citation.cfm?id=355749); [Vose, 1991](https://pdfs.semanticscholar.org/f65b/cde1fcf82e05388b31de80cba10bf65acc07.pdf))

- `kernels.py` implements several general-purpose similarity kernels
    - Linear kernel
    - Polynomial kernel
    - Radial basis function kernel

- `distance_metrics.py` implements common distance metrics
    - Euclidean (L2) distance
    - Manhattan (L1) distance
    - Chebyshev (L-infinity) distance
    - Minkowski-p distance
    - Hamming distance

- `graphs.py` implements simple data structures and algorithms for graph
  processing.
    - Undirected + directed graph objects allowing for probabilistic edge weights
    - Graph generators (Erdos-Renyi, random DAGs)
    - Topological sorting for DAGs
    - Cycle detection
    - Simple path-finding

- `windows.py` implements several common windowing functions
    - Hann
    - Hamming
    - Blackman-Harris
    - Generalized cosine

- `testing.py` implements helper functions that prove useful when writing unit
  tests, including data generators and various assert statements


================================================
FILE: numpy_ml/utils/__init__.py
================================================
"""Utilities module"""

from . import testing
from . import data_structures
from . import distance_metrics
from . import kernels
from . import windows
from . import graphs
from . import misc


================================================
FILE: numpy_ml/utils/data_structures.py
================================================
import heapq
from copy import copy

try:
    # the ABCs live in `collections.abc`; the aliases in `collections` were
    # removed in Python 3.10
    from collections.abc import Hashable
except ImportError:
    from collections import Hashable

import numpy as np

from .distance_metrics import euclidean

#######################################################################
#                           Priority Queue                            #
#######################################################################


class PQNode(object):
    def __init__(self, key, val, priority, entry_id, **kwargs):
        """
        A generic node object for holding entries in :class:`PriorityQueue`

        Parameters
        ----------
        key : object
            The key of the entry.
        val : object
            The value associated with `key`.
        priority : comparable
            The priority used to order nodes.
        entry_id : comparable
            Tie-breaker used to order nodes with equal priority.
        **kwargs
            Ignored; accepted so a node can be rebuilt from :meth:`to_dict`.
        """
        self.key = key
        self.val = val
        self.entry_id = entry_id
        self.priority = priority

    def __repr__(self):
        fstr = "PQNode(key={}, val={}, priority={}, entry_id={})"
        return fstr.format(self.key, self.val, self.priority, self.entry_id)

    def to_dict(self):
        """
        Return a dictionary representation of the node's contents.

        The result is a shallow copy of the node's attributes plus an ``"id"``
        entry, so modifying it never alters the node itself.
        """
        d = dict(self.__dict__)
        d["id"] = "PQNode"
        return d

    # Nodes are ordered by priority, with `entry_id` breaking ties. Comparisons
    # against other types return `NotImplemented` so that Python can try the
    # reflected operation or raise `TypeError`, rather than silently yielding
    # a truthy value.

    def __gt__(self, other):
        if not isinstance(other, PQNode):
            return NotImplemented
        if self.priority == other.priority:
            return self.entry_id > other.entry_id
        return self.priority > other.priority

    def __ge__(self, other):
        if not isinstance(other, PQNode):
            return NotImplemented
        if self.priority == other.priority:
            return self.entry_id >= other.entry_id
        return self.priority >= other.priority

    def __lt__(self, other):
        if not isinstance(other, PQNode):
            return NotImplemented
        if self.priority == other.priority:
            return self.entry_id < other.entry_id
        return self.priority < other.priority

    def __le__(self, other):
        if not isinstance(other, PQNode):
            return NotImplemented
        if self.priority == other.priority:
            return self.entry_id <= other.entry_id
        return self.priority <= other.priority


class PriorityQueue:
    def __init__(self, capacity, heap_order="max"):
        """
        A bounded priority queue backed by a binary heap.

        Notes
        -----
        Because the queue is stored as a binary heap (via :mod:`heapq`),
        :meth:`push` and :meth:`pop` are `O(log N)` operations. Whenever a
        push makes the queue exceed `capacity`, entries are popped until it
        fits again.

        Parameters
        ----------
        capacity: int
            The maximum number of items that can be held in the queue.
        heap_order: {"max", "min"}
            Which entry sits at the top of the heap. With 'max', :meth:`pop`
            and :meth:`peek` yield the entry with the largest priority, so an
            over-full queue evicts its largest priorities and keeps the
            `capacity` smallest ones. With 'min' the roles are reversed.
        """
        assert heap_order in ["max", "min"], "heap_order must be either 'max' or 'min'"
        self.heap_order = heap_order
        self.capacity = capacity

        self._entry_counter = 0
        self._count = 0
        self._pq = []

    def __repr__(self):
        template = "PriorityQueue(capacity={}, heap_order={}) with {} items"
        return template.format(self.capacity, self.heap_order, self._count)

    def __len__(self):
        return self._count

    def __iter__(self):
        # iterates over the stored PQNode objects in heap (not sorted) order
        return iter(self._pq)

    def push(self, key, priority, val=None):
        """
        Add a new (key, value) pair with priority `priority` to the queue.

        Notes
        -----
        The entry is always inserted first. If that leaves the queue over
        capacity, the top entries (largest priority for 'max' order, smallest
        for 'min' order) are popped until the queue fits again, which may
        discard the entry that was just pushed.

        Parameters
        ----------
        key : hashable object
            The key to insert into the queue.
        priority : comparable
            The priority for the `key`, `val` pair.
        val : object
            The value associated with `key`. Default is None.
        """
        # heapq only provides a min-heap, so 'max' order stores negated priorities
        stored = -1 * priority if self.heap_order == "max" else priority

        node = PQNode(key=key, val=val, priority=stored, entry_id=self._entry_counter)
        heapq.heappush(self._pq, node)

        self._count += 1
        self._entry_counter += 1

        while self._count > self.capacity:
            self.pop()

    def pop(self):
        """
        Remove the item with the largest/smallest (depending on
        ``self.heap_order``) priority from the queue and return it.

        Notes
        -----
        In contrast to :meth:`peek`, this operation is `O(log N)`. Popping
        from an empty queue raises the ``IndexError`` of the underlying heap.

        Returns
        -------
        item : dict
            The dictionary form (see :meth:`PQNode.to_dict`) of the removed
            entry, with ``"priority"`` reported on the caller's scale.
        """
        popped = heapq.heappop(self._pq).to_dict()
        if self.heap_order == "max":
            # undo the negation applied in `push`
            popped["priority"] = -1 * popped["priority"]
        self._count -= 1
        return popped

    def peek(self):
        """
        Return the item with the largest/smallest (depending on
        ``self.heap_order``) priority *without* removing it from the queue.

        Notes
        -----
        In contrast to :meth:`pop`, this operation is O(1).

        Returns
        -------
        item : dict or None
            A copy of the dictionary form of the top entry, with
            ``"priority"`` reported on the caller's scale, or None if the
            queue is empty.
        """
        if self._count <= 0:
            return None

        top = copy(self._pq[0].to_dict())
        if self.heap_order == "max":
            top["priority"] = -1 * top["priority"]
        return top


#######################################################################
#                              Ball Tree                              #
#######################################################################


class BallTreeNode:
    def __init__(self, centroid=None, X=None, y=None):
        """
        A node of a :class:`BallTree`.

        Parameters
        ----------
        centroid : object or None
            The center of the ball represented by this node. Default is None.
        X : object or None
            The data stored at the node (kept on the ``data`` attribute).
            Default is None.
        y : object or None
            The targets associated with `X` (kept on the ``targets``
            attribute). Default is None.
        """
        # children, radius and the leaf flag are filled in by the tree builder
        self.left = None
        self.right = None
        self.radius = None
        self.is_leaf = False

        self.data = X
        self.targets = y
        self.centroid = centroid

    def __repr__(self):
        fstr = "BallTreeNode(centroid={}, is_leaf={})"
        return fstr.format(self.centroid, self.is_leaf)

    def to_dict(self):
        """
        Return a dictionary representation of the node's contents.

        The result is a shallow copy of the node's attributes plus an ``"id"``
        entry, so building it does not add attributes to the node.
        """
        d = dict(self.__dict__)
        d["id"] = "BallTreeNode"
        return d


class BallTree:
    def __init__(self, leaf_size=40, metric=None):
        """
        A ball tree data structure.

        Notes
        -----
        A ball tree is a binary tree in which every node defines a
        `D`-dimensional hypersphere ("ball") containing a subset of the points
        to be searched. Each internal node of the tree partitions the data
        points into two disjoint sets which are associated with different
        balls. While the balls themselves may intersect, each point is
        assigned to exactly one of the two child balls. Each leaf node in the
        tree defines a ball and enumerates all data points inside that ball.

        Parameters
        ----------
        leaf_size : int
            The maximum number of datapoints at each leaf. Default is 40.
        metric : :doc:`Distance metric <numpy_ml.utils.distance_metrics>` or None
            The distance metric to use for computing nearest neighbors. If
            None, use the :func:`~numpy_ml.utils.distance_metrics.euclidean`
            metric. Default is None.

        References
        ----------
        .. [1] Omohundro, S. M. (1989). "Five balltree construction algorithms". *ICSI
           Technical Report TR-89-063*.
        .. [2] Liu, T., Moore, A., & Gray A. (2006). "New algorithms for efficient
           high-dimensional nonparametric classification". *J. Mach. Learn. Res.,
           7*, 1135-1158.
        """
        self.root = None
        self.leaf_size = leaf_size
        if metric is None:
            metric = euclidean
        self.metric = metric

    def fit(self, X, y=None):
        r"""
        Build a ball tree recursively using the O(M log N) `k`-d construction
        algorithm.

        Notes
        -----
        Recursively divides data into nodes defined by a centroid `C` and radius
        `r` such that each point below the node lies within the hyper-sphere
        defined by `C` and `r`. The root is always split, regardless of
        `leaf_size`.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            An array of `N` examples each with `M` features.
        y : :py:class:`ndarray <numpy.ndarray>` of shape `(N, \*)` or None
            An array of target values / labels associated with the entries in
            `X`. Default is None.
        """
        pivot, lo_X, lo_y, hi_X, hi_y = self._split(X, y)

        self.root = BallTreeNode(centroid=pivot)
        root = self.root
        # the ball must reach the point farthest from its center
        root.radius = np.max([self.metric(pivot, row) for row in X])
        root.left = self._build_tree(lo_X, lo_y)
        root.right = self._build_tree(hi_X, hi_y)

    def _build_tree(self, X, y):
        """
        Recursively build the subtree for the points `X` (with optional
        targets `y`); subsets of at most `leaf_size` points become leaves
        that store their data.
        """
        pivot, lo_X, lo_y, hi_X, hi_y = self._split(X, y)
        small_enough = X.shape[0] <= self.leaf_size

        if small_enough:
            node = BallTreeNode(centroid=pivot, X=X, y=y)
        else:
            node = BallTreeNode(centroid=pivot)
        node.radius = np.max([self.metric(pivot, row) for row in X])

        if small_enough:
            node.is_leaf = True
            return node

        node.left = self._build_tree(lo_X, lo_y)
        node.right = self._build_tree(hi_X, hi_y)
        return node

    def _split(self, X, y=None):
        """
        Sort the points along the dimension of greatest variance and cut them
        at the median position.

        Returns the median point (used as the centroid) followed by the left
        points, left targets, right points and right targets. The target
        entries are None when `y` is None.
        """
        # find the dimension with greatest variance
        axis = np.argmax(np.var(X, axis=0))

        # sort X and y along that dimension
        order = np.argsort(X[:, axis])
        X = X[order]
        if y is not None:
            y = y[order]

        # divide at the median position; the median point itself always
        # lands in the right half
        mid = X.shape[0] // 2
        centroid = X[mid]

        if y is None:
            left_y = right_y = None
        else:
            left_y, right_y = y[:mid], y[mid:]
        return centroid, X[:mid], left_y, X[mid:], right_y

    def nearest_neighbors(self, k, x):
        """
        Find the `k` nearest neighbors in the ball tree to a query vector `x`
        using the KNS1 algorithm.

        Parameters
        ----------
        k : int
            The number of closest points in `X` to return
        x : :py:class:`ndarray <numpy.ndarray>` of shape `(1, M)`
            The query vector.

        Returns
        -------
        nearest : :class:`PriorityQueue` of at most `k` :class:`PQNode` s
            The queue holding the points in `X` closest to the query vector.
            Iterating over it yields :class:`PQNode` objects whose ``key``
            attribute contains the point itself, whose ``val`` attribute
            contains its target, and whose ``distance`` attribute contains
            its distance to the query vector.
        """
        # maintain a max-first priority queue with priority = distance to x
        queue = PriorityQueue(capacity=k, heap_order="max")
        nearest = self._knn(k, x, queue, self.root)
        for node in nearest:
            node.distance = self.metric(x, node.key)
        return nearest

    def _knn(self, k, x, PQ, root):
        """
        Recursive KNS1 search below `root`: push candidate points onto `PQ`
        and skip any ball that cannot contain a point closer than the current
        `k`-th neighbor. Returns `PQ`.
        """
        dist = self.metric
        dist_to_ball = dist(x, root.centroid) - root.radius
        if len(PQ) > 0:
            dist_to_farthest_neighbor = dist(x, PQ.peek()["key"])
        else:
            dist_to_farthest_neighbor = np.inf

        # prune: the queue is full and nothing in this ball can beat its worst entry
        if dist_to_ball >= dist_to_farthest_neighbor and len(PQ) == k:
            return PQ

        if root.is_leaf:
            targets = root.targets
            if targets is None:
                targets = [None] * len(root.data)
            # push every point; the bounded queue evicts the farthest ones
            for point, target in zip(root.data, targets):
                PQ.push(key=point, val=target, priority=dist(x, point))
            return PQ

        # descend first into the child whose centroid is closer to the query
        l_closest = dist(x, root.left.centroid) < dist(x, root.right.centroid)
        PQ = self._knn(k, x, PQ, root.left if l_closest else root.right)
        PQ = self._knn(k, x, PQ, root.right if l_closest else root.left)
        return PQ


#######################################################################
#                         Multinomial Sampler                         #
#######################################################################


class DiscreteSampler:
    def __init__(self, probs, log=False, with_replacement=True):
        """
        Sample from an arbitrary multinomial PMF over the first `N` nonnegative
        integers using Vose's algorithm for the alias method.

        Notes
        -----
        Vose's algorithm takes `O(n)` time to initialize, requires `O(n)` memory,
        and generates samples in constant time.

        References
        ----------
        .. [1] Walker, A. J. (1977) "An efficient method for generating discrete
           random variables with general distributions". *ACM Transactions on
           Mathematical Software, 3(3)*, 253-256.

        .. [2] Vose, M. D. (1991) "A linear algorithm for generating random numbers
           with a given distribution". *IEEE Trans. Softw. Eng., 9*, 972-974.

        .. [3] Schwarz, K (2011) "Darts, dice, and coins: sampling from a discrete
           distribution". http://www.keithschwarz.com/darts-dice-coins/

        Parameters
        ----------
        probs : :py:class:`ndarray <numpy.ndarray>` of length `(N,)`
            A list of probabilities of the `N` outcomes in the sample space.
            `probs[i]` returns the probability of outcome `i`.
        log : bool
            Whether the probabilities in `probs` are in logspace. Default is
            False.
        with_replacement : bool
            Whether to generate samples with or without replacement. Default is
            True.
        """
        if not isinstance(probs, np.ndarray):
            probs = np.array(probs)

        self.log = log
        self.N = len(probs)
        self.probs = probs
        self.with_replacement = with_replacement

        alias = np.zeros(self.N)
        prob = np.zeros(self.N)

        # rescale so that the "average" outcome has probability 1 (0 in log
        # space); this builds a new array, so `self.probs` is never modified
        scaled_probs = self.probs + np.log(self.N) if log else self.probs * self.N

        # the value representing "probability one" in the working space
        certain = 0 if log else 1

        selector = scaled_probs < certain
        small, large = np.where(selector)[0].tolist(), np.where(~selector)[0].tolist()

        while len(small) and len(large):
            l, g = small.pop(), large.pop()

            alias[l] = g
            prob[l] = scaled_probs[l]

            # move the mass that `g` donated to `l` out of `g`
            if log:
                pg = np.log(np.exp(scaled_probs[g]) + np.exp(scaled_probs[l]) - 1)
            else:
                pg = scaled_probs[g] + scaled_probs[l] - 1

            scaled_probs[g] = pg
            if pg < certain:
                small.append(g)
            else:
                large.append(g)

        # whatever is left over keeps its own column with certainty
        for ix in large + small:
            prob[ix] = certain

        self.prob_table = prob
        self.alias_table = alias

    def __call__(self, n_samples=1):
        """
        Generate random draws from the `probs` distribution over integers in
        [0, N).

        Parameters
        ----------
        n_samples: int
            The number of samples to generate. Default is 1.

        Returns
        -------
        sample : :py:class:`ndarray <numpy.ndarray>` of shape `(n_samples,)`
            A collection of draws from the distribution defined by `probs`.
            Each sample is an int in the range `[0, N)`.
        """
        return self.sample(n_samples)

    def sample(self, n_samples=1):
        """
        Generate random draws from the `probs` distribution over integers in
        [0, N).

        Parameters
        ----------
        n_samples: int
            The number of samples to generate. Default is 1.

        Returns
        -------
        sample : :py:class:`ndarray <numpy.ndarray>` of shape `(n_samples,)`
            A collection of draws from the distribution defined by `probs`.
            Each sample is an int in the range `[0, N)`.

        Raises
        ------
        ValueError
            If sampling without replacement and `n_samples` exceeds the number
            of outcomes with non-zero probability. The rejection loop below
            could never terminate in that case.
        """
        if not self.with_replacement:
            zero = -np.inf if self.log else 0
            n_reachable = int(np.count_nonzero(self.probs > zero))
            if n_samples > n_reachable:
                raise ValueError(
                    "Cannot draw {} unique samples from a distribution with only "
                    "{} outcomes of non-zero probability".format(n_samples, n_reachable)
                )

        # pick a column uniformly, then flip that column's biased coin to
        # decide between the column index and its alias
        ixs = np.random.randint(0, self.N, n_samples)
        p = np.exp(self.prob_table[ixs]) if self.log else self.prob_table[ixs]
        flips = np.random.binomial(1, p)
        samples = np.where(flips, ixs, self.alias_table[ixs]).astype(int).tolist()

        # do recursive rejection sampling to sample without replacement
        if not self.with_replacement:
            unique = list(set(samples))
            while len(samples) != len(unique):
                n_new = len(samples) - len(unique)
                samples = unique + self.sample(n_new).tolist()
                unique = list(set(samples))

        return np.array(samples, dtype=int)


#######################################################################
#                                Dict                                 #
#######################################################################


class Dict(dict):
    def __init__(self, encoder=None):
        """
        A ``dict`` subclass whose lookups fall back to the queried key itself
        when the key is absent.

        Parameters
        ----------
        encoder : function or None
            A function applied to every key before it is stored or looked up.
            When an encoder is set, looking up an unseen key registers it and
            returns a fresh integer id (0, 1, 2, ...) rather than the key.
            If None, keys are used as-is, except that unhashable keys are
            converted to tuples. Default is None.
        """
        super().__init__()
        self._encoder = encoder
        self._id_max = 0

    def __setitem__(self, key, value):
        """Store `value` under the encoded (or tuple-ified) form of `key`."""
        if self._encoder is None:
            if not isinstance(key, Hashable):
                key = tuple(key)
        else:
            key = self._encoder(key)
        super().__setitem__(key, value)

    def _encode_key(self, key):
        """
        Return the value stored for the encoded `key`, assigning the next
        unused integer id first if the encoded key has not been seen yet.
        """
        encoded = self._encoder(key)
        if super().__contains__(encoded):
            return super().__getitem__(encoded)

        new_id = self._id_max
        super().__setitem__(encoded, new_id)
        self._id_max += 1
        return new_id

    def __getitem__(self, key):
        """Look up `key`; see the class description for the fallback rules."""
        # remember the untouched key so that __missing__ can hand it back
        self._key = copy.deepcopy(key)
        if self._encoder is not None:
            return self._encode_key(key)

        lookup = key if isinstance(key, Hashable) else tuple(key)
        return super().__getitem__(lookup)

    def __missing__(self, key):
        """Return the key most recently passed to ``__getitem__``."""
        return self._key


================================================
FILE: numpy_ml/utils/distance_metrics.py
================================================
import numpy as np


def euclidean(x, y):
    r"""
    Compute the Euclidean (`L2`) distance between two real vectors.

    Notes
    -----
    For vectors **x** and **y** the Euclidean distance is

    .. math::

        d(\mathbf{x}, \mathbf{y}) = \sqrt{ \sum_i (x_i - y_i)^2  }

    Parameters
    ----------
    x,y : :py:class:`ndarray <numpy.ndarray>` s of shape `(N,)`
        The two vectors to compute the distance between

    Returns
    -------
    d : float
        The L2 distance between **x** and **y**.
    """
    delta = x - y
    sum_of_squares = np.sum(delta ** 2)
    return np.sqrt(sum_of_squares)


def manhattan(x, y):
    r"""
    Compute the Manhattan (`L1`) distance between two real vectors.

    Notes
    -----
    For vectors **x** and **y** the Manhattan distance is

    .. math::

        d(\mathbf{x}, \mathbf{y}) = \sum_i |x_i - y_i|

    Parameters
    ----------
    x,y : :py:class:`ndarray <numpy.ndarray>` s of shape `(N,)`
        The two vectors to compute the distance between

    Returns
    -------
    d : float
        The L1 distance between **x** and **y**.
    """
    abs_delta = np.abs(x - y)
    return np.sum(abs_delta)


def chebyshev(x, y):
    r"""
    Compute the Chebyshev (:math:`L_\infty`) distance between two real vectors.

    Notes
    -----
    For vectors **x** and **y** the Chebyshev distance is

    .. math::

        d(\mathbf{x}, \mathbf{y}) = \max_i |x_i - y_i|

    Parameters
    ----------
    x,y : :py:class:`ndarray <numpy.ndarray>` s of shape `(N,)`
        The two vectors to compute the distance between

    Returns
    -------
    d : float
        The Chebyshev distance between **x** and **y**.
    """
    abs_delta = np.abs(x - y)
    return np.max(abs_delta)


def minkowski(x, y, p):
    r"""
    Compute the Minkowski-`p` distance between two real vectors.

    Notes
    -----
    For vectors **x** and **y** the Minkowski-`p` distance is

    .. math::

        d(\mathbf{x}, \mathbf{y}) = \left( \sum_i |x_i - y_i|^p \right)^{1/p}

    Parameters
    ----------
    x,y : :py:class:`ndarray <numpy.ndarray>` s of shape `(N,)`
        The two vectors to compute the distance between
    p : float >= 1
        The parameter of the distance function. `p = 1` gives the `L1`
        distance and `p = 2` gives the `L2` distance. For `p < 1`,
        Minkowski-`p` does not satisfy the triangle inequality and hence is
        not a valid distance metric.

    Returns
    -------
    d : float
        The Minkowski-`p` distance between **x** and **y**.
    """
    powered = np.abs(x - y) ** p
    total = np.sum(powered)
    return total ** (1 / p)


def hamming(x, y):
    r"""
    Compute the Hamming distance between two integer-valued vectors.

    Notes
    -----
    For vectors **x** and **y** the Hamming distance is the fraction of
    positions at which they disagree:

    .. math::

        d(\mathbf{x}, \mathbf{y}) = \frac{1}{N} \sum_i \mathbb{1}_{x_i \neq y_i}

    Parameters
    ----------
    x,y : :py:class:`ndarray <numpy.ndarray>` s of shape `(N,)`
        The two vectors to compute the distance between. Both vectors should be
        integer-valued.

    Returns
    -------
    d : float
        The Hamming distance between **x** and **y**.
    """
    n_mismatched = np.sum(x != y)
    return n_mismatched / len(x)


================================================
FILE: numpy_ml/utils/graphs.py
================================================
from abc import ABC, abstractmethod
from collections import defaultdict
from itertools import combinations, permutations

import numpy as np

#######################################################################
#                          Graph Components                           #
#######################################################################


class Edge(object):
    def __init__(self, fr, to, w=None):
        """
        A generic directed edge object.

        Parameters
        ----------
        fr: int
            The id of the vertex the edge goes from
        to: int
            The id of the vertex the edge goes to
        w: float, :class:`Object` instance, or None
            The edge weight, if applicable. If weight is an arbitrary Object it
            must have a method called 'sample' which takes no arguments and
            returns a random sample from the weight distribution. If `w` is
            None, no weight is assumed. Default is None.
        """
        self.fr = fr
        self.to = to
        self._w = w

    def __repr__(self):
        return "{} -> {}, weight: {}".format(self.fr, self.to, self._w)

    @property
    def weight(self):
        """
        The edge weight. If the stored weight exposes a ``sample`` method, a
        fresh sample is drawn on every access; otherwise the stored value is
        returned unchanged.
        """
        return self._w.sample() if hasattr(self._w, "sample") else self._w

    def reverse(self):
        """
        Return a new edge pointing in the opposite direction.

        The underlying weight object (not a sample drawn from it) is carried
        over to the new edge. The original edge is left untouched.
        """
        # the previous implementation read the non-existent attributes
        # `t`, `f` and `w` and therefore always raised AttributeError
        return Edge(self.to, self.fr, self._w)


#######################################################################
#                             Graph Types                             #
#######################################################################


class Graph(ABC):
    def __init__(self, V, E):
        """
        Abstract base class for graphs stored as adjacency lists.

        Parameters
        ----------
        V : list
            The vertex IDs. Each vertex is assigned an internal index equal to
            its position in `V`.
        E : list of edge objects
            The edges of the graph. Each edge must expose ``fr``, ``to`` and
            ``weight`` attributes, where ``fr`` / ``to`` are vertex IDs from
            `V`.
        """
        self._I2V = dict(enumerate(V))
        self._V2I = {v: i for i, v in enumerate(V)}
        self._G = {i: set() for i in range(len(V))}
        self._V = V
        self._E = E

        self._build_adjacency_list()

    def __getitem__(self, v_i):
        return self.get_neighbors(v_i)

    def get_index(self, v):
        """Get the internal index for a given vertex"""
        return self._V2I[v]

    def get_vertex(self, v_i):
        """Get the original vertex from a given internal index"""
        return self._I2V[v_i]

    @property
    def vertices(self):
        return self._V

    @property
    def indices(self):
        return list(range(len(self.vertices)))

    @property
    def edges(self):
        return self._E

    def get_neighbors(self, v_i):
        """
        Return the internal indices of the vertices reachable from the vertex
        with index `v_i`.
        """
        return [self._V2I[e.to] for e in self._G[v_i]]

    def to_matrix(self):
        """Return an adjacency matrix representation of the graph"""
        adj_mat = np.zeros((len(self._V), len(self._V)))
        for e in self.edges:
            fr, to = self._V2I[e.fr], self._V2I[e.to]
            # read the weight exactly once: for stochastic weights every
            # access draws a new sample, so the value that is tested against
            # None must be the same value that is stored
            w = e.weight
            adj_mat[fr, to] = 1 if w is None else w
        return adj_mat

    def to_adj_dict(self):
        """Return an adjacency dictionary representation of the graph"""
        adj_dict = defaultdict(list)
        for e in self.edges:
            adj_dict[e.fr].append(e)
        return adj_dict

    def path_exists(self, s_i, e_i):
        """
        Check whether a path exists from vertex index `s_i` to `e_i`.

        Notes
        -----
        Each vertex is expanded at most once, so the search is linear in the
        number of vertices and edges. A vertex is not considered reachable
        from itself: ``path_exists(v, v)`` is always False.

        Parameters
        ----------
        s_i: Int
            The internal index of the start vertex
        e_i: Int
            The internal index of the end vertex

        Returns
        -------
        path_exists : Boolean
            Whether or not a valid path exists between `s_i` and `e_i`.
        """
        seen = {s_i}
        frontier = [s_i]
        while frontier:
            c_i = frontier.pop()
            for n_i in self.get_neighbors(c_i):
                if n_i in seen:
                    continue
                if n_i == e_i:
                    return True
                seen.add(n_i)
                frontier.append(n_i)
        return False

    def all_paths(self, s_i, e_i):
        """
        Find all simple paths between `s_i` and `e_i` in the graph.

        Notes
        -----
        Uses breadth-first search. Ignores all paths with repeated vertices.

        Parameters
        ----------
        s_i: Int
            The internal index of the start vertex
        e_i: Int
            The internal index of the end vertex

        Returns
        -------
        complete_paths : list of lists
            A list of all paths from `s_i` to `e_i`. Each path is represented
            as a list of internal vertex indices.
        """
        complete_paths = []
        queue = [(s_i, [s_i])]

        while len(queue):
            c_i, path = queue.pop(0)
            nbrs_not_on_path = set(self.get_neighbors(c_i)) - set(path)

            for n_i in nbrs_not_on_path:
                if n_i == e_i:
                    complete_paths.append(path + [n_i])
                else:
                    queue.append((n_i, path + [n_i]))

        return complete_paths

    @abstractmethod
    def _build_adjacency_list(self):
        """Populate ``self._G`` (vertex index -> set of outgoing edges)."""
        pass


class DiGraph(Graph):
    def __init__(self, V, E):
        """
        A generic directed graph object.

        Parameters
        ----------
        V : list
            A list of vertex IDs.
        E : list of :class:`Edge <numpy_ml.utils.graphs.Edge>` objects
            A list of directed edges connecting pairs of vertices in ``V``.
        """
        super().__init__(V, E)
        self.is_directed = True
        self._topological_ordering = []

    def _build_adjacency_list(self):
        """Encode directed graph as an adjacency list"""
        # assumes no parallel edges
        for edge in self.edges:
            self._G[self._V2I[edge.fr]].add(edge)

    def reverse(self):
        """Return a new graph in which the direction of every edge is flipped"""
        flipped = [edge.reverse() for edge in self.edges]
        return DiGraph(self.vertices, flipped)

    def topological_ordering(self):
        r"""
        Returns a (non-unique) topological sort / linearization of the nodes
        IFF the graph is acyclic, otherwise returns None.

        Notes
        -----
        A topological sort is an ordering on the nodes in `G` such that for every
        directed edge :math:`u \rightarrow v` in the graph, `u` appears before
        `v` in the ordering.  The topological ordering is produced by ordering
        the nodes in `G` by their DFS "last visit time," from greatest to
        smallest.

        This implementation follows a recursive, DFS-based approach [1]_ which
        may break if the graph is very large. For an iterative version, see
        Kahn's algorithm [2]_ .

        References
        ----------
        .. [1] Tarjan, R. (1976), Edge-disjoint spanning trees and depth-first
           search, *Acta Informatica, 6 (2)*: 171-185.
        .. [2] Kahn, A. (1962), Topological sorting of large networks,
           *Communications of the ACM, 5 (11)*: 558-562.

        Returns
        -------
        ordering : list or None
            A topological ordering of the vertex indices if the graph is a DAG,
            otherwise None.
        """
        finished = []  # vertex indices in order of DFS completion
        visited = set()

        def has_cycle(v_i, on_path=None):
            """Depth-first search from `v_i`; True as soon as a cycle is found"""
            if on_path is None:
                on_path = {v_i}

            for nbr_i in self.get_neighbors(v_i):
                if nbr_i in on_path:
                    return True  # cycle detected!
                if nbr_i in visited:
                    continue
                visited.add(nbr_i)
                on_path.add(nbr_i)
                if has_cycle(nbr_i, on_path):
                    return True

            # `v_i` is fully explored: record it and take it off the live path
            finished.append(v_i)
            on_path.discard(v_i)
            return False

        for s_i in self.indices:
            if s_i in visited:
                continue
            visited.add(s_i)
            if has_cycle(s_i):
                return None

        # the latest-finished vertex comes first in the ordering
        return finished[::-1]

    def is_acyclic(self):
        """Return True if the graph contains no cycles, False otherwise"""
        ordering = self.topological_ordering()
        return ordering is not None


class UndirectedGraph(Graph):
    def __init__(self, V, E):
        """
        A generic undirected graph object.

        Parameters
        ----------
        V : list
            A list of vertex IDs.
        E : list of :class:`Edge <numpy_ml.utils.graphs.Edge>` objects
            A list of edges connecting pairs of vertices in ``V``. For any edge
            connecting vertex `u` to vertex `v`, :class:`UndirectedGraph
            <numpy_ml.utils.graphs.UndirectedGraph>` will assume that there
            exists a corresponding edge connecting `v` to `u`, even if this is
            not present in `E`.
        """
        super().__init__(V, E)
        self.is_directed = False

    def _build_adjacency_list(self):
        """Encode undirected, unweighted graph as an adjacency list"""
        # assumes no parallel edges
        # each edge appears twice as (u,v) and (v,u)
        for edge in self.edges:
            src_i = self._V2I[edge.fr]
            dst_i = self._V2I[edge.to]

            self._G[src_i].add(edge)
            mirrored = edge.reverse()
            self._G[dst_i].add(mirrored)


#######################################################################
#                          Graph Generators                           #
#######################################################################


def random_unweighted_graph(n_vertices, edge_prob=0.5, directed=False):
    """
    Generate an unweighted Erdős-Rényi random graph [*]_.

    References
    ----------
    .. [*] Erdős, P. and Rényi, A. (1959). On Random Graphs, *Publ. Math. 6*, 290.

    Parameters
    ----------
    n_vertices : int
        The number of vertices in the graph.
    edge_prob : float in [0, 1]
        The probability of forming an edge between two vertices. Default is
        0.5.
    directed : bool
        Whether the edges in the graph should be directed. Default is False.

    Returns
    -------
    G : :class:`Graph` instance
        The resulting random graph.
    """
    vertices = list(range(n_vertices))

    # ordered pairs for a directed graph, unordered pairs otherwise
    make_pairs = permutations if directed else combinations

    # one uniform draw per candidate pair decides whether the edge is kept
    edges = [
        Edge(fr, to)
        for fr, to in make_pairs(vertices, 2)
        if np.random.rand() <= edge_prob
    ]

    graph_cls = DiGraph if directed else UndirectedGraph
    return graph_cls(vertices, edges)


def random_DAG(n_vertices, edge_prob=0.5):
    """
    Create a 'random' unweighted directed acyclic graph by pruning all the
    backward connections from a random graph.

    Notes
    -----
    Graphs are regenerated until at least one edge survives pruning, so the
    returned DAG always has a non-empty edge list.

    Parameters
    ----------
    n_vertices : int
        The number of vertices in the graph. Must be at least 2.
    edge_prob : float in (0, 1]
        The probability of forming an edge between two vertices in the
        underlying random graph, before edge pruning. Default is 0.5.

    Returns
    -------
    G : :class:`Graph` instance
        The resulting DAG.

    Raises
    ------
    ValueError
        If `n_vertices` < 2 or `edge_prob` <= 0. No edge can ever be generated
        in either case, so the regeneration loop would never terminate.
    """
    if n_vertices < 2:
        raise ValueError(
            "A DAG with at least one edge needs 2 or more vertices, "
            "but got n_vertices={}".format(n_vertices)
        )
    if edge_prob <= 0:
        raise ValueError(
            "edge_prob must be greater than 0, but got {}".format(edge_prob)
        )

    while True:
        G = random_unweighted_graph(n_vertices, edge_prob, directed=True)

        # prune edges to remove backwards connections between vertices
        G = DiGraph(G.vertices, [e for e in G.edges if e.fr < e.to])

        # if we pruned away all the edges, generate a new graph
        if len(G.edges):
            return G


================================================
FILE: numpy_ml/utils/kernels.py
================================================
import re
from abc import ABC, abstractmethod

import numpy as np


class KernelBase(ABC):
    def __init__(self):
        """
        Abstract base class for kernels.

        Subclasses implement :meth:`_kernel` and populate the ``parameters``
        and ``hyperparameters`` dictionaries. ``hyperparameters`` must contain
        an ``"id"`` entry, which is read by :meth:`__str__` and
        :meth:`summary`.
        """
        super().__init__()
        self.parameters = {}
        self.hyperparameters = {}

    @abstractmethod
    def _kernel(self, X, Y):
        raise NotImplementedError

    def __call__(self, X, Y=None):
        """Refer to documentation for the `_kernel` method"""
        return self._kernel(X, Y)

    def __str__(self):
        P, H = self.parameters, self.hyperparameters
        p_str = ", ".join(["{}={}".format(k, v) for k, v in P.items()])
        return "{}({})".format(H["id"], p_str)

    def summary(self):
        """Return the dictionary of model parameters, hyperparameters, and ID"""
        return {
            "id": self.hyperparameters["id"],
            "parameters": self.parameters,
            "hyperparameters": self.hyperparameters,
        }

    def set_params(self, summary_dict):
        """
        Set the model parameters and hyperparameters using the settings in
        `summary_dict`.

        Parameters
        ----------
        summary_dict : dict
            A dictionary with keys 'parameters' and 'hyperparameters',
            structured as would be returned by the :meth:`summary` method. If
            a particular (hyper)parameter is not included in this dict, the
            current value will be used. The dictionary is not modified.

        Returns
        -------
        new_kernel : :doc:`Kernel <numpy_ml.utils.kernels>` instance
            A kernel with parameters and hyperparameters adjusted to those
            specified in `summary_dict`. This is the kernel itself, updated in
            place.
        """
        flatten_keys = ("parameters", "hyperparameters")

        # collapse the nested `parameters` and `hyperparameters` dicts into a
        # single merged dictionary. The merge is built in a fresh dict so that
        # the caller's `summary_dict` is left exactly as it was passed in.
        settings = {k: v for k, v in summary_dict.items() if k not in flatten_keys}
        for k in flatten_keys:
            if k in summary_dict:
                settings.update(summary_dict[k])

        # only names this kernel already knows about are updated
        for k, v in settings.items():
            if k in self.parameters:
                self.parameters[k] = v
            if k in self.hyperparameters:
                self.hyperparameters[k] = v
        return self


class LinearKernel(KernelBase):
    def __init__(self, c0=0):
        r"""
        The linear (i.e., dot-product) kernel.

        Notes
        -----
        For input vectors :math:`\mathbf{x}` and :math:`\mathbf{y}`, the linear
        kernel is:

        .. math::

            k(\mathbf{x}, \mathbf{y}) = \mathbf{x}^\top \mathbf{y} + c_0

        Parameters
        ----------
        c0 : float
            An "inhomogeneity" parameter. When `c0` = 0, the kernel is said to be
            homogenous. Default is 0.
        """
        super().__init__()
        self.hyperparameters = {"id": "LinearKernel"}
        self.parameters = {"c0": c0}

    def _kernel(self, X, Y=None):
        """
        Compute the linear kernel (i.e., dot-product) between all pairs of rows in
        `X` and `Y`.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, C)`
            Collection of `N` input vectors
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(M, C)` or None
            Collection of `M` input vectors. If None, assume `Y` = `X`.
            Default is None.

        Returns
        -------
        out : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            Similarity between `X` and `Y`, where index (`i`, `j`) gives
            :math:`k(x_i, y_j)`.
        """
        X, Y = kernel_checks(X, Y)
        gram = X @ Y.T
        return gram + self.parameters["c0"]


class PolynomialKernel(KernelBase):
    def __init__(self, d=3, gamma=None, c0=1):
        r"""
        The degree-`d` polynomial kernel.

        Notes
        -----
        For input vectors :math:`\mathbf{x}` and :math:`\mathbf{y}`, the polynomial
        kernel is:

        .. math::

            k(\mathbf{x}, \mathbf{y}) = (\gamma \mathbf{x}^\top \mathbf{y} + c_0)^d

        In contrast to the linear kernel, the polynomial kernel also computes
        similarities *across* dimensions of the **x** and **y** vectors,
        allowing it to account for interactions between features.  As an
        instance of the dot product family of kernels, the polynomial kernel is
        invariant to a rotation of the coordinates about the origin, but *not*
        to translations.

        Parameters
        ----------
        d : int
            Degree of the polynomial kernel. Default is 3.
        gamma : float or None
            A scaling parameter for the dot product between `x` and `y`,
            determining the amount of smoothing/resolution of the kernel.
            Larger values result in greater smoothing. If None, defaults to 1 /
            `C`.  Sometimes referred to as the kernel bandwidth.  Default is
            None.
        c0 : float
            Parameter trading off the influence of higher-order versus lower-order
            terms in the polynomial. If `c0` = 0, the kernel is said to be
            homogenous. Default is 1.
        """
        super().__init__()
        self.hyperparameters = {"id": "PolynomialKernel"}
        self.parameters = {"d": d, "c0": c0, "gamma": gamma}

    def _kernel(self, X, Y=None):
        """
        Compute the degree-`d` polynomial kernel between all pairs of rows in `X`
        and `Y`.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, C)`
            Collection of `N` input vectors
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(M, C)` or None
            Collection of `M` input vectors. If None, assume `Y = X`. Default
            is None.

        Returns
        -------
        out : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            Similarity between `X` and `Y` where index (`i`, `j`) gives
            :math:`k(x_i, y_j)` (i.e., the kernel's Gram-matrix).
        """
        params = self.parameters
        X, Y = kernel_checks(X, Y)

        # fall back to 1 / (number of columns) when no scale was supplied
        gamma = params["gamma"]
        if gamma is None:
            gamma = 1 / X.shape[1]

        scaled_dots = gamma * (X @ Y.T)
        return (scaled_dots + params["c0"]) ** params["d"]


class RBFKernel(KernelBase):
    def __init__(self, sigma=None):
        r"""
        Radial basis function (RBF) / squared exponential kernel.

        Notes
        -----
        For input vectors :math:`\mathbf{x}` and :math:`\mathbf{y}`, the radial
        basis function kernel is:

        .. math::

            k(\mathbf{x}, \mathbf{y}) = \exp \left\{ -0.5
                \left\lVert \frac{\mathbf{x} -
                    \mathbf{y}}{\sigma} \right\rVert_2^2 \right\}

        The RBF kernel decreases with distance and ranges between zero (in the
        limit) to one (when **x** = **y**). Notably, the implied feature space
        of the kernel has an infinite number of dimensions.

        Parameters
        ----------
        sigma : float or array of shape `(C,)` or None
            A scaling parameter for the vectors **x** and **y**, producing an
            isotropic kernel if a float, or an anisotropic kernel if an array of
            length `C`.  Larger values result in higher resolution / greater
            smoothing. If None, defaults to :math:`\sqrt(C / 2)`. Sometimes
            referred to as the kernel 'bandwidth'. Default is None.
        """
        super().__init__()
        self.hyperparameters = {"id": "RBFKernel"}
        self.parameters = {"sigma": sigma}

    def _kernel(self, X, Y=None):
        """
        Computes the radial basis function (RBF) kernel between all pairs of
        rows in `X` and `Y`.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, C)`
            Collection of `N` input vectors, each with dimension `C`.
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(M, C)`
            Collection of `M` input vectors. If None, assume `Y` = `X`. Default
            is None.

        Returns
        -------
        out : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            Similarity between `X` and `Y` where index (i, j) gives :math:`k(x_i, y_j)`.
        """
        X, Y = kernel_checks(X, Y)

        # fall back to sqrt(number of columns / 2) when no scale was supplied
        sigma = self.parameters["sigma"]
        if sigma is None:
            sigma = np.sqrt(X.shape[1] / 2)

        scaled_dists = pairwise_l2_distances(X / sigma, Y / sigma)
        return np.exp(-0.5 * scaled_dists ** 2)


class KernelInitializer(object):
    def __init__(self, param=None):
        """
        A class for initializing kernels. Valid inputs are:
            (a) __str__ representations of `KernelBase` instances
            (b) `KernelBase` instances
            (c) Parameter dicts (e.g., as produced via the :meth:`summary` method in
                `KernelBase` instances)

        If `param` is None, return `LinearKernel`.
        """
        self.param = param

    def __call__(self):
        """
        Build the kernel described by ``self.param``.

        Returns
        -------
        kernel : `KernelBase` instance
            The initialized kernel.

        Raises
        ------
        ValueError
            If ``self.param`` is not None, a string, a dict, or a `KernelBase`
            instance.
        """
        param = self.param
        if param is None:
            return LinearKernel()
        if isinstance(param, str):
            return self.init_from_str()
        if isinstance(param, dict):
            return self.init_from_dict()
        if isinstance(param, KernelBase):
            return param
        raise ValueError("Unrecognized kernel specification: {!r}".format(param))

    def init_from_str(self):
        """
        Build a kernel from a string such as ``"RBFKernel(sigma=2)"``.

        The kernel type is chosen by a case-insensitive substring match on
        "linear", "polynomial" or "rbf"; every ``name=value`` pair found in the
        string is passed to the kernel constructor as a keyword argument.

        Raises
        ------
        NotImplementedError
            If the string does not name a known kernel.
        """
        r = r"([a-zA-Z0-9]*)=([^,)]*)"
        kr_str = self.param.lower()

        # SECURITY NOTE: each value is passed through `eval`, which executes
        # arbitrary Python expressions. Only call this with trusted strings.
        kwargs = {name: eval(value) for name, value in re.findall(r, self.param)}

        if "linear" in kr_str:
            kernel = LinearKernel(**kwargs)
        elif "polynomial" in kr_str:
            kernel = PolynomialKernel(**kwargs)
        elif "rbf" in kr_str:
            kernel = RBFKernel(**kwargs)
        else:
            raise NotImplementedError("{}".format(kr_str))
        return kernel

    def init_from_dict(self):
        """
        Build a kernel from a summary dict with a `hyperparameters` entry
        whose ``"id"`` names the kernel class.

        Raises
        ------
        ValueError
            If the `hyperparameters` entry is missing, None, or empty.
        NotImplementedError
            If the ``"id"`` does not name a known kernel.
        """
        S = self.param
        sc = S["hyperparameters"] if "hyperparameters" in S else None

        if sc is None:
            raise ValueError("Must have `hyperparameters` key: {}".format(S))
        if not sc:
            # previously fell through every branch and hit an unbound local
            raise ValueError("`hyperparameters` must contain an `id`: {}".format(S))

        kernel_id = sc["id"]

        # hand `set_params` a copy so the dict stored on this initializer
        # cannot be altered by the call
        if kernel_id == "LinearKernel":
            kernel = LinearKernel().set_params(dict(S))
        elif kernel_id == "PolynomialKernel":
            kernel = PolynomialKernel().set_params(dict(S))
        elif kernel_id == "RBFKernel":
            kernel = RBFKernel().set_params(dict(S))
        else:
            raise NotImplementedError("{}".format(kernel_id))
        return kernel


def kernel_checks(X, Y):
    """
    Normalize and validate a pair of kernel inputs.

    1D inputs are reshaped into single-column 2D arrays, and `Y` defaults to
    `X` when it is None.

    Parameters
    ----------
    X : :py:class:`ndarray <numpy.ndarray>`
        First collection of input vectors (1D or 2D).
    Y : :py:class:`ndarray <numpy.ndarray>` or None
        Second collection of input vectors (1D or 2D). If None, `X` is used.

    Returns
    -------
    X, Y : :py:class:`ndarray <numpy.ndarray>` s
        The two inputs as 2D arrays with the same number of columns.

    Raises
    ------
    AssertionError
        If either input is not 2D after reshaping, or if the column counts
        differ.
    """
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    if Y is None:
        Y = X
    elif Y.ndim == 1:
        Y = Y.reshape(-1, 1)

    assert X.ndim == 2, "X must have 2 dimensions, but got {}".format(X.ndim)
    assert Y.ndim == 2, "Y must have 2 dimensions, but got {}".format(Y.ndim)
    assert X.shape[1] == Y.shape[1], "X and Y must have the same number of columns"
    return X, Y


def pairwise_l2_distances(X, Y=None):
    r"""
    A fast, vectorized way to compute pairwise l2 distances between rows in `X`
    and `Y`.

    Notes
    -----
    An entry of the pairwise Euclidean distance matrix for two vectors is

    .. math::

        d[i, j]  &=  \sqrt{(x_i - y_j) @ (x_i - y_j)} \\
                 &=  \sqrt{\sum (x_i - y_j)^2} \\
                 &=  \sqrt{\sum (x_i)^2 - 2 x_i y_j + (y_j)^2}

    The code below computes the third line using numpy broadcasting to avoid
    any for loops.

    Parameters
    ----------
    X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, C)`
        Collection of `N` input vectors
    Y : :py:class:`ndarray <numpy.ndarray>` of shape `(M, C)` or None
        Collection of `M` input vectors. If None, assume `Y` = `X`. Default is
        None.

    Returns
    -------
    dists : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
        Pairwise distance matrix. Entry (i, j) contains the `L2` distance between
        :math:`x_i` and :math:`y_j`.
    """
    # Honor the documented default: compare `X` against itself.
    if Y is None:
        Y = X

    # ||x||^2 as a column and ||y||^2 as a row broadcast to an (N, M) grid.
    x_sq = np.sum(X ** 2, axis=1)[:, np.newaxis]
    y_sq = np.sum(Y ** 2, axis=1)
    D = -2 * X @ Y.T + y_sq + x_sq
    D[D < 0] = 0  # clip any value less than 0 (a result of numerical imprecision)
    return np.sqrt(D)


================================================
FILE: numpy_ml/utils/misc.py
================================================
"""Miscellaneous utility functions"""
import numpy as np


def logsumexp(log_probs, axis=None):
    """
    Compute ``log(sum(exp(log_probs)))`` in a numerically stable way.

    Redefines scipy.special.logsumexp;
    see: http://bayesjumping.net/log-sum-exp-trick/

    Parameters
    ----------
    log_probs : array-like
        Input values (typically log-probabilities).
    axis : None, int, or tuple of ints
        Axis or axes to reduce over. If None, reduce over all entries.
        Default is None.

    Returns
    -------
    out : float or :py:class:`ndarray <numpy.ndarray>`
        The log of the summed exponentials, with the reduced axes removed.
    """
    log_probs = np.asarray(log_probs)

    # Shift by the maximum *of each reduced slice*. Shifting by the global
    # maximum lets slices far below it underflow to exp(.) == 0 and come back
    # as -inf.
    _max = np.max(log_probs, axis=axis, keepdims=True)

    # A non-finite shift (e.g. a slice that is entirely -inf) would turn the
    # subtraction into nan; use no shift for those slices instead.
    _max = np.where(np.isfinite(_max), _max, 0)

    exp_sum = np.exp(log_probs - _max).sum(axis=axis)
    with np.errstate(divide="ignore"):  # log(0) -> -inf is the intended result
        log_sum = np.log(exp_sum)
    return log_sum + np.squeeze(_max, axis=axis)


def log_gaussian_pdf(x_i, mu, sigma):
    """
    Compute log N(x_i | mu, sigma).

    Parameters
    ----------
    x_i : :py:class:`ndarray <numpy.ndarray>`
        Point at which to evaluate the log density.
    mu : :py:class:`ndarray <numpy.ndarray>`
        Mean vector; its length sets the dimensionality term.
    sigma : :py:class:`ndarray <numpy.ndarray>`
        Covariance matrix.

    Returns
    -------
    logpdf : float
        ``-0.5 * (n * log(2 * pi) + log|sigma| + (x_i - mu)^T sigma^{-1} (x_i - mu))``
    """
    dim = len(mu)
    norm_term = dim * np.log(2 * np.pi)

    # Only the log-magnitude of the determinant is used; the sign is dropped.
    _sign, logdet = np.linalg.slogdet(sigma)

    # Solve sigma @ z = (x_i - mu) rather than forming an explicit inverse.
    solved = np.linalg.solve(sigma, x_i - mu)
    mahalanobis = np.dot(x_i - mu, solved)

    return -0.5 * (norm_term + logdet + mahalanobis)


================================================
FILE: numpy_ml/utils/testing.py
================================================
"""Utilities for writing unit tests"""
import numbers
import numpy as np


#######################################################################
#                             Assertions                              #
#######################################################################


def is_symmetric(X):
    """
    Check that an array `X` is symmetric along its main diagonal.

    Parameters
    ----------
    X : :py:class:`ndarray <numpy.ndarray>`
        Array to test.

    Returns
    -------
    bool
        True if `X` and its transpose have the same shape and are
        element-wise close (per ``np.allclose``), False otherwise.
    """
    # Compare shapes first: otherwise a (1, n) array broadcasts against its
    # (n, 1) transpose and can be reported as symmetric, while other
    # non-square shapes raise a broadcasting error instead of returning False.
    if X.shape != X.T.shape:
        return False
    return np.allclose(X, X.T)


def is_symmetric_positive_definite(X):
    """
    Check that a matrix `X` is symmetric and positive-definite.

    Returns
    -------
    bool
        True if `X` passes `is_symmetric` and ``np.linalg.cholesky`` succeeds
        on it, False otherwise.
    """
    if not is_symmetric(X):
        return False

    # The Cholesky decomposition is defined only for symmetric/Hermitian
    # positive-definite matrices, so a LinAlgError here means `X` is not
    # positive-definite.
    try:
        np.linalg.cholesky(X)
    except np.linalg.LinAlgError:
        return False
    return True


def is_stochastic(X):
    """
    Assert that `X` holds probabilities whose rows each sum to 1.

    Returns
    -------
    bool
        Always True; a failed check raises instead of returning False.

    Raises
    ------
    AssertionError
        If any entry is below 0 or above 1, or if the sums taken over
        ``axis=1`` are not all close to 1.
    """
    msg = "Array should be stochastic along the columns"

    n_negative = len(X[X < 0])
    n_above_one = len(X[X > 1])
    assert n_negative == 0 and n_above_one == 0, msg

    row_totals = np.sum(X, axis=1)
    assert np.allclose(row_totals, np.ones(X.shape[0])), msg
    return True


def is_number(a):
    """Return True if `a` is an instance of :class:`numbers.Number`."""
    numeric_base = numbers.Number
    return isinstance(a, numeric_base)


def is_one_hot(x):
    """
    Assert that `x` is a binary array whose rows each sum to 1.

    Returns
    -------
    bool
        Always True; a failed check raises instead of returning False.

    Raises
    ------
    AssertionError
        If `x` differs from its boolean cast, or if the sums taken over
        ``axis=1`` are not all close to 1.
    """
    msg = "Matrix should be one-hot binary"

    as_bool = x.astype(bool)
    assert np.array_equal(x, as_bool), msg

    row_totals = np.sum(x, axis=1)
    assert np.allclose(row_totals, np.ones(x.shape[0])), msg
    return True


def is_binary(x):
    """
    Assert that array `x` consists only of binary values.

    Returns
    -------
    bool
        Always True; a failed check raises instead of returning False.

    Raises
    ------
    AssertionError
        If `x` is not equal to its own boolean cast.
    """
    msg = "Matrix must be binary"
    as_bool = x.astype(bool)
    assert np.array_equal(x, as_bool), msg
    return True


#######################################################################
#                           Data Generators                           #
#######################################################################


def random_one_hot_matrix(n_examples, n_classes):
    """
    Create a random one-hot matrix of shape (`n_examples`, `n_classes`).

    Each row is a row of the `n_classes` x `n_classes` identity matrix, picked
    with ``np.random.choice``.
    """
    identity = np.eye(n_classes)
    labels = np.random.choice(n_classes, n_examples)
    return identity[labels]


def random_stochastic_matrix(n_examples, n_classes):
    """
    Create a random stochastic matrix of shape (`n_examples`, `n_classes`).

    Entries are drawn with ``np.random.rand`` and each row is divided by its
    own sum, so every row sums to 1.
    """
    raw = np.random.rand(n_examples, n_classes)
    row_totals = raw.sum(axis=1, keepdims=True)
    return raw / row_totals


def random_tensor(shape, standardize=False):
    """
    Create a random real-valued tensor of shape `shape`.

    Each entry is a uniform draw from ``np.random.rand`` plus an integer
    offset drawn from ``np.random.randint(-300, 300)``. If `standardize` is
    True, the result is centered and scaled along ``axis=0`` so each column
    has mean 0 and (up to a machine-epsilon guard in the divisor) std 1.
    """
    offset = np.random.randint(-300, 300, shape)
    fractional = np.random.rand(*shape)
    X = fractional + offset

    if not standardize:
        return X

    # eps keeps the division finite when a column has zero spread
    eps = np.finfo(float).eps
    col_mean = X.mean(axis=0)
    col_std = X.std(axis=0)
    return (X - col_mean) / (col_std + eps)


def random_binary_tensor(shape, sparsity=0.5):
    """
    Create a random binary (0.0 / 1.0) float tensor of shape `shape`.

    An entry is 1 when its uniform draw from ``np.random.rand`` is at least
    ``1 - sparsity``, so `sparsity` (a value between 0 and 1) is the
    probability that an entry equals 1.
    """
    threshold = 1 - sparsity
    draws = np.random.rand(*shape)
    mask = draws >= threshold
    return mask.astype(float)


def random_paragraph(n_words, vocab=None):
    """
    Generate a random paragraph of `n_words` words.

    Parameters
    ----------
    n_words : int
        Number of words to draw.
    vocab : list of str or None
        Words to sample from. If None, a built-in collection of 26 Latin
        words is used. Default is None.

    Returns
    -------
    words : list
        `n_words` entries, each drawn independently with
        ``np.random.choice(vocab)``.
    """
    if vocab is None:
        vocab = (
            "at stet accusam aliquyam clita lorem ipsum dolor dolore dolores "
            "sit amet consetetur sadipscing elitr sed diam nonumy eirmod duo "
            "ea eos erat est et gubergren"
        ).split()

    words = []
    for _ in range(n_words):
        words.append(np.random.choice(vocab))
    return words


#######################################################################
#                           Custom Warnings                           #
#######################################################################


class DependencyWarning(RuntimeWarning):
    """
    Custom warning category derived from :class:`RuntimeWarning`.

    NOTE(review): presumably issued when an optional dependency is
    unavailable -- confirm against the call sites.
    """


================================================
FILE: numpy_ml/utils/windows.py
================================================
import numpy as np


def blackman_harris(window_len, symmetric=False):
    r"""
    The Blackman-Harris window.

    Notes
    -----
    The Blackman-Harris window is a four-term member of the generalized
    cosine-sum family (see `generalized_cosine`). Additional coefficients
    extend the Hamming window to further minimize the magnitude of the
    nearest side-lobe in the frequency response.

    .. math::

        \text{bh}(n) = a_0 - a_1 \cos\left(\frac{2 \pi n}{N}\right) +
            a_2 \cos\left(\frac{4 \pi n }{N}\right) -
                a_3 \cos\left(\frac{6 \pi n}{N}\right)

    where :math:`a_0` = 0.35875, :math:`a_1` = 0.48829, :math:`a_2` = 0.14128,
    :math:`a_3` = 0.01168, and `N` = `window_len` - 1 for a symmetric window
    or `N` = `window_len` for a periodic one.

    Parameters
    ----------
    window_len : int
        The length of the window in samples. Should be equal to the
        `frame_width` if applying to a windowed signal.
    symmetric : bool
        If False, create a 'periodic' window that can be used with an FFT /
        in spectral analysis.  If True, generate a symmetric window that can be
        used in, e.g., filter design. Default is False.

    Returns
    -------
    window : :py:class:`ndarray <numpy.ndarray>` of shape `(window_len,)`
        The window
    """
    coefs = [0.35875, 0.48829, 0.14128, 0.01168]
    return generalized_cosine(window_len, coefs, symmetric)


def hamming(window_len, symmetric=False):
    r"""
    The Hamming window.

    Notes
    -----
    The Hamming window is a two-term member of the generalized cosine-sum
    family (see `generalized_cosine`) with :math:`a_0 = 0.54` and
    :math:`a_1 = 1 - a_0`. Coefficients selected to minimize the magnitude of
    the nearest side-lobe in the frequency response.

    .. math::

        \text{hamming}(n) = 0.54 - 0.46 \cos\left(\frac{2 \pi n}{N}\right)

    where `N` = `window_len` - 1 for a symmetric window or `N` = `window_len`
    for a periodic one.

    Parameters
    ----------
    window_len : int
        The length of the window in samples. Should be equal to the
        `frame_width` if applying to a windowed signal.
    symmetric : bool
        If False, create a 'periodic' window that can be used with an FFT /
        in spectral analysis.  If True, generate a symmetric window that can be
        used in, e.g., filter design. Default is False.

    Returns
    -------
    window : :py:class:`ndarray <numpy.ndarray>` of shape `(window_len,)`
        The window
    """
    a0 = 0.54
    coefs = [a0, 1 - a0]
    return generalized_cosine(window_len, coefs, symmetric)


def hann(window_len, symmetric=False):
    r"""
    The Hann window.

    Notes
    -----
    The Hann window is a two-term member of the generalized cosine-sum family
    (see `generalized_cosine`) with :math:`a_0 = a_1 = 0.5`. Unlike the
    Hamming window, the end points of the symmetric Hann window touch zero.

    .. math::

        \text{hann}(n) = 0.5 - 0.5 \cos\left(\frac{2 \pi n}{N}\right)

    where `N` = `window_len` - 1 for a symmetric window or `N` = `window_len`
    for a periodic one.

    Parameters
    ----------
    window_len : int
        The length of the window in samples. Should be equal to the
        `frame_width` if applying to a windowed signal.
    symmetric : bool
        If False, create a 'periodic' window that can be used with an FFT /
        in spectral analysis.  If True, generate a symmetric window that can be
        used in, e.g., filter design. Default is False.

    Returns
    -------
    window : :py:class:`ndarray <numpy.ndarray>` of shape `(window_len,)`
        The window
    """
    coefs = [0.5, 0.5]
    return generalized_cosine(window_len, coefs, symmetric)


def generalized_cosine(window_len, coefs, symmetric=False):
    r"""
    The generalized cosine family of window functions.

    Notes
    -----
    The generalized cosine window is a simple weighted sum of cosine terms.

    For :math:`n \in \{0, \ldots, \text{window_len} - 1\}`:

    .. math::

        \text{GCW}(n) = \sum_{k=0}^K (-1)^k a_k \cos\left(\frac{2 \pi k n}{N}\right)

    where `K` + 1 is the number of coefficients, and `N` = `window_len` - 1
    for a symmetric window or `N` = `window_len` for a periodic one.

    Parameters
    ----------
    window_len : int
        The length of the window in samples. Should be equal to the
        `frame_width` if applying to a windowed signal.
    coefs: list of floats
        The :math:`a_k` coefficient values
    symmetric : bool
        If False, create a 'periodic' window that can be used with an FFT /
        in spectral analysis.  If True, generate a symmetric window that can be
        used in, e.g., filter design. Default is False.

    Returns
    -------
    window : :py:class:`ndarray <numpy.ndarray>` of shape `(window_len,)`
        The window
    """
    # A periodic window is a symmetric window of length `window_len + 1` with
    # its final sample dropped.
    n_points = window_len if symmetric else window_len + 1

    # Sampling on [-pi, pi] makes cos(k * phase) equal
    # (-1)^k * cos(2 * pi * k * n / (n_points - 1)), which supplies the
    # alternating sign in the formula above.
    phase = np.linspace(-np.pi, np.pi, n_points)
    terms = [ak * np.cos(k * phase) for k, ak in enumerate(coefs)]
    window = np.sum(terms, axis=0)

    if symmetric:
        return window
    return window[:-1]


class WindowInitializer:
    """Look up a window function by its string name."""

    def __call__(self, window):
        """
        Return the window function registered under the name `window`.

        Parameters
        ----------
        window : str
            One of ``"hamming"``, ``"blackman_harris"``, ``"hann"``, or
            ``"generalized_cosine"``.

        Returns
        -------
        fn : callable
            The matching window function (the function itself, not its
            output).

        Raises
        ------
        NotImplementedError
            If `window` is not one of the names listed above.
        """
        if window == "hamming":
            return hamming
        if window == "blackman_harris":
            return blackman_harris
        if window == "hann":
            return hann
        if window == "generalized_cosine":
            return generalized_cosine
        raise NotImplementedError("{}".format(window))


================================================
FILE: requirements-dev.txt
================================================
numpy
scipy
sklearn
torch
networkx
matplotlib
seaborn
tensorflow
gym
keras
huffman
librosa==0.7.2
llvmlite==0.32.1
numba==0.45.0
nltk
hmmlearn
statsmodels
pre-commit
tox
pytest


================================================
FILE: requirements-test.txt
================================================
numpy
scipy
sklearn
torch
networkx
tensorflow
keras
gym
huffman
librosa==0.7.2
llvmlite==0.32.1
numba==0.45.0
nltk
hmmlearn
statsmodels
tox
pytest


================================================
FILE: requirements.txt
================================================
numpy
scipy


================================================
FILE: setup.py
================================================
# flake8: noqa
from codecs import open

from setuptools import setup, find_packages

# Use the README as the long description shown on the package index page.
# `open` here is `codecs.open`, imported at the top of this script.
with open("README.md", encoding="utf-8") as f:
    LONG_DESCRIPTION = f.read()

# Runtime dependencies: one stripped entry per line of requirements.txt.
# Only lines that are exactly "\n" are skipped.
with open("requirements.txt") as requirements:
    REQUIREMENTS = [r.strip() for r in requirements if r != "\n"]

# Extra project links passed to `setup` via `project_urls`.
PROJECT_URLS = {
    "Bug Tracker": "https://github.com/ddbourgin/numpy-ml/issues",
    "Documentation": "https://numpy-ml.readthedocs.io/en/latest/",
    "Source": "https://github.com/ddbourgin/numpy-ml",
}

# Package metadata and build configuration.
setup(
    name="numpy-ml",
    version="0.1.2",
    author="David Bourgin",
    author_email="ddbourgin@gmail.com",
    project_urls=PROJECT_URLS,
    url="https://github.com/ddbourgin/numpy-ml",
    description="Machine learning in NumPy",
    long_description=LONG_DESCRIPTION,
    long_description_content_type="text/markdown",
    install_requires=REQUIREMENTS,
    packages=find_packages(),
    license="GPLv3+",
    include_package_data=True,
    python_requires=">=3.5",
    # optional extra: `pip install numpy-ml[rl]` additionally pulls in these
    extras_require={"rl": ["gym", "matplotlib"]},
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Science/Research",
        "Intended Audience :: Developers",
        "Topic :: Scientific/Engineering",
        "License :: OSI Approved :: GNU General Public License (GPL)",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
    ],
)


================================================
FILE: tox.ini
================================================
[tox]
envlist = py36,py38
skip_missing_interpreters=true
[testenv]
deps = -rrequirements-test.txt
commands = pytest