Repository: ddbourgin/numpy-ml
Branch: master
Commit: b0359af5285f
Files: 194
Total size: 1.3 MB
Directory structure:
gitextract_t47luwfk/
├── .github/
│ ├── ISSUE_TEMPLATE/
│ │ └── a--bug-performance-issue.md
│ └── PULL_REQUEST_TEMPLATE.md
├── .gitignore
├── .readthedocs.yml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── docs/
│ ├── Makefile
│ ├── README.md
│ ├── conf.py
│ ├── index.rst
│ ├── make.bat
│ ├── numpy_ml.bandits.bandits.rst
│ ├── numpy_ml.bandits.policies.rst
│ ├── numpy_ml.bandits.rst
│ ├── numpy_ml.bandits.trainer.rst
│ ├── numpy_ml.factorization.factors.rst
│ ├── numpy_ml.factorization.rst
│ ├── numpy_ml.gmm.gmm.rst
│ ├── numpy_ml.gmm.rst
│ ├── numpy_ml.hmm.MultinomialHMM.rst
│ ├── numpy_ml.hmm.rst
│ ├── numpy_ml.lda.lda.rst
│ ├── numpy_ml.lda.rst
│ ├── numpy_ml.lda.smoothed_lda.rst
│ ├── numpy_ml.linear_models.lm.rst
│ ├── numpy_ml.linear_models.rst
│ ├── numpy_ml.neural_nets.activations.rst
│ ├── numpy_ml.neural_nets.initializers.rst
│ ├── numpy_ml.neural_nets.layers.rst
│ ├── numpy_ml.neural_nets.losses.rst
│ ├── numpy_ml.neural_nets.models.rst
│ ├── numpy_ml.neural_nets.modules.rst
│ ├── numpy_ml.neural_nets.optimizers.rst
│ ├── numpy_ml.neural_nets.rst
│ ├── numpy_ml.neural_nets.schedulers.rst
│ ├── numpy_ml.neural_nets.utils.rst
│ ├── numpy_ml.neural_nets.wrappers.rst
│ ├── numpy_ml.ngram.additive.rst
│ ├── numpy_ml.ngram.goodturing.rst
│ ├── numpy_ml.ngram.mle.rst
│ ├── numpy_ml.ngram.rst
│ ├── numpy_ml.nonparametric.gp.rst
│ ├── numpy_ml.nonparametric.kernel_regression.rst
│ ├── numpy_ml.nonparametric.knn.rst
│ ├── numpy_ml.nonparametric.rst
│ ├── numpy_ml.preprocessing.dsp.rst
│ ├── numpy_ml.preprocessing.general.rst
│ ├── numpy_ml.preprocessing.nlp.rst
│ ├── numpy_ml.preprocessing.rst
│ ├── numpy_ml.rl_models.agents.rst
│ ├── numpy_ml.rl_models.rl_utils.rst
│ ├── numpy_ml.rl_models.rst
│ ├── numpy_ml.rl_models.trainer.rst
│ ├── numpy_ml.trees.dt.rst
│ ├── numpy_ml.trees.gbdt.rst
│ ├── numpy_ml.trees.losses.rst
│ ├── numpy_ml.trees.rf.rst
│ ├── numpy_ml.trees.rst
│ ├── numpy_ml.utils.data_structures.rst
│ ├── numpy_ml.utils.distance_metrics.rst
│ ├── numpy_ml.utils.graphs.rst
│ ├── numpy_ml.utils.kernels.rst
│ ├── numpy_ml.utils.rst
│ ├── numpy_ml.utils.testing.rst
│ ├── numpy_ml.utils.windows.rst
│ └── requirements.txt
├── numpy_ml/
│ ├── README.md
│ ├── __init__.py
│ ├── bandits/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── bandits.py
│ │ ├── policies.py
│ │ └── trainer.py
│ ├── factorization/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ └── factors.py
│ ├── gmm/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ └── gmm.py
│ ├── hmm/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ └── hmm.py
│ ├── lda/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── lda.py
│ │ └── lda_smoothed.py
│ ├── linear_models/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── bayesian_regression.py
│ │ ├── glm.py
│ │ ├── linear_regression.py
│ │ ├── logistic.py
│ │ ├── naive_bayes.py
│ │ └── ridge.py
│ ├── neural_nets/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── activations/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ └── activations.py
│ │ ├── initializers/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ └── initializers.py
│ │ ├── layers/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ └── layers.py
│ │ ├── losses/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ └── losses.py
│ │ ├── models/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── vae.py
│ │ │ ├── w2v.py
│ │ │ └── wgan_gp.py
│ │ ├── modules/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ └── modules.py
│ │ ├── optimizers/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ └── optimizers.py
│ │ ├── schedulers/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ └── schedulers.py
│ │ ├── utils/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ └── utils.py
│ │ └── wrappers/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ └── wrappers.py
│ ├── ngram/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ └── ngram.py
│ ├── nonparametric/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── gp.py
│ │ ├── kernel_regression.py
│ │ └── knn.py
│ ├── plots/
│ │ ├── bandit_plots.py
│ │ ├── gmm_plots.py
│ │ ├── hmm_plots.py
│ │ ├── lda_plots.py
│ │ ├── lm_plots.py
│ │ ├── ngram_plots.py
│ │ ├── nn_activations_plots.py
│ │ ├── nn_schedulers_plots.py
│ │ ├── nonparametric_plots.py
│ │ ├── rl_plots.py
│ │ └── trees_plots.py
│ ├── preprocessing/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── dsp.py
│ │ ├── general.py
│ │ └── nlp.py
│ ├── rl_models/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── agents.py
│ │ ├── rl_utils.py
│ │ ├── tiles/
│ │ │ ├── __init__.py
│ │ │ └── tiles3.py
│ │ └── trainer.py
│ ├── tests/
│ │ ├── __init__.py
│ │ ├── nn_torch_models.py
│ │ ├── test_glm.py
│ │ ├── test_linear_regression.py
│ │ ├── test_naive_bayes.py
│ │ ├── test_ngram.py
│ │ ├── test_nn.py
│ │ ├── test_nn_activations.py
│ │ ├── test_nonparametric.py
│ │ ├── test_preprocessing.py
│ │ ├── test_trees.py
│ │ └── test_utils.py
│ ├── trees/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── dt.py
│ │ ├── gbdt.py
│ │ ├── losses.py
│ │ └── rf.py
│ └── utils/
│ ├── README.md
│ ├── __init__.py
│ ├── data_structures.py
│ ├── distance_metrics.py
│ ├── graphs.py
│ ├── kernels.py
│ ├── misc.py
│ ├── testing.py
│ └── windows.py
├── requirements-dev.txt
├── requirements-test.txt
├── requirements.txt
├── setup.py
└── tox.ini
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/ISSUE_TEMPLATE/a--bug-performance-issue.md
================================================
---
name: Bug/Performance Issue
about: Use this template for reporting a bug or a performance issue.
labels: bugfix
---
**System information**
- OS Platform and Distribution (e.g., Linux Ubuntu 16.04):
- Python version:
- NumPy version:
**Describe the current behavior**
**Describe the expected behavior**
**Code to reproduce the issue**
**Other info / logs**
================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
### All Submissions
* [ ] Is the code you are submitting your own work?
* [ ] Have you followed the [contributing guidelines](https://github.com/ddbourgin/numpy-ml/blob/master/CONTRIBUTING.md)?
* [ ] Have you checked to ensure there aren't other open [Pull Requests](https://github.com/ddbourgin/numpy-ml/pulls) for the same update/change?
### New Model Submissions
* [ ] Is the code you are submitting your own work?
* [ ] Did you properly attribute the authors of any code you referenced?
* [ ] Did you write unit tests for your new model?
* [ ] Does your submission pass the unit tests?
* [ ] Did you write documentation for your new model?
* [ ] Have you formatted your code using the [black](https://black.now.sh/) defaults?
### Changes to Existing Models
* [ ] Have you added an explanation of what your changes do and why you'd like us to include them?
* [ ] Have you written new tests for your changes, as applicable?
* [ ] Have you successfully run tests with your changes locally?
================================================
FILE: .gitignore
================================================
### OSX ###
# General
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
### Python Patch ###
.venv/
### Vim ###
# Swap
[._]*.s[a-v][a-z]
[._]*.sw[a-p]
[._]s[a-rt-v][a-z]
[._]ss[a-gi-z]
[._]sw[a-p]
# Session
Session.vim
# Temporary
.netrwhist
*~
# Auto-generated tag files
tags
# Persistent undo
[._]*.un~
# No pdfs
*.pdf
# No TODOs ;-)
TODO
_build
_static
================================================
FILE: .readthedocs.yml
================================================
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
# Required
version: 2
# Build documentation in the docs/ directory with Sphinx
sphinx:
  configuration: docs/conf.py
# Build documentation with MkDocs
#mkdocs:
# configuration: mkdocs.yml
# Optionally build your docs in additional formats such as PDF and ePub
formats:
- htmlzip
# Optionally set the version of Python and requirements required to build your docs
python:
  version: 3.7
  install:
    - requirements: docs/requirements.txt
================================================
FILE: CODE_OF_CONDUCT.md
================================================
# NumPy-ML Code of Conduct
## Our Pledge
In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to making participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, sex characteristics, gender identity and expression,
level of experience, education, socio-economic status, nationality, personal
appearance, race, religion, or sexual identity and orientation.
## Our Standards
Examples of behavior that contributes to creating a positive environment
include:
* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members
Examples of unacceptable behavior by participants include:
* The use of sexualized language or imagery and unwelcome sexual attention or
advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Our Responsibilities
Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.
Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.
## Scope
This Code of Conduct applies within all project spaces, and it also applies when
an individual is representing the project or its community in public spaces.
Examples of representing a project or community include using an official
project e-mail address, posting via an official social media account, or acting
as an appointed representative at an online or offline event. Representation of
a project may be further defined and clarified by project maintainers.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at ddbourgin@gmail.com. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.
Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
[homepage]: https://www.contributor-covenant.org
For answers to common questions about this code of conduct, see
https://www.contributor-covenant.org/faq
================================================
FILE: CONTRIBUTING.md
================================================
## Contributing
Thank you for contributing to numpy-ml!
| ⚠️ ⚠️ All PRs should reflect earnest attempts at implementing a model yourself. ⚠️⚠️ <br> It is fine to reference others' code. It is not fine to blindly copy without attribution. When in doubt, please ask. |
| --- |
### General guidelines
1. Please include a clear list of what you've done
2. For pull requests, please make sure all commits are [*atomic*](https://en.wikipedia.org/wiki/Atomic_commit) (i.e., one feature per commit)
3. If you're submitting a new model / feature / module, **please include proper documentation and unit tests.**
- See the test files in `numpy_ml/tests/` (e.g., `test_nn.py`) for examples of unit tests.
- Documentation is loosely based on the [NumPy docstring style](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_numpy.html). When in doubt, refer to existing examples
4. Please format your code using the [black](https://github.com/python/black) defaults. You can use this [online formatter](https://black.now.sh/).
### Specific guidelines
#### I have a new model / model component to contribute
Awesome - create a [pull request](https://github.com/ddbourgin/numpy-ml/pulls)! When preparing your PR, please include a brief description of the model, the canonical reference(s) in the literature, and, most importantly, unit tests against an existing implementation!
- Refer to the test files in `numpy_ml/tests/` (e.g., `test_nn.py`) for examples.
#### I have a major new enhancement / adjustment that will affect multiple models
Please post an [issue](https://github.com/ddbourgin/numpy-ml/issues) with your proposal before you begin working on it. When outlining your proposal, please include as much detail about your intended changes as possible.
#### I found a bug
If there isn't already an [open issue](https://github.com/ddbourgin/numpy-ml/issues), please start one! When creating your issue, include:
1. A title and clear description
2. As much relevant information as possible
3. A code sample demonstrating the expected behavior that is not occurring
#### I fixed a bug
Thank you! Please open a new [pull request](https://github.com/ddbourgin/numpy-ml/pulls) with the patch. When doing so, ensure the PR description clearly describes the problem and solution. Include the relevant issue number if applicable.
================================================
FILE: LICENSE
================================================
GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc.
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The GNU General Public License is a free, copyleft license for
software and other kinds of works.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users. We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors. You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights. Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received. You must make sure that they, too, receive
or can get the source code. And you must show them these terms so they
know their rights.
Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.
For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software. For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.
Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so. This is fundamentally incompatible with the aim of
protecting users' freedom to change the software. The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable. Therefore, we
have designed this version of the GPL to prohibit the practice for those
products. If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.
Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary. To prevent this, the GPL assures that
patents cannot be used to render the program non-free.
The precise terms and conditions for copying, distribution and
modification follow.
TERMS AND CONDITIONS
0. Definitions.
"This License" refers to version 3 of the GNU General Public License.
"Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.
"The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.
To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.
A "covered work" means either the unmodified Program or a work based
on the Program.
To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.
To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.
An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.
1. Source Code.
The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.
A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.
The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.
The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.
The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.
The Corresponding Source for a work in source code form is that
same work.
2. Basic Permissions.
All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.
You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.
Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.
When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.
4. Conveying Verbatim Copies.
You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.
You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.
5. Conveying Modified Source Versions.
You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:
a) The work must carry prominent notices stating that you modified
it, and giving a relevant date.
b) The work must carry prominent notices stating that it is
released under this License and any conditions added under section
7. This requirement modifies the requirement in section 4 to
"keep intact all notices".
c) You must license the entire work, as a whole, under this
License to anyone who comes into possession of a copy. This
License will therefore apply, along with any applicable section 7
additional terms, to the whole of the work, and all its parts,
regardless of how they are packaged. This License gives no
permission to license the work in any other way, but it does not
invalidate such permission if you have separately received it.
d) If the work has interactive user interfaces, each must display
Appropriate Legal Notices; however, if the Program has interactive
interfaces that do not display Appropriate Legal Notices, your
work need not make them do so.
A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.
6. Conveying Non-Source Forms.
You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:
a) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by the
Corresponding Source fixed on a durable physical medium
customarily used for software interchange.
b) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by a
written offer, valid for at least three years and valid for as
long as you offer spare parts or customer support for that product
model, to give anyone who possesses the object code either (1) a
copy of the Corresponding Source for all the software in the
product that is covered by this License, on a durable physical
medium customarily used for software interchange, for a price no
more than your reasonable cost of physically performing this
conveying of source, or (2) access to copy the
Corresponding Source from a network server at no charge.
c) Convey individual copies of the object code with a copy of the
written offer to provide the Corresponding Source. This
alternative is allowed only occasionally and noncommercially, and
only if you received the object code with such an offer, in accord
with subsection 6b.
d) Convey the object code by offering access from a designated
place (gratis or for a charge), and offer equivalent access to the
Corresponding Source in the same way through the same place at no
further charge. You need not require recipients to copy the
Corresponding Source along with the object code. If the place to
copy the object code is a network server, the Corresponding Source
may be on a different server (operated by you or a third party)
that supports equivalent copying facilities, provided you maintain
clear directions next to the object code saying where to find the
Corresponding Source. Regardless of what server hosts the
Corresponding Source, you remain obligated to ensure that it is
available for as long as needed to satisfy these requirements.
e) Convey the object code using peer-to-peer transmission, provided
you inform other peers where the object code and Corresponding
Source of the work are being offered to the general public at no
charge under subsection 6d.
A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.
A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.
"Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.
If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).
The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.
Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.
7. Additional Terms.
"Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.
When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.
Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:
a) Disclaiming warranty or limiting liability differently from the
terms of sections 15 and 16 of this License; or
b) Requiring preservation of specified reasonable legal notices or
author attributions in that material or in the Appropriate Legal
Notices displayed by works containing it; or
c) Prohibiting misrepresentation of the origin of that material, or
requiring that modified versions of such material be marked in
reasonable ways as different from the original version; or
d) Limiting the use for publicity purposes of names of licensors or
authors of the material; or
e) Declining to grant rights under trademark law for use of some
trade names, trademarks, or service marks; or
f) Requiring indemnification of licensors and authors of that
material by anyone who conveys the material (or modified versions of
it) with contractual assumptions of liability to the recipient, for
any liability that these contractual assumptions directly impose on
those licensors and authors.
All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.
If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.
Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.
8. Termination.
You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).
However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.
9. Acceptance Not Required for Having Copies.
You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.
10. Automatic Licensing of Downstream Recipients.
Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.
An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.
You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.
11. Patents.
A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".
A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.
Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.
In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.
If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.
If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.
A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.
Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.
12. No Surrender of Others' Freedom.
If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.
13. Use with the GNU Affero General Public License.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.
14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.
If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.
15. Disclaimer of Warranty.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. Limitation of Liability.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.
17. Interpretation of Sections 15 and 16.
If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:
<program> Copyright (C) <year> <name of author>
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".
You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<https://www.gnu.org/licenses/>.
The GNU General Public License does not permit incorporating your program
into proprietary programs. If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License. But first, please read
<https://www.gnu.org/licenses/why-not-lgpl.html>.
================================================
FILE: MANIFEST.in
================================================
# Extra non-Python files to include in the source distribution.
include README.md
include requirements*.txt
include docs/*.rst
include docs/img/*.png
================================================
FILE: README.md
================================================
# numpy-ml
Ever wish you had an inefficient but somewhat legible collection of machine
learning algorithms implemented exclusively in NumPy? No?
## Installation
### For rapid experimentation
To use this code as a starting point for ML prototyping / experimentation, just clone the repository, create a new [virtualenv](https://pypi.org/project/virtualenv/), and start hacking:
```sh
$ git clone https://github.com/ddbourgin/numpy-ml.git
$ cd numpy-ml && virtualenv npml && source npml/bin/activate
$ pip3 install -r requirements-dev.txt
```
### As a package
If you don't plan to modify the source, you can also install numpy-ml as a
Python package: `pip3 install -U numpy_ml`.
The reinforcement learning agents train on environments defined in the [OpenAI
gym](https://github.com/openai/gym). To install these alongside numpy-ml, you
can use `pip3 install -U 'numpy_ml[rl]'`.
## Documentation
For more details on the available models, see the [project documentation](https://numpy-ml.readthedocs.io/).
## Available models
Click to expand!
1. **Gaussian mixture model**
- EM training
2. **Hidden Markov model**
- Viterbi decoding
- Likelihood computation
- MLE parameter estimation via Baum-Welch/forward-backward algorithm
3. **Latent Dirichlet allocation** (topic model)
- Standard model with MLE parameter estimation via variational EM
- Smoothed model with MAP parameter estimation via MCMC
4. **Neural networks**
* Layers / Layer-wise ops
- Add
- Flatten
- Multiply
- Softmax
- Fully-connected/Dense
- Sparse evolutionary connections
- LSTM
- Elman-style RNN
- Max + average pooling
- Dot-product attention
- Embedding layer
- Restricted Boltzmann machine (w. CD-n training)
- 2D deconvolution (w. padding and stride)
- 2D convolution (w. padding, dilation, and stride)
- 1D convolution (w. padding, dilation, stride, and causality)
* Modules
- Bidirectional LSTM
- ResNet-style residual blocks (identity and convolution)
- WaveNet-style residual blocks with dilated causal convolutions
- Transformer-style multi-headed scaled dot product attention
* Regularizers
- Dropout
* Normalization
- Batch normalization (spatial and temporal)
- Layer normalization (spatial and temporal)
* Optimizers
- SGD w/ momentum
- AdaGrad
- RMSProp
- Adam
* Learning Rate Schedulers
- Constant
- Exponential
- Noam/Transformer
- Dlib scheduler
* Weight Initializers
- Glorot/Xavier uniform and normal
- He/Kaiming uniform and normal
- Standard and truncated normal
* Losses
- Cross entropy
- Squared error
- Bernoulli VAE loss
- Wasserstein loss with gradient penalty
- Noise contrastive estimation loss
* Activations
- ReLU
- Tanh
- Affine
- Sigmoid
- Leaky ReLU
- ELU
- SELU
- GELU
- Exponential
- Hard Sigmoid
- Softplus
* Models
- Bernoulli variational autoencoder
- Wasserstein GAN with gradient penalty
- word2vec encoder with skip-gram and CBOW architectures
* Utilities
- `col2im` (MATLAB port)
- `im2col` (MATLAB port)
- `conv1D`
- `conv2D`
- `deconv2D`
- `minibatch`
5. **Tree-based models**
- Decision trees (CART)
- [Bagging] Random forests
- [Boosting] Gradient-boosted decision trees
6. **Linear models**
- Ridge regression
- Logistic regression
- Ordinary least squares
- Weighted linear regression
- Generalized linear model (log, logit, and identity link)
- Gaussian naive Bayes classifier
- Bayesian linear regression w/ conjugate priors
- Unknown mean, known variance (Gaussian prior)
- Unknown mean, unknown variance (Normal-Gamma / Normal-Inverse-Wishart prior)
7. **n-Gram sequence models**
- Maximum likelihood scores
- Additive/Lidstone smoothing
- Simple Good-Turing smoothing
8. **Multi-armed bandit models**
- UCB1
- LinUCB
- Epsilon-greedy
- Thompson sampling w/ conjugate priors
- Beta-Bernoulli sampler
- LinUCB
9. **Reinforcement learning models**
- Cross-entropy method agent
- First visit on-policy Monte Carlo agent
- Weighted incremental importance sampling Monte Carlo agent
- Expected SARSA agent
- TD-0 Q-learning agent
- Dyna-Q / Dyna-Q+ with prioritized sweeping
10. **Nonparametric models**
- Nadaraya-Watson kernel regression
- k-Nearest neighbors classification and regression
- Gaussian process regression
11. **Matrix factorization**
- Regularized alternating least-squares
- Non-negative matrix factorization
12. **Preprocessing**
- Discrete Fourier transform (1D signals)
- Discrete cosine transform (type-II) (1D signals)
- Bilinear interpolation (2D signals)
- Nearest neighbor interpolation (1D and 2D signals)
- Autocorrelation (1D signals)
- Signal windowing
- Text tokenization
- Feature hashing
- Feature standardization
- One-hot encoding / decoding
- Huffman coding / decoding
- Byte pair encoding / decoding
- Term frequency-inverse document frequency (TF-IDF) encoding
- MFCC encoding
13. **Utilities**
- Similarity kernels
- Distance metrics
- Priority queue
- Ball tree
- Discrete sampler
- Graph processing and generators
## Contributing
Am I missing your favorite model? Is there something that could be cleaner /
less confusing? Did I mess something up? Submit a PR! The only requirement is
that your models are written with just the [Python standard
library](https://docs.python.org/3/library/) and [NumPy](https://www.numpy.org/). The
[SciPy library](https://scipy.github.io/devdocs/) is also permitted under special
circumstances ;)
See full contributing guidelines [here](./CONTRIBUTING.md).
================================================
FILE: docs/Makefile
================================================
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two (hence the conditional `?=`).
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = .
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
================================================
FILE: docs/README.md
================================================
To build the documentation locally, [install sphinx](http://www.sphinx-doc.org/en/master/usage/installation.html), cd into the `docs` directory, and run `make html`. Local files will be generated in the `docs/_build/html` directory.
================================================
FILE: docs/conf.py
================================================
# -*- coding: utf-8 -*-
#
# Configuration file for the Sphinx documentation builder.
#
# This file does only contain a selection of the most common options. For a
# full list see the documentation:
# http://www.sphinx-doc.org/en/master/config
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
import inspect
# Put the parent directory (the repository root when building from docs/)
# at the front of sys.path so modules documented with autodoc are importable.
sys.path.insert(0, os.path.abspath(os.pardir))

# Base URL of the GitHub repository, used when building source-code links.
gh_url = "https://github.com/ddbourgin/numpy-ml"


# -- Project information -----------------------------------------------------

project = "numpy-ml"
author = "David Bourgin"
copyright = f"2022, {author}"

# The full version, including alpha/beta/rc tags
release = "0.1.0"
# The short X.Y version (everything before the last dot of the release)
version = release.rpartition(".")[0]


# -- General configuration ---------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Sphinx extension modules to enable. Every extension currently in use ships
# with Sphinx itself, so only the part after "sphinx.ext." is spelled out.
extensions = [
    f"sphinx.ext.{ext}"
    for ext in (
        "autodoc",
        "doctest",
        "intersphinx",
        "todo",
        "coverage",
        "mathjax",
        "ifconfig",
        "githubpages",
        "napoleon",
        "linkcode",
    )
]
# "numpydoc" (third-party) is intentionally left disabled.

# to avoid memory errors in the read-the-docs build process
autodoc_mock_imports = [
    "tensorflow",
    "torch",
    "gym",
]
# Try to link to source code on GitHub
def linkcode_resolve(domain, info):
    """Return the GitHub URL of the source for a documented Python object.

    This is the resolver hook used by the ``sphinx.ext.linkcode`` extension.
    Resolution is best-effort: whenever the object or its source cannot be
    located, ``None`` is returned (no link) instead of raising, so that a
    single unresolved object never aborts the documentation build.

    Parameters
    ----------
    domain : str
        Name of the Sphinx domain of the object. Only ``"py"`` is handled.
    info : dict
        Mapping with the keys ``"module"`` and ``"fullname"`` identifying
        the object (dotted attribute path relative to the module).

    Returns
    -------
    url : str or None
        ``{gh_url}/blob/master/<path>#L<start>-L<end>`` for objects whose
        source file lives under the parent of the current directory,
        otherwise ``None``.
    """
    if domain != "py":
        return None

    module = info.get("module")
    fullname = info.get("fullname")
    if not module or not fullname:
        return None

    # Only already-imported modules are considered; nothing is imported here.
    obj = sys.modules.get(module)
    if obj is None:
        return None

    # Walk the dotted path (e.g. "Class.method") down from the module.
    try:
        for part in fullname.split("."):
            obj = getattr(obj, part)
    except AttributeError:
        return None

    # Properties have no source of their own; point at the getter instead.
    if isinstance(obj, property):
        obj = obj.fget

    # Builtins, C extensions, mocked imports, etc. have no retrievable source.
    # This is a best-effort hook, so any ordinary failure means "no link";
    # unlike a bare ``except`` this still lets KeyboardInterrupt/SystemExit
    # propagate.
    try:
        source_file = inspect.getsourcefile(obj)
        if source_file is None:
            return None
        source, line_start = inspect.getsourcelines(obj)
    except Exception:
        return None

    try:
        rel_path = os.path.relpath(source_file, start=os.path.abspath(".."))
    except ValueError:
        # e.g. the file is on a different drive on Windows
        return None

    # A path that climbs out of the root is not part of this repository
    # (stdlib / site-packages), so a GitHub link to it would be broken.
    if rel_path == os.pardir or rel_path.startswith(os.pardir + os.sep):
        return None

    # URLs always use forward slashes, whatever the OS path separator is.
    rel_path = rel_path.replace(os.sep, "/")
    line_end = line_start + len(source) - 1
    return f"{gh_url}/blob/master/{rel_path}#L{line_start}-L{line_end}"
# Napoleon settings
# https://sphinxcontrib-napoleon.readthedocs.io/en/latest/sphinxcontrib.napoleon.html#sphinxcontrib.napoleon.Config
# NumPy-style docstring parsing is enabled and Google-style parsing is disabled.
napoleon_google_docstring = False
napoleon_numpy_docstring = True
napoleon_include_init_with_doc = False
napoleon_include_private_with_doc = False
napoleon_include_special_with_doc = False
napoleon_use_admonition_for_examples = False
napoleon_use_admonition_for_notes = False
napoleon_use_admonition_for_references = False
napoleon_use_ivar = True
napoleon_use_param = True
napoleon_use_rtype = False
napoleon_use_keyword = True

# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]

# The suffix(es) of source filenames.
# You can specify multiple suffixes as a list of strings:
#
# source_suffix = ['.rst', '.md']
source_suffix = ".rst"

# The master toctree document.
master_doc = "index"

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
#
# Sphinx 5 and later no longer accept ``None`` here (a warning is emitted and
# "en" is used instead), so the language is named explicitly.
language = "en"

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = "friendly"

autoclass_content = "both"
# -- Options for HTML output -------------------------------------------------

# Theme used for HTML and HTML Help pages. The Sphinx documentation lists the
# builtin themes.
html_theme = "alabaster"

html_css_files = ["css/custom.css"]

# Directories holding custom static files (style sheets and the like),
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]

# Sidebar templates: a dictionary mapping document-name patterns to lists of
# template names.
#
# Documents that match no pattern get the sidebars defined by the theme
# itself. Builtin themes use these templates by default:
# ``['localtoc.html', 'relations.html', 'sourcelink.html', 'searchbox.html']``.
html_sidebars = {
    "**": [
        "about.html",
        "navigation.html",
        "relations.html",
        "searchbox.html",
        "donate.html",
    ],
}

# Theme options are theme-specific and customize the look and feel of a theme
# further; the theme's own documentation lists the options it accepts.
html_theme_options = {
    "github_user": "ddbourgin",
    "github_repo": "numpy-ml",
    "description": "Machine learning, in NumPy",
    "github_button": True,
    "show_powered_by": False,
    "fixed_sidebar": True,
    "analytics_id": "UA-65839510-3",
    # "logo": "logo.png",
}
# -- Options for HTMLHelp output ---------------------------------------------

# Base name of the output file produced by the HTML help builder.
htmlhelp_basename = "numpy-mldoc"

# -- Options for LaTeX output ------------------------------------------------

# No LaTeX overrides are active; the commented entries show the available keys.
latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',
    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',
    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',
    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
}

# How the document tree is grouped into LaTeX files. Each entry is a tuple of
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (
        master_doc,
        "numpy-ml.tex",
        "numpy-ml Documentation",
        "David Bourgin",
        "manual",
    ),
]

# -- Options for manual page output ------------------------------------------

# One entry per manual page. Each entry is a tuple of
# (source start file, name, description, authors, manual section).
man_pages = [
    (master_doc, "numpy-ml", "numpy-ml Documentation", [author], 1),
]

# -- Options for Texinfo output ----------------------------------------------

# How the document tree is grouped into Texinfo files. Each entry is a tuple of
# (source start file, target name, title, author,
#  dir menu entry, description, category).
texinfo_documents = [
    (
        master_doc,
        "numpy-ml",
        "numpy-ml Documentation",
        author,
        "numpy-ml",
        "Machine learning, in NumPy.",
        "Miscellaneous",
    ),
]

# -- Options for Epub output -------------------------------------------------

# Bibliographic Dublin Core info.
epub_title = project

# The unique identifier of the text. This can be an ISBN number
# or the project homepage.
#
# epub_identifier = ''

# A unique identification for the text.
#
# epub_uid = ''

# Files that should not be packed into the epub file.
epub_exclude_files = ["search.html"]

autodoc_member_order = "bysource"
# -- Extension configuration -------------------------------------------------

# -- Options for intersphinx extension ---------------------------------------

# External projects whose object inventories are used for cross-references.
# The Python entry is pinned to the Python 3 documentation, and the NumPy
# entry points at the current documentation host over HTTPS (the previous
# docs.scipy.org address is no longer the canonical location).
intersphinx_mapping = {
    "python": ("https://docs.python.org/3/", None),
    "numpy": ("https://numpy.org/doc/stable/", None),
}

# -- Options for todo extension ----------------------------------------------

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = True
# -- Options for numpydocs extension -----------------------------------------
# https://numpydoc.readthedocs.io/en/latest/install.html

# Whether to produce plot:: directives for Examples sections that contain
# import matplotlib or from matplotlib import.
numpydoc_use_plots = True

# Whether to show all members of a class in the Methods and Attributes sections
# automatically. True by default.
numpydoc_show_class_members = True

# Whether to show all inherited members of a class in the Methods and
# Attributes sections automatically. If it's false, inherited members won't be
# shown. True by default.
numpydoc_show_inherited_class_members = True

# Whether to create a Sphinx table of contents for the lists of class methods
# and attributes. If a table of contents is made, Sphinx expects each entry to
# have a separate page. True by default.
numpydoc_class_members_toctree = False

# A regular expression matching citations which should be mangled to avoid
# conflicts due to duplication across the documentation. Defaults to [\w-]+.
numpydoc_citation_re = r"[\w-]+"

# Until version 0.8, parameter definitions were shown as blockquotes, rather
# than in a definition list. If your styling requires blockquotes, switch this
# config option to True. This option will be removed in version 0.10.
numpydoc_use_blockquotes = False

# Whether to format the Attributes section of a class page in the same way as
# the Parameter section. If it's False, the Attributes section will be
# formatted as the Methods section using an autosummary table. True by default.
numpydoc_attributes_as_param_list = False

# Whether to create cross-references for the parameter types in the Parameters,
# Other Parameters, Returns and Yields sections of the docstring. False by
# default.
numpydoc_xref_param_type = False

# Mappings to fully qualified paths (or correct ReST references) for the
# aliases/shortcuts used when specifying the types of parameters. The keys
# should not have any spaces. Together with the intersphinx extension, you can
# map to links in any documentation. The default is an empty dict. This
# option depends on the numpydoc_xref_param_type option being True.
numpydoc_xref_aliases = {}

# Words not to cross-reference. Most likely, these are common words used in
# parameter type descriptions that may be confused for classes of the same
# name. For example: {'type', 'optional', 'default'}. The default is an empty
# set.
numpydoc_xref_ignore = set()

# Whether to insert an edit link after docstrings. Deprecated by numpydoc:
# edit your HTML template instead.
# This was previously a bare ``numpydoc_edit_link: bool`` annotation, which
# never bound a value; it is now set explicitly to False.
numpydoc_edit_link = False
================================================
FILE: docs/index.rst
================================================
Welcome to numpy-ml
===================
`numpy-ml`_ is a growing collection of machine learning models, algorithms, and
tools written exclusively in `NumPy`_ and the Python `standard library`_.
The purpose of the project is to provide reference implementations of common
machine learning components for rapid prototyping and experimentation. With
that in mind, don't just read the docs -- read the source!
.. _numpy-ml: https://www.github.com/ddbourgin/numpy-ml
.. _NumPy: https://numpy.org/
.. _standard library: https://docs.python.org/3/library/
.. topic:: This documentation is under development!
We're working to expand our coverage. During this time there are likely to
be typos, bugs, and poorly-worded sections. If you encounter any of the
above, please file an `issue`_ or submit a `pull request`_!
.. _issue: https://github.com/ddbourgin/numpy-ml/issues
.. _pull request: https://github.com/ddbourgin/numpy-ml/pulls
.. toctree::
:maxdepth: 3
:hidden:
numpy_ml.hmm
numpy_ml.gmm
numpy_ml.lda
numpy_ml.ngram
numpy_ml.bandits
numpy_ml.rl_models
numpy_ml.nonparametric
numpy_ml.factorization
numpy_ml.trees
numpy_ml.neural_nets
numpy_ml.linear_models
numpy_ml.preprocessing
numpy_ml.utils
##########
Disclaimer
##########
This software is provided as-is: there are no guarantees that it fits your
purposes or that it is bug-free. Use it at your own risk!
================================================
FILE: docs/make.bat
================================================
@ECHO OFF
REM Switch to the directory containing this script so that the relative
REM SOURCEDIR and BUILDDIR below resolve against it; the matching popd is
REM at the :end label.
pushd %~dp0
REM Command file for Sphinx documentation
REM Use a caller-supplied SPHINXBUILD if set, otherwise default to sphinx-build.
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build
REM With no target argument, print the Sphinx help instead of building.
if "%1" == "" goto help
REM Probe for the executable with all output discarded; an errorlevel of 9009
REM or higher is handled as "command not found" (see the message below).
%SPHINXBUILD% >NUL 2>NUL
REM NOTE(review): the exit /b inside this branch leaves without running the
REM popd at :end.
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
REM Pass the requested target (%1) to sphinx-build via its -M option, along
REM with any extra options in SPHINXOPTS.
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
:end
popd
================================================
FILE: docs/numpy_ml.bandits.bandits.rst
================================================
Bandit Environments
===================
``Bandit``
-----------
.. autoclass:: numpy_ml.bandits.bandits.Bandit
:members:
:undoc-members:
:inherited-members:
``MultinomialBandit``
-------------------------
.. autoclass:: numpy_ml.bandits.MultinomialBandit
:members:
:undoc-members:
:show-inheritance:
``BernoulliBandit``
-----------------------
.. autoclass:: numpy_ml.bandits.BernoulliBandit
:members:
:undoc-members:
:show-inheritance:
``GaussianBandit``
----------------------
.. autoclass:: numpy_ml.bandits.GaussianBandit
:members:
:undoc-members:
:show-inheritance:
``ShortestPathBandit``
-----------------------
.. autoclass:: numpy_ml.bandits.ShortestPathBandit
:members:
:undoc-members:
:show-inheritance:
``ContextualBernoulliBandit``
------------------------------
.. autoclass:: numpy_ml.bandits.ContextualBernoulliBandit
:members:
:undoc-members:
:show-inheritance:
``ContextualLinearBandit``
------------------------------
.. autoclass:: numpy_ml.bandits.ContextualLinearBandit
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/numpy_ml.bandits.policies.rst
================================================
Policies
=========
``BanditPolicyBase``
--------------------
.. autoclass:: numpy_ml.bandits.policies.BanditPolicyBase
:members:
:undoc-members:
:inherited-members:
``EpsilonGreedy``
-----------------
.. autoclass:: numpy_ml.bandits.policies.EpsilonGreedy
:members:
:undoc-members:
:show-inheritance:
``UCB1``
--------
.. autoclass:: numpy_ml.bandits.policies.UCB1
:members:
:undoc-members:
:show-inheritance:
``ThompsonSamplingBetaBinomial``
--------------------------------
.. autoclass:: numpy_ml.bandits.policies.ThompsonSamplingBetaBinomial
:members:
:undoc-members:
:show-inheritance:
``LinUCB``
--------------------------------
.. autoclass:: numpy_ml.bandits.policies.LinUCB
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/numpy_ml.bandits.rst
================================================
Multi-armed bandits
###################
.. toctree::
:maxdepth: 3
numpy_ml.bandits.bandits
numpy_ml.bandits.policies
numpy_ml.bandits.trainer
================================================
FILE: docs/numpy_ml.bandits.trainer.rst
================================================
Trainer
=======
``BanditTrainer``
------------------
.. autoclass:: numpy_ml.bandits.trainer.BanditTrainer
:members:
:undoc-members:
:inherited-members:
================================================
FILE: docs/numpy_ml.factorization.factors.rst
================================================
``VanillaALS``
--------------
.. autoclass:: numpy_ml.factorization.VanillaALS
:members:
:undoc-members:
``NMF``
--------
.. autoclass:: numpy_ml.factorization.NMF
:members:
:undoc-members:
================================================
FILE: docs/numpy_ml.factorization.rst
================================================
Matrix factorization
####################
.. toctree::
:maxdepth: 3
numpy_ml.factorization.factors
================================================
FILE: docs/numpy_ml.gmm.gmm.rst
================================================
``GMM``
-------
.. autoclass:: numpy_ml.gmm.GMM
:members:
:undoc-members:
:inherited-members:
================================================
FILE: docs/numpy_ml.gmm.rst
================================================
#######################
Gaussian mixture models
#######################
A `Gaussian mixture model`_ (GMM) is a latent variable model commonly used for
unsupervised clustering.
.. figure:: img/gmm_model.png
:scale: 30 %
:align: center
Graphical model for a GMM with `K` mixture components and `N` data points.
.. _`Gaussian mixture model` : https://en.wikipedia.org/wiki/Mixture_model#Gaussian_mixture_model
A GMM assumes that:
1. The observed data are generated from a `mixture distribution`_, `P`,
made up of `K` mixture components.
2. Each mixture component is a multivariate Gaussian with its own mean
:math:`\mu`, covariance matrix, :math:`\Sigma`, and mixture weight,
:math:`\pi`.
.. 3. To generate a new data point, we sample a mixture component in
.. proportion to its prior probability, then draw a sample from the
.. distribution parameterized by that component's mean and covariance.
.. _mixture distribution: https://en.wikipedia.org/wiki/Mixture_distribution
The parameters of a GMM model are:
- :math:`\theta`, the set of parameters for each of the `K` mixture
components. :math:`\theta = \{ \mu_1, \Sigma_1, \pi_1, \ldots, \mu_K,
\Sigma_K, \pi_K \}`.
Under a GMM, the joint probability of a sequence of cluster assignments `Z` and an observed
dataset :math:`X = \{x_1, \ldots, x_N \}` is:
.. math::
p(Z, X \mid \theta) =
\prod_{i=1}^N p(z_i, x_i \mid \theta) =
\prod_{i=1}^N \prod_{k=1}^K
[\mathcal{N}(x_i \mid \mu_k, \Sigma_k) \pi_k ]^{\mathbb{1}_{[z_{i} = k]}}
where
- :math:`\theta` is the set of GMM parameters: :math:`\theta = \{ \mu_1,
\Sigma_1, \pi_1, \ldots, \mu_K, \Sigma_K, \pi_K \}`.
- :math:`z_i \in \{ 1, \ldots, K \}` is a latent variable reflecting the ID
of the mixture component that generated data point `i`.
- :math:`\mathbb{1}_{[z_i = k]}` is a binary indicator function returning
1 if data point :math:`x_i` was sampled from mixture component :math:`k`
and 0 otherwise.
As with other latent-variable models, we use the `expectation-maximization (EM)
algorithm`_ to learn the GMM parameters.
.. _expectation-maximization (EM) algorithm : https://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm
**Models**
- :class:`~numpy_ml.gmm.GMM`
**References**
.. [1] Bilmes, J. A. (1998). "A gentle tutorial of the EM algorithm and its
application to parameter estimation for Gaussian mixture and hidden
Markov models" *International Computer Science Institute, 4(510)*
https://www.inf.ed.ac.uk/teaching/courses/pmr/docs/EM.pdf
.. toctree::
:maxdepth: 2
:hidden:
numpy_ml.gmm.gmm
================================================
FILE: docs/numpy_ml.hmm.MultinomialHMM.rst
================================================
``MultinomialHMM``
------------------
.. autoclass:: numpy_ml.hmm.MultinomialHMM
:members:
:undoc-members:
:inherited-members:
================================================
FILE: docs/numpy_ml.hmm.rst
================================================
####################
Hidden Markov models
####################
A `hidden Markov model`_ (HMM) is a generative model for sequences of observations.
.. _`hidden Markov model` : https://en.wikipedia.org/wiki/Hidden_Markov_model
.. figure:: img/hmm_model.png
:scale: 25 %
:align: center
Graphical model for an HMM with :math:`T=4` timesteps.
An HMM assumes:
1. The observations, `O`, are generated by a process whose states,
:math:`S`, are *hidden* from the observer.
2. Each hidden state is a discrete random variable.
3. Given the hidden state at time :math:`t - 1`, the hidden state at time `t`
is conditionally independent of all earlier hidden states.
4. The observation :math:`O_t` is independent of all previous states and
observations given the current hidden state, :math:`S_t`.
The parameters of an HMM model are:
- :math:`\pi`, the prior specifying :math:`P(S_1)`.
- :math:`\theta`, the :math:`K \times K` transition matrix specifying
:math:`P(S_t \mid S_{t-1})`.
- :math:`\phi`, the output model defining :math:`P(O_t \mid S_t)`. If the
observations are discrete, this is a :math:`K \times L` emission matrix,
where `L` is the number of unique observation symbols.
The HMM joint distribution of a sequence of states and observations is:
.. math::
P(S_{1:T}, O_{1:T}) = P(S_1) P(O_1 \mid S_1) \prod_{t=2}^T P(S_t \mid S_{t-1})P(O_t \mid S_t)
where :math:`X_{1:T}` is shorthand for :math:`X_1, \ldots, X_T`.
As with other latent-variable models, we use the `expectation-maximization
(EM) algorithm`_ to learn the model parameters. The HMM-optimized version of
the EM algorithm is known as the `forward-backward`_ / `Baum-Welch algorithm`_.
.. _expectation-maximization (EM) algorithm : https://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm
.. _forward-backward: https://en.wikipedia.org/wiki/Forward%E2%80%93backward_algorithm
.. _Baum-Welch algorithm: https://en.wikipedia.org/wiki/Baum%E2%80%93Welch_algorithm
**Models**
- :class:`~numpy_ml.hmm.MultinomialHMM`
**References**
.. [1] Ghahramani, Z. (2001). "An Intro to HMMs and Bayesian networks".
*International Journal of Pattern Recognition and AI, 15(1)*: 9-42.
.. toctree::
:maxdepth: 2
:hidden:
numpy_ml.hmm.MultinomialHMM
================================================
FILE: docs/numpy_ml.lda.lda.rst
================================================
``LDA``
=======
.. autoclass:: numpy_ml.lda.LDA
:members:
:undoc-members:
:inherited-members:
================================================
FILE: docs/numpy_ml.lda.rst
================================================
###########################
Latent Dirichlet allocation
###########################
`Latent Dirichlet allocation`_ (LDA, commonly known as a topic model) is a
generative model for `bags of words`_.
.. _`Latent Dirichlet allocation` : https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation
.. _bags of words : https://en.wikipedia.org/wiki/Bag-of-words_model
.. figure:: img/lda_model_smoothed.png
:scale: 25 %
:align: center
The smoothed LDA model with `T` topics, `D` documents, and :math:`N_d` words per document.
In LDA, each word in a piece of text is associated with one of `T` latent
`topics`. A `document` is an unordered collection (bag) of words. During
inference, the goal is to estimate the probability of each word token under each
topic, along with the per-document topic mixture weights, using only the
observed text.
The parameters of the LDA model are:
- :math:`\theta`, the document-topic distribution. We use
:math:`\theta^{(i)}` to denote the parameters of the `categorical`_
distribution over topics associated with document :math:`i`.
- :math:`\phi`, the topic-word distribution. We use :math:`\phi^{(j)}` to
denote the parameters of the `categorical`_ distribution over words
associated with topic :math:`j`.
.. _categorical : https://en.wikipedia.org/wiki/Categorical_distribution
The standard LDA model [1]_ places a `Dirichlet`_ prior on :math:`\theta`:
.. math::
\theta^{(d)} \sim \text{Dir}(\alpha)
The smoothed/fully-Bayesian LDA model [2]_ adds an additional `Dirichlet`_ prior on :math:`\phi`:
.. math::
\phi^{(j)} \sim \text{Dir}(\beta)
.. _Dirichlet : https://en.wikipedia.org/wiki/Dirichlet_distribution
To generate a document with the smoothed LDA model, we:
1. Sample the parameters for the distribution over topics,
:math:`\theta \sim \text{Dir}(\alpha)`.
2. Sample a topic, :math:`z \sim \text{Cat}(\theta)`.
3. If we haven't already, sample the parameters for topic `z`'s categorical
distribution over words, :math:`\phi^{(z)} \sim \text{Dir}(\beta)`.
4. Sample a word, :math:`w \sim \text{Cat}(\phi^{(z)})`.
5. Repeat steps 2 through 4 until we have a bag of `N` words.
The joint distribution over words, topics, :math:`\theta`, and :math:`\phi`
under the smoothed LDA model is:
.. math::
P(w, z, \phi, \theta \mid \alpha, \beta) = \left( \prod_{t=1}^T \text{Dir}(\phi^{(t)}; \beta) \right) \prod_{d=1}^D \text{Dir}(\theta^{(d)}; \alpha) \prod_{n=1}^{N_d} P(z_n \mid \theta^{(d)}) P(w_n \mid \phi^{(z_n)})
The parameters of the LDA model can be learned using `variational expectation
maximization`_ or Markov chain Monte Carlo (e.g., `collapsed Gibbs sampling`_).
.. _`variational expectation maximization`: https://en.wikipedia.org/wiki/Variational_Bayesian_methods
.. _`collapsed Gibbs sampling`: https://en.wikipedia.org/wiki/Gibbs_sampling#Collapsed_Gibbs_sampler
**Models**
- :class:`~numpy_ml.lda.LDA`
- :class:`~numpy_ml.lda.SmoothedLDA`
**References**
.. [1] Blei, D., Ng, A., & Jordan, M. (2003). "Latent Dirichlet allocation". *Journal of
Machine Learning Research*, *3*, 993–1022.
.. [2] Griffiths, T. & Steyvers, M. (2004). "Finding scientific topics".
*PNAS*, *101(1)*, 5228-5235.
.. toctree::
:maxdepth: 3
:hidden:
numpy_ml.lda.lda
numpy_ml.lda.smoothed_lda
================================================
FILE: docs/numpy_ml.lda.smoothed_lda.rst
================================================
``SmoothedLDA``
===============
.. autoclass:: numpy_ml.lda.SmoothedLDA
:members:
:undoc-members:
:inherited-members:
:show-inheritance:
================================================
FILE: docs/numpy_ml.linear_models.lm.rst
================================================
``LinearRegression``
--------------------
.. autoclass:: numpy_ml.linear_models.LinearRegression
:members:
:undoc-members:
:inherited-members:
``RidgeRegression``
-------------------
.. autoclass:: numpy_ml.linear_models.RidgeRegression
:members:
:undoc-members:
:inherited-members:
``LogisticRegression``
----------------------
.. autoclass:: numpy_ml.linear_models.LogisticRegression
:members:
:undoc-members:
:inherited-members:
``BayesianLinearRegressionUnknownVariance``
-------------------------------------------
.. autoclass:: numpy_ml.linear_models.BayesianLinearRegressionUnknownVariance
:members:
:undoc-members:
:inherited-members:
``BayesianLinearRegressionKnownVariance``
-----------------------------------------
.. autoclass:: numpy_ml.linear_models.BayesianLinearRegressionKnownVariance
:members:
:undoc-members:
:inherited-members:
``GaussianNBClassifier``
-----------------------------------------
.. autoclass:: numpy_ml.linear_models.GaussianNBClassifier
:members:
:undoc-members:
:inherited-members:
``GeneralizedLinearModel``
-----------------------------------------
.. autoclass:: numpy_ml.linear_models.GeneralizedLinearModel
:members:
:undoc-members:
:inherited-members:
================================================
FILE: docs/numpy_ml.linear_models.rst
================================================
Linear models
#############
.. raw:: html
Ordinary and Weighted Linear Least Squares
In weighted linear least-squares regression (WLS), a real-valued target
:math:`y_i`, is modeled as a linear combination of covariates
:math:`\mathbf{x}_i` and model coefficients **b**:
.. math::
y_i = \mathbf{b}^\top \mathbf{x}_i + \epsilon_i
In the above equation, :math:`\epsilon_i \sim \mathcal{N}(0, \sigma_i^2)` is a
normally distributed error term with variance :math:`\sigma_i^2`. Ordinary
least squares (OLS) is a special case of this model where the variance is fixed
across all examples, i.e., :math:`\sigma_i = \sigma_j \ \forall i,j`. The
maximum likelihood model parameters, :math:`\hat{\mathbf{b}}_{WLS}`, are those
that minimize the weighted squared error between the model predictions and the
true values:
.. math::
\mathcal{L} = ||\mathbf{W}^{0.5}(\mathbf{y} - \mathbf{Xb})||_2^2
where :math:`\mathbf{W}` is a diagonal matrix of the example weights. In OLS,
:math:`\mathbf{W}` is the identity matrix. The maximum likelihood estimate for
the model parameters can be computed in closed-form using the normal equations:
.. math::
\hat{\mathbf{b}}_{WLS} =
(\mathbf{X}^\top \mathbf{WX})^{-1} \mathbf{X}^\top \mathbf{Wy}
**Models**
- :class:`~numpy_ml.linear_models.LinearRegression`
.. raw:: html
Ridge Regression
Ridge regression uses the same simple linear regression model but adds an
additional penalty on the `L2`-norm of the coefficients to the loss function.
This is sometimes known as Tikhonov regularization.
In particular, the ridge model is the same as the OLS model:
.. math::
\mathbf{y} = \mathbf{Xb} + \mathbf{\epsilon}
where :math:`\epsilon \sim \mathcal{N}(\mathbf{0}, \sigma^2 \mathbf{I})`,
except now the error for the model is calculated as
.. math::
\mathcal{L} = ||\mathbf{y} - \mathbf{Xb}||_2^2 + \alpha ||\mathbf{b}||_2^2
The model parameters **b** that minimize this penalized loss can be computed
in closed form via the adjusted normal equation:
.. math::
\hat{\mathbf{b}}_{Ridge} =
(\mathbf{X}^\top \mathbf{X} + \alpha \mathbf{I})^{-1} \mathbf{X}^\top \mathbf{y}
where :math:`(\mathbf{X}^\top \mathbf{X} + \alpha \mathbf{I})^{-1}
\mathbf{X}^\top` is the pseudoinverse / Moore-Penrose inverse adjusted for
the `L2` penalty on the model coefficients.
**Models**
- :class:`~numpy_ml.linear_models.RidgeRegression`
.. raw:: html
Bayesian Linear Regression
In its general form, Bayesian linear regression extends the simple linear
regression model by introducing priors on model parameters *b* and/or the
error variance :math:`\sigma^2`.
The introduction of a prior allows us to quantify the uncertainty in our
parameter estimates for b by replacing the MLE point estimate in simple
linear regression with an entire posterior *distribution*, :math:`p(b \mid X, y,
\sigma)`, simply by applying Bayes rule:
.. math::
p(b \mid X, y) = \frac{ p(y \mid X, b) p(b \mid \sigma) }{p(y \mid X)}
We can also quantify the uncertainty in our predictions :math:`y^*` for some new
data :math:`X^*` with the posterior predictive distribution:
.. math::
p(y^* \mid X^*, X, y) = \int_{b} p(y^* \mid X^*, b) p(b \mid X, y) \ \text{d}b
Depending on the choice of prior it may be impossible to compute an
analytic form for the posterior / posterior predictive distribution. In
these cases, it is common to use approximations, either via MCMC or
variational inference.
.. raw:: html
Known variance
--------------------------------
If we happen to already know the error variance :math:`\sigma^2`, the conjugate
prior on `b` is Gaussian. A common parameterization is:
.. math::
b | \sigma, V \sim \mathcal{N}(\mu, \sigma^2 V)
where :math:`\mu`, :math:`\sigma` and :math:`V` are hyperparameters. Ridge
regression is a special case of this model where :math:`\mu = 0`,
:math:`\sigma = 1` and :math:`V = I` (i.e., the prior on *b* is a zero-mean,
unit covariance Gaussian).
Due to the conjugacy of the above prior with the Gaussian likelihood, there
exists a closed-form solution for the posterior over the model
parameters:
.. math::
A &= (V^{-1} + X^\top X)^{-1} \\
\mu_b &= A V^{-1} \mu + A X^\top y \\
\Sigma_b &= \sigma^2 A \\
The model posterior is then
.. math::
b \mid X, y \sim \mathcal{N}(\mu_b, \Sigma_b)
We can also compute a closed-form solution for the posterior predictive
distribution:
.. math::
y^* \mid X^*, X, y \sim \mathcal{N}(X^* \mu_b, \ \ X^* \Sigma_b X^{* \top} + I)
where :math:`X^*` is the matrix of new data we wish to predict, and :math:`y^*`
are the predicted targets for those data.
**Models**
- :class:`~numpy_ml.linear_models.BayesianLinearRegressionKnownVariance`
.. raw:: html
Unknown variance
--------------------------------
If *both* *b* and the error variance :math:`\sigma^2` are unknown, the
conjugate prior for the Gaussian likelihood is the Normal-Gamma
distribution (univariate likelihood) or the Normal-Inverse-Wishart
distribution (multivariate likelihood).
**Univariate**
.. math::
b, \sigma^2 &\sim \text{NG}(\mu, V, \alpha, \beta) \\
\sigma^2 &\sim \text{InverseGamma}(\alpha, \beta) \\
b \mid \sigma^2 &\sim \mathcal{N}(\mu, \sigma^2 V)
where :math:`\alpha, \beta, V`, and :math:`\mu` are parameters of the
prior.
**Multivariate**
.. math::
b, \Sigma &\sim \mathcal{NIW}(\mu, \lambda, \Psi, \rho) \\
\Sigma &\sim \mathcal{W}^{-1}(\Psi, \rho) \\
b \mid \Sigma &\sim \mathcal{N}(\mu, \frac{1}{\lambda} \Sigma)
where :math:`\mu, \lambda, \Psi`, and :math:`\rho` are
parameters of the prior.
Due to the conjugacy of the above priors with the Gaussian likelihood,
there exists a closed-form solution for the posterior over the model
parameters:
.. math::
B &= y - X \mu \\
\text{shape} &= N + \alpha \\
\text{scale} &= \frac{1}{\text{shape}} (\alpha \beta + B^\top (X V X^\top + I)^{-1} B) \\
where
.. math::
\sigma^2 \mid X, y &\sim \text{InverseGamma}(\text{shape}, \text{scale}) \\
A &= (V^{-1} + X^\top X)^{-1} \\
\mu_b &= A V^{-1} \mu + A X^\top y \\
\Sigma_b &= \sigma^2 A
The model posterior is then
.. math::
b | X, y, \sigma^2 \sim \mathcal{N}(\mu_b, \Sigma_b)
We can also compute a closed-form solution for the posterior predictive distribution:
.. math::
y^* \mid X^*, X, y \sim \mathcal{N}(X^* \mu_b, \ X^* \Sigma_b X^{* \top} + I)
**Models**
- :class:`~numpy_ml.linear_models.BayesianLinearRegressionUnknownVariance`
.. raw:: html
Naive Bayes Classifier
The naive Bayes model assumes the features of a training example
:math:`\mathbf{x}` are mutually independent given the example label :math:`y`:
.. math::
P(\mathbf{x}_i \mid y_i) = \prod_{j=1}^M P(x_{i,j} \mid y_i)
where :math:`M` is the number of features (the dimensionality) of the
:math:`i^{th}` example :math:`\mathbf{x}_i` and :math:`y_i` is the label
associated with the :math:`i^{th}` example.
Combining this conditional independence assumption with a simple application of
Bayes' theorem gives the naive Bayes classification rule:
.. math::
\hat{y} &= \arg \max_y P(y \mid \mathbf{x}) \\
&= \arg \max_y P(y) P(\mathbf{x} \mid y) \\
&= \arg \max_y P(y) \prod_{j=1}^M P(x_j \mid y)
The prior class probability :math:`P(y)` can be specified in advance or
estimated empirically from the training data.
**Models**
- :class:`~numpy_ml.linear_models.GaussianNBClassifier`
.. raw:: html
Generalized Linear Model
The generalized linear model (GLM) assumes that each target/dependent variable
:math:`y_i` in target vector :math:`\mathbf{y} = (y_1, \ldots, y_n)`, has been
drawn independently from a pre-specified distribution in the exponential family
with unknown mean :math:`\mu_i`. The GLM models a (one-to-one, continuous,
differentiable) function, *g*, of this mean value as a linear combination of
the model parameters :math:`\mathbf{b}` and observed covariates,
:math:`\mathbf{x}_i` :
.. math::
g(\mathbb{E}[y_i \mid \mathbf{x}_i]) =
g(\mu_i) = \mathbf{b}^\top \mathbf{x}_i
where *g* is known as the link function. The choice of link function is
informed by the instance of the exponential family the target is drawn from.
**Models**
- :class:`~numpy_ml.linear_models.GeneralizedLinearModel`
.. toctree::
:maxdepth: 2
:hidden:
numpy_ml.linear_models.lm
================================================
FILE: docs/numpy_ml.neural_nets.activations.rst
================================================
Activations
===========
Popular (and some not-so-popular) activation functions for use within arbitrary
neural networks.
``Affine``
-----------
.. autoclass:: numpy_ml.neural_nets.activations.Affine
:members:
:undoc-members:
:inherited-members:
``ELU``
-----------
.. autoclass:: numpy_ml.neural_nets.activations.ELU
:members:
:undoc-members:
:inherited-members:
``Exponential``
---------------
.. autoclass:: numpy_ml.neural_nets.activations.Exponential
:members:
:undoc-members:
:inherited-members:
``HardSigmoid``
---------------
.. autoclass:: numpy_ml.neural_nets.activations.HardSigmoid
:members:
:undoc-members:
:inherited-members:
``Identity``
---------------
.. autoclass:: numpy_ml.neural_nets.activations.Identity
:members:
:undoc-members:
:inherited-members:
``LeakyReLU``
-------------
.. autoclass:: numpy_ml.neural_nets.activations.LeakyReLU
:members:
:undoc-members:
:inherited-members:
``ReLU``
---------
.. autoclass:: numpy_ml.neural_nets.activations.ReLU
:members:
:undoc-members:
:inherited-members:
``SELU``
---------
.. autoclass:: numpy_ml.neural_nets.activations.SELU
:members:
:undoc-members:
:inherited-members:
``GELU``
-----------
.. autoclass:: numpy_ml.neural_nets.activations.GELU
:members:
:undoc-members:
:inherited-members:
``Sigmoid``
------------
.. autoclass:: numpy_ml.neural_nets.activations.Sigmoid
:members:
:undoc-members:
:inherited-members:
``SoftPlus``
------------
.. autoclass:: numpy_ml.neural_nets.activations.SoftPlus
:members:
:undoc-members:
:inherited-members:
``Tanh``
---------
.. autoclass:: numpy_ml.neural_nets.activations.Tanh
:members:
:undoc-members:
:inherited-members:
================================================
FILE: docs/numpy_ml.neural_nets.initializers.rst
================================================
Initializers
=============
``ActivationInitializer``
--------------------------
.. autoclass:: numpy_ml.neural_nets.initializers.ActivationInitializer
:members:
:undoc-members:
:inherited-members:
``OptimizerInitializer``
--------------------------
.. autoclass:: numpy_ml.neural_nets.initializers.OptimizerInitializer
:members:
:undoc-members:
:inherited-members:
``SchedulerInitializer``
--------------------------
.. autoclass:: numpy_ml.neural_nets.initializers.SchedulerInitializer
:members:
:undoc-members:
:inherited-members:
``WeightInitializer``
------------------------
.. autoclass:: numpy_ml.neural_nets.initializers.WeightInitializer
:members:
:undoc-members:
:inherited-members:
================================================
FILE: docs/numpy_ml.neural_nets.layers.rst
================================================
Layers
======
``LayerBase``
-------------
.. autoclass:: numpy_ml.neural_nets.layers.layers.LayerBase
:members:
:undoc-members:
:inherited-members:
``Add``
-------
.. autoclass:: numpy_ml.neural_nets.layers.Add
:members:
:undoc-members:
:show-inheritance:
``BatchNorm1D``
---------------
.. autoclass:: numpy_ml.neural_nets.layers.BatchNorm1D
:members:
:undoc-members:
:show-inheritance:
``BatchNorm2D``
---------------
.. autoclass:: numpy_ml.neural_nets.layers.BatchNorm2D
:members:
:undoc-members:
:show-inheritance:
``Conv1D``
----------
.. autoclass:: numpy_ml.neural_nets.layers.Conv1D
:members:
:undoc-members:
:show-inheritance:
``Conv2D``
----------
.. autoclass:: numpy_ml.neural_nets.layers.Conv2D
:members:
:undoc-members:
:show-inheritance:
``Deconv2D``
------------
.. autoclass:: numpy_ml.neural_nets.layers.Deconv2D
:members:
:undoc-members:
:show-inheritance:
``DotProductAttention``
-----------------------
.. autoclass:: numpy_ml.neural_nets.layers.DotProductAttention
:members:
:undoc-members:
:show-inheritance:
``Embedding``
-------------
.. autoclass:: numpy_ml.neural_nets.layers.Embedding
:members:
:undoc-members:
:show-inheritance:
``Flatten``
-----------
.. autoclass:: numpy_ml.neural_nets.layers.Flatten
:members:
:undoc-members:
:show-inheritance:
``FullyConnected``
------------------
.. autoclass:: numpy_ml.neural_nets.layers.FullyConnected
:members:
:undoc-members:
:show-inheritance:
``LSTM``
--------
.. autoclass:: numpy_ml.neural_nets.layers.LSTM
:members:
:undoc-members:
:show-inheritance:
``LSTMCell``
------------
.. autoclass:: numpy_ml.neural_nets.layers.LSTMCell
:members:
:undoc-members:
:show-inheritance:
``LayerNorm1D``
---------------
.. autoclass:: numpy_ml.neural_nets.layers.LayerNorm1D
:members:
:undoc-members:
:show-inheritance:
``LayerNorm2D``
---------------
.. autoclass:: numpy_ml.neural_nets.layers.LayerNorm2D
:members:
:undoc-members:
:show-inheritance:
``Multiply``
------------
.. autoclass:: numpy_ml.neural_nets.layers.Multiply
:members:
:undoc-members:
:show-inheritance:
``Pool2D``
------------
.. autoclass:: numpy_ml.neural_nets.layers.Pool2D
:members:
:undoc-members:
:show-inheritance:
``RNN``
-------
.. autoclass:: numpy_ml.neural_nets.layers.RNN
:members:
:undoc-members:
:show-inheritance:
``RNNCell``
-----------
.. autoclass:: numpy_ml.neural_nets.layers.RNNCell
:members:
:undoc-members:
:show-inheritance:
``RBM``
-------------------------------
.. autoclass:: numpy_ml.neural_nets.layers.RBM
:members:
:undoc-members:
:show-inheritance:
``Softmax``
-----------
.. autoclass:: numpy_ml.neural_nets.layers.Softmax
:members:
:undoc-members:
:show-inheritance:
``SparseEvolution``
-------------------
.. autoclass:: numpy_ml.neural_nets.layers.SparseEvolution
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/numpy_ml.neural_nets.losses.rst
================================================
Loss functions
==============
``CrossEntropy``
----------------
.. autoclass:: numpy_ml.neural_nets.losses.CrossEntropy
:members:
:undoc-members:
:inherited-members:
``SquaredError``
----------------
.. autoclass:: numpy_ml.neural_nets.losses.SquaredError
:members:
:undoc-members:
:inherited-members:
``NCELoss``
-----------
.. autoclass:: numpy_ml.neural_nets.losses.NCELoss
:members:
:undoc-members:
:inherited-members:
``VAELoss``
-----------
.. autoclass:: numpy_ml.neural_nets.losses.VAELoss
:members:
:undoc-members:
:inherited-members:
``WGAN_GPLoss``
---------------
.. autoclass:: numpy_ml.neural_nets.losses.WGAN_GPLoss
:members:
:undoc-members:
:inherited-members:
================================================
FILE: docs/numpy_ml.neural_nets.models.rst
================================================
Full networks
==============
``WGAN_GP``
-----------
.. autoclass:: numpy_ml.neural_nets.models.WGAN_GP
:members:
:undoc-members:
:inherited-members:
``BernoulliVAE``
----------------
.. autoclass:: numpy_ml.neural_nets.models.BernoulliVAE
:members:
:undoc-members:
:inherited-members:
``Word2Vec``
------------
.. autoclass:: numpy_ml.neural_nets.models.Word2Vec
:members:
:undoc-members:
:inherited-members:
================================================
FILE: docs/numpy_ml.neural_nets.modules.rst
================================================
Modules
========
``BidirectionalLSTM``
---------------------
.. autoclass:: numpy_ml.neural_nets.modules.BidirectionalLSTM
:members:
:undoc-members:
``MultiHeadedAttentionModule``
------------------------------
.. autoclass:: numpy_ml.neural_nets.modules.MultiHeadedAttentionModule
:members:
:undoc-members:
``SkipConnectionConvModule``
------------------------------
.. autoclass:: numpy_ml.neural_nets.modules.SkipConnectionConvModule
:members:
:undoc-members:
``SkipConnectionIdentityModule``
--------------------------------
.. autoclass:: numpy_ml.neural_nets.modules.SkipConnectionIdentityModule
:members:
:undoc-members:
``WavenetResidualModule``
------------------------------
.. autoclass:: numpy_ml.neural_nets.modules.WavenetResidualModule
:members:
:undoc-members:
================================================
FILE: docs/numpy_ml.neural_nets.optimizers.rst
================================================
Optimizers
===========
Popular gradient-based strategies for optimizing parameters in neural networks.
For a discussion regarding the generalization performance of the solutions
found via different optimization strategies, see:
.. [1] Wilson et al. (2017) "The marginal value of adaptive gradient methods in machine
learning", *Proceedings of the 31st Conference on Neural Information Processing Systems*
https://arxiv.org/pdf/1705.08292.pdf
``OptimizerBase``
-----------------
.. autoclass:: numpy_ml.neural_nets.optimizers.optimizers.OptimizerBase
:members:
:undoc-members:
:show-inheritance:
``SGD``
-----------
.. autoclass:: numpy_ml.neural_nets.optimizers.SGD
:members:
:undoc-members:
:show-inheritance:
``AdaGrad``
-----------
.. autoclass:: numpy_ml.neural_nets.optimizers.AdaGrad
:members:
:undoc-members:
:show-inheritance:
``Adam``
-----------
.. autoclass:: numpy_ml.neural_nets.optimizers.Adam
:members:
:undoc-members:
:show-inheritance:
``RMSProp``
-----------
.. autoclass:: numpy_ml.neural_nets.optimizers.RMSProp
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/numpy_ml.neural_nets.rst
================================================
Neural networks
###############
The neural network module includes common building blocks for implementing
modern `deep learning`_ models.
.. _`deep learning`: https://en.wikipedia.org/wiki/Deep_learning
.. raw:: html
Layers
Most modern neural networks can be represented as a `composition`_ of
many small, parametric functions. The functions in this composition are
commonly referred to as the "layers" of the network. As an example, the
multilayer perceptron (MLP) below computes the function :math:`(f
\circ g \circ h)`, where `f`, `g`, and `h` are the individual network layers.
.. figure:: img/mlp_model.png
:scale: 40 %
:align: center
A multilayer perceptron with three layers labeled `f`, `g`, and `h`.
Many neural network layers are parametric: they express different
transformations depending on the setting of their weights (coefficients),
biases (intercepts), and/or other tunable values. These parameters are adjusted
during training to improve the performance of the network on a particular
metric.
The :doc:`numpy_ml.neural_nets.layers` module contains a number of common
transformations that can be composed to create larger networks.
.. _`composition`: https://en.wikipedia.org/wiki/Function_composition
**Layers**
+-----------------------------------------------------+-------------------------------------------------------------+---------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.layers.Add` | - :class:`~numpy_ml.neural_nets.layers.Deconv2D` | - :class:`~numpy_ml.neural_nets.layers.LSTM` |
+-----------------------------------------------------+-------------------------------------------------------------+---------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.layers.BatchNorm1D` | - :class:`~numpy_ml.neural_nets.layers.DotProductAttention` | - :class:`~numpy_ml.neural_nets.layers.LSTMCell` |
+-----------------------------------------------------+-------------------------------------------------------------+---------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.layers.BatchNorm2D` | - :class:`~numpy_ml.neural_nets.layers.Embedding` | - :class:`~numpy_ml.neural_nets.layers.LayerNorm1D` |
+-----------------------------------------------------+-------------------------------------------------------------+---------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.layers.Conv1D` | - :class:`~numpy_ml.neural_nets.layers.Flatten` | - :class:`~numpy_ml.neural_nets.layers.LayerNorm2D` |
+-----------------------------------------------------+-------------------------------------------------------------+---------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.layers.Conv2D` | - :class:`~numpy_ml.neural_nets.layers.FullyConnected` | - :class:`~numpy_ml.neural_nets.layers.Multiply` |
+-----------------------------------------------------+-------------------------------------------------------------+---------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.layers.Pool2D` | - :class:`~numpy_ml.neural_nets.layers.RNN` | - :class:`~numpy_ml.neural_nets.layers.RNNCell` |
+-----------------------------------------------------+-------------------------------------------------------------+---------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.layers.RBM` | - :class:`~numpy_ml.neural_nets.layers.Softmax` | - :class:`~numpy_ml.neural_nets.layers.SparseEvolution` |
+-----------------------------------------------------+-------------------------------------------------------------+---------------------------------------------------------+
.. raw:: html
Activations
Each unit in a neural network sums its input and passes it through an
`activation function`_ before sending it on to its outgoing weights. Activation
functions in most modern networks are real-valued, non-linear functions that
are inexpensive to compute and easily differentiable.
The :doc:`Activations <numpy_ml.neural_nets.activations>` module contains a
number of common activation functions.
.. _`activation function`: https://en.wikipedia.org/wiki/Activation_function
**Activations**
+----------------------------------------------------------+--------------------------------------------------------+-------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.activations.Affine` | - :class:`~numpy_ml.neural_nets.activations.Identity` | - :class:`~numpy_ml.neural_nets.activations.Sigmoid` |
+----------------------------------------------------------+--------------------------------------------------------+-------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.activations.ELU` | - :class:`~numpy_ml.neural_nets.activations.LeakyReLU` | - :class:`~numpy_ml.neural_nets.activations.SoftPlus` |
| - :class:`~numpy_ml.neural_nets.activations.Exponential` | - :class:`~numpy_ml.neural_nets.activations.ReLU` | - :class:`~numpy_ml.neural_nets.activations.Tanh` |
| - :class:`~numpy_ml.neural_nets.activations.HardSigmoid` | - :class:`~numpy_ml.neural_nets.activations.SELU` | |
+----------------------------------------------------------+--------------------------------------------------------+-------------------------------------------------------+
.. raw:: html
Losses
Training a neural network involves searching for layer parameters that optimize
the network's performance on a given task. `Loss functions`_ are the
quantitative metrics we use to measure how well the network is performing. Loss
functions are typically scalar-valued functions of a network's output on some
training data.
The :doc:`Losses <numpy_ml.neural_nets.losses>` module contains loss functions
for a number of common tasks.
.. _`Loss functions`: https://en.wikipedia.org/wiki/Loss_function
**Losses**
+------------------------------------------------------+-------------------------------------------------+-----------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.losses.CrossEntropy` | - :class:`~numpy_ml.neural_nets.losses.NCELoss` | - :class:`~numpy_ml.neural_nets.losses.WGAN_GPLoss` |
+------------------------------------------------------+-------------------------------------------------+-----------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.losses.SquaredError` | - :class:`~numpy_ml.neural_nets.losses.VAELoss` | |
+------------------------------------------------------+-------------------------------------------------+-----------------------------------------------------+
.. raw:: html
Optimizers
The :doc:`Optimizers <numpy_ml.neural_nets.optimizers>` module contains several
popular gradient-based strategies for adjusting the parameters of a neural
network to optimize a loss function. The proper choice of optimization strategy
can help reduce training time / speed up convergence, though see [1]_ for a
discussion on the generalization performance of the solutions identified via
different strategies.
.. [1] Wilson, A. C., Roelofs, R., Stern, M., Srebro, N., & Recht, B. (2017)
"The marginal value of adaptive gradient methods in machine learning",
*Proceedings of the 31st Conference on Neural Information Processing
Systems*. https://arxiv.org/pdf/1705.08292.pdf
**Optimizers**
+-------------------------------------------------+-----------------------------------------------------+--------------------------------------------------+-----------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.optimizers.SGD` | - :class:`~numpy_ml.neural_nets.optimizers.AdaGrad` | - :class:`~numpy_ml.neural_nets.optimizers.Adam` | - :class:`~numpy_ml.neural_nets.optimizers.RMSProp` |
+-------------------------------------------------+-----------------------------------------------------+--------------------------------------------------+-----------------------------------------------------+
.. raw:: html
Learning Rate Schedulers
It is common to reduce an optimizer's learning rate(s) over the course of
training in order to eke out additional performance improvements. The
:doc:`Schedulers <numpy_ml.neural_nets.schedulers>` module contains several
strategies for automatically adjusting the learning rate as a function of the
number of elapsed training steps.
**Schedulers**
+---------------------------------------------------------------+------------------------------------------------------------------+-----------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.schedulers.ConstantScheduler` | - :class:`~numpy_ml.neural_nets.schedulers.ExponentialScheduler` | - :class:`~numpy_ml.neural_nets.schedulers.KingScheduler` |
+---------------------------------------------------------------+------------------------------------------------------------------+-----------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.schedulers.NoamScheduler` | | |
+---------------------------------------------------------------+------------------------------------------------------------------+-----------------------------------------------------------+
.. raw:: html
Wrappers
The :doc:`Wrappers <numpy_ml.neural_nets.wrappers>` module contains classes
that wrap or otherwise modify the behavior of a network layer.
**Wrappers**
- :class:`~numpy_ml.neural_nets.wrappers.Dropout`
.. raw:: html
Modules
Many deep networks consist of stacks of repeated modules. These modules, often
consisting of several layers / layer operations, can themselves be abstracted
in order to simplify the building of more complex networks. The :doc:`Modules
<numpy_ml.neural_nets.modules>` module contains a few common architectural
patterns that appear across a number of popular deep learning approaches.
**Modules**
+-----------------------------------------------------------------------+---------------------------------------------------------------------+-------------------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.modules.BidirectionalLSTM` | - :class:`~numpy_ml.neural_nets.modules.MultiHeadedAttentionModule` | - :class:`~numpy_ml.neural_nets.modules.SkipConnectionConvModule` |
+-----------------------------------------------------------------------+---------------------------------------------------------------------+-------------------------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.modules.SkipConnectionIdentityModule` | - :class:`~numpy_ml.neural_nets.modules.WavenetResidualModule` | |
+-----------------------------------------------------------------------+---------------------------------------------------------------------+-------------------------------------------------------------------+
.. raw:: html
Full Networks
The :doc:`Models <numpy_ml.neural_nets.models>` module contains implementations
of several well-known neural networks from recent papers.
**Full Networks**
- :class:`~numpy_ml.neural_nets.models.WGAN_GP`
- :class:`~numpy_ml.neural_nets.models.BernoulliVAE`
- :class:`~numpy_ml.neural_nets.models.Word2Vec`
.. raw:: html
Utilities
The :doc:`Utilities <numpy_ml.neural_nets.utils>` module contains a number of
helper functions for dealing with weight initialization, convolution
arithmetic, padding, and minibatching.
**Utilities**
+---------------------------------------------------------+---------------------------------------------------------+-----------------------------------------------------------+--------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.utils.minibatch` | - :class:`~numpy_ml.neural_nets.utils.pad1D` | - :class:`~numpy_ml.neural_nets.utils.calc_fan` | - :class:`~numpy_ml.neural_nets.utils.col2im` |
+---------------------------------------------------------+---------------------------------------------------------+-----------------------------------------------------------+--------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.utils.calc_pad_dims_2D` | - :class:`~numpy_ml.neural_nets.utils.pad2D`            | - :class:`~numpy_ml.neural_nets.utils.calc_conv_out_dims` | - :class:`~numpy_ml.neural_nets.utils.conv2D`    |
+---------------------------------------------------------+---------------------------------------------------------+-----------------------------------------------------------+--------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.utils.calc_pad_dims_1D` | - :class:`~numpy_ml.neural_nets.utils.dilate` | - :class:`~numpy_ml.neural_nets.utils.im2col` | - :class:`~numpy_ml.neural_nets.utils.conv1D` |
+---------------------------------------------------------+---------------------------------------------------------+-----------------------------------------------------------+--------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.utils.deconv2D_naive` | - :class:`~numpy_ml.neural_nets.utils.conv2D_naive` | - :class:`~numpy_ml.neural_nets.utils.he_uniform` | - :class:`~numpy_ml.neural_nets.utils.he_normal` |
+---------------------------------------------------------+---------------------------------------------------------+-----------------------------------------------------------+--------------------------------------------------+
| - :class:`~numpy_ml.neural_nets.utils.glorot_uniform`   | - :class:`~numpy_ml.neural_nets.utils.truncated_normal` | - :class:`~numpy_ml.neural_nets.utils.glorot_normal`      |                                                  |
+---------------------------------------------------------+---------------------------------------------------------+-----------------------------------------------------------+--------------------------------------------------+
.. toctree::
:maxdepth: 3
:hidden:
numpy_ml.neural_nets.layers
numpy_ml.neural_nets.activations
numpy_ml.neural_nets.losses
numpy_ml.neural_nets.optimizers
numpy_ml.neural_nets.schedulers
numpy_ml.neural_nets.wrappers
numpy_ml.neural_nets.modules
numpy_ml.neural_nets.models
numpy_ml.neural_nets.utils
================================================
FILE: docs/numpy_ml.neural_nets.schedulers.rst
================================================
Learning rate schedulers
=========================
``ConstantScheduler``
---------------------
.. autoclass:: numpy_ml.neural_nets.schedulers.ConstantScheduler
:members:
:undoc-members:
:inherited-members:
``ExponentialScheduler``
------------------------
.. autoclass:: numpy_ml.neural_nets.schedulers.ExponentialScheduler
:members:
:undoc-members:
:inherited-members:
``KingScheduler``
------------------------
.. autoclass:: numpy_ml.neural_nets.schedulers.KingScheduler
:members:
:undoc-members:
:inherited-members:
``NoamScheduler``
------------------------
.. autoclass:: numpy_ml.neural_nets.schedulers.NoamScheduler
:members:
:undoc-members:
:inherited-members:
================================================
FILE: docs/numpy_ml.neural_nets.utils.rst
================================================
Utilities
==========
``minibatch``
-------------
.. autofunction:: numpy_ml.neural_nets.utils.minibatch
``calc_pad_dims_2D``
--------------------
.. autofunction:: numpy_ml.neural_nets.utils.calc_pad_dims_2D
``calc_pad_dims_1D``
--------------------
.. autofunction:: numpy_ml.neural_nets.utils.calc_pad_dims_1D
``pad1D``
--------------------
.. autofunction:: numpy_ml.neural_nets.utils.pad1D
``pad2D``
--------------------
.. autofunction:: numpy_ml.neural_nets.utils.pad2D
``dilate``
--------------------
.. autofunction:: numpy_ml.neural_nets.utils.dilate
``calc_fan``
--------------------
.. autofunction:: numpy_ml.neural_nets.utils.calc_fan
``calc_conv_out_dims``
-----------------------
.. autofunction:: numpy_ml.neural_nets.utils.calc_conv_out_dims
``im2col``
-----------------------
.. autofunction:: numpy_ml.neural_nets.utils.im2col
``col2im``
-----------------------
.. autofunction:: numpy_ml.neural_nets.utils.col2im
``conv2D``
-----------------------
.. autofunction:: numpy_ml.neural_nets.utils.conv2D
``conv1D``
-----------------------
.. autofunction:: numpy_ml.neural_nets.utils.conv1D
``deconv2D_naive``
-----------------------
.. autofunction:: numpy_ml.neural_nets.utils.deconv2D_naive
``conv2D_naive``
-----------------------
.. autofunction:: numpy_ml.neural_nets.utils.conv2D_naive
``he_uniform``
-----------------------
.. autofunction:: numpy_ml.neural_nets.utils.he_uniform
``he_normal``
-----------------------
.. autofunction:: numpy_ml.neural_nets.utils.he_normal
``glorot_uniform``
-----------------------
.. autofunction:: numpy_ml.neural_nets.utils.glorot_uniform
``glorot_normal``
-----------------------
.. autofunction:: numpy_ml.neural_nets.utils.glorot_normal
``truncated_normal``
-----------------------
.. autofunction:: numpy_ml.neural_nets.utils.truncated_normal
================================================
FILE: docs/numpy_ml.neural_nets.wrappers.rst
================================================
Wrappers
=========
``WrapperBase``
---------------
.. autoclass:: numpy_ml.neural_nets.wrappers.wrappers.WrapperBase
:members:
:undoc-members:
:inherited-members:
``Dropout``
-----------
.. autoclass:: numpy_ml.neural_nets.wrappers.Dropout
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/numpy_ml.ngram.additive.rst
================================================
``AdditiveNGram``
-----------------
.. autoclass:: numpy_ml.ngram.AdditiveNGram
:members:
:undoc-members:
:inherited-members:
================================================
FILE: docs/numpy_ml.ngram.goodturing.rst
================================================
``GoodTuringNGram``
-------------------
.. autoclass:: numpy_ml.ngram.GoodTuringNGram
:members:
:undoc-members:
:inherited-members:
================================================
FILE: docs/numpy_ml.ngram.mle.rst
================================================
``MLENGram``
------------
.. autoclass:: numpy_ml.ngram.MLENGram
:members:
:undoc-members:
:inherited-members:
================================================
FILE: docs/numpy_ml.ngram.rst
================================================
#######################
N-gram smoothing models
#######################
When dealing with `n-gram`_ models, smoothing refers to the practice of
adjusting empirical probability estimates to account for insufficient data.
In the descriptions below, we use the notation :math:`w^{j}_{i}`, :math:`i < j`, to
denote the `(j - i + 1)`-gram :math:`(w_{i}, w_{i+1}, \ldots, w_{j})`.
.. raw:: html
Laplace Smoothing
`Laplace smoothing`_ is the assumption that each `n`-gram in a corpus occurs
exactly one more time than it actually does.
.. math::
p(w_i \mid w^{i-1}_{i-n+1}) = \frac{1 + c(w^{i}_{i-n+1})}{|V| + \sum_{w_i} c(w^{i}_{i-n+1})}
where :math:`c(a)` denotes the empirical count of the `n`-gram :math:`a` in the
corpus, and :math:`|V|` corresponds to the number of unique `n`-grams in the
corpus.
.. _`Laplace smoothing`: https://en.wikipedia.org/wiki/Additive_smoothing
**Models**
- :class:`~numpy_ml.ngram.AdditiveNGram`
.. raw:: html
Additive/Lidstone Smoothing
`Additive/Lidstone smoothing`_ is a generalization of Laplace smoothing, where we
assume that each `n`-gram in a corpus occurs `k` more times than it actually
does (where `k` can be any non-negative value, but typically ranges between `[0, 1]`):
.. math::
p(w_i \mid w^{i-1}_{i-n+1}) = \frac{k + c(w^{i}_{i-n+1})}{k |V| + \sum_{w_i} c(w^{i}_{i-n+1})}
where :math:`c(a)` denotes the empirical count of the `n`-gram :math:`a` in the
corpus, and :math:`|V|` corresponds to the number of unique `n`-grams in the
corpus.
.. _`Additive/Lidstone smoothing`: https://en.wikipedia.org/wiki/Additive_smoothing
**Models**
- :class:`~numpy_ml.ngram.AdditiveNGram`
.. raw:: html
Good-Turing Smoothing
`Good-Turing smoothing`_ is a more sophisticated technique which takes into
account the identity of the particular `n`-gram when deciding the amount of
smoothing to apply. It proceeds by allocating a portion of the probability
space occupied by `n`-grams which occur with count `r+1` and dividing it among
the `n`-grams which occur with count `r`.
.. math::
r^* = (r + 1) \frac{g(r + 1)}{g(r)} \\
p(w^{i}_{i-n+1} \mid c(w^{i}_{i-n+1}) = r) = \frac{r^*}{N}
where :math:`r^*` is the adjusted count for an `n`-gram which occurs `r` times,
`g(x)` is the number of `n`-grams in the corpus which occur `x` times, and `N`
is the total number of `n`-grams in the corpus.
.. _n-gram: https://en.wikipedia.org/wiki/N-gram
.. _`Good-Turing smoothing`: https://en.wikipedia.org/wiki/Good%E2%80%93Turing_frequency_estimation
**Models**
- :class:`~numpy_ml.ngram.GoodTuringNGram`
**References**
.. [1] Chen & Goodman (1998). "An empirical study of smoothing techniques
for language modeling". *Harvard Computer Science Group Technical Report
TR-10-98*.
.. [2] Gale & Sampson (1995). "Good-Turing frequency estimation without
tears". *Journal of Quantitative Linguistics*, 2(3), 217-237.
.. toctree::
:maxdepth: 3
:hidden:
numpy_ml.ngram.mle
numpy_ml.ngram.additive
numpy_ml.ngram.goodturing
================================================
FILE: docs/numpy_ml.nonparametric.gp.rst
================================================
``GPRegression``
#################
.. autoclass:: numpy_ml.nonparametric.GPRegression
:members:
:undoc-members:
:inherited-members:
================================================
FILE: docs/numpy_ml.nonparametric.kernel_regression.rst
================================================
``KernelRegression``
#####################
.. autoclass:: numpy_ml.nonparametric.KernelRegression
:members:
:undoc-members:
:inherited-members:
================================================
FILE: docs/numpy_ml.nonparametric.knn.rst
================================================
``KNN``
#######
.. autoclass:: numpy_ml.nonparametric.KNN
:members:
:undoc-members:
:inherited-members:
================================================
FILE: docs/numpy_ml.nonparametric.rst
================================================
Nonparametric models
####################
.. raw:: html
K-Nearest Neighbors
The `k-nearest neighbors`_ (KNN) model is a nonparametric supervised learning
approach that can be applied to classification or regression problems. In a
classification context, the KNN model assigns a class label for a new datapoint
by taking a majority vote amongst the labels for the `k` closest points
("neighbors") in the training data. Similarly, in a regression context, the KNN
model predicts the target value associated with a new datapoint by taking the
average of the targets associated with the `k` closest points in the training
data.
.. _`k-nearest neighbors`: https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm
**Models**
- :class:`~numpy_ml.nonparametric.KNN`
.. raw:: html
Gaussian Process Regression
A `Gaussian process`_ defines a prior distribution over functions mapping
:math:`X \rightarrow \mathbb{R}`, where `X` can be any finite (or
infinite!)-dimensional set.
Let :math:`f(x_k)` be the random variable corresponding to
the value of a function `f` at a point :math:`x_k \in X`. Define a random
variable :math:`z = [f(x_1), \ldots, f(x_N)]` for any finite set of points
:math:`\{x_1, \ldots, x_N\} \subset X`. If `f` is distributed according to a
Gaussian Process, it is the case that
.. math::
z \sim \mathcal{N}(\mu, K)
for
.. math::
\mu &= [\text{mean}(x_1), \ldots, \text{mean}(x_N)] \\
K_{ij} &= \text{kernel}(x_i, x_j)
where mean is the mean function (in Gaussian process regression it is common
to define mean(`x`) = 0), and `kernel` is a :doc:`kernel
<numpy_ml.utils.kernels>` / covariance function that determines the general
shape of the GP prior over functions, `p(f)`.
In `Gaussian process regression`_ (AKA simple Kriging [2]_ [3]_), a Gaussian
process is used as a prior on functions and is combined with the Gaussian
likelihood from the linear model via Bayes' rule to compute a posterior over
functions `f`:
.. math::
y \mid X, f &\sim \mathcal{N}( [f(x_1), \ldots, f(x_n)], \alpha I ) \\
f \mid X &\sim \text{GP}(0, K)
Due to the conjugacy of the Gaussian Process prior with the regression model's
Gaussian likelihood, the posterior will also be Gaussian and can be computed in
closed form.
.. _`Gaussian process`: https://en.wikipedia.org/wiki/Gaussian_process
.. _`Gaussian process regression`: https://en.wikipedia.org/wiki/Kriging
**Models**
- :class:`~numpy_ml.nonparametric.GPRegression`
**References**
.. [1] Rasmussen, C. E., & Williams, C. K. I. (2006). Gaussian Processes for
Machine Learning. MIT Press, Cambridge, MA.
.. [2] Krige, D. G., (1951). "A statistical approach to some mine valuations and
allied problems at the Witwatersrand", *Master's thesis of the University of
Witwatersrand*.
.. [3] Matheron, G., (1963). "Principles of geostatistics", *Economic Geology, 58*, 1246-1266.
.. raw:: html
Kernel Regression
Kernel regression is another nonparametric approach to nonlinear regression.
Like the Gaussian Process regression approach (or, more generally, all
regression models), kernel regression attempts to learn a function `f` which
captures the conditional expectation of some targets **y** given the data
**X**, under the assumption that
.. math::
y_i = f(x_i) + \epsilon_i \ \ \ \ \text{where } \mathbb{E}[\epsilon | \mathbf{x}] = \mathbb{E}[\epsilon] = 0
Unlike the Gaussian Process regression approach, however, kernel regression
does not place a prior over `f`. Instead, it models :math:`f = \mathbb{E}[y |
X] = \int_y \frac{p(X, y)}{p(X)} y \ \text{d}y` using a :doc:`kernel function
<numpy_ml.utils.kernels>`, `k`, to estimate the smoothed data probabilities.
For example, the :class:`Nadaraya-Watson <numpy_ml.nonparametric.KernelRegression>`
estimator [4]_ [5]_ uses the following probability estimates:
.. math::
\hat{p}(X) &= \prod_{i=1}^N \hat{p}(x_i) = \prod_{i=1}^N \sum_{j=1}^N \frac{k(x_i - x_j)}{N} \\
\hat{p}(X, y) &= \prod_{i=1}^N \hat{p}(x_i, y_i) = \prod_{i=1}^N \sum_{j=1}^N \frac{k(x_i - x_j) k(y_i - y_j)}{N}
**Models**
- :class:`~numpy_ml.nonparametric.KernelRegression`
**References**
.. [4] Nadaraya, E. A. (1964). "On estimating regression". *Theory of
Probability and Its Applications, 9 (1)*, 141-2.
.. [5] Watson, G. S. (1964). "Smooth regression analysis". *Sankhyā: The Indian
Journal of Statistics, Series A. 26 (4)*, 359–372.
.. raw:: html
See Also
The :doc:`trees <numpy_ml.trees>` module contains other classic nonparametric
approaches, including :doc:`decision trees <numpy_ml.trees.dt>`,
:doc:`random forests <numpy_ml.trees.rf>`, and :doc:`gradient
boosted decision trees <numpy_ml.trees.gbdt>`.
.. toctree::
:maxdepth: 2
:hidden:
numpy_ml.nonparametric.knn
numpy_ml.nonparametric.gp
numpy_ml.nonparametric.kernel_regression
================================================
FILE: docs/numpy_ml.preprocessing.dsp.rst
================================================
Digital signal processing
#########################
``DCT``
-------
.. autofunction:: numpy_ml.preprocessing.dsp.DCT
``DFT``
-------
.. autofunction:: numpy_ml.preprocessing.dsp.DFT
``dft_bins``
------------
.. autofunction:: numpy_ml.preprocessing.dsp.dft_bins
``magnitude_spectrum``
----------------------
.. autofunction:: numpy_ml.preprocessing.dsp.magnitude_spectrum
``power_spectrum``
------------------
.. autofunction:: numpy_ml.preprocessing.dsp.power_spectrum
``batch_resample``
------------------
.. autofunction:: numpy_ml.preprocessing.dsp.batch_resample
``nn_interpolate_2D``
---------------------
.. autofunction:: numpy_ml.preprocessing.dsp.nn_interpolate_2D
``nn_interpolate_1D``
---------------------
.. autofunction:: numpy_ml.preprocessing.dsp.nn_interpolate_1D
``bilinear_interpolate``
-------------------------
.. autofunction:: numpy_ml.preprocessing.dsp.bilinear_interpolate
``to_frames``
-------------
.. autofunction:: numpy_ml.preprocessing.dsp.to_frames
``autocorrelate1D``
-------------------
.. autofunction:: numpy_ml.preprocessing.dsp.autocorrelate1D
``preemphasis``
---------------
.. autofunction:: numpy_ml.preprocessing.dsp.preemphasis
``cepstral_lifter``
-------------------
.. autofunction:: numpy_ml.preprocessing.dsp.cepstral_lifter
``mel_spectrogram``
-------------------
.. autofunction:: numpy_ml.preprocessing.dsp.mel_spectrogram
``mfcc``
--------
.. autofunction:: numpy_ml.preprocessing.dsp.mfcc
``mel2hz``
----------
.. autofunction:: numpy_ml.preprocessing.dsp.mel2hz
``hz2mel``
----------
.. autofunction:: numpy_ml.preprocessing.dsp.hz2mel
``mel_filterbank``
------------------
.. autofunction:: numpy_ml.preprocessing.dsp.mel_filterbank
================================================
FILE: docs/numpy_ml.preprocessing.general.rst
================================================
General
#######
``FeatureHasher``
-----------------
.. autoclass:: numpy_ml.preprocessing.general.FeatureHasher
:members:
:undoc-members:
:inherited-members:
``OneHotEncoder``
-----------------
.. autoclass:: numpy_ml.preprocessing.general.OneHotEncoder
:members:
:undoc-members:
:inherited-members:
``Standardizer``
----------------
.. autoclass:: numpy_ml.preprocessing.general.Standardizer
:members:
:undoc-members:
:inherited-members:
``minibatch``
-------------
.. automodule:: numpy_ml.preprocessing.general
:members: minibatch
================================================
FILE: docs/numpy_ml.preprocessing.nlp.rst
================================================
Natural language processing
###########################
``BytePairEncoder``
-------------------
.. autoclass:: numpy_ml.preprocessing.nlp.BytePairEncoder
:members:
:undoc-members:
:inherited-members:
``HuffmanEncoder``
------------------
.. autoclass:: numpy_ml.preprocessing.nlp.HuffmanEncoder
:members:
:undoc-members:
:inherited-members:
``TFIDFEncoder``
------------------
.. autoclass:: numpy_ml.preprocessing.nlp.TFIDFEncoder
:members:
:undoc-members:
:inherited-members:
``Vocabulary``
--------------
.. autoclass:: numpy_ml.preprocessing.nlp.Vocabulary
:members:
:undoc-members:
:inherited-members:
``Token``
---------
.. autoclass:: numpy_ml.preprocessing.nlp.Token
:members:
:undoc-members:
:inherited-members:
``ngrams``
-----------
.. autofunction:: numpy_ml.preprocessing.nlp.ngrams
``remove_stop_words``
---------------------
.. autofunction:: numpy_ml.preprocessing.nlp.remove_stop_words
``strip_punctuation``
---------------------
.. autofunction:: numpy_ml.preprocessing.nlp.strip_punctuation
``tokenize_words``
-------------------
.. autofunction:: numpy_ml.preprocessing.nlp.tokenize_words
``tokenize_whitespace``
------------------------
.. autofunction:: numpy_ml.preprocessing.nlp.tokenize_whitespace
``tokenize_chars``
-------------------
.. autofunction:: numpy_ml.preprocessing.nlp.tokenize_chars
``tokenize_bytes_raw``
-----------------------
.. autofunction:: numpy_ml.preprocessing.nlp.tokenize_bytes_raw
``bytes_to_chars``
-----------------------
.. autofunction:: numpy_ml.preprocessing.nlp.bytes_to_chars
================================================
FILE: docs/numpy_ml.preprocessing.rst
================================================
Preprocessing
#############
.. toctree::
:maxdepth: 3
numpy_ml.preprocessing.general
numpy_ml.preprocessing.dsp
numpy_ml.preprocessing.nlp
================================================
FILE: docs/numpy_ml.rl_models.agents.rst
================================================
Agents
======
``CrossEntropyAgent``
---------------------
.. autoclass:: numpy_ml.rl_models.agents.CrossEntropyAgent
:members:
:undoc-members:
:inherited-members:
``DynaAgent``
-------------
.. autoclass:: numpy_ml.rl_models.agents.DynaAgent
:members:
:undoc-members:
:inherited-members:
``MonteCarloAgent``
-------------------
Monte Carlo methods are ways of solving RL problems based on averaging
sample returns for each state-action pair. Parameters are updated only at
the completion of an episode.
In on-policy learning, the agent maintains a single policy that it updates
over the course of training. In order to ensure the policy converges to a
(near-) optimal policy, the agent must maintain that the policy assigns
non-zero probability to ALL state-action pairs during training to ensure
continual exploration.
- Thus on-policy learning is a compromise--it learns action values not for the optimal policy, but for a *near*-optimal policy that still explores.
In off-policy learning, the agent maintains two separate policies:
1. **Target policy**: The policy that is learned during training and that will eventually become the optimal policy.
2. **Behavior policy**: A policy that is more exploratory and is used to generate behavior during training.
Off-policy methods are often of greater variance and are slower to
converge. On the other hand, off-policy methods are more powerful and
general than on-policy methods.
.. autoclass:: numpy_ml.rl_models.agents.MonteCarloAgent
:members:
:undoc-members:
:inherited-members:
``TemporalDifferenceAgent``
---------------------------
Temporal difference methods are examples of bootstrapping in that they update
their estimate for the value of state `s` on the basis of a previous estimate.
Advantages of TD algorithms:
1. They do not require a model of the environment, its reward, or its next-state probability distributions.
2. They are implemented in an online, fully incremental fashion. This allows them to be used with infinite-horizons / when episodes take prohibitively long to finish.
3. TD algorithms learn from each transition regardless of what subsequent actions are taken.
4. In practice, TD methods have usually been found to converge faster than constant-:math:`\alpha` Monte Carlo methods on stochastic tasks.
.. autoclass:: numpy_ml.rl_models.agents.TemporalDifferenceAgent
:members:
:undoc-members:
:inherited-members:
================================================
FILE: docs/numpy_ml.rl_models.rl_utils.rst
================================================
Utilities
=========
.. automodule:: numpy_ml.rl_models.rl_utils
:members:
:inherited-members:
================================================
FILE: docs/numpy_ml.rl_models.rst
================================================
Reinforcement learning
######################
.. toctree::
:maxdepth: 3
numpy_ml.rl_models.agents
numpy_ml.rl_models.trainer
numpy_ml.rl_models.rl_utils
================================================
FILE: docs/numpy_ml.rl_models.trainer.rst
================================================
Training
========
``Trainer``
-----------
.. automodule:: numpy_ml.rl_models.trainer
:members:
:undoc-members:
:inherited-members:
================================================
FILE: docs/numpy_ml.trees.dt.rst
================================================
################
``DecisionTree``
################
.. autoclass:: numpy_ml.trees.DecisionTree
:members:
:undoc-members:
:inherited-members:
================================================
FILE: docs/numpy_ml.trees.gbdt.rst
================================================
``GradientBoostedDecisionTree``
###############################
.. autoclass:: numpy_ml.trees.GradientBoostedDecisionTree
:members:
:undoc-members:
:inherited-members:
================================================
FILE: docs/numpy_ml.trees.losses.rst
================================================
#########################
Losses (``trees.losses``)
#########################
.. automodule:: numpy_ml.trees.losses
:members:
:undoc-members:
:inherited-members:
:show-inheritance:
================================================
FILE: docs/numpy_ml.trees.rf.rst
================================================
``RandomForest``
################
.. autoclass:: numpy_ml.trees.RandomForest
:members:
:undoc-members:
:inherited-members:
================================================
FILE: docs/numpy_ml.trees.rst
================================================
Tree-based models
#################
.. raw:: html
Decision Trees
`Decision trees`_ [1]_ are popular nonparametric models that iteratively split a
training dataset into smaller, more homogeneous subsets. Each node in the tree
is associated with a decision rule, which dictates how to divide the data the
node inherits from its parent among each of its children. Each leaf node is
associated with at least one data point from the original training set.
.. figure:: img/decision_tree.png
:width: 95%
:align: center
A binary decision tree trained on the dataset :math:`X = \{ \mathbf{x}_1,
\ldots, \mathbf{x}_{10} \}`. Each example in the dataset is a 4-dimensional
vector of real-valued features labeled :math:`x_1, \ldots, x_4`. Unshaded
circles correspond to internal decision nodes, while shaded circles
correspond to leaf nodes. Each leaf node is associated with a subset of the
examples in `X`, selected based on the decision rules along the path from
root to leaf.
At test time, new examples travel from the tree root to one of the leaves,
their path through the tree determined by the decision rules at each of the
nodes they visit. When a test example arrives at a leaf node, the targets for
the training examples at that leaf node are used to compute the model's
prediction.
Training decision trees corresponds to learning the set of decision rules to
partition the training data. This learning process proceeds greedily by
selecting the decision rule at each node that results in the greatest reduction
in an inhomogeneity or "impurity" metric, :math:`\mathcal{L}`. One popular
metric is the **information entropy**:
.. math::
-\sum_j P_n(\omega_j) \log P_n(\omega_j)
where :math:`P_n(\omega_j)` is the fraction of data points at split `n` that are
associated with category :math:`\omega_j`. Another useful metric is the **Gini
impurity**:
.. math::
\sum_{i \neq j} P_n(\omega_i) P_n(\omega_j) = 1 - \sum_{j} P_n(\omega_j)^2
For a binary tree (where each node has only two children), the reduction in
impurity after a particular split is
.. math::
\Delta \mathcal{L} = \mathcal{L}(\text{Parent}) -
P_{\text{left}} \mathcal{L}(\text{Left child}) -
(1 - P_{\text{left}})\mathcal{L}(\text{Right child})
where :math:`\mathcal{L}(x)` is the impurity of the dataset at node `x`,
and :math:`P_{\text{left}}`/:math:`P_{\text{right}}` are the proportion of
examples at the current node that are partitioned into the left / right
children, respectively, by the proposed split.
.. _`Decision trees`: https://en.wikipedia.org/wiki/Decision_tree_learning
**Models**
- :class:`~numpy_ml.trees.DecisionTree`
**References**
.. [1] Breiman, L., Friedman, J. H., Olshen, R. A., and Stone, C. J. (1984).
Classification and regression trees. Monterey, CA: Wadsworth & Brooks/Cole
Advanced Books & Software.
.. raw:: html
Bootstrap Aggregating
`Bootstrap aggregating`_ (bagging) methods [2]_ are an `ensembling approach`_ that
proceeds by creating `n` bootstrapped samples of a training dataset by sampling
from it with replacement. A separate learner is fit on each of the `n`
bootstrapped datasets, with the final bootstrap aggregated model prediction
corresponding to the average (or majority vote, for classifiers) across each
of the `n` learners' predictions for a given datapoint.
The `random forest`_ model [3]_ [4]_ is a canonical example of bootstrap
aggregating. For this approach, each of the `n` learners is a different
decision tree. In addition to training each decision tree on a different
bootstrapped dataset, random forests employ a `random subspace`_ approach [5]_:
each decision tree is trained on a subsample (without replacement) of the full
collection of dataset features.
.. _`Bootstrap aggregating`: https://en.wikipedia.org/wiki/Bootstrap_aggregating
.. _`random forest`: https://en.wikipedia.org/wiki/Random_forest
.. _`ensembling approach`: https://en.wikipedia.org/wiki/Ensemble_learning
.. _`random subspace`: https://en.wikipedia.org/wiki/Random_subspace_method
**Models**
- :class:`~numpy_ml.trees.RandomForest`
**References**
.. [2] Breiman, L. (1994). "Bagging predictors". *Technical Report 421.
Statistics Department, UC Berkeley*.
.. [3] Ho, T. K. (1995). "Random decision forests". *Proceedings of the Third
International Conference on Document Analysis and Recognition, 1*: 278-282.
.. [4] Breiman, L. (2001). "Random forests". *Machine Learning. 45(1)*: 5-32.
.. [5] Ho, T. K. (1998). "The random subspace method for constructing decision
forests". *IEEE Transactions on Pattern Analysis and Machine Intelligence.
20(8)*: 832-844.
.. raw:: html
Gradient Boosting
`Gradient boosting`_ [6]_ [7]_ [8]_ is another popular `ensembling technique`_
that proceeds by iteratively fitting a sequence of `m` weak learners such that:
.. math::
f_m(X) = b(X) + \eta w_1 g_1 + \ldots + \eta w_m g_m
where `b` is a fixed initial estimate for the targets, :math:`\eta` is
a learning rate parameter, and :math:`w_{i}` and :math:`g_{i}`
denote the weights and predictions of the :math:`i^{th}` learner.
At each training iteration a new weak learner is fit to predict the negative
gradient of the loss with respect to the previous prediction,
:math:`-\nabla_{f_{i-1}} \mathcal{L}(y, \ f_{i-1}(X))`. We then use the
element-wise product of the predictions of this weak learner, :math:`g_i`, with
a weight, :math:`w_i`, computed via, e.g., `line-search`_ on the objective
:math:`w_i = \arg \min_{w} \sum_{j=1}^n \mathcal{L}(y_j, f_{i-1}(x_j) + w g_i)`
, to adjust the predictions of the model from the previous iteration,
:math:`f_{i-1}(X)`:
.. math::
f_i(X) := f_{i-1}(X) + \eta w_i g_i
The current module implements gradient boosting using decision trees as the
weak learners.
.. _`Gradient boosting`: https://en.wikipedia.org/wiki/Gradient_boosting
.. _`ensembling technique`: https://en.wikipedia.org/wiki/Ensemble_learning
.. _`line-search`: https://en.wikipedia.org/wiki/Line_search
**Models**
- :class:`~numpy_ml.trees.GradientBoostedDecisionTree`
**References**
.. [6] Breiman, L. (1997). "Arcing the edge". *Technical Report 486.
Statistics Department, UC Berkeley*.
.. [7] Friedman, J. H. (1999). "Greedy function approximation: A gradient
boosting machine". *IMS 1999 Reitz Lecture*.
.. [8] Mason, L., Baxter, J., Bartlett, P. L., Frean, M. (1999). "Boosting
algorithms as gradient descent". *Advances in Neural Information Processing
Systems, 12*: 512–518.
.. toctree::
:maxdepth: 3
:hidden:
numpy_ml.trees.dt
numpy_ml.trees.rf
numpy_ml.trees.gbdt
================================================
FILE: docs/numpy_ml.utils.data_structures.rst
================================================
Data structures
================
``BallTree``
------------
.. autoclass:: numpy_ml.utils.data_structures.BallTree
:members:
:undoc-members:
:inherited-members:
``DiscreteSampler``
-------------------
.. autoclass:: numpy_ml.utils.data_structures.DiscreteSampler
:members:
:undoc-members:
:inherited-members:
``PriorityQueue``
-----------------
.. autoclass:: numpy_ml.utils.data_structures.PriorityQueue
:members:
:undoc-members:
:inherited-members:
``PQNode``
-----------------
.. autoclass:: numpy_ml.utils.data_structures.PQNode
:members:
:undoc-members:
:inherited-members:
``Dict``
--------
.. autoclass:: numpy_ml.utils.data_structures.Dict
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/numpy_ml.utils.distance_metrics.rst
================================================
Distance metrics
================
Common distance functions.
``euclidean``
---------------
.. autofunction:: numpy_ml.utils.distance_metrics.euclidean
``chebyshev``
---------------
.. autofunction:: numpy_ml.utils.distance_metrics.chebyshev
``hamming``
-------------
.. autofunction:: numpy_ml.utils.distance_metrics.hamming
``manhattan``
--------------
.. autofunction:: numpy_ml.utils.distance_metrics.manhattan
``minkowski``
--------------
.. autofunction:: numpy_ml.utils.distance_metrics.minkowski
================================================
FILE: docs/numpy_ml.utils.graphs.rst
================================================
Graphs
======
``Graph``
---------
.. autoclass:: numpy_ml.utils.graphs.Graph
:members:
:undoc-members:
:inherited-members:
``Edge``
--------
.. autoclass:: numpy_ml.utils.graphs.Edge
:members:
:undoc-members:
:inherited-members:
``DiGraph``
-----------
.. autoclass:: numpy_ml.utils.graphs.DiGraph
:members:
:undoc-members:
:show-inheritance:
``UndirectedGraph``
-------------------
.. autoclass:: numpy_ml.utils.graphs.UndirectedGraph
:members:
:undoc-members:
:show-inheritance:
``random_unweighted_graph``
---------------------------
.. autofunction:: numpy_ml.utils.graphs.random_unweighted_graph
``random_DAG``
--------------
.. autofunction:: numpy_ml.utils.graphs.random_DAG
================================================
FILE: docs/numpy_ml.utils.kernels.rst
================================================
Kernels
=======
A collection of common kernel / similarity functions. All kernels are
continuous, bounded, and symmetric real functions which integrate to 1.
``LinearKernel``
----------------
.. autoclass:: numpy_ml.utils.kernels.LinearKernel
:members:
:undoc-members:
:inherited-members:
``PolynomialKernel``
--------------------
.. autoclass:: numpy_ml.utils.kernels.PolynomialKernel
:members:
:undoc-members:
:inherited-members:
``RBFKernel``
-------------
.. autoclass:: numpy_ml.utils.kernels.RBFKernel
:members:
:undoc-members:
:inherited-members:
================================================
FILE: docs/numpy_ml.utils.rst
================================================
Utilities
#########
.. toctree::
:maxdepth: 3
numpy_ml.utils.data_structures
numpy_ml.utils.distance_metrics
numpy_ml.utils.graphs
numpy_ml.utils.kernels
numpy_ml.utils.windows
numpy_ml.utils.testing
================================================
FILE: docs/numpy_ml.utils.testing.rst
================================================
Testing
-------
Common helper functions for testing the ML algorithms in the rest of the repo.
.. automodule:: numpy_ml.utils.testing
:members:
:undoc-members:
:inherited-members:
:show-inheritance:
================================================
FILE: docs/numpy_ml.utils.windows.rst
================================================
Window functions
================
In digital signal processing, windowing functions are useful to counteract the
assumption made by the FFT that data is infinite and to reduce spectral
leakage.
``blackman_harris``
-------------------
.. autofunction:: numpy_ml.utils.windows.blackman_harris
``generalized_cosine``
----------------------
.. autofunction:: numpy_ml.utils.windows.generalized_cosine
``hamming``
-----------
.. autofunction:: numpy_ml.utils.windows.hamming
``hann``
-----------
.. autofunction:: numpy_ml.utils.windows.hann
================================================
FILE: docs/requirements.txt
================================================
numpy
scipy
# all this is for the dang tests
matplotlib
seaborn
pandas
sklearn
huffman
================================================
FILE: numpy_ml/README.md
================================================
# Models
This repo includes code for the following models:
1. **Gaussian mixture model**
- EM training
2. **Hidden Markov model**
- Viterbi decoding
- Likelihood computation
- MLE parameter estimation via Baum-Welch/forward-backward algorithm
3. **Latent Dirichlet allocation** (topic model)
- Standard model with MLE parameter estimation via variational EM
- Smoothed model with MAP parameter estimation via MCMC
4. **Neural networks**
* Layers / Layer-wise ops
- Add
- Flatten
- Multiply
- Softmax
- Fully-connected/Dense
- Sparse evolutionary connections
- LSTM
- Elman-style RNN
- Max + average pooling
- Dot-product attention
- Embedding layer
- Restricted Boltzmann machine (w. CD-n training)
- 2D deconvolution (w. padding and stride)
- 2D convolution (w. padding, dilation, and stride)
- 1D convolution (w. padding, dilation, stride, and causality)
* Modules
- Bidirectional LSTM
- ResNet-style residual blocks (identity and convolution)
- WaveNet-style residual blocks with dilated causal convolutions
- Transformer-style multi-headed scaled dot product attention
* Regularizers
- Dropout
* Normalization
- Batch normalization (spatial and temporal)
- Layer normalization (spatial and temporal)
* Optimizers
- SGD w/ momentum
- AdaGrad
- RMSProp
- Adam
* Learning Rate Schedulers
- Constant
- Exponential
- Noam/Transformer
- Dlib scheduler
* Weight Initializers
- Glorot/Xavier uniform and normal
- He/Kaiming uniform and normal
- Standard and truncated normal
* Losses
- Cross entropy
- Squared error
- Bernoulli VAE loss
- Wasserstein loss with gradient penalty
- Noise contrastive estimation loss
* Activations
- ReLU
- Tanh
- Affine
- Sigmoid
- Leaky ReLU
- ELU
- SELU
- Exponential
- Hard Sigmoid
- Softplus
* Models
- Bernoulli variational autoencoder
- Wasserstein GAN with gradient penalty
- word2vec encoder with skip-gram and CBOW architectures
* Utilities
- `col2im` (MATLAB port)
- `im2col` (MATLAB port)
- `conv1D`
- `conv2D`
- `deconv2D`
- `minibatch`
5. **Tree-based models**
- Decision trees (CART)
- [Bagging] Random forests
- [Boosting] Gradient-boosted decision trees
6. **Linear models**
- Ridge regression
- Logistic regression
- Ordinary least squares
- Gaussian naive Bayes classifier
- Generalized linear model (identity, log, and logit links)
- Bayesian linear regression w/ conjugate priors
- Unknown mean, known variance (Gaussian prior)
- Unknown mean, unknown variance (Normal-Gamma / Normal-Inverse-Wishart prior)
7. **n-Gram sequence models**
- Maximum likelihood scores
- Additive/Lidstone smoothing
- Simple Good-Turing smoothing
8. **Multi-armed bandit models**
- UCB1
- LinUCB
- Epsilon-greedy
- Thompson sampling w/ conjugate priors
- Beta-Bernoulli sampler
- LinUCB
8. **Reinforcement learning models**
- Cross-entropy method agent
- First visit on-policy Monte Carlo agent
- Weighted incremental importance sampling Monte Carlo agent
- Expected SARSA agent
- TD-0 Q-learning agent
- Dyna-Q / Dyna-Q+ with prioritized sweeping
9. **Nonparametric models**
- Nadaraya-Watson kernel regression
- k-Nearest neighbors classification and regression
- Gaussian process regression
10. **Matrix factorization**
- Regularized alternating least-squares
- Non-negative matrix factorization
11. **Preprocessing**
- Discrete Fourier transform (1D signals)
- Discrete cosine transform (type-II) (1D signals)
- Bilinear interpolation (2D signals)
- Nearest neighbor interpolation (1D and 2D signals)
- Autocorrelation (1D signals)
- Signal windowing
- Text tokenization
- Feature hashing
- Feature standardization
- One-hot encoding / decoding
- Huffman coding / decoding
- Byte pair encoding / decoding
- Term frequency-inverse document frequency (TF-IDF) encoding
- MFCC encoding
12. **Utilities**
- Similarity kernels
- Distance metrics
- Priority queue
- Ball tree
- Discrete sampler
- Graph processing and generators
================================================
FILE: numpy_ml/__init__.py
================================================
# noqa
"""Common ML and ML-adjacent algorithms implemented in NumPy"""
from . import utils
from . import preprocessing
from . import gmm
from . import hmm
from . import lda
from . import linear_models
from . import neural_nets
from . import ngram
from . import nonparametric
from . import rl_models
from . import trees
from . import bandits
from . import factorization
================================================
FILE: numpy_ml/bandits/README.md
================================================
# Bandits
The `bandits.py` module includes several simple multi-arm bandit
environments.
The `policies.py` module implements a number of standard multi-arm bandit
policies.
1. **Bandits**
- MAB: Bernoulli, Multinomial, and Gaussian payout distributions
- Contextual MAB: Linear contextual bandits
2. **Policies**
- Epsilon-greedy
- UCB1 ([Auer, Cesa-Bianchi, & Fischer, 2002](https://link.springer.com/content/pdf/10.1023/A:1013689704352.pdf))
- Conjugate Thompson sampler for Bernoulli bandits ([Thompson, 1933](https://www.gwern.net/docs/statistics/decision/1933-thompson.pdf); [Chapelle & Li, 2010](https://papers.nips.cc/paper/4321-an-empirical-evaluation-of-thompson-sampling.pdf))
- LinUCB ([Li, Chu, Langford, & Schapire, 2010](http://rob.schapire.net/papers/www10.pdf))
## Plots
================================================
FILE: numpy_ml/bandits/__init__.py
================================================
from .bandits import *
from . import policies
from . import trainer
================================================
FILE: numpy_ml/bandits/bandits.py
================================================
"""A module containing different variations on multi-armed bandit environments."""
from abc import ABC, abstractmethod
import numpy as np
from numpy_ml.utils.testing import random_one_hot_matrix, is_number
class Bandit(ABC):
    """
    Abstract base class for multi-armed bandit environments.

    Stores the number of arms and a counter of how many pulls have been
    made. Subclasses provide the payoff model by implementing
    :meth:`oracle_payoff` and :meth:`_pull`.
    """

    def __init__(self, rewards, reward_probs, context=None):
        """
        Parameters
        ----------
        rewards : sequence of length `K`
            Per-arm reward specification. Only its length is used here, to
            set the number of arms.
        reward_probs : sequence of length `K`
            Per-arm reward probabilities. Must have the same length as
            `rewards`.
        context : object or None
            Unused by the base class. Default is None.
        """
        assert len(rewards) == len(reward_probs)
        self.step = 0
        self.n_arms = len(rewards)

        super().__init__()

    def __repr__(self):
        """A string representation for the bandit"""
        HP = self.hyperparameters
        params = ", ".join(
            "{}={}".format(k, v) for (k, v) in HP.items() if k != "id"
        )
        # Fall back to the class name so that a subclass which does not
        # override `hyperparameters` (the default is an empty dict) still has
        # a working repr instead of raising KeyError.
        return "{}({})".format(HP.get("id", type(self).__name__), params)

    @property
    def hyperparameters(self):
        """A dictionary of the bandit hyperparameters"""
        return {}

    @abstractmethod
    def oracle_payoff(self, context=None):
        """
        Return the expected reward for an optimal agent.

        Parameters
        ----------
        context : :py:class:`ndarray <numpy.ndarray>` of shape `(D, K)` or None
            The current context matrix for each of the bandit arms, if
            applicable. Default is None.

        Returns
        -------
        optimal_rwd : float
            The expected reward under an optimal policy.
        """
        pass

    def pull(self, arm_id, context=None):
        """
        "Pull" (i.e., sample from) a given arm's payoff distribution.

        Parameters
        ----------
        arm_id : int
            The integer ID of the arm to sample from
        context : :py:class:`ndarray <numpy.ndarray>` of shape `(D,)` or None
            The context vector for the current timestep if this is a contextual
            bandit. Otherwise, this argument is unused and defaults to None.

        Returns
        -------
        reward : float
            The reward sampled from the given arm's payoff distribution
        """
        assert arm_id < self.n_arms

        # Count the pull before delegating to the subclass sampler.
        self.step += 1
        return self._pull(arm_id, context)

    def reset(self):
        """Reset the bandit step counter to zero."""
        self.step = 0

    @abstractmethod
    def _pull(self, arm_id, context=None):
        """
        Sample a reward from arm `arm_id`; implemented by subclasses.

        The signature matches the call made in :meth:`pull`, which always
        passes `context` as the second positional argument.
        """
        pass
class MultinomialBandit(Bandit):
    def __init__(self, payoffs, payoff_probs):
        """
        A multi-armed bandit where each arm is associated with a different
        multinomial payoff distribution.

        Parameters
        ----------
        payoffs : ragged list of length `K`
            The payoff values for each of the `K` arms. ``payoffs[k][i]``
            holds the `i` th payoff value for arm `k`.
        payoff_probs : ragged list of length `K`
            A list of the probabilities associated with each of the payoff
            values in ``payoffs``. ``payoff_probs[k][i]`` holds the probability
            of payoff index `i` for arm `k`.
        """
        super().__init__(payoffs, payoff_probs)

        # Each arm needs one probability per payoff value, and the
        # probabilities for an arm must sum to 1.
        for r, rp in zip(payoffs, payoff_probs):
            assert len(r) == len(rp)
            np.testing.assert_almost_equal(sum(rp), 1.0)

        payoffs = self._stack_arms(payoffs)
        payoff_probs = self._stack_arms(payoff_probs)

        self.payoffs = payoffs
        self.payoff_probs = payoff_probs
        # Expected value of each arm: sum_i p_i * v_i.
        self.arm_evs = np.array([sum(p * v) for p, v in zip(payoff_probs, payoffs)])
        self.best_ev = np.max(self.arm_evs)
        self.best_arm = np.argmax(self.arm_evs)

    @staticmethod
    def _stack_arms(rows):
        """
        Convert a (possibly ragged) list of per-arm sequences to an array.

        When every arm has the same shape the result is the regular stacked
        array ``np.array`` would give. When the arms differ in length, a 1-D
        object array holding one ndarray per arm is built explicitly, because
        recent NumPy releases refuse to create ragged arrays implicitly.
        """
        rows = [np.array(x) for x in rows]
        if len({r.shape for r in rows}) <= 1:
            return np.array(rows)

        ragged = np.empty(len(rows), dtype=object)
        # Assign element by element so NumPy stores each arm's array as-is
        # rather than trying to broadcast the list.
        for i, r in enumerate(rows):
            ragged[i] = r
        return ragged

    @property
    def hyperparameters(self):
        """A dictionary of the bandit hyperparameters"""
        return {
            "id": "MultinomialBandit",
            "payoffs": self.payoffs,
            "payoff_probs": self.payoff_probs,
        }

    def oracle_payoff(self, context=None):
        """
        Return the expected reward for an optimal agent.

        Parameters
        ----------
        context : :py:class:`ndarray <numpy.ndarray>` of shape `(D, K)` or None
            Unused. Default is None.

        Returns
        -------
        optimal_rwd : float
            The expected reward under an optimal policy.
        optimal_arm : int
            The arm ID with the largest expected reward.
        """
        return self.best_ev, self.best_arm

    def _pull(self, arm_id, context):
        """Draw one payoff for `arm_id` from its multinomial distribution."""
        payoffs = self.payoffs[arm_id]
        probs = self.payoff_probs[arm_id]
        return np.random.choice(payoffs, p=probs)
class BernoulliBandit(Bandit):
    def __init__(self, payoff_probs):
        """
        A multi-armed bandit where each arm is associated with an independent
        Bernoulli payoff distribution.

        Parameters
        ----------
        payoff_probs : list of length `K`
            A list of the payoff probability for each arm. ``payoff_probs[k]``
            holds the probability of payoff for arm `k`.
        """
        # every arm pays a fixed reward of 1 when it pays out at all
        payoffs = [1] * len(payoff_probs)
        super().__init__(payoffs, payoff_probs)

        for p in payoff_probs:
            assert 0 <= p <= 1

        self.payoffs = np.array(payoffs)
        self.payoff_probs = np.array(payoff_probs)

        # with unit payoffs the expected value of an arm is its probability
        self.arm_evs = self.payoff_probs
        self.best_ev = np.max(self.arm_evs)
        self.best_arm = np.argmax(self.arm_evs)

    @property
    def hyperparameters(self):
        """A dictionary of the bandit hyperparameters"""
        return {
            "id": "BernoulliBandit",
            "payoff_probs": self.payoff_probs,
        }

    def oracle_payoff(self, context=None):
        """
        Return the expected reward for an optimal agent.

        Parameters
        ----------
        context : :py:class:`ndarray <numpy.ndarray>` of shape `(D, K)` or None
            Unused. Default is None.

        Returns
        -------
        optimal_rwd : float
            The expected reward under an optimal policy.
        optimal_arm : int
            The arm ID with the largest expected reward.
        """
        return self.best_ev, self.best_arm

    def _pull(self, arm_id, context):
        """Return 1 with probability ``payoff_probs[arm_id]``, otherwise 0."""
        # `rand()` is drawn from [0, 1), so a strict comparison guarantees
        # that p = 0 never pays out while p = 1 always does
        return int(np.random.rand() < self.payoff_probs[arm_id])
class GaussianBandit(Bandit):
    def __init__(self, payoff_dists, payoff_probs):
        """
        A multi-armed bandit that is similar to
        :class:`BernoulliBandit`, but instead of each arm having
        a fixed payout of 1, the payoff values are sampled from independent
        Gaussian RVs.

        Parameters
        ----------
        payoff_dists : list of 2-tuples of length `K`
            The parameters of the distributions over payoff values for each of
            the `K` arms. Specifically, ``payoff_dists[k]`` is a tuple of
            (mean, variance) for the Gaussian distribution over payoffs
            associated with arm `k`.
        payoff_probs : list of length `K`
            ``payoff_probs[k]`` holds the probability that arm `k` pays out
            at all. When it does not pay out, the reward is 0.
        """
        super().__init__(payoff_dists, payoff_probs)

        for (mean, var), rp in zip(payoff_dists, payoff_probs):
            assert var > 0
            # each entry is a single payout probability, not a distribution
            assert 0 <= rp <= 1

        self.payoff_dists = payoff_dists
        self.payoff_probs = payoff_probs

        # an arm pays N(mu, var) with probability p and 0 otherwise, so its
        # expected reward is p * mu
        self.arm_evs = np.array(
            [p * mu for (mu, _), p in zip(payoff_dists, payoff_probs)]
        )
        self.best_ev = np.max(self.arm_evs)
        self.best_arm = np.argmax(self.arm_evs)

    @property
    def hyperparameters(self):
        """A dictionary of the bandit hyperparameters"""
        return {
            "id": "GaussianBandit",
            "payoff_dists": self.payoff_dists,
            "payoff_probs": self.payoff_probs,
        }

    def _pull(self, arm_id, context):
        """
        Sample a reward for `arm_id`: a Gaussian draw with probability
        ``payoff_probs[arm_id]``, otherwise 0.
        """
        mean, var = self.payoff_dists[arm_id]

        reward = 0
        if np.random.rand() < self.payoff_probs[arm_id]:
            # `np.random.normal` expects a standard deviation, not a variance
            reward = np.random.normal(mean, np.sqrt(var))
        return reward

    def oracle_payoff(self, context=None):
        """
        Return the expected reward for an optimal agent.

        Parameters
        ----------
        context : :py:class:`ndarray <numpy.ndarray>` of shape `(D, K)` or None
            Unused. Default is None.

        Returns
        -------
        optimal_rwd : float
            The expected reward under an optimal policy.
        optimal_arm : int
            The arm ID with the largest expected reward.
        """
        return self.best_ev, self.best_arm
class ShortestPathBandit(Bandit):
    def __init__(self, G, start_vertex, end_vertex):
        """
        A shortest-path problem on a weighted graph, posed as a multi-armed
        bandit.

        Notes
        -----
        Every arm is one valid path through the graph from the start vertex to
        the end vertex. Rewards are the negated sum of the edge weights along
        the chosen path, so maximizing reward minimizes the path weight.

        Parameters
        ----------
        G : :class:`Graph` instance
            A weighted graph object. Weights can be fixed or probabilistic.
        start_vertex : int
            The index of the path's start vertex in the graph
        end_vertex : int
            The index of the path's end vertex in the graph
        """
        self.G = G
        self.start_vertex = start_vertex
        self.end_vertex = end_vertex
        self.adj_dict = G.to_adj_dict()
        self.paths = G.all_paths(start_vertex, end_vertex)

        self.arm_evs = self._calc_arm_evs()
        self.best_ev = np.max(self.arm_evs)
        self.best_arm = np.argmax(self.arm_evs)

        # the base class only needs per-arm placeholders: one arm per path
        dummy = [None] * len(self.paths)
        super().__init__(dummy, dummy)

    @property
    def hyperparameters(self):
        """A dictionary of the bandit hyperparameters"""
        return {
            "id": "ShortestPathBandit",
            "G": self.G,
            "end_vertex": self.end_vertex,
            "start_vertex": self.start_vertex,
        }

    def oracle_payoff(self, context=None):
        """
        Return the expected reward for an optimal agent.

        Parameters
        ----------
        context : :py:class:`ndarray <numpy.ndarray>` of shape `(D, K)` or None
            Unused. Default is None.

        Returns
        -------
        optimal_rwd : float
            The expected reward under an optimal policy.
        optimal_arm : int
            The arm ID with the largest expected reward.
        """
        return self.best_ev, self.best_arm

    def _negated_path_weight(self, path):
        """Walk `path` edge by edge and return minus the summed edge weights."""
        to_vertex = self.G.get_vertex
        total = 0
        for src, dst in zip(path[:-1], path[1:]):
            # first outgoing edge of `src` whose target is the next vertex
            edge = [e for e in self.adj_dict[src] if e.to == to_vertex(dst)][0]
            total -= edge.weight
        return total

    def _calc_arm_evs(self):
        """Compute the reward (negated path weight) associated with each arm."""
        return np.array(
            [self._negated_path_weight(path) for path in self.paths], dtype=float
        )

    def _pull(self, arm_id, context):
        """Traverse the path for `arm_id` and return its negated total weight."""
        return self._negated_path_weight(self.paths[arm_id])
class ContextualBernoulliBandit(Bandit):
    def __init__(self, context_probs):
        """
        A contextual version of :class:`BernoulliBandit` where each binary
        context feature is associated with an independent Bernoulli payoff
        distribution.

        Parameters
        ----------
        context_probs : :py:class:`ndarray <numpy.ndarray>` of shape `(D, K)`
            A matrix of the payoff probabilities associated with each of the
            `D` context features, for each of the `K` arms. Index `(i, j)`
            contains the probability of payoff for arm `j` under context `i`.
        """
        D, K = context_probs.shape

        # use a dummy placeholder variable to initialize the Bandit superclass
        placeholder = [None] * K
        super().__init__(placeholder, placeholder)

        self.context_probs = context_probs
        self.arm_evs = self.context_probs
        # best achievable expected reward / arm for each of the D contexts
        self.best_evs = self.arm_evs.max(axis=1)
        self.best_arms = self.arm_evs.argmax(axis=1)

    @property
    def hyperparameters(self):
        """A dictionary of the bandit hyperparameters"""
        return {
            "id": "ContextualBernoulliBandit",
            "context_probs": self.context_probs,
        }

    def get_context(self):
        """
        Sample a random one-hot context vector. This vector will be the same
        for all arms.

        Returns
        -------
        context : :py:class:`ndarray <numpy.ndarray>` of shape `(D, K)`
            A random `D`-dimensional one-hot context vector repeated for each
            of the `K` bandit arms.
        """
        D, K = self.context_probs.shape
        context = np.zeros((D, K))
        # activate one randomly chosen context feature for every arm
        context[np.random.choice(D), :] = 1
        return context

    def oracle_payoff(self, context):
        """
        Return the expected reward for an optimal agent.

        Parameters
        ----------
        context : :py:class:`ndarray <numpy.ndarray>` of shape `(D, K)`
            The current context matrix for each of the bandit arms.

        Returns
        -------
        optimal_rwd : float
            The expected reward under an optimal policy.
        optimal_arm : int
            The arm ID with the largest expected reward.
        """
        # all columns are identical, so the first one identifies the context
        context_id = context[:, 0].argmax()
        return self.best_evs[context_id], self.best_arms[context_id]

    def _pull(self, arm_id, context):
        """Sample a 0/1 reward for `arm_id` under the active context."""
        D, K = self.context_probs.shape
        # the one-hot column selects the row of payoff probabilities
        arm_probs = context[:, arm_id] @ self.context_probs
        # strict comparison: a probability of exactly 0 can never pay out
        arm_rwds = (np.random.rand(K) < arm_probs).astype(int)
        return arm_rwds[arm_id]
class ContextualLinearBandit(Bandit):
    def __init__(self, K, D, payoff_variance=1):
        r"""
        A contextual linear multi-armed bandit.

        Notes
        -----
        In a contextual linear bandit the expected payoff of an arm :math:`a
        \in \mathcal{A}` at time `t` is a linear combination of its context
        vector :math:`\mathbf{x}_{t,a}` with a coefficient vector
        :math:`\theta_a`:

        .. math::

            \mathbb{E}[r_{t, a} \mid \mathbf{x}_{t, a}] = \mathbf{x}_{t,a}^\top \theta_a

        In this implementation, the arm coefficient vectors :math:`\theta` are
        sampled independently from a uniform distribution on the interval
        [-1, 1] and the resulting matrix is then divided by its matrix 2-norm.
        The specific reward at timestep `t` is normally distributed:

        .. math::

            r_{t, a} \mid \mathbf{x}_{t, a} \sim
                \mathcal{N}(\mathbf{x}_{t,a}^\top \theta_a, \sigma_a^2)

        Parameters
        ----------
        K : int
            The number of bandit arms
        D : int
            The dimensionality of the context vectors
        payoff_variance : float or :py:class:`ndarray <numpy.ndarray>` of shape `(K,)`
            The variance of the random noise in the arm payoffs. If a float,
            the variance is assumed to be equal for each arm. Default is 1.
        """
        if is_number(payoff_variance):
            payoff_variance = [payoff_variance] * K

        assert len(payoff_variance) == K
        assert all(v > 0 for v in payoff_variance)

        self.K = K
        self.D = D
        self.payoff_variance = payoff_variance

        # use a dummy placeholder variable to initialize the Bandit superclass
        placeholder = [None] * K
        super().__init__(placeholder, placeholder)

        # initialize the theta matrix
        self.thetas = np.random.uniform(-1, 1, size=(D, K))
        self.thetas /= np.linalg.norm(self.thetas, 2)

    @property
    def hyperparameters(self):
        """A dictionary of the bandit hyperparameters"""
        return {
            "id": "ContextualLinearBandit",
            "K": self.K,
            "D": self.D,
            "payoff_variance": self.payoff_variance,
        }

    @property
    def parameters(self):
        """A dictionary of the current bandit parameters"""
        return {"thetas": self.thetas}

    def get_context(self):
        """
        Sample the context vectors for each arm from a multivariate standard
        normal distribution.

        Returns
        -------
        context : :py:class:`ndarray <numpy.ndarray>` of shape `(D, K)`
            A `D`-dimensional context vector sampled from a standard normal
            distribution for each of the `K` bandit arms.
        """
        return np.random.normal(size=(self.D, self.K))

    def _expected_payoffs(self, context):
        """Return the length-`K` vector of ``context[:, k] @ thetas[:, k]``."""
        thetas = self.thetas
        return np.array([context[:, k] @ thetas[:, k] for k in range(self.K)])

    def oracle_payoff(self, context):
        """
        Return the expected reward for an optimal agent.

        Parameters
        ----------
        context : :py:class:`ndarray <numpy.ndarray>` of shape `(D, K)` or None
            The current context matrix for each of the bandit arms. If None,
            the expected payoffs cached by the most recent pull are reused.

        Returns
        -------
        optimal_rwd : float
            The expected reward under an optimal policy.
        optimal_arm : int
            The arm ID with the largest expected reward.
        """
        if context is not None:
            # derive the expected values from the supplied context rather
            # than relying on whatever an earlier pull happened to cache
            self.arm_evs = self._expected_payoffs(context)
        best_arm = np.argmax(self.arm_evs)
        return self.arm_evs[best_arm], best_arm

    def _pull(self, arm_id, context):
        """Sample a noisy linear reward for `arm_id` under `context`."""
        # `scale` is a standard deviation, so convert from the variance
        noise_std = np.sqrt(self.payoff_variance)
        self._noise = np.random.normal(scale=noise_std, size=self.K)
        self.arm_evs = self._expected_payoffs(context)
        return (self.arm_evs + self._noise)[arm_id]
================================================
FILE: numpy_ml/bandits/policies.py
================================================
"""A module containing exploration policies for various multi-armed bandit problems."""
from abc import ABC, abstractmethod
from collections import defaultdict
import numpy as np
from ..utils.testing import is_number
class BanditPolicyBase(ABC):
    def __init__(self):
        """A simple base class for multi-armed bandit policies"""
        self.step = 0
        self.ev_estimates = {}
        self.is_initialized = False
        super().__init__()

    def __repr__(self):
        """Return a string representation of the policy"""
        HP = self.hyperparameters
        params = ", ".join(["{}={}".format(k, v) for (k, v) in HP.items() if k != "id"])
        return "{}({})".format(HP["id"], params)

    @property
    def hyperparameters(self):
        """
        A dictionary containing the policy hyperparameters.

        Subclasses are expected to override this; ``__repr__`` requires the
        returned dictionary to contain an ``"id"`` entry.
        """
        pass

    @property
    def parameters(self):
        """A dictionary containing the current policy parameters"""
        pass

    def act(self, bandit, context=None):
        """
        Select an arm and sample from its payoff distribution.

        Parameters
        ----------
        bandit : :class:`Bandit` instance
            The multi-armed bandit to act upon
        context : :py:class:`ndarray <numpy.ndarray>` of shape `(D,)` or None
            The context vector for the current timestep if interacting with a
            contextual bandit. Otherwise, this argument is unused. Default is
            None.

        Returns
        -------
        rwd : float
            The reward received after pulling ``arm_id``.
        arm_id : int
            The arm that was pulled to generate ``rwd``.
        """
        # bandit-dependent state is created lazily on the first interaction
        if not self.is_initialized:
            self._initialize_params(bandit)

        arm_id = self._select_arm(bandit, context)
        rwd = self._pull_arm(bandit, arm_id, context)
        self._update_params(arm_id, rwd, context)
        return rwd, arm_id

    def reset(self):
        """Reset the policy parameters and counters to their initial states."""
        self.step = 0
        self._reset_params()
        self.is_initialized = False

    def _pull_arm(self, bandit, arm_id, context):
        """Execute a bandit action and return the received reward."""
        self.step += 1
        return bandit.pull(arm_id, context)

    @abstractmethod
    def _select_arm(self, bandit, context):
        """Select an arm based on the current context"""
        pass

    @abstractmethod
    def _update_params(self, arm_id, reward, context):
        """
        Update the policy parameters after an interaction.

        Called by ``act`` with the arm that was pulled, the reward that was
        received, and the context that was passed to ``act``.
        """
        pass

    @abstractmethod
    def _initialize_params(self, bandit):
        """
        Initialize any policy-specific parameters that depend on information
        from the bandit environment.
        """
        pass

    @abstractmethod
    def _reset_params(self):
        """
        Reset any model-specific parameters. This gets called within the
        public `self.reset()` method.
        """
        pass
class EpsilonGreedy(BanditPolicyBase):
    def __init__(self, epsilon=0.05, ev_prior=0.5):
        r"""
        An epsilon-greedy policy for multi-armed bandit problems.

        Notes
        -----
        With probability :math:`1-\epsilon` the policy greedily pulls the arm
        whose estimated expected payoff is currently largest; with probability
        :math:`\epsilon` it pulls an arm chosen uniformly at random:

        .. math::

            P(a) = \left\{
                 \begin{array}{lr}
                   \epsilon / N + (1 - \epsilon) &\text{if }
                        a = \arg \max_{a' \in \mathcal{A}}
                            \mathbb{E}_{q_{\hat{\theta}}}[r \mid a']\\
                   \epsilon / N &\text{otherwise}
                 \end{array}
               \right.

        where :math:`N = |\mathcal{A}|` is the number of arms,
        :math:`q_{\hat{\theta}}` is the estimate of the arm payoff
        distribution under current model parameters :math:`\hat{\theta}`, and
        :math:`\mathbb{E}_{q_{\hat{\theta}}}[r \mid a']` is the expected
        reward under :math:`q_{\hat{\theta}}` of receiving reward `r` after
        taking action :math:`a'`.

        Parameters
        ----------
        epsilon : float in [0, 1]
            The probability of taking a random action. Default is 0.05.
        ev_prior : float
            The starting expected payoff for each arm before any data has been
            observed. Default is 0.5.
        """
        super().__init__()
        self.ev_prior = ev_prior
        self.epsilon = epsilon
        self.pull_counts = defaultdict(int)

    @property
    def parameters(self):
        """The current per-arm expected value estimates"""
        return {"ev_estimates": self.ev_estimates}

    @property
    def hyperparameters(self):
        """The settings this policy was constructed with"""
        return {
            "id": "EpsilonGreedy",
            "epsilon": self.epsilon,
            "ev_prior": self.ev_prior,
        }

    def _initialize_params(self, bandit):
        """Seed every arm's expected value estimate with ``ev_prior``."""
        self.ev_estimates = dict.fromkeys(range(bandit.n_arms), self.ev_prior)
        self.is_initialized = True

    def _select_arm(self, bandit, context=None):
        """Explore with probability ``epsilon``; otherwise exploit greedily."""
        explore = np.random.rand() < self.epsilon
        if explore:
            return np.random.choice(bandit.n_arms)
        # greedy choice: the first arm holding the largest current estimate
        return max(self.ev_estimates, key=self.ev_estimates.get)

    def _update_params(self, arm_id, reward, context=None):
        """Fold `reward` into the running average kept for `arm_id`."""
        self.pull_counts[arm_id] += 1
        n_pulls = self.pull_counts[arm_id]
        previous = self.ev_estimates[arm_id]
        self.ev_estimates[arm_id] = previous + (reward - previous) / n_pulls

    def _reset_params(self):
        """
        Discard the learned estimates and pull counts. Invoked by the public
        `self.reset()` method.
        """
        self.pull_counts = defaultdict(int)
        self.ev_estimates = {}
class UCB1(BanditPolicyBase):
    def __init__(self, C=1, ev_prior=0.5):
        r"""
        A UCB1 policy for multi-armed bandit problems.

        Notes
        -----
        The UCB1 algorithm [*]_ guarantees the cumulative regret is bounded by log
        `t`, where `t` is the current timestep. To make this guarantee UCB1
        assumes all arm payoffs are between 0 and 1.

        Under UCB1, the upper confidence bound on the expected value for
        pulling arm `a` at timestep `t` is:

        .. math::

            \text{UCB}(a, t) = \text{EV}_t(a) + C \sqrt{\frac{2 \log t}{N_t(a)}}

        where :math:`\text{EV}_t(a)` is the average of the rewards received so
        far from pulling arm `a`, `C` is a free parameter controlling the
        "optimism" of the confidence upper bound for :math:`\text{UCB}(a, t)`
        (for logarithmic regret bounds, `C` must equal 1), and :math:`N_t(a)`
        is the number of times arm `a` has been pulled during the previous `t -
        1` timesteps.

        References
        ----------
        .. [*] Auer, P., Cesa-Bianchi, N., & Fischer, P. (2002). Finite-time
           analysis of the multiarmed bandit problem. *Machine Learning,
           47(2)*.

        Parameters
        ----------
        C : float in (0, +infinity)
            A confidence/optimism parameter affecting the degree of
            exploration, where larger values encourage greater exploration. The
            UCB1 algorithm assumes `C=1`. Default is 1.
        ev_prior : float
            The starting expected value for each arm before any data has been
            observed. Default is 0.5.
        """
        super().__init__()
        self.C = C
        self.ev_prior = ev_prior
        # created here as well as in `_reset_params` so that a freshly
        # constructed policy can act without an explicit `reset()` call
        self.pull_counts = defaultdict(int)

    @property
    def parameters(self):
        """A dictionary containing the current policy parameters"""
        return {"ev_estimates": self.ev_estimates}

    @property
    def hyperparameters(self):
        """A dictionary containing the policy hyperparameters"""
        return {
            "C": self.C,
            "id": "UCB1",
            "ev_prior": self.ev_prior,
        }

    def _initialize_params(self, bandit):
        """
        Initialize any policy-specific parameters that depend on information
        from the bandit environment.
        """
        self.ev_estimates = {i: self.ev_prior for i in range(bandit.n_arms)}
        self.is_initialized = True

    def _select_arm(self, bandit, context=None):
        """Return the arm with the largest upper confidence bound."""
        # add eps to avoid divide-by-zero errors on the first pull of each arm
        eps = np.finfo(float).eps
        n_arms, t = bandit.n_arms, self.step + 1
        estimates, counts = self.ev_estimates, self.pull_counts
        scores = [
            estimates[a] + self.C * np.sqrt(2 * np.log(t) / (counts[a] + eps))
            for a in range(n_arms)
        ]
        return np.argmax(scores)

    def _update_params(self, arm_id, reward, context=None):
        """Fold `reward` into the running average kept for `arm_id`."""
        estimates, counts = self.ev_estimates, self.pull_counts
        counts[arm_id] += 1
        estimates[arm_id] += (reward - estimates[arm_id]) / counts[arm_id]

    def _reset_params(self):
        """
        Reset any model-specific parameters. This gets called within the
        public `self.reset()` method.
        """
        self.ev_estimates = {}
        self.pull_counts = defaultdict(int)
class ThompsonSamplingBetaBinomial(BanditPolicyBase):
    def __init__(self, alpha=1, beta=1):
        r"""
        A conjugate Thompson sampling [1]_ [2]_ policy for multi-armed bandits with
        Bernoulli likelihoods.

        Notes
        -----
        The policy assumes independent Beta priors on the Bernoulli arm payoff
        probabilities, :math:`\theta`:

        .. math::

            \theta_k \sim \text{Beta}(\alpha_k, \beta_k) \\
            r \mid \theta_k \sim \text{Bernoulli}(\theta_k)

        where :math:`k \in \{1,\ldots,K \}` indexes arms in the MAB and
        :math:`\theta_k` is the parameter of the Bernoulli likelihood for arm
        `k`. The sampler begins by selecting an arm with probability
        proportional to its payoff probability under the initial Beta prior.
        After pulling the sampled arm and receiving a reward, `r`, the sampler
        computes the posterior over the model parameters (arm payoffs) via
        Bayes' rule, and then samples a new action in proportion to its payoff
        probability under this posterior. This process (i.e., sample action
        from posterior, take action and receive reward, compute updated
        posterior) is repeated until the number of trials is exhausted.

        Note that due to the conjugacy between the Beta prior and Bernoulli
        likelihood the posterior for each arm will also be Beta-distributed and
        can be computed and sampled from efficiently:

        .. math::

            \theta_k \mid r \sim \text{Beta}(\alpha_k + r, \beta_k + 1 - r)

        References
        ----------
        .. [1] Thompson, W. (1933). On the likelihood that one unknown
           probability exceeds another in view of the evidence of two samples.
           *Biometrika, 25(3/4)*, 285-294.
        .. [2] Chapelle, O., & Li, L. (2011). An empirical evaluation of
           Thompson sampling. *Advances in Neural Information Processing
           Systems, 24*, 2249-2257.

        Parameters
        ----------
        alpha : float or list of length `K`
            Parameter for the Beta prior on arm payouts. If a float, this value
            will be used in the prior for all of the `K` arms.
        beta : float or list of length `K`
            Parameter for the Beta prior on arm payouts. If a float, this value
            will be used in the prior for all of the `K` arms.
        """
        super().__init__()
        self.alphas, self.betas = [], []
        self.alpha, self.beta = alpha, beta
        self.is_initialized = False

    @property
    def parameters(self):
        """A dictionary containing the current policy parameters"""
        return {
            "ev_estimates": self.ev_estimates,
            "alphas": self.alphas,
            "betas": self.betas,
        }

    @property
    def hyperparameters(self):
        """A dictionary containing the policy hyperparameters"""
        return {
            "id": "ThompsonSamplingBetaBinomial",
            "alpha": self.alpha,
            "beta": self.beta,
        }

    def _initialize_params(self, bandit):
        """
        Build the per-arm Beta prior and the initial expected value estimates
        for a :class:`BernoulliBandit` environment.
        """
        bhp = bandit.hyperparameters
        fstr = "ThompsonSamplingBetaBinomial only defined for BernoulliBandit, got: {}"
        assert bhp["id"] == "BernoulliBandit", fstr.format(bhp["id"])

        # initialize the model prior; per-arm lists are copied so posterior
        # updates never mutate the `alpha` / `beta` hyperparameters in place
        n_arms = bandit.n_arms
        if is_number(self.alpha):
            self.alphas = [self.alpha] * n_arms
        else:
            self.alphas = list(self.alpha)

        if is_number(self.beta):
            self.betas = [self.beta] * n_arms
        else:
            self.betas = list(self.beta)

        assert len(self.alphas) == len(self.betas) == n_arms

        self.ev_estimates = {i: self._map_estimate(i, 1) for i in range(n_arms)}
        self.is_initialized = True

    def _select_arm(self, bandit, context):
        """Sample payoff probabilities from the posterior and act greedily."""
        if not self.is_initialized:
            self._initialize_params(bandit)

        # draw a sample from the current model posterior
        posterior_sample = np.random.beta(self.alphas, self.betas)

        # greedily select an action based on this sample
        return np.argmax(posterior_sample)

    def _update_params(self, arm_id, rwd, context):
        """
        Compute the parameters of the Beta posterior, P(payoff prob | rwd),
        for arm `arm_id`.
        """
        self.alphas[arm_id] += rwd
        self.betas[arm_id] += 1 - rwd
        self.ev_estimates[arm_id] = self._map_estimate(arm_id, rwd)

    def _map_estimate(self, arm_id, rwd):
        """
        Compute the current MAP estimate (the mode of the Beta distribution)
        for an arm's payoff probability.
        """
        a, b = self.alphas[arm_id], self.betas[arm_id]
        if a > 1 and b > 1:
            map_payoff_prob = (a - 1) / (a + b - 2)
        elif a < 1 and b < 1:
            map_payoff_prob = rwd  # 0 or 1 equally likely, make a guess
        elif a == 1 and b == 1:
            map_payoff_prob = 0.5  # uniform density: every value is a mode
        elif a <= 1 and b >= 1:
            map_payoff_prob = 0  # density is non-increasing on [0, 1]
        else:
            map_payoff_prob = 1  # density is non-decreasing on [0, 1]
        return map_payoff_prob

    def _reset_params(self):
        """
        Reset any model-specific parameters. This gets called within the
        public `self.reset()` method.
        """
        self.alphas, self.betas = [], []
        self.ev_estimates = {}
class LinUCB(BanditPolicyBase):
    def __init__(self, alpha=1):
        """
        A disjoint linear UCB policy [*]_ for contextual linear bandits.

        Notes
        -----
        LinUCB is only defined for :class:`ContextualLinearBandit` environments.

        References
        ----------
        .. [*] Li, L., Chu, W., Langford, J., & Schapire, R. (2010). A
           contextual-bandit approach to personalized news article
           recommendation. In *Proceedings of the 19th International Conference
           on World Wide Web*, 661-670.

        Parameters
        ----------
        alpha : float
            A confidence/optimism parameter affecting the amount of
            exploration. Default is 1.
        """  # noqa
        super().__init__()

        self.alpha = alpha
        self.A, self.b = [], []
        self.is_initialized = False

    @property
    def parameters(self):
        """A dictionary containing the current policy parameters"""
        return {"ev_estimates": self.ev_estimates, "A": self.A, "b": self.b}

    @property
    def hyperparameters(self):
        """A dictionary containing the policy hyperparameters"""
        return {
            "id": "LinUCB",
            "alpha": self.alpha,
        }

    def _initialize_params(self, bandit):
        """
        Initialize any policy-specific parameters that depend on information
        from the bandit environment.
        """
        bhp = bandit.hyperparameters
        fstr = "LinUCB only defined for contextual linear bandits, got: {}"
        assert bhp["id"] == "ContextualLinearBandit", fstr.format(bhp["id"])

        # one `(D, D)` matrix and one `(D,)` vector per arm
        self.A, self.b = [], []
        for _ in range(bandit.n_arms):
            self.A.append(np.eye(bandit.D))
            self.b.append(np.zeros(bandit.D))

        self.is_initialized = True

    def _select_arm(self, bandit, context):
        """Return the arm with the largest upper confidence bound."""
        scores = []
        for a in range(bandit.n_arms):
            x, A, b = context[:, a], self.A[a], self.b[a]
            A_inv = np.linalg.inv(A)
            theta_hat = A_inv @ b
            # estimated payoff plus an exploration bonus scaled by `alpha`
            ucb = theta_hat @ x + self.alpha * np.sqrt(x @ A_inv @ x)
            scores.append(ucb)
        return np.argmax(scores)

    def _update_params(self, arm_id, rwd, context):
        """Update `A` and `b` for the arm that was just pulled."""
        x = context[:, arm_id]
        # `x` is 1-D, so `x @ x.T` would be a scalar inner product; the
        # update needs the `(D, D)` outer product
        self.A[arm_id] += np.outer(x, x)
        self.b[arm_id] += rwd * x

    def _reset_params(self):
        """
        Reset any model-specific parameters. This gets called within the
        public `self.reset()` method.
        """
        self.A, self.b = [], []
        self.ev_estimates = {}
================================================
FILE: numpy_ml/bandits/trainer.py
================================================
"""A trainer/runner object for executing and comparing MAB policies."""
import warnings
import os.path as op
from collections import defaultdict
import numpy as np
from numpy_ml.utils.testing import DependencyWarning
try:
import matplotlib.pyplot as plt
_PLOTTING = True
except ImportError:
fstr = "Cannot import matplotlib. Plotting functionality disabled."
warnings.warn(fstr, DependencyWarning)
_PLOTTING = False
def get_scriptdir():
    """Return the symlink-resolved directory that holds this module's file."""
    resolved_path = op.realpath(__file__)
    parent_dir, _ = op.split(resolved_path)
    return parent_dir
def mse(bandit, policy):
    """
    Mean squared error between the policy's expected value estimates and the
    bandit's true expected arm values.

    Estimates are taken from ``policy.ev_estimates`` in ascending key order
    and paired positionally with ``bandit.arm_evs``. If the policy has no
    ``ev_estimates`` attribute, or it is empty, NaN is returned.
    """
    has_estimates = hasattr(policy, "ev_estimates") and len(policy.ev_estimates) != 0
    if not has_estimates:
        return np.nan

    true_evs = bandit.arm_evs
    estimates = policy.ev_estimates
    ordered = [estimates[arm] for arm in sorted(estimates)]
    squared_errors = [(est - ev) ** 2 for est, ev in zip(ordered, true_evs)]
    return np.mean(squared_errors)
def smooth(prev, cur, weight):
    r"""
    Blend the previous smoothed value with the current raw value.

    Notes
    -----
    The smoothed value at timestep `t`, :math:`\tilde{X}_t`, is

    .. math::

        \tilde{X}_t = \epsilon \tilde{X}_{t-1} + (1 - \epsilon) X_t

    where :math:`X_t` is the raw value at timestep `t`,
    :math:`\tilde{X}_{t-1}` is the smoothed value at timestep `t-1`, and
    :math:`\epsilon` is the smoothing weight.

    Parameters
    ----------
    prev : float or :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        The smoothed signal at the immediately preceding timestep.
    cur : float or :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        The raw signal at the current timestep.
    weight : float or :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        The smoothing weight. Values near 0 smooth very little, values near 1
        smooth aggressively. If an array, each entry is the smoothing weight
        for the corresponding dimension of `cur`.

    Returns
    -------
    smoothed : float or :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        The smoothed signal
    """
    carried_over = weight * prev
    fresh = (1 - weight) * cur
    return carried_over + fresh
class BanditTrainer:
def __init__(self):
"""
An object to facilitate multi-armed bandit training, comparison, and
evaluation.
"""
self.logs = {}
def compare(
    self,
    policies,
    bandit,
    n_trials,
    n_duplicates,
    plot=True,
    seed=None,
    smooth_weight=0.999,
    out_dir=None,
):
    """
    Compare the performance of multiple policies on the same bandit
    environment, generating a plot for each.

    Parameters
    ----------
    policies : list of :class:`BanditPolicyBase` instances
        The multi-armed bandit policies to compare.
    bandit : :class:`Bandit` instance
        The environment to train the policies on.
    n_trials : int
        The number of trials per run.
    n_duplicates: int
        The number of times to evaluate each policy on the bandit
        environment. Larger values permit a better estimate of the
        variance in payoff / cumulative regret for each policy.
    plot : bool
        Whether to generate a plot of the policy's average reward and
        regret across the episodes. Default is True.
    seed : int
        The seed for the random number generator. It is re-applied before
        each policy is trained. Default is None.
    smooth_weight : float in [0, 1]
        The smoothing weight. Values closer to 0 result in less smoothing,
        values closer to 1 produce more aggressive smoothing. Default is
        0.999.
    out_dir : str or None
        Plots will be saved to this directory if `plot` is True. If
        `out_dir` is None, plots will not be saved. Default is None.
    """  # noqa: E501
    self.init_logs(policies)

    plotting = plot and _PLOTTING
    all_axes = [None] * len(policies)
    if plotting:
        # squeeze=False keeps one row of two axes per policy even when only
        # a single policy is being compared
        fig, all_axes = plt.subplots(len(policies), 2, sharex=True, squeeze=False)
        fig.set_size_inches(10.5, len(policies) * 5.25)

    for policy, axes in zip(policies, all_axes):
        # `is not None` so that a seed of 0 is honored
        if seed is not None:
            np.random.seed(seed)

        bandit.reset()
        policy.reset()

        self.train(
            policy,
            bandit,
            n_trials,
            n_duplicates,
            axes=axes,
            plot=plot,
            verbose=False,
            out_dir=out_dir,
            smooth_weight=smooth_weight,
        )

    # nothing below applies when no figure was created
    if not plotting:
        return

    # enforce the same y-ranges across plots for straightforward comparison
    a1_r, a2_r = zip(*[(a1.get_ylim(), a2.get_ylim()) for (a1, a2) in all_axes])
    a1_min = min(a1_r, key=lambda x: x[0])[0]
    a1_max = max(a1_r, key=lambda x: x[1])[1]
    a2_min = min(a2_r, key=lambda x: x[0])[0]
    a2_max = max(a2_r, key=lambda x: x[1])[1]

    for (a1, a2) in all_axes:
        a1.set_ylim(a1_min, a1_max)
        a2.set_ylim(a2_min, a2_max)

    if out_dir is not None:
        plt.savefig(op.join(out_dir, "bandit_comparison.png"), dpi=300)
    plt.show()
def train(
    self,
    policy,
    bandit,
    n_trials,
    n_duplicates,
    plot=True,
    axes=None,
    verbose=True,
    print_every=100,
    smooth_weight=0.999,
    out_dir=None,
):
    """
    Run a single policy on a multi-armed bandit for several independent
    runs, recording per-trial statistics in ``self.logs``.

    Parameters
    ----------
    policy : :class:`BanditPolicyBase` instance
        The multi-armed bandit policy to train.
    bandit : :class:`Bandit` instance
        The environment to run the policy on.
    n_trials : int
        The number of trials per run.
    n_duplicates: int
        The number of runs to evaluate. Both the bandit and the policy are
        reset at the start of every run.
    plot : bool
        Whether to generate a plot of the policy's average reward and
        regret across the episodes. Default is True.
    axes : list of :py:class:`Axis` instances or None
        If not None and ``plot = True``, these are the axes that will be
        used to plot the cumulative reward and regret, respectively.
        Default is None.
    verbose : boolean
        Whether to print run statistics during training. Default is True.
    print_every : int
        The number of trials to run between progress messages on stdout.
        This is ignored if ``verbose`` is false. Default is 100.
    smooth_weight : float in [0, 1]
        The smoothing weight. Values closer to 0 result in less smoothing,
        values closer to 1 produce more aggressive smoothing. Default is
        0.999.
    out_dir : str or None
        Plots will be saved to this directory if `plot` is True. If
        `out_dir` is None, plots will not be saved. Default is None.

    Returns
    -------
    policy : :class:`BanditPolicyBase` instance
        The policy trained during the last (i.e. most recent) duplicate
        run.
    """  # noqa: E501
    if str(policy) not in self.logs:
        self.init_logs(policy)

    history = self.logs[str(policy)]

    for dup in range(n_duplicates):
        if verbose:
            print("\nDUPLICATE {}/{}\n".format(dup + 1, n_duplicates))

        bandit.reset()
        policy.reset()

        avg_oracle_reward = 0
        cregret = 0
        for trial in range(1, n_trials + 1):
            rwd, arm, orwd, oarm = self._train_step(bandit, policy)

            loss = mse(bandit, policy)
            regret = orwd - rwd

            avg_oracle_reward += orwd
            cregret += regret

            # logs are keyed by the 1-based trial number
            snapshot = {
                "mse": loss,
                "reward": rwd,
                "regret": regret,
                "cregret": cregret,
                "optimal_arm": oarm,
                "selected_arm": arm,
                "optimal_reward": orwd,
            }
            for metric, value in snapshot.items():
                history[metric][trial].append(value)

            if trial % print_every == 0 and verbose:
                progress = "Trial {}/{}, {}/{}, Regret: {:.4f}"
                print(progress.format(trial, n_trials, dup + 1, n_duplicates, regret))

        avg_oracle_reward /= n_trials

        if verbose:
            self._print_run_summary(bandit, policy, regret)

    if plot and _PLOTTING:
        self._plot_reward(avg_oracle_reward, policy, smooth_weight, axes, out_dir)

    return policy
def _train_step(self, bandit, policy):
P, B = policy, bandit
C = B.get_context() if hasattr(B, "get_context") else None
rwd, arm = P.act(B, C)
oracle_rwd, oracle_arm = B.oracle_payoff(C)
return rwd, arm, oracle_rwd, oracle_arm
def init_logs(self, policies):
    """
    Create a fresh, empty set of training logs.

    Notes
    -----
    ``self.logs`` is rebuilt from scratch on every call, so previously
    recorded values (for any policy) are discarded.

    Logs are stored as nested dictionaries indexed as::

        log[model_id][metric][trial_number][duplicate_number]

    where `model_id` is ``str(policy)``. For example,
    ``logs['model1']['regret'][3][1]`` is the regret recorded on the 3rd
    trial of the 2nd duplicate run for model1.

    The available metrics are 'mse' (mean-squared error between estimated
    arm EVs and the true EVs), 'regret', 'reward', 'cregret' (cumulative
    regret), 'optimal_arm', 'selected_arm', and 'optimal_reward'.

    Parameters
    ----------
    policies : policy instance or list of policy instances
        The policies to create logs for. A single (non-list) value is
        treated as a one-element list.
    """
    metric_names = (
        "mse",
        "regret",
        "reward",
        "cregret",
        "optimal_arm",
        "selected_arm",
        "optimal_reward",
    )
    policy_list = policies if isinstance(policies, list) else [policies]

    logs = {}
    for policy in policy_list:
        # each metric maps trial number -> list with one entry per duplicate
        logs[str(policy)] = {name: defaultdict(list) for name in metric_names}
    self.logs = logs
def _print_run_summary(self, bandit, policy, regret):
    """
    Print the policy's arm-value estimates next to the bandit's true values.

    Nothing is printed (and None is returned) when `policy` has no
    ``ev_estimates`` attribute or that attribute is empty.

    Parameters
    ----------
    bandit : bandit instance
        Must expose ``arm_evs``, the sequence of true arm expected values.
    policy : policy instance
        Its ``ev_estimates`` mapping (arm key -> estimate) is sorted by key
        and paired positionally with ``bandit.arm_evs``.
    regret : float
        The regret value reported on the final line of the summary.
    """
    if not hasattr(policy, "ev_estimates") or len(policy.ev_estimates) == 0:
        return None

    true_evs = bandit.arm_evs
    # sort by arm key so estimates line up positionally with `arm_evs`
    estimates = sorted(policy.ev_estimates.items(), key=lambda item: item[0])

    print("\n\nEstimated vs. Real EV\n" + "-" * 21)

    row = "Arm {}: {:.4f} v. {:.4f}"
    sq_errors = []
    for arm_no, ((_, estimate), true_ev) in enumerate(zip(estimates, true_evs), 1):
        print(row.format(arm_no, estimate, true_ev))
        sq_errors.append((estimate - true_ev) ** 2)

    footer = "\nFinal MSE: {:.4f}\nFinal Regret: {:.4f}\n\n"
    print(footer.format(np.mean(sq_errors), regret))
def _plot_reward(self, optimal_rwd, policy, smooth_weight, axes=None, out_dir=None):
    """
    Plot the smoothed reward and cumulative regret recorded for `policy`.

    Parameters
    ----------
    optimal_rwd : float
        Value drawn as the dashed reference line on the reward panel (the
        cumulative-regret panel uses 0 as its reference).
    policy : policy instance
        The policy whose logs (``self.logs[str(policy)]``) are plotted.
    smooth_weight : float
        Smoothing weight forwarded to ``self._smoothed_metrics``.
    axes : sequence of two matplotlib axes, or None
        Axes to draw on. If None, a new 1x2 figure is created, titled,
        optionally saved, and shown. Default is None.
    out_dir : str or None
        Directory in which to save the figure as ``<policy id>.png``. Only
        used when `axes` is None. Default is None.

    Returns
    -------
    ax1, ax2 : matplotlib axes
        The reward and cumulative-regret axes, respectively.
    """
    L = self.logs[str(policy)]
    smds = self._smoothed_metrics(policy, optimal_rwd, smooth_weight)

    own_figure = axes is None
    if own_figure:
        fig, [ax1, ax2] = plt.subplots(1, 2)
    else:
        assert len(axes) == 2
        ax1, ax2 = axes

    e_ids = range(1, len(L["reward"]) + 1)

    panels = zip([ax1, ax2], ["reward", "cregret"], ["b", "r"], [optimal_rwd, 0])
    for ax, metric, color, reference in panels:
        avg = smds["sm_{}_avg".format(metric)]
        std = smds["sm_{}_std".format(metric)]

        ax.plot(e_ids, avg, color=color)
        ax.axhline(reference, 0, 1, color=color, ls="--")
        ax.fill_between(e_ids, avg + std, avg - std, color=color, alpha=0.25)

        ax.set_xlabel("Trial")
        label = "Cumulative Regret" if metric == "cregret" else metric
        ax.set_ylabel("Smoothed Avg. {}".format(label.title()))

        if own_figure:
            # Make each panel square. `set_aspect` expects a scalar; the
            # previous length-1 array relied on implicit array->float
            # conversion, which NumPy has deprecated.
            x_lo, x_hi = ax.get_xlim()
            y_lo, y_hi = ax.get_ylim()
            if y_hi != y_lo:
                ax.set_aspect(float((x_hi - x_lo) / (y_hi - y_lo)))
        else:
            ax.set_title(str(policy))

    if own_figure:
        fig.suptitle(str(policy))
        fig.tight_layout()

        if out_dir is not None:
            bid = policy.hyperparameters["id"]
            plt.savefig(op.join(out_dir, f"{bid}.png"), dpi=300)
        plt.show()
    return ax1, ax2
def _smoothed_metrics(self, policy, optimal_rwd, smooth_weight):
    """
    Compute per-trial smoothed averages and standard deviations of every
    logged metric for `policy`.

    Each duplicate run is smoothed independently across trials (via the
    module-level `smooth` helper); the mean and standard deviation are then
    taken across duplicates at every trial.

    Parameters
    ----------
    policy : policy instance
        The policy whose logs (``self.logs[str(policy)]``) are summarized.
    optimal_rwd : float
        Unused by this method.
    smooth_weight : float
        Weight forwarded to `smooth`.

    Returns
    -------
    smds : dict
        Maps ``"sm_<metric>_avg"`` and ``"sm_<metric>_std"`` to arrays with
        one entry per logged trial (index 0 corresponds to trial 1).
    """
    store = self.logs[str(policy)]
    tracked = [name for name in store.keys() if name != "selections"]

    # pre-allocate the output arrays and seed them with the (unsmoothed)
    # statistics of the first trial
    summary = {}
    for name in tracked:
        avg_key = "sm_{}_avg".format(name)
        std_key = "sm_{}_std".format(name)
        summary[avg_key] = np.zeros(len(store["reward"]))
        summary[avg_key][0] = np.mean(store[name][1])
        summary[std_key] = np.zeros(len(store["reward"]))
        summary[std_key][0] = np.std(store[name][1])

    # running smoothed value for each duplicate, starting from trial 1
    running = {name: store[name][1] for name in store.keys()}

    for trial in range(2, len(store["reward"]) + 1):
        for name in tracked:
            running[name] = [
                smooth(previous, current, smooth_weight)
                for previous, current in zip(running[name], store[name][trial])
            ]
            summary["sm_{}_avg".format(name)][trial - 1] = np.mean(running[name])
            summary["sm_{}_std".format(name)][trial - 1] = np.std(running[name])
    return summary
================================================
FILE: numpy_ml/factorization/README.md
================================================
# Factors
The `factors.py` module includes common approximate matrix-factorization
algorithms including:
- Regularized alternating least squares (ALS)
- Non-negative matrix factorization via fast hierarchical least squares (HALS) ([Cichocki & Phan, 2008](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.214.6398&rep=rep1&type=pdf))
================================================
FILE: numpy_ml/factorization/__init__.py
================================================
"""Algorithms for approximate matrix factorization"""
from .factors import *
================================================
FILE: numpy_ml/factorization/factors.py
================================================
"""Algorithms for approximate matrix factorization"""
from copy import deepcopy
import numpy as np
class VanillaALS:
    def __init__(self, K, alpha=1, max_iter=200, tol=1e-4):
        r"""
        Approximately factor a real-valued matrix using regularized alternating
        least-squares (ALS).

        Notes
        -----
        The regularized ALS minimization problem is

        .. math::

            \min_{\mathbf{W}, \mathbf{H}} ||\mathbf{X} - \mathbf{WH}||^2 +
                \alpha \left(
                    ||\mathbf{W}||^2 + ||\mathbf{H}||^2
                \right)

        where :math:`||\cdot||` denotes the Frobenius norm, **X** is the
        :math:`N \times M` data matrix, :math:`\mathbf{W}` and
        :math:`\mathbf{H}` are learned factor matrices with dimensions :math:`N
        \times K` and :math:`K \times M`, respectively, and :math:`\alpha` is a
        user-defined regularization weight.

        ALS proceeds by alternating between fixing **W** and optimizing for
        **H** and fixing **H** and optimizing for **W**. Vanilla ALS has no
        convergence guarantees and the objective function is prone to
        oscillation across updates, particularly for dense input matrices [1]_.

        References
        ----------
        .. [1] Gillis, N. (2014). The why and how of nonnegative matrix
           factorization. *Regularization, optimization, kernels, and support
           vector machines, 12(257)*, 257-291.

        Parameters
        ----------
        K : int
            The number of latent factors to include in the factor matrices W
            and H.
        alpha : float
            The L2 regularization weight on the factor matrices. Larger
            values result in more aggressive regularization. Default is 1.
        max_iter : int
            The maximum number of iterations to run before stopping. Default is
            200.
        tol : float
            The tolerance for the stopping condition: iteration stops as soon
            as the regularized loss is less than or equal to `tol`. Default
            is 1e-4.
        """
        self.K = K
        self.W = None
        self.H = None
        self.tol = tol
        self.alpha = alpha
        self.max_iter = max_iter

    @property
    def parameters(self):
        """Return a dictionary of the current model parameters"""
        return {"W": self.W, "H": self.H}

    @property
    def hyperparameters(self):
        """Return a dictionary of the model hyperparameters"""
        return {
            "id": "ALSFactor",
            "K": self.K,
            "tol": self.tol,
            "alpha": self.alpha,
            "max_iter": self.max_iter,
        }

    def _init_factor_matrices(self, X, W=None, H=None):
        """
        Set ``self.W`` and ``self.H``, randomly initializing whichever of the
        two was not supplied.
        """
        N, M = X.shape

        # Scale the random entries by the magnitude of the data mean. Using
        # the signed mean produced a NaN scale for negative-mean data and an
        # all-zero initialization for zero-mean data.
        mean_magnitude = np.abs(X.mean())
        scale = np.sqrt(mean_magnitude / self.K) if mean_magnitude > 0 else 1.0

        self.W = np.random.rand(N, self.K) * scale if W is None else W
        self.H = np.random.rand(self.K, M) * scale if H is None else H

        assert self.W.shape == (N, self.K)
        assert self.H.shape == (self.K, M)

    def _loss(self, X, Xhat):
        """
        Regularized Frobenius loss between `X` and `Xhat`. The penalty term is
        evaluated on the current ``self.W`` and ``self.H``.
        """
        alpha, W, H = self.alpha, self.W, self.H
        sq_fnorm = lambda x: np.sum(x ** 2)  # noqa: E731
        return sq_fnorm(X - Xhat) + alpha * (sq_fnorm(W) + sq_fnorm(H))

    def _update_factor(self, X, A):
        """
        Perform the ALS update: return ``X @ A @ inv(A.T @ A + alpha * I)``,
        the ridge-regularized least-squares solution with `A` held fixed.
        """
        T1 = np.linalg.inv(A.T @ A + self.alpha * np.eye(self.K))
        return X @ A @ T1

    def fit(self, X, W=None, H=None, n_initializations=10, verbose=False):
        """
        Factor a data matrix into two low rank factors via ALS.

        Parameters
        ----------
        X : numpy array of shape `(N, M)`
            The data matrix to factor.
        W : numpy array of shape `(N, K)` or None
            An initial value for the `W` factor matrix. If None, initialize `W`
            randomly. Default is None.
        H : numpy array of shape `(K, M)` or None
            An initial value for the `H` factor matrix. If None, initialize `H`
            randomly. Default is None.
        n_initializations : int
            Number of re-initializations of the algorithm to perform before
            taking the answer with the lowest reconstruction error. This value
            is ignored and set to 1 if both `W` and `H` are not None. Default
            is 10.
        verbose : bool
            Whether to print the loss at each iteration. Default is False.

        Raises
        ------
        ValueError
            If `n_initializations` is less than 1.
        """
        if W is not None and H is not None:
            n_initializations = 1

        if n_initializations < 1:
            raise ValueError("`n_initializations` must be at least 1")

        best_loss, best_W, best_H = np.inf, None, None
        for f in range(n_initializations):
            if verbose:
                print("\nINITIALIZATION {}".format(f + 1))

            new_W, new_H, loss = self._fit(X, W, H, verbose)

            # Always keep the first run (and replace a NaN incumbent) so the
            # model is never left without factors when a run diverges.
            if best_W is None or np.isnan(best_loss) or loss <= best_loss:
                best_loss = loss
                best_W, best_H = deepcopy(new_W), deepcopy(new_H)

        self.W, self.H = best_W, best_H

        if verbose:
            print("\nFINAL LOSS: {}".format(best_loss))

    def _fit(self, X, W, H, verbose):
        """
        Run a single ALS optimization from the given (or random) starting
        point and return the final ``(W, H, loss)``.
        """
        self._init_factor_matrices(X, W, H)
        W, H = self.W, self.H

        # defined up front so a value is returned even when `max_iter` is 0
        loss = self._loss(X, W @ H)
        for i in range(self.max_iter):
            W = self._update_factor(X, H.T)
            H = self._update_factor(X.T, W).T

            # `_loss` reads the penalty from self.W / self.H, so publish the
            # current iterate before evaluating it (previously the penalty was
            # computed on the stale initial factors).
            self.W, self.H = W, H
            loss = self._loss(X, W @ H)

            if verbose:
                print("[Iter {}] Loss: {:.8f}".format(i + 1, loss))

            if loss <= self.tol:
                break
        return W, H, loss
class NMF:
    def __init__(self, K, max_iter=200, tol=1e-4):
        r"""
        Nonnegative matrix factorization (NMF) performed using fast
        hierarchical alternating least squares (HALS) [*]_.

        Notes
        -----
        The NMF minimization problem is

        .. math::

            \min_{\mathbf{W}, \mathbf{H}} ||\mathbf{X} - \mathbf{WH}||^2
                \ \ \ \ \text{subject to } \mathbf{W}, \mathbf{H} \geq 0

        where :math:`||\cdot||` denotes the Frobenius norm, and the notation
        :math:`\mathbf{A} \geq 0` indicates that each element of **A** is
        greater than or equal to 0. In the above equation, **X** is the
        :math:`N \times M` data matrix, :math:`\mathbf{W}` and
        :math:`\mathbf{H}` are learned factor matrices with dimensions :math:`N
        \times K` and :math:`K \times M`, respectively.

        As with other ALS-based approaches, there is no guarantee that NMF will
        converge to a stationary point, let alone a global minimum. As a result
        it is generally good practice to run the algorithm multiple times with
        different initializations, taking the outcome that achieves the lowest
        reconstruction error.

        References
        ----------
        .. [*] Cichocki, A., & Phan, A. (2009). Fast local algorithms for
           large scale nonnegative matrix and tensor factorizations. *IEICE
           Transactions on Fundamentals of Electronics, Communications and
           Computer Sciences, 92(3)*, 708-721.

        Parameters
        ----------
        K : int
            The number of latent factors to include in the factor matrices **W**
            and **H**.
        max_iter : int
            The maximum number of iterations to run before stopping. Default is
            200.
        tol : float
            The tolerance for the stopping condition: iteration stops as soon
            as the reconstruction loss is less than or equal to `tol`. Default
            is 1e-4.
        """
        self.K = K
        self.W = None
        self.H = None
        self.tol = tol
        self.max_iter = max_iter

    @property
    def parameters(self):
        """Return a dictionary of the current model parameters"""
        return {"W": self.W, "H": self.H}

    @property
    def hyperparameters(self):
        """Return a dictionary of the model hyperparameters"""
        return {
            "id": "NMF",
            "K": self.K,
            "tol": self.tol,
            "max_iter": self.max_iter,
        }

    def _init_factor_matrices(self, X, W, H):
        """
        Set ``self.W`` and ``self.H``, initializing any missing factor.

        A missing `W` is taken from a vanilla ALS fit (columns rescaled to unit
        norm). A missing `H` is taken from that same ALS fit when one was run,
        and is otherwise drawn uniformly at random. Caller-supplied factors are
        copied to float arrays so that the in-place HALS updates never modify
        the caller's data.
        """
        ALS = None
        N, M = X.shape

        # initialize factors using ALS if not already defined
        if W is None:
            ALS = VanillaALS(self.K, alpha=0, max_iter=200)
            ALS.fit(X, verbose=False)
            W = ALS.W / np.linalg.norm(ALS.W, axis=0)
        else:
            # copy: `_update_W` writes into W column-by-column
            W = np.array(W, dtype=float)

        if H is None:
            H = np.abs(np.random.rand(self.K, M)) if ALS is None else ALS.H
        else:
            # copy: `_update_H` writes into H row-by-row
            H = np.array(H, dtype=float)

        assert W.shape == (N, self.K)
        assert H.shape == (self.K, M)

        self.H = H
        self.W = W

    def _loss(self, X, Xhat):
        """Return the least-squares reconstruction loss between X and Xhat"""
        return np.sum((X - Xhat) ** 2)

    def _update_H(self, X, W, H):
        """Perform the fast HALS update for H (modifies and returns `H`)"""
        eps = np.finfo(float).eps
        XtW = X.T @ W  # dim: (M, K)
        WtW = W.T @ W  # dim: (K, K)

        # NOTE(review): there is no division by WtW[k, k]; this appears to
        # rely on W having unit-norm columns (as `_update_W` produces) --
        # confirm for caller-supplied W.
        for k in range(self.K):
            H[k, :] += XtW[:, k] - H.T @ WtW[:, k]
            H[k, :] = np.clip(H[k, :], eps, np.inf)  # enforce nonnegativity
        return H

    def _update_W(self, X, W, H):
        """Perform the fast HALS update for W (modifies and returns `W`)"""
        eps = np.finfo(float).eps
        XHt = X @ H.T  # dim: (N, K)
        HHt = H @ H.T  # dim: (K, K)

        for k in range(self.K):
            W[:, k] = W[:, k] * HHt[k, k] + XHt[:, k] - W @ HHt[:, k]
            W[:, k] = np.clip(W[:, k], eps, np.inf)  # enforce nonnegativity

            # renormalize the new column
            n = np.linalg.norm(W[:, k])
            W[:, k] /= n if n > 0 else 1.0
        return W

    def fit(self, X, W=None, H=None, n_initializations=10, verbose=False):
        r"""
        Factor a data matrix into two nonnegative low rank factor matrices via
        fast HALS.

        Notes
        -----
        This method implements Algorithm 2 from [*]_. In contrast to vanilla
        ALS, HALS proceeds by minimizing a *set* of local cost functions with
        the same global minima. Each cost function is defined on a "residue" of
        the factor matrices **W** and **H**:

        .. math::

            \mathbf{X}^{(j)} :=
                \mathbf{X} - \mathbf{WH} + \mathbf{w}_j \mathbf{h}_j^\top

        where :math:`\mathbf{X}^{(j)}` is the :math:`j^{th}` residue, **X** is
        the input data matrix, :math:`\mathbf{w}_j` is the :math:`j^{th}`
        column of **W**, and :math:`\mathbf{h}_j` is the :math:`j^{th}` row of
        **H** (written as a column vector). HALS proceeds by minimizing the
        cost for each residue with respect to :math:`\mathbf{h}_j` and
        :math:`\mathbf{w}_j` in turn. In either case, the cost for residue `j`,
        :math:`\mathcal{L}^{(j)}` is simply:

        .. math::

            \mathcal{L}^{(j)} :=
                || \mathbf{X}^{(j)} - \mathbf{w}_j \mathbf{h}_j^\top ||

        where :math:`||\cdot||` denotes the Frobenius norm. For NMF,
        minimization is performed under the constraint that all elements of
        both **W** and **H** are nonnegative.

        References
        ----------
        .. [*] Cichocki, A., & Phan, A. (2009). Fast local algorithms for
           large scale nonnegative matrix and tensor factorizations. *IEICE
           Transactions on Fundamentals of Electronics, Communications and
           Computer Sciences, 92(3)*, 708-721.

        Parameters
        ----------
        X : numpy array of shape `(N, M)`
            The data matrix to factor.
        W : numpy array of shape `(N, K)` or None
            An initial value for the `W` factor matrix. If None, initialize
            **W** using vanilla ALS. The array is not modified. Default is
            None.
        H : numpy array of shape `(K, M)` or None
            An initial value for the `H` factor matrix. If None, initialize
            **H** using vanilla ALS (or randomly when `W` was supplied). The
            array is not modified. Default is None.
        n_initializations : int
            Number of re-initializations of the algorithm to perform before
            taking the answer with the lowest reconstruction error. This value
            is ignored and set to 1 if both `W` and `H` are not None. Default
            is 10.
        verbose : bool
            Whether to print the loss at each iteration. Default is False.

        Raises
        ------
        ValueError
            If `n_initializations` is less than 1.
        """
        if W is not None and H is not None:
            n_initializations = 1

        if n_initializations < 1:
            raise ValueError("`n_initializations` must be at least 1")

        best_loss, best_W, best_H = np.inf, None, None
        for f in range(n_initializations):
            if verbose:
                print("\nINITIALIZATION {}".format(f + 1))

            new_W, new_H, loss = self._fit(X, W, H, verbose)

            # Always keep the first run (and replace a NaN incumbent) so the
            # model is never left without factors when a run diverges.
            if best_W is None or np.isnan(best_loss) or loss <= best_loss:
                best_loss = loss
                best_W, best_H = deepcopy(new_W), deepcopy(new_H)

        self.W, self.H = best_W, best_H
        if verbose:
            print("\nFINAL LOSS: {}".format(best_loss))

    def _fit(self, X, W, H, verbose):
        """
        Run a single HALS optimization from the given (or freshly initialized)
        starting point and return the final ``(W, H, loss)``.
        """
        self._init_factor_matrices(X, W, H)

        W, H = self.W, self.H
        # defined up front so a value is returned even when `max_iter` is 0
        loss = self._loss(X, W @ H)
        for i in range(self.max_iter):
            H = self._update_H(X, W, H)
            W = self._update_W(X, W, H)
            loss = self._loss(X, W @ H)

            if verbose:
                print("[Iter {}] Loss: {:.8f}".format(i + 1, loss))

            if loss <= self.tol:
                break
        return W, H, loss
================================================
FILE: numpy_ml/gmm/README.md
================================================
# Gaussian Mixture Models
The `gmm.py` module implements the standard (i.e., non-Bayesian) [Gaussian mixture model](https://en.wikipedia.org/wiki/Mixture_model#Gaussian_mixture_model) with maximum-likelihood parameter estimates via the [EM algorithm](https://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm).
## Plots
================================================
FILE: numpy_ml/gmm/__init__.py
================================================
from .gmm import *
================================================
FILE: numpy_ml/gmm/gmm.py
================================================
"""A Gaussian mixture model class"""
import numpy as np
from numpy_ml.utils.misc import logsumexp, log_gaussian_pdf
class GMM(object):
    def __init__(self, C=3, seed=None):
        """
        A Gaussian mixture model trained via the expectation maximization
        algorithm.

        Parameters
        ----------
        C : int
            The number of clusters / mixture components in the GMM. Default is
            3.
        seed : int
            Seed for the random number generator. If not None, the global
            NumPy generator is seeded with this value (0 is a valid seed).
            Default is None.

        Attributes
        ----------
        parameters : dict
            Populated by `fit` with the following entries, where `N` is the
            number of training examples and `d` their dimension:

            - ``"pi"``: :py:class:`ndarray <numpy.ndarray>` of shape `(C,)`,
              the cluster priors.
            - ``"Q"``: :py:class:`ndarray <numpy.ndarray>` of shape `(N, C)`,
              the variational distribution `q(T)`.
            - ``"mu"``: :py:class:`ndarray <numpy.ndarray>` of shape `(C, d)`,
              the cluster means.
            - ``"sigma"``: :py:class:`ndarray <numpy.ndarray>` of shape
              `(C, d, d)`, the cluster covariance matrices.
        hyperparameters : dict
            Holds `C` and `seed`.
        elbo : float or None
            The lower bound reached by the most recent successful `fit`.
        is_fit : bool
            Whether `fit` has completed successfully.
        """
        self.elbo = None
        self.parameters = {}
        self.hyperparameters = {
            "C": C,
            "seed": seed,
        }

        self.is_fit = False

        # `is not None` rather than truthiness so that seed=0 is honoured
        if seed is not None:
            np.random.seed(seed)

    def _initialize_params(self, X):
        """Randomly initialize the starting GMM parameters."""
        N, d = X.shape
        C = self.hyperparameters["C"]

        rr = np.random.rand(C)

        self.parameters = {
            "pi": rr / rr.sum(),  # cluster priors
            "Q": np.zeros((N, C)),  # variational distribution q(T)
            "mu": np.random.uniform(-5, 10, C * d).reshape(C, d),  # cluster means
            "sigma": np.array([np.eye(d) for _ in range(C)]),  # cluster covariances
        }

        self.elbo = None
        self.is_fit = False

    def likelihood_lower_bound(self, X):
        """Compute the LLB under the current GMM parameters."""
        N = X.shape[0]
        P = self.parameters
        C = self.hyperparameters["C"]
        pi, Q, mu, sigma = P["pi"], P["Q"], P["mu"], P["sigma"]

        eps = np.finfo(float).eps
        expec1, expec2 = 0.0, 0.0
        for i in range(N):
            x_i = X[i]

            for c in range(C):
                z_nk = Q[i, c]
                log_pi_k = np.log(pi[c] + eps)
                log_p_x_i = log_gaussian_pdf(x_i, mu[c, :], sigma[c, :, :])

                # expected complete-data log likelihood under q ...
                expec1 += z_nk * (log_p_x_i + log_pi_k)
                # ... and the (negative) entropy of q
                expec2 += z_nk * np.log(z_nk + eps)

        return expec1 - expec2

    def fit(self, X, max_iter=100, tol=1e-3, verbose=False):
        """
        Fit the parameters of the GMM on some training data.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, d)`
            A collection of `N` training data points, each with dimension `d`.
        max_iter : int
            The maximum number of EM updates to perform before terminating
            training. Default is 100.
        tol : float
            The convergence tolerance. Training is terminated if the difference
            in VLB between the current and previous iteration is less than
            or equal to `tol`. Default is 1e-3.
        verbose : bool
            Whether to print the VLB at each training iteration. Default is
            False.

        Returns
        -------
        success : {0, -1}
            Whether training terminated without incident (0) or one of the
            mixture components collapsed and training was halted prematurely
            (-1).
        """
        prev_vlb = -np.inf
        # defined up front so `self.elbo` can be set even when `max_iter` is 0
        vlb = prev_vlb
        self._initialize_params(X)

        for _iter in range(max_iter):
            try:
                self._E_step(X)
                self._M_step(X)
                vlb = self.likelihood_lower_bound(X)

                if verbose:
                    print(f"{_iter + 1}. Lower bound: {vlb}")

                converged = _iter > 0 and np.abs(vlb - prev_vlb) <= tol
                if np.isnan(vlb) or converged:
                    break

                prev_vlb = vlb

            except np.linalg.LinAlgError:
                print("Singular matrix: components collapsed")
                return -1

        self.elbo = vlb
        self.is_fit = True
        return 0

    def predict(self, X, soft_labels=True):
        """
        Return the log probability of each data point in `X` under each
        mixture components.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(M, d)`
            A collection of `M` data points, each with dimension `d`.
        soft_labels : bool
            If True, return the log probabilities of the M data points in X
            under each mixture component. If False, return only the ID of the
            most probable mixture. Default is True.

        Returns
        -------
        y : :py:class:`ndarray <numpy.ndarray>` of shape `(M, C)` or `(M,)`
            If `soft_labels` is True, `y` is a 2D array where index (i,j) gives
            the log probability of the `i` th data point under the `j` th
            mixture component. If `soft_labels` is False, `y` is a 1D array
            where the `i` th index contains the ID of the most probable mixture
            component.
        """
        assert self.is_fit, "Must call the `.fit` method before making predictions"

        P = self.parameters
        C = self.hyperparameters["C"]
        mu, sigma = P["mu"], P["sigma"]

        y = []
        for x_i in X:
            cprobs = [log_gaussian_pdf(x_i, mu[c, :], sigma[c, :, :]) for c in range(C)]

            if not soft_labels:
                y.append(np.argmax(cprobs))
            else:
                y.append(cprobs)

        return np.array(y)

    def _E_step(self, X):
        """Update the responsibilities `Q` given the current parameters."""
        P = self.parameters
        C = self.hyperparameters["C"]
        pi, Q, mu, sigma = P["pi"], P["Q"], P["mu"], P["sigma"]

        # same floor as `likelihood_lower_bound`; avoids log(0) when a
        # component's prior has been driven to exactly zero
        eps = np.finfo(float).eps

        for i, x_i in enumerate(X):
            # log N(X_i | mu_c, Sigma_c) + log pi_c
            denom_vals = [
                log_gaussian_pdf(x_i, mu[c, :], sigma[c, :, :]) + np.log(pi[c] + eps)
                for c in range(C)
            ]

            # log \sum_c exp{ log N(X_i | mu_c, Sigma_c) + log pi_c } ]
            log_denom = logsumexp(denom_vals)
            q_i = np.exp([num - log_denom for num in denom_vals])
            np.testing.assert_allclose(np.sum(q_i), 1, err_msg="{}".format(np.sum(q_i)))

            Q[i, :] = q_i

    def _M_step(self, X):
        """Update the priors, means and covariances given the current `Q`."""
        N, d = X.shape
        P = self.parameters
        C = self.hyperparameters["C"]
        Q, mu, sigma = P["Q"], P["mu"], P["sigma"]

        denoms = np.sum(Q, axis=0)

        # update cluster priors; written back to the parameter dict (a plain
        # local rebinding would leave the stored priors unchanged)
        pi = denoms / N
        P["pi"] = pi

        # update cluster means
        for c in range(C):
            num = np.dot(Q[:, c], X)
            mu[c, :] = num / denoms[c] if denoms[c] > 0 else np.zeros_like(num)

        # update cluster covariances
        for c in range(C):
            n_c = denoms[c]
            centered = X - mu[c, :]

            # sum_i Q[i, c] * outer(x_i - mu_c, x_i - mu_c)
            outer = (Q[:, c][:, None] * centered).T @ centered
            outer = outer / n_c if n_c > 0 else outer
            sigma[c, :, :] = outer

        np.testing.assert_allclose(np.sum(pi), 1, err_msg="{}".format(np.sum(pi)))
================================================
FILE: numpy_ml/hmm/README.md
================================================
# Hidden Markov model
The `hmm.py` module implements a standard (i.e., non-Bayesian) [Hidden Markov
model](https://en.wikipedia.org/wiki/Hidden_Markov_model) with
maximum-likelihood parameter estimation via the
[EM-algorithm](https://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) (specifically, [Baum-Welch](https://en.wikipedia.org/wiki/Baum%E2%80%93Welch_algorithm)).
## Plots
================================================
FILE: numpy_ml/hmm/__init__.py
================================================
from .hmm import *
================================================
FILE: numpy_ml/hmm/hmm.py
================================================
"""Hidden Markov model module"""
import numpy as np
from numpy_ml.utils.misc import logsumexp
class MultinomialHMM:
def __init__(self, A=None, B=None, pi=None, eps=None):
r"""
A simple hidden Markov model with multinomial emission distribution.
Parameters
----------
A : :py:class:`ndarray ` of shape `(N, N)` or None
The transition matrix between latent states in the HMM. Index `i`,
`j` gives the probability of transitioning from latent state `i` to
latent state `j`. Default is None.
B : :py:class:`ndarray ` of shape `(N, V)` or None
The emission matrix. Entry `i`, `j` gives the probability of latent
state i emitting an observation of type `j`. Default is None.
pi : :py:class:`ndarray ` of shape `(N,)` or None
The prior probability of each latent state. If None, use a uniform
prior over states. Default is None.
eps : float or None
Epsilon value to avoid :math:`\log(0)` errors. If None, defaults to
the machine epsilon. Default is None.
Attributes
----------
A : :py:class:`ndarray ` of shape `(N, N)`
The transition matrix between latent states in the HMM. Index `i`,
`j` gives the probability of transitioning from latent state `i` to
latent state `j`.
B : :py:class:`ndarray ` of shape `(N, V)`
The emission matrix. Entry `i`, `j` gives the probability of latent
state `i` emitting an observation of type `j`.
N : int
The number of unique latent states
V : int
The number of unique observation types
O : :py:class:`ndarray ` of shape `(I, T)`
The collection of observed training sequences.
I : int
The number of sequences in `O`.
T : int
The number of observations in each sequence in `O`.
"""
eps = np.finfo(float).eps if eps is None else eps
# prior probability of each latent state
if pi is not None:
pi[pi == 0] = eps
# number of latent state types
N = None
if A is not None:
N = A.shape[0]
A[A == 0] = eps
# number of observation types
V = None
if B is not None:
V = B.shape[1]
B[B == 0] = eps
self.parameters = {
"A": A, # transition matrix
"B": B, # emission matrix
"pi": pi, # prior probability of each latent state
}
self.hyperparameters = {
"eps": eps, # epsilon
}
self.derived_variables = {
"N": N, # number of latent state types
"V": V, # number of observation types
}
def generate(self, n_steps, latent_state_types, obs_types):
"""
Sample a sequence from the HMM.
Parameters
----------
n_steps : int
The length of the generated sequence
latent_state_types : :py:class:`ndarray ` of shape `(N,)`
A collection of labels for the latent states
obs_types : :py:class:`ndarray ` of shape `(V,)`
A collection of labels for the observations
Returns
-------
states : :py:class:`ndarray ` of shape `(n_steps,)`
The sampled latent states.
emissions : :py:class:`ndarray ` of shape `(n_steps,)`
The sampled emissions.
"""
P = self.parameters
A, B, pi = P["A"], P["B"], P["pi"]
# sample the initial latent state
s = np.random.multinomial(1, pi).argmax()
states = [latent_state_types[s]]
# generate an emission given latent state
v = np.random.multinomial(1, B[s, :]).argmax()
emissions = [obs_types[v]]
# sample a latent transition, rinse, and repeat
for i in range(n_steps - 1):
s = np.random.multinomial(1, A[s, :]).argmax()
states.append(latent_state_types[s])
v = np.random.multinomial(1, B[s, :]).argmax()
emissions.append(obs_types[v])
return np.array(states), np.array(emissions)
def log_likelihood(self, O):
    r"""
    Given the HMM parameterized by :math:`(A, B, \pi)` and an observation
    sequence `O`, compute the marginal log likelihood of `O`,
    :math:`\log P(O \mid A,B,\pi)`, by marginalizing over latent states.

    Notes
    -----
    The computation uses the forward algorithm, which fills in a 2D trellis,
    ``forward`` (sometimes referred to as `alpha` in the literature), where
    entry `i`, `j` corresponds to being in latent state `i` after seeing the
    first `j` observations:

    .. math::

        \mathtt{forward[i,j]} = P(o_1, \ldots, o_j, q_j=i \mid A, B, \pi)

    Here :math:`q_j = i` indicates that the hidden state at time `j` is of
    type `i`.

    The DP step is:

    .. math::

        \mathtt{forward[i,j]}
            &= \sum_{s'=1}^N \mathtt{forward[s',j-1]} \cdot
               \mathtt{A[s',i]} \cdot \mathtt{B[i,o_j]} \\
            &= \sum_{s'=1}^N P(o_1, \ldots, o_{j-1}, q_{j-1}=s' \mid A, B, \pi)
               P(q_j=i \mid q_{j-1}=s') P(o_j \mid q_j=i)

    The trellis returned by ``self._forward`` is combined over states in its
    final column with `logsumexp` to give the value returned here.

    Parameters
    ----------
    O : :py:class:`ndarray <numpy.ndarray>` of shape `(1, T)` or `(T,)`
        A single set of observations.

    Returns
    -------
    log_likelihood : float
        The log likelihood of the observations `O` under the HMM.

    Raises
    ------
    ValueError
        If `O` contains more than one sequence.
    """
    if O.ndim == 1:
        O = O.reshape(1, -1)  # noqa: E741

    n_sequences, n_obs = O.shape
    if n_sequences != 1:
        raise ValueError("Likelihood only accepts a single sequence")

    trellis = self._forward(O[0])
    # marginalize over the latent state at the final timestep
    return logsumexp(trellis[:, n_obs - 1])
def decode(self, O):
r"""
Given the HMM parameterized by :math:`(A, B, \pi)` and an observation
sequence :math:`O = o_1, \ldots, o_T`, compute the most probable
sequence of latent states, :math:`Q = q_1, \ldots, q_T`.
Notes
-----
HMM decoding is done efficiently via DP using the Viterbi algorithm,
which produces a 2D trellis, ``viterbi``, where entry `i`, `j` represents the
probability under the HMM of being in state `i` at time `j` after having
passed through the *most probable* state sequence :math:`q_1,\ldots,q_{j-1}`:
.. math::
\mathtt{viterbi[i,j]} =
\max_{q_1, \ldots, q_{j-1}}
P(o_1, \ldots, o_j, q_1, \ldots, q_{j-1}, q_j=i \mid A, B, \pi)
Here :math:`q_j = i` indicates that the hidden state at time `j` is of
type `i`, and :math:`\max_{q_1,\ldots,q_{j-1}}` represents the maximum over
all possible latent state sequences for the first `j-1` observations.
The DP step is:
.. math::
\mathtt{viterbi[i,j]} &=
\max_{s'=1}^N \mathtt{viterbi[s',j-1]} \cdot
\mathtt{A[s',i]} \cdot \mathtt{B[i,o_j]} \\
&= \max_{s'=1}^N
P(o_1,\ldots, o_j, q_1, \ldots, q_{j-1}, q_j=i \mid A, B, \pi)
P(q_j=i \mid q_{j-1}=s') P(o_j \mid q_j=i)
In words, ``viterbi[i,j]`` is the weighted sum of the values computed
on the previous timestep. The weight on each value is the product of
the probability of transitioning from that state to state `i` and the
probability of emitting observation `j` in state `i`.
To compute the most probable state sequence we maintain a second
trellis, ``back_pointer``, whose `i`, `j` entry contains the value of the
latent state at timestep `j-1` that is most likely to lead to latent
state `i` at timestep `j`.
When we have completed the ``viterbi`` and ``back_pointer`` trellises for
all `T` timseteps/observations, we greedily move backwards through the
``back_pointer`` trellis to construct the best path for the full
sequence of observations.
Parameters
----------
O : :py:class:`ndarray ` of shape `(T,)`
An observation sequence of length `T`.
Returns
-------
best_path : list of length `T`
The most probable sequence of latent states for observations `O`.
best_path_prob : float
The probability of the latent state sequence in `best_path` under
the HMM.
"""
P = self.parameters
N = self.derived_variables["N"]
eps = self.hyperparameters["eps"]
A, B, pi = P["A"], P["B"], P["pi"]
if O.ndim == 1:
O = O.reshape(1, -1) # noqa: E741
# number of observations in each sequence
T = O.shape[1]
# number of training sequences
I = O.shape[0] # noqa: E741
if I != 1: # noqa: E741
raise ValueError("Can only decode a single sequence (O.shape[0] must be 1)")
# initialize the viterbi and back_pointer matrices
viterbi = np.zeros((N, T))
back_pointer = np.zeros((N, T)).astype(int)
ot = O[0, 0]
for s in range(N):
back_pointer[s, 0] = 0
viterbi[s, 0] = np.log(pi[s] + eps) + np.log(B[s, ot] + eps)
for t in range(1, T):
ot = O[0, t]
for s in range(N):
seq_probs = [
viterbi[s_, t - 1] + np.log(A[s_, s] + eps) + np.log(B[s, ot] + eps)
for s_ in range(N)
]
viterbi[s, t] = np.max(seq_probs)
back_pointer[s, t] = np.argmax(seq_probs)
best_path_log_prob = viterbi[:, T - 1].max()
# backtrack through the trellis to get the most likely sequence of
# latent states
pointer = viterbi[:, T - 1].argmax()
best_path = [pointer]
for t in reversed(range(1, T)):
pointer = back_pointer[pointer, t]
best_path.append(pointer)
best_path = best_path[::-1]
return best_path, best_path_log_prob
def _forward(self, Obs):
    r"""
    Computes the forward probability trellis for an HMM parameterized by
    :math:`(A, B, \pi)`.

    Notes
    -----
    The forward trellis (sometimes referred to as `alpha` in the HMM
    literature), is a 2D array where entry `i`, `j` corresponds to being in
    latent state `i` after seeing the first `j` observations:

    .. math::

        \mathtt{forward[i,j]} =
            P(o_1, \ldots, o_j, q_j=i \mid A, B, \pi)

    Here :math:`q_j = i` indicates that the hidden state at time `j` is of
    type `i`.

    The DP step is:

    .. math::

        \mathtt{forward[i,j]} &=
            \sum_{s'=1}^N \mathtt{forward[s',j-1]} \times
                \mathtt{A[s',i]} \times \mathtt{B[i,o_j]} \\
        &= \sum_{s'=1}^N P(o_1, \ldots, o_{j-1}, q_{j-1}=s' \mid A, B, \pi)
            \times P(q_j=i \mid q_{j-1}=s') \times P(o_j \mid q_j=i)

    The trellis is stored in log space: products in the recursion above
    become sums of logs and the sum over :math:`s'` is taken with
    `logsumexp`.

    Parameters
    ----------
    Obs : :py:class:`ndarray <numpy.ndarray>` of shape `(T,)`
        An observation sequence of length `T`.

    Returns
    -------
    forward : :py:class:`ndarray <numpy.ndarray>` of shape `(N, T)`
        The forward trellis (log-space values).
    """
    params = self.parameters
    n_states = self.derived_variables["N"]
    eps = self.hyperparameters["eps"]
    transition, emission, prior = params["A"], params["B"], params["pi"]

    n_obs = Obs.shape[0]
    forward = np.zeros((n_states, n_obs))

    # first column: prior over states combined with the first emission
    first_obs = Obs[0]
    for state in range(n_states):
        forward[state, 0] = np.log(prior[state] + eps) + np.log(
            emission[state, first_obs] + eps
        )

    for t in range(1, n_obs):
        obs = Obs[t]
        for state in range(n_states):
            # one log-space term per possible predecessor state
            terms = [
                forward[prev, t - 1]
                + np.log(transition[prev, state] + eps)
                + np.log(emission[state, obs] + eps)
                for prev in range(n_states)
            ]
            forward[state, t] = logsumexp(terms)
    return forward
def _backward(self, Obs):
    r"""
    Compute the backward probability trellis, in log space, for an HMM
    parameterized by :math:`(A, B, \pi)`.

    Notes
    -----
    The backward trellis (often called `beta` in the HMM literature) is a
    2D array. Entry `i`, `j` holds the **log** of the probability of the
    observations from time `j+1` onward given that the HMM is in state `i`
    at time `j`:

    .. math::

        \mathtt{backward[i,j]} =
            \log P(o_{j+1}, o_{j+2}, \ldots, o_T \mid q_j=i, A, B, \pi)

    Here :math:`q_j = i` indicates that the hidden state at time `j` is of
    type `i`.

    In probability space the DP step is:

    .. math::

        P(o_{j+1}, \ldots, o_T \mid q_j=i, A, B, \pi)
            = \sum_{s'=1}^N
                A[i, s'] \times B[s', o_{j+1}] \times
                P(o_{j+2}, \ldots, o_T \mid q_{j+1}=s', A, B, \pi)

    In words, each entry is a weighted sum of the following column: the
    weight on successor state `s'` is the probability of moving from `i` to
    `s'` times the probability of emitting observation `j+1` from `s'`.
    The code evaluates this sum with ``logsumexp`` over log-terms, adding
    the ``eps`` hyperparameter inside every logarithm.

    Parameters
    ----------
    Obs : :py:class:`ndarray <numpy.ndarray>` of shape `(T,)`
        A single observation sequence of length `T`.

    Returns
    -------
    backward : :py:class:`ndarray <numpy.ndarray>` of shape `(N, T)`
        The backward trellis (log probabilities).
    """
    params = self.parameters
    trans, emit = params["A"], params["B"]
    n_states = self.derived_variables["N"]
    eps = self.hyperparameters["eps"]
    n_steps = Obs.shape[0]

    backward = np.zeros((n_states, n_steps))

    # last column is log(1) = 0: nothing remains to be observed
    backward[:, n_steps - 1] = 0

    # fill the remaining columns from right to left
    for step in range(n_steps - 2, -1, -1):
        next_symbol = Obs[step + 1]
        for state in range(n_states):
            log_terms = []
            for succ in range(n_states):
                log_terms.append(
                    np.log(trans[state, succ] + eps)
                    + np.log(emit[succ, next_symbol] + eps)
                    + backward[succ, step + 1]
                )
            backward[state, step] = logsumexp(log_terms)
    return backward
def _initialize_parameters(self):
    """
    Fill in any of the parameters `A`, `B`, `pi` that are still None.

    Parameters that already hold a value are left untouched. Missing ones
    are initialized as follows: `pi` and every row of `A` become uniform
    distributions over the `N` latent states, and `B` is drawn with
    ``np.random.rand`` and normalized so each row sums to one. The results
    are written back into ``self.parameters``.
    """
    params = self.parameters
    n_states = self.derived_variables["N"]
    n_symbols = self.derived_variables["V"]

    # uniform prior over latent states
    prior = params["pi"]
    if prior is None:
        prior = np.full(n_states, 1.0 / n_states)

    # uniform transition rows
    trans = params["A"]
    if trans is None:
        trans = np.full((n_states, n_states), 1.0 / n_states)

    # random emission rows, each normalized to a distribution
    emit = params["B"]
    if emit is None:
        emit = np.random.rand(n_states, n_symbols)
        emit = emit / emit.sum(axis=1, keepdims=True)

    params["A"] = trans
    params["B"] = emit
    params["pi"] = prior
def fit(
    self,
    O,
    latent_state_types,
    observation_types,
    pi=None,
    tol=1e-5,
    verbose=False,
):
    """
    Given an observation sequence `O` and the set of possible latent states,
    learn the MLE HMM parameters `A`, `B` and `pi`.

    Notes
    -----
    Model fitting is done iteratively using the Baum-Welch/Forward-Backward
    algorithm, a special case of the EM algorithm.

    We begin with an initial estimate for the transition (`A`) and emission
    (`B`) matrices and then use these to derive better and better estimates
    by computing the forward probability for an observation and then
    dividing that probability mass among all the paths that contributed to
    it. Iteration stops as soon as the log likelihood improves by no more
    than `tol` between two consecutive epochs.

    Parameters
    ----------
    O : :py:class:`ndarray <numpy.ndarray>` of shape `(I, T)`
        The set of `I` training observations, each of length `T`. A 1D
        array is treated as a single observation sequence.
    latent_state_types : list of length `N`
        The collection of valid latent states.
    observation_types : list of length `V`
        The collection of valid observation states.
    pi : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        The initial estimate of the prior probability of each latent
        state. If None, any prior already stored on the model is kept,
        and if there is none each latent state is assumed equally likely a
        priori. Default is None.
    tol : float
        The tolerance value. If the difference in log likelihood between
        two epochs is less than this value, terminate training. Default is
        1e-5.
    verbose : bool
        Print training stats after each epoch. Default is False.

    Returns
    -------
    A : :py:class:`ndarray <numpy.ndarray>` of shape `(N, N)`
        The estimated transition matrix.
    B : :py:class:`ndarray <numpy.ndarray>` of shape `(N, V)`
        The estimated emission matrix.
    pi : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        The estimated prior probabilities of each latent state.

    Raises
    ------
    ValueError
        If `O` is not 1D or 2D, or if `pi` does not have shape `(N,)`.
    """
    # a single sequence is treated as a batch of one
    if O.ndim == 1:
        O = O.reshape(1, -1)  # noqa: E741

    if O.ndim != 2:
        raise ValueError("O must have shape (I, T); got {}".format(O.shape))

    n_states = len(latent_state_types)

    # number of types of observation
    self.derived_variables["V"] = len(observation_types)

    # number of latent state types
    self.derived_variables["N"] = n_states

    # use the caller-supplied prior as the starting point; copy it so the
    # caller's array is never aliased by the model
    if pi is not None:
        prior = np.array(pi, dtype=float)
        if prior.shape != (n_states,):
            raise ValueError(
                "pi must have shape ({},); got {}".format(n_states, prior.shape)
            )
        self.parameters["pi"] = prior

    self._initialize_parameters()

    P = self.parameters

    # iterate E and M steps until convergence criteria is met
    step, delta = 0, np.inf
    ll_prev = np.sum([self.log_likelihood(o) for o in O])

    while delta > tol:
        gamma, xi, phi = self._E_step(O)
        P["A"], P["B"], P["pi"] = self._M_step(O, gamma, xi, phi)
        ll = np.sum([self.log_likelihood(o) for o in O])
        delta = ll - ll_prev
        ll_prev = ll
        step += 1

        if verbose:
            fstr = "[Epoch {}] LL: {:.3f} Delta: {:.5f}"
            print(fstr.format(step, ll_prev, delta))

    return P["A"], P["B"], P["pi"]
def _E_step(self, O):
    r"""
    Run a single E-step update for the Baum-Welch/Forward-Backward
    algorithm. This step estimates ``xi`` and ``gamma``, the expected
    state-state transition counts and the expected state-occupancy counts,
    respectively. All returned arrays hold **log** probabilities.

    Notes
    -----
    In probability space, ``xi[i,j,k]`` is the probability of being in
    state `i` at time `k` and state `j` at time `k+1` given the observed
    sequence `O` and the current estimates for the transition (`A`) and
    emission (`B`) matrices:

    .. math::

        xi[i,j,k] &= P(q_k=i, q_{k+1}=j \mid O, A, B, \pi) \\
                  &= \frac{
                        P(q_k=i, q_{k+1}=j, O \mid A, B, \pi)
                     }{P(O \mid A, B, \pi)} \\
                  &= \frac{
                        P(o_1, \ldots, o_k, q_k=i \mid A, B, \pi) \times
                        P(q_{k+1}=j \mid q_k=i) \times
                        P(o_{k+1} \mid q_{k+1}=j) \times
                        P(o_{k+2}, \ldots, o_T \mid q_{k+1}=j, A, B, \pi)
                     }{P(O \mid A, B, \pi)} \\
                  &= \frac{
                        \mathtt{fwd[i, k] * A[i, j] *
                                B[j, o_{k+1}] * bwd[j, k + 1]}
                     }{\mathtt{fwd[:, T].sum()}}

    The code stores the logarithm of this quantity, computed as a sum of
    log-terms minus the sequence log likelihood. Only the first `T - 1`
    time slices of ``xi`` are written: there is no transition out of the
    final step, so ``xi[..., T - 1]`` keeps its initial value of 0 and is
    a placeholder rather than a valid log probability.

    ``gamma[i,j]`` is, in probability space, the probability of being in
    state `i` at time `j`:

    .. math::

        \mathtt{gamma[i,j]} = P(q_j = i \mid O, A, B, \pi)

    and ``phi`` is the first time slice of ``gamma``.

    Parameters
    ----------
    O : :py:class:`ndarray <numpy.ndarray>` of shape `(I, T)`
        The set of `I` training observations, each of length `T`.

    Returns
    -------
    gamma : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N, T)`
        The estimated (log) state-occupancy count matrix.
    xi : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N, N, T)`
        The estimated (log) state-state transition count matrix.
    phi : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N)`
        The estimated (log) prior counts for each latent state.
    """
    n_seqs, n_steps = O.shape
    params = self.parameters
    n_states = self.derived_variables["N"]
    eps = self.hyperparameters["eps"]

    # the smoothed log parameters do not change within one E-step
    log_trans = np.log(params["A"] + eps)
    log_emit = np.log(params["B"] + eps)

    phi = np.zeros((n_seqs, n_states))
    gamma = np.zeros((n_seqs, n_states, n_steps))
    xi = np.zeros((n_seqs, n_states, n_states, n_steps))

    for idx in range(n_seqs):
        seq = O[idx, :]
        fwd = self._forward(seq)
        bwd = self._backward(seq)
        seq_ll = logsumexp(fwd[:, n_steps - 1])

        # state occupancies for every time step at once
        gamma[idx] = fwd + bwd - seq_ll
        phi[idx] = gamma[idx, :, 0]

        # pairwise transitions between consecutive steps
        for step in range(n_steps - 1):
            next_symbol = seq[step + 1]
            xi[idx, :, :, step] = (
                fwd[:, step][:, None]
                + log_trans
                + log_emit[:, next_symbol][None, :]
                + bwd[:, step + 1][None, :]
                - seq_ll
            )

    return gamma, xi, phi
def _M_step(self, O, gamma, xi, phi):
    """
    Run a single M-step update for the Baum-Welch/Forward-Backward
    algorithm.

    Notes
    -----
    `gamma`, `xi` and `phi` are expected in log space, as produced by
    ``_E_step``. All accumulation is done with ``logsumexp`` and the
    results are exponentiated just before being returned.

    Only the first `T - 1` time slices of `xi` describe real transitions.
    ``_E_step`` never writes ``xi[..., T - 1]``, so that slice stays at 0,
    which is probability 1 in log space. Including it would add one
    spurious transition to every state pair and pull `A` toward uniform,
    so it is excluded here. When `T == 1` there are no transitions at all
    and the single placeholder slice is used, which yields uniform rows.

    Parameters
    ----------
    O : :py:class:`ndarray <numpy.ndarray>` of shape `(I, T)`
        The set of `I` training observations, each of length `T`.
    gamma : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N, T)`
        The estimated (log) state-occupancy count matrix.
    xi : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N, N, T)`
        The estimated (log) state-state transition count matrix.
    phi : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N)`
        The estimated (log) starting count matrix for each latent state.

    Returns
    -------
    A : :py:class:`ndarray <numpy.ndarray>` of shape `(N, N)`
        The estimated transition matrix.
    B : :py:class:`ndarray <numpy.ndarray>` of shape `(N, V)`
        The estimated emission matrix.
    pi : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        The estimated prior probabilities for each latent state.

    Raises
    ------
    AssertionError
        If the re-estimated `pi`, or any row of `A` or `B`, does not sum
        to one (to 7 decimal places).
    """
    n_seqs, n_steps = O.shape
    derived = self.derived_variables
    eps = self.hyperparameters["eps"]
    n_states, n_symbols = derived["N"], derived["V"]

    # number of leading xi slices that hold real transitions (see Notes)
    n_trans = n_steps - 1 if n_steps > 1 else n_steps

    # log-space estimates of the transition (A) and emission (B) matrices
    log_A = np.zeros((n_states, n_states))
    log_B = np.zeros((n_states, n_symbols))

    count_gamma = np.zeros((n_seqs, n_states, n_symbols))
    count_xi = np.zeros((n_seqs, n_states, n_states))

    for i in range(n_seqs):
        seq = O[i, :]
        # which time steps emitted each symbol; shared by all states
        masks = [seq == vk for vk in range(n_symbols)]

        for si in range(n_states):
            for vk, mask in enumerate(masks):
                if mask.any():
                    count_gamma[i, si, vk] = logsumexp(gamma[i, si, mask])
                else:
                    # symbol never observed in this sequence: floor the
                    # count at log(eps) instead of log(0)
                    count_gamma[i, si, vk] = np.log(eps)

            for sj in range(n_states):
                count_xi[i, si, sj] = logsumexp(xi[i, si, sj, :n_trans])

    log_pi = logsumexp(phi, axis=0) - np.log(n_seqs + eps)
    np.testing.assert_almost_equal(np.exp(log_pi).sum(), 1)

    for si in range(n_states):
        # per-state normalizers are the same for every column
        emit_total = logsumexp(count_gamma[:, si, :])
        trans_total = logsumexp(count_xi[:, si, :])

        for vk in range(n_symbols):
            log_B[si, vk] = logsumexp(count_gamma[:, si, vk]) - emit_total

        for sj in range(n_states):
            log_A[si, sj] = logsumexp(count_xi[:, si, sj]) - trans_total

        np.testing.assert_almost_equal(np.exp(log_A[si, :]).sum(), 1)
        np.testing.assert_almost_equal(np.exp(log_B[si, :]).sum(), 1)

    return np.exp(log_A), np.exp(log_B), np.exp(log_pi)
================================================
FILE: numpy_ml/lda/README.md
================================================
# Latent Dirichlet allocation
The `lda.py` and `lda_smoothed.py` modules implement:
1. [Standard (i.e., non-Bayesian) latent Dirichlet
allocation](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation) with MLE parameter
estimates via variational EM (Blei, Ng, & Jordan, 2003).
2. [Fully-Bayesian (i.e., smoothed) latent Dirichlet allocation](https://people.cs.umass.edu/~wallach/courses/s11/cmpsci791ss/readings/griffiths02gibbs.pdf) with MAP parameter
estimates via collapsed Gibbs sampling (Griffiths & Steyvers, 2004).
## Plots
### Unsmoothed
### Smoothed
TODO
================================================
FILE: numpy_ml/lda/__init__.py
================================================
from .lda import *
from .lda_smoothed import *
================================================
FILE: numpy_ml/lda/lda.py
================================================
import numpy as np
from scipy.special import digamma, polygamma, gammaln
class LDA(object):
    def __init__(self, T=10):
        """
        Vanilla (non-smoothed) LDA model trained using variational EM.
        Generates maximum-likelihood estimates for model parameters
        `alpha` and `beta`.

        Parameters
        ----------
        T : int
            Number of topics

        Attributes
        ----------
        D : int
            Number of documents
        N : :py:class:`ndarray <numpy.ndarray>` of shape `(D,)`
            Number of words in each document
        V : int
            Number of unique word tokens across all documents
        phi : sequence of `D` arrays, the `d`-th of shape `(N[d], T)`
            Variational approximation to word-topic distribution. Stored as
            a single `(D, N, T)` array when all documents have the same
            length and as an object array of per-document matrices
            otherwise.
        gamma : :py:class:`ndarray <numpy.ndarray>` of shape `(D, T)`
            Variational approximation to document-topic distribution
        alpha : :py:class:`ndarray <numpy.ndarray>` of shape `(T,)`
            Parameter for the Dirichlet prior on the document-topic distribution
        beta : :py:class:`ndarray <numpy.ndarray>` of shape `(V, T)`
            Word-topic distribution
        """
        self.T = T

    def _elog_theta(self, d):
        """
        Return E[log theta_d] under the variational Dirichlet with
        parameters ``gamma[d]``, as a vector of length `T`.
        """
        gamma_d = self.gamma[d]
        return digamma(gamma_d) - digamma(np.sum(gamma_d))

    def _maximize_phi(self):
        """
        Optimize variational parameter phi
        ϕ_{t, n} ∝ β_{t, w_n} e^( Ψ(γ_t) )

        ``self.phi`` is updated in place and also returned.
        """
        phi = self.phi
        beta = self.beta
        corpus = self.corpus

        for d in range(self.D):
            # identical for every word of document d
            exp_elog_theta = np.exp(self._elog_theta(d))
            for n in range(self.N[d]):
                w_n = int(corpus[d][n])
                weights = beta[w_n, :] * exp_elog_theta
                # Normalize over topics
                phi[d][n, :] = weights / np.sum(weights)
        return phi

    def _maximize_gamma(self):
        """
        Optimize variational parameter gamma
        γ_t = α_t + \\sum_{n=1}^{N_d} ϕ_{t, n}
        """
        topic_mass = np.array([np.sum(doc_phi, axis=0) for doc_phi in self.phi])
        return np.tile(self.alpha, (self.D, 1)) + topic_mass

    def _maximize_beta(self):
        """
        Optimize model parameter beta
        β_{t, n} ∝ \\sum_{d=1}^D \\sum_{i=1}^{N_d} ϕ_{d, t, n} [ i = n]

        ``self.beta`` is updated in place and also returned.
        """
        beta = self.beta
        counts = np.zeros((self.V, self.T))

        # Accumulate each word occurrence's topic responsibilities into the
        # row of its token id. Working one document at a time keeps this
        # valid for documents of different lengths and for documents given
        # as plain lists.
        for doc, doc_phi in zip(self.corpus, self.phi):
            np.add.at(counts, np.asarray(doc, dtype=int), doc_phi)

        # Normalize over words
        beta[:, :] = counts / np.sum(counts, axis=0)
        return beta

    def _maximize_alpha(self, max_iters=1000, tol=0.1):
        """
        Optimize alpha using Blei's O(n) Newton-Raphson modification
        for a Hessian with special structure

        Parameters
        ----------
        max_iters : int
            Maximum number of Newton iterations. Default is 1000.
        tol : float
            Stop once the root-mean-square change in `alpha` between two
            iterations falls below this value. Default is 0.1.
        """
        D = self.D
        alpha = self.alpha
        gamma = self.gamma

        # The gamma-dependent part of the gradient is constant across
        # Newton iterations
        gamma_term = np.sum(
            digamma(gamma) - digamma(np.sum(gamma, axis=1, keepdims=True)), axis=0
        )

        for _ in range(max_iters):
            alpha_old = alpha

            # Calculate gradient
            g = D * (digamma(np.sum(alpha)) - digamma(alpha)) + gamma_term

            # Calculate Hessian diagonal component
            h = -D * polygamma(1, alpha)

            # Calculate Hessian constant component
            z = D * polygamma(1, np.sum(alpha))

            # Calculate constant
            c = np.sum(g / h) / (z ** (-1.0) + np.sum(h ** (-1.0)))

            # Update alpha
            alpha = alpha - (g - c) / h

            # Check convergence
            if np.sqrt(np.mean(np.square(alpha - alpha_old))) < tol:
                break

        return alpha

    def _E_step(self):
        """
        Maximize the VLB with respect to the variational parameters, γ and ϕ
        """
        self.phi = self._maximize_phi()
        self.gamma = self._maximize_gamma()

    def _M_step(self):
        """
        Maximize the VLB with respect to the model parameters, α and β
        """
        self.beta = self._maximize_beta()
        self.alpha = self._maximize_alpha()

    def VLB(self):
        """
        Return the variational lower bound associated with the current model
        parameters.
        """
        phi = self.phi
        alpha = self.alpha
        beta = self.beta
        gamma = self.gamma
        corpus = self.corpus

        a, b, c, _d = 0, 0, 0, 0
        for d in range(self.D):
            elog_theta = self._elog_theta(d)
            words = np.asarray(corpus[d], dtype=int)

            a += (
                gammaln(np.sum(alpha))
                - np.sum(gammaln(alpha))
                + np.sum((alpha - 1) * elog_theta)
            )

            _d += (
                gammaln(np.sum(gamma[d, :]))
                - np.sum(gammaln(gamma[d, :]))
                + np.sum((gamma[d, :] - 1) * elog_theta)
            )

            # the remaining terms sum over every word of the document
            b += np.sum(phi[d] * elog_theta)
            c += np.sum(phi[d] * np.log(beta[words, :]))
            _d += np.sum(phi[d] * np.log(phi[d]))

        return a + b + c - _d

    def initialize_parameters(self):
        """
        Provide reasonable initializations for model and variational parameters.
        """
        T = self.T
        V = self.V
        N = self.N
        D = self.D

        # initialize model parameters
        self.alpha = 100 * np.random.dirichlet(10 * np.ones(T), 1)[0]
        self.beta = np.random.dirichlet(np.ones(V), T).T

        # initialize variational parameters
        per_doc = [1 / T * np.ones([N[d], T]) for d in range(D)]
        if np.unique(N).size <= 1:
            # equal-length documents stack into one (D, N, T) array
            self.phi = np.array(per_doc)
        else:
            # documents of different lengths cannot be stacked; NumPy
            # refuses to build such an array implicitly, so fill an object
            # array by hand
            self.phi = np.empty(D, dtype=object)
            for d, doc_phi in enumerate(per_doc):
                self.phi[d] = doc_phi

        self.gamma = np.tile(self.alpha, (D, 1)) + np.tile(N / T, (T, 1)).T

    def train(self, corpus, verbose=False, max_iter=1000, tol=5):
        """
        Train the LDA model on a corpus of documents (bags of words).

        Parameters
        ----------
        corpus : list of length `D`
            A list of lists, with each sublist containing the tokenized text of
            a single document. Tokens are used directly as row indices into
            `beta`, so they must be integer ids in ``[0, V)``.
        verbose : bool
            Whether to print the VLB at each training iteration. Default is
            False.
        max_iter : int
            The maximum number of training iterations to perform before
            breaking. Default is 1000.
        tol : int
            Break the training loop if the difference between the VLB on the
            current iteration and the previous iteration is less than `tol`.
            Default is 5.
        """
        self.D = len(corpus)
        self.V = len(set(np.concatenate(corpus)))
        self.N = np.array([len(d) for d in corpus])
        self.corpus = corpus

        self.initialize_parameters()
        vlb = -np.inf

        for i in range(max_iter):
            old_vlb = vlb

            self._E_step()
            self._M_step()

            vlb = self.VLB()
            delta = vlb - old_vlb

            if verbose:
                print("Iteration {}: {:.3f} (delta: {:.2f})".format(i + 1, vlb, delta))

            if delta < tol:
                break
#######################################################################
# Utils #
#######################################################################
def dg(gamma, d, t):
    """
    E[log X_t] where X ~ Dir(gamma[d, :])

    Computed as the digamma of entry ``gamma[d, t]`` minus the digamma of
    the sum of row `d` of `gamma`.
    """
    row_total = np.sum(gamma[d, :])
    return digamma(gamma[d, t]) - digamma(row_total)
================================================
FILE: numpy_ml/lda/lda_smoothed.py
================================================
import numpy as np
class SmoothedLDA(object):
    def __init__(self, T, **kwargs):
        """
        A smoothed LDA model trained using collapsed Gibbs sampling. Generates
        posterior mean estimates for model parameters `phi` and `theta`.

        Parameters
        ----------
        T : int
            Number of topics
        alpha : float, optional
            Keyword-only. Symmetric value for the Dirichlet prior on the
            document-topic distribution. Default is ``50 / T``.
        beta : float, optional
            Keyword-only. Symmetric value for the Dirichlet prior on the
            topic-word distribution. Default is 0.01.

        Attributes
        ----------
        D : int
            Number of documents
        N : int
            Total number of words across all documents
        V : int
            Number of unique word tokens across all documents
        phi : :py:class:`ndarray <numpy.ndarray>` of shape `(V, T)`
            The word-topic distribution
        theta : :py:class:`ndarray <numpy.ndarray>` of shape `(D, T)`
            The document-topic distribution
        alpha : :py:class:`ndarray <numpy.ndarray>` of shape `(T,)`
            Parameter for the Dirichlet prior on the document-topic distribution
        beta : float before training, :py:class:`ndarray <numpy.ndarray>` of shape `(V,)` afterwards
            Parameter for the Dirichlet prior on the topic-word distribution
        """
        self.T = T

        self.alpha = (50.0 / self.T) * np.ones(self.T)
        if "alpha" in kwargs:
            self.alpha = (kwargs["alpha"]) * np.ones(self.T)

        self.beta = 0.01
        if "beta" in kwargs:
            self.beta = kwargs["beta"]

    def _init_params(self, texts, tokens):
        """
        Record corpus statistics (`D`, `V`, `N`), the document index of
        every word position, and expand `beta` to one entry per token.
        """
        self.tokens = tokens
        self.D = len(texts)
        self.V = len(np.unique(self.tokens))

        doc_lengths = np.asarray([len(doc) for doc in texts], dtype=int)
        self.N = int(np.sum(doc_lengths))

        # word_document[k] is the index of the document holding the k-th
        # word of the flattened corpus; integer dtype so it can be used to
        # index the count matrices
        self.word_document = np.repeat(np.arange(self.D, dtype=int), doc_lengths)

        # now that we know the number of tokens in our corpus, we can set
        # beta. Start from the scalar value so that training again on a
        # corpus with a different vocabulary size still works.
        self.beta = np.ravel(self.beta)[0] * np.ones(self.V)

    def train(self, texts, tokens, n_gibbs=2000):
        """
        Trains a topic model on the documents in texts.

        Parameters
        ----------
        texts : array of length `(D,)`
            The training corpus represented as an array of subarrays, where
            each subarray corresponds to the tokenized words of a single
            document. Words are used directly as row indices into the
            word-topic count matrix, so they must be integer token ids.
        tokens : array of length `(V,)`
            The set of unique tokens in the documents in `texts`.
        n_gibbs : int
            The number of steps to run the collapsed Gibbs sampler during
            training. Default is 2000.

        Returns
        -------
        C_wt : :py:class:`ndarray <numpy.ndarray>` of shape (V, T)
            The word-topic count matrix
        C_dt : :py:class:`ndarray <numpy.ndarray>` of shape (D, T)
            The document-topic count matrix
        assignments : :py:class:`ndarray <numpy.ndarray>` of shape (N, n_gibbs + 1)
            The topic assignments for each word in the corpus: column 0
            holds the random initialization and column `g` the assignment
            after Gibbs step `g`.
        """
        self._init_params(texts, tokens)
        C_wt, C_dt, assignments = self._gibbs_sampler(n_gibbs, texts)
        self.fit_params(C_wt, C_dt)
        return C_wt, C_dt, assignments

    def what_did_you_learn(self, top_n=10):
        """
        Print the `top_n` most probable words under each topic
        """
        tokens = np.asarray(self.tokens)
        for tt in range(self.T):
            top_idx = np.argsort(self.phi[:, tt])[::-1][:top_n]
            top_tokens = tokens[top_idx]
            print("\nTop Words for Topic %s:\n" % (str(tt)))
            for token in top_tokens:
                print("\t%s\n" % (str(token)))

    def fit_params(self, C_wt, C_dt):
        """
        Estimate `phi`, the word-topic distribution, and `theta`, the
        topic-document distribution.

        Parameters
        ----------
        C_wt : :py:class:`ndarray <numpy.ndarray>` of shape (V, T)
            The word-topic count matrix
        C_dt : :py:class:`ndarray <numpy.ndarray>` of shape (D, T)
            The document-topic count matrix

        Returns
        -------
        phi : :py:class:`ndarray <numpy.ndarray>` of shape `(V, T)`
            The word-topic distribution
        theta : :py:class:`ndarray <numpy.ndarray>` of shape `(D, T)`
            The document-topic distribution
        """
        b, a = self.beta[0], self.alpha[0]
        C_wt = np.asarray(C_wt, dtype=float)
        C_dt = np.asarray(C_dt, dtype=float)

        # smoothed counts normalized per topic (phi) and per document (theta)
        self.phi = (C_wt + b) / (np.sum(C_wt, axis=0) + self.V * b)
        self.theta = (C_dt + a) / (
            np.sum(C_dt, axis=1, keepdims=True) + self.T * a
        )
        return self.phi, self.theta

    def _estimate_topic_prob(self, ii, d, C_wt, C_dt):
        """
        Compute an approximation of the conditional probability that token ii
        is assigned to topic jj given all previous topic assignments and the
        current document d: p(t_i = j | t_{-i}, w_i, d_i)
        """
        b, a = self.beta[0], self.alpha[0]

        # prob of word ii under each topic
        word_given_topic = (C_wt[ii, :] + b) / (np.sum(C_wt, axis=0) + self.V * b)

        # prob of each topic under document d
        topic_given_doc = (C_dt[d, :] + a) / (np.sum(C_dt[d, :]) + self.T * a)

        p_vec = word_given_topic * topic_given_doc
        return p_vec / np.sum(p_vec)

    def _gibbs_sampler(self, n_gibbs, texts):
        """
        Collapsed Gibbs sampler for estimating the posterior distribution over
        topic assignments.
        """
        # Initialize count matrices
        C_wt = np.zeros([self.V, self.T])
        C_dt = np.zeros([self.D, self.T])

        # topic ids are used as indices, so they must be stored as integers
        assignments = np.zeros([self.N, n_gibbs + 1], dtype=int)

        # flatten the corpus once instead of once per word
        if self.N > 0:
            words = np.concatenate(texts).astype(int)
        else:
            words = np.zeros(0, dtype=int)
        docs = self.word_document

        # Randomly initialize topic assignments for words
        for ii in range(self.N):
            topic = np.random.randint(0, self.T)
            assignments[ii, 0] = topic
            C_dt[docs[ii], topic] += 1
            C_wt[words[ii], topic] += 1

        # run collapsed Gibbs sampler
        for gg in range(n_gibbs):
            print("Gibbs iteration {} of {}".format(gg + 1, n_gibbs))
            for jj in range(self.N):
                token_idx = words[jj]
                doc = docs[jj]
                current = assignments[jj, gg]

                # Decrement count matrices by 1
                C_wt[token_idx, current] -= 1
                C_dt[doc, current] -= 1

                # Draw new topic from our approximation of the conditional dist.
                p_topics = self._estimate_topic_prob(token_idx, doc, C_wt, C_dt)
                sampled_topic = np.nonzero(np.random.multinomial(1, p_topics))[0][0]

                # Update count matrices
                C_wt[token_idx, sampled_topic] += 1
                C_dt[doc, sampled_topic] += 1
                assignments[jj, gg + 1] = sampled_topic

        return C_wt, C_dt, assignments
================================================
FILE: numpy_ml/linear_models/README.md
================================================
# Linear Models
The `linear_models` module includes:
1. [OLS linear regression](https://en.wikipedia.org/wiki/Ordinary_least_squares) with maximum likelihood parameter estimates via the normal equation.
    - Includes optional weight arguments for [weighted least squares](https://en.wikipedia.org/wiki/Weighted_least_squares)
    - Supports batch and online coefficient updates.
2. [Ridge regression / Tikhonov regularization](https://en.wikipedia.org/wiki/Tikhonov_regularization)
with maximum likelihood parameter estimates via the normal equation.
3. [Logistic regression](https://en.wikipedia.org/wiki/Logistic_regression) with maximum likelihood parameter estimates via gradient descent.
4. [Bayesian linear regression](https://en.wikipedia.org/wiki/Bayesian_linear_regression) with maximum a posteriori parameter estimates via [conjugacy](https://en.wikipedia.org/wiki/Conjugate_prior#Table_of_conjugate_distributions)
    - Known coefficient prior mean and known error variance
    - Known coefficient prior mean and unknown error variance
5. [Naive Bayes classifier](https://en.wikipedia.org/wiki/Naive_Bayes_classifier) with Gaussian feature likelihoods.
6. [Generalized linear model](https://en.wikipedia.org/wiki/Generalized_linear_model) with identity, log, and logit link functions.
## Plots
================================================
FILE: numpy_ml/linear_models/__init__.py
================================================
"""A module containing assorted linear models."""
from .ridge import RidgeRegression
from .glm import GeneralizedLinearModel
from .logistic import LogisticRegression
from .bayesian_regression import (
BayesianLinearRegressionKnownVariance,
BayesianLinearRegressionUnknownVariance,
)
from .naive_bayes import GaussianNBClassifier
from .linear_regression import LinearRegression
================================================
FILE: numpy_ml/linear_models/bayesian_regression.py
================================================
"""A module of Bayesian linear regression models."""
import numpy as np
import scipy.stats as stats
from numpy_ml.utils.testing import is_number, is_symmetric_positive_definite
class BayesianLinearRegressionUnknownVariance:
    def __init__(self, alpha=1, beta=2, mu=0, V=None, fit_intercept=True):
        r"""
        Bayesian linear regression model with unknown variance. Assumes a
        conjugate normal-inverse-gamma joint prior on the model parameters and
        error variance.

        Notes
        -----
        The current model uses a conjugate normal-inverse-gamma joint prior on
        model parameters **b** and error variance :math:`\sigma^2`. The joint
        and marginal priors over each are:

        .. math::

            \mathbf{b}, \sigma^2 &\sim
                \text{N-}\Gamma^{-1}(\mu, \mathbf{V}^{-1}, \alpha, \beta) \\
            \sigma^2 &\sim \text{InverseGamma}(\alpha, \beta) \\
            \mathbf{b} \mid \sigma^2 &\sim \mathcal{N}(\mu, \sigma^2 \mathbf{V})

        Parameters
        ----------
        alpha : float
            The shape parameter for the Inverse-Gamma prior on
            :math:`\sigma^2`. Must be strictly greater than 0. Default is 1.
        beta : float
            The scale parameter for the Inverse-Gamma prior on
            :math:`\sigma^2`. Must be strictly greater than 0. Default is 2.
        mu : :py:class:`ndarray <numpy.ndarray>` of shape `(M,)` or float
            The mean of the Gaussian prior on `b`. If a float, assume `mu`
            is ``np.ones(M) * mu``. Default is 0.
        V : :py:class:`ndarray <numpy.ndarray>` of shape `(N, N)` or `(N,)` or None
            A symmetric positive definite matrix that when multiplied
            element-wise by :math:`\sigma^2` gives the covariance matrix for
            the Gaussian prior on `b`. If a list, assume ``V = diag(V)``. If
            None, assume `V` is the identity matrix. Default is None.
        fit_intercept : bool
            Whether to fit an intercept term in addition to the coefficients in
            b. If True, the estimates for b will have `M + 1` dimensions, where
            the first dimension corresponds to the intercept. Default is True.

        Attributes
        ----------
        posterior : dict or None
            Frozen random variables for the posterior distributions
            :math:`P(\sigma^2 \mid X)` and :math:`P(b \mid X, \sigma^2)`,
            stored under the keys ``"sigma**2"`` and ``"b | sigma**2"``.
            None until :meth:`fit` has been called.
        posterior_predictive : frozen random variable or None
            Frozen random variable for the posterior predictive distribution,
            :math:`P(y \mid X)`. This value is only set following a call to
            :meth:`predict`.
        """  # noqa: E501
        # a scalar stands in for the identity until fit() learns M
        if V is None:
            V = 1.0

        if isinstance(V, list):
            V = np.array(V)

        if isinstance(V, np.ndarray):
            if V.ndim == 1:
                # a vector is read as the diagonal of V
                V = np.diag(V)
            elif V.ndim == 2:
                fstr = "V must be symmetric positive definite"
                assert is_symmetric_positive_definite(V), fstr

        self.alpha = alpha
        self.beta = beta
        self.mu = mu
        self.V = V
        self.fit_intercept = fit_intercept
        self.posterior = None
        self.posterior_predictive = None

    def fit(self, X, y):
        """
        Compute the posterior over model parameters using the data in `X` and
        `y`.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A dataset consisting of `N` examples, each of dimension `M`.
        y : :py:class:`ndarray <numpy.ndarray>` of shape `(N, K)`
            The targets for each of the `N` examples in `X`, where each target
            has dimension `K`.

        Returns
        -------
        self : :class:`BayesianLinearRegressionUnknownVariance` instance
        """  # noqa: E501
        # prepend a column of ones when an intercept is requested
        design = np.c_[np.ones(X.shape[0]), X] if self.fit_intercept else X
        n_obs, n_feat = design.shape

        alpha, beta = self.alpha, self.beta
        prior_cov, prior_mean = self.V, self.mu

        # expand scalar hyperparameters now that the dimension is known
        if is_number(prior_cov):
            prior_cov = prior_cov * np.eye(n_feat)

        if is_number(prior_mean):
            prior_mean = prior_mean * np.ones(n_feat)

        # inverse-gamma posterior parameters for sigma^2
        resid = y - (design @ prior_mean)
        precision = np.linalg.inv(design @ prior_cov @ design.T + np.eye(n_obs))

        shape = n_obs + alpha
        sigma = (1 / shape) * (alpha * beta ** 2 + resid @ precision @ resid)
        scale = sigma ** 2

        # point estimate of sigma^2 used to scale the coefficient covariance
        # NOTE(review): the original comment called this the mode of the
        # inverse gamma, but scale / (shape - 1) is the inverse-gamma mean
        # (the mode is scale / (shape + 1)) -- confirm which is intended
        sigma = scale / (shape - 1)

        # Gaussian posterior parameters for b
        prior_precision = np.linalg.inv(prior_cov)
        post_cov_unscaled = np.linalg.inv(prior_precision + design.T @ design)
        post_mean = post_cov_unscaled @ (prior_precision @ prior_mean + design.T @ y)
        post_cov = post_cov_unscaled * sigma

        # posterior distribution for sigma^2 and b
        self.posterior = {
            "sigma**2": stats.distributions.invgamma(a=shape, scale=scale),
            "b | sigma**2": stats.multivariate_normal(mean=post_mean, cov=post_cov),
        }
        return self

    def predict(self, X):
        """
        Return the MAP prediction for the targets associated with `X`.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(Z, M)`
            A dataset consisting of `Z` new examples, each of dimension `M`.

        Returns
        -------
        y_pred : :py:class:`ndarray <numpy.ndarray>` of shape `(Z, K)`
            The model predictions for the items in `X`.
        """
        # prepend a column of ones when an intercept is requested
        design = np.c_[np.ones(X.shape[0]), X] if self.fit_intercept else X

        coef = self.posterior["b | sigma**2"]
        pred_mean = design @ coef.mean
        pred_cov = design @ coef.cov @ design.T + np.eye(design.shape[0])

        # MAP estimate for y corresponds to the mean of the posterior
        # predictive
        self.posterior_predictive = stats.multivariate_normal(pred_mean, pred_cov)
        return pred_mean
class BayesianLinearRegressionKnownVariance:
    def __init__(self, mu=0, sigma=1, V=None, fit_intercept=True):
        r"""
        Bayesian linear regression model with known error variance and
        conjugate Gaussian prior on model parameters.

        Notes
        -----
        Uses a conjugate Gaussian prior on the model coefficients **b**:

        .. math::

            \mathbf{b} \mid \mu, \sigma^2, \mathbf{V}
                \sim \mathcal{N}(\mu, \sigma^2 \mathbf{V})

        Ridge regression is a special case of this model where :math:`\mu =
        \mathbf{0}`, :math:`\sigma = 1` and :math:`\mathbf{V} = \mathbf{I}`
        (i.e., the prior on the model coefficients **b** is a zero-mean, unit
        covariance Gaussian).

        Parameters
        ----------
        mu : :py:class:`ndarray <numpy.ndarray>` of shape `(M,)` or float
            The mean of the Gaussian prior on `b`. If a float, assume `mu` is
            ``np.ones(M) * mu``. Default is 0.
        sigma : float
            The square root of the scaling term for covariance of the Gaussian
            prior on `b`. Default is 1.
        V : :py:class:`ndarray <numpy.ndarray>` of shape `(N,N)` or `(N,)` or None
            A symmetric positive definite matrix that when multiplied
            element-wise by ``sigma ** 2`` gives the covariance matrix for the
            Gaussian prior on `b`. If a list, assume ``V = diag(V)``. If None,
            assume `V` is the identity matrix. Default is None.
        fit_intercept : bool
            Whether to fit an intercept term in addition to the coefficients in
            `b`. If True, the estimates for `b` will have `M + 1` dimensions, where
            the first dimension corresponds to the intercept. Default is True.

        Attributes
        ----------
        posterior : dict
            After :meth:`fit`, holds under the key ``"b"`` the frozen random
            variable for the posterior distribution
            :math:`P(b \mid X, \sigma^2)`. Empty before fitting.
        posterior_predictive : frozen random variable or empty dict
            Frozen random variable for the posterior predictive distribution,
            :math:`P(y \mid X)`. This value is only set following a call to
            :meth:`predict`; it is an empty dict before that.
        """  # noqa: E501
        # this is a placeholder until we know the dimensions of X
        V = 1.0 if V is None else V

        if isinstance(V, list):
            V = np.array(V)

        if isinstance(V, np.ndarray):
            if V.ndim == 1:
                V = np.diag(V)
            elif V.ndim == 2:
                fstr = "V must be symmetric positive definite"
                assert is_symmetric_positive_definite(V), fstr

        self.posterior = {}
        self.posterior_predictive = {}

        self.V = V
        self.mu = mu
        self.sigma = sigma
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        """
        Compute the posterior over model parameters using the data in `X` and
        `y`.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A dataset consisting of `N` examples, each of dimension `M`.
        y : :py:class:`ndarray <numpy.ndarray>` of shape `(N, K)`
            The targets for each of the `N` examples in `X`, where each target
            has dimension `K`.

        Returns
        -------
        self : :class:`BayesianLinearRegressionKnownVariance` instance
        """
        # convert X to a design matrix if we're fitting an intercept
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]

        N, M = X.shape

        # Expand scalar hyperparameters into local arrays. The stored
        # hyperparameters are left untouched so the model can be refit on
        # data with a different number of features.
        V = self.V * np.eye(M) if is_number(self.V) else self.V
        mu = self.mu * np.ones(M) if is_number(self.mu) else self.mu
        sigma = self.sigma

        V_inv = np.linalg.inv(V)
        L = np.linalg.inv(V_inv + X.T @ X)
        R = V_inv @ mu + X.T @ y

        mu = L @ R
        cov = L * sigma ** 2

        # posterior distribution over b conditioned on sigma
        self.posterior["b"] = stats.multivariate_normal(mu, cov)
        return self

    def predict(self, X):
        """
        Return the MAP prediction for the targets associated with `X`.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(Z, M)`
            A dataset consisting of `Z` new examples, each of dimension `M`.

        Returns
        -------
        y_pred : :py:class:`ndarray <numpy.ndarray>` of shape `(Z, K)`
            The MAP predictions for the targets associated with the items in
            `X`.
        """
        # convert X to a design matrix if we're fitting an intercept
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]

        I = np.eye(X.shape[0])  # noqa: E741
        mu = X @ self.posterior["b"].mean
        cov = X @ self.posterior["b"].cov @ X.T + I

        # MAP estimate for y corresponds to the mean/mode of the gaussian
        # posterior predictive distribution
        self.posterior_predictive = stats.multivariate_normal(mu, cov)
        return mu
================================================
FILE: numpy_ml/linear_models/glm.py
================================================
"""A module for the generalized linear model."""
import numpy as np
from numpy_ml.linear_models.linear_regression import LinearRegression
# machine epsilon; added inside logs / denominators below to avoid log(0) and
# division by zero at the boundary of the mean's domain
eps = np.finfo(float).eps


def _logit_b(theta):
    """Bernoulli cumulant ``log(1 + exp(theta))`` without overflowing `exp`."""
    return np.logaddexp(0, theta)


def _logit_b_prime(theta):
    """First derivative of ``log(1 + exp(theta))`` (the logistic sigmoid).

    Only ``exp(-|theta|)`` is evaluated, so the result stays finite (0 or 1
    in the limit) instead of becoming ``inf / inf`` for large `theta`.
    """
    theta = np.asarray(theta, dtype=float)
    decay = np.exp(-np.abs(theta))
    return np.where(theta >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))


def _logit_b_prime2(theta):
    """Second derivative of ``log(1 + exp(theta))``.

    The expression is symmetric in `theta`, so it is evaluated at
    ``-|theta|``; this avoids overflow while keeping full precision in the
    tails (no ``1 - s`` cancellation).
    """
    decay = np.exp(-np.abs(np.asarray(theta, dtype=float)))
    return decay / (1.0 + decay) ** 2


# Per-link building blocks consumed by the IRLS fit:
#   link / inv_link : g(mu) and its inverse
#   link_prime      : derivative of g with respect to mu
#   theta           : maps the mean to the value passed to b / b_prime / b_prime2
#   phi, a          : dispersion-related terms (not used by the fit below)
#   b, b_prime,
#   b_prime2        : cumulant function and its first two derivatives
#   p               : numerator of the IRLS working weights
#                     (presumably a prior weight -- confirm)
_GLM_LINKS = {
    "logit": {
        "link": lambda mu: np.log((mu + eps) / (1 - mu + eps)),
        "inv_link": lambda eta: 1.0 / (1.0 + np.exp(-eta)),
        "link_prime": lambda x: (1 / (x + eps)) + (1 / (1 - x + eps)),
        "theta": lambda mu: np.log((mu + eps) / (1 - mu + eps)),
        "phi": lambda x: np.ones(x.shape[0]),
        "a": lambda phi: phi,
        "b": _logit_b,
        "p": 1,
        "b_prime": _logit_b_prime,
        "b_prime2": _logit_b_prime2,
    },
    "identity": {
        "link": lambda mu: mu,
        "inv_link": lambda eta: eta,
        "link_prime": lambda x: np.ones_like(x),
        "theta": lambda mu: mu,
        "phi": lambda x: np.var(x, axis=0),
        "a": lambda phi: phi,
        "b": lambda theta: 0.5 * theta ** 2,
        "p": 1,
        "b_prime": lambda theta: theta,
        "b_prime2": lambda theta: np.ones_like(theta),
    },
    "log": {
        "link": lambda mu: np.log(mu + eps),
        "inv_link": lambda eta: np.exp(eta),
        "link_prime": lambda x: 1 / (x + eps),
        "theta": lambda mu: np.log(mu + eps),
        "phi": lambda x: np.ones(x.shape[0]),
        "a": lambda phi: phi,
        "p": 1,
        "b": lambda theta: np.exp(theta),
        "b_prime": lambda theta: np.exp(theta),
        "b_prime2": lambda theta: np.exp(theta),
    },
}
class GeneralizedLinearModel:
    def __init__(self, link, fit_intercept=True, tol=1e-5, max_iter=100):
        r"""
        A generalized linear model with maximum likelihood fit via
        iteratively reweighted least squares (IRLS).

        Notes
        -----
        The generalized linear model (GLM) [7]_ [8]_ assumes that each target/dependent
        variable :math:`y_i` in target vector :math:`\mathbf{y} = (y_1, \ldots,
        y_n)`, has been drawn independently from a pre-specified distribution
        in the exponential family [11]_ with unknown mean :math:`\mu_i`. The GLM
        models a (one-to-one, continuous, differentiable) function, *g*, of
        this mean value as a linear combination of the model parameters
        :math:`\mathbf{b}` and observed covariates, :math:`\mathbf{x}_i`:

        .. math::

            g(\mathbb{E}[y_i \mid \mathbf{x}_i]) =
                g(\mu_i) = \mathbf{b}^\top \mathbf{x}_i

        where *g* is known as the "link function" associated with the GLM. The
        choice of link function is informed by the instance of the exponential
        family the target is drawn from. Common examples:

        .. csv-table::
           :header: "Distribution", "Link", "Formula"
           :widths: 25, 20, 30

           "Normal", "Identity", ":math:`g(x) = x`"
           "Bernoulli", "Logit", ":math:`g(x) = \log(x) - \log(1 - x)`"
           "Binomial", "Logit", ":math:`g(x) = \log(x) - \log(n - x)`"
           "Poisson", "Log", ":math:`g(x) = \log(x)`"

        An iteratively re-weighted least squares (IRLS) algorithm [9]_ can be
        employed to find the maximum likelihood estimate for the model
        parameters :math:`\beta` in any instance of the generalized linear
        model. IRLS is equivalent to Fisher scoring [10]_, which itself is
        a slight modification of classic Newton-Raphson for finding the zeros
        of the first derivative of the model log-likelihood.

        References
        ----------
        .. [7] Nelder, J., & Wedderburn, R. (1972). Generalized linear
           models. *Journal of the Royal Statistical Society, Series A
           (General), 135(3)*: 370–384.
        .. [8] https://en.wikipedia.org/wiki/Generalized_linear_model
        .. [9] https://en.wikipedia.org/wiki/Iteratively_reweighted_least_squares
        .. [10] https://en.wikipedia.org/wiki/Scoring_algorithm
        .. [11] https://en.wikipedia.org/wiki/Exponential_family

        Parameters
        ----------
        link: {'identity', 'logit', 'log'}
            The link function to use during modeling.
        fit_intercept: bool
            Whether to fit an intercept term in addition to the model
            coefficients. Default is True.
        tol : float
            Convergence tolerance. IRLS stops once the L1 norm of the change
            in coefficients between successive iterations is at most
            ``tol * M``, where `M` is the number of columns of `X`. Default
            is 1e-5.
        max_iter: int
            The maximum number of iteratively reweighted least squares
            iterations to run during fitting. Default is 100.

        Attributes
        ----------
        beta : :py:class:`ndarray <numpy.ndarray>` of shape `(M,)` or None
            Fitted model coefficients (one extra leading entry for the
            intercept when `fit_intercept` is True). None until `fit` is
            called.
        """
        err_str = f"Valid link functions are {list(_GLM_LINKS.keys())} but got {link}"
        assert link in _GLM_LINKS, err_str

        self._is_fit = False
        self.tol = tol
        self.link = link
        self.beta = None
        self.max_iter = max_iter
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        """
        Find the maximum likelihood GLM coefficients via IRLS.

        Notes
        -----
        At most `max_iter` weighted least-squares solves are performed. If
        the coefficients have not converged by then, a ``RuntimeWarning`` is
        issued and the most recent estimate is kept.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A dataset consisting of `N` examples, each of dimension `M`.
        y : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
            The targets for each of the `N` examples in `X`.

        Returns
        -------
        self : :class:`GeneralizedLinearModel <numpy_ml.linear_models.GeneralizedLinearModel>` instance
        """  # noqa: E501
        import warnings

        y = np.squeeze(y)
        assert y.ndim == 1

        N, M = X.shape
        L = _GLM_LINKS[self.link]

        # starting values for parameters: every example starts at the grand mean
        mu = np.ones_like(y) * np.mean(y)
        eta = L["link"](mu)
        theta = L["theta"](mu)

        # convert X to a design matrix if we're fitting an intercept
        if self.fit_intercept:
            X = np.c_[np.ones(N), X]

        # IRLS for GLM
        i = 0
        diff, beta = np.inf, np.inf
        while diff > (self.tol * M):
            # `i` counts completed iterations, so stop as soon as the budget
            # of `max_iter` solves has been spent
            if i >= self.max_iter:
                warnings.warn("Warning: Model did not converge", RuntimeWarning)
                break

            # compute first-order Taylor approx. (the "working response")
            z = eta + (y - mu) * L["link_prime"](mu)
            w = L["p"] / (L["b_prime2"](theta) * L["link_prime"](mu) ** 2)

            # perform weighted least-squares on z; the intercept column is
            # already part of X, hence fit_intercept=False here
            wlr = LinearRegression(fit_intercept=False)
            beta_new = wlr.fit(X, z, weights=w).beta.ravel()

            eta = X @ beta_new
            mu = L["inv_link"](eta)
            theta = L["theta"](mu)

            diff = np.linalg.norm(beta - beta_new, ord=1)
            beta = beta_new
            i += 1

        self.beta = beta
        self._is_fit = True
        return self

    def predict(self, X):
        r"""
        Use the trained model to generate predictions for the distribution
        means, :math:`\mu`, associated with the collection of data points in
        **X**.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(Z, M)`
            A dataset consisting of `Z` new examples, each of dimension `M`.

        Returns
        -------
        mu_pred : :py:class:`ndarray <numpy.ndarray>` of shape `(Z,)`
            The model predictions for the expected value of the target
            associated with each item in `X`.
        """
        assert self._is_fit, "Must call `fit` before generating predictions"
        L = _GLM_LINKS[self.link]

        # convert X to a design matrix if we're using an intercept
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]

        # map the linear predictor back to the mean scale
        mu_pred = L["inv_link"](X @ self.beta)
        return mu_pred.ravel()
================================================
FILE: numpy_ml/linear_models/linear_regression.py
================================================
"""Linear regression module."""
import numpy as np
class LinearRegression:
    def __init__(self, fit_intercept=True):
        r"""
        A weighted linear least-squares regression model.

        Notes
        -----
        In weighted linear least-squares regression [1]_, a real-valued target
        vector, **y**, is modeled as a linear combination of covariates, **X**,
        and model coefficients, :math:`\beta`:

        .. math::

            y_i = \beta^\top \mathbf{x}_i + \epsilon_i

        In this equation :math:`\epsilon_i \sim \mathcal{N}(0, \sigma^2_i)` is
        the error term associated with example :math:`i`, and
        :math:`\sigma^2_i` is the variance of the corresponding example.

        Under this model, the maximum-likelihood estimate for the regression
        coefficients, :math:`\beta`, is:

        .. math::

            \hat{\beta} = \Sigma^{-1} \mathbf{X}^\top \mathbf{Wy}

        where :math:`\Sigma^{-1} = (\mathbf{X}^\top \mathbf{WX})^{-1}` and
        **W** is a diagonal matrix of weights, with each entry inversely
        proportional to the variance of the corresponding measurement. When
        **W** is the identity matrix the examples are weighted equally and the
        model reduces to standard linear least squares [2]_.

        References
        ----------
        .. [1] https://en.wikipedia.org/wiki/Weighted_least_squares
        .. [2] https://en.wikipedia.org/wiki/General_linear_model

        Parameters
        ----------
        fit_intercept : bool
            Whether to fit an intercept term in addition to the model
            coefficients. Default is True.

        Attributes
        ----------
        beta : :py:class:`ndarray <numpy.ndarray>` of shape `(M, K)` or None
            Fitted model coefficients (`M + 1` rows, intercept first, when
            `fit_intercept` is True). A 1-D target vector is treated as
            `K = 1`.
        sigma_inv : :py:class:`ndarray <numpy.ndarray>` of shape `(M, M)` or None
            (Pseudo-)inverse of the weighted Gram matrix
            :math:`\mathbf{X}^\top \mathbf{WX}` of the design matrix.
        """
        self.beta = None
        self.sigma_inv = None
        self.fit_intercept = fit_intercept

        self._is_fit = False

    @staticmethod
    def _sqrt_weights(weights, n_examples):
        """Validate per-example `weights`; return their square roots, shape `(n_examples,)`."""
        weights = np.ones(n_examples) if weights is None else np.atleast_1d(weights)
        weights = np.squeeze(weights) if weights.size > 1 else weights
        err_str = f"weights must have shape ({n_examples},) but got {weights.shape}"
        assert weights.shape == (n_examples,), err_str
        return np.sqrt(weights)

    @staticmethod
    def _intercept_column(w):
        """Return the weighted intercept entries as a 1-D vector.

        Accepts either the vector of square-root weights or a diagonal
        matrix holding them on its diagonal.
        """
        w = np.asarray(w)
        return np.diag(w) if w.ndim == 2 else np.atleast_1d(w)

    def update(self, X, y, weights=None):
        r"""
        Incrementally update the linear least-squares coefficients for a set of
        new examples.

        Notes
        -----
        The recursive least-squares algorithm [3]_ [4]_ is used to efficiently
        update the regression parameters as new examples become available. For
        a single new example :math:`(\mathbf{x}_{t+1}, \mathbf{y}_{t+1})`, the
        parameter updates are

        .. math::

            \beta_{t+1} = \left(
                \mathbf{X}_{1:t}^\top \mathbf{X}_{1:t} +
                    \mathbf{x}_{t+1}\mathbf{x}_{t+1}^\top \right)^{-1}
                \left(
                    \mathbf{X}_{1:t}^\top \mathbf{Y}_{1:t} +
                        \mathbf{x}_{t+1} \mathbf{y}_{t+1}^\top
                \right)

        where :math:`\beta_{t+1}` are the updated regression coefficients,
        :math:`\mathbf{X}_{1:t}` and :math:`\mathbf{Y}_{1:t}` are the set of
        examples observed from timestep 1 to *t*.

        In the single-example case, the RLS algorithm uses the Sherman-Morrison
        formula [5]_ to avoid re-inverting the covariance matrix on each new
        update. In the multi-example case (i.e., where :math:`\mathbf{X}_{t+1}`
        and :math:`\mathbf{y}_{t+1}` are matrices of `N` examples each), we use
        the generalized Woodbury matrix identity [6]_ to update the inverse
        covariance. This comes at a performance cost, but is still more
        performant than doing multiple single-example updates if *N* is large.

        `beta` and `sigma_inv` are modified in place.

        References
        ----------
        .. [3] Gauss, C. F. (1821) *Theoria combinationis observationum
           erroribus minimis obnoxiae*, Werke, 4. Gottinge
        .. [4] https://en.wikipedia.org/wiki/Recursive_least_squares_filter
        .. [5] https://en.wikipedia.org/wiki/Sherman%E2%80%93Morrison_formula
        .. [6] https://en.wikipedia.org/wiki/Woodbury_matrix_identity

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A dataset consisting of `N` examples, each of dimension `M`. A
            1-D array is treated as a single example.
        y : :py:class:`ndarray <numpy.ndarray>` of shape `(N, K)`
            The targets for each of the `N` examples in `X`, where each target
            has dimension `K`. When `N > 1`, a 1-D array of length `N` is
            treated as `N` scalar targets; otherwise a 1-D array is the
            `K`-dimensional target of the single example.
        weights : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)` or None
            Weights associated with the examples in `X`. Examples
            with larger weights exert greater influence on model fit.  When
            `y` is a vector (i.e., `K = 1`), weights should be set to the
            reciprocal of the variance for each measurement (i.e., :math:`w_i
            = 1/\sigma^2_i`). When `K > 1`, it is assumed that all columns of
            `y` share the same weight :math:`w_i`. If None, examples are
            weighted equally, resulting in the standard linear least squares
            update.  Default is None.

        Returns
        -------
        self : :class:`LinearRegression <numpy_ml.linear_models.LinearRegression>` instance

        Raises
        ------
        RuntimeError
            If called before `fit`.
        ValueError
            If `X` and `y` do not describe the same number of examples.
        """  # noqa: E501
        if not self._is_fit:
            raise RuntimeError("You must call the `fit` method before calling `update`")

        X = np.atleast_2d(X)
        n_new = X.shape[0]

        y = np.asarray(y)
        if y.ndim == 1 and n_new > 1 and y.shape[0] == n_new:
            # one scalar target per example -> column vector
            y = y[:, None]
        else:
            y = np.atleast_2d(y)

        if y.ndim != 2 or y.shape[0] != n_new:
            raise ValueError(
                f"X has {n_new} example(s) but y has shape {y.shape}"
            )

        # scale X and y by the weight associated with each example; row-wise
        # broadcasting is equivalent to left-multiplying by diag(sqrt(w))
        # without materialising an N x N matrix
        sqrt_w = self._sqrt_weights(weights, n_new)
        X, y = sqrt_w[:, None] * X, sqrt_w[:, None] * y

        if n_new == 1:
            self._update1D(X, y, sqrt_w)
        else:
            self._update2D(X, y, sqrt_w)
        return self

    def _update1D(self, x, y, w):
        """Sherman-Morrison update for a single (already weighted) example.

        `w` holds the square-root weight of the example; it becomes the
        intercept entry of the design vector.
        """
        beta, S_inv = self.beta, self.sigma_inv

        # convert x to a design vector if we're fitting an intercept
        if self.fit_intercept:
            x = np.c_[self._intercept_column(w), x]

        # update the inverse of the covariance matrix via Sherman-Morrison
        S_inv -= (S_inv @ x.T @ x @ S_inv) / (1 + x @ S_inv @ x.T)

        # update the model coefficients
        beta += S_inv @ x.T @ (y - x @ beta)

    def _update2D(self, X, y, W):
        """Woodbury update for multiple (already weighted) examples.

        `W` holds the square-root weights of the examples; they become the
        intercept column of the design matrix.
        """
        beta, S_inv = self.beta, self.sigma_inv

        # convert X to a design matrix if we're fitting an intercept
        if self.fit_intercept:
            X = np.c_[self._intercept_column(W), X]

        I = np.eye(X.shape[0])  # noqa: E741

        # update the inverse of the covariance matrix via Woodbury identity
        S_inv -= S_inv @ X.T @ np.linalg.pinv(I + X @ S_inv @ X.T) @ X @ S_inv

        # update the model coefficients
        beta += S_inv @ X.T @ (y - X @ beta)

    def fit(self, X, y, weights=None):
        r"""
        Fit regression coefficients via maximum likelihood.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A dataset consisting of `N` examples, each of dimension `M`.
        y : :py:class:`ndarray <numpy.ndarray>` of shape `(N, K)` or `(N,)`
            The targets for each of the `N` examples in `X`, where each target
            has dimension `K`. A 1-D array is treated as `K = 1`.
        weights : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)` or None
            Weights associated with the examples in `X`. Examples
            with larger weights exert greater influence on model fit.  When
            `y` is a vector (i.e., `K = 1`), weights should be set to the
            reciprocal of the variance for each measurement (i.e., :math:`w_i
            = 1/\sigma^2_i`). When `K > 1`, it is assumed that all columns of
            `y` share the same weight :math:`w_i`. If None, examples are
            weighted equally, resulting in the standard linear least squares
            update.  Default is None.

        Returns
        -------
        self : :class:`LinearRegression <numpy_ml.linear_models.LinearRegression>` instance

        Raises
        ------
        ValueError
            If `X` and `y` do not describe the same number of examples.
        """  # noqa: E501
        X = np.asarray(X)
        if X.ndim == 1:
            # a flat vector is N examples of a single feature
            X = X[:, None]
        N = X.shape[0]

        y = np.atleast_1d(np.asarray(y))
        if y.ndim == 1:
            # keep beta in the documented (M, K) layout even when K = 1
            y = y[:, None]
        if y.ndim != 2 or y.shape[0] != N:
            raise ValueError(f"X has {N} example(s) but y has shape {y.shape}")

        # scale X and y by the weight associated with each example
        sqrt_w = self._sqrt_weights(weights, N)
        X, y = sqrt_w[:, None] * X, sqrt_w[:, None] * y

        # convert X to a design matrix if we're fitting an intercept; the
        # column of ones is weighted like every other column
        if self.fit_intercept:
            X = np.c_[sqrt_w, X]

        self.sigma_inv = np.linalg.pinv(X.T @ X)
        self.beta = self.sigma_inv @ X.T @ y

        self._is_fit = True
        return self

    def predict(self, X):
        """
        Use the trained model to generate predictions on a new collection of
        data points.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(Z, M)`
            A dataset consisting of `Z` new examples, each of dimension `M`.

        Returns
        -------
        y_pred : :py:class:`ndarray <numpy.ndarray>` of shape `(Z, K)`
            The model predictions for the items in `X`.

        Raises
        ------
        RuntimeError
            If no coefficients are available yet.
        """
        if self.beta is None:
            raise RuntimeError("You must call the `fit` method before calling `predict`")

        # convert X to a design matrix if we're fitting an intercept
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]
        return X @ self.beta
================================================
FILE: numpy_ml/linear_models/logistic.py
================================================
"""Logistic regression module"""
import numpy as np
class LogisticRegression:
    def __init__(self, penalty="l2", gamma=0, fit_intercept=True):
        r"""
        A simple binary logistic regression model fit via gradient descent on
        the penalized negative log likelihood.

        Notes
        -----
        In simple binary logistic regression, the entries in a binary target
        vector :math:`\mathbf{y} = (y_1, \ldots, y_N)` are assumed to have been
        drawn from a series of independent Bernoulli random variables with
        expected values :math:`p_1, \ldots, p_N`. The binary logistic regression
        model models the logit of these unknown mean parameters as a linear
        function of the model coefficients, :math:`\mathbf{b}`, and the
        covariates for the corresponding example, :math:`\mathbf{x}_i`:

        .. math::

            \text{Logit}(p_i) =
                \log \left( \frac{p_i}{1 - p_i} \right) = \mathbf{b}^\top\mathbf{x}_i

        The model predictions :math:`\hat{\mathbf{y}}` are the expected values
        of the Bernoulli parameters for each example:

        .. math::

            \hat{y}_i =
                \mathbb{E}[y_i \mid \mathbf{x}_i] = \sigma(\mathbf{b}^\top \mathbf{x}_i)

        where :math:`\sigma` is the logistic sigmoid function :math:`\sigma(x)
        = \frac{1}{1 + e^{-x}}`. Under this model, the (penalized) negative log
        likelihood of the targets **y** is

        .. math::

            - \log \mathcal{L}(\mathbf{b}, \mathbf{y}) = -\frac{1}{N} \left[
                \left(
                    \sum_{i=0}^N y_i \log(\hat{y}_i) +
                      (1-y_i) \log(1-\hat{y}_i)
                \right) - R(\mathbf{b}, \gamma)
            \right]

        where

        .. math::

            R(\mathbf{b}, \gamma) = \left\{
                \begin{array}{lr}
                    \frac{\gamma}{2} ||\mathbf{b}||_2^2 & :\texttt{ penalty = 'l2'}\\
                    \gamma ||\mathbf{b}||_1 & :\texttt{ penalty = 'l1'}
                \end{array}
                \right.

        is a regularization penalty, :math:`\gamma` is a regularization weight,
        `N` is the number of examples in **y**, :math:`\hat{y}_i` is the model
        prediction on example *i*, and **b** is the vector of model
        coefficients.

        Parameters
        ----------
        penalty : {'l1', 'l2'}
            The type of regularization penalty to apply on the coefficients
            `beta`. Default is 'l2'.
        gamma : float
            The regularization weight. Larger values correspond to larger
            regularization penalties, and a value of 0 indicates no penalty.
            Default is 0.
        fit_intercept : bool
            Whether to fit an intercept term in addition to the coefficients in
            b. If True, the estimates for `beta` will have `M + 1` dimensions,
            where the first dimension corresponds to the intercept. Default is
            True.

        Attributes
        ----------
        beta : :py:class:`ndarray <numpy.ndarray>` of shape `(M,)` or None
            Fitted model coefficients (`M + 1` entries when `fit_intercept`
            is True). None until `fit` is called.
        """
        err_msg = "penalty must be 'l1' or 'l2', but got: {}".format(penalty)
        assert penalty in ["l2", "l1"], err_msg
        self.beta = None
        self.gamma = gamma
        self.penalty = penalty
        self.fit_intercept = fit_intercept

    def fit(self, X, y, lr=0.01, tol=1e-7, max_iter=1e7):
        """
        Fit the regression coefficients via gradient descent on the negative
        log likelihood.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A dataset consisting of `N` examples, each of dimension `M`.
        y : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
            The binary targets for each of the `N` examples in `X`.
        lr : float
            The gradient descent learning rate. Default is 0.01.
        tol : float
            Descent stops as soon as one step decreases the loss by less
            than `tol` (this includes steps that increase it). Default is
            1e-7.
        max_iter : float
            The maximum number of iterations to run the gradient descent
            solver. Default is 1e7.

        Returns
        -------
        self : :class:`LogisticRegression` instance
        """
        # convert X to a design matrix if we're fitting an intercept
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]

        l_prev = np.inf
        # coefficients start from a random point in [0, 1)^M
        self.beta = np.random.rand(X.shape[1])
        for _ in range(int(max_iter)):
            y_pred = _sigmoid(X @ self.beta)
            loss = self._NLL(X, y, y_pred)
            if l_prev - loss < tol:
                break
            l_prev = loss
            self.beta -= lr * self._NLL_grad(X, y, y_pred)
        return self

    def _NLL(self, X, y, y_pred):
        r"""
        Penalized negative log likelihood of the targets under the current
        model.

        .. math::

            \text{NLL} = -\frac{1}{N} \left[
                \left(
                    \sum_{i=0}^N y_i \log(\hat{y}_i) + (1-y_i) \log(1-\hat{y}_i)
                \right) - R(\mathbf{b}, \gamma)
            \right]
        """
        N = X.shape[0]
        beta, gamma = self.beta, self.gamma
        order = 2 if self.penalty == "l2" else 1
        norm_beta = np.linalg.norm(beta, ord=order)

        # only the term matching each example's label contributes
        nll = -np.log(y_pred[y == 1]).sum() - np.log(1 - y_pred[y == 0]).sum()
        penalty = (gamma / 2) * norm_beta ** 2 if order == 2 else gamma * norm_beta
        return (penalty + nll) / N

    def _NLL_grad(self, X, y, y_pred):
        r"""
        Gradient of the penalized negative log likelihood wrt beta.

        .. math::

            \nabla_{\mathbf{b}} \text{NLL} = \frac{1}{N} \left[
                \nabla_{\mathbf{b}} R(\mathbf{b}, \gamma) -
                    \mathbf{X}^\top (\mathbf{y} - \hat{\mathbf{y}})
            \right]

        The penalty enters the loss with a positive sign, so its gradient
        does too: descending along it shrinks the coefficients.
        """
        N = X.shape[0]
        p, beta, gamma = self.penalty, self.beta, self.gamma
        d_penalty = gamma * beta if p == "l2" else gamma * np.sign(beta)
        return (d_penalty - (y - y_pred) @ X) / N

    def predict(self, X):
        """
        Use the trained model to generate prediction probabilities on a new
        collection of data points.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(Z, M)`
            A dataset consisting of `Z` new examples, each of dimension `M`.

        Returns
        -------
        y_pred : :py:class:`ndarray <numpy.ndarray>` of shape `(Z,)`
            The model prediction probabilities for the items in `X`.
        """
        # convert X to a design matrix if we're fitting an intercept
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]
        return _sigmoid(X @ self.beta)
def _sigmoid(x):
    """Evaluate the logistic sigmoid, ``1 / (1 + exp(-x))``, elementwise."""
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator
================================================
FILE: numpy_ml/linear_models/naive_bayes.py
================================================
"""A module for naive Bayes classifiers"""
import numpy as np
class GaussianNBClassifier:
    def __init__(self, eps=1e-6):
        r"""
        A naive Bayes classifier for real-valued data.

        Notes
        -----
        The naive Bayes model assumes the features of each training example
        :math:`\mathbf{x}` are mutually independent given the example label
        *y*:

        .. math::

            P(\mathbf{x}_i \mid y_i) = \prod_{j=1}^M P(x_{i,j} \mid y_i)

        where :math:`M` is the number of features in the :math:`i^{th}`
        example :math:`\mathbf{x}_i` and :math:`y_i` is the label associated
        with the :math:`i^{th}` example.

        Combining the conditional independence assumption with a simple
        application of Bayes' theorem gives the naive Bayes classification
        rule:

        .. math::

            \hat{y} &= \arg \max_y P(y \mid \mathbf{x}) \\
                    &= \arg \max_y  P(y) P(\mathbf{x} \mid y) \\
                    &= \arg \max_y  P(y) \prod_{j=1}^M P(x_j \mid y)

        In the final expression, the prior class probability :math:`P(y)` can
        be specified in advance or estimated empirically from the training
        data.

        In the Gaussian version of the naive Bayes model, the feature
        likelihood is assumed to be normally distributed for each class:

        .. math::

            \mathbf{x}_i \mid y_i = c, \theta \sim \mathcal{N}(\mu_c, \Sigma_c)

        where :math:`\theta` is the set of model parameters: :math:`\{\mu_1,
        \Sigma_1, \ldots, \mu_K, \Sigma_K\}`, :math:`K` is the total number of
        unique classes present in the data, and the parameters for the Gaussian
        associated with class :math:`c`, :math:`\mu_c` and :math:`\Sigma_c`
        (where :math:`1 \leq c \leq K`), are estimated via MLE from the set of
        training examples with label :math:`c`.

        Parameters
        ----------
        eps : float
            A value added to the variance to prevent numerical error. Default
            is 1e-6.

        Attributes
        ----------
        parameters : dict
            Dictionary of model parameters: "mean", the `(K, M)` array of
            feature means under each class, "sigma", the `(K, M)` array of
            feature variances under each class, and "prior", the `(K,)` array of
            empirical prior probabilities for each class label.
        hyperparameters : dict
            Dictionary of model hyperparameters
        labels : :py:class:`ndarray <numpy.ndarray>` of shape `(K,)`
            An array containing the unique class labels for the training
            examples.
        """
        self.labels = None
        self.hyperparameters = {"eps": eps}
        # every entry is filled in by `fit`; shapes are (K, M), (K, M), (K,)
        self.parameters = {"mean": None, "sigma": None, "prior": None}

    def fit(self, X, y):
        """
        Fit the model parameters via maximum likelihood.

        Notes
        -----
        The estimates are written into the :py:attr:`parameters` dictionary
        under these keys:

        "mean": :py:class:`ndarray <numpy.ndarray>` of shape `(K, M)`
            Feature means for each of the `K` label classes
        "sigma": :py:class:`ndarray <numpy.ndarray>` of shape `(K, M)`
            Feature variances for each of the `K` label classes (with `eps`
            added)
        "prior":  :py:class:`ndarray <numpy.ndarray>` of shape `(K,)`
            Prior probability of each of the `K` label classes, estimated
            empirically from the training data

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A dataset consisting of `N` examples, each of dimension `M`
        y: :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
            The class label for each of the `N` examples in `X`

        Returns
        -------
        self : :class:`GaussianNBClassifier <numpy_ml.linear_models.GaussianNBClassifier>` instance
        """  # noqa: E501
        params = self.parameters
        variance_floor = self.hyperparameters["eps"]

        self.labels = np.unique(y)
        n_classes = len(self.labels)
        n_examples, n_features = X.shape

        params["mean"] = np.zeros((n_classes, n_features))
        params["sigma"] = np.zeros((n_classes, n_features))
        params["prior"] = np.zeros((n_classes,))

        for row, label in enumerate(self.labels):
            members = X[y == label, :]

            params["mean"][row, :] = np.mean(members, axis=0)
            params["sigma"][row, :] = np.var(members, axis=0) + variance_floor
            # empirical class frequency
            params["prior"][row] = members.shape[0] / n_examples
        return self

    def predict(self, X):
        """
        Use the trained classifier to predict the class label for each example
        in **X**.

        Parameters
        ----------
        X: :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A dataset of `N` examples, each of dimension `M`

        Returns
        -------
        labels : :py:class:`ndarray <numpy.ndarray>` of shape `(N)`
            The predicted class labels for each example in `X`
        """
        best_class = self._log_posterior(X).argmax(axis=1)
        return self.labels[best_class]

    def _log_posterior(self, X):
        r"""
        Compute the (unnormalized) log posterior for each class.

        Parameters
        ----------
        X: :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A dataset of `N` examples, each of dimension `M`

        Returns
        -------
        log_posterior : :py:class:`ndarray <numpy.ndarray>` of shape `(N, K)`
            Unnormalized log posterior probability of each class for each
            example in `X`
        """
        # one column per class, in the same order as `self.labels`
        per_class = [
            self._log_class_posterior(X, idx) for idx in range(len(self.labels))
        ]
        return np.column_stack(per_class)

    def _log_class_posterior(self, X, class_idx):
        r"""
        Compute the (unnormalized) log posterior for the label at index
        `class_idx` in :py:attr:`labels`.

        Notes
        -----
        Up to an additive constant, the log posterior for example
        :math:`\mathbf{x}_i` and class :math:`c` is

        .. math::

            \log P(y_i = c \mid \mathbf{x}_i, \theta)
                &= \log P(y=c \mid \theta) +
                    \log P(\mathbf{x}_i \mid y_i = c, \theta) + \text{const} \\
                &= \log P(y=c \mid \theta) +
                    \sum_{j=1}^M \log P(x_{i,j} \mid y_i = c, \theta) + \text{const}

        In the Gaussian naive Bayes model, the feature likelihood for class
        :math:`c`, :math:`P(\mathbf{x}_i \mid y_i = c, \theta)` is assumed to
        be normally distributed

        .. math::

            \mathbf{x}_i \mid y_i = c, \theta \sim \mathcal{N}(\mu_c, \Sigma_c)

        Parameters
        ----------
        X: :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A dataset of `N` examples, each of dimension `M`
        class_idx : int
            The index of the current class in :py:attr:`labels`

        Returns
        -------
        log_class_posterior : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
            Unnormalized log probability of the label at index `class_idx`
            in :py:attr:`labels` for each example in `X`
        """  # noqa: E501
        params = self.parameters
        class_mean = params["mean"][class_idx]
        class_var = params["sigma"][class_idx]
        log_prior = np.log(params["prior"][class_idx])

        # diagonal-Gaussian log density: normalizer shared by all examples,
        # minus each example's scaled squared distance from the class mean
        log_normalizer = -0.5 * np.sum(np.log(2 * np.pi * class_var))
        scaled_sq_dist = 0.5 * np.sum(((X - class_mean) ** 2) / class_var, axis=1)
        return log_normalizer - scaled_sq_dist + log_prior
================================================
FILE: numpy_ml/linear_models/ridge.py
================================================
"""Ridge regression module"""
import numpy as np
class RidgeRegression:
    def __init__(self, alpha=1, fit_intercept=True):
        r"""
        A ridge regression model with maximum likelihood fit via the normal
        equations.

        Notes
        -----
        Ridge regression is a biased estimator for linear models which adds an
        additional penalty proportional to the L2-norm of the model
        coefficients to the standard mean-squared-error loss:

        .. math::

            \mathcal{L}_{Ridge} = (\mathbf{y} - \mathbf{X} \beta)^\top
                (\mathbf{y} - \mathbf{X} \beta) + \alpha ||\beta||_2^2

        where :math:`\alpha` is a weight controlling the severity of the
        penalty.

        Given data matrix **X** and target vector **y**, the maximum-likelihood
        estimate for ridge coefficients, :math:`\beta`, is:

        .. math::

            \hat{\beta} =
                \left(\mathbf{X}^\top \mathbf{X} + \alpha \mathbf{I} \right)^{-1}
                    \mathbf{X}^\top \mathbf{y}

        It turns out that this estimate for :math:`\beta` also corresponds to
        the MAP estimate if we assume a multivariate Gaussian prior on the
        model coefficients, assuming that the data matrix **X** has been
        standardized and the target values **y** centered at 0:

        .. math::

            \beta \sim \mathcal{N}\left(\mathbf{0}, \frac{1}{2M} \mathbf{I}\right)

        Parameters
        ----------
        alpha : float
            L2 regularization coefficient. Larger values correspond to larger
            penalty on the L2 norm of the model coefficients. Default is 1.
        fit_intercept : bool
            Whether to fit an additional intercept term. Default is True.

        Attributes
        ----------
        beta : :py:class:`ndarray <numpy.ndarray>` of shape `(M, K)` or None
            Fitted model coefficients.
        """
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        # populated by `fit`
        self.beta = None

    def fit(self, X, y):
        """
        Fit the regression coefficients via maximum likelihood.

        Notes
        -----
        When `fit_intercept` is True the column of ones is part of the
        design matrix, so the intercept is penalized together with the other
        coefficients.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A dataset consisting of `N` examples, each of dimension `M`.
        y : :py:class:`ndarray <numpy.ndarray>` of shape `(N, K)`
            The targets for each of the `N` examples in `X`, where each target
            has dimension `K`.

        Returns
        -------
        self : :class:`RidgeRegression <numpy_ml.linear_models.RidgeRegression>` instance
        """  # noqa: E501
        # prepend a bias column when an intercept is requested
        if self.fit_intercept:
            n_examples = X.shape[0]
            X = np.c_[np.ones(n_examples), X]

        n_coefs = X.shape[1]
        regularized_gram = X.T @ X + self.alpha * np.eye(n_coefs)

        # closed-form solution of the penalized normal equations
        projector = np.linalg.inv(regularized_gram) @ X.T
        self.beta = projector @ y
        return self

    def predict(self, X):
        """
        Use the trained model to generate predictions on a new collection of
        data points.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(Z, M)`
            A dataset consisting of `Z` new examples, each of dimension `M`.

        Returns
        -------
        y_pred : :py:class:`ndarray <numpy.ndarray>` of shape `(Z, K)`
            The model predictions for the items in `X`.
        """
        # prepend a bias column when an intercept was fitted
        if self.fit_intercept:
            n_examples = X.shape[0]
            X = np.c_[np.ones(n_examples), X]
        return X.dot(self.beta)
================================================
FILE: numpy_ml/neural_nets/README.md
================================================
# Neural network models
This module implements building-blocks for larger neural network models in the
Keras-style. This module does _not_ implement a general autograd system in order
to emphasize conceptual understanding over flexibility.
1. **Activations**. Common activation nonlinearities. Includes:
- Rectified linear units (ReLU) ([Hahnloser et al., 2000](http://invibe.net/biblio_database_dyva/woda/data/att/6525.file.pdf))
- Leaky rectified linear units
([Maas, Hannun, & Ng, 2013](https://ai.stanford.edu/~amaas/papers/relu_hybrid_icml2013_final.pdf))
- Exponential linear units (ELU) ([Clevert, Unterthiner, & Hochreiter, 2016](http://arxiv.org/abs/1511.07289))
- Scaled exponential linear units ([Klambauer, Unterthiner, & Mayr, 2017](https://arxiv.org/pdf/1706.02515.pdf))
- Softplus units
- Hard sigmoid units
- Exponential units
- Hyperbolic tangent (tanh)
- Logistic sigmoid
- Affine
2. **Losses**. Common loss functions. Includes:
- Squared error
- Categorical cross entropy
- VAE Bernoulli loss ([Kingma & Welling, 2014](https://arxiv.org/abs/1312.6114))
- Wasserstein loss with gradient penalty ([Gulrajani et al., 2017](https://arxiv.org/pdf/1704.00028.pdf))
- Noise contrastive estimation (NCE) loss ([Gutmann & Hyvärinen, 2010](https://www.cs.helsinki.fi/u/ahyvarin/papers/Gutmann10AISTATS.pdf); [Mnih & Teh, 2012](https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf))
3. **Wrappers**. Layer wrappers. Includes:
- Dropout ([Srivastava, et al., 2014](http://www.jmlr.org/papers/volume15/srivastava14a/srivastava14a.pdf))
4. **Layers**. Common layers / layer-wise operations that can be composed to
create larger neural networks. Includes:
- Fully-connected
- Sparse evolutionary ([Mocanu et al., 2018](https://www.nature.com/articles/s41467-018-04316-3))
- Dot-product attention ([Luong, Pham, & Manning, 2015](https://arxiv.org/pdf/1508.04025.pdf); [Vaswani et al., 2017](https://arxiv.org/pdf/1706.03762.pdf))
- 1D and 2D convolution (with stride, padding, and dilation) ([van den Oord et al., 2016](https://arxiv.org/pdf/1609.03499.pdf); [Yu & Koltun, 2016](https://arxiv.org/pdf/1511.07122.pdf))
- 2D "deconvolution" (with stride and padding) ([Zeiler et al., 2010](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf))
- Restricted Boltzmann machines (with CD-_n_ training) ([Smolensky, 1986](http://stanford.edu/~jlmcc/papers/PDP/Volume%201/Chap6_PDP86.pdf); [Carreira-Perpiñán & Hinton, 2005](http://www.cs.toronto.edu/~fritz/absps/cdmiguel.pdf))
- Elementwise multiplication
- Embedding
- Summation
- Flattening
- Softmax
- Max & average pooling
- 1D and 2D batch normalization ([Ioffe & Szegedy, 2015](http://proceedings.mlr.press/v37/ioffe15.pdf))
- 1D and 2D layer normalization ([Ba, Kiros, & Hinton, 2016](https://arxiv.org/pdf/1607.06450.pdf))
- Recurrent ([Elman, 1990](https://crl.ucsd.edu/~elman/Papers/fsit.pdf))
- Long short-term memory (LSTM) ([Hochreiter & Schmidhuber, 1997](http://www.bioinf.jku.at/publications/older/2604.pdf))
5. **Optimizers**. Common modifications to stochastic gradient descent.
Includes:
- SGD with momentum ([Rumelhart, Hinton, & Williams, 1986](https://www.cs.princeton.edu/courses/archive/spring18/cos495/res/backprop_old.pdf))
- AdaGrad ([Duchi, Hazan, & Singer, 2011](http://jmlr.org/papers/volume12/duchi11a/duchi11a.pdf))
- RMSProp ([Tieleman & Hinton, 2012](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf))
- Adam ([Kingma & Ba, 2015](https://arxiv.org/pdf/1412.6980v8.pdf))
6. **Learning Rate Schedulers**. Common learning rate decay schedules.
- Constant
- Exponential decay
- Noam/Transformer scheduler ([Vaswani et al., 2017](https://arxiv.org/pdf/1706.03762.pdf))
- King/Dlib scheduler ([King, 2018](http://blog.dlib.net/2018/02/automatic-learning-rate-scheduling-that.html))
6. **Initializers**. Common weight initialization strategies.
- Glorot/Xavier uniform and normal ([Glorot & Bengio, 2010](http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf))
- He/Kaiming uniform and normal ([He et al., 2015](https://arxiv.org/pdf/1502.01852v1.pdf))
- Standard normal
- Truncated normal
7. **Modules**. Common multi-layer blocks that appear across many deep networks.
Includes:
- Bidirectional LSTMs ([Schuster & Paliwal, 1997](https://pdfs.semanticscholar.org/4b80/89bc9b49f84de43acc2eb8900035f7d492b2.pdf))
- ResNet-style "identity" (i.e., `same`-convolution) residual blocks ([He et al., 2015](https://arxiv.org/pdf/1512.03385.pdf))
- ResNet-style "convolutional" (i.e., parametric) residual blocks ([He et al., 2015](https://arxiv.org/pdf/1512.03385.pdf))
- WaveNet-style residual block with dilated causal convolutions ([van den Oord et al., 2016](https://arxiv.org/pdf/1609.03499.pdf))
- Transformer-style multi-headed dot-product attention ([Vaswani et al., 2017](https://arxiv.org/pdf/1706.03762.pdf))
8. **Models**. Well-known network architectures. Includes:
- `vae.py`: Bernoulli variational autoencoder ([Kingma & Welling, 2014](https://arxiv.org/abs/1312.6114))
- `wgan_gp.py`: Wasserstein generative adversarial network with gradient
penalty ([Gulrajani et al., 2017](https://arxiv.org/pdf/1704.00028.pdf);
[Goodfellow et al., 2014](https://papers.nips.cc/paper/5423-generative-adversarial-nets.pdf))
- `w2v.py`: word2vec model with CBOW and skip-gram architectures and
training via noise contrastive estimation ([Mikolov et al., 2013](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf))
8. **Utils**. Common helper functions, primarily for dealing with CNNs.
Includes:
- `im2col`
- `col2im`
- `conv1D`
- `conv2D`
- `dilate`
- `deconv2D`
- `minibatch`
- Various weight initialization utilities
- Various padding and convolution arithmetic utilities
================================================
FILE: numpy_ml/neural_nets/__init__.py
================================================
"""A module of basic building blocks for constructing neural networks"""
from . import utils
from . import losses
from . import activations
from . import schedulers
from . import optimizers
from . import wrappers
from . import layers
from . import initializers
from . import modules
from . import models
================================================
FILE: numpy_ml/neural_nets/activations/README.md
================================================
# Activation Functions
The `activations` module implements several common activation functions:
- Rectified linear units (ReLU) ([Hahnloser et al., 2000](http://invibe.net/biblio_database_dyva/woda/data/att/6525.file.pdf))
- Leaky rectified linear units
([Maas, Hannun, & Ng, 2013](https://ai.stanford.edu/~amaas/papers/relu_hybrid_icml2013_final.pdf))
- Exponential linear units ([Clevert, Unterthiner, & Hochreiter, 2016](https://arxiv.org/pdf/1511.07289.pdf))
- Scaled exponential linear units ([Klambauer, Unterthiner, & Mayr, 2017](https://arxiv.org/pdf/1706.02515.pdf))
- Softplus units
- Hard sigmoid units
- Exponential units
- Hyperbolic tangent (tanh)
- Logistic sigmoid
- Affine
## Plots
================================================
FILE: numpy_ml/neural_nets/activations/__init__.py
================================================
from .activations import *
================================================
FILE: numpy_ml/neural_nets/activations/activations.py
================================================
"""A collection of activation function objects for building neural networks"""
from math import erf
from abc import ABC, abstractmethod
import numpy as np
class ActivationBase(ABC):
    """Abstract interface implemented by every activation function object."""

    def __init__(self, **kwargs):
        """Set up the base object. Keyword arguments are accepted but unused."""
        super().__init__()

    def __call__(self, z):
        """
        Evaluate the activation on `z`.

        A one-dimensional input is first promoted to a single-row 2D array;
        any other input is handed to :meth:`fn` untouched.
        """
        batch = z.reshape(1, -1) if z.ndim == 1 else z
        return self.fn(batch)

    @abstractmethod
    def fn(self, z):
        """Evaluate the activation function on the elements of `z`."""
        raise NotImplementedError

    @abstractmethod
    def grad(self, x, **kwargs):
        """Evaluate the first derivative of the activation at `x`."""
        raise NotImplementedError
class Sigmoid(ActivationBase):
    def __init__(self):
        """Logistic sigmoid activation."""
        super().__init__()

    def __str__(self):
        """Name of the activation, as accepted by the string initializers."""
        return "Sigmoid"

    def fn(self, z):
        r"""
        Apply the logistic sigmoid, :math:`\sigma`, elementwise to `z`.

        .. math::

            \sigma(x_i) = \frac{1}{1 + e^{-x_i}}
        """
        denominator = 1 + np.exp(-z)
        return 1 / denominator

    def grad(self, x):
        r"""
        First derivative of the logistic sigmoid at each element of `x`.

        .. math::

            \frac{\partial \sigma}{\partial x_i} = \sigma(x_i) (1 - \sigma(x_i))
        """
        sig = self.fn(x)
        return sig * (1 - sig)

    def grad2(self, x):
        r"""
        Second derivative of the logistic sigmoid at each element of `x`.

        .. math::

            \frac{\partial^2 \sigma}{\partial x_i^2} =
                \frac{\partial \sigma}{\partial x_i} (1 - 2 \sigma(x_i))
        """
        sig = self.fn(x)
        first_deriv = sig * (1 - sig)
        return first_deriv * (1 - 2 * sig)
class ReLU(ActivationBase):
    """
    Rectified linear unit (ReLU) activation.

    Notes
    -----
    "ReLU units can be fragile during training and can "die". For example, a
    large gradient flowing through a ReLU neuron could cause the weights to
    update in such a way that the neuron will never activate on any datapoint
    again. If this happens, then the gradient flowing through the unit will
    forever be zero from that point on. That is, the ReLU units can
    irreversibly die during training since they can get knocked off the data
    manifold.

    For example, you may find that as much as 40% of your network can be "dead"
    (i.e. neurons that never activate across the entire training dataset) if
    the learning rate is set too high. With a proper setting of the learning
    rate this is less frequently an issue." [*]_

    References
    ----------
    .. [*] Karpathy, A. "CS231n: Convolutional neural networks for visual recognition."
    """

    def __init__(self):
        super().__init__()

    def __str__(self):
        """Name of the activation, as accepted by the string initializers."""
        return "ReLU"

    def fn(self, z):
        r"""
        Apply the ReLU function elementwise to `z`.

        .. math::

            \text{ReLU}(z_i)
                &=  z_i \ \ \ \ &&\text{if }z_i > 0 \\
                &=  0 \ \ \ \ &&\text{otherwise}
        """
        lower, upper = 0, np.inf
        return np.clip(z, lower, upper)

    def grad(self, x):
        r"""
        First derivative of the ReLU function at each element of `x`,
        returned as an integer array.

        .. math::

            \frac{\partial \text{ReLU}}{\partial x_i}
                &=  1 \ \ \ \ &&\text{if }x_i > 0 \\
                &=  0   \ \ \ \ &&\text{otherwise}
        """
        is_positive = x > 0
        return is_positive.astype(int)

    def grad2(self, x):
        r"""
        Second derivative of the ReLU function at each element of `x`
        (identically zero).

        .. math::

            \frac{\partial^2 \text{ReLU}}{\partial x_i^2}  =  0
        """
        return np.zeros_like(x)
class LeakyReLU(ActivationBase):
    """
    A "leaky" rectified linear unit.

    Notes
    -----
    Leaky ReLUs [*]_ are designed to address the vanishing gradient problem in
    ReLUs by allowing a small non-zero gradient when `x` is negative.

    Parameters
    ----------
    alpha: float
        Activation slope when x < 0. Default is 0.3.

    References
    ----------
    .. [*] Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013). "Rectifier
       nonlinearities improve neural network acoustic models." *Proceedings of
       the 30th International Conference of Machine Learning, 30*.
    """

    def __init__(self, alpha=0.3):
        self.alpha = alpha
        super().__init__()

    def __str__(self):
        """String form of the activation, including its `alpha` value."""
        return "Leaky ReLU(alpha={})".format(self.alpha)

    def fn(self, z):
        r"""
        Apply the leaky ReLU elementwise to `z`. The input is copied, not
        modified in place.

        .. math::

            \text{LeakyReLU}(z_i)
                &=  z_i \ \ \ \ &&\text{if } z_i \geq 0 \\
                &=  \alpha z_i \ \ \ \ &&\text{if } z_i < 0
        """
        out = z.copy()
        negative = z < 0
        out[negative] = out[negative] * self.alpha
        return out

    def grad(self, x):
        r"""
        First derivative of the leaky ReLU at each element of `x`.

        .. math::

            \frac{\partial \text{LeakyReLU}}{\partial x_i}
                &=  1 \ \ \ \ &&\text{if } x_i \geq 0 \\
                &=  \alpha \ \ \ \ &&\text{if } x_i < 0
        """
        slopes = np.ones_like(x)
        slopes[x < 0] *= self.alpha
        return slopes

    def grad2(self, x):
        r"""
        Second derivative of the leaky ReLU at each element of `x`
        (identically zero).

        .. math::

            \frac{\partial^2 \text{LeakyReLU}}{\partial x_i^2}  =  0
        """
        return np.zeros_like(x)
class GELU(ActivationBase):
    def __init__(self, approximate=True):
        r"""
        A Gaussian error linear unit (GELU). [*]_

        Notes
        -----
        A ReLU alternative. GELU weights inputs by their value, rather than
        gates inputs by their sign, as in vanilla ReLUs.

        References
        ----------
        .. [*] Hendrycks, D., & Gimpel, K. (2016). "Bridging nonlinearities and
           stochastic regularizers with Gaussian error linear units." *CoRR*.

        Parameters
        ----------
        approximate : bool
            Whether to use a faster but less precise approximation to the Gauss
            error function when calculating the unit activation and gradient.
            Default is True.
        """
        # Honor the caller's choice (previously this was hard-coded to True).
        self.approximate = approximate
        super().__init__()

    def __str__(self):
        """Return a string representation of the activation function"""
        return f"GELU(approximate={self.approximate})"

    @staticmethod
    def _erf(x):
        """Elementwise Gauss error function; `math.erf` only accepts scalars."""
        return np.vectorize(erf, otypes=[float])(x)

    def fn(self, z):
        r"""
        Compute the GELU function on the elements of input `z`.

        .. math::

            \text{GELU}(z_i) = z_i P(Z \leq z_i) = z_i \Phi(z_i)
                = z_i \cdot \frac{1}{2}(1 + \text{erf}(z_i/\sqrt{2}))

        When `approximate` is True, :math:`\text{erf}(z_i/\sqrt{2})` is
        replaced by :math:`\tanh(\sqrt{2/\pi}(z_i + 0.044715 z_i^3))`.
        """
        pi, sqrt, tanh = np.pi, np.sqrt, np.tanh

        if self.approximate:
            return 0.5 * z * (1 + tanh(sqrt(2 / pi) * (z + 0.044715 * z ** 3)))
        return 0.5 * z * (1 + self._erf(z / sqrt(2)))

    def grad(self, x):
        r"""
        Evaluate the first derivative of the GELU function on the elements
        of input `x`.

        .. math::

            \frac{\partial \text{GELU}}{\partial x_i}  =
                \frac{1}{2} + \frac{1}{2} \text{erf}\left(\frac{x_i}{\sqrt{2}}\right) +
                \frac{x_i \, \text{erf}'(\frac{x_i}{\sqrt{2}})}{2 \sqrt{2}}

        where :math:`\text{erf}'(x) = \frac{2}{\sqrt{\pi}} \cdot \exp\{-x^2\}`.

        When `approximate` is True only the :math:`\text{erf}` term is replaced
        by its tanh approximation; the last term is still computed exactly.
        """
        pi, exp, sqrt, tanh = np.pi, np.exp, np.sqrt, np.tanh

        s = x / sqrt(2)
        erf_prime = (2 / sqrt(pi)) * exp(-(s ** 2))

        if self.approximate:
            erf_term = tanh(sqrt(2 / pi) * (x + 0.044715 * x ** 3))
        else:
            erf_term = self._erf(s)
        return 0.5 + 0.5 * erf_term + ((0.5 * x * erf_prime) / sqrt(2))

    def grad2(self, x):
        r"""
        Evaluate the second derivative of the GELU function on the elements
        of input `x`.

        .. math::

            \frac{\partial^2 \text{GELU}}{\partial x_i^2} =
                \frac{1}{2\sqrt{2}} \left[
                    2 \, \text{erf}'\left(\frac{x_i}{\sqrt{2}}\right) +
                    \frac{x_i}{\sqrt{2}} \text{erf}''\left(\frac{x_i}{\sqrt{2}}\right)
                \right]
                = \phi(x_i) (2 - x_i^2)

        where :math:`\text{erf}'(x) = \frac{2}{\sqrt{\pi}} \cdot \exp\{-x^2\}`,
        :math:`\text{erf}''(x) = \frac{-4x}{\sqrt{\pi}} \cdot \exp\{-x^2\}`, and
        :math:`\phi` is the standard normal density.
        """
        # d/dx [Phi(x) + x * phi(x)] = 2 * phi(x) + x * phi'(x), phi'(x) = -x * phi(x)
        std_normal_pdf = np.exp(-0.5 * x ** 2) / np.sqrt(2 * np.pi)
        return std_normal_pdf * (2 - x ** 2)
class Tanh(ActivationBase):
    def __init__(self):
        """Hyperbolic tangent activation."""
        super().__init__()

    def __str__(self):
        """Name of the activation, as accepted by the string initializers."""
        return "Tanh"

    def fn(self, z):
        """Apply the hyperbolic tangent elementwise to `z`."""
        return np.tanh(z)

    def grad(self, x):
        r"""
        First derivative of tanh at each element of `x`.

        .. math::

            \frac{\partial \tanh}{\partial x_i}  =  1 - \tanh(x_i)^2
        """
        th = np.tanh(x)
        return 1 - th ** 2

    def grad2(self, x):
        r"""
        Second derivative of tanh at each element of `x`.

        .. math::

            \frac{\partial^2 \tanh}{\partial x_i^2} =
                -2 \tanh(x_i) \left(\frac{\partial \tanh}{\partial x_i}\right)
        """
        th = np.tanh(x)
        first_deriv = 1 - th ** 2
        return -2 * th * first_deriv
class Affine(ActivationBase):
    def __init__(self, slope=1, intercept=0):
        """
        Affine (linear plus offset) activation.

        Parameters
        ----------
        slope: float
            Multiplier applied to the input. Default is 1.
        intercept: float
            Constant added after scaling. Default is 0.
        """
        self.slope = slope
        self.intercept = intercept
        super().__init__()

    def __str__(self):
        """String form of the activation, including slope and intercept."""
        return "Affine(slope={}, intercept={})".format(self.slope, self.intercept)

    def fn(self, z):
        r"""
        Apply the affine map elementwise to `z`.

        .. math::

            \text{Affine}(z_i)  =  \text{slope} \times z_i + \text{intercept}
        """
        scaled = self.slope * z
        return scaled + self.intercept

    def grad(self, x):
        r"""
        First derivative of the affine map: the slope, broadcast to the shape
        of `x`.

        .. math::

            \frac{\partial \text{Affine}}{\partial x_i}  =  \text{slope}
        """
        ones = np.ones_like(x)
        return self.slope * ones

    def grad2(self, x):
        r"""
        Second derivative of the affine map (identically zero).

        .. math::

            \frac{\partial^2 \text{Affine}}{\partial x_i^2}  =  0
        """
        return np.zeros_like(x)
class Identity(Affine):
    def __init__(self):
        """
        The identity activation, f(x) = x.

        Notes
        -----
        :class:`Identity` is simply :class:`Affine` with its slope fixed to 1
        and its intercept fixed to 0.
        """
        super().__init__(1, 0)

    def __str__(self):
        """Name of the activation, as accepted by the string initializers."""
        return "Identity"
class ELU(ActivationBase):
    def __init__(self, alpha=1.0):
        r"""
        An exponential linear unit (ELU).

        Notes
        -----
        ELUs are intended to address the fact that ReLUs are strictly nonnegative
        and thus have an average activation > 0, increasing the chances of internal
        covariate shift and slowing down learning. ELU units address this by (1)
        allowing negative values when :math:`x < 0`, which (2) are bounded by a value
        :math:`-\alpha`. Similar to :class:`LeakyReLU`, the negative activation
        values help to push the average unit activation towards 0. Unlike
        :class:`LeakyReLU`, however, the boundedness of the negative activation
        allows for greater robustness in the face of large negative values,
        allowing the function to avoid conveying the *degree* of "absence"
        (negative activation) in the input. [*]_

        Parameters
        ----------
        alpha : float
            Scale of the negative segment; the activation saturates at
            :math:`-\alpha` for large negative inputs. Default is 1.

        References
        ----------
        .. [*] Clevert, D. A., Unterthiner, T., Hochreiter, S. (2016). "Fast
           and accurate deep network learning by exponential linear units
           (ELUs)". *4th International Conference on Learning
           Representations*.
        """
        self.alpha = alpha
        super().__init__()

    def __str__(self):
        """Return a string representation of the activation function"""
        return "ELU(alpha={})".format(self.alpha)

    def fn(self, z):
        r"""
        Evaluate the ELU activation on the elements of input `z`.

        .. math::

            \text{ELU}(z_i)
                &=  z_i \ \ \ \ &&\text{if }z_i > 0 \\
                &=  \alpha (e^{z_i} - 1) \ \ \ \ &&\text{otherwise}
        """
        # Only non-positive entries use the exponential branch; clamping the
        # exponent at 0 avoids overflow warnings for values that `np.where`
        # would discard anyway. `expm1` keeps precision for z close to 0.
        negative_branch = self.alpha * np.expm1(np.minimum(z, 0))
        return np.where(z > 0, z, negative_branch)

    def grad(self, x):
        r"""
        Evaluate the first derivative of the ELU activation on the elements
        of input `x`.

        .. math::

            \frac{\partial \text{ELU}}{\partial x_i}
                &=  1 \ \ \ \ &&\text{if } x_i > 0 \\
                &=  \alpha e^{x_i} \ \ \ \ &&\text{otherwise}
        """
        negative_branch = self.alpha * np.exp(np.minimum(x, 0))
        return np.where(x > 0, np.ones_like(x), negative_branch)

    def grad2(self, x):
        r"""
        Evaluate the second derivative of the ELU activation on the elements
        of input `x`.

        .. math::

            \frac{\partial^2 \text{ELU}}{\partial x_i^2}
                &=  0 \ \ \ \ &&\text{if } x_i > 0 \\
                &=  \alpha e^{x_i} \ \ \ \ &&\text{otherwise}
        """
        # Use the same `x > 0` boundary as `fn` and `grad` so that x == 0 is
        # consistently treated as part of the exponential segment.
        negative_branch = self.alpha * np.exp(np.minimum(x, 0))
        return np.where(x > 0, np.zeros_like(x), negative_branch)
class Exponential(ActivationBase):
    def __init__(self):
        """Exponential (base e) activation."""
        super().__init__()

    def __str__(self):
        """Name of the activation, as accepted by the string initializers."""
        return "Exponential"

    def fn(self, z):
        r"""
        Apply the exponential elementwise to `z`.

        .. math::

            \text{Exponential}(z_i) = e^{z_i}
        """
        activation = np.exp(z)
        return activation

    def grad(self, x):
        r"""
        First derivative of the exponential activation at each element of `x`.

        .. math::

            \frac{\partial \text{Exponential}}{\partial x_i}  =  e^{x_i}
        """
        first_deriv = np.exp(x)
        return first_deriv

    def grad2(self, x):
        r"""
        Second derivative of the exponential activation at each element of `x`.

        .. math::

            \frac{\partial^2 \text{Exponential}}{\partial x_i^2}  =  e^{x_i}
        """
        second_deriv = np.exp(x)
        return second_deriv
class SELU(ActivationBase):
    r"""
    A scaled exponential linear unit (SELU).

    Notes
    -----
    SELU units, when used in conjunction with proper weight initialization and
    regularization techniques, encourage neuron activations to converge to
    zero-mean and unit variance without explicit use of e.g., batchnorm.

    For SELU units, the :math:`\alpha` and :math:`\text{scale}` values are
    constants chosen so that the mean and variance of the inputs are preserved
    between consecutive layers. As such the authors propose weights be
    initialized using Lecun-Normal initialization: :math:`w_{ij} \sim
    \mathcal{N}(0, 1 / \text{fan_in})`, and to use the dropout variant
    :math:`\alpha`-dropout during regularization. [*]_

    See the reference for more information (especially the appendix ;-) ).

    References
    ----------
    .. [*] Klambauer, G., Unterthiner, T., & Hochreiter, S. (2017).
       "Self-normalizing neural networks." *Advances in Neural Information
       Processing Systems, 30.*
    """

    def __init__(self):
        self.alpha = 1.6732632423543772848170429916717
        self.scale = 1.0507009873554804934193349852946
        self.elu = ELU(alpha=self.alpha)
        super().__init__()

    def __str__(self):
        """Return a string representation of the activation function"""
        return "SELU"

    def fn(self, z):
        r"""
        Evaluate the SELU activation on the elements of input `z`.

        .. math::

            \text{SELU}(z_i)  =  \text{scale} \times \text{ELU}(z_i, \alpha)

        which is simply

        .. math::

            \text{SELU}(z_i)
                &= \text{scale} \times z_i \ \ \ \ &&\text{if }z_i > 0 \\
                &= \text{scale} \times \alpha (e^{z_i} - 1) \ \ \ \ &&\text{otherwise}
        """
        return self.scale * self.elu.fn(z)

    def grad(self, x):
        r"""
        Evaluate the first derivative of the SELU activation on the elements
        of input `x`.

        .. math::

            \frac{\partial \text{SELU}}{\partial x_i}
                &=  \text{scale} \ \ \ \ &&\text{if } x_i > 0 \\
                &=  \text{scale} \times \alpha e^{x_i} \ \ \ \ &&\text{otherwise}
        """
        # Strict `x > 0` matches the formula above, `grad2`, and the boundary
        # used by the wrapped ELU, so x == 0 falls in the exponential segment.
        # The exponent is clamped at 0 because positive entries never use it.
        negative_branch = np.exp(np.minimum(x, 0)) * self.alpha * self.scale
        return np.where(x > 0, np.ones_like(x) * self.scale, negative_branch)

    def grad2(self, x):
        r"""
        Evaluate the second derivative of the SELU activation on the elements
        of input `x`.

        .. math::

            \frac{\partial^2 \text{SELU}}{\partial x_i^2}
                &=  0 \ \ \ \ &&\text{if } x_i > 0 \\
                &=  \text{scale} \times \alpha e^{x_i} \ \ \ \ &&\text{otherwise}
        """
        negative_branch = np.exp(np.minimum(x, 0)) * self.alpha * self.scale
        return np.where(x > 0, np.zeros_like(x), negative_branch)
class HardSigmoid(ActivationBase):
    def __init__(self):
        """
        "Hard" sigmoid activation.

        Notes
        -----
        A piecewise-linear stand-in for the logistic sigmoid that is cheaper
        to evaluate.
        """
        super().__init__()

    def __str__(self):
        """Name of the activation function."""
        return "Hard Sigmoid"

    def fn(self, z):
        r"""
        Apply the hard sigmoid elementwise to `z`.

        .. math::

            \text{HardSigmoid}(z_i)
                &= 0 \ \ \ \ &&\text{if }z_i < -2.5 \\
                &= 0.2 z_i + 0.5 \ \ \ \ &&\text{if }-2.5 \leq z_i \leq 2.5 \\
                &= 1 \ \ \ \ &&\text{if }z_i > 2.5
        """
        linear = (0.2 * z) + 0.5
        return np.clip(linear, 0.0, 1.0)

    def grad(self, x):
        r"""
        First derivative of the hard sigmoid at each element of `x`.

        .. math::

            \frac{\partial \text{HardSigmoid}}{\partial x_i}
                &=  0.2 \ \ \ \ &&\text{if } -2.5 \leq x_i \leq 2.5\\
                &=  0 \ \ \ \ &&\text{otherwise}
        """
        in_linear_region = (x >= -2.5) & (x <= 2.5)
        return np.where(in_linear_region, 0.2, 0)

    def grad2(self, x):
        r"""
        Second derivative of the hard sigmoid (identically zero).

        .. math::

            \frac{\partial^2 \text{HardSigmoid}}{\partial x_i^2} =  0
        """
        return np.zeros_like(x)
class SoftPlus(ActivationBase):
    def __init__(self):
        """
        A softplus activation function.

        Notes
        -----
        In contrast to :class:`ReLU`, the softplus activation is differentiable
        everywhere (including 0). It is, however, less computationally efficient to
        compute.

        The derivative of the softplus activation is the logistic sigmoid.
        """
        super().__init__()

    def __str__(self):
        """Return a string representation of the activation function"""
        return "SoftPlus"

    def fn(self, z):
        r"""
        Evaluate the softplus activation on the elements of input `z`.

        .. math::

            \text{SoftPlus}(z_i) = \log(1 + e^{z_i})
        """
        # log(e^0 + e^z) computed without forming e^z, so large `z` yields
        # approximately `z` instead of overflowing to inf.
        return np.logaddexp(0, z)

    def grad(self, x):
        r"""
        Evaluate the first derivative of the softplus activation on the elements
        of input `x`.

        .. math::

            \frac{\partial \text{SoftPlus}}{\partial x_i} = \frac{e^{x_i}}{1 + e^{x_i}}
        """
        # e^x / (1 + e^x) = 1 / (1 + e^-x) = exp(-log(1 + e^-x)); this form
        # never produces inf / inf (nan) for large |x|.
        return np.exp(-np.logaddexp(0, -x))

    def grad2(self, x):
        r"""
        Evaluate the second derivative of the softplus activation on the elements
        of input `x`.

        .. math::

            \frac{\partial^2 \text{SoftPlus}}{\partial x_i^2} =
                \frac{e^{x_i}}{(1 + e^{x_i})^2}
        """
        # e^x / (1 + e^x)^2 = sigmoid(x) * sigmoid(-x), evaluated in log space
        # so that both tails stay finite and accurate.
        return np.exp(-np.logaddexp(0, -x) - np.logaddexp(0, x))
================================================
FILE: numpy_ml/neural_nets/initializers/README.md
================================================
# Initializers
The `initializers.py` module contains objects for initializing optimizers,
activation functions, weight initializers, and learning rate schedulers from
strings or parameter dictionaries.
================================================
FILE: numpy_ml/neural_nets/initializers/__init__.py
================================================
from .initializers import *
================================================
FILE: numpy_ml/neural_nets/initializers/initializers.py
================================================
"""A module containing objects to instantiate various neural network components."""
import re
from functools import partial
from ast import literal_eval as _eval
import numpy as np
from ..optimizers import OptimizerBase, SGD, AdaGrad, RMSProp, Adam
from ..activations import (
ELU,
GELU,
SELU,
ReLU,
Tanh,
Affine,
Sigmoid,
Identity,
SoftPlus,
LeakyReLU,
Exponential,
HardSigmoid,
ActivationBase,
)
from ..schedulers import (
SchedulerBase,
ConstantScheduler,
ExponentialScheduler,
NoamScheduler,
KingScheduler,
)
from ..utils import (
he_normal,
he_uniform,
glorot_normal,
glorot_uniform,
truncated_normal,
)
class ActivationInitializer(object):
    def __init__(self, param=None):
        """
        A class for initializing activation functions. Valid `param` values
        are:
            (a) ``__str__`` representations of an `ActivationBase` instance
            (b) `ActivationBase` instance

        If `param` is `None`, return the identity function: f(X) = X
        """
        self.param = param

    def __call__(self):
        """
        Initialize activation function

        Raises
        ------
        ValueError
            If `param` is neither None, a string, nor an `ActivationBase`
            instance, or if it is a string that cannot be parsed.
        """
        param = self.param
        if param is None:
            act = Identity()
        elif isinstance(param, ActivationBase):
            act = param
        elif isinstance(param, str):
            act = self.init_from_str(param)
        else:
            raise ValueError("Unknown activation: {}".format(param))
        return act

    @staticmethod
    def _match_args(pattern, act_str):
        """Return the regex groups of `pattern` in `act_str`, or raise ValueError."""
        match = re.match(pattern, act_str)
        if match is None:
            raise ValueError("Unknown activation: {}".format(act_str))
        return match.groups()

    def init_from_str(self, act_str):
        """
        Initialize activation function from the `param` string

        Parameters
        ----------
        act_str : str
            Case-insensitive name or ``__str__`` representation of an
            activation, e.g. ``"relu"`` or ``"Leaky ReLU(alpha=0.3)"``.

        Raises
        ------
        ValueError
            If `act_str` does not name a known activation or its arguments
            cannot be parsed.
        """
        act_str = act_str.lower()
        if act_str == "relu":
            act_fn = ReLU()
        elif act_str == "tanh":
            act_fn = Tanh()
        elif act_str == "selu":
            act_fn = SELU()
        elif act_str == "sigmoid":
            act_fn = Sigmoid()
        elif act_str == "identity":
            act_fn = Identity()
        elif act_str in ("hardsigmoid", "hard sigmoid"):
            # "hard sigmoid" is what str(HardSigmoid()) lower-cases to
            act_fn = HardSigmoid()
        elif act_str == "softplus":
            act_fn = SoftPlus()
        elif act_str == "exponential":
            act_fn = Exponential()
        elif "affine" in act_str:
            r = r"affine\(slope=(.*), intercept=(.*)\)"
            slope, intercept = self._match_args(r, act_str)
            act_fn = Affine(float(slope), float(intercept))
        elif "leaky relu" in act_str:
            r = r"leaky relu\(alpha=(.*)\)"
            (alpha,) = self._match_args(r, act_str)
            act_fn = LeakyReLU(float(alpha))
        elif "gelu" in act_str:
            # must be tested before the "elu" branch: "gelu" contains "elu"
            r = r"gelu\(approximate=(.*)\)"
            (approx,) = self._match_args(r, act_str)
            act_fn = GELU(approximate=(approx == "true"))
        elif "elu" in act_str:
            r = r"elu\(alpha=(.*)\)"
            (alpha,) = self._match_args(r, act_str)
            act_fn = ELU(alpha=float(alpha))
        else:
            raise ValueError("Unknown activation: {}".format(act_str))
        return act_fn
class SchedulerInitializer(object):
    def __init__(self, param=None, lr=None):
        """
        A class for initializing learning rate schedulers. Valid `param` values
        are:
            (a) __str__ representations of `SchedulerBase` instances
            (b) `SchedulerBase` instances
            (c) Parameter dicts (e.g., as produced via the `summary` method in
                `LayerBase` instances)

        If `param` is `None`, return the ConstantScheduler with learning rate
        equal to `lr`.

        Raises
        ------
        ValueError
            If both `param` and `lr` are None.
        """
        if lr is None and param is None:
            raise ValueError("lr and param cannot both be `None`")

        self.lr = lr
        self.param = param

    def __call__(self):
        """
        Initialize scheduler

        Raises
        ------
        ValueError
            If `param` is not None, a `SchedulerBase`, a string, or a dict.
        """
        param = self.param
        if param is None:
            scheduler = ConstantScheduler(self.lr)
        elif isinstance(param, SchedulerBase):
            scheduler = param
        elif isinstance(param, str):
            scheduler = self.init_from_str()
        elif isinstance(param, dict):
            scheduler = self.init_from_dict()
        else:
            raise ValueError("Unknown scheduler: {}".format(param))
        return scheduler

    @staticmethod
    def _parse_kwargs(param_str):
        """
        Extract ``name=value`` pairs from a lower-cased scheduler string.

        Names may contain letters, digits, and underscores. Values are parsed
        as Python literals; because the string has been lower-cased, the
        tokens ``true``, ``false``, and ``none`` are mapped explicitly.
        """
        constants = {"true": True, "false": False, "none": None}
        kwargs = {}
        for name, raw in re.findall(r"(\w+)=([^,)]*)", param_str):
            raw = raw.strip()
            kwargs[name] = constants[raw] if raw in constants else _eval(raw)
        return kwargs

    def init_from_str(self):
        """
        Initialize scheduler from the param string

        Raises
        ------
        NotImplementedError
            If the string does not name a known scheduler.
        """
        sch_str = self.param.lower()
        kwargs = self._parse_kwargs(sch_str)

        if "constant" in sch_str:
            scheduler = ConstantScheduler(**kwargs)
        elif "exponential" in sch_str:
            scheduler = ExponentialScheduler(**kwargs)
        elif "noam" in sch_str:
            scheduler = NoamScheduler(**kwargs)
        elif "king" in sch_str:
            scheduler = KingScheduler(**kwargs)
        else:
            raise NotImplementedError("{}".format(sch_str))
        return scheduler

    def init_from_dict(self):
        """
        Initialize scheduler from the param dictionary

        Raises
        ------
        ValueError
            If the dictionary has no (or an empty) `hyperparameters` entry.
        NotImplementedError
            If the `id` hyperparameter does not name a known scheduler.
        """
        S = self.param
        sc = S.get("hyperparameters")

        if not sc:
            raise ValueError("Must have `hyperparameters` key: {}".format(S))

        sched_id = sc["id"]
        if sched_id == "ConstantScheduler":
            scheduler = ConstantScheduler()
        elif sched_id == "ExponentialScheduler":
            scheduler = ExponentialScheduler()
        elif sched_id == "NoamScheduler":
            scheduler = NoamScheduler()
        elif sched_id == "KingScheduler":
            # NOTE(review): assumes KingScheduler, like the schedulers above,
            # can be constructed without arguments -- confirm.
            scheduler = KingScheduler()
        else:
            raise NotImplementedError("{}".format(sched_id))
        scheduler.set_params(sc)
        return scheduler
class OptimizerInitializer(object):
    def __init__(self, param=None):
        """
        A class for initializing optimizers. Valid `param` values are:
            (a) __str__ representations of `OptimizerBase` instances
            (b) `OptimizerBase` instances
            (c) Parameter dicts (e.g., as produced via the `summary` method in
                `LayerBase` instances)

        If `param` is `None`, return the SGD optimizer with default parameters.
        """
        self.param = param

    def __call__(self):
        """
        Initialize the optimizer

        Raises
        ------
        ValueError
            If `param` is not None, an `OptimizerBase`, a string, or a dict.
        """
        param = self.param
        if param is None:
            opt = SGD()
        elif isinstance(param, OptimizerBase):
            opt = param
        elif isinstance(param, str):
            opt = self.init_from_str()
        elif isinstance(param, dict):
            opt = self.init_from_dict()
        else:
            raise ValueError("Unknown optimizer: {}".format(param))
        return opt

    @staticmethod
    def _parse_kwargs(param_str):
        """
        Extract ``name=value`` pairs from a lower-cased optimizer string.

        Names may contain letters, digits, and underscores. Plain values are
        parsed as Python literals; because the string has been lower-cased,
        ``true``, ``false``, and ``none`` are mapped explicitly. A value that
        is itself a call-like representation (e.g.
        ``lr_scheduler=constantscheduler(lr=0.01)``) is passed through as a
        string.
        """
        constants = {"true": True, "false": False, "none": None}
        kwargs = {}
        # first alternative: one level of nested `name(...)`; second: a literal
        pattern = r"(\w+)=(\w+\([^()]*\)|[^,)]*)"
        for name, raw in re.findall(pattern, param_str):
            raw = raw.strip()
            if raw in constants:
                kwargs[name] = constants[raw]
            elif raw.endswith(")"):
                # NOTE(review): presumably the optimizer accepts a scheduler
                # string for this argument -- verify against the optimizers.
                kwargs[name] = raw
            else:
                kwargs[name] = _eval(raw)
        return kwargs

    def init_from_str(self):
        """
        Initialize optimizer from the `param` string

        Raises
        ------
        NotImplementedError
            If the string does not name a known optimizer.
        """
        opt_str = self.param.lower()
        kwargs = self._parse_kwargs(opt_str)

        if "sgd" in opt_str:
            optimizer = SGD(**kwargs)
        elif "adagrad" in opt_str:
            optimizer = AdaGrad(**kwargs)
        elif "rmsprop" in opt_str:
            optimizer = RMSProp(**kwargs)
        elif "adam" in opt_str:
            optimizer = Adam(**kwargs)
        else:
            raise NotImplementedError("{}".format(opt_str))
        return optimizer

    def init_from_dict(self):
        """
        Initialize optimizer from the `param` dictionary

        Raises
        ------
        ValueError
            If the dictionary has no (or an empty) `hyperparameters` entry.
        NotImplementedError
            If the `id` hyperparameter does not name a known optimizer.
        """
        D = self.param
        cc = D.get("cache")
        op = D.get("hyperparameters")

        if not op:
            raise ValueError("`param` dictionary has no `hyperparameters` key")

        opt_id = op["id"]
        if opt_id == "SGD":
            optimizer = SGD()
        elif opt_id == "RMSProp":
            optimizer = RMSProp()
        elif opt_id == "AdaGrad":
            optimizer = AdaGrad()
        elif opt_id == "Adam":
            optimizer = Adam()
        else:
            raise NotImplementedError("{}".format(opt_id))
        optimizer.set_params(op, cc)
        return optimizer
class WeightInitializer(object):
    def __init__(self, act_fn_str, mode="glorot_uniform"):
        """
        A factory for weight initializers.

        Parameters
        ----------
        act_fn_str : str
            The string representation for the layer activation function
        mode : str (default: 'glorot_uniform')
            The weight initialization strategy. Valid entries are {"he_normal",
            "he_uniform", "glorot_normal", "glorot_uniform", "std_normal",
            "trunc_normal"}

        Raises
        ------
        ValueError
            If `mode` is not one of the valid entries.
        """
        if mode not in [
            "he_normal",
            "he_uniform",
            "glorot_normal",
            "glorot_uniform",
            "std_normal",
            "trunc_normal",
        ]:
            raise ValueError("Unrecognize initialization mode: {}".format(mode))

        self.mode = mode
        self.act_fn = act_fn_str

        if mode == "glorot_uniform":
            self._fn = glorot_uniform
        elif mode == "glorot_normal":
            self._fn = glorot_normal
        elif mode == "he_uniform":
            self._fn = he_uniform
        elif mode == "he_normal":
            self._fn = he_normal
        elif mode == "std_normal":
            self._fn = np.random.randn
        elif mode == "trunc_normal":
            self._fn = partial(truncated_normal, mean=0, std=1)

    def __call__(self, weight_shape):
        """
        Initialize weights according to the specified strategy

        Parameters
        ----------
        weight_shape : tuple
            Shape of the weight array to create.
        """
        if "glorot" in self.mode:
            # Glorot schemes are additionally scaled by an activation-specific gain
            gain = self._calc_glorot_gain()
            W = self._fn(weight_shape, gain)
        elif self.mode == "std_normal":
            # `np.random.randn` takes the dimensions as separate arguments
            W = self._fn(*weight_shape)
        else:
            W = self._fn(weight_shape)
        return W

    def _calc_glorot_gain(self):
        """
        Return the Glorot gain for the layer activation (1.0 if the
        activation has no dedicated value).

        Values from:
        https://pytorch.org/docs/stable/nn.html?#torch.nn.init.calculate_gain
        """
        gain = 1.0
        act_str = self.act_fn.lower()
        if act_str == "tanh":
            gain = 5.0 / 3.0
        elif act_str == "relu":
            gain = np.sqrt(2)
        elif "leaky relu" in act_str:
            r = r"leaky relu\(alpha=(.*)\)"
            alpha = re.match(r, act_str).groups()[0]
            # gain = sqrt(2 / (1 + negative_slope^2)); the parentheses matter
            gain = np.sqrt(2 / (1 + float(alpha) ** 2))
        return gain
================================================
FILE: numpy_ml/neural_nets/layers/README.md
================================================
# Layers
The `layers.py` module implements common layers / layer-wise operations that can
be composed to create larger neural networks. It includes:
- Fully-connected layers
- Sparse evolutionary layers ([Mocanu et al., 2018](https://www.nature.com/articles/s41467-018-04316-3))
- Dot-product attention layers ([Luong, Pham, & Manning, 2015](https://arxiv.org/pdf/1508.04025.pdf); [Vaswani et al., 2017](https://arxiv.org/pdf/1706.03762.pdf))
- 1D and 2D convolution (with stride, padding, and dilation) layers ([van den Oord et al., 2016](https://arxiv.org/pdf/1609.03499.pdf); [Yu & Koltun, 2016](https://arxiv.org/pdf/1511.07122.pdf))
- 2D "deconvolution" (with stride and padding) layers ([Zeiler et al., 2010](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf))
- Restricted Boltzmann machines (with CD-_n_ training) ([Smolensky, 1986](http://stanford.edu/~jlmcc/papers/PDP/Volume%201/Chap6_PDP86.pdf); [Carreira-Perpiñán & Hinton, 2005](http://www.cs.toronto.edu/~fritz/absps/cdmiguel.pdf))
- Elementwise multiplication operation
- Summation operation
- Flattening operation
- Embedding layer
- Softmax layer
- Max & average pooling layer
- 1D and 2D batch normalization layers ([Ioffe & Szegedy, 2015](http://proceedings.mlr.press/v37/ioffe15.pdf))
- 1D and 2D layer normalization layers ([Ba, Kiros, & Hinton, 2016](https://arxiv.org/pdf/1607.06450.pdf))
- Recurrent layers ([Elman, 1990](https://crl.ucsd.edu/~elman/Papers/fsit.pdf))
- Long short-term memory (LSTM) layers ([Hochreiter & Schmidhuber, 1997](http://www.bioinf.jku.at/publications/older/2604.pdf))
================================================
FILE: numpy_ml/neural_nets/layers/__init__.py
================================================
from .layers import *
================================================
FILE: numpy_ml/neural_nets/layers/layers.py
================================================
"""A collection of composable layer objects for building neural networks"""
from abc import ABC, abstractmethod
import numpy as np
from ..wrappers import init_wrappers, Dropout
from ..initializers import (
WeightInitializer,
OptimizerInitializer,
ActivationInitializer,
)
from ..utils import (
pad1D,
pad2D,
conv1D,
conv2D,
im2col,
col2im,
dilate,
deconv2D_naive,
calc_pad_dims_2D,
)
class LayerBase(ABC):
def __init__(self, optimizer=None):
"""An abstract base class inherited by all neural network layers"""
self.X = []
self.act_fn = None
self.trainable = True
self.optimizer = OptimizerInitializer(optimizer)()
self.gradients = {}
self.parameters = {}
self.derived_variables = {}
super().__init__()
@abstractmethod
def _init_params(self, **kwargs):
raise NotImplementedError
@abstractmethod
def forward(self, z, **kwargs):
"""Perform a forward pass through the layer"""
raise NotImplementedError
@abstractmethod
def backward(self, out, **kwargs):
"""Perform a backward pass through the layer"""
raise NotImplementedError
def freeze(self):
"""
Freeze the layer parameters at their current values so they can no
longer be updated.
"""
self.trainable = False
def unfreeze(self):
"""Unfreeze the layer parameters so they can be updated."""
self.trainable = True
def flush_gradients(self):
"""Erase all the layer's derived variables and gradients."""
assert self.trainable, "Layer is frozen"
self.X = []
for k, v in self.derived_variables.items():
self.derived_variables[k] = []
for k, v in self.gradients.items():
self.gradients[k] = np.zeros_like(v)
def update(self, cur_loss=None):
"""
Update the layer parameters using the accrued gradients and layer
optimizer. Flush all gradients once the update is complete.
"""
assert self.trainable, "Layer is frozen"
self.optimizer.step()
for k, v in self.gradients.items():
if k in self.parameters:
self.parameters[k] = self.optimizer(self.parameters[k], v, k, cur_loss)
self.flush_gradients()
def set_params(self, summary_dict):
"""
Set the layer parameters from a dictionary of values.
Parameters
----------
summary_dict : dict
A dictionary of layer parameters and hyperparameters. If a required
parameter or hyperparameter is not included within `summary_dict`,
this method will use the value in the current layer's
:meth:`summary` method.
Returns
-------
layer : :doc:`Layer ` object
The newly-initialized layer.
"""
layer, sd = self, summary_dict
# collapse `parameters` and `hyperparameters` nested dicts into a single
# merged dictionary
flatten_keys = ["parameters", "hyperparameters"]
for k in flatten_keys:
if k in sd:
entry = sd[k]
sd.update(entry)
del sd[k]
for k, v in sd.items():
if k in self.parameters:
layer.parameters[k] = v
if k in self.hyperparameters:
if k == "act_fn":
layer.act_fn = ActivationInitializer(v)()
elif k == "optimizer":
layer.optimizer = OptimizerInitializer(sd[k])()
elif k == "wrappers":
layer = init_wrappers(layer, sd[k])
elif k not in ["wrappers", "optimizer"]:
setattr(layer, k, v)
return layer
def summary(self):
    """
    Describe the layer as a dictionary.

    Returns
    -------
    info : dict
        A dictionary with three entries: ``"layer"`` (the layer name taken
        from the hyperparameters), ``"parameters"`` (the layer's parameter
        dict), and ``"hyperparameters"`` (the layer's hyperparameter dict).
    """
    info = {"layer": self.hyperparameters["layer"]}
    info["parameters"] = self.parameters
    info["hyperparameters"] = self.hyperparameters
    return info
class DotProductAttention(LayerBase):
    def __init__(self, scale=True, dropout_p=0, init="glorot_uniform", optimizer=None):
        r"""
        A single attention "head" that scores queries against keys with a
        dot product.

        Notes
        -----
        The layer computes

        .. math::

            \mathbf{Z} &= \mathbf{Q K}^\top \ \ \ \ &&\text{if scale = False} \\
                       &= \mathbf{Q K}^\top / \sqrt{d_k} \ \ \ \ &&\text{if scale = True} \\
            \mathbf{Y} &= \text{dropout}(\text{softmax}(\mathbf{Z})) \mathbf{V}

        Parameters
        ----------
        scale : bool
            Whether to divide the query-key dot products by the square root
            of the query/key dimensionality before the softmax. This keeps
            the magnitude of the scores from growing with the vector
            dimension. Default is True.
        dropout_p : float in [0, 1)
            The dropout probability used during training, applied to the
            output of the softmax. If 0, no dropout is applied. Default is 0.
        init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}
            The weight initialization strategy. Default is
            `'glorot_uniform'`. Unused: the layer has no weights.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy passed to the base class. Default is
            None. Unused: the layer has no parameters to update.

        Attributes
        ----------
        X : list
            Running list of ``(Q, K, V)`` inputs passed to :meth:`forward`
            with `retain_derived` set to True.
        gradients : dict
            Unused
        parameters : dict
            Unused
        hyperparameters : dict
            Dictionary of layer hyperparameters
        derived_variables : dict
            Dictionary of intermediate values computed during
            forward/backward propagation.
        """  # noqa: E501
        super().__init__(optimizer)

        self.init = init
        self.scale = scale
        self.dropout_p = dropout_p
        self._init_params()

    def _init_params(self):
        """Build the dropout-wrapped softmax and reset the layer's dicts."""
        self.softmax = Dropout(Softmax(), self.dropout_p)
        wrapper_vars = self.softmax.derived_variables["wrappers"][0]

        self.gradients = {}
        self.parameters = {}
        self.derived_variables = {
            "attention_weights": [],
            # shared with the dropout wrapper rather than copied
            "dropout_mask": wrapper_vars["dropout_mask"],
        }

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        optimizer_info = {
            "cache": self.optimizer.cache,
            "hyperparameters": self.optimizer.hyperparameters,
        }
        return {
            "layer": "DotProductAttention",
            "init": self.init,
            "scale": self.scale,
            "dropout_p": self.dropout_p,
            "optimizer": optimizer_info,
        }

    def freeze(self):
        """
        Freeze the layer, together with its internal softmax, so that it can
        no longer be updated.
        """
        self.trainable = False
        self.softmax.freeze()

    def unfreeze(self):
        """Unfreeze the layer and its internal softmax."""
        self.trainable = True
        self.softmax.unfreeze()

    def forward(self, Q, K, V, retain_derived=True):
        r"""
        Compute the attention-weighted output for a collection of queries,
        keys, and values.

        Notes
        -----
        Informally:

        - Query vectors ask questions
        - Key vectors advertise their relevancy to questions
        - Value vectors give possible answers to questions
        - The dot product between Key and Query vectors provides scores for
          each of the `n_ex` different Value vectors

        For a single query and `n` key-value pairs, dot-product attention
        (with scaling) is::

            w0 = dropout(softmax( (query @ key[0]) / sqrt(d_k) ))
            w1 = dropout(softmax( (query @ key[1]) / sqrt(d_k) ))
                          ...
            wn = dropout(softmax( (query @ key[n]) / sqrt(d_k) ))

            y = np.array([w0, ..., wn]) @ values
              (1 × n_ex)                  (n_ex × d_v)

        That is, each key-query dot product is a score, the softmax turns
        the scores into weights over the value vectors, and the output is
        the weighted sum of the value vectors. In vectorized form,

        .. math::

            \mathbf{Y} = \text{dropout}(
                \text{softmax}(\mathbf{Q K}^\top / \sqrt{d_k})
            ) \mathbf{V}

        Parameters
        ----------
        Q : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, *, d_k)`
            A set of `n_ex` query vectors packed into a single matrix.
            Optional middle dimensions can be used to specify, e.g., the
            number of parallel attention heads.
        K : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, *, d_k)`
            A set of `n_ex` key vectors, laid out like `Q`.
        V : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, *, d_v)`
            A set of `n_ex` value vectors, laid out like `Q`.
        retain_derived : bool
            Whether to store the inputs and attention weights for use
            during backprop. Default is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, *, d_v)`
            The attention-weighted output values
        """
        Y, attn = self._fwd(Q, K, V)

        if retain_derived:
            self.X.append((Q, K, V))
            self.derived_variables["attention_weights"].append(attn)

        return Y

    def _fwd(self, Q, K, V):
        """Compute the layer output and the attention weights."""
        factor = 1 / np.sqrt(Q.shape[-1]) if self.scale else 1

        K_t = K.swapaxes(-2, -1)
        scores = (Q @ K_t) * factor  # attention scores
        attn = self.softmax.forward(scores)  # attention weights
        return attn @ V, attn

    def backward(self, dLdy, retain_grads=True):
        r"""
        Backprop from layer outputs to inputs.

        Parameters
        ----------
        dLdy : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, *, d_v)` or list of arrays
            The gradient of the loss wrt. the layer output(s) `Y`.
        retain_grads : bool
            Accepted for interface compatibility. The layer has no
            parameter gradients, so the value is not used. Default is True.

        Returns
        -------
        dQ : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, *, d_k)` or list of arrays
            The gradient of the loss wrt. the layer query matrix/matrices `Q`.
        dK : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, *, d_k)` or list of arrays
            The gradient of the loss wrt. the layer key matrix/matrices `K`.
        dV : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, *, d_v)` or list of arrays
            The gradient of the loss wrt. the layer value matrix/matrices `V`.

        Raises
        ------
        AssertionError
            If the layer is frozen.
        """  # noqa: E501
        assert self.trainable, "Layer is frozen"

        if not isinstance(dLdy, list):
            dLdy = [dLdy]

        dQ, dK, dV = [], [], []
        cached_weights = self.derived_variables["attention_weights"]

        # pair each upstream gradient with the inputs/weights cached for it
        for grad_out, inputs, attn in zip(dLdy, self.X, cached_weights):
            q, k, v = inputs
            dq, dk, dv = self._bwd(grad_out, q, k, v, attn)
            dQ.append(dq)
            dK.append(dk)
            dV.append(dv)

        # unwrap the lists when only a single input has been cached
        if len(self.X) == 1:
            return dQ[0], dK[0], dV[0]
        return dQ, dK, dV

    def _bwd(self, dy, q, k, v, weights):
        """Compute the gradient of the loss wrt. q, k, and v."""
        factor = 1 / np.sqrt(k.shape[-1]) if self.scale else 1

        dV = weights.swapaxes(-2, -1) @ dy
        dWeights = dy @ v.swapaxes(-2, -1)
        dScores = self.softmax.backward(dWeights)

        dQ = (dScores @ k) * factor
        dK = (dScores.swapaxes(-2, -1) @ q) * factor
        return dQ, dK, dV
class RBM(LayerBase):
    def __init__(self, n_out, K=1, init="glorot_uniform", optimizer=None):
        """
        A Restricted Boltzmann machine with Bernoulli visible and hidden units.

        Parameters
        ----------
        n_out : int
            The number of output dimensions/units.
        K : int
            The number of contrastive divergence steps to run before computing
            a single gradient update. Default is 1.
        init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}
            The weight initialization strategy. Default is `'glorot_uniform'`.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method. Default is None.

        Attributes
        ----------
        X : list
            Unused
        gradients : dict
            Dictionary of loss gradients with regard to the layer parameters
        parameters : dict
            Dictionary of layer parameters
        hyperparameters : dict
            Dictionary of layer hyperparameters
        derived_variables : dict
            Dictionary of any intermediate values computed during
            forward/backward propagation.
        """  # noqa: E501
        super().__init__(optimizer)

        self.K = K  # CD-K
        self.init = init
        self.n_in = None
        self.n_out = n_out
        self.is_initialized = False
        self.act_fn_V = ActivationInitializer("Sigmoid")()
        self.act_fn_H = ActivationInitializer("Sigmoid")()
        self.parameters = {"W": None, "b_in": None, "b_out": None}

        # The weights cannot be allocated here: `n_in` is only known once the
        # first batch arrives, so `forward` calls `_init_params` lazily.
        # Allocating now would fail on `np.zeros((1, None))`.

    def _init_params(self):
        """Allocate parameters, gradients, and derived variables (needs `n_in`)."""
        init_weights = WeightInitializer(str(self.act_fn_V), mode=self.init)

        b_in = np.zeros((1, self.n_in))
        b_out = np.zeros((1, self.n_out))
        W = init_weights((self.n_in, self.n_out))

        self.parameters = {"W": W, "b_in": b_in, "b_out": b_out}

        self.gradients = {
            "W": np.zeros_like(W),
            "b_in": np.zeros_like(b_in),
            "b_out": np.zeros_like(b_out),
        }

        self.derived_variables = {
            "V": None,
            "p_H": None,
            "p_V_prime": None,
            "p_H_prime": None,
            "positive_grad": None,
            "negative_grad": None,
        }

        self.is_initialized = True

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        return {
            "layer": "RBM",
            "K": self.K,
            "n_in": self.n_in,
            "n_out": self.n_out,
            "init": self.init,
            "optimizer": {
                "cache": self.optimizer.cache,
                "hyperparameters": self.optimizer.hyperparameters,
            },
        }

    def CD_update(self, X):
        """
        Perform a single contrastive divergence-`k` training update using the
        visible inputs `X` as a starting point for the Gibbs sampler.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)`
            Layer input, representing the `n_in`-dimensional features for a
            minibatch of `n_ex` examples. Each feature in X should ideally be
            binary-valued, although it is possible to also train on real-valued
            features ranging between (0, 1) (e.g., grayscale images).
        """
        self.forward(X)
        self.backward()

    def forward(self, V, K=None, retain_derived=True):
        """
        Perform the CD-`k` "forward pass" of visible inputs into hidden units
        and back.

        Notes
        -----
        This implementation follows [1]_'s recommendations for the RBM forward
        pass:

            - Use real-valued probabilities for both the data and the visible
              unit reconstructions.
            - Only the final update of the hidden units should use the actual
              probabilities -- all others should be sampled binary states.
            - When collecting the pairwise statistics for learning weights or
              the individual statistics for learning biases, use the
              probabilities, not the binary states.

        References
        ----------
        .. [1] Hinton, G. (2010). "A practical guide to training restricted
           Boltzmann machines". *UTML TR 2010-003*

        Parameters
        ----------
        V : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)`
            Visible input, representing the `n_in`-dimensional features for a
            minibatch of `n_ex` examples. Each feature in V should ideally be
            binary-valued, although it is possible to also train on real-valued
            features ranging between (0, 1) (e.g., grayscale images).
        K : int
            The number of contrastive divergence steps to run before
            computing the gradient update. Must be at least 1. If None, use
            ``self.K``. Default is None.
        retain_derived : bool
            Whether to store the quantities computed here in
            ``derived_variables`` for use by :meth:`backward`. Default is
            True.

        Raises
        ------
        ValueError
            If the effective number of CD steps is smaller than 1.
        """
        if not self.is_initialized:
            self.n_in = V.shape[1]
            self._init_params()

        # override self.K if necessary
        K = self.K if K is None else K
        if K < 1:
            raise ValueError("K must be a positive integer, got {}".format(K))

        W = self.parameters["W"]
        b_in = self.parameters["b_in"]
        b_out = self.parameters["b_out"]

        # compute hidden unit probabilities
        p_H = self.act_fn_H.fn(V @ W + b_out)

        # sample hidden states (stochastic binary values)
        H_prime = (np.random.rand(*p_H.shape) <= p_H).astype(float)

        # always use probabilities when computing gradients
        positive_grad = V.T @ p_H

        # perform CD-k
        # TODO: use persistent CD-k
        # https://www.cs.toronto.edu/~tijmen/pcd/pcd.pdf
        for step in range(K):
            # resample v' given h (H_prime is binary at this point); the
            # visible units are never sampled -- raw probabilities are used
            p_V_prime = self.act_fn_V.fn(H_prime @ W.T + b_in)

            # compute p(h' | v')
            p_H_prime = self.act_fn_H.fn(p_V_prime @ W + b_out)

            # Sample binary hidden states for every step but the last one.
            # The comparison uses the effective `K`, not `self.K`, so that an
            # explicit override (e.g. from `reconstruct`) is respected.
            if step != K - 1:
                H_prime = np.random.rand(*p_H_prime.shape) <= p_H_prime
                H_prime = H_prime.astype(float)

        negative_grad = p_V_prime.T @ p_H_prime

        if retain_derived:
            self.derived_variables["V"] = V
            self.derived_variables["p_H"] = p_H
            self.derived_variables["p_V_prime"] = p_V_prime
            self.derived_variables["p_H_prime"] = p_H_prime
            self.derived_variables["positive_grad"] = positive_grad
            self.derived_variables["negative_grad"] = negative_grad

    def backward(self, retain_grads=True, *args):
        """
        Compute the contrastive divergence gradients from the quantities
        stored by the most recent :meth:`forward` call.

        Parameters
        ----------
        retain_grads : bool
            Whether to store the computed gradients in ``self.gradients``
            for use in the next parameter update. Default is True.
        """
        V = self.derived_variables["V"]
        p_H = self.derived_variables["p_H"]
        p_V_prime = self.derived_variables["p_V_prime"]
        p_H_prime = self.derived_variables["p_H_prime"]
        positive_grad = self.derived_variables["positive_grad"]
        negative_grad = self.derived_variables["negative_grad"]

        if retain_grads:
            # Reduce the per-example bias statistics over the batch so the
            # gradients have the same (1, n) shape as the biases themselves.
            # The weight statistics are already summed over examples by the
            # matrix products in `forward`.
            # NOTE(review): all three are "data minus model" statistics --
            # confirm this matches the sign convention of the optimizer.
            self.gradients["b_in"] = (V - p_V_prime).sum(axis=0, keepdims=True)
            self.gradients["b_out"] = (p_H - p_H_prime).sum(axis=0, keepdims=True)
            self.gradients["W"] = positive_grad - negative_grad

    def reconstruct(self, X, n_steps=10, return_prob=False):
        """
        Reconstruct an input `X` by running the trained Gibbs sampler for
        `n_steps`-worth of CD-`k`.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)`
            Layer input, representing the `n_in`-dimensional features for a
            minibatch of `n_ex` examples. Each feature in `X` should ideally be
            binary-valued, although it is possible to also train on real-valued
            features ranging between (0, 1) (e.g., grayscale images). If `X` has
            missing values, it may be sufficient to mark them with random
            entries and allow the reconstruction to impute them.
        n_steps : int
            The number of Gibbs sampling steps to perform when generating the
            reconstruction. Default is 10.
        return_prob : bool
            Whether to return the real-valued feature probabilities for the
            reconstruction or the binary samples. Default is False.

        Returns
        -------
        V : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)`
            The reconstruction (or feature probabilities if `return_prob` is
            true) of the visual input `X` after running the Gibbs sampler for
            `n_steps`.
        """
        self.forward(X, K=n_steps)
        p_V_prime = self.derived_variables["p_V_prime"]

        # ignore the gradients produced during this reconstruction
        self.flush_gradients()

        # sample V_prime reconstruction if return_prob is False
        V = p_V_prime
        if not return_prob:
            V = (np.random.rand(*p_V_prime.shape) <= p_V_prime).astype(float)
        return V
#######################################################################
# Layer Ops #
#######################################################################
class Add(LayerBase):
    def __init__(self, act_fn=None, optimizer=None):
        """
        A layer that sums its inputs elementwise and passes the result
        through an optional nonlinearity.

        Parameters
        ----------
        act_fn : str, :doc:`Activation <numpy_ml.neural_nets.activations>` object, or None
            The element-wise output nonlinearity used in computing the final
            output. If None, use the identity function :math:`f(x) = x`.
            Default is None.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy passed to the base class. Default is
            None.

        Attributes
        ----------
        X : list
            Running list of inputs passed to :meth:`forward` with
            `retain_derived` set to True.
        gradients : dict
            Unused
        parameters : dict
            Unused
        hyperparameters : dict
            Dictionary of layer hyperparameters
        derived_variables : dict
            Dictionary of any intermediate values computed during
            forward/backward propagation.
        """  # noqa: E501
        super().__init__(optimizer)
        self.act_fn = ActivationInitializer(act_fn)()
        self._init_params()

    def _init_params(self):
        """Reset the (empty) parameter dicts and the cache of pre-activation sums."""
        self.gradients = {}
        self.parameters = {}
        self.derived_variables = {"sum": []}

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        optimizer_info = {
            "cache": self.optimizer.cache,
            "hyperparameters": self.optimizer.hyperparameters,
        }
        # NOTE(review): the reported layer name is "Sum" although the class
        # is called `Add` -- confirm that consumers of this value expect it.
        return {
            "layer": "Sum",
            "act_fn": str(self.act_fn),
            "optimizer": optimizer_info,
        }

    def forward(self, X, retain_derived=True):
        r"""
        Compute the layer output on a single minibatch.

        Parameters
        ----------
        X : list of length `n_inputs`
            A list of tensors, all of the same shape.
        retain_derived : bool
            Whether to store the inputs and their sum for use during
            backprop. Default is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, *)`
            The activation applied to the elementwise sum of the inputs.
        """
        # copy the first input so the in-place accumulation leaves it intact
        total = X[0].copy()
        for term in X[1:]:
            total += term

        if retain_derived:
            self.X.append(X)
            self.derived_variables["sum"].append(total)

        return self.act_fn(total)

    def backward(self, dLdY, retain_grads=True):
        r"""
        Backprop from layer outputs to inputs.

        Parameters
        ----------
        dLdY : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, *)` or list of arrays
            The gradient of the loss wrt. the layer output(s) `Y`.
        retain_grads : bool
            Accepted for interface compatibility. The layer has no
            parameter gradients, so the value is not used. Default is True.

        Returns
        -------
        dX : list of length `n_inputs`
            The gradient of the loss wrt. each input in `X`. If more than
            one forward pass has been cached, a list of such lists.
        """
        if not isinstance(dLdY, list):
            dLdY = [dLdY]

        cached_inputs = self.X
        cached_sums = self.derived_variables["sum"]

        grads = []
        for grad_out, inputs, total in zip(dLdY, cached_inputs, cached_sums):
            grads.append(self._bwd(grad_out, inputs, total))

        if len(cached_inputs) == 1:
            return grads[0]
        return grads

    def _bwd(self, dLdY, X, _sum):
        """Compute the gradient of the loss wrt. each input of one forward pass."""
        # every input receives the same gradient: the upstream gradient
        # scaled by the activation's derivative at the pre-activation sum
        grads = []
        for _ in X:
            grads.append(dLdY * self.act_fn.grad(_sum))
        return grads
class Multiply(LayerBase):
    def __init__(self, act_fn=None, optimizer=None):
        """
        A layer that multiplies its inputs *elementwise* and passes the
        result through an optional nonlinearity.

        Parameters
        ----------
        act_fn : str, :doc:`Activation <numpy_ml.neural_nets.activations>` object, or None
            The element-wise output nonlinearity used in computing the final
            output. If None, use the identity function :math:`f(x) = x`.
            Default is None.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy passed to the base class. Default is
            None.

        Attributes
        ----------
        X : list
            Running list of inputs passed to :meth:`forward` with
            `retain_derived` set to True.
        gradients : dict
            Unused
        parameters : dict
            Unused
        hyperparameters : dict
            Dictionary of layer hyperparameters
        derived_variables : dict
            Dictionary of any intermediate values computed during
            forward/backward propagation.
        """  # noqa: E501
        super().__init__(optimizer)
        self.act_fn = ActivationInitializer(act_fn)()
        self._init_params()

    def _init_params(self):
        """Reset the (empty) parameter dicts and the cache of pre-activation products."""
        self.gradients = {}
        self.parameters = {}
        self.derived_variables = {"product": []}

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        optimizer_info = {
            "cache": self.optimizer.cache,
            "hyperparameters": self.optimizer.hyperparameters,
        }
        return {
            "layer": "Multiply",
            "act_fn": str(self.act_fn),
            "optimizer": optimizer_info,
        }

    def forward(self, X, retain_derived=True):
        r"""
        Compute the layer output on a single minibatch.

        Parameters
        ----------
        X : list of length `n_inputs`
            A list of tensors, all of the same shape.
        retain_derived : bool
            Whether to store the inputs and their product for use during
            backprop. Default is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, *)`
            The activation applied to the elementwise product of the inputs.
        """
        # copy the first input so the in-place accumulation leaves it intact
        running = X[0].copy()
        for factor in X[1:]:
            running *= factor

        if retain_derived:
            self.X.append(X)
            self.derived_variables["product"].append(running)

        return self.act_fn(running)

    def backward(self, dLdY, retain_grads=True):
        r"""
        Backprop from layer outputs to inputs.

        Parameters
        ----------
        dLdY : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, *)` or list of arrays
            The gradient of the loss wrt. the layer output(s) `Y`.
        retain_grads : bool
            Accepted for interface compatibility. The layer has no
            parameter gradients, so the value is not used. Default is True.

        Returns
        -------
        dX : list of length `n_inputs`
            The gradient of the loss wrt. each input in `X`. If more than
            one forward pass has been cached, a list of such lists.
        """
        if not isinstance(dLdY, list):
            dLdY = [dLdY]

        cached_inputs = self.X
        cached_products = self.derived_variables["product"]

        grads = []
        for grad_out, inputs, product in zip(dLdY, cached_inputs, cached_products):
            grads.append(self._bwd(grad_out, inputs, product))

        if len(cached_inputs) == 1:
            return grads[0]
        return grads

    def _bwd(self, dLdY, X, prod):
        """Compute the gradient of the loss wrt. each input of one forward pass."""
        upstream = dLdY * self.act_fn.grad(prod)

        # the gradient wrt. input j is the upstream gradient multiplied by
        # every *other* input, applied in input order
        grads = []
        for j in range(len(X)):
            grad_j = upstream
            for i, x in enumerate(X):
                if i != j:
                    grad_j = grad_j * x
            grads.append(grad_j)
        return grads
class Flatten(LayerBase):
    def __init__(self, keep_dim="first", optimizer=None):
        """
        Flatten a multidimensional input into a 2D matrix.

        Parameters
        ----------
        keep_dim : {'first', 'last', -1}
            The dimension of the original input to retain. Typically used for
            retaining the minibatch dimension. If -1, flatten all dimensions.
            Default is 'first'.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy passed to the base class. Default is
            None.

        Attributes
        ----------
        X : list
            Unused
        gradients : dict
            Unused
        parameters : dict
            Unused
        hyperparameters : dict
            Dictionary of layer hyperparameters
        derived_variables : dict
            Dictionary of any intermediate values computed during
            forward/backward propagation.
        """  # noqa: E501
        super().__init__(optimizer)

        self.keep_dim = keep_dim
        self._init_params()

    def _init_params(self):
        """Reset the (empty) parameter dicts and the cache of input shapes."""
        self.gradients = {}
        self.parameters = {}
        self.derived_variables = {"in_dims": []}

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        optimizer_info = {
            "cache": self.optimizer.cache,
            "hyperparameters": self.optimizer.hyperparameters,
        }
        return {
            "layer": "Flatten",
            "keep_dim": self.keep_dim,
            "optimizer": optimizer_info,
        }

    def forward(self, X, retain_derived=True):
        r"""
        Compute the layer output on a single minibatch.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>`
            Input volume to flatten.
        retain_derived : bool
            Whether to remember the shape of `X` so that :meth:`backward`
            can restore it. Default is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(*out_dims)`
            Flattened output. If `keep_dim` is -1, `X` is reshaped to
            ``(1, -1)``. If `keep_dim` is `'first'`, `X` is reshaped to
            ``(X.shape[0], -1)``, otherwise to ``(-1, X.shape[-1])``.
        """
        if retain_derived:
            self.derived_variables["in_dims"].append(X.shape)

        if self.keep_dim == -1:
            return X.flatten().reshape(1, -1)

        if self.keep_dim == "first":
            out_shape = (X.shape[0], -1)
        else:
            out_shape = (-1, X.shape[-1])
        return X.reshape(*out_shape)

    def backward(self, dLdy, retain_grads=True):
        r"""
        Backprop from layer outputs to inputs.

        Parameters
        ----------
        dLdy : :py:class:`ndarray <numpy.ndarray>` of shape `(*out_dims)` or list of arrays
            The gradient of the loss wrt. the layer output(s) `Y`.
        retain_grads : bool
            Accepted for interface compatibility. The layer has no
            parameter gradients, so the value is not used. Default is True.

        Returns
        -------
        dX : :py:class:`ndarray <numpy.ndarray>` of shape `(*in_dims)` or list of arrays
            The gradient of the loss wrt. the layer input(s) `X`.
        """  # noqa: E501
        if not isinstance(dLdy, list):
            dLdy = [dLdy]

        cached_shapes = self.derived_variables["in_dims"]

        # undo the flattening by restoring each cached input shape
        restored = []
        for grad_out, shape in zip(dLdy, cached_shapes):
            restored.append(grad_out.reshape(*shape))

        if len(dLdy) == 1:
            return restored[0]
        return restored
#######################################################################
# Normalization Layers #
#######################################################################
class BatchNorm2D(LayerBase):
    def __init__(self, momentum=0.9, epsilon=1e-5, optimizer=None):
        """
        A batch normalization layer for two-dimensional inputs with an
        additional channel dimension.

        Notes
        -----
        BatchNorm is an attempt to address the problem of internal covariate
        shift (ICS) during training by normalizing layer inputs.

        ICS refers to the change in the distribution of layer inputs during
        training as a result of the changing parameters of the previous
        layer(s). ICS can make it difficult to train models with saturating
        nonlinearities, and in general can slow training by requiring a lower
        learning rate.

        Equations [train]::

            Y = scaler * norm(X) + intercept
            norm(X) = (X - mean(X)) / sqrt(var(X) + epsilon)

        Equations [test]::

            Y = scaler * running_norm(X) + intercept
            running_norm(X) = (X - running_mean) / sqrt(running_var + epsilon)

        In contrast to :class:`LayerNorm2D`, the BatchNorm layer calculates
        the mean and var across the *batch* rather than the output features.
        This has two disadvantages:

        1. It is highly affected by batch size: smaller mini-batch sizes
           increase the variance of the estimates for the global mean and
           variance.

        2. It is difficult to apply in RNNs -- one must fit a separate
           BatchNorm layer for *each* time-step.

        Parameters
        ----------
        momentum : float
            The momentum term for the running mean/running std calculations.
            The closer this is to 1, the less weight will be given to the
            mean/std of the current batch (i.e., higher smoothing). Default is
            0.9.
        epsilon : float
            A small smoothing constant to use during computation of ``norm(X)``
            to avoid divide-by-zero errors. Default is 1e-5.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method. Default is None.

        Attributes
        ----------
        X : list
            Running list of inputs passed to :meth:`forward` with
            `retain_derived` set to True.
        gradients : dict
            Dictionary of loss gradients with regard to the layer parameters
        parameters : dict
            Dictionary of layer parameters
        hyperparameters : dict
            Dictionary of layer hyperparameters
        derived_variables : dict
            Dictionary of any intermediate values computed during
            forward/backward propagation.
        """  # noqa: E501
        super().__init__(optimizer)

        self.in_ch = None
        self.out_ch = None
        self.epsilon = epsilon
        self.momentum = momentum

        # placeholders only: the arrays are allocated on the first forward
        # pass, once the number of channels is known
        self.parameters = {
            "scaler": None,
            "intercept": None,
            "running_var": None,
            "running_mean": None,
        }
        self.is_initialized = False

    def _init_params(self):
        """Allocate the per-channel parameters and gradients (needs `in_ch`)."""
        n_channels = self.in_ch

        scaler = np.random.rand(n_channels)
        intercept = np.zeros(n_channels)

        # init running mean and std at 0 and 1, respectively
        running_mean = np.zeros(n_channels)
        running_var = np.ones(n_channels)

        self.parameters = {
            "scaler": scaler,
            "intercept": intercept,
            "running_var": running_var,
            "running_mean": running_mean,
        }

        self.gradients = {
            "scaler": np.zeros_like(scaler),
            "intercept": np.zeros_like(intercept),
        }

        self.is_initialized = True

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        optimizer_info = {
            "cache": self.optimizer.cache,
            "hyperparameters": self.optimizer.hyperparameters,
        }
        return {
            "layer": "BatchNorm2D",
            "act_fn": None,
            "in_ch": self.in_ch,
            "out_ch": self.out_ch,
            "epsilon": self.epsilon,
            "momentum": self.momentum,
            "optimizer": optimizer_info,
        }

    def reset_running_stats(self):
        """
        Reset the running mean and variance estimates to 0 and 1.

        Raises
        ------
        AssertionError
            If the layer is frozen.
        """
        assert self.trainable, "Layer is frozen"
        self.parameters["running_mean"] = np.zeros(self.in_ch)
        self.parameters["running_var"] = np.ones(self.in_ch)

    def forward(self, X, retain_derived=True):
        """
        Compute the layer output on a single minibatch.

        Notes
        -----
        Equations [train]::

            Y = scaler * norm(X) + intercept
            norm(X) = (X - mean(X)) / sqrt(var(X) + epsilon)

        Equations [test]::

            Y = scaler * running_norm(X) + intercept
            running_norm(X) = (X - running_mean) / sqrt(running_var + epsilon)

        In contrast to :class:`LayerNorm2D`, the BatchNorm layer calculates the
        mean and var across the *batch* rather than the output features.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
            Input volume containing the `in_rows` x `in_cols`-dimensional
            features for a minibatch of `n_ex` examples.
        retain_derived : bool
            Whether to cache `X` for backprop and, if the layer is
            trainable, normalize with the statistics of the current batch
            and fold them into the running mean and variance. When False,
            the running statistics are used instead, which is the same as
            freezing the layer for the current input. Default is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
            Layer output for each of the `n_ex` examples.
        """  # noqa: E501
        if not self.is_initialized:
            self.in_ch = self.out_ch = X.shape[3]
            self._init_params()

        hparams = self.hyperparameters
        eps, mom = hparams["epsilon"], hparams["momentum"]

        run_mean = self.parameters["running_mean"]
        run_var = self.parameters["running_var"]
        scaler = self.parameters["scaler"]
        intercept = self.parameters["intercept"]

        # default to the running statistics; they are only replaced by the
        # batch statistics when the layer is trainable and retaining state
        mean, var = run_mean, run_var

        if self.trainable and retain_derived:
            channel_axes = (0, 1, 2)
            mean, var = X.mean(axis=channel_axes), X.var(axis=channel_axes)
            self.parameters["running_mean"] = mom * run_mean + (1.0 - mom) * mean
            self.parameters["running_var"] = mom * run_var + (1.0 - mom) * var

        if retain_derived:
            self.X.append(X)

        normed = (X - mean) / np.sqrt(var + eps)
        return scaler * normed + intercept

    def backward(self, dLdy, retain_grads=True):
        """
        Backprop from layer outputs to inputs.

        Parameters
        ----------
        dLdy : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)` or list of arrays
            The gradient of the loss wrt. the layer output(s) `Y`.
        retain_grads : bool
            Whether to accumulate the scaler and intercept gradients
            computed during the backward pass for the next parameter
            update. Default is True.

        Returns
        -------
        dX : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)` or list of arrays
            The gradient of the loss wrt. the layer input(s) `X`.

        Raises
        ------
        AssertionError
            If the layer is frozen.
        """  # noqa: E501
        assert self.trainable, "Layer is frozen"

        if not isinstance(dLdy, list):
            dLdy = [dLdy]

        cached_inputs = self.X
        dX = []

        for grad_out, x in zip(dLdy, cached_inputs):
            dx, dScaler, dIntercept = self._bwd(grad_out, x)
            dX.append(dx)

            if retain_grads:
                self.gradients["scaler"] += dScaler
                self.gradients["intercept"] += dIntercept

        if len(cached_inputs) == 1:
            return dX[0]
        return dX

    def _bwd(self, dLdy, X):
        """Compute the gradient of the loss wrt. X, scaler, and intercept."""
        scaler = self.parameters["scaler"]
        eps = self.hyperparameters["epsilon"]

        # reshape to 2D, retaining channel dim
        in_shape = X.shape
        X = np.reshape(X, (-1, X.shape[3]))
        dLdy = np.reshape(dLdy, (-1, dLdy.shape[3]))

        # apply 1D batchnorm backward pass on reshaped array
        n_rows, _ = X.shape
        mean, var = X.mean(axis=0), X.var(axis=0)
        std = np.sqrt(var + eps)
        normed = (X - mean) / std

        dIntercept = dLdy.sum(axis=0)
        dScaler = np.sum(dLdy * normed, axis=0)

        dNormed = dLdy * scaler
        numerator = (
            n_rows * dNormed
            - dNormed.sum(axis=0)
            - normed * (dNormed * normed).sum(axis=0)
        )
        dX = numerator / (n_rows * std)

        return np.reshape(dX, in_shape), dScaler, dIntercept
class BatchNorm1D(LayerBase):
    def __init__(self, momentum=0.9, epsilon=1e-5, optimizer=None):
        """
        A batch normalization layer for 1D inputs.

        Notes
        -----
        BatchNorm is an attempt to address the problem of internal covariate
        shift (ICS) during training by normalizing layer inputs.

        ICS refers to the change in the distribution of layer inputs during
        training as a result of the changing parameters of the previous
        layer(s). ICS can make it difficult to train models with saturating
        nonlinearities, and in general can slow training by requiring a
        lower learning rate.

        Equations [train]::

            Y = scaler * norm(X) + intercept
            norm(X) = (X - mean(X)) / sqrt(var(X) + epsilon)

        Equations [test]::

            Y = scaler * running_norm(X) + intercept
            running_norm(X) = (X - running_mean) / sqrt(running_var + epsilon)

        In contrast to :class:`LayerNorm1D`, the BatchNorm layer calculates
        the mean and var across the *batch* rather than the output features.
        This has two disadvantages:

        1. It is highly affected by batch size: smaller mini-batch sizes
           increase the variance of the estimates for the global mean and
           variance.

        2. It is difficult to apply in RNNs -- one must fit a separate
           BatchNorm layer for *each* time-step.

        Parameters
        ----------
        momentum : float
            The momentum term for the running mean/running variance
            calculations. The closer this is to 1, the less weight is given
            to the statistics of the current batch (i.e., higher smoothing).
            Default is 0.9.
        epsilon : float
            A small smoothing constant to use during computation of
            ``norm(X)`` to avoid divide-by-zero errors. Default is 1e-5.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient
            updates within the :meth:`update` method. If None, use the
            :class:`SGD` optimizer with default parameters. Default is None.

        Attributes
        ----------
        X : list
            Running list of inputs to the :meth:`forward` method since the
            last call to :meth:`update`. Only updated if the
            `retain_derived` argument was set to True.
        gradients : dict
            Dictionary of loss gradients with regard to the layer parameters
        parameters : dict
            Dictionary of layer parameters
        hyperparameters : dict
            Dictionary of layer hyperparameters
        derived_variables : dict
            Dictionary of any intermediate values computed during
            forward/backward propagation.
        """  # noqa: E501
        super().__init__(optimizer)

        self.n_in = None
        self.n_out = None
        self.epsilon = epsilon
        self.momentum = momentum
        # placeholders until the first forward pass reveals the input width
        self.parameters = dict.fromkeys(
            ("scaler", "intercept", "running_var", "running_mean")
        )
        self.is_initialized = False

    def _init_params(self):
        """Allocate the parameters and zeroed gradients for `n_in` features."""
        n_feat = self.n_in
        gamma = np.random.rand(n_feat)
        beta = np.zeros(n_feat)

        self.parameters = {
            "scaler": gamma,
            "intercept": beta,
            # running mean and variance start at 0 and 1, respectively
            "running_mean": np.zeros(n_feat),
            "running_var": np.ones(n_feat),
        }
        self.gradients = {
            "scaler": np.zeros_like(gamma),
            "intercept": np.zeros_like(beta),
        }
        self.is_initialized = True

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        optimizer_info = {
            "cache": self.optimizer.cache,
            "hyperparameters": self.optimizer.hyperparameters,
        }
        return {
            "layer": "BatchNorm1D",
            "act_fn": None,
            "n_in": self.n_in,
            "n_out": self.n_out,
            "epsilon": self.epsilon,
            "momentum": self.momentum,
            "optimizer": optimizer_info,
        }

    def reset_running_stats(self):
        """Reset the running mean and variance estimates to 0 and 1."""
        assert self.trainable, "Layer is frozen"
        n_feat = self.n_in
        self.parameters["running_mean"] = np.zeros(n_feat)
        self.parameters["running_var"] = np.ones(n_feat)

    def forward(self, X, retain_derived=True):
        """
        Compute the layer output on a single minibatch.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)`
            Layer input, representing the `n_in`-dimensional features for a
            minibatch of `n_ex` examples.
        retain_derived : bool
            Whether to normalize with the statistics of the current input,
            fold them into the running mean / running variance, and cache
            `X` for backprop. When False (or when the layer is frozen), the
            stored running statistics are used for normalization and are
            left unchanged. Default is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)`
            Layer output for each of the `n_ex` examples
        """
        if not self.is_initialized:
            self.n_in = self.n_out = X.shape[1]
            self._init_params()

        hparams = self.hyperparameters
        eps, mom = hparams["epsilon"], hparams["momentum"]

        gamma = self.parameters["scaler"]
        beta = self.parameters["intercept"]

        # default to the running estimates; overridden by the batch
        # statistics only when the layer is trainable and tracking this input
        mean = self.parameters["running_mean"]
        var = self.parameters["running_var"]

        if self.trainable and retain_derived:
            run_mean, run_var = mean, var
            mean = X.mean(axis=0)
            var = X.var(axis=0)  # , ddof=1)
            self.parameters["running_mean"] = mom * run_mean + (1.0 - mom) * mean
            self.parameters["running_var"] = mom * run_var + (1.0 - mom) * var

        if retain_derived:
            self.X.append(X)

        normed = (X - mean) / np.sqrt(var + eps)
        return gamma * normed + beta

    def backward(self, dLdy, retain_grads=True):
        """
        Backprop from layer outputs to inputs.

        Parameters
        ----------
        dLdy : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)` or list of arrays
            The gradient(s) of the loss wrt. the layer output(s). A single
            array is treated as a one-element list; entries are paired in
            order with the inputs cached in ``self.X``.
        retain_grads : bool
            Whether to include the intermediate parameter gradients computed
            during the backward pass in the final parameter update. Default
            is True.

        Returns
        -------
        dX : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)` or list of arrays
            The gradient of the loss wrt. the layer input(s). A bare array
            is returned when exactly one input is cached in ``self.X``.
        """  # noqa: E501
        assert self.trainable, "Layer is frozen"

        upstream = dLdy if isinstance(dLdy, list) else [dLdy]
        cached_inputs = self.X

        input_grads = []
        for grad_out, x_in in zip(upstream, cached_inputs):
            grad_in, grad_scaler, grad_intercept = self._bwd(grad_out, x_in)
            input_grads.append(grad_in)

            if retain_grads:
                self.gradients["scaler"] += grad_scaler
                self.gradients["intercept"] += grad_intercept

        if len(cached_inputs) == 1:
            return input_grads[0]
        return input_grads

    def _bwd(self, dLdy, X):
        """Computation of gradient of loss wrt X, scaler, and intercept"""
        gamma = self.parameters["scaler"]
        eps = self.hyperparameters["epsilon"]

        n_rows = X.shape[0]

        # recompute the batch statistics used during the training-mode forward
        mu = X.mean(axis=0)
        sigma_sq = X.var(axis=0)  # , ddof=1)
        std = np.sqrt(sigma_sq + eps)
        normed = (X - mu) / std

        grad_intercept = dLdy.sum(axis=0)
        grad_scaler = np.sum(dLdy * normed, axis=0)

        grad_normed = dLdy * gamma
        numer = (
            n_rows * grad_normed
            - grad_normed.sum(axis=0)
            - normed * (grad_normed * normed).sum(axis=0)
        )
        grad_X = numer / (n_rows * std)
        return grad_X, grad_scaler, grad_intercept
class LayerNorm2D(LayerBase):
    def __init__(self, epsilon=1e-5, optimizer=None):
        """
        A layer normalization layer for 2D inputs with an additional channel
        dimension.

        Notes
        -----
        In contrast to :class:`BatchNorm2D`, the LayerNorm layer calculates
        the mean and variance across *features* rather than examples in the
        batch, ensuring that the mean and variance estimates are independent
        of batch size and permitting straightforward application in RNNs.

        Equations [train & test]::

            Y = scaler * norm(X) + intercept
            norm(X) = (X - mean(X)) / sqrt(var(X) + epsilon)

        Also in contrast to :class:`BatchNorm2D`, `scaler` and `intercept`
        are applied *elementwise* to ``norm(X)``.

        Parameters
        ----------
        epsilon : float
            A small smoothing constant to use during computation of
            ``norm(X)`` to avoid divide-by-zero errors. Default is 1e-5.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient
            updates within the :meth:`update` method. If None, use the
            :class:`SGD` optimizer with default parameters. Default is None.

        Attributes
        ----------
        X : list
            Running list of inputs to the :meth:`forward` method since the
            last call to :meth:`update`. Only updated if the
            `retain_derived` argument was set to True.
        gradients : dict
            Dictionary of loss gradients with regard to the layer parameters
        parameters : dict
            Dictionary of layer parameters
        hyperparameters : dict
            Dictionary of layer hyperparameters
        derived_variables : dict
            Dictionary of any intermediate values computed during
            forward/backward propagation.
        """  # noqa: E501
        super().__init__(optimizer)

        self.in_ch = None
        self.out_ch = None
        self.epsilon = epsilon
        self.parameters = dict.fromkeys(("scaler", "intercept"))
        self.is_initialized = False

    def _init_params(self, X_shape):
        """Allocate one scaler/intercept entry per (row, col, channel)."""
        _, in_rows, in_cols, in_ch = X_shape
        feature_shape = (in_rows, in_cols, in_ch)

        gamma = np.random.rand(*feature_shape)
        beta = np.zeros(feature_shape)

        self.parameters = {"scaler": gamma, "intercept": beta}
        self.gradients = {
            "scaler": np.zeros_like(gamma),
            "intercept": np.zeros_like(beta),
        }
        self.is_initialized = True

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        optimizer_info = {
            "cache": self.optimizer.cache,
            "hyperparameters": self.optimizer.hyperparameters,
        }
        return {
            "layer": "LayerNorm2D",
            "act_fn": None,
            "in_ch": self.in_ch,
            "out_ch": self.out_ch,
            "epsilon": self.epsilon,
            "optimizer": optimizer_info,
        }

    def forward(self, X, retain_derived=True):
        """
        Compute the layer output on a single minibatch.

        Notes
        -----
        Equations [train & test]::

            Y = scaler * norm(X) + intercept
            norm(X) = (X - mean(X)) / sqrt(var(X) + epsilon)

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
            Input volume containing the `in_rows` by `in_cols`-dimensional
            features for a minibatch of `n_ex` examples.
        retain_derived : bool
            Whether to retain the variables calculated during the forward
            pass for use later during backprop. If False, this suggests the
            layer will not be expected to backprop through wrt. this input.
            Default is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
            Layer output for each of the `n_ex` examples.
        """  # noqa: E501
        if not self.is_initialized:
            self.in_ch = self.out_ch = X.shape[3]
            self._init_params(X.shape)

        gamma = self.parameters["scaler"]
        eps = self.hyperparameters["epsilon"]
        beta = self.parameters["intercept"]

        if retain_derived:
            self.X.append(X)

        # statistics are taken per example, over all rows, columns, channels
        feature_axes = (1, 2, 3)
        var = X.var(axis=feature_axes, keepdims=True)
        mean = X.mean(axis=feature_axes, keepdims=True)

        normed = (X - mean) / np.sqrt(var + eps)
        return gamma * normed + beta

    def backward(self, dLdy, retain_grads=True):
        """
        Backprop from layer outputs to inputs.

        Parameters
        ----------
        dLdy : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)` or list of arrays
            The gradient(s) of the loss wrt. the layer output(s). A single
            array is treated as a one-element list; entries are paired in
            order with the inputs cached in ``self.X``.
        retain_grads : bool
            Whether to include the intermediate parameter gradients computed
            during the backward pass in the final parameter update. Default
            is True.

        Returns
        -------
        dX : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)` or list of arrays
            The gradient of the loss wrt. the layer input(s). A bare array
            is returned when exactly one input is cached in ``self.X``.
        """  # noqa: E501
        assert self.trainable, "Layer is frozen"

        upstream = dLdy if isinstance(dLdy, list) else [dLdy]
        cached_inputs = self.X

        input_grads = []
        for grad_out, x_in in zip(upstream, cached_inputs):
            grad_in, grad_scaler, grad_intercept = self._bwd(grad_out, x_in)
            input_grads.append(grad_in)

            if retain_grads:
                self.gradients["scaler"] += grad_scaler
                self.gradients["intercept"] += grad_intercept

        if len(cached_inputs) == 1:
            return input_grads[0]
        return input_grads

    def _bwd(self, dy, X):
        """Computation of gradient of the loss wrt X, scaler, intercept"""
        gamma = self.parameters["scaler"]
        eps = self.hyperparameters["epsilon"]

        feature_axes = (1, 2, 3)
        mean = X.mean(axis=feature_axes, keepdims=True)
        var = X.var(axis=feature_axes, keepdims=True)
        std = np.sqrt(var + eps)
        normed = (X - mean) / std

        grad_normed = dy * gamma
        grad_intercept = dy.sum(axis=0)
        grad_scaler = np.sum(dy * normed, axis=0)

        # collapse the feature axes so each example becomes a single row
        n_in = np.prod(X.shape[1:])
        normed = normed.reshape(-1, n_in)
        grad_normed = grad_normed.reshape(normed.shape)
        std = std.reshape(std.shape[:2])

        numer = (
            n_in * grad_normed
            - grad_normed.sum(axis=1, keepdims=True)
            - normed * (grad_normed * normed).sum(axis=1, keepdims=True)
        )
        grad_X = numer / (n_in * std)

        # reshape X gradients back to proper dimensions
        return np.reshape(grad_X, X.shape), grad_scaler, grad_intercept
class LayerNorm1D(LayerBase):
    def __init__(self, epsilon=1e-5, optimizer=None):
        """
        A layer normalization layer for 1D inputs.

        Notes
        -----
        In contrast to :class:`BatchNorm1D`, the LayerNorm layer calculates
        the mean and variance across *features* rather than examples in the
        batch, ensuring that the mean and variance estimates are independent
        of batch size and permitting straightforward application in RNNs.

        Equations [train & test]::

            Y = scaler * norm(X) + intercept
            norm(X) = (X - mean(X)) / sqrt(var(X) + epsilon)

        Also in contrast to :class:`BatchNorm1D`, `scaler` and `intercept`
        are applied *elementwise* to ``norm(X)``.

        Parameters
        ----------
        epsilon : float
            A small smoothing constant to use during computation of
            ``norm(X)`` to avoid divide-by-zero errors. Default is 1e-5.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient
            updates within the :meth:`update` method. If None, use the
            :class:`SGD` optimizer with default parameters. Default is None.

        Attributes
        ----------
        X : list
            Running list of inputs to the :meth:`forward` method since the
            last call to :meth:`update`. Only updated if the
            `retain_derived` argument was set to True.
        gradients : dict
            Dictionary of loss gradients with regard to the layer parameters
        parameters : dict
            Dictionary of layer parameters
        hyperparameters : dict
            Dictionary of layer hyperparameters
        derived_variables : dict
            Dictionary of any intermediate values computed during
            forward/backward propagation.
        """  # noqa: E501
        super().__init__(optimizer)

        self.n_in = None
        self.n_out = None
        self.epsilon = epsilon
        self.parameters = dict.fromkeys(("scaler", "intercept"))
        self.is_initialized = False

    def _init_params(self):
        """Allocate the per-feature scaler/intercept and zeroed gradients."""
        n_feat = self.n_in
        gamma = np.random.rand(n_feat)
        beta = np.zeros(n_feat)

        self.parameters = {"scaler": gamma, "intercept": beta}
        self.gradients = {
            "scaler": np.zeros_like(gamma),
            "intercept": np.zeros_like(beta),
        }
        self.is_initialized = True

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        optimizer_info = {
            "cache": self.optimizer.cache,
            "hyperparameters": self.optimizer.hyperparameters,
        }
        return {
            "layer": "LayerNorm1D",
            "act_fn": None,
            "n_in": self.n_in,
            "n_out": self.n_out,
            "epsilon": self.epsilon,
            "optimizer": optimizer_info,
        }

    def forward(self, X, retain_derived=True):
        """
        Compute the layer output on a single minibatch.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)`
            Layer input, representing the `n_in`-dimensional features for a
            minibatch of `n_ex` examples.
        retain_derived : bool
            Whether to retain the variables calculated during the forward
            pass for use later during backprop. If False, this suggests the
            layer will not be expected to backprop through wrt. this input.
            Default is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)`
            Layer output for each of the `n_ex` examples.
        """
        if not self.is_initialized:
            self.n_in = self.n_out = X.shape[1]
            self._init_params()

        gamma = self.parameters["scaler"]
        eps = self.hyperparameters["epsilon"]
        beta = self.parameters["intercept"]

        if retain_derived:
            self.X.append(X)

        # statistics are taken per example, across its features
        mean = X.mean(axis=1, keepdims=True)
        var = X.var(axis=1, keepdims=True)

        normed = (X - mean) / np.sqrt(var + eps)
        return gamma * normed + beta

    def backward(self, dLdy, retain_grads=True):
        """
        Backprop from layer outputs to inputs.

        Parameters
        ----------
        dLdy : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)` or list of arrays
            The gradient(s) of the loss wrt. the layer output(s). A single
            array is treated as a one-element list; entries are paired in
            order with the inputs cached in ``self.X``.
        retain_grads : bool
            Whether to include the intermediate parameter gradients computed
            during the backward pass in the final parameter update. Default
            is True.

        Returns
        -------
        dX : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)` or list of arrays
            The gradient of the loss wrt. the layer input(s). A bare array
            is returned when exactly one input is cached in ``self.X``.
        """  # noqa: E501
        assert self.trainable, "Layer is frozen"

        upstream = dLdy if isinstance(dLdy, list) else [dLdy]
        cached_inputs = self.X

        input_grads = []
        for grad_out, x_in in zip(upstream, cached_inputs):
            grad_in, grad_scaler, grad_intercept = self._bwd(grad_out, x_in)
            input_grads.append(grad_in)

            if retain_grads:
                self.gradients["scaler"] += grad_scaler
                self.gradients["intercept"] += grad_intercept

        if len(cached_inputs) == 1:
            return input_grads[0]
        return input_grads

    def _bwd(self, dLdy, X):
        """Computation of gradient of the loss wrt X, scaler, intercept"""
        gamma = self.parameters["scaler"]
        eps = self.hyperparameters["epsilon"]

        n_in = X.shape[1]

        mean = X.mean(axis=1, keepdims=True)
        var = X.var(axis=1, keepdims=True)
        std = np.sqrt(var + eps)
        normed = (X - mean) / std

        grad_intercept = dLdy.sum(axis=0)
        grad_scaler = np.sum(dLdy * normed, axis=0)

        grad_normed = dLdy * gamma
        numer = (
            n_in * grad_normed
            - grad_normed.sum(axis=1, keepdims=True)
            - normed * (grad_normed * normed).sum(axis=1, keepdims=True)
        )
        grad_X = numer / (n_in * std)
        return grad_X, grad_scaler, grad_intercept
#######################################################################
# MLP Layers #
#######################################################################
class Embedding(LayerBase):
    def __init__(
        self, n_out, vocab_size, pool=None, init="glorot_uniform", optimizer=None,
    ):
        """
        An embedding layer.

        Notes
        -----
        Equations::

            Y = W[x]

        NB. This layer must be the first in a neural network as the gradients
        do not get passed back through to the inputs.

        Parameters
        ----------
        n_out : int
            The dimensionality of the embeddings
        vocab_size : int
            The total number of items in the vocabulary. All integer indices
            are expected to range between 0 and `vocab_size - 1`.
        pool : {'sum', 'mean', None}
            If not None, apply this function to the collection of `n_in`
            encodings in each example to produce a single, pooled embedding.
            Default is None.
        init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}
            The weight initialization strategy. Default is `'glorot_uniform'`.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient updates
            within the :meth:`update` method. If None, use the :class:`SGD`
            optimizer with default parameters. Default is None.

        Attributes
        ----------
        X : list
            Running list of inputs to the :meth:`forward` method since the
            last call to :meth:`update`. Only updated if the
            `retain_derived` argument was set to True.
        gradients : dict
            Dictionary of loss gradients with regard to the layer parameters
        parameters : dict
            Dictionary of layer parameters
        hyperparameters : dict
            Dictionary of layer hyperparameters
        derived_variables : dict
            Dictionary of any intermediate values computed during
            forward/backward propagation.
        """  # noqa: E501
        super().__init__(optimizer)
        fstr = "'pool' must be either 'sum', 'mean', or None but got '{}'"
        assert pool in ["sum", "mean", None], fstr.format(pool)

        self.init = init
        self.pool = pool
        self.n_out = n_out
        self.vocab_size = vocab_size
        self.parameters = {"W": None}
        self.is_initialized = False
        # both table dimensions are known up front, so initialize eagerly
        self._init_params()

    def _init_params(self):
        """Create the `(vocab_size, n_out)` embedding table and its gradient."""
        init_weights = WeightInitializer("Affine(slope=1, intercept=0)", mode=self.init)
        W = init_weights((self.vocab_size, self.n_out))

        self.parameters = {"W": W}
        self.derived_variables = {}
        self.gradients = {"W": np.zeros_like(W)}
        self.is_initialized = True

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        return {
            "layer": "Embedding",
            "init": self.init,
            "pool": self.pool,
            "n_out": self.n_out,
            "vocab_size": self.vocab_size,
            "optimizer": {
                "cache": self.optimizer.cache,
                "hyperparameters": self.optimizer.hyperparameters,
            },
        }

    def lookup(self, ids):
        """
        Return the embeddings associated with the IDs in `ids`.

        Parameters
        ----------
        ids : :py:class:`ndarray <numpy.ndarray>` of shape (`M`,)
            An array of `M` IDs to retrieve embeddings for.

        Returns
        -------
        embeddings : :py:class:`ndarray <numpy.ndarray>` of shape (`M`, `n_out`)
            The embedding vectors for each of the `M` IDs.
        """  # noqa: E501
        return self.parameters["W"][ids]

    def forward(self, X, retain_derived=True):
        """
        Compute the layer output on a single minibatch.

        Notes
        -----
        Equations::

            Y = W[x]

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)` or list of length `n_ex`
            Layer input, representing a minibatch of `n_ex` examples. If
            ``self.pool`` is None, each example must consist of exactly `n_in`
            integer token IDs. Otherwise, `X` can be a ragged array, with each
            example consisting of a variable number of token IDs.
        retain_derived : bool
            Whether to retain the variables calculated during the forward pass
            for use later during backprop. If False, this suggests the layer
            will not be expected to backprop through with regard to this input.
            Default is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in, n_out)`
            Embeddings for each coordinate of each of the `n_ex` examples.
            When ``self.pool`` is 'sum' or 'mean', the shape is instead
            `(n_ex, 1, n_out)`: one pooled embedding per example.

        Raises
        ------
        TypeError
            If `X` is an array (or a list whose first element is an array)
            with a non-integer dtype.
        """  # noqa: E501
        fstr = "Input to Embedding layer must be an array of integers, got '{}'"

        # if X is a ragged array (only the first example's dtype is inspected)
        if isinstance(X, list) and not issubclass(X[0].dtype.type, np.integer):
            raise TypeError(fstr.format(X[0].dtype.type))

        # otherwise
        if isinstance(X, np.ndarray) and not issubclass(X.dtype.type, np.integer):
            raise TypeError(fstr.format(X.dtype.type))

        Y = self._fwd(X)
        if retain_derived:
            self.X.append(X)
        return Y

    def _fwd(self, X):
        """Actual computation of forward pass"""
        W = self.parameters["W"]
        if self.pool is None:
            emb = W[X]
        elif self.pool == "sum":
            emb = np.array([W[x].sum(axis=0) for x in X])[:, None, :]
        elif self.pool == "mean":
            emb = np.array([W[x].mean(axis=0) for x in X])[:, None, :]
        return emb

    def backward(self, dLdy, retain_grads=True):
        """
        Backprop from layer outputs to embedding weights.

        Notes
        -----
        Because the items in `X` are interpreted as indices, we cannot compute
        the gradient of the layer output wrt. `X`. Consequently this method
        returns nothing.

        Parameters
        ----------
        dLdy : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in, n_out)` or list of arrays
            The gradient(s) of the loss wrt. the layer output(s)
        retain_grads : bool
            Whether to include the intermediate parameter gradients computed
            during the backward pass in the final parameter update. Default is
            True.
        """  # noqa: E501
        assert self.trainable, "Layer is frozen"
        if not isinstance(dLdy, list):
            dLdy = [dLdy]

        for dy, x in zip(dLdy, self.X):
            dw = self._bwd(dy, x)

            if retain_grads:
                self.gradients["W"] += dw

    def _bwd(self, dLdy, X):
        """
        Actual computation of gradient of the loss wrt. W

        The pooled branches use ``np.add.at`` rather than ``dW[ids] += g``:
        in-place addition through an index array is unbuffered, so a token ID
        occurring several times in one example would only be credited once,
        whereas the forward pass counts it once per occurrence.
        """
        dW = np.zeros_like(self.parameters["W"])
        dLdy = dLdy.reshape(-1, self.n_out)

        if self.pool is None:
            # scalar indexing: repeated IDs accumulate naturally
            for ix, v_id in enumerate(X.flatten()):
                dW[v_id] += dLdy[ix]
        elif self.pool == "sum":
            for ix, v_ids in enumerate(X):
                np.add.at(dW, v_ids, dLdy[ix])
        elif self.pool == "mean":
            for ix, v_ids in enumerate(X):
                np.add.at(dW, v_ids, dLdy[ix] / len(v_ids))
        return dW
class FullyConnected(LayerBase):
    def __init__(self, n_out, act_fn=None, init="glorot_uniform", optimizer=None):
        r"""
        A fully-connected (dense) layer.

        Notes
        -----
        A fully connected layer computes the function

        .. math::

            \mathbf{Y} = f( \mathbf{XW} + \mathbf{b} )

        where `f` is the activation nonlinearity, **W** and **b** are
        parameters of the layer, and **X** is the minibatch of input
        examples.

        Parameters
        ----------
        n_out : int
            The dimensionality of the layer output
        act_fn : str, :doc:`Activation <numpy_ml.neural_nets.activations>` object, or None
            The element-wise output nonlinearity used in computing `Y`. If
            None, use the identity function :math:`f(X) = X`. Default is
            None.
        init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}
            The weight initialization strategy. Default is `'glorot_uniform'`.
        optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
            The optimization strategy to use when performing gradient
            updates within the :meth:`update` method. If None, use the
            :class:`SGD` optimizer with default parameters. Default is None.

        Attributes
        ----------
        X : list
            Running list of inputs to the :meth:`forward` method since the
            last call to :meth:`update`. Only updated if the
            `retain_derived` argument was set to True.
        gradients : dict
            Dictionary of loss gradients with regard to the layer parameters
        parameters : dict
            Dictionary of layer parameters
        hyperparameters : dict
            Dictionary of layer hyperparameters
        derived_variables : dict
            Dictionary of any intermediate values computed during
            forward/backward propagation.
        """  # noqa: E501
        super().__init__(optimizer)

        self.init = init
        self.n_in = None
        self.n_out = n_out
        self.act_fn = ActivationInitializer(act_fn)()
        self.parameters = dict.fromkeys(("W", "b"))
        self.is_initialized = False

    def _init_params(self):
        """Allocate `W` (n_in x n_out), a zero bias row, and zeroed gradients."""
        make_weights = WeightInitializer(str(self.act_fn), mode=self.init)

        bias = np.zeros((1, self.n_out))
        weights = make_weights((self.n_in, self.n_out))

        self.parameters = {"W": weights, "b": bias}
        self.derived_variables = {"Z": []}
        self.gradients = {"W": np.zeros_like(weights), "b": np.zeros_like(bias)}
        self.is_initialized = True

    @property
    def hyperparameters(self):
        """Return a dictionary containing the layer hyperparameters."""
        optimizer_info = {
            "cache": self.optimizer.cache,
            "hyperparameters": self.optimizer.hyperparameters,
        }
        return {
            "layer": "FullyConnected",
            "init": self.init,
            "n_in": self.n_in,
            "n_out": self.n_out,
            "act_fn": str(self.act_fn),
            "optimizer": optimizer_info,
        }

    def forward(self, X, retain_derived=True):
        """
        Compute the layer output on a single minibatch.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)`
            Layer input, representing the `n_in`-dimensional features for a
            minibatch of `n_ex` examples.
        retain_derived : bool
            Whether to retain the variables calculated during the forward
            pass for use later during backprop. If False, this suggests the
            layer will not be expected to backprop through wrt. this input.
            Default is True.

        Returns
        -------
        Y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_out)`
            Layer output for each of the `n_ex` examples.
        """
        if not self.is_initialized:
            # the input width is only known once the first batch arrives
            self.n_in = X.shape[1]
            self._init_params()

        out, pre_act = self._fwd(X)

        if retain_derived:
            self.X.append(X)
            self.derived_variables["Z"].append(pre_act)

        return out

    def _fwd(self, X):
        """Actual computation of forward pass"""
        pre_act = X @ self.parameters["W"] + self.parameters["b"]
        return self.act_fn(pre_act), pre_act

    def backward(self, dLdy, retain_grads=True):
        """
        Backprop from layer outputs to inputs.

        Parameters
        ----------
        dLdy : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_out)` or list of arrays
            The gradient(s) of the loss wrt. the layer output(s).
        retain_grads : bool
            Whether to include the intermediate parameter gradients computed
            during the backward pass in the final parameter update. Default
            is True.

        Returns
        -------
        dLdX : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_in)` or list of arrays
            The gradient of the loss wrt. the layer input(s) `X`. A bare
            array is returned when exactly one input is cached in
            ``self.X``.
        """  # noqa: E501
        assert self.trainable, "Layer is frozen"

        upstream = dLdy if isinstance(dLdy, list) else [dLdy]
        cached_inputs = self.X

        input_grads = []
        for grad_out, x_in in zip(upstream, cached_inputs):
            grad_in, grad_W, grad_b = self._bwd(grad_out, x_in)
            input_grads.append(grad_in)

            if retain_grads:
                self.gradients["W"] += grad_W
                self.gradients["b"] += grad_b

        if len(cached_inputs) == 1:
            return input_grads[0]
        return input_grads

    def _bwd(self, dLdy, X):
        """Actual computation of gradient of the loss wrt. X, W, and b"""
        weights = self.parameters["W"]
        bias = self.parameters["b"]

        # the pre-activation is recomputed from X rather than read from cache
        pre_act = X @ weights + bias
        grad_pre_act = dLdy * self.act_fn.grad(pre_act)

        grad_X = grad_pre_act @ weights.T
        grad_W = X.T @ grad_pre_act
        grad_b = grad_pre_act.sum(axis=0, keepdims=True)
        return grad_X, grad_W, grad_b

    def _bwd2(self, dLdy, X, dLdy_bwd):
        """Compute second derivatives / deriv. of loss wrt. dX, dW, and db"""
        weights = self.parameters["W"]
        bias = self.parameters["b"]

        act_grad = self.act_fn.grad(X @ weights + bias)
        act_grad2 = self.act_fn.grad2(X @ weights + bias)

        # `@` and `*` share precedence and associate left-to-right; the
        # parentheses below only make the original evaluation order explicit
        ddX = (dLdy @ weights) * act_grad
        ddW = dLdy.T @ (dLdy_bwd * act_grad)
        ddB = np.sum(
            ((dLdy @ weights) * dLdy_bwd) * act_grad2, axis=0, keepdims=True
        )
        return ddX, ddW, ddB
class Softmax(LayerBase):
def __init__(self, dim=-1, optimizer=None):
r"""
A softmax nonlinearity layer.
Notes
-----
This is implemented as a layer rather than an activation primarily
because it requires retaining the layer input in order to compute the
softmax gradients properly. In other words, in contrast to other
simple activations, the softmax function and its gradient are not
computed elementwise, and thus are more easily expressed as a layer.
The softmax function computes:
.. math::
y_i = \frac{e^{x_i}}{\sum_j e^{x_j}}
where :math:`x_i` is the `i` th element of input example **x**.
Parameters
----------
dim: int
The dimension in `X` along which the softmax will be computed.
Default is -1.
optimizer : str, :doc:`Optimizer ` object, or None
The optimization strategy to use when performing gradient updates
within the :meth:`update` method. If None, use the :class:`SGD
` optimizer with
default parameters. Default is None. Unused for this layer.
Attributes
----------
X : list
Running list of inputs to the :meth:`forward