Full Code of openai/jukebox for AI

master 08efbbc1d4ed cached

319 files

1.7 MB

499.5k tokens

1379 symbols

1 requests

Download .txt

Showing preview only (1,841K chars total). Download the full file or copy to clipboard to get everything.

Repository: openai/jukebox
Branch: master
Commit: 08efbbc1d4ed
Files: 319
Total size: 1.7 MB

Directory structure:
gitextract_kyecer1w/

├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── apex/
│   ├── .gitignore
│   ├── .nojekyll
│   ├── LICENSE
│   ├── README.md
│   ├── apex/
│   │   ├── RNN/
│   │   │   ├── README.md
│   │   │   ├── RNNBackend.py
│   │   │   ├── __init__.py
│   │   │   ├── cells.py
│   │   │   └── models.py
│   │   ├── __init__.py
│   │   ├── amp/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── __version__.py
│   │   │   ├── _amp_state.py
│   │   │   ├── _initialize.py
│   │   │   ├── _process_optimizer.py
│   │   │   ├── amp.py
│   │   │   ├── compat.py
│   │   │   ├── frontend.py
│   │   │   ├── handle.py
│   │   │   ├── lists/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── functional_overrides.py
│   │   │   │   ├── tensor_overrides.py
│   │   │   │   └── torch_overrides.py
│   │   │   ├── opt.py
│   │   │   ├── rnn_compat.py
│   │   │   ├── scaler.py
│   │   │   ├── utils.py
│   │   │   └── wrap.py
│   │   ├── fp16_utils/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── fp16_optimizer.py
│   │   │   ├── fp16util.py
│   │   │   └── loss_scaler.py
│   │   ├── multi_tensor_apply/
│   │   │   ├── __init__.py
│   │   │   └── multi_tensor_apply.py
│   │   ├── normalization/
│   │   │   ├── __init__.py
│   │   │   └── fused_layer_norm.py
│   │   ├── optimizers/
│   │   │   ├── __init__.py
│   │   │   ├── fp16_optimizer.py
│   │   │   └── fused_adam.py
│   │   ├── parallel/
│   │   │   ├── LARC.py
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── distributed.py
│   │   │   ├── multiproc.py
│   │   │   ├── optimized_sync_batchnorm.py
│   │   │   ├── optimized_sync_batchnorm_kernel.py
│   │   │   ├── sync_batchnorm.py
│   │   │   └── sync_batchnorm_kernel.py
│   │   └── reparameterization/
│   │       ├── README.md
│   │       ├── __init__.py
│   │       ├── reparameterization.py
│   │       └── weight_norm.py
│   ├── apex.patch
│   ├── csrc/
│   │   ├── amp_C_frontend.cpp
│   │   ├── flatten_unflatten.cpp
│   │   ├── fused_adam_cuda.cpp
│   │   ├── fused_adam_cuda_kernel.cu
│   │   ├── layer_norm_cuda.cpp
│   │   ├── layer_norm_cuda_kernel.cu
│   │   ├── multi_tensor_apply.cuh
│   │   ├── multi_tensor_axpby_kernel.cu
│   │   ├── multi_tensor_l2norm_kernel.cu
│   │   ├── multi_tensor_lamb_stage_1.cu
│   │   ├── multi_tensor_lamb_stage_2.cu
│   │   ├── multi_tensor_scale_kernel.cu
│   │   ├── syncbn.cpp
│   │   ├── type_shim.h
│   │   └── welford.cu
│   ├── docs/
│   │   ├── Makefile
│   │   └── source/
│   │       ├── _static/
│   │       │   └── css/
│   │       │       └── pytorch_theme.css
│   │       ├── _templates/
│   │       │   └── layout.html
│   │       ├── advanced.rst
│   │       ├── amp.rst
│   │       ├── conf.py
│   │       ├── fp16_utils.rst
│   │       ├── index.rst
│   │       ├── layernorm.rst
│   │       ├── optimizers.rst
│   │       └── parallel.rst
│   ├── examples/
│   │   ├── README.md
│   │   ├── dcgan/
│   │   │   └── README.md
│   │   ├── docker/
│   │   │   ├── Dockerfile
│   │   │   └── README.md
│   │   ├── imagenet/
│   │   │   ├── README.md
│   │   │   └── main_amp.py
│   │   └── simple/
│   │       └── distributed/
│   │           ├── README.md
│   │           ├── distributed_data_parallel.py
│   │           └── run.sh
│   ├── setup.py
│   └── tests/
│       ├── L0/
│       │   ├── run_amp/
│       │   │   ├── __init__.py
│       │   │   ├── test_add_param_group.py
│       │   │   ├── test_basic_casts.py
│       │   │   ├── test_cache.py
│       │   │   ├── test_multi_tensor_axpby.py
│       │   │   ├── test_multi_tensor_l2norm.py
│       │   │   ├── test_multi_tensor_scale.py
│       │   │   ├── test_multiple_models_optimizers_losses.py
│       │   │   ├── test_promotion.py
│       │   │   ├── test_rnn.py
│       │   │   └── utils.py
│       │   ├── run_fp16util/
│       │   │   ├── __init__.py
│       │   │   └── test_fp16util.py
│       │   ├── run_fused_layer_norm/
│       │   │   └── test_fused_layer_norm.py
│       │   ├── run_mixed_adam/
│       │   │   ├── __init__.py
│       │   │   ├── test_fp16_optimizer.py
│       │   │   └── test_mixed_adam.py
│       │   └── run_test.py
│       ├── L1/
│       │   ├── common/
│       │   │   ├── compare.py
│       │   │   ├── main_amp.py
│       │   │   └── run_test.sh
│       │   ├── cross_product/
│       │   │   └── run.sh
│       │   └── cross_product_distributed/
│       │       └── run.sh
│       ├── distributed/
│       │   ├── DDP/
│       │   │   ├── ddp_race_condition_test.py
│       │   │   └── run_race_test.sh
│       │   ├── amp_master_params/
│       │   │   ├── amp_master_params.py
│       │   │   ├── compare.py
│       │   │   └── run.sh
│       │   └── synced_batchnorm/
│       │       ├── single_gpu_unit_test.py
│       │       ├── test_groups.py
│       │       ├── two_gpu_unit_test.py
│       │       └── unit_test.sh
│       └── docker_extension_builds/
│           └── run.sh
├── jukebox/
│   ├── Interacting_with_Jukebox.ipynb
│   ├── __init__.py
│   ├── align.py
│   ├── data/
│   │   ├── __init__.py
│   │   ├── artist_genre_processor.py
│   │   ├── data_processor.py
│   │   ├── files_dataset.py
│   │   ├── ids/
│   │   │   ├── v2_artist_ids.txt
│   │   │   ├── v2_genre_ids.txt
│   │   │   ├── v3_artist_ids.txt
│   │   │   └── v3_genre_ids.txt
│   │   ├── labels.py
│   │   └── text_processor.py
│   ├── hparams.py
│   ├── lyricdict.py
│   ├── make_models.py
│   ├── prior/
│   │   ├── __init__.py
│   │   ├── autoregressive.py
│   │   ├── conditioners.py
│   │   └── prior.py
│   ├── sample.py
│   ├── save_html.py
│   ├── tests/
│   │   └── test_sample.py
│   ├── train.py
│   ├── transformer/
│   │   ├── __init__.py
│   │   ├── factored_attention.py
│   │   ├── ops.py
│   │   └── transformer.py
│   ├── utils/
│   │   ├── __init__.py
│   │   ├── audio_utils.py
│   │   ├── checkpoint.py
│   │   ├── dist_adapter.py
│   │   ├── dist_utils.py
│   │   ├── ema.py
│   │   ├── fp16.py
│   │   ├── io.py
│   │   ├── logger.py
│   │   ├── remote_utils.py
│   │   ├── sample_utils.py
│   │   └── torch_utils.py
│   └── vqvae/
│       ├── __init__.py
│       ├── bottleneck.py
│       ├── encdec.py
│       ├── resnet.py
│       └── vqvae.py
├── requirements.txt
├── setup.py
└── tensorboardX/
    ├── .codecov.yml
    ├── .flake8
    ├── .github/
    │   └── ISSUE_TEMPLATE/
    │       ├── bug_report.md
    │       └── feature-requests-or-general-questions.md
    ├── .gitignore
    ├── .travis.yml
    ├── HISTORY.rst
    ├── LICENSE
    ├── MANIFEST.in
    ├── README.md
    ├── compile.sh
    ├── docs/
    │   ├── Makefile
    │   ├── conf.py
    │   ├── index.rst
    │   ├── tensorboard.rst
    │   ├── tutorial.rst
    │   ├── tutorial_zh.rst
    │   └── utils.rst
    ├── examples/
    │   ├── RUN_AFTER_PIP_INSTALL
    │   ├── __init__.py
    │   ├── chainer/
    │   │   ├── extension_logger/
    │   │   │   ├── net.py
    │   │   │   ├── train_dcgan.py
    │   │   │   ├── updater.py
    │   │   │   ├── visualize.py
    │   │   │   └── writetensorboard.py
    │   │   └── plain_logger/
    │   │       ├── data.py
    │   │       ├── net.py
    │   │       └── train_vae.py
    │   ├── demo.py
    │   ├── demo_beholder.py
    │   ├── demo_caffe2.py
    │   ├── demo_custom_scalars.py
    │   ├── demo_embedding.py
    │   ├── demo_graph.py
    │   ├── demo_hparams.py
    │   ├── demo_matplotlib.py
    │   ├── demo_multiple_embedding.py
    │   ├── demo_nvidia_smi.py
    │   ├── demo_onnx.py
    │   └── demo_purge.py
    ├── setup.cfg
    ├── setup.py
    ├── tensorboardX/
    │   ├── __init__.py
    │   ├── beholder/
    │   │   ├── __init__.py
    │   │   ├── beholder.py
    │   │   ├── file_system_tools.py
    │   │   ├── shared_config.py
    │   │   └── video_writing.py
    │   ├── caffe2_graph.py
    │   ├── crc32c.py
    │   ├── embedding.py
    │   ├── event_file_writer.py
    │   ├── onnx_graph.py
    │   ├── proto/
    │   │   ├── __init__.py
    │   │   ├── api.proto
    │   │   ├── api_pb2.py
    │   │   ├── attr_value.proto
    │   │   ├── attr_value_pb2.py
    │   │   ├── event.proto
    │   │   ├── event_pb2.py
    │   │   ├── graph.proto
    │   │   ├── graph_pb2.py
    │   │   ├── layout.proto
    │   │   ├── layout_pb2.py
    │   │   ├── node_def.proto
    │   │   ├── node_def_pb2.py
    │   │   ├── plugin_hparams.proto
    │   │   ├── plugin_hparams_pb2.py
    │   │   ├── plugin_mesh.proto
    │   │   ├── plugin_mesh_pb2.py
    │   │   ├── plugin_pr_curve.proto
    │   │   ├── plugin_pr_curve_pb2.py
    │   │   ├── plugin_text.proto
    │   │   ├── plugin_text_pb2.py
    │   │   ├── resource_handle.proto
    │   │   ├── resource_handle_pb2.py
    │   │   ├── step_stats.proto
    │   │   ├── step_stats_pb2.py
    │   │   ├── summary.proto
    │   │   ├── summary_pb2.py
    │   │   ├── tensor.proto
    │   │   ├── tensor_pb2.py
    │   │   ├── tensor_shape.proto
    │   │   ├── tensor_shape_pb2.py
    │   │   ├── types.proto
    │   │   ├── types_pb2.py
    │   │   ├── versions.proto
    │   │   └── versions_pb2.py
    │   ├── proto_graph.py
    │   ├── pytorch_graph.py
    │   ├── record_writer.py
    │   ├── summary.py
    │   ├── torchvis.py
    │   ├── utils.py
    │   ├── visdom_writer.py
    │   ├── writer.py
    │   └── x2num.py
    ├── tensorboardX.patch
    └── tests/
        ├── __init__.py
        ├── event_file_writer_test.py
        ├── expect/
        │   ├── caffe_mnist.expect
        │   ├── caffe_overfeat.expect
        │   ├── test_caffe2.test_simple_cnnmodel.expect
        │   ├── test_caffe2.test_simple_model.expect
        │   ├── test_pr_curve.test_pr_purve.expect
        │   ├── test_pr_curve.test_pr_purve_raw.expect
        │   ├── test_summary.test_audio.expect
        │   ├── test_summary.test_custom_scalars.expect
        │   ├── test_summary.test_float32_image.expect
        │   ├── test_summary.test_histogram_auto.expect
        │   ├── test_summary.test_histogram_doane.expect
        │   ├── test_summary.test_histogram_fd.expect
        │   ├── test_summary.test_hparams.expect
        │   ├── test_summary.test_image_with_3_channel_batched.expect
        │   ├── test_summary.test_image_with_boxes.expect
        │   ├── test_summary.test_image_with_four_channel.expect
        │   ├── test_summary.test_image_with_four_channel_batched.expect
        │   ├── test_summary.test_image_with_one_channel.expect
        │   ├── test_summary.test_image_with_one_channel_batched.expect
        │   ├── test_summary.test_image_without_channel.expect
        │   ├── test_summary.test_mesh.expect
        │   ├── test_summary.test_text.expect
        │   ├── test_summary.test_uint8_image.expect
        │   └── test_summary.test_video.expect
        ├── expect_reader.py
        ├── record_writer_test.py
        ├── test_beholder.py
        ├── test_caffe2.py
        ├── test_chainer_np.py
        ├── test_crc32c.py
        ├── test_embedding.py
        ├── test_figure.py
        ├── test_numpy.py
        ├── test_onnx_graph.py
        ├── test_pr_curve.py
        ├── test_pytorch_graph.py
        ├── test_pytorch_np.py
        ├── test_record_writer.py
        ├── test_summary.py
        ├── test_summary_writer.py
        ├── test_test.py
        ├── test_utils.py
        ├── test_visdom.py
        └── test_writer.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
# Global
.DS_Store
.idea

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

================================================
FILE: LICENSE
================================================
Noncommercial Use License

Software Copyright (c) 2020 OpenAI

We don’t claim ownership of the content you create with Jukebox.
We only ask that you use Jukebox responsibly and clearly indicate your content was created using OpenAI’s Jukebox.

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the Software, including without limitation the rights to use, copy,
modify, merge, publish, distribute, and/or sublicense copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:

No portion of the Software, nor any content created with the Software, may be used for commercial purposes.

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

The above copyright notice and this permission notice need not be included with content created by the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
WARRANTIES OF MERCHANTABILITY,FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

================================================
FILE: MANIFEST.in
================================================
recursive-include jukebox *.py
recursive-include jukebox *.txt


================================================
FILE: README.md
================================================
**Status:** Archive (code is provided as-is, no updates expected)

# Jukebox
Code for "Jukebox: A Generative Model for Music"

[Paper](https://arxiv.org/abs/2005.00341) 
[Blog](https://openai.com/blog/jukebox) 
[Explorer](http://jukebox.openai.com/) 
[Colab](https://colab.research.google.com/github/openai/jukebox/blob/master/jukebox/Interacting_with_Jukebox.ipynb) 

# Install
Install the conda package manager from https://docs.conda.io/en/latest/miniconda.html    
    
``` 
# Required: Sampling
conda create --name jukebox python=3.7.5
conda activate jukebox
conda install mpi4py=3.0.3 # if this fails, try: pip install mpi4py==3.0.3
conda install pytorch=1.4 torchvision=0.5 cudatoolkit=10.0 -c pytorch
git clone https://github.com/openai/jukebox.git
cd jukebox
pip install -r requirements.txt
pip install -e .

# Required: Training
conda install av=7.0.01 -c conda-forge 
pip install ./tensorboardX
 
# Optional: Apex for faster training with fused_adam
conda install pytorch=1.1 torchvision=0.3 cudatoolkit=10.0 -c pytorch
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex
```

# Sampling
## Sampling from scratch
To sample normally, run the following command. Model can be `5b`, `5b_lyrics`, `1b_lyrics`
``` 
python jukebox/sample.py --model=5b_lyrics --name=sample_5b --levels=3 --sample_length_in_seconds=20 \
--total_sample_length_in_seconds=180 --sr=44100 --n_samples=6 --hop_fraction=0.5,0.5,0.125
```
``` 
python jukebox/sample.py --model=1b_lyrics --name=sample_1b --levels=3 --sample_length_in_seconds=20 \
--total_sample_length_in_seconds=180 --sr=44100 --n_samples=16 --hop_fraction=0.5,0.5,0.125
```
The above generates the first `sample_length_in_seconds` seconds of audio from a song of total length `total_sample_length_in_seconds`.
To use multiple GPUs, launch the above scripts as `mpiexec -n {ngpus} python jukebox/sample.py ...` so they use `{ngpus}` GPUs.

The samples decoded from each level are stored in `{name}/level_{level}`. 
You can also view the samples as an html with the aligned lyrics under `{name}/level_{level}/index.html`. 
Run `python -m http.server` and open the html through the server to see the lyrics animate as the song plays.  
A summary of all sampling data including zs, x, labels and sampling_kwargs is stored in `{name}/level_{level}/data.pth.tar`.

The hps are for a V100 GPU with 16 GB GPU memory. The `1b_lyrics`, `5b`, and `5b_lyrics` top-level priors take up 
3.8 GB, 10.3 GB, and 11.5 GB, respectively. The peak memory usage to store transformer key, value cache is about 400 MB 
for `1b_lyrics` and 1 GB for `5b_lyrics` per sample. If you are having trouble with CUDA OOM issues, try `1b_lyrics` or 
decrease `max_batch_size` in sample.py, and `--n_samples` in the script call.

On a V100, it takes about 3 hrs to fully sample 20 seconds of music. Since this is a long time, it is recommended to use `n_samples > 1` so you can generate as many samples as possible in parallel. The 1B lyrics and upsamplers can process 16 samples at a time, while 5B can fit only up to 3. Since the vast majority of time is spent on upsampling, we recommend using a multiple of 3 less than 16 like `--n_samples 15` for `5b_lyrics`. This will make the top-level generate samples in groups of three while upsampling is done in one pass.

To continue sampling from already generated codes for a longer duration, you can run
```
python jukebox/sample.py --model=5b_lyrics --name=sample_5b --levels=3 --mode=continue \
--codes_file=sample_5b/level_0/data.pth.tar --sample_length_in_seconds=40 --total_sample_length_in_seconds=180 \
--sr=44100 --n_samples=6 --hop_fraction=0.5,0.5,0.125
```
Here, we take the 20 seconds samples saved from the first sampling run at `sample_5b/level_0/data.pth.tar` and continue by adding 20 more seconds. 

You could also continue directly from the level 2 saved outputs, just pass `--codes_file=sample_5b/level_2/data.pth.tar`.
 Note this will upsample the full 40 seconds song at the end.

If you stopped sampling at only the first level and want to upsample the saved codes, you can run
```
python jukebox/sample.py --model=5b_lyrics --name=sample_5b --levels=3 --mode=upsample \
--codes_file=sample_5b/level_2/data.pth.tar --sample_length_in_seconds=20 --total_sample_length_in_seconds=180 \
--sr=44100 --n_samples=6 --hop_fraction=0.5,0.5,0.125
```
Here, we take the 20 seconds samples saved from the first sampling run at `sample_5b/level_2/data.pth.tar` and upsample the lower two levels.

## Prompt with your own music
If you want to prompt the model with your own creative piece or any other music, first save them as wave files and run
```
python jukebox/sample.py --model=5b_lyrics --name=sample_5b_prompted --levels=3 --mode=primed \
--audio_file=path/to/recording.wav,awesome-mix.wav,fav-song.wav,etc.wav --prompt_length_in_seconds=12 \
--sample_length_in_seconds=20 --total_sample_length_in_seconds=180 --sr=44100 --n_samples=6 --hop_fraction=0.5,0.5,0.125
```
This will load the four files, tile them to fill up to `n_samples` batch size, and prime the model with the first `prompt_length_in_seconds` seconds.

# Training
## VQVAE
To train a small vqvae, run
```
mpiexec -n {ngpus} python jukebox/train.py --hps=small_vqvae --name=small_vqvae --sample_length=262144 --bs=4 \
--audio_files_dir={audio_files_dir} --labels=False --train --aug_shift --aug_blend
```
Here, `{audio_files_dir}` is the directory in which you can put the audio files for your dataset, and `{ngpus}` is the number of GPUs you want to use to train. 
The above trains a two-level VQ-VAE with `downs_t = (5,3)`, and `strides_t = (2, 2)` meaning we downsample the audio by `2**5 = 32` to get the first level of codes, and `2**8 = 256` to get the second level codes.  
Checkpoints are stored in the `logs` folder. You can monitor the training by running Tensorboard
```
tensorboard --logdir logs
```
    
## Prior
### Train prior or upsamplers
Once the VQ-VAE is trained, we can restore it from its saved checkpoint and train priors on the learnt codes. 
To train the top-level prior, we can run

```
mpiexec -n {ngpus} python jukebox/train.py --hps=small_vqvae,small_prior,all_fp16,cpu_ema --name=small_prior \
--sample_length=2097152 --bs=4 --audio_files_dir={audio_files_dir} --labels=False --train --test --aug_shift --aug_blend \
--restore_vqvae=logs/small_vqvae/checkpoint_latest.pth.tar --prior --levels=2 --level=1 --weight_decay=0.01 --save_iters=1000
```

To train the upsampler, we can run
```
mpiexec -n {ngpus} python jukebox/train.py --hps=small_vqvae,small_upsampler,all_fp16,cpu_ema --name=small_upsampler \
--sample_length=262144 --bs=4 --audio_files_dir={audio_files_dir} --labels=False --train --test --aug_shift --aug_blend \
--restore_vqvae=logs/small_vqvae/checkpoint_latest.pth.tar --prior --levels=2 --level=0 --weight_decay=0.01 --save_iters=1000
```
We pass `sample_length = n_ctx * downsample_of_level` so that after downsampling the tokens match the n_ctx of the prior hps. 
Here, `n_ctx = 8192` and `downsamples = (32, 256)`, giving `sample_lengths = (8192 * 32, 8192 * 256) = (262144, 2097152)` respectively for the bottom and top level. 

### Learning rate annealing
To get the best sample quality, anneal the learning rate to 0 near the end of training. To do so, continue training from the latest 
checkpoint and run with
```
--restore_prior="path/to/checkpoint" --lr_use_linear_decay --lr_start_linear_decay={already_trained_steps} --lr_decay={decay_steps_as_needed}
```

### Reuse pre-trained VQ-VAE and train top-level prior on new dataset from scratch.
#### Train without labels
Our pre-trained VQ-VAE can produce compressed codes for a wide variety of genres of music, and the pre-trained upsamplers 
can upsample them back to audio that sound very similar to the original audio.
To re-use these for a new dataset of your choice, you can retrain just the top-level prior.

To train top-level on a new dataset, run
```
mpiexec -n {ngpus} python jukebox/train.py --hps=vqvae,small_prior,all_fp16,cpu_ema --name=pretrained_vqvae_small_prior \
--sample_length=1048576 --bs=4 --aug_shift --aug_blend --audio_files_dir={audio_files_dir} \
--labels=False --train --test --prior --levels=3 --level=2 --weight_decay=0.01 --save_iters=1000
```
Training the `small_prior` with a batch size of 2, 4, and 8 requires 6.7 GB, 9.3 GB, and 15.8 GB of GPU memory, respectively. A few days to a week of training typically yields reasonable samples when the dataset is homogeneous (e.g. all piano pieces, songs of the same style, etc).

Near the end of training, follow [this](#learning-rate-annealing) to anneal the learning rate to 0

#### Sample from new model
You can then run sample.py with the top-level of our models replaced by your new model. To do so,
- Add an entry `my_model=("vqvae", "upsampler_level_0", "upsampler_level_1", "small_prior")` in `MODELS` in `make_models.py`. 
- Update the `small_prior` dictionary in `hparams.py` to include `restore_prior='path/to/checkpoint'`. If you
changed any hps directly in the command line script (e.g. `heads`), make sure to update them in the dictionary too so 
that `make_models` restores our checkpoint correctly.
- Run sample.py as outlined in the sampling section, but now with `--model=my_model` 

For example, let's say we trained `small_vqvae`, `small_prior`, and `small_upsampler` under `/path/to/jukebox/logs`. In `make_models.py`, we are going to declare a tuple of the new models as `my_model`.
```
MODELS = {
    '5b': ("vqvae", "upsampler_level_0", "upsampler_level_1", "prior_5b"),
    '5b_lyrics': ("vqvae", "upsampler_level_0", "upsampler_level_1", "prior_5b_lyrics"),
    '1b_lyrics': ("vqvae", "upsampler_level_0", "upsampler_level_1", "prior_1b_lyrics"),
    'my_model': ("my_small_vqvae", "my_small_upsampler", "my_small_prior"),
}
```

Next, in `hparams.py`, we add them to the registry with the corresponding `restore_` paths and any other command line options used during training. Another important note is that for top-level priors with lyric conditioning, we have to locate a self-attention layer that shows alignment between the lyric and music tokens. Look for layers where `prior.prior.transformer._attn_mods[layer].attn_func` is either 6 or 7. If your model is starting to sing along to the lyrics, it means some (layer, head) pair has learned alignment. Congrats!
```
my_small_vqvae = Hyperparams(
    restore_vqvae='/path/to/jukebox/logs/small_vqvae/checkpoint_some_step.pth.tar',
)
my_small_vqvae.update(small_vqvae)
HPARAMS_REGISTRY["my_small_vqvae"] = my_small_vqvae

my_small_prior = Hyperparams(
    restore_prior='/path/to/jukebox/logs/small_prior/checkpoint_latest.pth.tar',
    level=1,
    labels=False,
    # TODO For the two lines below, if `--labels` was used and the model is
    # trained with lyrics, find and enter the layer, head pair that has learned
    # alignment.
    alignment_layer=47,
    alignment_head=0,
)
my_small_prior.update(small_prior)
HPARAMS_REGISTRY["my_small_prior"] = my_small_prior

my_small_upsampler = Hyperparams(
    restore_prior='/path/to/jukebox/logs/small_upsampler/checkpoint_latest.pth.tar',
    level=0,
    labels=False,
)
my_small_upsampler.update(small_upsampler)
HPARAMS_REGISTRY["my_small_upsampler"] = my_small_upsampler
```

#### Train with labels 
To train with your own metadata for your audio files, implement `get_metadata` in `data/files_dataset.py` to return the 
`artist`, `genre` and `lyrics` for a given audio file. For now, you can pass `''` for lyrics to not use any lyrics.

For training with labels, we'll use `small_labelled_prior` in `hparams.py`, and we set `labels=True,labels_v3=True`. 
We use 2 kinds of labels information:
- Artist/Genre: 
  - For each file, we return an artist_id and a list of genre_ids. The reason we have a list and not a single genre_id 
  is that in v2, we split genres like `blues_rock` into a bag of words `[blues, rock]`, and we pass at most 
  `max_bow_genre_size` of those, in `v3` we consider it as a single word and just set `max_bow_genre_size=1`.
  - Update the `v3_artist_ids` and `v3_genre_ids` to use ids from your new dataset. 
  - In `small_labelled_prior`, set the hps `y_bins = (number_of_genres, number_of_artists)` and `max_bow_genre_size=1`. 
- Timing: 
  - For each chunk of audio, we return the `total_length` of the song, the `offset` the current audio chunk is at and 
  the `sample_length` of the audio chunk. We have three timing embeddings: total_length, our current position, and our 
  current position as a fraction of the total length, and we divide the range of these values into `t_bins` discrete bins. 
  - In `small_labelled_prior`, set the hps `min_duration` and `max_duration` to be the shortest/longest duration of audio 
  files you want for your dataset, and `t_bins` for how many bins you want to discretize timing information into. Note 
  `min_duration * sr` needs to be at least `sample_length` to have an audio chunk in it.

After these modifications, to train a top-level with labels, run
```
mpiexec -n {ngpus} python jukebox/train.py --hps=vqvae,small_labelled_prior,all_fp16,cpu_ema --name=pretrained_vqvae_small_prior_labels \
--sample_length=1048576 --bs=4 --aug_shift --aug_blend --audio_files_dir={audio_files_dir} \
--labels=True --train --test --prior --levels=3 --level=2 --weight_decay=0.01 --save_iters=1000
```

For sampling, follow same instructions as [above](#sample-from-new-model) but use `small_labelled_prior` instead of `small_prior`.  

#### Train with lyrics
To train in addition with lyrics, update `get_metadata` in `data/files_dataset.py` to return `lyrics` too.
For training with lyrics, we'll use `small_single_enc_dec_prior` in `hparams.py`. 
- Lyrics: 
  - For each file, we linearly align the lyric characters to the audio, find the position in lyric that corresponds to 
  the midpoint of our audio chunk, and pass a window of `n_tokens` lyric characters centred around that. 
  - In `small_single_enc_dec_prior`, set the hps `use_tokens=True` and `n_tokens` to be the number of lyric characters 
  to use for an audio chunk. Set it according to the `sample_length` you're training on so that it's large enough that 
  the lyrics for an audio chunk are almost always found inside a window of that size.
  - If you use a non-English vocabulary, update `text_processor.py` with your new vocab and set
  `n_vocab = number of characters in vocabulary` accordingly in `small_single_enc_dec_prior`. In v2, we had `n_vocab=80`, 
  and in v3 we missed the `+` character, so `n_vocab=79`. 

After these modifications, to train a top-level with labels and lyrics, run
```
mpiexec -n {ngpus} python jukebox/train.py --hps=vqvae,small_single_enc_dec_prior,all_fp16,cpu_ema --name=pretrained_vqvae_small_single_enc_dec_prior_labels \
--sample_length=786432 --bs=4 --aug_shift --aug_blend --audio_files_dir={audio_files_dir} \
--labels=True --train --test --prior --levels=3 --level=2 --weight_decay=0.01 --save_iters=1000
```
To simplify hps choices, here we used a `single_enc_dec` model like the `1b_lyrics` model that combines both encoder and 
decoder of the transformer into a single model. We do so by merging the lyric vocab and vq-vae vocab into a single 
larger vocab, and flattening the lyric tokens and the vq-vae codes into a single sequence of length `n_ctx + n_tokens`. 
This uses `attn_order=12` which includes `prime_attention` layers with keys/values from lyrics and queries from audio. 
If you instead want to use a model with the usual encoder-decoder style transformer, use `small_sep_enc_dec_prior`.

For sampling, follow same instructions as [above](#sample-from-new-model) but use `small_single_enc_dec_prior` instead of 
`small_prior`. To also get the alignment between lyrics and samples in the saved html, you'll need to set `alignment_layer` 
and `alignment_head` in `small_single_enc_dec_prior`. To find which layer/head is best to use, run a forward pass on a training example,
save the attention weight tensors for all prime_attention layers, and pick the (layer, head) which has the best linear alignment 
pattern between the lyrics keys and music queries. 

### Fine-tune pre-trained top-level prior to new style(s)
Previously, we showed how to train a small top-level prior from scratch. Assuming you have a GPU with at least 15 GB of memory and support for fp16, you could fine-tune from our pre-trained 1B top-level prior. Here are the steps:

- Support `--labels=True` by implementing `get_metadata` in `jukebox/data/files_dataset.py` for your dataset.
- Add new entries in `jukebox/data/ids`. We recommend replacing existing mappings (e.g. rename `"unknown"`, etc with styles of your choice). This uses the pre-trained style vectors as initialization and could potentially save some compute.

After these modifications, run 
```
mpiexec -n {ngpus} python jukebox/train.py --hps=vqvae,prior_1b_lyrics,all_fp16,cpu_ema --name=finetuned \
--sample_length=1048576 --bs=1 --aug_shift --aug_blend --audio_files_dir={audio_files_dir} \
--labels=True --train --test --prior --levels=3 --level=2 --weight_decay=0.01 --save_iters=1000
```
To get the best sample quality, it is recommended to anneal the learning rate in the end. Training the 5B top-level requires GPipe which is not supported in this release.

# Citation

Please cite using the following bibtex entry:

```
@article{dhariwal2020jukebox,
  title={Jukebox: A Generative Model for Music},
  author={Dhariwal, Prafulla and Jun, Heewoo and Payne, Christine and Kim, Jong Wook and Radford, Alec and Sutskever, Ilya},
  journal={arXiv preprint arXiv:2005.00341},
  year={2020}
}
```

# License 
[Noncommercial Use License](./LICENSE) 

It covers both released code and weights. 



================================================
FILE: apex/.gitignore
================================================
apex.egg-info
dist
build
docs/build
*~

================================================
FILE: apex/.nojekyll
================================================


================================================
FILE: apex/LICENSE
================================================
All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

================================================
FILE: apex/README.md
================================================
# Introduction

This repository holds NVIDIA-maintained utilities to streamline 
mixed precision and distributed training in Pytorch. 
Some of the code here will be included in upstream Pytorch eventually.
The intention of Apex is to make up-to-date utilities available to 
users as quickly as possible.

## Full API Documentation: [https://nvidia.github.io/apex](https://nvidia.github.io/apex)

# Contents

## 1. Amp:  Automatic Mixed Precision

`apex.amp` is a tool to enable mixed precision training by changing only 3 lines of your script.
Users can easily experiment with different pure and mixed precision training modes by supplying
different flags to `amp.initialize`.

[Webinar introducing Amp](https://info.nvidia.com/webinar-mixed-precision-with-pytorch-reg-page.html)
(The flag `cast_batchnorm` has been renamed to `keep_batchnorm_fp32`).

[API Documentation](https://nvidia.github.io/apex/amp.html)

[Comprehensive Imagenet example](https://github.com/NVIDIA/apex/tree/master/examples/imagenet)

[DCGAN example coming soon...](https://github.com/NVIDIA/apex/tree/master/examples/dcgan)

[Moving to the new Amp API](https://nvidia.github.io/apex/amp.html#transition-guide-for-old-api-users) (for users of the deprecated "Amp" and "FP16_Optimizer" APIs)

## 2. Distributed Training

`apex.parallel.DistributedDataParallel` is a module wrapper, similar to 
`torch.nn.parallel.DistributedDataParallel`.  It enables convenient multiprocess distributed training,
optimized for NVIDIA's NCCL communication library.

[API Documentation](https://nvidia.github.io/apex/parallel.html)

[Python Source](https://github.com/NVIDIA/apex/tree/master/apex/parallel)

[Example/Walkthrough](https://github.com/NVIDIA/apex/tree/master/examples/simple/distributed)

The [Imagenet example](https://github.com/NVIDIA/apex/tree/master/examples/imagenet)
shows use of `apex.parallel.DistributedDataParallel` along with `apex.amp`.

### Synchronized Batch Normalization

`apex.parallel.SyncBatchNorm` extends `torch.nn.modules.batchnorm._BatchNorm` to
support synchronized BN.
It allreduces stats across processes during multiprocess (DistributedDataParallel) training.
Synchronous BN has been used in cases where only a small
local minibatch can fit on each GPU.
Allreduced stats increase the effective batch size for the BN layer to the
global batch size across all processes (which, technically, is the correct
formulation).
Synchronous BN has been observed to improve converged accuracy in some of our research models.

# Requirements

Python 3

CUDA 9 or newer

PyTorch 0.4 or newer.  The CUDA and C++ extensions require pytorch 1.0 or newer.

We recommend the latest stable release, obtainable from
[https://pytorch.org/](https://pytorch.org/).  We also test against the latest master branch, obtainable from [https://github.com/pytorch/pytorch](https://github.com/pytorch/pytorch).

It's often convenient to use Apex in Docker containers.  Compatible options include:
* [NVIDIA Pytorch containers from NGC](https://ngc.nvidia.com/catalog/containers/nvidia%2Fpytorch), which come with Apex preinstalled.  To use the latest Amp API, you may need to `pip uninstall apex` then reinstall Apex using the **Quick Start** commands below.
* [official Pytorch -devel Dockerfiles](https://hub.docker.com/r/pytorch/pytorch/tags), e.g. `docker pull pytorch/pytorch:nightly-devel-cuda10.0-cudnn7`, in which you can install Apex using the **Quick Start** commands.

See the [Docker example folder](https://github.com/NVIDIA/apex/tree/master/examples/docker) for details.

# Quick Start

### Linux

For performance and full functionality, we recommend installing Apex with
CUDA and C++ extensions via
```
$ git clone https://github.com/NVIDIA/apex
$ cd apex
$ pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
```

Apex also supports a Python-only build (required with Pytorch 0.4) via
```
$ pip install -v --no-cache-dir .
```
A Python-only build omits:
- Fused kernels required to use `apex.optimizers.FusedAdam`.
- Fused kernels required to use `apex.normalization.FusedLayerNorm`.
- Fused kernels that improve the performance and numerical stability of `apex.parallel.SyncBatchNorm`.
- Fused kernels that improve the performance of `apex.parallel.DistributedDataParallel` and `apex.amp`.
`DistributedDataParallel`, `amp`, and `SyncBatchNorm` will still be usable, but they may be slower.

### Windows support
Windows support is experimental, and Linux is recommended.  `pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .` may work if you were able to build Pytorch from source
on your system.  `pip install -v --no-cache-dir .` (without CUDA/C++ extensions) is more likely to work.  If you installed Pytorch in a Conda environment, make sure to install Apex in that same environment.


================================================
FILE: apex/apex/RNN/README.md
================================================
Under construction...


================================================
FILE: apex/apex/RNN/RNNBackend.py
================================================
import torch
import torch.nn as nn
from torch.autograd import Variable

import torch.nn.functional as F

import math


def is_iterable(maybe_iterable):
    """
    Return True when ``maybe_iterable`` is a ``list`` or a ``tuple``.

    Despite the name, every other kind of iterable (generators, strings,
    tensors, ...) is reported as False.
    """
    return isinstance(maybe_iterable, (list, tuple))


def flatten_list(tens_list):
    """
    Merge a list/tuple of equally shaped tensors into a single tensor.

    The tensors are concatenated along dim 0 and then viewed with a new
    leading dimension of size ``len(tens_list)`` followed by the shape of the
    first element. Anything that is not a list or tuple is returned untouched.
    """
    if is_iterable(tens_list):
        joined = torch.cat(tens_list, dim=0)
        count = len(tens_list)
        item_shape = tens_list[0].size()
        return joined.view(count, *item_shape)

    return tens_list

    
#These modules index dim 0 of the input as the sequence and dim 1 as the batch (i.e. they do NOT assume batch_first)
class bidirectionalRNN(nn.Module):
    """
    Two stacked RNNs run over the same input in opposite time directions.

    ``inputRNN`` seeds the forward stack and ``inputRNN.new_like()`` seeds the
    backward stack; both are wrapped in ``stackedRNN`` with the same
    ``num_layers`` and ``dropout``.
    """
    def __init__(self, inputRNN, num_layers=1, dropout = 0):
        super(bidirectionalRNN, self).__init__()
        self.dropout = dropout
        self.fwd = stackedRNN(inputRNN, num_layers=num_layers, dropout = dropout)
        self.bckwrd = stackedRNN(inputRNN.new_like(), num_layers=num_layers, dropout = dropout)
        # Kept in a ModuleList so the bulk helpers below can iterate both directions.
        self.rnns = nn.ModuleList([self.fwd, self.bckwrd])

    #collect hidden option will return all hidden/cell states from entire RNN
    def forward(self, input, collect_hidden=False):
        """
        Run both directions over ``input`` and join their results.

        Returns ``(output, hiddens)``: ``output`` is the forward and backward
        outputs concatenated on the last dimension, and ``hiddens`` is a tuple
        holding, for each hidden-state slot, the forward/backward pair
        concatenated on the last dimension.
        """
        fwd_out, fwd_hiddens = self.fwd(input, collect_hidden = collect_hidden)
        bckwrd_out, bckwrd_hiddens = self.bckwrd(input, reverse=True, collect_hidden = collect_hidden)

        output = torch.cat( [fwd_out, bckwrd_out], -1 )
        # NOTE(review): torch.cat needs each paired entry to be a tensor -- confirm
        # that stackedRNN.forward really returns tensors for its hidden states.
        hiddens = tuple( torch.cat(hidden, -1) for hidden in zip( fwd_hiddens, bckwrd_hiddens) )

        return output, hiddens

    def reset_parameters(self):
        """
        Re-initialise the parameters of both directions.
        """
        for rnn in self.rnns:
            rnn.reset_parameters()

    def init_hidden(self, bsz):
        """
        Make sure both directions hold hidden states for batch size ``bsz``.
        """
        for rnn in self.rnns:
            rnn.init_hidden(bsz)

    def detach_hidden(self):
        """
        Detach the hidden states of both directions from the autograd graph.
        """
        for rnn in self.rnns:
            # stackedRNN exposes ``detach_hidden``; the previous ``detachHidden``
            # spelling does not exist there and raised AttributeError.
            rnn.detach_hidden()

    def reset_hidden(self, bsz):
        """
        Discard and re-create the hidden states of both directions for ``bsz``.
        """
        for rnn in self.rnns:
            rnn.reset_hidden(bsz)

    def init_inference(self, bsz):
        """
        Forward ``init_inference(bsz)`` to both directions.
        """
        for rnn in self.rnns:
            rnn.init_inference(bsz)

   
#assumes hidden_state[0] of inputRNN is output hidden state
#constructor either takes an RNNCell or list of RNN layers
class stackedRNN(nn.Module):        
    """
    Stack of recurrent layers that is stepped through the input one time step
    at a time.

    ``inputRNN`` is either a single ``RNNCell`` (the remaining
    ``num_layers - 1`` layers are created with ``new_like`` and take the
    cell's ``output_size`` as their input size) or a list of exactly
    ``num_layers`` layers. Element 0 of each layer's returned state is treated
    as that layer's output and fed to the next layer.
    """
    def __init__(self, inputRNN, num_layers=1, dropout=0):
        super(stackedRNN, self).__init__()
        
        # Only stored; nothing in this class reads it back or applies dropout.
        self.dropout = dropout
        
        if isinstance(inputRNN, RNNCell):
            self.rnns = [inputRNN]
            for i in range(num_layers-1):
                # Upper layers consume the output of the layer below.
                self.rnns.append(inputRNN.new_like(inputRNN.output_size))
        elif isinstance(inputRNN, list):
            assert len(inputRNN) == num_layers, "RNN list length must be equal to num_layers"
            self.rnns=inputRNN
        else:
            raise RuntimeError()
        
        self.nLayers = len(self.rnns)
        
        # Register the layers so their parameters are tracked by nn.Module.
        self.rnns = nn.ModuleList(self.rnns)


    '''
    Returns output as hidden_state[0] Tensor([sequence steps][batch size][features])
    If collect hidden will also return Tuple(
        [n_hidden_states][sequence steps] Tensor([layer][batch size][features])
    )
    If not collect hidden will also return Tuple(
        [n_hidden_states] Tensor([layer][batch size][features])
    '''
    def forward(self, input, collect_hidden=False, reverse=False):
        """
        Step every layer over ``input`` (dim 0 = sequence, dim 1 = batch).

        Args:
            input: sequence-first input; ``input[seq]`` is fed to layer 0.
            collect_hidden: keep the states of every time step instead of a
                single step.
            reverse: walk the sequence from the last index to the first.

        Returns:
            ``(output, hidden_states)`` where ``output`` stacks the top layer's
            output of every step in original sequence order.
        """
        seq_len = input.size(0)
        bsz = input.size(1)
        inp_iter = reversed(range(seq_len)) if reverse else range(seq_len)

        hidden_states = [[] for i in range(self.nLayers)]
        outputs = []

        for seq in inp_iter:
            for layer in range(self.nLayers):

                if layer == 0:
                    prev_out = input[seq]
                    
                outs = self.rnns[layer](prev_out)

                if collect_hidden:
                    hidden_states[layer].append(outs)
                # NOTE(review): this keeps the state produced at index seq_len-1.
                # With reverse=True that index is processed first, not last --
                # confirm this is the intended "final" state.
                elif seq == seq_len-1:
                    hidden_states[layer].append(outs)
                    
                prev_out = outs[0]

            outputs.append(prev_out)

        if reverse:
            # Restore original sequence order for the outputs.
            outputs = list(reversed(outputs))
        '''
        At this point outputs is in format:
        list( [seq_length] x Tensor([bsz][features]) )
        need to convert it to:
        list( Tensor([seq_length][bsz][features]) )
        '''
        output = flatten_list(outputs)

        '''
        hidden_states at this point is in format:
        list( [layer][seq_length][hidden_states] x Tensor([bsz][features]) )
        need to convert it to:
          For not collect hidden:
            list( [hidden_states] x Tensor([layer][bsz][features]) )
          For collect hidden:
            list( [hidden_states][seq_length] x Tensor([layer][bsz][features]) )
        '''
        if not collect_hidden:
            # Only one step was recorded per layer in this mode.
            seq_len = 1
        n_hid = self.rnns[0].n_hidden_states
        new_hidden = [ [ [ None for k in range(self.nLayers)] for j in range(seq_len) ] for i in range(n_hid) ]


        # Transpose [layer][step][state] into [state][step][layer].
        for i in range(n_hid):
            for j in range(seq_len):
                for k in range(self.nLayers):
                    new_hidden[i][j][k] = hidden_states[k][j][i]

        hidden_states = new_hidden
        #Now in format list( [hidden_states][seq_length][layer] x Tensor([bsz][features]) )
        #Reverse seq_length if reverse
        if reverse:
            hidden_states = list( list(reversed(list(entry))) for entry in hidden_states)

        #flatten layer dimension into tensor
        # NOTE(review): ``hiddens`` is never used after this point; the value
        # returned below is the un-flattened ``hidden_states`` (per-layer lists of
        # tensors), which does not match the format described in the comments.
        hiddens = list( list(
            flatten_list(seq) for seq in hidden )
                        for hidden in hidden_states )
        
        #Now in format list( [hidden_states][seq_length] x Tensor([layer][bsz][features]) )
        #Remove seq_length dimension if not collect_hidden
        if not collect_hidden:
            hidden_states = list( entry[0] for entry in hidden_states)
        return output, hidden_states
    
    def reset_parameters(self):
        """
        Re-initialise the parameters of every layer.
        """
        for rnn in self.rnns:
            rnn.reset_parameters()
        
    def init_hidden(self, bsz):
        """
        Ask every layer to make sure it holds hidden states for batch size ``bsz``.
        """
        for rnn in self.rnns:
            rnn.init_hidden(bsz)

    def detach_hidden(self):
        """
        Detach the hidden states of every layer.
        """
        for rnn in self.rnns:
            rnn.detach_hidden()
        
    def reset_hidden(self, bsz):
        """
        Ask every layer to discard and re-create its hidden states for ``bsz``.
        """
        for rnn in self.rnns:
            rnn.reset_hidden(bsz)

    def init_inference(self, bsz):    
        """ 
        Forward ``init_inference(bsz)`` to every layer.
        """
        for rnn in self.rnns:
            rnn.init_inference(bsz)

class RNNCell(nn.Module):
    """
    Stateful wrapper around a functional recurrent ``cell``.

    The module owns the weights (``w_ih``, ``w_hh``, optional biases and an
    optional output projection ``w_ho``) and remembers the hidden state
    between calls.

    gate_multiplier depends on the architecture: 4 for LSTM-like cells and 3
    for GRU-like cells.
    Input is always treated as NOT batch_first.
    When ``output_size`` differs from ``hidden_size`` the first hidden state
    is passed through the ``w_ho`` projection.
    ``n_hidden_states`` is the number of states the cell needs: with one state
    the cell receives a tensor, with more it receives a list.
    """
    def __init__(self, gate_multiplier, input_size, hidden_size, cell, n_hidden_states = 2, bias = False, output_size = None):
        super(RNNCell, self).__init__()

        self.gate_multiplier = gate_multiplier
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.cell = cell
        self.bias = bias
        # Without an explicit output size the cell emits hidden_size features.
        self.output_size = hidden_size if output_size is None else output_size

        self.gate_size = gate_multiplier * self.hidden_size
        self.n_hidden_states = n_hidden_states

        self.w_ih = nn.Parameter(torch.Tensor(self.gate_size, self.input_size))
        self.w_hh = nn.Parameter(torch.Tensor(self.gate_size, self.output_size))

        # A recurrent projection is only needed when the two sizes differ.
        has_projection = self.output_size != self.hidden_size
        if has_projection:
            self.w_ho = nn.Parameter(torch.Tensor(self.output_size, self.hidden_size))

        if self.bias:
            self.b_ih = nn.Parameter(torch.Tensor(self.gate_size))
            self.b_hh = nn.Parameter(torch.Tensor(self.gate_size))
        else:
            self.b_ih = None
            self.b_hh = None

        # One slot per hidden state; filled lazily by init_hidden().
        self.hidden = [None] * self.n_hidden_states

        self.reset_parameters()

    def new_like(self, new_input_size=None):
        """
        Build a fresh cell of the same type and configuration, optionally with
        a different input size.
        """
        input_size = self.input_size if new_input_size is None else new_input_size
        cell_type = type(self)
        return cell_type(
            self.gate_multiplier,
            input_size,
            self.hidden_size,
            self.cell,
            self.n_hidden_states,
            self.bias,
            self.output_size,
        )

    def reset_parameters(self, gain=1):
        """
        Fill every parameter uniformly in ``[-1/sqrt(hidden_size), 1/sqrt(hidden_size)]``.

        ``gain`` is accepted but not used by this initialisation.
        """
        bound = 1.0 / math.sqrt(self.hidden_size)
        for weight in self.parameters():
            weight.data.uniform_(-bound, bound)

    # An alternative Xavier-based reset (xavier_normal for parameters with more
    # than one dimension, uniform in +-1/sqrt(gate_size) otherwise) was kept
    # disabled in the original source.

    def init_hidden(self, bsz):
        """
        Create zero hidden states for batch size ``bsz`` wherever a state is
        missing or was built for a different batch size.
        """
        # Any parameter will do; it only serves as a template for new tensors.
        for candidate in self.parameters():
            if candidate is not None:
                template = candidate
                break

        for idx, state in enumerate(self.hidden):
            if state is not None and state.data.size()[0] == bsz:
                continue

            # State 0 is the (possibly projected) output; the rest are hidden_size wide.
            width = self.output_size if idx == 0 else self.hidden_size
            zeros = template.data.new(bsz, width).zero_()
            self.hidden[idx] = Variable(zeros, requires_grad=False)

    def reset_hidden(self, bsz):
        """
        Drop all hidden states and rebuild them as zeros for ``bsz``.
        """
        self.hidden[:] = [None] * len(self.hidden)
        self.init_hidden(bsz)

    def detach_hidden(self):
        """
        Detach every hidden state from the autograd graph.

        Raises RuntimeError if any state has not been initialised yet.
        """
        if any(state is None for state in self.hidden):
            raise RuntimeError("Must initialize hidden state before you can detach it")
        for idx, state in enumerate(self.hidden):
            self.hidden[idx] = state.detach()

    def forward(self, input):
        """
        Advance the cell by one step and return the new states as a tuple.

        Hidden states are created on first use or when the batch size changes.
        """
        self.init_hidden(input.size()[0])

        if self.n_hidden_states == 1:
            state_in = self.hidden[0]
        else:
            state_in = self.hidden

        state_out = self.cell(input, state_in, self.w_ih, self.w_hh, b_ih=self.b_ih, b_hh=self.b_hh)
        self.hidden = list(state_out) if self.n_hidden_states > 1 else [state_out]

        if self.output_size != self.hidden_size:
            self.hidden[0] = F.linear(self.hidden[0], self.w_ho)

        return tuple(self.hidden)


================================================
FILE: apex/apex/RNN/__init__.py
================================================
from .models import LSTM, GRU, ReLU, Tanh, mLSTM

# Only the ``models`` submodule name is listed for star-imports; the factory
# functions imported above are not named here.
__all__ = ['models']


================================================
FILE: apex/apex/RNN/cells.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F

from .RNNBackend import RNNCell

from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend

import math 


class mLSTMRNNCell(RNNCell):
    """
    RNNCell specialisation for the multiplicative LSTM (``mLSTMCell``).

    On top of the base weights it owns ``w_mih`` and ``w_mhh``, the two
    matrices used to build the multiplicative intermediate state.
    """

    def __init__(self, input_size, hidden_size, bias = False, output_size = None):
        # LSTM-style cell: four gates and two hidden states (h, c).
        super(mLSTMRNNCell, self).__init__(
            4,
            input_size,
            hidden_size,
            mLSTMCell,
            n_hidden_states=2,
            bias=bias,
            output_size=output_size,
        )

        self.w_mih = nn.Parameter(torch.Tensor(self.output_size, self.input_size))
        self.w_mhh = nn.Parameter(torch.Tensor(self.output_size, self.output_size))

        # Run again so the two extra matrices created above are initialised too.
        self.reset_parameters()

    def forward(self, input):
        """
        Advance the cell by one step and return the new states as a tuple.

        Hidden states are created on first use or when the batch size changes.
        """
        self.init_hidden(input.size()[0])

        if self.n_hidden_states == 1:
            state_in = self.hidden[0]
        else:
            state_in = self.hidden

        state_out = self.cell(
            input, state_in, self.w_ih, self.w_hh, self.w_mih, self.w_mhh,
            b_ih=self.b_ih, b_hh=self.b_hh,
        )
        self.hidden = list(state_out)

        if self.output_size != self.hidden_size:
            self.hidden[0] = F.linear(self.hidden[0], self.w_ho)
        return tuple(self.hidden)


    def new_like(self, new_input_size=None):
        """
        Build a fresh cell of the same type and configuration, optionally with
        a different input size.
        """
        input_size = self.input_size if new_input_size is None else new_input_size
        cell_type = type(self)
        return cell_type(input_size, self.hidden_size, self.bias, self.output_size)

def mLSTMCell(input, hidden, w_ih, w_hh, w_mih, w_mhh, b_ih=None, b_hh=None):
    """
    One step of a multiplicative LSTM.

    Args:
        input: input for this step.
        hidden: pair ``(hx, cx)`` of previous output and cell state.
        w_ih, w_hh: gate weights applied to the input and to the
            multiplicative state ``m``.
        w_mih, w_mhh: weights whose projections of ``input`` and ``hx`` are
            multiplied element-wise to form ``m``.
        b_ih, b_hh: optional gate biases.

    Returns:
        ``(hy, cy)`` on the CPU path; on CUDA inputs, whatever the fused LSTM
        pointwise backend returns.
    """
    # The multiplicative intermediate state is the same for both code paths.
    m = F.linear(input, w_mih) * F.linear(hidden[0], w_mhh)

    if input.is_cuda:
        igates = F.linear(input, w_ih)
        hgates = F.linear(m, w_hh)

        state = fusedBackend.LSTMFused.apply
        return state(igates, hgates, hidden[1], b_ih, b_hh)

    hx, cx = hidden

    gates = F.linear(input, w_ih, b_ih) + F.linear(m, w_hh, b_hh)

    ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)

    # torch.sigmoid / torch.tanh replace the deprecated F.sigmoid / F.tanh.
    ingate = torch.sigmoid(ingate)
    forgetgate = torch.sigmoid(forgetgate)
    cellgate = torch.tanh(cellgate)
    outgate = torch.sigmoid(outgate)

    cy = (forgetgate * cx) + (ingate * cellgate)
    hy = outgate * torch.tanh(cy)

    return hy, cy
                                                                            


================================================
FILE: apex/apex/RNN/models.py
================================================
import torch

from torch.nn._functions.rnn import LSTMCell, RNNReLUCell, RNNTanhCell, GRUCell

from .RNNBackend import bidirectionalRNN, stackedRNN, RNNCell
from .cells import mLSTMRNNCell, mLSTMCell

def toRNNBackend(inputRNN, num_layers, bidirectional=False, dropout = 0):
    """
    Wrap ``inputRNN`` in a ``bidirectionalRNN`` when ``bidirectional`` is
    truthy, otherwise in a ``stackedRNN``, forwarding ``num_layers`` and
    ``dropout``.
    """
    wrapper = bidirectionalRNN if bidirectional else stackedRNN
    return wrapper(inputRNN, num_layers, dropout = dropout)


def LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
    """
    Build an LSTM backend: an ``RNNCell`` with gate multiplier 4 and two
    hidden states around ``LSTMCell``, wrapped by :func:`toRNNBackend`.

    ``batch_first`` is accepted but not used by this function.
    """
    base_cell = RNNCell(
        4,
        input_size,
        hidden_size,
        LSTMCell,
        n_hidden_states=2,
        bias=bias,
        output_size=output_size,
    )
    return toRNNBackend(base_cell, num_layers, bidirectional, dropout=dropout)

def GRU(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
    """
    Build a GRU backend: an ``RNNCell`` with gate multiplier 3 and a single
    hidden state around ``GRUCell``, wrapped by :func:`toRNNBackend`.

    ``batch_first`` is accepted but not used by this function.
    """
    base_cell = RNNCell(
        3,
        input_size,
        hidden_size,
        GRUCell,
        n_hidden_states=1,
        bias=bias,
        output_size=output_size,
    )
    return toRNNBackend(base_cell, num_layers, bidirectional, dropout=dropout)

def ReLU(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
    """
    Build a ReLU RNN backend: an ``RNNCell`` with gate multiplier 1 and a
    single hidden state around ``RNNReLUCell``, wrapped by :func:`toRNNBackend`.

    ``batch_first`` is accepted but not used by this function.
    """
    base_cell = RNNCell(
        1,
        input_size,
        hidden_size,
        RNNReLUCell,
        n_hidden_states=1,
        bias=bias,
        output_size=output_size,
    )
    return toRNNBackend(base_cell, num_layers, bidirectional, dropout=dropout)

def Tanh(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
    """
    Build a tanh RNN backend: an ``RNNCell`` with gate multiplier 1 and a
    single hidden state around ``RNNTanhCell``, wrapped by :func:`toRNNBackend`.

    ``batch_first`` is accepted but not used by this function.
    """
    base_cell = RNNCell(
        1,
        input_size,
        hidden_size,
        RNNTanhCell,
        n_hidden_states=1,
        bias=bias,
        output_size=output_size,
    )
    return toRNNBackend(base_cell, num_layers, bidirectional, dropout=dropout)
        
def mLSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
    """
    Build a multiplicative-LSTM backend: an ``mLSTMRNNCell`` wrapped by
    :func:`toRNNBackend`.

    ``batch_first`` is accepted but not used by this function.
    """
    base_cell = mLSTMRNNCell(
        input_size,
        hidden_size,
        bias=bias,
        output_size=output_size,
    )
    return toRNNBackend(base_cell, num_layers, bidirectional, dropout=dropout)




================================================
FILE: apex/apex/__init__.py
================================================
from . import parallel
from . import amp
from . import fp16_utils

# For optimizers and normalization there is no Python fallback.
# Absence of cuda backend is a hard error.
# I would like the errors from importing fused_adam_cuda or fused_layer_norm_cuda
# to be triggered lazily, because if someone has installed with --cpp_ext and --cuda_ext
# so they expect those backends to be available, but for some reason they actually aren't
# available (for example because they built improperly in a way that isn't revealed until
# load time) the error message is timely and visible.
from . import optimizers
from . import normalization


================================================
FILE: apex/apex/amp/README.md
================================================
# amp: Automatic Mixed Precision

## Annotating User Functions

Nearly all PyTorch user code needs nothing more than the two steps
above to use amp. After all, custom layers are built out of simpler
PyTorch components, and amp already can see those.

However, any custom C++ or CUDA code is outside of amp's (default)
view of things. For example, suppose I implemented a new recurrent
cell called a "forgetful recurrent unit" that calls directly into a
CUDA backend:

```python
from backend import FRUBackend

def fru(input, hidden, weight, bias):
    # call to CUDA code
    FRUBackend(input, hidden, weight, bias)
```

In this case, it is possible to get a runtime type mismatch. For
example, you might have `input` in fp16, and `weight` in fp32, and amp
doesn't have the visibility to insert an appropriate cast.

amp exposes two ways to handle "invisible" backend code: function
annotations and explicit registration.

#### Function annotation

The first way to handle backend code is a set of function annotations:

- `@amp.half_function`
- `@amp.float_function`
- `@amp.promote_function`

These correspond to:

- Cast all arguments to fp16
- Cast all arguments to fp32
- If there are any type mismatches, cast everything to the widest type

In our example, we believe that the FRU unit is fp16-safe and will get
performance gains from casting its arguments to fp16, so we write:

```python
@amp.half_function
def fru(input, hidden, weight, bias):
    #...
```

#### Explicit registration

The other way to handle backend code is with explicit function
registration:

- `amp.register_half_function(module, function_name)`
- `amp.register_float_function(module, function_name)`
- `amp.register_promote_function(module, function_name)`

When using this API, `module` is the containing class or module for
the function, and `function_name` is the _string_ name of the
function. Note that the function must be registered before the call to
`amp.initialize()`.

For our FRU unit, we can register the backend function directly:

```python
import backend

amp.register_half_function(backend, 'FRUBackend')
```


================================================
FILE: apex/apex/amp/__init__.py
================================================
from .amp import init, half_function, float_function, promote_function,\
    register_half_function, register_float_function, register_promote_function
from .handle import scale_loss, disable_casts
from .frontend import initialize
from ._amp_state import master_params, _amp_state


================================================
FILE: apex/apex/amp/__version__.py
================================================
# Version as a tuple of integers, plus the dotted string derived from it.
VERSION = (0, 1, 0)
__version__ = '.'.join(str(part) for part in VERSION)


================================================
FILE: apex/apex/amp/_amp_state.py
================================================
# This is a "header object" that allows different amp modules to communicate.
# I'm a C++ guy, not a python guy.  I decided this approach because it seemed most C++-like.  
# But apparently it's ok:
# http://effbot.org/pyfaq/how-do-i-share-global-variables-across-modules.htm
import os
import torch

# Major/minor components of the installed PyTorch version.
TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])

# ``container_abcs`` provides the abstract container classes (Mapping,
# Iterable, ...). ``torch._six`` is not available in newer PyTorch releases,
# so fall back to the standard library instead of failing at import time.
if TORCH_MAJOR == 0:
    import collections.abc as container_abcs
else:
    try:
        from torch._six import container_abcs
    except ImportError:
        import collections.abc as container_abcs


class AmpState(object):
    """Plain attribute container for settings shared between the amp modules."""

    def __init__(self):
        # Initial values; other modules may overwrite or extend these attributes.
        initial = (
            ("hard_override", False),
            ("allow_incoming_model_not_fp32", False),
            ("verbosity", 1),
        )
        for attr, value in initial:
            setattr(self, attr, value)


# Attribute stash.  Could also just stash things as global module attributes.
# Single module-level instance; warn_or_err() and maybe_print() in this module
# read their settings from it.
_amp_state = AmpState()


def warn_or_err(msg):
    """
    Report ``msg`` as a printed warning when ``_amp_state.hard_override`` is
    set; otherwise raise it as a ``RuntimeError``.
    """
    if not _amp_state.hard_override:
        # I'm not sure if allowing hard_override is a good idea, so the error
        # text deliberately does not advertise hard_override=True.
        raise RuntimeError(msg)
    print("Warning:  " + msg)


# True only when the WORLD_SIZE environment variable is set to a value above 1.
distributed = 'WORLD_SIZE' in os.environ and int(os.environ['WORLD_SIZE']) > 1


def maybe_print(msg, rank0=False):
    """
    Print ``msg`` when ``_amp_state.verbosity`` is positive.

    With ``rank0=True`` in a distributed run, only the process whose
    ``torch.distributed`` rank is 0 prints.
    """
    if not _amp_state.verbosity > 0:
        return
    # Silence every rank but 0 when the caller asked for rank-0-only output.
    if rank0 and distributed and torch.distributed.get_rank() != 0:
        return
    print(msg)


# def iter_params(param_groups):
#     for group in param_groups:
#         for p in group['params']:
#             yield p


def master_params(optimizer):
    """
    Generator that yields, group by group, every entry of ``group['params']``
    for the param groups of ``optimizer``.

    Args:
        optimizer: An optimizer previously returned from ``amp.initialize``.
    """
    for param_group in optimizer.param_groups:
        yield from param_group['params']


================================================
FILE: apex/apex/amp/_initialize.py
================================================
import torch
from torch._six import string_classes
import functools
import numpy as np
import warnings
from ._amp_state import _amp_state, warn_or_err, container_abcs
from .handle import disable_casts
from .scaler import LossScaler
from ._process_optimizer import _process_optimizer
from apex.fp16_utils import convert_network
from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
from ..optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
from ..optimizers import FusedAdam
from ..parallel import DistributedDataParallel as apex_DDP
from ..parallel.LARC import LARC


def to_type(dtype, t):
    """
    Cast ``t`` to ``dtype``.

    Floating-point tensors are converted with ``.to(dtype)``; other tensors
    are returned unchanged. A non-CUDA tensor only triggers a warning. Any
    non-tensor object is trusted to provide its own ``.to(dtype)``.
    """
    if not isinstance(t, torch.Tensor):
        # Trust the user's custom batch type, that's all I can do here.
        return t.to(dtype)

    if not t.is_cuda:
        # This should not be a hard error, since it may be legitimate.
        warnings.warn("An input tensor was not cuda.")
    # Inputs that require grad are deliberately not rejected: GANs need them.
    return t.to(dtype) if t.is_floating_point() else t


# Modified from torch.optim.optimizer.py.  This is a bit more general than casted_args in utils.py.
def applier(value, fn):
    """
    Recursively apply ``fn`` to the tensors (and ``.to``-capable objects)
    inside ``value``.

    Strings and numpy arrays pass through untouched. Mappings are rebuilt as a
    plain ``dict`` with both keys and values processed. Other iterables are
    rebuilt with ``type(value)`` from the processed items. Anything else is
    returned as is.
    """
    if isinstance(value, torch.Tensor):
        return fn(value)
    if isinstance(value, string_classes):
        return value
    if isinstance(value, np.ndarray):
        return value
    if hasattr(value, "to"):
        # Allow handling of custom batch classes
        return fn(value)
    if isinstance(value, container_abcs.Mapping):
        return {applier(key, fn) : applier(item, fn) for key, item in value.items()}
    if isinstance(value, container_abcs.Iterable):
        container_type = type(value)
        return container_type(applier(item, fn) for item in value)

    # Unrecognized types (ordinary ints, floats, ...) are passed through
    # silently; warning about each of them would be more annoying than useful.
    # Custom batch classes should provide a .to(dtype) method and are
    # responsible for already being on the intended device.
    return value


def check_models(models):
    """Reject models that are already wrapped in a data-parallel container.

    Args:
        models: iterable of modules about to be processed by amp.initialize.

    Raises:
        RuntimeError: if any model is an instance of torch DistributedDataParallel,
            apex DistributedDataParallel or torch DataParallel.
    """
    for model in models:
        # Checked from most to least specific label so the reported name matches
        # the one the sequential checks would have ended up with.
        if isinstance(model, torch.nn.parallel.DataParallel):
            parallel_type = "torch.nn.parallel.DataParallel"
        elif isinstance(model, apex_DDP):
            parallel_type = "apex.parallel.DistributedDataParallel"
        elif isinstance(model, torch.nn.parallel.DistributedDataParallel):
            parallel_type = "torch.nn.parallel.DistributedDataParallel"
        else:
            continue

        prefix = "Incoming model is an instance of {}. ".format(parallel_type)
        raise RuntimeError(prefix +
            "Parallel wrappers should only be applied to the model(s) AFTER \n"
            "the model(s) have been returned from amp.initialize.")


def check_params_fp32(models):
    """Warn or error about floating-point params/buffers that are half or not on CUDA.

    Non-floating-point parameters and buffers are ignored.  Problems are
    reported through ``warn_or_err``.

    Args:
        models: iterable of modules to inspect.
    """
    for model in models:
        for name, param in model.named_parameters():
            if not param.is_floating_point():
                continue
            param_type = param.type()
            if 'Half' in param_type:
                warn_or_err("Found param {} with type {}, expected torch.cuda.FloatTensor.\n"
                    "When using amp.initialize, you do not need to call .half() on your model\n"
                    "before passing it, no matter what optimization level you choose.".format(
                    name, param_type))
            elif not param.is_cuda:
                warn_or_err("Found param {} with type {}, expected torch.cuda.FloatTensor.\n"
                    "When using amp.initialize, you need to provide a model with parameters\n"
                    "located on a CUDA device before passing it no matter what optimization level\n"
                    "you chose. Use model.to('cuda') to use the default device.".format(
                    name, param_type))

        # Backward compatibility for PyTorch 0.4: fall back to the raw buffer
        # dict when the module has no named_buffers().
        buf_source = model.named_buffers() if hasattr(model, 'named_buffers') else model._buffers
        for entry in buf_source:
            if type(entry) == tuple:
                # named_buffers() yields (name, buffer) pairs.
                name, buf = entry
            else:
                # Iterating the _buffers dict yields names only.
                name, buf = entry, buf_source[entry]

            if not buf.is_floating_point():
                continue
            buf_type = buf.type()
            if 'Half' in buf_type:
                warn_or_err("Found buffer {} with type {}, expected torch.cuda.FloatTensor.\n"
                    "When using amp.initialize, you do not need to call .half() on your model\n"
                    "before passing it, no matter what optimization level you choose.".format(
                    name, buf_type))
            elif not buf.is_cuda:
                warn_or_err("Found buffer {} with type {}, expected torch.cuda.FloatTensor.\n"
                    "When using amp.initialize, you need to provide a model with buffers\n"
                    "located on a CUDA device before passing it no matter what optimization level\n"
                    "you chose. Use model.to('cuda') to use the default device.".format(
                    name, buf_type))


def check_optimizers(optimizers):
    """Reject optimizers that the user already wrapped in an FP16_Optimizer.

    Args:
        optimizers: iterable of optimizer objects passed to amp.initialize.

    Raises:
        RuntimeError: if an optimizer is an instance of either FP16_Optimizer
            flavour (``apex.fp16_utils`` or ``apex.optimizers``).
    """
    for optim in optimizers:
        # The apex.optimizers label takes precedence when both checks match,
        # matching the outcome of checking them one after the other.
        if isinstance(optim, FP16_Optimizer_for_fused):
            bad_optim_type = "apex.optimizers.FP16_Optimizer"
        elif isinstance(optim, FP16_Optimizer_general):
            bad_optim_type = "apex.fp16_utils.FP16_Optimizer"
        else:
            continue

        prefix = "An incoming optimizer is an instance of {}. ".format(bad_optim_type)
        raise RuntimeError(prefix +
                           "The optimizer(s) passed to amp.initialize() must be bare \n"
                           "instances of either ordinary Pytorch optimizers, or Apex fused \n"
                           "optimizers (currently just FusedAdam, but FusedSGD will be added \n"
                           "soon).  You should not manually wrap your optimizer in either \n"
                           "apex.fp16_utils.FP16_Optimizer or apex.optimizers.FP16_Optimizer. \n"
                           "amp.initialize will take care of that for you (if necessary) based \n"
                           "on the specified opt_level (and optional overridden properties).")


def wrap_fused_adam(optimizer, properties):
    """Wrap a FusedAdam optimizer in ``FP16_Optimizer_for_fused``.

    Args:
        optimizer: the optimizer to wrap.
        properties: object providing ``master_weights``, ``cast_model_type``,
            ``keep_batchnorm_fp32`` and ``loss_scale``.

    Returns:
        The ``FP16_Optimizer_for_fused`` wrapper, using dynamic loss scaling
        when ``properties.loss_scale == "dynamic"`` and the given static scale
        otherwise.

    Raises:
        AssertionError: if ``properties`` does not describe the supported
            configuration spelled out in the message.
    """
    msg = ('Currently, the usage of FusedAdam is restricted to '
           'amp.initialize(..., opt_level="O2", keep_batchnorm_fp32=False, '
           'loss_scale=float or "dynamic").  We are working on enabling more general usage.')

    assert properties.master_weights is True, msg
    assert properties.cast_model_type is torch.float16, msg
    keep_bn = properties.keep_batchnorm_fp32
    assert keep_bn is False or keep_bn is None, msg

    if properties.loss_scale == "dynamic":
        scale_kwargs = {"dynamic_loss_scale": True}
    else:
        scale_kwargs = {"static_loss_scale": properties.loss_scale}
    return FP16_Optimizer_for_fused(optimizer, **scale_kwargs)


def _initialize(models, optimizers, properties, num_losses=1, cast_model_outputs=None):
    """Apply the Amp settings in ``properties`` to the given models and optimizers.

    Args:
        models: a single ``torch.nn.Module`` or a list of modules.
        optimizers: a single optimizer (``torch.optim.Optimizer`` or ``LARC``),
            a list of optimizers, or ``None``.
        properties: settings object; this function reads ``cast_model_type``,
            ``keep_batchnorm_fp32``, ``loss_scale`` and ``patch_torch_functions``
            and forwards the object to ``wrap_fused_adam`` / ``_process_optimizer``.
        num_losses: number of ``LossScaler`` objects stored in
            ``_amp_state.loss_scalers``.
        cast_model_outputs: optional dtype for model outputs.  When it is
            ``None`` and ``properties.cast_model_type`` is set, outputs are cast
            to ``torch.float32``.

    Returns:
        The model(s) and optimizer(s) in the same single-object / list form in
        which they were passed.  When no optimizer was supplied (and
        ``optimizers`` was not a list) only the model(s) are returned.

    Raises:
        TypeError: if ``models`` or ``optimizers`` has an unsupported type.
        RuntimeError: propagated from ``check_models`` / ``check_optimizers`` /
            ``_process_optimizer``.
    """
    from apex.parallel import DistributedDataParallel as apex_DDP
    from .amp import init as amp_init

    # Normalize ``optimizers`` to a list, remembering the caller's original form
    # so the return value can mirror it.
    optimizers_was_list = False
    if isinstance(optimizers, torch.optim.Optimizer) or isinstance(optimizers, LARC):
        optimizers = [optimizers]
    elif optimizers is None:
        optimizers = []
    elif isinstance(optimizers, list):
        optimizers_was_list = True
        check_optimizers(optimizers)
    else:
        # Let check_optimizers raise its more specific error (pre-wrapped
        # FP16_Optimizer) before falling back to the generic TypeError.
        check_optimizers([optimizers])
        raise TypeError("optimizers must be either a single optimizer or a list of optimizers.")

    # Same normalization for ``models``.
    if isinstance(models, torch.nn.Module):
        models_was_list = False
        models = [models]
    elif isinstance(models, list):
        models_was_list = True
    else:
        raise TypeError("models must be either a single model or a list of models.")

    check_models(models)

    if not _amp_state.allow_incoming_model_not_fp32:
        check_params_fp32(models)


    # In the future, when FP16_Optimizer can be deprecated and master weights can
    # become an attribute, remember to stash master weights before casting the model.

    if properties.cast_model_type:
        # Cast the model weights: convert_network is used when batchnorm should
        # stay fp32, a plain .to() otherwise.
        if properties.keep_batchnorm_fp32:
            for model in models:
                convert_network(model, properties.cast_model_type)
        else:
            for model in models:
                model.to(properties.cast_model_type)

        input_caster = functools.partial(to_type, properties.cast_model_type)
        if cast_model_outputs is not None:
            output_caster = functools.partial(to_type, cast_model_outputs)
        else:
            output_caster = functools.partial(to_type, torch.float32)

        for model in models:
            # Patch the forward method to cast incoming data to the correct type, and
            # outgoing data to float32, so "the user never needs to call .half()."
            # I like writing things explicitly more than decorators.
            def patch_forward(old_fwd):
                def new_fwd(*args, **kwargs):
                    output = old_fwd(*applier(args, input_caster),
                                     **applier(kwargs, input_caster))
                    return applier(output, output_caster)
                return new_fwd

            model.forward = patch_forward(model.forward)

        # State dict trick to recast any preexisting per-param state tensors
        for optimizer in optimizers:
            optimizer.load_state_dict(optimizer.state_dict())
    elif cast_model_outputs is not None:
        # The model is not cast, but the caller still wants outputs in a specific
        # dtype: patch forward to cast outputs only.
        output_caster = functools.partial(to_type, cast_model_outputs)

        for model in models:
            def patch_forward(old_fwd):
                def new_fwd(*args, **kwargs):
                    output = old_fwd(*args, **kwargs)
                    return applier(output, output_caster)
                return new_fwd

            model.forward = patch_forward(model.forward)

    # Replace each optimizer in place (this mutates a caller-supplied list).
    for i, optimizer in enumerate(optimizers):
        # Still need to special case this for the first pass
        if isinstance(optimizer, FusedAdam):
            optimizers[i] = wrap_fused_adam(optimizer, properties)
        else:
            optimizers[i] = _process_optimizer(optimizer, properties)

    # One loss scaler per loss; any previously stored scalers are discarded.
    _amp_state.loss_scalers = []
    for _ in range(num_losses):
        _amp_state.loss_scalers.append(LossScaler(properties.loss_scale,
                                                  min_loss_scale=_amp_state.min_loss_scale,
                                                  max_loss_scale=_amp_state.max_loss_scale))

    if properties.patch_torch_functions:
        # handle is unused here. It's accessible later through a global value anyway.
        handle = amp_init(loss_scale=properties.loss_scale, verbose=(_amp_state.verbosity == 2))
        for optimizer in optimizers:
            # Disable Amp casting for the optimizer step, because it should only be
            # applied to FP32 master params anyway.
            def patch_step(old_step):
                def new_step(*args, **kwargs):
                    with disable_casts():
                        output = old_step(*args, **kwargs)
                    return output
                return new_step

            optimizer.step = patch_step(optimizer.step)

    # Hand results back in the same shape (single object vs. list) as the inputs.
    if optimizers_was_list:
        if models_was_list:
            return models, optimizers
        else:
            return models[0], optimizers
    else:
        if models_was_list:
            if len(optimizers) == 0:
                return models
            else:
                return models, optimizers[0]
        else:
            if len(optimizers) == 0:
                return models[0]
            else:
                return models[0], optimizers[0]


================================================
FILE: apex/apex/amp/_process_optimizer.py
================================================
import types
from ..fp16_utils import master_params_to_model_params
from ..multi_tensor_apply import multi_tensor_applier
from ._amp_state import maybe_print
import torch


class AmpOptimizerState(object):
    """Empty attribute bag; Amp attaches its per-optimizer bookkeeping to instances."""

    def __init__(self):
        """Create the container with no attributes set."""


def lazy_init_with_master_weights(self):
    """Build FP32 master copies for half params and record the groups on the stash.

    For every param group, each trainable ``torch.cuda.HalfTensor`` param is
    replaced inside the group by a float clone (its optimizer state entry, if
    any, is re-keyed to the clone); ``torch.cuda.FloatTensor`` params are kept.
    Params with ``requires_grad == False`` are skipped.

    Raises:
        TypeError: if a trainable param is neither a cuda half nor a cuda
            float tensor.
    """
    stash = self._amp_stash
    stash.fp16_groups = []
    stash.fp32_from_fp16_groups = []
    stash.fp32_from_fp32_groups = []

    for group in self.param_groups:
        group_params = group['params']
        model_halves = []
        native_floats = []
        master_copies = []
        for slot, param in enumerate(group_params):
            if not param.requires_grad:
                continue

            tensor_type = param.type()
            if tensor_type == 'torch.cuda.HalfTensor':
                model_halves.append(param)
                master = param.detach().clone().float()
                master.requires_grad = True
                group_params[slot] = master
                master_copies.append(master)
                # Re-key existing optimizer state to the new master param.
                # Its tensors are recast to FP32 by the state-dict round trip below.
                if param in self.state:
                    self.state[master] = self.state.pop(param)
            elif tensor_type == 'torch.cuda.FloatTensor':
                native_floats.append(param)
                group_params[slot] = param
            else:
                raise TypeError("Optimizer's parameters must be either "
                                "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
                                "Received {}".format(tensor_type))

        stash.fp16_groups.append(model_halves)
        stash.fp32_from_fp16_groups.append(master_copies)
        stash.fp32_from_fp32_groups.append(native_floats)

    # Flattened views across all groups.
    stash.all_fp16_params = [p for grp in stash.fp16_groups for p in grp]
    stash.all_fp32_from_fp16_params = [p for grp in stash.fp32_from_fp16_groups for p in grp]
    stash.all_fp32_from_fp32_params = [p for grp in stash.fp32_from_fp32_groups for p in grp]

    stash.all_fp32_from_fp32_grad_stash = [None] * len(stash.all_fp32_from_fp32_params)

    for master in stash.all_fp32_from_fp16_params:
        master.grad = None

    for native in stash.all_fp32_from_fp32_params:
        native.grad = None

    # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors
    self.load_state_dict(self.state_dict())


def prepare_backward_with_master_weights(self):
    """Stash and clear grads before backward (master-weights mode).

    Runs the lazy init on first use, drops grads of the fp16 model params, and
    moves the current grads of the fp32 model params into the stash so the
    upcoming backward writes into fresh ``.grad`` tensors.
    """
    stash = self._amp_stash

    if not stash.lazy_init_called:
        self._lazy_init_maybe_master_weights()
        stash.lazy_init_called = True

    # Setting .grad to None sets up grad copy elision for the next backward.
    for half_param in stash.all_fp16_params:
        half_param.grad = None

    grad_stash = stash.all_fp32_from_fp32_grad_stash
    for slot, float_param in enumerate(stash.all_fp32_from_fp32_params):
        grad_stash[slot] = float_param.grad
        float_param.grad = None


def post_backward_with_master_weights(self, scaler):
    """Unscale freshly produced grads into the master grads (master-weights mode).

    Args:
        scaler: object providing ``loss_scale()``, ``unscale(...)`` and
            ``unscale_with_stashed(...)``.

    fp16 model grads are unscaled into the grads of their fp32 master params
    (allocated here when missing, combined via ``unscale_with_stashed`` when
    present).  fp32 model params are unscaled in place, combined with the
    grads stashed before backward where those exist.  The stash is cleared at
    the end.
    """
    stash = self._amp_stash

    # This is a lot of python overhead...
    fresh_model_grads, fresh_master_grads = [], []
    accum_model_grads, accum_master_grads = [], []
    for model_param, master_param in zip(stash.all_fp16_params,
                                         stash.all_fp32_from_fp16_params):
        if model_param.grad is None:
            # Nothing new from this backward; leave the master grad as it is.
            continue
        if master_param.grad is None:
            master_param.grad = torch.empty_like(master_param)
            fresh_model_grads.append(model_param.grad)
            fresh_master_grads.append(master_param.grad)
        else:
            accum_model_grads.append(model_param.grad)
            accum_master_grads.append(master_param.grad)

    if fresh_model_grads:
        scaler.unscale(
            fresh_model_grads,
            fresh_master_grads,
            scaler.loss_scale(),
            models_are_masters=False)

    if accum_model_grads:
        scaler.unscale_with_stashed(
            accum_model_grads,
            accum_master_grads,
            accum_master_grads)

    # fp32 params can be treated as they would be in the "no_master_weights" case.
    direct_grads = []
    combined_grads = []
    prior_grads = []
    for param, prior in zip(stash.all_fp32_from_fp32_params,
                            stash.all_fp32_from_fp32_grad_stash):
        if param.grad is None:
            if prior is not None:
                # No new grad: restore the stashed one.
                param.grad = prior
        elif prior is None:
            direct_grads.append(param.grad)
        else:
            combined_grads.append(param.grad)
            prior_grads.append(prior)

    if direct_grads:
        scaler.unscale(
            direct_grads,
            direct_grads,
            scaler.loss_scale(),
            models_are_masters=True)

    if combined_grads:
        scaler.unscale_with_stashed(
            combined_grads,
            prior_grads,
            combined_grads)

    # Clear the stash.
    grad_stash = stash.all_fp32_from_fp32_grad_stash
    for slot, _ in enumerate(grad_stash):
        grad_stash[slot] = None


def lazy_init_no_master_weights(self):
    """Sort all optimizer params into fp16 / fp32 lists and create empty grad stashes.

    Raises:
        TypeError: if a param is neither ``torch.cuda.HalfTensor`` nor
            ``torch.cuda.FloatTensor``.
    """
    stash = self._amp_stash
    half_params = []
    float_params = []
    stash.all_fp16_params = half_params
    stash.all_fp32_params = float_params

    # Dispatch on the tensor type string.
    destination = {
        'torch.cuda.HalfTensor': half_params,
        'torch.cuda.FloatTensor': float_params,
    }
    for group in self.param_groups:
        for param in group['params']:
            tensor_type = param.type()
            bucket = destination.get(tensor_type)
            if bucket is None:
                raise TypeError("Optimizer's parameters must be either "
                                "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
                                "Received {}".format(tensor_type))
            bucket.append(param)

    stash.all_fp16_grad_stash = [None] * len(stash.all_fp16_params)
    stash.all_fp32_grad_stash = [None] * len(stash.all_fp32_params)


def prepare_backward_no_master_weights(self):
    """Stash and clear every param's grad before backward (no-master-weights mode).

    Runs the lazy init on first use, then moves the current ``.grad`` of each
    fp16 and fp32 param into the matching stash slot and resets ``.grad`` to
    ``None``.
    """
    stash = self._amp_stash

    if not stash.lazy_init_called:
        self._lazy_init_maybe_master_weights()
        stash.lazy_init_called = True

    pairs = ((stash.all_fp16_params, stash.all_fp16_grad_stash),
             (stash.all_fp32_params, stash.all_fp32_grad_stash))
    for params, grad_stash in pairs:
        for slot, param in enumerate(params):
            grad_stash[slot] = param.grad
            # Set up to leverage grad copy elision:
            param.grad = None


def post_backward_no_master_weights(self, scaler):
    """Unscale grads in place after backward (no-master-weights mode).

    Args:
        scaler: object providing ``loss_scale()``, ``unscale(...)`` and
            ``unscale_with_stashed(...)``.

    The fp16 and fp32 param lists are handled separately.  For each: params
    with no new grad get their stashed grad back, new grads without a stashed
    counterpart are unscaled in place, and new grads with a stashed
    counterpart go through ``unscale_with_stashed``.  Each stash is then
    reset to ``None`` entries.
    """
    stash = self._amp_stash

    by_dtype = ((stash.all_fp16_params, stash.all_fp16_grad_stash),
                (stash.all_fp32_params, stash.all_fp32_grad_stash))

    for params, stashed_grads in by_dtype:
        # This is a lot of python overhead...
        fresh = []
        combined = []
        previous = []
        for param, old_grad in zip(params, stashed_grads):
            if param.grad is None:
                if old_grad is not None:
                    param.grad = old_grad
            elif old_grad is None:
                fresh.append(param.grad)
            else:
                combined.append(param.grad)
                previous.append(old_grad)

        if fresh:
            scaler.unscale(
                fresh,
                fresh,
                scaler.loss_scale(),
                models_are_masters=True)

        if combined:
            scaler.unscale_with_stashed(
                combined,
                previous,
                combined)

        # Clear the stash.
        for slot, _ in enumerate(stashed_grads):
            stashed_grads[slot] = None


def _master_params_to_model_params(self):
    """Copy the FP32 master params back into the fp16 model params.

    Uses a single ``multi_tensor_applier`` call (scale factor 1.0) when the
    fused backend is available, otherwise falls back to the per-group
    ``master_params_to_model_params`` helper.
    """
    stash = self._amp_stash

    if not multi_tensor_applier.available:
        group_pairs = zip(stash.fp16_groups, stash.fp32_from_fp16_groups)
        for model_group, master_group in group_pairs:
            master_params_to_model_params(model_group, master_group)
        return

    if len(stash.all_fp16_params) == 0:
        # Nothing to copy.
        return

    multi_tensor_applier(
        stash.multi_tensor_scale,
        stash.dummy_overflow_buf,
        [stash.all_fp32_from_fp16_params, stash.all_fp16_params],
        1.0)


def _process_optimizer(optimizer, properties):
    """Attach Amp bookkeeping and patched methods to ``optimizer`` in place.

    An ``AmpOptimizerState`` is stored as ``optimizer._amp_stash`` and the
    hooks ``_lazy_init_maybe_master_weights``, ``_prepare_amp_backward`` and
    ``_post_amp_backward`` are bound to the optimizer.  When
    ``properties.master_weights`` is truthy, ``step`` and ``zero_grad`` are
    replaced as well and ``_master_params_to_model_params`` is added.
    ``add_param_group`` is always replaced so that groups added later are
    registered with the stash.

    Args:
        optimizer: the optimizer object to patch.
        properties: settings object; only ``master_weights`` is read here.

    Returns:
        The same ``optimizer`` object.

    Raises:
        RuntimeError: if the optimizer already has ``_amp_stash`` or any of
            the hook attributes listed below.
    """
    if hasattr(optimizer, "_amp_stash"):
        raise RuntimeError("A given optimizer should only be passed through amp.initialize once.")
    else:
        optimizer._amp_stash = AmpOptimizerState()

    optimizer._amp_stash.lazy_init_called = False
    optimizer._amp_stash.already_patched = False
    optimizer._amp_stash.params_have_scaled_gradients = False

    # Refuse to overwrite attributes the optimizer already defines.
    for name in ("_lazy_init_maybe_master_weights",
                 "_master_params_to_model_params",
                 "_prepare_amp_backward",
                 "_post_amp_backward"):
        if hasattr(optimizer, name):
            raise RuntimeError("Incoming optimizer already has {} defined.".format(name))

    # TODO:  Centralize exposure and import error checking for the C backend.
    if multi_tensor_applier.available:
        import amp_C
        optimizer._amp_stash.multi_tensor_scale = amp_C.multi_tensor_scale
        optimizer._amp_stash.dummy_overflow_buf = torch.cuda.IntTensor([0]);

    if properties.master_weights:
        optimizer._lazy_init_maybe_master_weights = types.MethodType(
            lazy_init_with_master_weights, optimizer)

        optimizer._master_params_to_model_params = types.MethodType(
            _master_params_to_model_params, optimizer)

        # Capture the original bound step before it is replaced below.
        old_step = optimizer.step
        def new_step(self, closure=None):
            if closure is not None:
                raise RuntimeError("Currently, Amp does not support closure use with optimizers.")
            retval = old_step()
            # Propagate the updated FP32 master weights to the model params.
            self._master_params_to_model_params()
            # Clear the master grads that wouldn't be zeroed by model.zero_grad()
            for param in self._amp_stash.all_fp32_from_fp16_params:
                param.grad = None
            return retval
        optimizer.step = types.MethodType(new_step, optimizer)

        # NOTE: old_zero_grad is captured but not called by new_zero_grad.
        old_zero_grad = optimizer.zero_grad
        def new_zero_grad(self):
            stash = self._amp_stash
            if not stash.lazy_init_called:
                self._lazy_init_maybe_master_weights()
                stash.lazy_init_called = True
            # Zero the model grads.
            for param in stash.all_fp16_params:
                if param.grad is not None:
                    param.grad.detach_()
                    param.grad.zero_()
            for param in stash.all_fp32_from_fp32_params:
                if param.grad is not None:
                    param.grad.detach_()
                    param.grad.zero_()
            # Clear the master grads that are independent of model grads
            for param in self._amp_stash.all_fp32_from_fp16_params:
                param.grad = None
        optimizer.zero_grad = types.MethodType(new_zero_grad, optimizer)

        optimizer._prepare_amp_backward = types.MethodType(
            prepare_backward_with_master_weights, optimizer)

        optimizer._post_amp_backward = types.MethodType(
            post_backward_with_master_weights, optimizer)
    else:
        # No master weights: step and zero_grad are left untouched.
        optimizer._lazy_init_maybe_master_weights = types.MethodType(
            lazy_init_no_master_weights, optimizer)

        optimizer._prepare_amp_backward = types.MethodType(
            prepare_backward_no_master_weights, optimizer)

        optimizer._post_amp_backward = types.MethodType(
            post_backward_no_master_weights, optimizer)

    # Capture the original bound add_param_group; the replacement delegates to
    # it after updating the stash.
    old_add_param_group = optimizer.add_param_group

    def new_add_param_group(self, new_group):
        stash = self._amp_stash

        # The stash lists appended to below only exist after lazy init.
        if not stash.lazy_init_called:
            self._lazy_init_maybe_master_weights()
            stash.lazy_init_called = True

        assert isinstance(new_group, dict), "param group must be a dict"

        # Normalize new_group['params'] to a list (mutates the caller's dict).
        new_params = new_group['params']
        if isinstance(new_params, torch.Tensor):
            new_group['params'] = [new_params]
        elif isinstance(new_params, set):
            raise TypeError('optimizer parameters need to be organized in ordered collections, but '
                            'the ordering of tensors in sets will change between runs. Please use a list instead.')
        else:
            new_group['params'] = list(new_params)

        if properties.master_weights:
            # Mutate new_group in-place to use FP32 master params
            fp16_params_this_group = []
            fp32_params_this_group = []
            fp32_from_fp16_params_this_group = []
            for i, param in enumerate(new_group['params']):
                # Params that do not require grad are left in the group untouched
                # and are not tracked by the stash.
                if param.requires_grad:
                    if param.type() == 'torch.cuda.HalfTensor':
                        fp16_params_this_group.append(param)
                        master_param = param.detach().clone().float()
                        master_param.requires_grad = True
                        new_group['params'][i] = master_param
                        fp32_from_fp16_params_this_group.append(master_param)
                    elif param.type() == 'torch.cuda.FloatTensor':
                        fp32_params_this_group.append(param)
                        new_group['params'][i] = param
                    else:
                        raise TypeError("Optimizer's parameters must be either "
                                        "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
                                        "Received {}".format(param.type()))

            stash.fp16_groups.append(fp16_params_this_group)
            stash.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
            stash.fp32_from_fp32_groups.append(fp32_params_this_group)

            stash.all_fp16_params += fp16_params_this_group
            stash.all_fp32_from_fp16_params += fp32_from_fp16_params_this_group
            stash.all_fp32_from_fp32_params += fp32_params_this_group

            # stash.all_fp32_from_fp16_grad_stash = [None for _ in stash.all_fp32_from_fp16_params]
            stash.all_fp32_from_fp32_grad_stash += [None for _ in fp32_params_this_group]

            # It should be ok to let params be added with existing .grad attributes.
            # for param in fp16_params_this_group:
            #     param.grad = None

            # for param in fp32_from_fp16_params_this_group:
            #     param.grad = None

            # for param in stash.fp32_params_this_group:
            #     param.grad = None
        else:
            # Without master weights every param (regardless of requires_grad)
            # is sorted by type and given a grad-stash slot.
            for param in new_group['params']:
                if param.type() == 'torch.cuda.HalfTensor':
                    stash.all_fp16_params.append(param)
                    stash.all_fp16_grad_stash.append(None)
                elif param.type() == 'torch.cuda.FloatTensor':
                    stash.all_fp32_params.append(param)
                    stash.all_fp32_grad_stash.append(None)
                else:
                    raise TypeError("Optimizer's parameters must be either "
                                    "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
                                    "Received {}".format(param.type()))

        old_add_param_group(new_group)

    optimizer.add_param_group = types.MethodType(new_add_param_group, optimizer)

    return optimizer


================================================
FILE: apex/apex/amp/amp.py
================================================
from . import compat, rnn_compat, utils, wrap
from .handle import AmpHandle, NoOpHandle
from .lists import functional_overrides, torch_overrides, tensor_overrides
from ._amp_state import _amp_state
from .frontend import *

import functools
import itertools

import torch


# Handle consulted by the decorator-form wrappers at call time; assigned by init().
_DECORATOR_HANDLE = None
# Pending (module, function name, cast function) registrations made through
# register_half_function / register_float_function; consumed and cleared by init().
_USER_CAST_REGISTRY = set()
# Pending (module, function name) registrations made through
# register_promote_function; consumed and cleared by init().
_USER_PROMOTE_REGISTRY = set()


def _decorator_helper(orig_fn, cast_fn, wrap_fn):
    """Build the call-time wrapper shared by the decorator-form annotations.

    Args:
        orig_fn: the function being decorated.
        cast_fn: cast function passed through ``utils.verbosify`` before use.
        wrap_fn: callable ``(orig_fn, cast_fn, handle) -> wrapped function``.

    Returns:
        A wrapper that calls ``orig_fn`` directly while no active handle is
        installed, and otherwise calls the function produced by ``wrap_fn``.
        The wrapper carries ``orig_fn``'s metadata (``__name__``, ``__doc__``,
        ...) so decorated functions stay introspectable.
    """
    @functools.wraps(orig_fn)
    def wrapper(*args, **kwargs):
        # The handle is looked up on every call because it is installed later,
        # after decoration time.
        handle = _DECORATOR_HANDLE
        if handle is None or not handle.is_active():
            return orig_fn(*args, **kwargs)
        inner_cast_fn = utils.verbosify(cast_fn, orig_fn.__name__,
                                  handle.verbose)
        return wrap_fn(orig_fn, inner_cast_fn, handle)(*args, **kwargs)
    return wrapper


# Decorator form
def half_function(fn):
    """Decorator: wrap ``fn`` with the ``utils.maybe_half`` cast, with caching enabled."""
    return _decorator_helper(
        fn,
        utils.maybe_half,
        functools.partial(wrap.make_cast_wrapper, try_caching=True),
    )


def float_function(fn):
    """Decorator: wrap ``fn`` with the ``utils.maybe_float`` cast, with caching disabled."""
    return _decorator_helper(
        fn,
        utils.maybe_float,
        functools.partial(wrap.make_cast_wrapper, try_caching=False),
    )


def promote_function(fn):
    """Decorator: wrap ``fn`` with the promote wrapper, using ``utils.maybe_float``."""
    return _decorator_helper(
        fn,
        utils.maybe_float,
        functools.partial(wrap.make_promote_wrapper),
    )


# Registry form
def register_half_function(module, name):
    """Queue ``module.name`` for the ``utils.maybe_half`` cast.

    Raises:
        ValueError: if ``module`` has no attribute called ``name``.
    """
    if hasattr(module, name):
        _USER_CAST_REGISTRY.add((module, name, utils.maybe_half))
        return
    raise ValueError('No function named {} in module {}.'.format(
        name, module))


def register_float_function(module, name):
    """Queue ``module.name`` for the ``utils.maybe_float`` cast.

    Raises:
        ValueError: if ``module`` has no attribute called ``name``.
    """
    if hasattr(module, name):
        _USER_CAST_REGISTRY.add((module, name, utils.maybe_float))
        return
    raise ValueError('No function named {} in module {}.'.format(
        name, module))


def register_promote_function(module, name):
    """Queue ``module.name`` for promote wrapping.

    Raises:
        ValueError: if ``module`` has no attribute called ``name``.
    """
    if hasattr(module, name):
        _USER_PROMOTE_REGISTRY.add((module, name))
        return
    raise ValueError('No function named {} in module {}.'.format(
        name, module))


# Top-level function to insert _all_ the hooks.
def init(enabled=True, loss_scale="dynamic", enable_caching=True, verbose=False, allow_banned=False):
    """Create an Amp handle and pass every listed function to its ``wrap.*`` helper.

    The numbered steps below run in a fixed order; user-registered functions
    are processed first and both user registries are cleared afterwards.

    Args:
        enabled: If False, nothing is wrapped; a ``NoOpHandle`` is created,
            stored in ``_DECORATOR_HANDLE`` and returned.
        loss_scale: Forwarded to ``AmpHandle``.
        enable_caching: Forwarded to ``AmpHandle``.
        verbose: Forwarded to ``AmpHandle`` and to most of the ``wrap.*`` helpers.
        allow_banned: If True, each entry of ``functional_overrides.BANNED_FUNCS``
            is wrapped with a ``utils.maybe_float`` cast; otherwise it is passed
            to ``wrap.err_if_any_half`` together with its error message.

    Returns:
        The handle that was created (``NoOpHandle`` or ``AmpHandle``).  On the
        enabled path it is also stored in ``_amp_state.handle``.
    """
    global _DECORATOR_HANDLE

    if not enabled:
        # NOTE: this early exit does not assign _amp_state.handle.
        handle = NoOpHandle()
        _DECORATOR_HANDLE = handle
        return handle

    handle = AmpHandle(loss_scale, enable_caching, verbose)

    # 0) Force-{fp16, fp32} for user-annotated functions
    for mod, fn, cast_fn in _USER_CAST_REGISTRY:
        # Caching is only attempted for casts to half.
        try_caching = (cast_fn == utils.maybe_half)
        wrap.cached_cast(mod, fn, cast_fn, handle,
                         try_caching, verbose)
    _USER_CAST_REGISTRY.clear()

    # 0.5) Force-promote for user-annotated functions
    for mod, fn in _USER_PROMOTE_REGISTRY:
        wrap.promote(mod, fn, handle, verbose)
    _USER_PROMOTE_REGISTRY.clear()

    # 1) Force-{fp16, fp32} on white- / black-list functions
    override_modules = [functional_overrides,
                        torch_overrides,
                        tensor_overrides]
    cast_table = [('FP16_FUNCS', utils.maybe_half),
                  ('FP32_FUNCS', utils.maybe_float)]
    # Every (override module, list) combination is visited.
    for module, (list_name, cast_fn) in itertools.product(override_modules,
                                                          cast_table):
        for fn in getattr(module, list_name):
            try_caching = (cast_fn == utils.maybe_half)
            wrap.cached_cast(module.MODULE, fn, cast_fn, handle,
                             try_caching, verbose)

    # 1.5) Pre-0.4, put the blacklist methods on HalfTensor and whitelist
    #      methods on FloatTensor, since they're distinct types.
    if compat.tensor_is_float_tensor():
        for fn in tensor_overrides.FP16_FUNCS:
            wrap.cached_cast(torch.cuda.FloatTensor, fn, utils.maybe_half,
                             handle, try_caching=True, verbose=verbose)
        for fn in tensor_overrides.FP32_FUNCS:
            wrap.cached_cast(torch.cuda.HalfTensor, fn, utils.maybe_float,
                             handle, try_caching=False, verbose=verbose)

    # 2) Enable type-promotion on multi-arg functions and methods.
    #    NB: special handling for sequence fns (e.g. `torch.cat`).
    promote_modules = [torch_overrides, tensor_overrides]
    promote_table = [('CASTS', wrap.promote),
                     ('SEQUENCE_CASTS', wrap.sequence_promote)]
    for promote_mod, (list_name, promote_fn) in itertools.product(promote_modules,
                                                                  promote_table):
        for fn in getattr(promote_mod, list_name):
            promote_fn(promote_mod.MODULE, fn, handle, verbose)

    # 2.5) Pre-0.4, add blacklist methods directly to HalfTensor and FloatTensor types
    if compat.tensor_is_float_tensor():
        for cls, (list_name, promote_fn) in itertools.product([torch.cuda.FloatTensor,
                                                               torch.cuda.HalfTensor],
                                                              promote_table):
            for fn in getattr(tensor_overrides, list_name):
                promote_fn(cls, fn, handle, verbose)

    # 3) For any in-place version of a blacklist function, error if any input is fp16.
    #    NB: this is overly conservative.
    for fn in utils.as_inplace(torch_overrides.FP32_FUNCS):
        wrap.err_if_any_half(torch_overrides.MODULE, fn, handle)

    # 3.5) For any in-place blacklist method, error if called on fp16 tensor
    for fn in utils.as_inplace(tensor_overrides.FP32_FUNCS):
        wrap.err_if_arg0_half(tensor_overrides.MODULE, fn, handle, verbose)
        if compat.tensor_is_float_tensor():
            wrap.err_if_arg0_half(torch.cuda.HalfTensor, fn, handle, verbose)

    # 4) For other in-place methods, match the type of self tensor
    for fn in utils.as_inplace(itertools.chain(
            tensor_overrides.FP16_FUNCS,
            tensor_overrides.CASTS)):
        wrap.promote_match_arg0(tensor_overrides.MODULE, fn, handle, verbose)
        if compat.tensor_is_float_tensor():
            wrap.promote_match_arg0(torch.cuda.HalfTensor, fn, handle, verbose)
            wrap.promote_match_arg0(torch.cuda.FloatTensor, fn, handle, verbose)

    # 5) RNNs + RNN cells are whitelisted specially
    # The two checks below are complementary: exactly one branch runs,
    # provided has_old_rnns() returns the same answer on both calls.
    if rnn_compat.has_old_rnns():
        wrap.rnn_cast(torch.nn.backends.thnn.backend, 'RNN', handle, verbose)
    if not rnn_compat.has_old_rnns():
        # Patch in our own indirection of `_VF` in modules/rnn s.t. it is mutable.
        torch.nn.modules.rnn._VF = rnn_compat.VariableFunctionsShim()
        # Wrap all the rnns
        for x in rnn_compat.RNN_NAMES:
            wrap.new_rnn_cast(x.upper(), handle, verbose)

    # Wrap all the RNN cells
    rnn_compat.whitelist_rnn_cells(handle, verbose)

    # 6) Place error+print message on banned functions.
    #    Or, if allow_banned, then cast to FP32.
    for fn, err_msg in functional_overrides.BANNED_FUNCS:
        if allow_banned:
            wrap.cached_cast(functional_overrides.MODULE, fn, utils.maybe_float,
                             handle, try_caching=True, verbose=verbose)
        else:
            wrap.err_if_any_half(functional_overrides.MODULE, fn, handle, err_msg)

    # Publish the handle: once for the decorator wrappers in this module,
    # once on the shared _amp_state object.
    _DECORATOR_HANDLE = handle

    _amp_state.handle = handle

    return handle


================================================
FILE: apex/apex/amp/compat.py
================================================
import torch

def variable_is_tensor():
    """Report whether a fresh ``torch.autograd.Variable`` is a ``torch.Tensor``.

    True for post-0.4, when Variables/Tensors merged.
    """
    return isinstance(torch.autograd.Variable(), torch.Tensor)

def tensor_is_variable():
    """Report whether the exact type of a fresh ``torch.Tensor()`` equals ``torch.autograd.Variable``."""
    return type(torch.Tensor()) == torch.autograd.Variable

def tensor_is_float_tensor():
    """Report whether the exact type of a fresh ``torch.Tensor()`` equals ``torch.FloatTensor``.

    False for post-0.4.
    """
    return type(torch.Tensor()) == torch.FloatTensor

def is_tensor_like(x):
    """Akin to ``torch.is_tensor``, but also accepts ``torch.autograd.Variable`` instances.

    The second check is what returns True for Variable objects in pre-0.4.
    """
    if torch.is_tensor(x):
        return True
    return isinstance(x, torch.autograd.Variable)

def is_floating_point(x):
    """Tell whether ``x`` holds floating-point data.

    Delegates to ``torch.is_floating_point`` when torch provides it.  Otherwise
    the suffix of ``x.type()`` is inspected, and objects that raise
    ``AttributeError`` during that inspection are reported as not floating point.
    """
    if hasattr(torch, 'is_floating_point'):
        return torch.is_floating_point(x)
    float_suffixes = ('FloatTensor', 'HalfTensor', 'DoubleTensor')
    try:
        return x.type().endswith(float_suffixes)
    except AttributeError:
        return False

def scalar_python_val(x):
    """Extract a scalar from ``x``.

    Uses ``x.item()`` when that attribute exists.  Otherwise returns element 0
    of ``x.data`` for a ``torch.autograd.Variable``, or element 0 of ``x`` itself.
    """
    if hasattr(x, 'item'):
        return x.item()
    container = x.data if isinstance(x, torch.autograd.Variable) else x
    return container[0]


================================================
FILE: apex/apex/amp/frontend.py
================================================
import torch
from ._initialize import _initialize
from ._amp_state import _amp_state, warn_or_err, maybe_print


class Properties(object):
    """
    This class has two purposes: to establish a set of default properties,
    and to route setting of these attributes through __setattr__ so that (in theory)
    they can be checked for consistency with other existing args.
    """
    def __init__(self):
        # "options" is not yet in __dict__ here, so this assignment takes the
        # plain object.__setattr__ path at the bottom of __setattr__.
        self.options = {
            "enabled" : False,
            "opt_level" : None,
            "cast_model_type" : None,
            "patch_torch_functions" : False,
            "keep_batchnorm_fp32" : None,
            "master_weights" : None,
            "loss_scale" : 1.0,
            # Reserved for future functionality
            # "fused_optimizer" : False,
            # "enable_ddp_interop" : False,
            }

    def _update_options_dict(self, new_options):
        """
        Update several options at a time without routing through
        __setattr__ checks, to avoid "you can't get there from here" scenarios.
        Currently not intended to be exposed; users are expected to select an opt_level
        and apply consistent modifications.

        Args:
            new_options: a mapping (or an iterable of ``(key, value)`` pairs)
                of option names to new values.

        Raises:
            ValueError: if a key is not an existing option.  Keys processed
                before the offending one have already been applied.
        """
        # dict() accepts both mappings and iterables of pairs.
        for k, v in dict(new_options).items():
            if k in self.options:
                self.options[k] = v
            else:
                raise ValueError("Tried to set unexpected option {}".format(k))

    def __getattr__(self, name):
        """
        The members of "options" are not direct attributes of self, so access attempts
        will roll down to __getattr__.  This borrows from the logic in torch.nn.Module.

        Raises:
            AttributeError: if ``name`` is not a key of ``options``.
        """
        if "options" in self.__dict__:
            options =  self.__dict__["options"]
            if name in options:
                return options[name]
        raise AttributeError("'{}' object has no attribute '{}'".format(
            type(self).__name__, name))

    def __setattr__(self, name, value):
        """
        Store ``value`` under ``options[name]``, warning (via ``warn_or_err``) about
        combinations that are inconsistent with the current ``opt_level``.

        Once ``options`` exists, assignments to names that are not options are
        silently ignored.  ``keep_batchnorm_fp32`` accepts the strings "True"/"False"
        and ``loss_scale`` is converted with ``float()`` unless it is "dynamic".
        """
        if "options" in self.__dict__:
            if name in self.options:
                # print("setting {} {}".format(name, value))
                if name == "cast_model_type":
                    # None, False and torch.float32 are the values accepted without a warning under O1.
                    if (self.opt_level == "O1"
                            and value is not None
                            and value is not False
                            and value is not torch.float32):
                        warn_or_err("O1 inserts casts around Torch functions rather than "
                                    "model weights, so with O1, the model weights themselves "
                                    "should remain FP32. If you wish to cast the model to a "
                                    "different type, use opt_level='O2' or 'O3'. " +
                                    "cast_model_type was {}".format(value))
                    self.options[name] = value
                elif name == "patch_torch_functions":
                    if self.opt_level != "O1" and value:
                        warn_or_err("Currently, patch_torch_functions=True should only be set by "
                                    "selecting opt_level='O1'.")
                    self.options[name] = value
                elif name == "keep_batchnorm_fp32":
                    if self.opt_level == "O1" and value is not None:
                        warn_or_err("With opt_level O1, batchnorm functions are automatically patched "
                                    "to run in FP32, so keep_batchnorm_fp32 should be None." +
                                    " keep_batchnorm_fp32 was {}".format(value))
                    # Accept the string spellings so the value can come straight from a command line.
                    if value == "False":
                        self.options[name] = False
                    elif value == "True":
                        self.options[name] = True
                    else:
                        assert (value is True or value is False or value is None),\
                            "keep_batchnorm_fp32 must be a boolean, the string 'True' or 'False', "\
                            "or None, found keep_batchnorm_fp32={}".format(value)
                        self.options[name] = value
                elif name == "master_weights":
                    if self.opt_level == "O1" and value is not None:
                        warn_or_err("It doesn't make sense to use master_weights with O1. "
                                    "With O1, your model weights themselves should be FP32.")
                    self.options[name] = value
                elif name == "loss_scale":
                    if value == "dynamic":
                        self.options[name] = value
                    else:
                        self.options[name] = float(value)
                else:
                    self.options[name] = value
        else:
            super(Properties, self).__setattr__(name, value)


""" O0-O3 are convenience wrappers to establish defaults for typically used mixed precision options. """

class O3:
    brief = "O3:  Pure FP16 training."
    more = (
        "Calls .half() on your model, converting the entire model to FP16.\n"
        "A casting operation is also inserted to cast incoming Tensors to FP16,\n"
        "so you don't need to change your data pipeline.\n"
        "This mode is useful for establishing a performance ceiling.\n"
        "It's also possible training may 'just work' in this mode.\n"
        "If not, try other optimization levels."
    )

    def __call__(self, properties):
        """Write the O3 defaults onto ``properties`` (in order) and return it."""
        # Reserved for future functionality: fused_optimizer, enable_ddp_interop.
        defaults = (
            ("enabled", True),
            ("opt_level", "O3"),
            ("cast_model_type", torch.float16),
            ("patch_torch_functions", False),
            ("keep_batchnorm_fp32", False),
            ("master_weights", False),
            ("loss_scale", 1.0),
        )
        for option, value in defaults:
            setattr(properties, option, value)
        return properties  # modified in place so this isn't really necessary


class O2:
    brief = "O2:  FP16 training with FP32 batchnorm and FP32 master weights.\n"
    more = (
        "Calls .half() on your model, converting the entire model (except for batchnorms)\n"
        "to FP16.  Batchnorms are retained in FP32 for additional stability.\n"
        "The forward pass is patched to cast incoming Tensors to FP16, so you don't need to change\n"
        "your data pipeline.\n"
        "O2 creates FP32 master weights outside the model and patches any optimizers to update\n"
        "these master weights, then copy the master weights into the FP16 model weights.\n"
        "Master weights can also improve convergence and stability."
    )

    def __call__(self, properties):
        """Write the O2 defaults onto ``properties`` (in order) and return it."""
        # Reserved for future functionality: fused_optimizer, enable_ddp_interop.
        defaults = (
            ("enabled", True),
            ("opt_level", "O2"),
            ("cast_model_type", torch.float16),
            ("patch_torch_functions", False),
            ("keep_batchnorm_fp32", True),
            ("master_weights", True),
            ("loss_scale", "dynamic"),
        )
        for option, value in defaults:
            setattr(properties, option, value)
        return properties  # modified in place so this isn't really necessary


class O1:
    brief = "O1:  Insert automatic casts around Pytorch functions and Tensor methods.\n"
    more = (
        "The type of your model's weights is not altered.  However, internally,\n"
        "Pytorch functions are patched to cast any Tensor Core-friendly ops to FP16 for speed,\n"
        "while operations that might benefit from the additional stability of FP32 are patched\n"
        "to cast their inputs to fp32.\n"
        "O1 is the safest way to try mixed precision training, and is recommended when\n"
        "trying mixed precision training for the first time."
    )

    def __call__(self, properties):
        """Write the O1 defaults onto ``properties`` (in order) and return it."""
        # Reserved for future functionality: fused_optimizer, enable_ddp_interop.
        defaults = (
            ("enabled", True),
            ("opt_level", "O1"),
            ("cast_model_type", None),
            ("patch_torch_functions", True),
            ("keep_batchnorm_fp32", None),
            ("master_weights", None),
            ("loss_scale", "dynamic"),
        )
        for option, value in defaults:
            setattr(properties, option, value)
        return properties  # modified in place so this isn't really necessary


class O0:
    brief = "O0:  Pure FP32 training.\n"
    more = (
        "Your models are checked to make sure parameters are FP32, but otherwise the\n"
        "types of weights and internal Pytorch operations are not altered.  This mode disables any\n"
        "FP16 arithmetic, although other optimizations like DDP interop may still be requested.\n"
    )

    def __call__(self, properties):
        """Write the O0 defaults onto ``properties`` (in order) and return it."""
        # Reserved for future functionality: fused_optimizer, enable_ddp_interop.
        defaults = (
            ("enabled", True),
            ("opt_level", "O0"),
            ("cast_model_type", torch.float32),
            ("patch_torch_functions", False),
            ("keep_batchnorm_fp32", None),
            ("master_weights", False),
            ("loss_scale", 1.0),
        )
        for option, value in defaults:
            setattr(properties, option, value)
        return properties  # modified in place so this isn't really necessary


# Maps each accepted opt_level string to a callable instance that writes that
# level's default options onto a Properties object and returns it.
opt_levels = {"O3": O3(),
              "O2": O2(),
              "O1": O1(),
              "O0": O0()}


# allow user to directly pass Properties struct as well?
def initialize(
    models,
    optimizers=None,
    enabled=True,
    opt_level="O1",
    cast_model_type=None,
    patch_torch_functions=None,
    keep_batchnorm_fp32=None,
    master_weights=None,
    loss_scale=None,
    cast_model_outputs=None,
    num_losses=1,
    verbosity=1,
    min_loss_scale=None,
    max_loss_scale=2.**24
    ):
    """
    Initialize your models, optimizers, and the Torch tensor and functional namespace according to the
    chosen ``opt_level`` and overridden properties, if any.

    ``amp.initialize`` should be called **after** you have finished
    constructing your model(s) and
    optimizer(s), but **before** you send your model through any DistributedDataParallel wrapper.
    See `Distributed training`_ in the Imagenet example.

    Currently, ``amp.initialize`` should only be called **once**,
    although it can process an arbitrary number of
    models and optimizers (see the corresponding `Advanced Amp Usage topic`_).
    If you think your use case requires ``amp.initialize`` to be called more than once,
    `let us know`_.

    Any property keyword argument that is not ``None`` will be interpreted as a manual override.

    To prevent having to rewrite anything else in your script, name the returned models/optimizers
    to replace the passed models/optimizers, as in the code sample below.

    Args:
        models (torch.nn.Module or list of torch.nn.Modules):  Models to modify/cast.
        optimizers (optional, torch.optim.Optimizer or list of torch.optim.Optimizers):  Optimizers to modify/cast.
            REQUIRED for training, optional for inference.
        enabled (bool, optional, default=True):  If False, renders all Amp calls no-ops, so your script
            should run as if Amp were not present.
        opt_level (str, optional, default="O1"):  Pure or mixed precision optimization level.  Accepted values are
            "O0", "O1", "O2", and "O3", explained in detail above.
        cast_model_type (``torch.dtype``, optional, default=None):  Optional property override, see
            above.
        patch_torch_functions (bool, optional, default=None):  Optional property override.
        keep_batchnorm_fp32 (bool or str, optional, default=None):  Optional property override.  If
            passed as a string, must be the string "True" or "False".
        master_weights (bool, optional, default=None):  Optional property override.
        loss_scale (float or str, optional, default=None):  Optional property override.  If passed as a string,
            must be a string representing a number, e.g., "128.0", or the string "dynamic".
        cast_model_outputs (torch.dtype, optional, default=None):  Option to ensure that the outputs
            of your model(s) are always cast to a particular type regardless of ``opt_level``.
        num_losses (int, optional, default=1):  Option to tell Amp in advance how many losses/backward
            passes you plan to use.  When used in conjunction with the ``loss_id`` argument to
            ``amp.scale_loss``, enables Amp to use a different loss scale per loss/backward pass,
            which can improve stability.  See "Multiple models/optimizers/losses"
            under `Advanced Amp Usage`_ for examples.  If ``num_losses`` is left to 1, Amp will still
            support multiple losses/backward passes, but use a single global loss scale
            for all of them.
        verbosity (int, default=1):  Set to 0 to suppress Amp-related output.
        min_loss_scale (float, default=None):  Sets a floor for the loss scale values that can be chosen by dynamic
            loss scaling.  The default value of None means that no floor is imposed.
            If dynamic loss scaling is not used, `min_loss_scale` is ignored.
        max_loss_scale (float, default=2.**24):  Sets a ceiling for the loss scale values that can be chosen by
            dynamic loss scaling.  If dynamic loss scaling is not used, `max_loss_scale` is ignored.

    Returns:
        Model(s) and optimizer(s) modified according to the ``opt_level``.
        If either the ``models`` or ``optimizers`` args were lists, the corresponding return value will
        also be a list.  When ``enabled`` is False the inputs are returned untouched: ``models`` alone
        if ``optimizers`` is None, otherwise the pair ``(models, optimizers)``.

    Raises:
        RuntimeError: if cudnn is disabled, or if ``opt_level`` is not one of the accepted values.

    Permissible invocations::

        model, optim = amp.initialize(model, optim,...)
        model, [optim1, optim2] = amp.initialize(model, [optim1, optim2],...)
        [model1, model2], optim = amp.initialize([model1, model2], optim,...)
        [model1, model2], [optim1, optim2] = amp.initialize([model1, model2], [optim1, optim2],...)

        # This is not an exhaustive list of the cross product of options that are possible,
        # just a set of examples.
        model, optim = amp.initialize(model, optim, opt_level="O0")
        model, optim = amp.initialize(model, optim, opt_level="O0", loss_scale="dynamic"|128.0|"128.0")

        model, optim = amp.initialize(model, optim, opt_level="O1") # uses loss_scale="dynamic" default
        model, optim = amp.initialize(model, optim, opt_level="O1", loss_scale=128.0|"128.0")

        model, optim = amp.initialize(model, optim, opt_level="O2") # uses loss_scale="dynamic" default
        model, optim = amp.initialize(model, optim, opt_level="O2", loss_scale=128.0|"128.0")
        model, optim = amp.initialize(model, optim, opt_level="O2", keep_batchnorm_fp32=True|False|"True"|"False")

        model, optim = amp.initialize(model, optim, opt_level="O3") # uses loss_scale=1.0 default
        model, optim = amp.initialize(model, optim, opt_level="O3", loss_scale="dynamic"|128.0|"128.0")
        model, optim = amp.initialize(model, optim, opt_level="O3", keep_batchnorm_fp32=True|False|"True"|"False")

    The `Imagenet example`_ demonstrates live use of various opt_levels and overrides.

    .. _`Distributed training`:
        https://github.com/NVIDIA/apex/tree/master/examples/imagenet#distributed-training

    .. _`Imagenet example`:
        https://github.com/NVIDIA/apex/tree/master/examples/imagenet

    .. _`Advanced Amp Usage`:
        https://nvidia.github.io/apex/advanced.html

    .. _`Advanced Amp Usage topic`:
        https://nvidia.github.io/apex/advanced.html#multiple-models-optimizers-losses

    .. _`let us know`:
        https://github.com/NVIDIA/apex/issues
    """
    _amp_state.opt_properties = Properties()
    _amp_state.verbosity = verbosity

    # Disabled: hand the inputs straight back, mirroring the shape of the call.
    if not enabled:
        return models if optimizers is None else (models, optimizers)

    if not torch.backends.cudnn.enabled:
        raise RuntimeError(
            "Amp requires torch.backends.cudnn.enabled = True")

    if opt_level not in opt_levels:
        raise RuntimeError(
            "Unexpected optimization level {}. ".format(opt_level) +
            "Options are 'O0', 'O1', 'O2', 'O3'.  Note that in `O0`, `O1`, etc., the prefix O is the letter O, " +
            "not the number zero.")

    # Apply the defaults of the selected level, then report them.
    level = opt_levels[opt_level]
    _amp_state.opt_properties = level(_amp_state.opt_properties)
    maybe_print("Selected optimization level {}".format(level.brief), True)
    maybe_print("Defaults for this optimization level are:", True)
    for option, current in _amp_state.opt_properties.options.items():
        maybe_print("{:22} : {}".format(option, current), True)

    _amp_state.min_loss_scale = min_loss_scale
    _amp_state.max_loss_scale = max_loss_scale

    maybe_print("Processing user overrides (additional kwargs that are not None)...", True)
    # The keyword arguments are listed explicitly in the signature rather than
    # collected via **kwargs, so the overrides are enumerated by hand, in the
    # order in which they must be applied.
    overrides = (
        ("enabled", enabled),
        ("opt_level", opt_level),
        ("cast_model_type", cast_model_type),
        ("patch_torch_functions", patch_torch_functions),
        ("keep_batchnorm_fp32", keep_batchnorm_fp32),
        ("master_weights", master_weights),
        ("loss_scale", loss_scale),
    )
    for option, requested in overrides:
        if requested is not None:
            setattr(_amp_state.opt_properties, option, requested)

    maybe_print("After processing overrides, optimization options are:", True)
    for option, current in _amp_state.opt_properties.options.items():
        maybe_print("{:22} : {}".format(option, current), True)

    return _initialize(models, optimizers, _amp_state.opt_properties, num_losses, cast_model_outputs)


# TODO:  is this necessary/useful?
# def check_option_consistency(enabled=True,
#                              opt_level=None,
#                              cast_model_type=None,
#                              patch_torch_functions=None,
#                              keep_batchnorm_fp32=None,
#                              master_weights=None,
#                              loss_scale=None,
#                              enable_ddp_interop=None,
#                              hard_override=False):
#     """
#     Utility function that enables users to quickly check if the option combination they intend
#     to use is permitted.  ``check_option_consistency`` does not require models or optimizers
#     to be constructed, and can be called at any point in the script.  ``check_option_consistency``
#     is totally self-contained; it does not set any amp global state or affect anything outside
#     of itself.
#     """
#
#     if not enabled:
#         return
#
#     if opt_level not in opt_levels:
#         raise RuntimeError("Unexpected optimization level.  Options are 'O0', 'O1', 'O2', 'O3'.")
#     else:
#         opt_properties = opt_levels[opt_level](Properties())
#         print("Selected optimization level {}", opt_levels[opt_level].brief)
#         print("Defaults for this optimization level are:")
#         for k, v in opt_properties.options:
#             print("{:22} : {}".format(k, v))
#
#     print("Processing user overrides (additional kwargs that are not None)...")
#     for k, v in kwargs:
#         if k not in _amp_state.opt_properties.options:
#             raise RuntimeError("Unexpected kwarg {}".format(k))
#         if v is not None:
#             setattr(opt_properties, k, v)
#
#     print("After processing overrides, optimization options are:")
#     for k, v in opt_properties.options:
#         print("{:22} : {}".format(k, v))


================================================
FILE: apex/apex/amp/handle.py
================================================
import contextlib
import warnings
import torch

from . import utils
from .opt import OptimWrapper
from .scaler import LossScaler
from ._amp_state import _amp_state, master_params, maybe_print
from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
from ..optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
from ..parallel.LARC import LARC


# There's no reason to expose the notion of a "handle". Everything can happen through amp.* calls.
@contextlib.contextmanager
def scale_loss(loss,
               optimizers,
               loss_id=0,
               model=None,
               delay_unscale=False,
               delay_overflow_check=False):
    """
    On context manager entrance, creates ``scaled_loss = (loss.float())*current loss scale``.
    ``scaled_loss`` is yielded so that the user can call ``scaled_loss.backward()``::

        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()

    On context manager exit (if ``delay_unscale=False``), the gradients are checked for infs/NaNs
    and unscaled, so that ``optimizer.step()`` can be called.

    .. note::
        If Amp is using explicit FP32 master params (which is the default for ``opt_level=O2``, and
        can also be manually enabled by supplying ``master_weights=True`` to ``amp.initialize``)
        any FP16 gradients are copied to FP32 master gradients before being unscaled.
        ``optimizer.step()`` will then apply the unscaled master gradients to the master params.

    .. warning::
        If Amp is using explicit FP32 master params, only the FP32 master gradients will be
        unscaled.  The direct ``.grad`` attributes of any FP16
        model params will remain scaled after context manager exit.
        This subtlety affects gradient clipping.  See "Gradient clipping" under
        `Advanced Amp Usage`_ for best practices.

    Args:
        loss(Tensor):  Typically a scalar Tensor. The ``scaled_loss`` that the context
            manager yields is simply ``loss.float()*loss_scale``, so in principle
            ``loss`` could have more than one element, as long as you call
            ``backward()`` on ``scaled_loss`` appropriately within the context manager body.
        optimizers:  All optimizer(s) for which the current backward pass is creating gradients.
            Must be an optimizer or list of optimizers returned from an earlier call
            to ``amp.initialize``.  For example use with multiple optimizers, see
            "Multiple models/optimizers/losses" under `Advanced Amp Usage`_.
        loss_id(int, optional, default=0):  When used in conjunction with the ``num_losses`` argument
            to ``amp.initialize``, enables Amp to use a different loss scale per loss.  ``loss_id``
            must be an integer between 0 and ``num_losses`` that tells Amp which loss is
            being used for the current backward pass.  See "Multiple models/optimizers/losses"
            under `Advanced Amp Usage`_ for examples.  If ``loss_id`` is left unspecified, Amp
            will use the default global loss scaler for this backward pass.
        model(torch.nn.Module, optional, default=None):  Currently unused, reserved to enable future
            optimizations.
        delay_unscale(bool, optional, default=False):  ``delay_unscale`` is never necessary, and
            the default value of ``False`` is strongly recommended.
            If ``True``, Amp will not unscale the gradients or perform model->master
            gradient copies on context manager exit.
            ``delay_unscale=True`` is a minor ninja performance optimization and can result
            in weird gotchas (especially with multiple models/optimizers/losses),
            so only use it if you know what you're doing.
            "Gradient accumulation across iterations" under `Advanced Amp Usage`_
            illustrates a situation where this CAN (but does not need to) be used.
        delay_overflow_check(bool, optional, default=False):  If ``True``, the loss scaler's
            ``update_scale()`` is not called on exit and the optimizer step is never patched
            to be skipped by this invocation.

    Raises:
        RuntimeError: if ``amp.initialize`` has not been called yet.

    .. warning::
        If ``delay_unscale`` is ``True`` for a given backward pass, ``optimizer.step()`` cannot be
        called yet after context manager exit, and must wait for another, later backward context
        manager invocation with ``delay_unscale`` left to False.

    .. _`Advanced Amp Usage`:
        https://nvidia.github.io/apex/advanced.html
    """
    if not hasattr(_amp_state, "opt_properties"):
        raise RuntimeError("Invoked 'with amp.scale_loss`, but internal Amp state has not been initialized.  "
                           "model, optimizer = amp.initialize(model, optimizer, opt_level=...) must be called "
                           "before `with amp.scale_loss`.")

    if not _amp_state.opt_properties.enabled:
        yield loss
        return

    if isinstance(optimizers, torch.optim.Optimizer) or isinstance(optimizers, LARC):
        optimizers = [optimizers]

    # this is what happens when i have to support tools from different sources under the same API...
    # TODO:  Rewrite FusedAdam to use multi-tensor apply and the same loss scaler.
    if isinstance(optimizers, FP16_Optimizer_for_fused):
        # The fused wrapper carries its own scale (cur_scale); no Amp LossScaler
        # is looked up for it.  Bind the name explicitly so the short-circuit
        # test below cannot hit an unbound local.
        loss_scaler = None
        loss_scale = optimizers.cur_scale
    else:
        loss_scaler = _amp_state.loss_scalers[loss_id]
        loss_scale = loss_scaler.loss_scale()

    # Short-circuit only when an Amp LossScaler is in use, it is static, and
    # scaling by 1.0 would be a no-op anyway.
    if ((not _amp_state.opt_properties.master_weights)
        and loss_scaler is not None
        and (not loss_scaler.dynamic)
        and loss_scale == 1.0):
        yield loss.float()
        # Needing to drop the cache here as well is an ugly gotcha.
        # But for now I think it's necessary to short-circuit.
        # Probably ok to skip this if not delay_unscale
        if _amp_state.opt_properties.patch_torch_functions:
            _amp_state.handle._clear_cache()
        return

    if not delay_unscale:
        if isinstance(optimizers, list):
            for optimizer in optimizers:
                if not optimizer._amp_stash.params_have_scaled_gradients:
                    optimizer._prepare_amp_backward()

    yield (loss.float())*loss_scale

    if delay_unscale:
        for optimizer in optimizers:
            optimizer._amp_stash.params_have_scaled_gradients = True
    else:
        # FusedAdam and FusedSGD will take care of unscaling as part of their step() methods.
        if not isinstance(optimizers, FP16_Optimizer_for_fused):
            loss_scaler.clear_overflow_state()
            for optimizer in optimizers:
                optimizer._post_amp_backward(loss_scaler)
                optimizer._amp_stash.params_have_scaled_gradients = False
            # For future fused optimizers that enable sync-free dynamic loss scaling,
            # should_skip will always be False.
            should_skip = False if delay_overflow_check else loss_scaler.update_scale()
            if should_skip:
                for optimizer in optimizers:
                    if not optimizer._amp_stash.already_patched:
                        # Close on loss_scaler and loss_id as well, to be safe.  Probably not
                        # necessary because amp.scale_loss is already creating a temporary scope.
                        def patch_step(opt, loss_scaler, loss_id):
                            opt_step = opt.step
                            # One-shot replacement for opt.step: it reports the
                            # overflow, then restores the real step method.
                            def skip_step(closure=None):
                                if closure is not None:
                                    raise RuntimeError("Currently, Amp does not support closure use with optimizers.")
                                maybe_print(("Gradient overflow.  Skipping step, loss scaler " +
                                             "{} reducing loss scale to {}").format(loss_id,
                                             loss_scaler.loss_scale()))
                                if hasattr(opt._amp_stash, "all_fp32_from_fp16_params"):
                                    # Clear the master grads that wouldn't be zeroed by model.zero_grad()
                                    for param in opt._amp_stash.all_fp32_from_fp16_params:
                                        param.grad = None
                                opt.step = opt_step
                                opt._amp_stash.already_patched = False
                            return skip_step
                        optimizer.step = patch_step(optimizer, loss_scaler, loss_id)
                        optimizer._amp_stash.already_patched = True

    # Probably ok to skip this if not delay_unscale
    if _amp_state.opt_properties.patch_torch_functions:
        _amp_state.handle._clear_cache()


# Free function version of AmpHandle.disable_casts, another step on the
# path to removing the concept of "AmpHandle"
@contextlib.contextmanager
def disable_casts():
    """Temporarily turn off Amp casting for the body of a ``with`` block.

    Sets ``_amp_state.handle._is_active`` to False on entry and back to True
    on exit.  The reset happens in a ``finally`` clause so that an exception
    raised inside the ``with`` body cannot leave casting switched off.
    """
    _amp_state.handle._is_active = False
    try:
        yield
    finally:
        _amp_state.handle._is_active = True


class AmpHandle(object):
    """State holder for the (legacy) Amp handle API.

    Keeps the active flag consulted by the cast wrappers, the cache of casted
    values, and the list of patched functions so they can be restored by
    ``_deactivate``.
    """

    def __init__(self, loss_scale="dynamic", enable_caching=True, verbose=False):
        """Create a handle.

        Args:
            loss_scale: forwarded to ``LossScaler`` for the default scaler.
            enable_caching: value reported by the ``has_cache`` property.
            verbose: value reported by the ``verbose`` property.
        """
        self._enable_caching = enable_caching
        self._verbose = verbose
        self._cache = dict()
        self._default_scaler = LossScaler(loss_scale)
        self._is_active = True
        # (mod, fn, original_func) triples recorded by _save_func.
        self._all_wrappers = []

    def is_active(self):
        """Return True while casting is enabled on this handle."""
        return self._is_active

    @contextlib.contextmanager
    def _disable_casts(self):
        """Context manager that marks the handle inactive inside the ``with`` body.

        The flag is restored in a ``finally`` clause so an exception raised in
        the body does not leave the handle permanently inactive.
        """
        self._is_active = False
        try:
            yield
        finally:
            self._is_active = True

    def wrap_optimizer(self, optimizer, num_loss=1):
        """Wrap ``optimizer`` in an ``OptimWrapper`` and drop the default scaler."""
        self._default_scaler = None
        return OptimWrapper(optimizer, self, num_loss)

    @contextlib.contextmanager
    def scale_loss(self, loss, optimizer):
        """Always raise: the old handle-based ``scale_loss`` API is unsupported.

        Raises:
            RuntimeError: unconditionally, when the context is entered.
        """
        raise RuntimeError("The old Amp API is no longer supported.  Please move to the new API, "
            "documented here:  https://nvidia.github.io/apex/amp.html.  Transition guide:  "
            "https://nvidia.github.io/apex/amp.html#transition-guide-for-old-api-users")
        # Never reached.  The ``yield`` is kept only so this stays a generator
        # function: the error is then raised on ``__enter__`` (as before) rather
        # than at call time.  The legacy implementation that used to follow the
        # raise was unreachable and has been removed.
        yield loss

    def _clear_cache(self):
        """Empty the cast cache."""
        self._cache.clear()

    # Experimental support for saving / restoring uncasted versions of functions
    def _save_func(self, mod, fn, func):
        """Record that attribute ``fn`` of ``mod`` originally held ``func``."""
        self._all_wrappers.append((mod, fn, func))

    def _deactivate(self):
        """Restore every function recorded by ``_save_func`` and forget them."""
        for mod, fn, func in self._all_wrappers:
            utils.set_func(mod, fn, func)
        self._all_wrappers = []

    @property
    def has_cache(self):
        """Whether caching was enabled at construction time."""
        return self._enable_caching

    @property
    def cache(self):
        """The dict used as the cast cache."""
        return self._cache

    def remove_cache(self, param):
        """Drop ``param`` from the cache if caching is enabled and it is present."""
        if self.has_cache and param in self.cache:
            del self.cache[param]

    @property
    def verbose(self):
        """The verbosity flag given at construction time."""
        return self._verbose

class NoOpHandle(object):
    """Inert stand-in for ``AmpHandle``: never active, never caches, never verbose."""

    # --- status queries -------------------------------------------------
    def is_active(self):
        """A no-op handle is never active."""
        return False

    @property
    def has_cache(self):
        """No cache is kept."""
        return False

    @property
    def verbose(self):
        """Never verbose."""
        return False

    # --- context managers -----------------------------------------------
    @contextlib.contextmanager
    def _disable_casts(self):
        """Nothing to disable; simply run the ``with`` body."""
        yield

    @contextlib.contextmanager
    def scale_loss(self, loss, optimizer):
        """Hand the loss back unscaled."""
        yield loss

    # --- optimizer / lifecycle ------------------------------------------
    def wrap_optimizer(self, optimizer, num_loss=1):
        """Wrap ``optimizer`` in an ``OptimWrapper`` bound to this handle."""
        return OptimWrapper(optimizer, self, num_loss)

    def _clear_cache(self):
        """No cache to clear."""
        return None

    def _deactivate(self):
        """Nothing was patched, so nothing to restore."""
        return None


================================================
FILE: apex/apex/amp/lists/__init__.py
================================================


================================================
FILE: apex/apex/amp/lists/functional_overrides.py
================================================

# TODO: think about the following two. They do weird things.
# - torch.nn.utils.clip_grad (but it should always be fp32 anyway)
# - torch.nn.utils.weight_norm

# Notes:
# F.instance_norm uses batch_norm internally. Which correctly handles
#   fp16 in/out with fp32 weights. So we shouldn't do anything for
#   either of these.
# F.normalize calls `input.norm()` internally, so it's redundant, but
#   kept here in case impl. changes.
# F.cosine_similarity is same: calls `x.norm()` internally.

import torch.nn.functional

# The module whose attributes the names below refer to.
MODULE = torch.nn.functional

# Names of torch.nn.functional functions listed for fp16.
FP16_FUNCS = [
    'conv1d',
    'conv2d',
    'conv3d',
    'conv_transpose1d',
    'conv_transpose2d',
    'conv_transpose3d',
    'conv_tbc', # Undocumented / maybe new?
    'linear',
]

# Names of torch.nn.functional functions listed for fp32.
FP32_FUNCS = [

    # Interpolation/Upsampling
    'interpolate',

    # Pointwise
    'softplus',
    'softmin',
    'log_softmax',
    'softmax',

    # Normalization
    'layer_norm',
    'group_norm',
    'local_response_norm',
    'normalize',
    'cosine_similarity',

    # Loss functions
    # TODO: which of these can be fp16?
    'poisson_nll_loss',
    'cosine_embedding_loss',
    'cross_entropy',
    'hinge_embedding_loss',
    'kl_div',
    'l1_loss',
    'mse_loss',
    'margin_ranking_loss',
    'multilabel_margin_loss',
    'multilabel_soft_margin_loss',
    'multi_margin_loss',
    'nll_loss',
    'binary_cross_entropy_with_logits',
    'smooth_l1_loss',
    'soft_margin_loss',
    'triplet_margin_loss'
]

# (function name, explanatory message) pairs for functions amp refuses to
# handle by default.  The message text itself describes the workaround and the
# `allow_banned=True` escape hatch.
BANNED_FUNCS = [
    ('binary_cross_entropy',
     ("\namp does not work out-of-the-box with `F.binary_cross_entropy` or `torch.nn.BCELoss.` "
      "It requires that the output of the previous function be already a FloatTensor. \n\n"
      "Most models have a Sigmoid right before BCELoss. In that case, you can use\n"
      "    torch.nn.BCEWithLogitsLoss\nto combine Sigmoid+BCELoss into a single layer "
      "that is compatible with amp.\nAnother option is to add\n"
      "    amp.register_float_function(torch, 'sigmoid')\nbefore calling `amp.init()`.\n"
      "If you _really_ know what you are doing, you can disable this warning by passing "
      "allow_banned=True to `amp.init()`."))
]


================================================
FILE: apex/apex/amp/lists/tensor_overrides.py
================================================
from .. import compat
from . import torch_overrides

import importlib

import torch

# Pick the class whose methods the names below refer to, based on the two
# compatibility probes in `compat`.
if compat.variable_is_tensor() and not compat.tensor_is_variable():
    MODULE = torch.Tensor
else:
    MODULE = torch.autograd.Variable


# Tensor methods listed for fp16.
FP16_FUNCS = [
    '__matmul__',
]

# Tensor methods listed for fp32.
FP32_FUNCS = [
    '__ipow__',
    '__pow__',
    '__rpow__',

    # Cast to fp32 before transfer to CPU
    'cpu',
]

# Binary-operator dunder methods listed as casts.
CASTS = [
    '__add__',
    '__div__',
    '__eq__',
    '__ge__',
    '__gt__',
    '__iadd__',
    '__idiv__',
    '__imul__',
    '__isub__',
    '__itruediv__',
    '__le__',
    '__lt__',
    '__mul__',
    '__ne__',
    '__radd__',
    '__rdiv__',
    '__rmul__',
    '__rsub__',
    '__rtruediv__',
    '__sub__',
    '__truediv__',
]

# None of these, but here to make code cleaner.
SEQUENCE_CASTS = []

# We need to grab all the methods from torch_overrides and add them to
# the Tensor lists as well, as almost all methods are duplicated
# between `torch` and `torch.Tensor` (and check with `hasattr`,
# because a few random ones aren't defined on Tensor)
#
# This runs at import time and extends the four lists above in place: each
# list is fetched from this very module by name, and every entry of the
# same-named list in torch_overrides that exists on MODULE is appended.
_self_mod = importlib.import_module(__name__)
for attrname in ['FP16_FUNCS', 'FP32_FUNCS', 'CASTS', 'SEQUENCE_CASTS']:
    lst = getattr(_self_mod, attrname)
    for fn in getattr(torch_overrides, attrname):
        if hasattr(MODULE, fn):
            lst.append(fn)


================================================
FILE: apex/apex/amp/lists/torch_overrides.py
================================================
import torch

from .. import utils

# The module whose attributes the names below refer to.
MODULE = torch

# Names of `torch` functions listed for fp16.
FP16_FUNCS = [
    # Low level functions wrapped by torch.nn layers.
    # The wrapper layers contain the weights which are then passed in as a parameter
    # to these functions.
    'conv1d',
    'conv2d',
    'conv3d',
    'conv_transpose1d',
    'conv_transpose2d',
    'conv_transpose3d',
    'conv_tbc',
    'prelu',

    # BLAS
    'addmm',
    'addmv',
    'addr',
    'matmul',
    'mm',
    'mv',
]

# Names of `torch` functions listed for fp32.
FP32_FUNCS = [
    # Pointwise
    'acos',
    'asin',
    'cosh',
    'erfinv',
    'exp',
    'expm1',
    'log',
    'log10',
    'log2',
    'reciprocal',
    'rsqrt',
    'sinh',
    'tan',

    # Other math
    'pow',

    # Reduction
    'cumprod',
    'cumsum',
    'dist',
    'mean',
    'norm',
    'prod',
    'std',
    'sum',
    'var',

    # Misc
    'renorm'
]

# Before CUDA 9.1, batched matmul was missing fast FP16 kernels. We
# check the CUDA version -- if at least 9.1, then put the bmm
# functions on the fp16 list. Otherwise, put them on the fp32 list.
# NOTE: this branch executes at import time, so the final contents of the two
# lists above depend on what utils.get_cuda_version() returns.
_bmms = ['addbmm',
         'baddbmm',
         'bmm']
if utils.get_cuda_version() >= (9, 1, 0):
    FP16_FUNCS.extend(_bmms)
else:
    FP32_FUNCS.extend(_bmms)

# Multi-tensor fns that may need type promotion
CASTS = [
    # Multi-tensor math
    'addcdiv',
    'addcmul',
    'atan2',
    'cross',
    'bilinear',

    # Element-wise _or_ tensor-wise math
    'add',
    'div',
    'mul',

    # Comparison
    'eq',
    'equal',
    'ge',
    'gt',
    'le',
    'lt',
    'ne'
]

# Functions that take sequence arguments. We need to inspect the whole
# sequence and cast to the widest type.
SEQUENCE_CASTS = [
    'cat',
    'stack'
]


================================================
FILE: apex/apex/amp/opt.py
================================================
import contextlib
import warnings

from .scaler import LossScaler, master_params
from ._amp_state import maybe_print

import numpy as np

class OptimWrapper(object):
    """Optimizer wrapper that scales losses and skips steps after gradient overflow.

    One dynamic ``LossScaler`` is kept per loss.  ``scale_loss`` must be entered
    once per loss (in order) between consecutive calls to ``step``.  Attribute
    lookups that are not defined here are forwarded to the wrapped optimizer.
    """

    def __init__(self, optimizer, amp_handle, num_loss):
        """Wrap ``optimizer``.

        Args:
            optimizer: the optimizer to delegate to.
            amp_handle: handle consulted via ``is_active()`` / ``remove_cache()``.
            num_loss: number of losses (and hence loss scalers) per step.
        """
        self._optimizer = optimizer
        self._amp_handle = amp_handle
        self._num_loss = num_loss
        # Index of the loss whose scale_loss() context comes next.
        self._loss_idx = 0
        # Per-loss overflow flags collected since the last step().
        self._skip_next = [False] * num_loss
        self._loss_scaler = [LossScaler('dynamic') for _ in range(num_loss)]

    @contextlib.contextmanager
    def scale_loss(self, loss):
        """Yield ``loss`` multiplied by the current loss scale, then unscale the grads.

        If the handle is inactive the loss is yielded untouched and nothing
        else happens.  After the ``with`` body, gradients of the optimizer's
        master params are unscaled, the overflow result is recorded for
        ``step``, and the loss index advances.
        """
        if not self._amp_handle.is_active():
            yield loss
            return

        # When there are multiple losses per-optimizer, we need
        # to save out current grad accumulation, since we won't be
        # able to unscale this particular loss once the grads are
        # all mixed together.
        cached_grads = []
        if self._loss_idx > 0:
            for p in master_params(self._optimizer):
                if p.grad is not None:
                    cached_grads.append(p.grad.data.detach().clone())
                else:
                    cached_grads.append(None)
            self._optimizer.zero_grad()

        loss_scale = self._cur_loss_scaler().loss_scale()
        yield loss * loss_scale

        self._cur_loss_scaler().clear_overflow_state()
        self._cur_loss_scaler().unscale(
            master_params(self._optimizer),
            master_params(self._optimizer),
            loss_scale)
        self._skip_next[self._loss_idx] = self._cur_loss_scaler().update_scale()
        self._loss_idx += 1

        if len(cached_grads) > 0:
            for p, cached_grad in zip(master_params(self._optimizer),
                                      cached_grads):
                if cached_grad is None:
                    continue
                if p.grad is None:
                    # This loss produced no gradient for p (and zero_grad() left
                    # the grad unset), so the stashed gradient is the total.
                    p.grad = cached_grad
                else:
                    p.grad.data.add_(cached_grad)
            cached_grads = []

    def _cur_loss_scaler(self):
        """Return the scaler for the loss currently being processed."""
        assert 0 <= self._loss_idx < self._num_loss
        return self._loss_scaler[self._loss_idx]

    def step(self, closure=None):
        """Step the wrapped optimizer unless any loss overflowed since the last step.

        Raises:
            NotImplementedError: if ``closure`` is given while the handle is active.
        """
        if not self._amp_handle.is_active():
            return self._optimizer.step(closure=closure)

        self._loss_idx = 0

        for group in self._optimizer.param_groups:
            for p in group['params']:
                self._amp_handle.remove_cache(p)

        if closure is not None:
            raise NotImplementedError(
                'The `closure` argument is unsupported by the amp ' +
                'optimizer wrapper.')
        if any(self._skip_next):
            maybe_print('Gradient overflow, skipping update')
            self._skip_next = [False] * self._num_loss
        else:
            return self._optimizer.step(closure=closure)

    # Forward any attribute lookups
    def __getattr__(self, attr):
        # __getattr__ only runs when normal lookup fails.  If `_optimizer`
        # itself is missing (e.g. on an instance created without __init__),
        # forwarding would recurse forever, so fail cleanly instead.
        if attr == '_optimizer':
            raise AttributeError(attr)
        return getattr(self._optimizer, attr)

    # Forward all torch.optim.Optimizer methods
    def __getstate__(self):
        return self._optimizer.__getstate__()

    def __setstate__(self, state):
        # `state` must be accepted and passed on; the previous zero-argument
        # signature could never be called successfully with a state.
        return self._optimizer.__setstate__(state)

    def __repr__(self):
        return self._optimizer.__repr__()

    def state_dict(self):
        return self._optimizer.state_dict()

    def load_state_dict(self, state_dict):
        return self._optimizer.load_state_dict(state_dict)

    def zero_grad(self):
        return self._optimizer.zero_grad()

    def add_param_group(self, param_group):
        return self._optimizer.add_param_group(param_group)


================================================
FILE: apex/apex/amp/rnn_compat.py
================================================
from . import utils, wrap

import torch
# Private PyTorch object on which the wrappers below look up RNN functions by name.
_VF = torch._C._VariableFunctions
# Base RNN function names; the shim below also covers the '_cell' variant of each.
RNN_NAMES = ['rnn_relu', 'rnn_tanh', 'gru', 'lstm']

def _gen_VF_wrapper(name):
    """Build a function that forwards its arguments to ``_VF.<name>``.

    The attribute is looked up on ``_VF`` at call time, not when the wrapper
    is created.
    """
    def wrapper(*args, **kwargs):
        target = getattr(_VF, name)
        return target(*args, **kwargs)
    return wrapper

# Some python magic to generate an object that has the rnn cell functions
# defined on it, all of which call into corresponding _VF version.
# Intended to patch torch.nn.modules.rnn._VF (aka, the ref named "_VF"
# imported at module scope within torch.nn.modules.rnn).  This should
# not affect third-party importers of _VF.py.
class VariableFunctionsShim(object):
    """Object exposing every RNN function (and its ``_cell`` variant) as an attribute.

    Each attribute is a plain Python wrapper that calls the same-named
    function on ``_VF``.
    """

    def __init__(self):
        wrapped_names = (base + suffix
                         for base in RNN_NAMES
                         for suffix in ['', '_cell'])
        for fn_name in wrapped_names:
            setattr(self, fn_name, _gen_VF_wrapper(fn_name))

def has_old_rnns():
    """Return True if ``torch.nn.backends.thnn.backend.LSTMCell`` can be resolved.

    Only lookup failures are treated as "not available"; unrelated exceptions
    (e.g. KeyboardInterrupt) are no longer swallowed.
    """
    try:
        torch.nn.backends.thnn.backend.LSTMCell
    except (AttributeError, ImportError):
        return False
    return True

def whitelist_rnn_cells(handle, verbose):
    """Install half-precision cached casts on the RNN cell functions.

    The target module and function names differ between the old RNN backend
    and the newer ``_VF``-based one; ``has_old_rnns()`` picks between them.
    """
    legacy = has_old_rnns()

    # Different module + function names in old/new RNN cases
    if legacy:
        mod = torch.nn.backends.thnn.backend
        fn_names = ['RNNReLUCell', 'RNNTanhCell', 'LSTMCell', 'GRUCell']
    else:
        mod = torch.nn.modules.rnn._VF
        fn_names = [base + '_cell' for base in RNN_NAMES]
        assert isinstance(mod, VariableFunctionsShim)

    # Insert casts on cell functions
    for cell_fn in fn_names:
        wrap.cached_cast(mod, cell_fn, utils.maybe_half, handle,
                         try_caching=True, verbose=verbose)

    if not legacy:
        return

    # Special handling of `backward` for fused gru / lstm:
    # The `backward` method calls Tensor.sum() (blacklist) internally,
    # and then the resulting grad_input has the wrong type.
    # TODO: where else is this a problem?
    fused_namespace = torch.nn._functions.thnn.rnnFusedPointwise
    for rnn_type in ['GRUFused', 'LSTMFused']:
        fused_cls = getattr(fused_namespace, rnn_type)
        wrap.disable_casts(fused_cls, 'backward', handle)


================================================
FILE: apex/apex/amp/scaler.py
================================================
import torch
from ..multi_tensor_apply import multi_tensor_applier
from ._amp_state import _amp_state, master_params, maybe_print
from itertools import product

def scale_check_overflow_python(model_grad, master_grad, scale, check_overflow=False):
    """Copy ``model_grad`` into ``master_grad`` and multiply it by ``scale``.

    When ``check_overflow`` is set, the sum of ``model_grad`` is inspected
    first; if it is +/-inf or NaN the function returns True without touching
    ``master_grad``.  Otherwise it returns False.
    """
    # Exception handling for 18.04 compatibility
    if check_overflow:
        total = float(model_grad.float().sum())
        is_nan = total != total
        if abs(total) == float('inf') or is_nan:
            return True

    # copy_ probably internally short-circuits the aliasing case anyway
    if master_grad is not model_grad:
        master_grad.copy_(model_grad)
    if scale != 1.0:
        master_grad.mul_(scale)
    return False

def axpby_check_overflow_python(model_grad, stashed_grad, master_grad, scale, check_overflow=False):
    """Accumulate ``scale * model_grad`` into ``stashed_grad`` and expose it as ``master_grad``.

    When ``check_overflow`` is set, the sum of ``model_grad`` is inspected
    first; if it is +/-inf or NaN the function returns True and modifies
    nothing.  Otherwise ``stashed_grad`` is updated in place,
    ``master_grad.data`` is pointed at ``stashed_grad.data``, and False is
    returned.

    Raises:
        AssertionError: if ``stashed_grad`` and ``master_grad`` dtypes differ.
    """
    # Exception handling for 18.04 compatibility
    if check_overflow:
        cpu_sum = float(model_grad.float().sum())
        if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
            return True

    assert stashed_grad.dtype == master_grad.dtype
    converted_model_grad = model_grad.to(master_grad.dtype)
    # Use the keyword form: the positional add_(scalar, tensor) overload is
    # deprecated in PyTorch.
    stashed_grad.add_(converted_model_grad, alpha=scale)
    master_grad.data = stashed_grad.data
    return False

class LossScaler(object):
    """Holds a static or dynamic loss scale and unscales gradients.

    In dynamic mode an overflow detected during unscaling divides the scale by
    ``scale_factor`` (bounded below by ``min_loss_scale`` if given), and
    ``scale_window`` consecutive overflow-free updates multiply it by
    ``scale_factor`` (bounded above by ``max_loss_scale``).
    """
    # Class-wide flags shared by all scalers.
    warned_no_fused_kernel = False
    warned_unscaling_non_fp32_grad = False
    has_fused_kernel = False

    def __init__(self,
                 loss_scale,
                 init_scale=2.**16,
                 scale_factor=2.,
                 scale_window=2000,
                 min_loss_scale=None,
                 max_loss_scale=2.**24):
        """Create a scaler.

        Args:
            loss_scale: the string ``"dynamic"`` or a fixed scale value.
            init_scale: starting scale in dynamic mode.
            scale_factor: factor by which the dynamic scale shrinks or grows.
                Previously accepted but ignored (2 was hard-coded); the
                default keeps the old behavior.
            scale_window: overflow-free updates required before growing.
            min_loss_scale: optional lower bound applied when shrinking.
            max_loss_scale: upper bound applied when growing.
        """
        if loss_scale == "dynamic":
            self.dynamic = True
            self._loss_scale = init_scale
        else:
            self.dynamic = False
            self._loss_scale = loss_scale
        self._scale_factor = scale_factor
        self._max_loss_scale = max_loss_scale
        self._min_loss_scale = min_loss_scale
        self._scale_seq_len = scale_window
        self._unskipped = 0
        self._has_overflow = False
        self._overflow_buf = torch.cuda.IntTensor([0])
        if multi_tensor_applier.available:
            import amp_C
            LossScaler.has_fused_kernel = multi_tensor_applier.available
            LossScaler.multi_tensor_scale_cuda = amp_C.multi_tensor_scale
            LossScaler.multi_tensor_axpby_cuda = amp_C.multi_tensor_axpby
        else:
            if not LossScaler.warned_no_fused_kernel:
                maybe_print(
                    "Warning:  multi_tensor_applier fused unscale kernel is unavailable, "
                    "possibly because apex was installed without --cuda_ext --cpp_ext. "
                    "Using Python fallback.  Original ImportError was: " +
                    repr(multi_tensor_applier.import_err),
                    True)
            LossScaler.has_fused_kernel = False
            LossScaler.warned_no_fused_kernel = True

    def loss_scale(self):
        """Return the current loss scale."""
        return self._loss_scale

    def unscale_python(self, model_grads, master_grads, scale):
        """Python fallback: unscale each model grad into its master grad.

        Stops at the first overflow in dynamic mode.
        """
        for model, master in zip(model_grads, master_grads):
            if model is not None:
                if not LossScaler.warned_unscaling_non_fp32_grad:
                    if master.dtype != torch.float32:
                        maybe_print(
                            "Attempting to unscale a grad with type {} ".format(master.type()) +
                            "Unscaling non-fp32 grads may indicate an error. "
                            "When using Amp, you don't need to call .half() on your model.")
                        LossScaler.warned_unscaling_non_fp32_grad = True
                self._has_overflow = scale_check_overflow_python(model,
                                                                 master,
                                                                 1./scale,
                                                                 self.dynamic)
                if self._has_overflow and self.dynamic:
                    break

    # unused_scale keeps some of the old API alive for hopefully a short time.
    def unscale(self, model_grads, master_grads, unused_scale, models_are_masters=False):
        """Write ``model_grads / loss_scale`` into ``master_grads``.

        Does nothing if an overflow has already been recorded, or if the scale
        is 1.0, the grads are shared and the scaler is static.
        """
        if self._has_overflow:
            return

        scale = self._loss_scale

        if scale == 1.0 and models_are_masters and not self.dynamic:
            return

        if LossScaler.has_fused_kernel:
            # if (not LossScaler.warned_unscaling_non_fp32_grad
            #     and master_grads[0].dtype == torch.float16):
            #     print("Warning:  unscaling grads that are not FP32. "
            #           "Unscaling non-fp32 grads may indicate an error. "
            #           "When using Amp, you don't need to call .half() on your model.")
            #     # Setting this to True unconditionally allows the possibility of an escape
            #     # if never-before-seen non-fp32 grads are created in some later iteration.
            #     LossScaler.warned_unscaling_non_fp32_grad = True
            multi_tensor_applier(LossScaler.multi_tensor_scale_cuda,
                                 self._overflow_buf,
                                 [model_grads, master_grads],
                                 1./scale)
        else:
            self.unscale_python(model_grads, master_grads, scale)

        # Defer to update_scale
        # If the fused kernel is available, we only need one D2H memcopy and sync.
        # if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
        #     self._has_overflow = self._overflow_buf.item()

    def unscale_with_stashed_python(self,
                                    model_grads,
                                    stashed_master_grads,
                                    master_grads,
                                    scale):
        """Python fallback for ``unscale_with_stashed``; stops at the first overflow in dynamic mode."""
        for model, stashed, master in zip(model_grads, stashed_master_grads, master_grads):
            if model is None and stashed is None:
                continue
            else:
                if not LossScaler.warned_unscaling_non_fp32_grad:
                    if master.dtype != torch.float32:
                        maybe_print(
                            "Attempting to unscale a grad with type {} ".format(master.type()) +
                            "Unscaling non-fp32 grads may indicate an error. "
                            "When using Amp, you don't need to call .half() on your model.")
                        LossScaler.warned_unscaling_non_fp32_grad = True
                self._has_overflow = axpby_check_overflow_python(model,
                                                                 stashed,
                                                                 master,
                                                                 1./scale,
                                                                 self.dynamic)
                if self._has_overflow and self.dynamic:
                    break

    def unscale_with_stashed(self,
                             model_grads,
                             stashed_master_grads,
                             master_grads):
        """Combine unscaled ``model_grads`` with ``stashed_master_grads`` into ``master_grads``.

        Does nothing if an overflow has already been recorded.
        """
        if self._has_overflow:
            return

        scale = self._loss_scale

        if LossScaler.has_fused_kernel:
            if (not LossScaler.warned_unscaling_non_fp32_grad
                and master_grads[0].dtype == torch.float16):
                print("Warning:  unscaling grads that are not FP32. "
                      "Unscaling non-fp32 grads may indicate an error. "
                      "When using Amp, you don't need to call .half() on your model.")
                # Setting this to True unconditionally allows the possibility of an escape
                # if never-before-seen non-fp32 grads are created in some later iteration.
                LossScaler.warned_unscaling_non_fp32_grad = True
            multi_tensor_applier(LossScaler.multi_tensor_axpby_cuda,
                                 self._overflow_buf,
                                 [model_grads, stashed_master_grads, master_grads],
                                 1./scale,
                                 1.0,
                                 0) # check only arg 0, aka the incoming model grads, for infs
        else:
            self.unscale_with_stashed_python(model_grads,
                                             stashed_master_grads,
                                             master_grads,
                                             scale)

        # Defer to update_scale
        # If the fused kernel is available, we only need one D2H memcopy and sync.
        # if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
        #     self._has_overflow = self._overflow_buf.item()

    def clear_overflow_state(self):
        """Reset the recorded overflow flag (and the fused kernel's buffer, if used)."""
        self._has_overflow = False
        if self.has_fused_kernel:
            self._overflow_buf.zero_()

    # Separate so unscale() can be called more than once before updating.
    def update_scale(self):
        """Adjust the dynamic scale from the recorded overflow state.

        Returns:
            True if an overflow occurred in dynamic mode (the caller should
            skip the optimizer step), False otherwise.
        """
        # If the fused kernel is available, we only need one D2H memcopy and sync.
        if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
            self._has_overflow = self._overflow_buf.item()

        if self._has_overflow and self.dynamic:
            should_skip = True
            shrunk = self._loss_scale / self._scale_factor
            if self._min_loss_scale:
                self._loss_scale = max(self._min_loss_scale, shrunk)
            else:
                self._loss_scale = shrunk
            self._unskipped = 0
        else:
            should_skip = False
            self._unskipped += 1

        if self._unskipped == self._scale_seq_len and self.dynamic:
            self._loss_scale = min(self._max_loss_scale,
                                   self._loss_scale * self._scale_factor)
            self._unskipped = 0

        return should_skip


================================================
FILE: apex/apex/amp/utils.py
================================================
from . import compat

import functools
import itertools

import torch

def get_cuda_version():
    """Return the CUDA version PyTorch reports, as a tuple of ints.

    ``torch.version.cuda`` is split on '.' and each part converted to int.
    If it is None, ``(0, 0)`` is returned so that tuple comparisons such as
    ``get_cuda_version() >= (9, 1, 0)`` evaluate to False instead of the call
    raising AttributeError.
    """
    cuda_version = torch.version.cuda
    if cuda_version is None:
        return (0, 0)
    return tuple(int(part) for part in cuda_version.split('.'))

def is_fp_tensor(x):
    """Return whether ``x`` is a floating-point tensor-like object.

    For a list or tuple, every element (recursively) must qualify; the scan
    stops at the first element that does not.
    """
    if not is_nested(x):
        return compat.is_tensor_like(x) and compat.is_floating_point(x)
    return all(is_fp_tensor(item) for item in x)

def is_nested(x):
    """Return True if ``x`` is a tuple or a list (including subclasses)."""
    return isinstance(x, (tuple, list))

def should_cache(x):
    """Return whether ``x`` is a Parameter whose type string is 'FloatTensor'.

    For a list or tuple, every element (recursively) must qualify; the scan
    stops at the first element that does not.
    """
    if not is_nested(x):
        is_param = isinstance(x, torch.nn.parameter.Parameter)
        return is_param and type_string(x) == 'FloatTensor'
    return all(should_cache(item) for item in x)

def collect_fp_tensor_types(args, kwargs):
    """Return the set of type strings of all floating-point tensor arguments.

    Positional arguments and keyword values are both examined; nested
    lists/tuples of floating-point tensors are flattened.
    """
    found = set()

    def _gather(item):
        # Walk nested containers down to the individual tensors.
        if is_nested(item):
            for sub in item:
                _gather(sub)
        else:
            found.add(type_string(item))

    for candidate in itertools.chain(args, kwargs.values()):
        if is_fp_tensor(candidate):
            _gather(candidate)
    return found

def type_string(x):
    """Return the last dot-separated component of ``x.type()``."""
    _, _, short_name = x.type().rpartition('.')
    return short_name

def maybe_half(x, name='', verbose=False):
    """Return ``x`` converted with ``.half()`` unless it is non-CUDA or already a HalfTensor.

    Lists/tuples are converted element by element (with default ``name`` and
    ``verbose``) and rebuilt with the same container type.  When ``verbose``
    is set, each actual conversion is printed.
    """
    if is_nested(x):
        return type(x)([maybe_half(item) for item in x])

    leave_alone = (not x.is_cuda) or type_string(x) == 'HalfTensor'
    if leave_alone:
        return x
    if verbose:
        print('Float->Half ({})'.format(name))
    return x.half()

def maybe_float(x, name='', verbose=False):
    """Return ``x`` converted with ``.float()`` unless it is non-CUDA or already a FloatTensor.

    Lists/tuples are converted element by element (with default ``name`` and
    ``verbose``) and rebuilt with the same container type.  When ``verbose``
    is set, each actual conversion is printed.
    """
    if is_nested(x):
        return type(x)([maybe_float(item) for item in x])

    leave_alone = (not x.is_cuda) or type_string(x) == 'FloatTensor'
    if leave_alone:
        return x
    if verbose:
        print('Half->Float ({})'.format(name))
    return x.float()

# NB: returns the casted `args` as a new list; mutates `kwargs` in-place
def casted_args(cast_fn, args, kwargs):
    """Apply ``cast_fn`` to every floating-point tensor argument.

    Positional arguments are returned as a new list; values in ``kwargs`` are
    replaced in place.  Non-floating-point arguments pass through unchanged.
    """
    converted = [cast_fn(arg) if is_fp_tensor(arg) else arg for arg in args]
    for key, value in kwargs.items():
        if is_fp_tensor(value):
            kwargs[key] = cast_fn(value)
    return converted

def cached_cast(cast_fn, x, cache):
    """Return ``cast_fn(x)``, reusing a previously cached result when valid.

    Args:
        cast_fn: callable producing the casted version of a tensor.
        x: a tensor, or a list/tuple of tensors (handled element-wise).
        cache: mapping from original tensor to its casted version; updated
            in place.

    Raises:
        RuntimeError: if ``x`` and its cached cast both require grad but ``x``
            is not the cached cast's autograd parent.
    """
    if is_nested(x):
        # Recurse with the same cast function and cache; the previous
        # one-argument call raised TypeError for any nested input.
        return type(x)([cached_cast(cast_fn, y, cache) for y in x])
    if x in cache:
        cached_x = cache[x]
        if x.requires_grad and cached_x.requires_grad:
            # Make sure x is actually cached_x's autograd parent.
            if cached_x.grad_fn.next_functions[1][0].variable is not x:
                raise RuntimeError("x and cache[x] both require grad, but x is not "
                                   "cache[x]'s parent.  This is likely an error.")
        # During eval, it's possible to end up caching casted weights with
        # requires_grad=False.  On the next training iter, if cached_x is found
        # and reused from the cache, it will not actually have x as its parent.
        # Therefore, we choose to invalidate the cache (and force refreshing the cast)
        # if x.requires_grad and cached_x.requires_grad do not match.
        #
        # During eval (i.e. running under with torch.no_grad()) the invalidation
        # check would cause the cached value to be dropped every time, because
        # cached_x would always be created with requires_grad=False, while x would
        # still have requires_grad=True.  This would render the cache effectively
        # useless during eval.  Therefore, if we are running under the no_grad()
        # context manager (torch.is_grad_enabled=False) we elide the invalidation
        # check, and use the cached value even though its requires_grad flag doesn't
        # match.  During eval, we don't care that there's no autograd-graph
        # connection between x and cached_x.
        if torch.is_grad_enabled() and x.requires_grad != cached_x.requires_grad:
            del cache[x]
        else:
            return cached_x

    casted_x = cast_fn(x)
    cache[x] = casted_x
    return casted_x

def verbosify(cast_fn, fn_name, verbose):
    """Bind ``name``/``verbose`` onto ``cast_fn`` when verbose; else return it unchanged."""
    return functools.partial(cast_fn, name=fn_name, verbose=verbose) if verbose else cast_fn

def as_inplace(fns):
    """Lazily yield each name in ``fns`` with a trailing underscore appended."""
    for base_name in fns:
        inplace_name = base_name + '_'
        yield inplace_name

def _is_function_backend(mod):
    """Return True if ``mod`` is a ``torch.nn.backends.backend.FunctionBackend``.

    If that class cannot be resolved on the installed PyTorch, nothing can be
    an instance of it, so False is returned instead of raising AttributeError.
    """
    try:
        backend_cls = torch.nn.backends.backend.FunctionBackend
    except AttributeError:
        return False
    return isinstance(mod, backend_cls)

def has_func(mod, fn):
    """Return whether ``mod`` provides ``fn``.

    ``mod`` may be a FunctionBackend (looked up in ``function_classes``), a
    dict (looked up by key), or any other object (looked up by attribute).
    """
    if _is_function_backend(mod):
        return fn in mod.function_classes
    elif isinstance(mod, dict):
        return fn in mod
    else:
        return hasattr(mod, fn)

def get_func(mod, fn):
    """Fetch ``fn`` from ``mod`` using the same three lookup modes as ``has_func``."""
    if _is_function_backend(mod):
        return mod.function_classes[fn]
    elif isinstance(mod, dict):
        return mod[fn]
    else:
        return getattr(mod, fn)

def set_func(mod, fn, new_fn):
    """Store ``new_fn`` as ``fn`` on ``mod`` using the same three lookup modes as ``has_func``."""
    if _is_function_backend(mod):
        mod.function_classes[fn] = new_fn
    elif isinstance(mod, dict):
        mod[fn] = new_fn
    else:
        setattr(mod, fn, new_fn)

def set_func_save(handle, mod, fn, new_fn):
    """Replace ``fn`` on ``mod`` with ``new_fn``, first recording the current one on ``handle``."""
    handle._save_func(mod, fn, get_func(mod, fn))
    set_func(mod, fn, new_fn)

# A couple problems get solved here:
# - The flat_weight buffer is disconnected from autograd graph,
#   so the fp16 weights need to be derived from the input weights
#   to this forward call, not the flat buffer.
# - The ordering of weights in the flat buffer is...idiosyncratic.
# The first problem is solved with a combination of set_ (to set up
# the correct storage) and copy_ (so the fp16 weight derives from the
# fp32 one in autograd).
# Second is solved by doing ptr arithmetic on the fp32 weights
# to derive the correct offset.
#
# TODO: maybe this should actually use
# `torch._cudnn_rnn_flatten_weight`? But then I need to call
# on first iter and cache the right offsets. Ugh.
def synthesize_flattened_rnn_weights(fp32_weights,
                                     fp16_flat_tensor,
                                     rnn_fn='',
                                     verbose=False):
    """Build fp16 copies of nested fp32 RNN weights inside ``fp16_flat_tensor``'s storage.

    Args:
        fp32_weights: a sequence of per-layer sequences of fp32 weights.
        fp16_flat_tensor: tensor whose storage the fp16 weights are placed in.
        rnn_fn: label used only in the verbose message.
        verbose: print one line per converted weight.

    Returns:
        A list of lists of fp16 weights mirroring the nesting of ``fp32_weights``.
    """
    fp16_weights = []
    # All offsets are measured relative to the first weight of the first layer.
    fp32_base_ptr = fp32_weights[0][0].data_ptr()
    for layer_weights in fp32_weights:
        fp16_layer_weights = []
        for w_fp32 in layer_weights:
            w_fp16 = w_fp32.new().half()
            # Element offset of this weight from the base pointer (byte distance
            # divided by the fp32 element size).
            offset = (w_fp32.data_ptr() - fp32_base_ptr) // w_fp32.element_size()
            # Point the fp16 tensor at that offset in the flat fp16 storage...
            w_fp16.set_(fp16_flat_tensor.storage(),
                        offset,
                        w_fp32.shape)
            # ...then fill it from the fp32 weight.
            w_fp16.copy_(w_fp32)
            if verbose:
                print('Float->Half ({})'.format(rnn_fn))
            fp16_layer_weights.append(w_fp16)
        fp16_weights.append(fp16_layer_weights)
    return fp16_weights

# Roughly same as above, just the `fp32_weights` aren't nested.
# Code kept separate for readability.
def new_synthesize_flattened_rnn_weights(fp32_weights,
                                         fp16_flat_tensor,
                                         rnn_fn='',
                                         verbose=False):
    """Build fp16 copies of a flat list of fp32 RNN weights in one fp16 buffer.

    Args:
        fp32_weights: flat (non-nested) list of fp32 weight tensors.
        fp16_flat_tensor: fp16 tensor whose storage backs every returned weight.
        rnn_fn: label included in the verbose message.
        verbose: when truthy, print one ``Float->Half`` line per weight.

    Returns:
        A list of fp16 tensors, one per input weight, each viewing
        ``fp16_flat_tensor``'s storage and filled with ``copy_`` from the
        corresponding fp32 weight.
    """
    # Offsets are measured relative to the address of the first weight.
    base_addr = fp32_weights[0].data_ptr()

    def _half_alias(source):
        alias = source.new().half()
        # Element (not byte) distance of this weight from the first weight.
        elem_offset = (source.data_ptr() - base_addr) // source.element_size()
        alias.set_(fp16_flat_tensor.storage(), elem_offset, source.shape)
        # copy_ keeps the fp16 value derived from the fp32 weight.
        alias.copy_(source)
        if verbose:
            print('Float->Half ({})'.format(rnn_fn))
        return alias

    return [_half_alias(weight) for weight in fp32_weights]


================================================
FILE: apex/apex/amp/wrap.py
================================================
from . import compat
from . import utils
from ._amp_state import _amp_state
from . import rnn_compat

import functools

import torch

def make_cast_wrapper(orig_fn, cast_fn, handle,
                      try_caching=False):
    """Return a wrapper around ``orig_fn`` that casts its arguments first.

    While ``handle.is_active()`` is falsy the wrapper simply forwards to
    ``orig_fn``.  Otherwise, when ``try_caching`` is set and the handle
    reports ``has_cache``, every argument accepted by ``utils.should_cache``
    is replaced by ``utils.cached_cast(cast_fn, value, handle.cache)``; then
    all arguments go through ``utils.casted_args`` before ``orig_fn`` runs.
    """
    def _via_cache(value):
        # Values the cache does not apply to are handed back untouched.
        if utils.should_cache(value):
            return utils.cached_cast(cast_fn, value, handle.cache)
        return value

    @functools.wraps(orig_fn)
    def wrapper(*args, **kwargs):
        if not handle.is_active():
            return orig_fn(*args, **kwargs)

        if try_caching and handle.has_cache:
            args = [_via_cache(positional) for positional in args]
            for key in kwargs:
                kwargs[key] = _via_cache(kwargs[key])
        new_args = utils.casted_args(cast_fn, args, kwargs)
        return orig_fn(*new_args, **kwargs)
    return wrapper

def cached_cast(mod, fn, cast_fn, handle,
                try_caching=False, verbose=False):
    """Replace ``mod.fn`` with a casting wrapper, if ``mod`` has such a function.

    Does nothing when ``utils.has_func(mod, fn)`` is falsy.  Otherwise the
    current function is wrapped with ``make_cast_wrapper`` (using a
    ``utils.verbosify``-ed ``cast_fn``) and installed through
    ``utils.set_func_save`` so the handle records the original.
    """
    if utils.has_func(mod, fn):
        target = utils.get_func(mod, fn)
        verbose_cast = utils.verbosify(cast_fn, fn, verbose)
        utils.set_func_save(
            handle, mod, fn,
            make_cast_wrapper(target, verbose_cast, handle, try_caching))

# The `handle` arg is unused, but keeping it lets the signature mirror
# `make_cast_wrapper`.  Annoyingly, make_promote_wrapper still uses the global
# handle.  Once everyone is on the new API and I am free to get rid of handle,
# I can clean this up.
def make_promote_wrapper(orig_fn, cast_fn, handle=None):
    """Return a wrapper that casts mixed Half/Float arguments with ``cast_fn``.

    The wrapper consults the global ``_amp_state.handle`` (the ``handle``
    parameter is not used).  It forwards unchanged when amp is inactive or
    when at most one floating-point tensor type is present.  For exactly the
    pair ``HalfTensor``/``FloatTensor`` the arguments go through
    ``utils.casted_args`` first.

    Raises:
        NotImplementedError: from the wrapper, for any other mix of types.
    """
    @functools.wraps(orig_fn)
    def wrapper(*args, **kwargs):
        if not _amp_state.handle.is_active():
            return orig_fn(*args, **kwargs)

        types = utils.collect_fp_tensor_types(args, kwargs)

        # Zero or one distinct type: nothing to promote.
        if len(types) <= 1:
            return orig_fn(*args, **kwargs)

        if not (len(types) == 2 and types == {'HalfTensor', 'FloatTensor'}):
            raise NotImplementedError('Do not know how to handle ' +
                                      'these types to promote: {}'
                                      .format(types))

        new_args = utils.casted_args(cast_fn, args, kwargs)
        return orig_fn(*new_args, **kwargs)
    return wrapper

def promote(mod, fn, handle, verbose=False):
    """Replace ``mod.fn`` with a promote wrapper built around ``utils.maybe_float``.

    The original function is recorded on ``handle`` via ``utils.set_func_save``.
    """
    target = utils.get_func(mod, fn)
    float_cast = utils.verbosify(utils.maybe_float, fn, verbose)
    promoting = make_promote_wrapper(target, float_cast)
    utils.set_func_save(handle, mod, fn, promoting)

def sequence_promote(mod, fn, handle, verbose=False):
    """Patch ``mod.fn`` so a mixed Half/Float first-argument sequence is cast.

    The installed wrapper inspects the type string of every element of its
    first argument.  Only when those are exactly ``HalfTensor`` and
    ``FloatTensor`` is the sequence passed through ``utils.casted_args`` with
    ``utils.maybe_float``; every other case is forwarded untouched.
    """
    target = utils.get_func(mod, fn)
    maybe_float = utils.verbosify(utils.maybe_float, fn, verbose)

    @functools.wraps(target)
    def wrapper(seq, *args, **kwargs):
        if not _amp_state.handle.is_active():
            return target(seq, *args, **kwargs)

        types = {utils.type_string(item) for item in seq}
        if types == {'HalfTensor', 'FloatTensor'}:
            seq = utils.casted_args(maybe_float, seq, {})
        # Uniform (or empty) sequences need no promotion.
        # TODO: other mixed-type cases aren't due to amp.
        #       Just pass through?
        return target(seq, *args, **kwargs)
    utils.set_func_save(handle, mod, fn, wrapper)

def promote_match_arg0(mod, fn, handle, verbose=False):
    """Patch ``mod.fn`` so the remaining arguments are cast to match ``arg0``.

    No-op when ``mod`` has no ``fn``.  The installed wrapper asserts that its
    first argument is tensor-like, then picks ``utils.maybe_half`` for a
    ``HalfTensor`` or ``utils.maybe_float`` for a ``FloatTensor`` and applies
    it to the other arguments through ``utils.casted_args``.  Any other
    ``arg0`` type, or an inactive amp handle, forwards the call unchanged.
    """
    if not utils.has_func(mod, fn):
        return

    target = utils.get_func(mod, fn)

    @functools.wraps(target)
    def wrapper(arg0, *args, **kwargs):
        assert compat.is_tensor_like(arg0)
        if not _amp_state.handle.is_active():
            return target(arg0, *args, **kwargs)

        # Map the type of arg0 to the cast that brings other args in line.
        cast_by_type = {'HalfTensor': utils.maybe_half,
                        'FloatTensor': utils.maybe_float}
        chosen = cast_by_type.get(utils.type_string(arg0))
        if chosen is None:
            return target(arg0, *args, **kwargs)

        chosen = utils.verbosify(chosen, fn, verbose)
        new_args = utils.casted_args(chosen, args, kwargs)
        return target(arg0, *new_args, **kwargs)
    utils.set_func_save(handle, mod, fn, wrapper)

def err_if_any_half(mod, fn, handle, custom_err_msg=None):
    """Patch ``mod.fn`` to refuse calls that involve any ``HalfTensor``.

    No-op when ``mod`` has no ``fn``.  The installed wrapper raises
    ``NotImplementedError`` (with ``custom_err_msg`` when it is truthy,
    otherwise a default message naming ``fn``) if
    ``utils.collect_fp_tensor_types`` reports ``HalfTensor``; otherwise it
    forwards to the original function.
    """
    if not utils.has_func(mod, fn):
        return

    target = utils.get_func(mod, fn)

    @functools.wraps(target)
    def wrapper(*args, **kwargs):
        types = utils.collect_fp_tensor_types(args, kwargs)
        if 'HalfTensor' not in types:
            return target(*args, **kwargs)
        if custom_err_msg:
            raise NotImplementedError(custom_err_msg)
        raise NotImplementedError('Cannot call in-place function ' +
                                  '{} with fp16 arguments.'.format(fn))
    utils.set_func_save(handle, mod, fn, wrapper)

def err_if_arg0_half(mod, fn, handle, verbose=False):
    """Patch ``mod.fn`` to refuse a ``HalfTensor`` as its first argument.

    No-op when ``mod`` has no ``fn``.  The installed wrapper asserts that
    ``arg0`` is tensor-like and raises ``NotImplementedError`` when it is a
    ``HalfTensor``.  Otherwise the remaining arguments are passed through
    ``utils.casted_args`` with ``utils.maybe_float`` before the call.
    """
    if not utils.has_func(mod, fn):
        return

    target = utils.get_func(mod, fn)

    @functools.wraps(target)
    def wrapper(arg0, *args, **kwargs):
        assert compat.is_tensor_like(arg0)
        if utils.type_string(arg0) == 'HalfTensor':
            raise NotImplementedError('Cannot call in-place method ' +
                                      '{} on fp16 Tensors.'.format(fn))
        float_cast = utils.verbosify(utils.maybe_float, fn, verbose)
        new_args = utils.casted_args(float_cast, args, kwargs)
        return target(arg0, *new_args, **kwargs)
    utils.set_func_save(handle, mod, fn, wrapper)

# Current RNN approach:
# - Wrap top-level `RNN` function in thnn backend
# - Will call into either CudnnRNN or AutogradRNN
#  - Each of these are factory functions that return a per-iter
#    `forward` function
# - We interpose on the factory function to:
#   1) Interpose on the actual forward function and put in casts
#   2) Insert an fp16 `flat_weight` if necessary
def rnn_cast(backend, fn, handle, verbose=False):
    """Replace the RNN factory ``fn`` on ``backend`` with a casting version.

    The installed ``rnn_wrapper`` calls the original factory and returns a
    wrapped ``forward`` (``fwd_wrapper``) that casts the inputs, weights and
    hidden state with ``utils.maybe_half`` before delegating.  When the
    factory is given a ``flat_weight`` keyword, it is swapped for an fp16
    buffer of the same shape, and the per-call weights are synthesized as
    views into that buffer via ``utils.synthesize_flattened_rnn_weights``.

    Args:
        backend: object holding the factory; accessed through ``utils.get_func``.
        fn: name of the factory function on ``backend``.
        handle: amp handle passed to ``utils.set_func_save`` so the original
            factory is recorded.
        verbose: forwarded to ``utils.verbosify`` and to the weight synthesis.
    """
    orig_rnn = utils.get_func(backend, fn)
    @functools.wraps(orig_rnn)
    def rnn_wrapper(*args, **kwargs):
        flat_weight = kwargs.get('flat_weight')
        if flat_weight is not None:
            # We replace `flat_weight` with an uninitialized fp16
            # Tensor. The "actual" weight tensors (provided in `forward`),
            # will then be set up as ptrs into the buffer and have the
            # corresponding fp32 values copied in.
            # We need to call `copy` on the "actual" weights so that the
            # autograd graph correctly backprops from the wgrads computed
            # inside cuDNN (on fp16 weights) into the fp32 weights.
            assert utils.type_string(flat_weight) == 'FloatTensor'
            if compat.tensor_is_float_tensor() or compat.tensor_is_variable():
                # Pre-0.4. A little slower, since it zeros out memory.
                flat_weight_fp16 = flat_weight.new().half().resize_(flat_weight.shape)
            else:
                flat_weight_fp16 = torch.empty_like(flat_weight,
                                                    dtype=torch.float16)
            kwargs['flat_weight'] = flat_weight_fp16
        else:
            # No flat buffer supplied: weights are cast individually below.
            flat_weight_fp16 = None

        # The factory returns the per-iteration forward function.
        forward = orig_rnn(*args, **kwargs)
        @functools.wraps(forward)
        def fwd_wrapper(*fargs, **fkwargs):
            # Expected positional layout: (inputs, weights, hiddens[, batch_sizes]).
            assert len(fargs) == 3 or len(fargs) == 4
            inputs, weights, hiddens = fargs[:3]
            assert utils.is_fp_tensor(inputs)
            assert isinstance(weights, list)
            cast_fn = utils.verbosify(utils.maybe_half,
                                      fn,
                                      verbose)
            new_args = []

            # 0) Inputs
            new_args.append(cast_fn(inputs))

            # 1) Weights
            if flat_weight_fp16 is not None:
                fp16_weights = utils.synthesize_flattened_rnn_weights(
                    weights, flat_weight_fp16, fn, verbose)
            else:
                fp16_weights = [[cast_fn(w) for w in layer]
                                for layer in weights]
            new_args.append(fp16_weights)

            # 2) Hiddens: either a tuple (for LSTM) or single tensor
            if isinstance(hiddens, tuple):
                new_args.append(tuple(cast_fn(x) for x in hiddens))
            elif utils.is_fp_tensor(hiddens):
                new_args.append(cast_fn(hiddens))
            else:
                # Hiddens can, in principle, be `None` -- pass through
                new_args.append(hiddens)

            # 3) Batch sizes (0.4 or later only)
            if len(fargs) == 4:
                new_args.append(fargs[3])

            return forward(*new_args, **fkwargs)
        return fwd_wrapper
    utils.set_func_save(handle, backend, fn, rnn_wrapper)

def new_rnn_cast(fn, handle, verbose=False):
    """Patch the RNN backend function ``fn`` so it runs on fp16 arguments.

    The installed wrapper expects exactly nine positional arguments and no
    keyword arguments.  While amp is active it replaces the parameter list
    with fp16 weights synthesized into one freshly allocated flat fp16
    buffer, casts every other floating-point tensor argument with
    ``utils.maybe_half``, and leaves the remaining arguments untouched.
    """
    # Forward+backward compatibility around https://github.com/pytorch/pytorch/pull/15744
    # For rnn backend calls that route through _rnn_impls, we must patch the ref
    # that _rnn_impls stashed.  For rnn backend calls that directly invoke
    # _VF.<backend>, e.g. _VF.lstm, we can patch onto VariableFunctionsShim,
    # which in turn has patched the ref named "_VF" in torch.nn.modules.rnn.
    if utils.has_func(torch.nn.modules.rnn._rnn_impls, fn):
        mod = torch.nn.modules.rnn._rnn_impls
    else:
        mod = torch.nn.modules.rnn._VF
        assert isinstance(mod, rnn_compat.VariableFunctionsShim)
        fn = fn.lower()
    target = utils.get_func(mod, fn)
    half_cast = utils.verbosify(utils.maybe_half, fn, verbose)

    @functools.wraps(target)
    def wrapper(*args, **kwargs):
        # Exact call signature from modules/rnn.py
        assert len(args) == 9
        assert len(kwargs) == 0

        if not _amp_state.handle.is_active():
            return target(*args, **kwargs)

        # A bool in slot 6 marks the non-PackedSequence call; otherwise the
        # PackedSequence form is assumed and the params sit one slot later.
        params_idx = 2 if isinstance(args[6], bool) else 3

        def _convert(position, value):
            if position == params_idx:
                total = sum([w.numel() for w in value])
                fp16_weight_buf = args[0].new_empty((total,),
                                                    dtype=torch.half)
                return utils.new_synthesize_flattened_rnn_weights(
                    value, fp16_weight_buf, fn, verbose)
            if utils.is_fp_tensor(value):
                return half_cast(value)
            return value

        new_args = [_convert(position, value)
                    for position, value in enumerate(args)]
        return target(*new_args)
    utils.set_func_save(handle, mod, fn, wrapper)

def disable_casts(mod, fn, handle):
    """Patch ``mod.fn`` so it always runs inside ``handle._disable_casts()``.

    Does nothing when ``mod`` has no ``fn``.  The original function is
    recorded on ``handle`` through ``utils.set_func_save``.
    """
    if utils.has_func(mod, fn):
        target = utils.get_func(mod, fn)

        @functools.wraps(target)
        def wrapper(*args, **kwargs):
            with handle._disable_casts():
                return target(*args, **kwargs)

        utils.set_func_save(handle, mod, fn, wrapper)


================================================
FILE: apex/apex/fp16_utils/README.md
================================================
fp16_optimizer.py contains `FP16_Optimizer`, a Python class designed to wrap an existing PyTorch optimizer and automatically enable master parameters and loss scaling in a manner transparent to the user.  To use `FP16_Optimizer`, only two lines of one's Python model need to change.

#### [FP16_Optimizer API documentation](https://nvidia.github.io/apex/fp16_utils.html#automatic-management-of-master-params-loss-scaling)

#### [Simple examples with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple)

#### [Imagenet with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/imagenet)

#### [word_language_model with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/word_language_model)


fp16util.py contains a number of utilities to manually manage master parameters and loss scaling, if the user chooses.  

#### [Manual management documentation](https://nvidia.github.io/apex/fp16_utils.html#manual-master-parameter-management)

The [Imagenet with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/imagenet) and [word_language_model with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/word_language_model) directories also contain `main.py` files that demonstrate manual management of master parameters and static loss scaling.  These examples illustrate what sort of operations `FP16_Optimizer` is performing automatically.


================================================
FILE: apex/apex/fp16_utils/__init__.py
================================================
from .fp16util import (
    BN_convert_float,
    network_to_half,
    prep_param_lists,
    model_grads_to_master_grads,
    master_params_to_model_params,
    tofp16,
    to_python_float,
    clip_grad_norm,
    convert_module,
    convert_network,
    FP16Model,
)

from .fp16_optimizer import FP16_Optimizer
from .loss_scaler import LossScaler, DynamicLossScaler


================================================
FILE: apex/apex/fp16_utils/fp16_optimizer.py
================================================
import torch
from torch import nn
from torch.autograd import Variable
from torch.nn.parameter import Parameter
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

from ..amp._amp_state import _amp_state, maybe_print
from ..amp.scaler import LossScaler
from ..multi_tensor_apply import multi_tensor_applier
from .fp16util import model_grads_to_master_grads, master_params_to_model_params, clip_grad_norm

# TODO:  Update overflow check + downscale to use Carl's fused kernel.
class FP16_Optimizer(object):
    """
    :class:`FP16_Optimizer` is designed to wrap an existing PyTorch optimizer, 
    and manage static or dynamic loss scaling and master weights in a manner transparent to the user.
    For standard use, only two lines must be changed:  creating the :class:`FP16_Optimizer` instance,
    and changing the call to ``backward``.

    Example::

        model = torch.nn.Linear(D_in, D_out).cuda().half()
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
        # Name the FP16_Optimizer instance to replace the existing optimizer
        # (recommended but not required):
        optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
        ...
        # loss.backward() becomes:
        optimizer.backward(loss)
        ...

    Example with dynamic loss scaling::

        ...
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
                                   # optional arg to control dynamic loss scaling behavior
                                   # dynamic_loss_args={'scale_window' : 500})
                                   # Usually, dynamic_loss_args is not necessary. 

    Args:
        init_optimizer (torch.optim.optimizer):  Existing optimizer created with the parameters to optimize.  Internally, :class:`FP16_Optimizer` replaces the passed optimizer's fp16 parameters, if any, with fp32 master parameters copied from the original ones.  :class:`FP16_Optimizer` also stores references to the original fp16 parameters, and updates these fp16 parameters from the master fp32 copy at the end of each :attr:`step`.  
        static_loss_scale (float, optional, default=1.0):  Loss scale used internally to scale gradients computed by the model.  Any fp16 gradients will be copied to fp32, then downscaled before being applied to the fp32 master params, so ``static_loss_scale`` should not affect learning rate.
        dynamic_loss_scale (bool, optional, default=False):  Use dynamic loss scaling.  If True, this will override any ``static_loss_scale`` option.
        dynamic_loss_args (dict, optional, default=None):  Dict of kwargs that will be forwarded to the internal :class:`LossScaler` instance's constructor.  Keys of this dict must match kwargs accepted by :class:`LossScaler`'s constructor.  If ``dynamic_loss_args`` is unspecified, :class:`LossScaler`'s defaults will be used.
        verbose (bool, optional, default=True):  By default, FP16_Optimizer's constructor prints out the parameters and parameter groups it is ingesting, as a sanity check.  If this becomes annoying (e.g. for large models), it can be disabled by passing ``verbose=False``.  ``verbose=False`` will not disable printing when the loss scale is readjusted during dynamic loss scaling.

    ``init_optimizer`` is expected to have been constructed in the ordinary way.  
    It is recommended (although not required) that the newly constructed :class:`FP16_Optimizer` instance be 
    named to replace ``init_optimizer``, for two reasons:  
    First, it means that references to the same name
    later in the file will not have to change.  
    Second, :class:`FP16_Optimizer` reserves the right (as an implementation detail) to 
    modify ``init_optimizer``.  If you do choose a unique name for the new
    :class:`FP16_Optimizer` instance, you should only work with this new instance,
    because the preexisting optimizer might no longer behave as expected.

    ``init_optimizer`` may be any Pytorch optimizer. 
    It may contain a mixture of fp16 and fp32 parameters organized into any number of 
    ``param_groups`` with different hyperparameters.  The :class:`FP16_Optimizer` constructor will 
    ingest these ``param_groups`` and remember them. 

    Calls to ::

        loss.backward() 

    must be replaced with ::

        optimizer.backward(loss)  

    because :class:`FP16_Optimizer` requires ownership of the backward pass to implement 
    loss scaling and copies to master gradients.

    .. note::
        Loss scaling, either static or dynamic, is orthogonal to learning rate, because gradients
        are downscaled before being applied.  This means that adjusting the loss scale, or using
        dynamic loss scaling, should not require retuning the learning rate or any other 
        hyperparameters.


    **Advanced options**

    **Closures**:  :class:`FP16_Optimizer` can wrap a Pytorch optimizer that receives a closure.
    See docstring for :attr:`step`.

    **Gradient clipping**:  Use :attr:`clip_master_grads`.
    
    **Multiple losses**:  If your model accumulates gradients from multiple losses,
    this can be made more efficient by supplying ``update_master_grads=False``
    to :attr:`backward`.  See docstring for :attr:`backward`.

    **Manually adjusting loss scale**:  The current loss scale can be retrieved or set via ::

        print(optimizer.loss_scale)
        optimizer.loss_scale = new_loss_scale

    For static loss scaling, manually adjusting the loss scale over time is a reasonable
    thing to do.  During later epochs, gradients may become smaller, and a 
    higher loss scale may be required, analogous to scheduling the learning rate.  Dynamic loss
    scaling is more subtle (see :class:`DynamicLossScaler`) and in this case, manually adjusting 
    the loss scale is not recommended.

    **Multi_GPU training**:  If the wrapped ``init_optimizer`` was created from a model wrapped in
    Pytorch DistributedDataParallel or Apex DistributedDataParallel, :class:`FP16_Optimizer` 
    should still work as intended.
    """

    def __init__(self, 
                 init_optimizer, 
                 static_loss_scale=1.0, 
                 dynamic_loss_scale=False,
                 dynamic_loss_args=None,
                 verbose=True):
        if not torch.cuda.is_available:
            raise SystemError("Cannot use fp16 without CUDA.")

        self.verbose = verbose

        self.optimizer = init_optimizer
        # init_state_dict sets up an alternative way to cast per-param state tensors.
        # Stashing here in case https://github.com/pytorch/pytorch/issues/7733 makes it necessary.
        # init_state_dict = init_optimizer.state_dict()

        self.fp16_groups = []
        self.fp32_from_fp16_groups = []
        self.fp32_from_fp32_groups = []
        for i, param_group in enumerate(self.optimizer.param_groups):
            self.maybe_print("FP16_Optimizer processing param group {}:".format(i))
            fp16_params_this_group = []
            fp32_params_this_group = []
            fp32_from_fp16_params_this_group = []
            for i, param in enumerate(param_group['params']):
                if param.requires_grad:
                    if param.type() == 'torch.cuda.HalfTensor':
                        self.maybe_print("FP16_Optimizer received torch.cuda.HalfTensor with {}"
                                         .format(param.size()))
                        fp16_params_this_group.append(param)
                        master_param = param.detach().clone().float()
                        master_param.requires_grad = True
                        param_group['params'][i] = master_param
                        fp32_from_fp16_params_this_group.append(master_param)
                        # Reset existing state dict key to the new master param.
                        # We still need to recast per-param state tensors, if any, to FP32.
                        if param in self.optimizer.state:
                           self.optimizer.state[master_param] = self.optimizer.state.pop(param) 
                    elif param.type() == 'torch.cuda.FloatTensor':
                        self.maybe_print("FP16_Optimizer received torch.cuda.FloatTensor with {}"
                                         .format(param.size()))
                        fp32_params_this_group.append(param)
                        param_group['params'][i] = param
                    else:
                        raise TypeError("Wrapped parameters must be either "
                                        "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "  
                                        "Received {}".format(param.type()))
            
            self.fp16_groups.append(fp16_params_this_group)
            self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
            self.fp32_from_fp32_groups.append(fp32_params_this_group)

        self.all_fp16_params = []
        for group in self.fp16_groups:
            self.all_fp16_params += group

        self.all_fp32_from_fp16_params = []
        for group in self.fp32_from_fp16_groups:
            self.all_fp32_from_fp16_params += group

        self.all_fp32_from_fp32_params = []
        for group in self.fp32_from_fp32_groups:
            self.all_fp32_from_fp32_params += group

        # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors
        self.optimizer.load_state_dict(self.optimizer.state_dict())
        # alternative way to cast per-param state tensors:
        # self.optimizer.load_state_dict(init_state_dict)

        if dynamic_loss_scale:
            self.dynamic_loss_scale = True
            if dynamic_loss_args is not None:
                self.loss_scaler = LossScaler("dynamic", **dynamic_loss_args)
            else:
                self.loss_scaler = LossScaler("dynamic")
        else:
            self.dynamic_loss_scale = False
            self.loss_scaler = LossScaler(static_loss_scale)

        self.overflow = False
        self.first_closure_call_this_step = True

        self.clip_grad_norm = clip_grad_norm

        # TODO:  Centralize exposure and import error checking for the C backend.
        if multi_tensor_applier.available:
            import amp_C
            self.multi_tensor_scale = amp_C.multi_tensor_scale
            self._dummy_overflow_buf = torch.cuda.IntTensor([0]);

    # Having self.maybe_print distinct from _amp_state.maybe_print is another artifact
    # of having to support FP16_Optimizer separately, for the time being.
    def maybe_print(self, msg):
        if self.verbose:
            print(msg)
            
    def __getstate__(self):
        raise RuntimeError("FP16_Optimizer should be serialized using state_dict().")

    def __setstate__(self, state):
        raise RuntimeError("FP16_Optimizer should be deserialized using load_state_dict().")

    def zero_grad(self, set_grads_to_None=False):
        """
        Zero fp32 and fp16 parameter grads.
        """
        # In principle, only the .grad attributes of the model params need to be zeroed,
        # because gradients are copied into the FP32 master params.  However, we zero
        # all gradients owned by the optimizer, just to be safe:
        for group in self.optimizer.param_groups:
             for p in group['params']:
                 if set_grads_to_None:
                     p.grad = None
                 else:
                     if p.grad is not None:
                         p.grad.detach_()
                         p.grad.zero_()

        # Zero fp16 gradients owned by the model:
        for fp16_group in self.fp16_groups:
            for param in fp16_group:
                if set_grads_to_None:
                    param.grad = None
                else:
                    if param.grad is not None:
                        param.grad.detach_() # as in torch.optim.optimizer.zero_grad()
                        param.grad.zero_()

    # Should not be used anymore.
    # def _check_overflow(self):
    #     params = []
    #     for group in self.fp16_groups:
    #         for param in group:
    #             params.append(param)
    #     for group in self.fp32_from_fp32_groups:
    #         for param in group:
    #             params.append(param)
    #     self.overflow = self.loss_scaler.has_overflow(params)

    # def _update_scale(self, has_overflow=False):
    #     self.loss_scaler.update_scale(has_overflow)

    def _master_params_to_model_params(self):
        """
        Transfer the fp32 master values into the fp16 params, group by group.

        Without the fused backend each (fp16 group, fp32 master group) pair is
        handed to ``master_params_to_model_params``.  With it, a single
        ``multi_tensor_scale`` call with scale ``1.0`` goes from all masters to
        all fp16 params, and is skipped when there are no fp16 params.
        """
        if not multi_tensor_applier.available:
            paired = zip(self.fp16_groups, self.fp32_from_fp16_groups)
            for fp16_group, fp32_from_fp16_group in paired:
                master_params_to_model_params(fp16_group, fp32_from_fp16_group)
            return

        if len(self.all_fp16_params) > 0:
            sources_and_targets = [self.all_fp32_from_fp16_params,
                                   self.all_fp16_params]
            multi_tensor_applier(self.multi_tensor_scale,
                                 self._dummy_overflow_buf,
                                 sources_and_targets,
                                 1.0)

    # To consider:  Integrate distributed with this wrapper by registering a hook on each variable
    # that does the overflow check, gradient copy + downscale, and fp32 allreduce in a different stream.
    # def _model_grads_to_master_grads(self):
    #     for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups):
    #         model_grads_to_master_grads(fp16_group, fp32_from_fp16_group)

    # def _downscale_master(self):
    #     if self.loss_scale != 1.0:
    #         for group in self.optimizer.param_groups:
    #             for param in group['params']:
    #                 if param.grad is not None:
    #                     param.grad.data.mul_(1./self.loss_scale)

    def clip_master_grads(self, max_norm, norm_type=2):
        """
        Clips fp32 master gradients via ``torch.nn.utils.clip_grad_norm``.

        Args:
            max_norm (float or int): max norm of the gradients
            norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
                infinity norm.

        Returns:
            Total norm of the current fp32 gradients (viewed as a single vector).

        .. warning::
            Returns -1 if the most recently computed fp16 gradients overflowed (that is, if ``self.overflow`` is ``True``).
        """
        if not self.overflow:
            fp32_params = []
            for param_group in self.optimizer.param_groups:
                for param in param_group['params']:
                    fp32_params.append(param)
            return self.clip_grad_norm(fp32_params, max_norm, norm_type)
        else:
            return -1

    def state_dict(self):
        """
        Returns a dict containing the current state of this :class:`FP16_Optimizer` instance.
        This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict
        of the contained Pytorch optimizer.
        Example::

            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            torch.save(checkpoint, "saved.pth")
        """
        state_dict = {}
        state_dict['loss_scaler'] = self.loss_scaler
        state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale
        state_dict['overflow'] = self.overflow
        state_dict['first_closure_call_this_step'] = self.first_closure_call_this_step
        state_dict['optimizer_state_dict'] = self.optimizer.state_dict()
        state_dict['fp32_from_fp16'] = self.fp32_from_fp16_groups
        return state_dict

    def load_state_dict(self, state_dict):
        """
        Loads a state_dict created by an earlier call to state_dict(). 
        If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, 
        whose parameters in turn came from ``model``, it is expected that the user 
        will call ``model.load_state_dict()`` before
        ``fp16_optimizer_instance.load_state_dict()`` is called.

        Example::

            model = torch.nn.Linear(D_in, D_out).cuda().half()
            optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
            optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
            ...
            checkpoint = torch.load("saved.pth")
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        """
        # I think it should actually be ok to reload the optimizer before the model.
        self.loss_scaler = state_dict['loss_scaler']
        self.dynamic_loss_scale = state_dict['dynamic_loss_scale']
        self.overflow = state_dict['overflow']
        self.first_closure_call_this_step = state_dict['first_closure_call_this_step']
        self.optimizer.load_state_dict(state_dict['optimizer_state_dict'])
        # At this point, the optimizer's references to the model's fp32 parameters are up to date.
        # The optimizer's hyperparameters and internal buffers are also up to date.  
        # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still
        # out of date.  There are two options.  
        # 1:  Refresh the master params from the model's fp16 params.  
        # This requires less storage but incurs precision loss.
        # 2:  Save and restore the fp32 master copies separately.
        # We choose option 2.
        # 
        # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device 
        # of their associated parameters, because it's possible those buffers might not exist yet in 
        # the current optimizer instance.  In our case, as long as the current FP16_Optimizer has been 
        # constructed in the same way as the one whose state_dict we are loading, the same master params
        # are guaranteed to exist, so we can just copy_() from the saved master params.
        for current_group, saved_group in zip(self.fp32_from_fp16_groups, state_dict['fp32_from_fp16']):
            for current, saved in zip(current_group, saved_group):
                current.data.copy_(saved.data)

    def step(self, closure=None): # could add clip option.
        """
        If no closure is supplied, :attr:`step` should be called after 
        ``fp16_optimizer_obj.backward(loss)``.
        :attr:`step` updates the fp32 master copy of parameters using the optimizer supplied to
        :class:`FP16_Optimizer`'s constructor, then copies the updated fp32 params into the fp16 params
        originally referenced by :class:`FP16_Optimizer`'s constructor, so the user may immediately run
        another forward pass using their model.

        If a closure is supplied, :attr:`step` may be called without a prior call to 
        :attr:`backward(loss)`.
        This control flow is identical to `ordinary Pytorch optimizer use`_ with closures.
        However, the user should take care that any ``loss.backward()`` call within the closure
        has been replaced by ``fp16_optimizer_obj.backward(loss)``.

        Args:
           closure (optional):  Closure that will be supplied to the underlying optimizer originally passed to :class:`FP16_Optimizer`'s constructor.  closure should call :attr:`zero_grad()` on the :class:`FP16_Optimizer` object, compute the loss, call :attr:`backward(loss)`, and return the loss.

        Returns:
            Whatever the wrapped optimizer's ``step`` returns, or ``None`` when the step is
            skipped because ``self.overflow`` is set.

        Example with closure::

            # optimizer is assumed to be an FP16_Optimizer object, previously constructed from an 
            # existing pytorch optimizer.
            for input, target in dataset:
                def closure():
                    optimizer.zero_grad()
                    output = model(input)
                    loss = loss_fn(output, target)
                    # loss.backward() becomes:
                    optimizer.backward(loss)
                    return loss
                optimizer.step(closure)

        .. warning::
            Currently, calling :attr:`step` with a closure is not compatible with dynamic loss scaling.

        .. _`ordinary Pytorch optimizer use`:
            http://pytorch.org/docs/master/optim.html#optimizer-step-closure
        """
        # The loss scale itself is not touched here: update_master_grads() asks the
        # loss scaler to update it and records the outcome in self.overflow.
        if self.overflow:
            # Using _amp_state.maybe_print instead of self.print here is intentional.
            maybe_print("Gradient overflow.  Skipping step, reducing " +
                "loss scale to {}".format(self.loss_scaler.loss_scale()))
            return

        if closure is not None:
            retval = self._step_with_closure(closure)
        else:
            retval = self.optimizer.step()

        # The wrapped optimizer only knows about the master params, so the model
        # params have to be refreshed explicitly after every successful step.
        self._master_params_to_model_params()

        return retval

    def _step_with_closure(self, closure):
        """
        Run ``self.optimizer.step`` with ``closure`` wrapped so that the model params are
        refreshed from the master params between repeated closure evaluations, and so that
        the closure is re-evaluated for as long as ``self.overflow`` is set.

        Args:
            closure:  User closure; expected to call ``backward(loss)`` on this object and
                return the loss.

        Returns:
            The value returned by ``self.optimizer.step(wrapped_closure)``.
        """
        def wrapped_closure():
            if self.first_closure_call_this_step:
                # We expect that the fp16 params are initially fresh on entering self.step(),
                # so _master_params_to_model_params() is unnecessary the first time wrapped_closure()
                # is called within self.optimizer.step().
                self.first_closure_call_this_step = False
            else:
                # If self.optimizer.step() internally calls wrapped_closure more than once,
                # it may update the fp32 params after each call.  However, self.optimizer 
                # doesn't know about the fp16 params at all.  If the fp32 params get updated,
                # we can't rely on self.optimizer to refresh the fp16 params.  We need
                # to handle that manually:
                self._master_params_to_model_params()
            # Our API expects the user to give us ownership of the backward() call by
            # replacing all calls to loss.backward() with optimizer.backward(loss).
            # This requirement holds whether or not the call to backward() is made within a closure.
            # If the user is properly calling optimizer.backward(loss) within "closure," 
            # calling closure() here will give the fp32 master params fresh gradients
            # for the optimizer to play with, so all wrapped_closure needs to do is call 
            # closure() and return the loss.
            temp_loss = closure()
            # self.overflow is refreshed by update_master_grads(), which the closure is
            # expected to reach through backward(); keep retrying until it clears.
            while self.overflow:
                print("OVERFLOW within closure! Skipping step, reducing loss scale to {}".format(
                      self.loss_scaler.loss_scale()))
                temp_loss = closure()
            return temp_loss

        retval = self.optimizer.step(wrapped_closure)

        # Re-arm the flag so the next step() again skips the redundant first refresh.
        self.first_closure_call_this_step = True

        return retval

    def backward(self, loss, update_master_grads=True, retain_graph=False):
        """ 
        Scale ``loss``, backpropagate it, and (optionally) refresh the master gradients.

        Conceptually :attr:`backward` does the following:

        1. ``fp32_loss = loss.float()`` (see the Note below)
        2. ``scaled_loss = fp32_loss * loss_scale``
        3. ``scaled_loss.backward()``, accumulating *scaled* gradients into the ``.grad`` attributes of the model's leaves.
        4. If ``update_master_grads`` is True, :attr:`update_master_grads` is called, which hands the model grads and the master grads to the loss scaler's ``unscale``.

        After these steps the master params hold fresh gradients and :attr:`step` may be called.

        .. note::
            The loss is converted to fp32 *before* the loss scale is applied, which gives some
            extra protection against overflow when an fp16 loss is supplied.  For maximum
            safety the loss criterion (MSE, cross entropy, etc) should still be computed in
            fp32 by the caller.

        .. warning::
            Gradients found on the model's leaves after :attr:`backward` should not be treated
            as valid in general: they may still carry the loss scale (which, with dynamic loss
            scaling, can change over time).  Only the master gradients should be inspected;
            they are available through :attr:`inspect_master_grad_data()`.

        Args:
            loss:  The loss output by the user's model.  loss may be either float or half (but see the Note above).
            update_master_grads (bool, optional, default=True):  Whether to call :attr:`update_master_grads` at the end of this call.  Passing False delays that work, which avoids redundant updates when :attr:`backward` is called on several losses in one iteration.  In that case the caller must invoke :attr:`update_master_grads` before :attr:`step`.
            retain_graph (bool, optional, default=False):  Forwarded to the internal ``backward`` call on the scaled loss.  When gradients from several backward passes are accumulated before ``optimizer.step``, passing ``update_master_grads=False`` is also recommended (see Example below).

        Example::

            # Ordinary operation:
            optimizer.backward(loss)

            # Naive operation with multiple losses (technically valid, but less efficient):
            # the master grads are correct after the second call, but the first call
            # performs an unnecessary master-grad update.
            optimizer.backward(loss1)
            optimizer.backward(loss2)

            # More efficient way to handle multiple losses:
            # the master-grad update is delayed until the grads from all
            # losses have been accumulated.
            optimizer.backward(loss1, update_master_grads=False)
            optimizer.backward(loss2, update_master_grads=False)
            optimizer.update_master_grads()
        """ 
        # To consider:  try multiple backward passes using retain_graph=True to find
        # a loss scale that works, then do a final dummy backward pass with
        # retain_graph=False to tear down the graph.  That would avoid discarding the
        # iteration, but probably wouldn't improve overall efficiency.
        fp32_loss = loss.float()
        current_scale = self.loss_scaler.loss_scale()
        (fp32_loss * current_scale).backward(retain_graph=retain_graph)
        if not update_master_grads:
            return
        self.update_master_grads()

    def update_master_grads(self):
        """
        Hand the gradients of the stored fp16 params, together with the ``.grad``
        attributes of their fp32 master params, to the loss scaler's ``unscale``, then
        record the result of the loss scaler's ``update_scale`` in ``self.overflow``.
        :attr:`update_master_grads` only needs to be called if
        ``fp16_optimizer_obj.backward`` was called with ``update_master_grads=False``.
        """
        # NOTE: the original author marked this path as an incremental shim kept only so
        # existing tests pass, and stated that FP16_Optimizer should not be used by anyone.
        scaler = self.loss_scaler
        scaler.clear_overflow_state()

        half_params = self.all_fp16_params
        if len(half_params) > 0:
            half_grads = []
            fp32_grads = []
            for half_param, fp32_param in zip(half_params,
                                              self.all_fp32_from_fp16_params):
                # Params that received no gradient are left out of both lists.
                if half_param.grad is None:
                    continue
                half_grads.append(half_param.grad)
                # Make sure the master param has a grad tensor to pass along.
                if fp32_param.grad is None:
                    fp32_param.grad = torch.empty_like(fp32_param)
                fp32_grads.append(fp32_param.grad)
            scaler.unscale(half_grads, fp32_grads, scaler.loss_scale())

        native_fp32_params = self.all_fp32_from_fp32_params
        if len(native_fp32_params) > 0:
            # These params act as their own masters, so the same gradients are passed
            # as both the model and the master argument (in two separate lists).
            present_grads = [p.grad for p in native_fp32_params if p.grad is not None]
            scaler.unscale(present_grads, list(present_grads), scaler.loss_scale())

        self.overflow = scaler.update_scale()


    def inspect_master_grad_data(self):
        """
        Return the gradients held by the params of the wrapped optimizer.

        When running with :class:`FP16_Optimizer`, the ``.grad`` attributes of a model's
        fp16 leaves should not be regarded as truthful, because they might be scaled.
        This method reads the gradients from ``self.optimizer.param_groups`` instead,
        which is where the master params live.

        Returns:
            ``None`` (after printing a warning) if ``self.overflow`` is set.  Otherwise a
            list of lists, one inner list per parameter group, holding ``param.grad.data``
            for every param in that group, or ``None`` in the position of a param whose
            ``.grad`` is ``None``.
        """
        if self.overflow:
            print("Warning:  calling FP16_Optimizer.inspect_master_grad_data while in an overflow state.  "
                  "Gradients are currently invalid (may be inf, nan, or stale).  Returning None.")
            return None

        # The optimizer owns only references to master params.
        return [
            [
                master.grad.data if master.grad is not None else None
                for master in group['params']
            ]
            for group in self.optimizer.param_groups
        ]


    # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale"
    def _get_loss_scale(self):
        """Return the value reported by ``self.loss_scaler.loss_scale()``."""
        scaler = self.loss_scaler
        return scaler.loss_scale()

    def _set_loss_scale(self, value):
        """Store ``value`` directly in the loss scaler's ``_loss_scale`` attribute."""
        scaler = self.loss_scaler
        scaler._loss_scale = value

    loss_scale = property(fget=_get_loss_scale, fset=_set_loss_scale)

    # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state"
    def _get_state(self):
        """Return the wrapped optimizer's ``state``."""
        wrapped = self.optimizer
        return wrapped.state

    def _set_state(self, value):
        """Replace the wrapped optimizer's ``state`` with ``value``."""
        wrapped = self.optimizer
        wrapped.state = value

    state = property(fget=_get_state, fset=_set_state)

    # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups"
    # (for example, to adjust the learning rate)
    def _get_param_groups(self):
        """Return the wrapped optimizer's ``param_groups``."""
        wrapped = self.optimizer
        return wrapped.param_groups

    def _set_param_groups(self, value):
        """Replace the wrapped optimizer's ``param_groups`` with ``value``."""
        wrapped = self.optimizer
        wrapped.param_groups = value

    param_groups = property(fget=_get_param_groups, fset=_set_param_groups)



================================================
FILE: apex/apex/fp16_utils/fp16util.py
================================================
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors


class tofp16(nn.Module):
    """
    Parameter-free module whose forward pass casts its input to half precision::

        def forward(self, input):
            return input.half()
    """

    def __init__(self):
        super().__init__()

    def forward(self, input):
        """Return ``input.half()``."""
        halved = input.half()
        return halved


def BN_convert_float(module):
    """
    Utility function for network_to_half().

    Walks ``module`` and all of its descendants, calling ``.float()`` on every
    ``_BatchNorm`` instance whose ``affine`` attribute is ``True``.  Returns the
    same ``module`` object that was passed in.

    Retained for legacy purposes.
    """
    batchnorm_base = torch.nn.modules.batchnorm._BatchNorm
    is_affine_batchnorm = isinstance(module, batchnorm_base) and module.affine is True
    if is_affine_batchnorm:
        module.float()
    for submodule in module.children():
        BN_convert_float(submodule)
    return module


def network_to_half(network):
    """
    Convert model to half precision in a batchnorm-safe way.

    The result is an ``nn.Sequential`` made of a :class:`tofp16` input cast followed by
    ``network`` after ``.half()`` and :func:`BN_convert_float` have been applied to it.

    Retained for legacy purposes. It is recommended to use FP16Model.
    """
    input_cast = tofp16()
    halved_network = network.half()
    bn_safe_network = BN_convert_float(halved_network)
    return nn.Sequential(input_cast, bn_safe_network)


def convert_module(module, dtype):
    """
    Converts a module's immediate parameters and buffers to dtype.

    Only tensors registered directly on ``module`` are visited (``recurse=False``);
    submodules are left alone.  Non-floating-point tensors keep their dtype.  A
    parameter's existing gradient is converted as well when it is floating point.
    """
    for own_param in module.parameters(recurse=False):
        if own_param is None:
            continue
        if own_param.data.dtype.is_floating_point:
            own_param.data = own_param.data.to(dtype=dtype)
        grad = own_param._grad
        if grad is None:
            continue
        if grad.data.dtype.is_floating_point:
            grad.data = grad.data.to(dtype=dtype)

    for own_buffer in module.buffers(recurse=False):
        if own_buffer is None:
            continue
        if own_buffer.data.dtype.is_floating_point:
            own_buffer.data = own_buffer.data.to(dtype=dtype)


def convert_network(network, dtype):
    """
    Converts a network's parameters and buffers to dtype.
    """
    for module in network.modules():
        if

Download .txt

gitextract_kyecer1w/

├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── apex/
│   ├── .gitignore
│   ├── .nojekyll
│   ├── LICENSE
│   ├── README.md
│   ├── apex/
│   │   ├── RNN/
│   │   │   ├── README.md
│   │   │   ├── RNNBackend.py
│   │   │   ├── __init__.py
│   │   │   ├── cells.py
│   │   │   └── models.py
│   │   ├── __init__.py
│   │   ├── amp/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── __version__.py
│   │   │   ├── _amp_state.py
│   │   │   ├── _initialize.py
│   │   │   ├── _process_optimizer.py
│   │   │   ├── amp.py
│   │   │   ├── compat.py
│   │   │   ├── frontend.py
│   │   │   ├── handle.py
│   │   │   ├── lists/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── functional_overrides.py
│   │   │   │   ├── tensor_overrides.py
│   │   │   │   └── torch_overrides.py
│   │   │   ├── opt.py
│   │   │   ├── rnn_compat.py
│   │   │   ├── scaler.py
│   │   │   ├── utils.py
│   │   │   └── wrap.py
│   │   ├── fp16_utils/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── fp16_optimizer.py
│   │   │   ├── fp16util.py
│   │   │   └── loss_scaler.py
│   │   ├── multi_tensor_apply/
│   │   │   ├── __init__.py
│   │   │   └── multi_tensor_apply.py
│   │   ├── normalization/
│   │   │   ├── __init__.py
│   │   │   └── fused_layer_norm.py
│   │   ├── optimizers/
│   │   │   ├── __init__.py
│   │   │   ├── fp16_optimizer.py
│   │   │   └── fused_adam.py
│   │   ├── parallel/
│   │   │   ├── LARC.py
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── distributed.py
│   │   │   ├── multiproc.py
│   │   │   ├── optimized_sync_batchnorm.py
│   │   │   ├── optimized_sync_batchnorm_kernel.py
│   │   │   ├── sync_batchnorm.py
│   │   │   └── sync_batchnorm_kernel.py
│   │   └── reparameterization/
│   │       ├── README.md
│   │       ├── __init__.py
│   │       ├── reparameterization.py
│   │       └── weight_norm.py
│   ├── apex.patch
│   ├── csrc/
│   │   ├── amp_C_frontend.cpp
│   │   ├── flatten_unflatten.cpp
│   │   ├── fused_adam_cuda.cpp
│   │   ├── fused_adam_cuda_kernel.cu
│   │   ├── layer_norm_cuda.cpp
│   │   ├── layer_norm_cuda_kernel.cu
│   │   ├── multi_tensor_apply.cuh
│   │   ├── multi_tensor_axpby_kernel.cu
│   │   ├── multi_tensor_l2norm_kernel.cu
│   │   ├── multi_tensor_lamb_stage_1.cu
│   │   ├── multi_tensor_lamb_stage_2.cu
│   │   ├── multi_tensor_scale_kernel.cu
│   │   ├── syncbn.cpp
│   │   ├── type_shim.h
│   │   └── welford.cu
│   ├── docs/
│   │   ├── Makefile
│   │   └── source/
│   │       ├── _static/
│   │       │   └── css/
│   │       │       └── pytorch_theme.css
│   │       ├── _templates/
│   │       │   └── layout.html
│   │       ├── advanced.rst
│   │       ├── amp.rst
│   │       ├── conf.py
│   │       ├── fp16_utils.rst
│   │       ├── index.rst
│   │       ├── layernorm.rst
│   │       ├── optimizers.rst
│   │       └── parallel.rst
│   ├── examples/
│   │   ├── README.md
│   │   ├── dcgan/
│   │   │   └── README.md
│   │   ├── docker/
│   │   │   ├── Dockerfile
│   │   │   └── README.md
│   │   ├── imagenet/
│   │   │   ├── README.md
│   │   │   └── main_amp.py
│   │   └── simple/
│   │       └── distributed/
│   │           ├── README.md
│   │           ├── distributed_data_parallel.py
│   │           └── run.sh
│   ├── setup.py
│   └── tests/
│       ├── L0/
│       │   ├── run_amp/
│       │   │   ├── __init__.py
│       │   │   ├── test_add_param_group.py
│       │   │   ├── test_basic_casts.py
│       │   │   ├── test_cache.py
│       │   │   ├── test_multi_tensor_axpby.py
│       │   │   ├── test_multi_tensor_l2norm.py
│       │   │   ├── test_multi_tensor_scale.py
│       │   │   ├── test_multiple_models_optimizers_losses.py
│       │   │   ├── test_promotion.py
│       │   │   ├── test_rnn.py
│       │   │   └── utils.py
│       │   ├── run_fp16util/
│       │   │   ├── __init__.py
│       │   │   └── test_fp16util.py
│       │   ├── run_fused_layer_norm/
│       │   │   └── test_fused_layer_norm.py
│       │   ├── run_mixed_adam/
│       │   │   ├── __init__.py
│       │   │   ├── test_fp16_optimizer.py
│       │   │   └── test_mixed_adam.py
│       │   └── run_test.py
│       ├── L1/
│       │   ├── common/
│       │   │   ├── compare.py
│       │   │   ├── main_amp.py
│       │   │   └── run_test.sh
│       │   ├── cross_product/
│       │   │   └── run.sh
│       │   └── cross_product_distributed/
│       │       └── run.sh
│       ├── distributed/
│       │   ├── DDP/
│       │   │   ├── ddp_race_condition_test.py
│       │   │   └── run_race_test.sh
│       │   ├── amp_master_params/
│       │   │   ├── amp_master_params.py
│       │   │   ├── compare.py
│       │   │   └── run.sh
│       │   └── synced_batchnorm/
│       │       ├── single_gpu_unit_test.py
│       │       ├── test_groups.py
│       │       ├── two_gpu_unit_test.py
│       │       └── unit_test.sh
│       └── docker_extension_builds/
│           └── run.sh
├── jukebox/
│   ├── Interacting_with_Jukebox.ipynb
│   ├── __init__.py
│   ├── align.py
│   ├── data/
│   │   ├── __init__.py
│   │   ├── artist_genre_processor.py
│   │   ├── data_processor.py
│   │   ├── files_dataset.py
│   │   ├── ids/
│   │   │   ├── v2_artist_ids.txt
│   │   │   ├── v2_genre_ids.txt
│   │   │   ├── v3_artist_ids.txt
│   │   │   └── v3_genre_ids.txt
│   │   ├── labels.py
│   │   └── text_processor.py
│   ├── hparams.py
│   ├── lyricdict.py
│   ├── make_models.py
│   ├── prior/
│   │   ├── __init__.py
│   │   ├── autoregressive.py
│   │   ├── conditioners.py
│   │   └── prior.py
│   ├── sample.py
│   ├── save_html.py
│   ├── tests/
│   │   └── test_sample.py
│   ├── train.py
│   ├── transformer/
│   │   ├── __init__.py
│   │   ├── factored_attention.py
│   │   ├── ops.py
│   │   └── transformer.py
│   ├── utils/
│   │   ├── __init__.py
│   │   ├── audio_utils.py
│   │   ├── checkpoint.py
│   │   ├── dist_adapter.py
│   │   ├── dist_utils.py
│   │   ├── ema.py
│   │   ├── fp16.py
│   │   ├── io.py
│   │   ├── logger.py
│   │   ├── remote_utils.py
│   │   ├── sample_utils.py
│   │   └── torch_utils.py
│   └── vqvae/
│       ├── __init__.py
│       ├── bottleneck.py
│       ├── encdec.py
│       ├── resnet.py
│       └── vqvae.py
├── requirements.txt
├── setup.py
└── tensorboardX/
    ├── .codecov.yml
    ├── .flake8
    ├── .github/
    │   └── ISSUE_TEMPLATE/
    │       ├── bug_report.md
    │       └── feature-requests-or-general-questions.md
    ├── .gitignore
    ├── .travis.yml
    ├── HISTORY.rst
    ├── LICENSE
    ├── MANIFEST.in
    ├── README.md
    ├── compile.sh
    ├── docs/
    │   ├── Makefile
    │   ├── conf.py
    │   ├── index.rst
    │   ├── tensorboard.rst
    │   ├── tutorial.rst
    │   ├── tutorial_zh.rst
    │   └── utils.rst
    ├── examples/
    │   ├── RUN_AFTER_PIP_INSTALL
    │   ├── __init__.py
    │   ├── chainer/
    │   │   ├── extension_logger/
    │   │   │   ├── net.py
    │   │   │   ├── train_dcgan.py
    │   │   │   ├── updater.py
    │   │   │   ├── visualize.py
    │   │   │   └── writetensorboard.py
    │   │   └── plain_logger/
    │   │       ├── data.py
    │   │       ├── net.py
    │   │       └── train_vae.py
    │   ├── demo.py
    │   ├── demo_beholder.py
    │   ├── demo_caffe2.py
    │   ├── demo_custom_scalars.py
    │   ├── demo_embedding.py
    │   ├── demo_graph.py
    │   ├── demo_hparams.py
    │   ├── demo_matplotlib.py
    │   ├── demo_multiple_embedding.py
    │   ├── demo_nvidia_smi.py
    │   ├── demo_onnx.py
    │   └── demo_purge.py
    ├── setup.cfg
    ├── setup.py
    ├── tensorboardX/
    │   ├── __init__.py
    │   ├── beholder/
    │   │   ├── __init__.py
    │   │   ├── beholder.py
    │   │   ├── file_system_tools.py
    │   │   ├── shared_config.py
    │   │   └── video_writing.py
    │   ├── caffe2_graph.py
    │   ├── crc32c.py
    │   ├── embedding.py
    │   ├── event_file_writer.py
    │   ├── onnx_graph.py
    │   ├── proto/
    │   │   ├── __init__.py
    │   │   ├── api.proto
    │   │   ├── api_pb2.py
    │   │   ├── attr_value.proto
    │   │   ├── attr_value_pb2.py
    │   │   ├── event.proto
    │   │   ├── event_pb2.py
    │   │   ├── graph.proto
    │   │   ├── graph_pb2.py
    │   │   ├── layout.proto
    │   │   ├── layout_pb2.py
    │   │   ├── node_def.proto
    │   │   ├── node_def_pb2.py
    │   │   ├── plugin_hparams.proto
    │   │   ├── plugin_hparams_pb2.py
    │   │   ├── plugin_mesh.proto
    │   │   ├── plugin_mesh_pb2.py
    │   │   ├── plugin_pr_curve.proto
    │   │   ├── plugin_pr_curve_pb2.py
    │   │   ├── plugin_text.proto
    │   │   ├── plugin_text_pb2.py
    │   │   ├── resource_handle.proto
    │   │   ├── resource_handle_pb2.py
    │   │   ├── step_stats.proto
    │   │   ├── step_stats_pb2.py
    │   │   ├── summary.proto
    │   │   ├── summary_pb2.py
    │   │   ├── tensor.proto
    │   │   ├── tensor_pb2.py
    │   │   ├── tensor_shape.proto
    │   │   ├── tensor_shape_pb2.py
    │   │   ├── types.proto
    │   │   ├── types_pb2.py
    │   │   ├── versions.proto
    │   │   └── versions_pb2.py
    │   ├── proto_graph.py
    │   ├── pytorch_graph.py
    │   ├── record_writer.py
    │   ├── summary.py
    │   ├── torchvis.py
    │   ├── utils.py
    │   ├── visdom_writer.py
    │   ├── writer.py
    │   └── x2num.py
    ├── tensorboardX.patch
    └── tests/
        ├── __init__.py
        ├── event_file_writer_test.py
        ├── expect/
        │   ├── caffe_mnist.expect
        │   ├── caffe_overfeat.expect
        │   ├── test_caffe2.test_simple_cnnmodel.expect
        │   ├── test_caffe2.test_simple_model.expect
        │   ├── test_pr_curve.test_pr_purve.expect
        │   ├── test_pr_curve.test_pr_purve_raw.expect
        │   ├── test_summary.test_audio.expect
        │   ├── test_summary.test_custom_scalars.expect
        │   ├── test_summary.test_float32_image.expect
        │   ├── test_summary.test_histogram_auto.expect
        │   ├── test_summary.test_histogram_doane.expect
        │   ├── test_summary.test_histogram_fd.expect
        │   ├── test_summary.test_hparams.expect
        │   ├── test_summary.test_image_with_3_channel_batched.expect
        │   ├── test_summary.test_image_with_boxes.expect
        │   ├── test_summary.test_image_with_four_channel.expect
        │   ├── test_summary.test_image_with_four_channel_batched.expect
        │   ├── test_summary.test_image_with_one_channel.expect
        │   ├── test_summary.test_image_with_one_channel_batched.expect
        │   ├── test_summary.test_image_without_channel.expect
        │   ├── test_summary.test_mesh.expect
        │   ├── test_summary.test_text.expect
        │   ├── test_summary.test_uint8_image.expect
        │   └── test_summary.test_video.expect
        ├── expect_reader.py
        ├── record_writer_test.py
        ├── test_beholder.py
        ├── test_caffe2.py
        ├── test_chainer_np.py
        ├── test_crc32c.py
        ├── test_embedding.py
        ├── test_figure.py
        ├── test_numpy.py
        ├── test_onnx_graph.py
        ├── test_pr_curve.py
        ├── test_pytorch_graph.py
        ├── test_pytorch_np.py
        ├── test_record_writer.py
        ├── test_summary.py
        ├── test_summary_writer.py
        ├── test_test.py
        ├── test_utils.py
        ├── test_visdom.py
        └── test_writer.py

Download .txt

SYMBOL INDEX (1379 symbols across 146 files)

FILE: apex/apex/RNN/RNNBackend.py
  function is_iterable (line 10) | def is_iterable(maybe_iterable):
  function flatten_list (line 14) | def flatten_list(tens_list):
  class bidirectionalRNN (line 25) | class bidirectionalRNN(nn.Module):
    method __init__ (line 29) | def __init__(self, inputRNN, num_layers=1, dropout = 0):
    method forward (line 37) | def forward(self, input, collect_hidden=False):
    method reset_parameters (line 52) | def reset_parameters(self):
    method init_hidden (line 59) | def init_hidden(self, bsz):
    method detach_hidden (line 66) | def detach_hidden(self):
    method reset_hidden (line 73) | def reset_hidden(self, bsz):
    method init_inference (line 80) | def init_inference(self, bsz):
  class stackedRNN (line 90) | class stackedRNN(nn.Module):
    method __init__ (line 94) | def __init__(self, inputRNN, num_layers=1, dropout=0):
    method forward (line 122) | def forward(self, input, collect_hidden=False, reverse=False):
    method reset_parameters (line 197) | def reset_parameters(self):
    method init_hidden (line 204) | def init_hidden(self, bsz):
    method detach_hidden (line 211) | def detach_hidden(self):
    method reset_hidden (line 218) | def reset_hidden(self, bsz):
    method init_inference (line 225) | def init_inference(self, bsz):
  class RNNCell (line 232) | class RNNCell(nn.Module):
    method __init__ (line 242) | def __init__(self, gate_multiplier, input_size, hidden_size, cell, n_h...
    method new_like (line 274) | def new_like(self, new_input_size=None):
    method reset_parameters (line 291) | def reset_parameters(self, gain=1):
    method init_hidden (line 309) | def init_hidden(self, bsz):
    method reset_hidden (line 330) | def reset_hidden(self, bsz):
    method detach_hidden (line 338) | def detach_hidden(self):
    method forward (line 348) | def forward(self, input):

FILE: apex/apex/RNN/cells.py
  class mLSTMRNNCell (line 12) | class mLSTMRNNCell(RNNCell):
    method __init__ (line 17) | def __init__(self, input_size, hidden_size, bias = False, output_size ...
    method forward (line 26) | def forward(self, input):
    method new_like (line 45) | def new_like(self, new_input_size=None):
  function mLSTMCell (line 55) | def mLSTMCell(input, hidden, w_ih, w_hh, w_mih, w_mhh, b_ih=None, b_hh=N...

FILE: apex/apex/RNN/models.py
  function toRNNBackend (line 8) | def toRNNBackend(inputRNN, num_layers, bidirectional=False, dropout = 0):
  function LSTM (line 19) | def LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=Fal...
  function GRU (line 26) | def GRU(input_size, hidden_size, num_layers, bias=True, batch_first=Fals...
  function ReLU (line 33) | def ReLU(input_size, hidden_size, num_layers, bias=True, batch_first=Fal...
  function Tanh (line 40) | def Tanh(input_size, hidden_size, num_layers, bias=True, batch_first=Fal...
  function mLSTM (line 47) | def mLSTM(input_size, hidden_size, num_layers, bias=True, batch_first=Fa...

FILE: apex/apex/amp/_amp_state.py
  class AmpState (line 17) | class AmpState(object):
    method __init__ (line 18) | def __init__(self):
  function warn_or_err (line 28) | def warn_or_err(msg):
  function maybe_print (line 43) | def maybe_print(msg, rank0=False):
  function master_params (line 61) | def master_params(optimizer):

FILE: apex/apex/amp/_initialize.py
  function to_type (line 18) | def to_type(dtype, t):
  function applier (line 36) | def applier(value, fn):
  function check_models (line 61) | def check_models(models):
  function check_params_fp32 (line 76) | def check_params_fp32(models):
  function check_optimizers (line 116) | def check_optimizers(optimizers):
  function wrap_fused_adam (line 134) | def wrap_fused_adam(optimizer, properties):
  function _initialize (line 150) | def _initialize(models, optimizers, properties, num_losses=1, cast_model...

FILE: apex/apex/amp/_process_optimizer.py
  class AmpOptimizerState (line 8) | class AmpOptimizerState(object):
    method __init__ (line 9) | def __init__(self):
  function lazy_init_with_master_weights (line 13) | def lazy_init_with_master_weights(self):
  function prepare_backward_with_master_weights (line 76) | def prepare_backward_with_master_weights(self):
  function post_backward_with_master_weights (line 96) | def post_backward_with_master_weights(self, scaler):
  function lazy_init_no_master_weights (line 165) | def lazy_init_no_master_weights(self):
  function prepare_backward_no_master_weights (line 184) | def prepare_backward_no_master_weights(self):
  function post_backward_no_master_weights (line 202) | def post_backward_no_master_weights(self, scaler):
  function _master_params_to_model_params (line 242) | def _master_params_to_model_params(self):
  function _process_optimizer (line 256) | def _process_optimizer(optimizer, properties):

FILE: apex/apex/amp/amp.py
  function _decorator_helper (line 18) | def _decorator_helper(orig_fn, cast_fn, wrap_fn):
  function half_function (line 30) | def half_function(fn):
  function float_function (line 35) | def float_function(fn):
  function promote_function (line 40) | def promote_function(fn):
  function register_half_function (line 46) | def register_half_function(module, name):
  function register_float_function (line 53) | def register_float_function(module, name):
  function register_promote_function (line 60) | def register_promote_function(module, name):
  function init (line 68) | def init(enabled=True, loss_scale="dynamic", enable_caching=True, verbos...

FILE: apex/apex/amp/compat.py
  function variable_is_tensor (line 4) | def variable_is_tensor():
  function tensor_is_variable (line 8) | def tensor_is_variable():
  function tensor_is_float_tensor (line 13) | def tensor_is_float_tensor():
  function is_tensor_like (line 19) | def is_tensor_like(x):
  function is_floating_point (line 24) | def is_floating_point(x):
  function scalar_python_val (line 35) | def scalar_python_val(x):

FILE: apex/apex/amp/frontend.py
  class Properties (line 6) | class Properties(object):
    method __init__ (line 12) | def __init__(self):
    method _update_options_dict (line 32) | def _update_options_dict(new_options):
    method __getattr__ (line 42) | def __getattr__(self, name):
    method __setattr__ (line 50) | def __setattr__(self, name, value):
  class O3 (line 101) | class O3:
    method __call__ (line 110) | def __call__(self, properties):
  class O2 (line 123) | class O2:
    method __call__ (line 133) | def __call__(self, properties):
  class O1 (line 146) | class O1:
    method __call__ (line 155) | def __call__(self, properties):
  class O0 (line 168) | class O0:
    method __call__ (line 174) | def __call__(self, properties):
  function initialize (line 194) | def initialize(

FILE: apex/apex/amp/handle.py
  function scale_loss (line 16) | def scale_loss(loss,
  function disable_casts (line 163) | def disable_casts():
  class AmpHandle (line 169) | class AmpHandle(object):
    method __init__ (line 170) | def __init__(self, loss_scale="dynamic", enable_caching=True, verbose=...
    method is_active (line 178) | def is_active(self):
    method _disable_casts (line 182) | def _disable_casts(self):
    method wrap_optimizer (line 187) | def wrap_optimizer(self, optimizer, num_loss=1):
    method scale_loss (line 192) | def scale_loss(self, loss, optimizer):
    method _clear_cache (line 225) | def _clear_cache(self):
    method _save_func (line 229) | def _save_func(self, mod, fn, func):
    method _deactivate (line 232) | def _deactivate(self):
    method has_cache (line 238) | def has_cache(self):
    method cache (line 242) | def cache(self):
    method remove_cache (line 245) | def remove_cache(self, param):
    method verbose (line 250) | def verbose(self):
  class NoOpHandle (line 253) | class NoOpHandle(object):
    method is_active (line 254) | def is_active(self):
    method _disable_casts (line 258) | def _disable_casts(self):
    method wrap_optimizer (line 261) | def wrap_optimizer(self, optimizer, num_loss=1):
    method scale_loss (line 265) | def scale_loss(self, loss, optimizer):
    method has_cache (line 269) | def has_cache(self):
    method verbose (line 273) | def verbose(self):
    method _clear_cache (line 276) | def _clear_cache(self):
    method _deactivate (line 279) | def _deactivate(self):

FILE: apex/apex/amp/opt.py
  class OptimWrapper (line 9) | class OptimWrapper(object):
    method __init__ (line 10) | def __init__(self, optimizer, amp_handle, num_loss):
    method scale_loss (line 19) | def scale_loss(self, loss):
    method _cur_loss_scaler (line 55) | def _cur_loss_scaler(self):
    method step (line 59) | def step(self, closure=None):
    method __getattr__ (line 80) | def __getattr__(self, attr):
    method __getstate__ (line 84) | def __getstate__(self):
    method __setstate__ (line 87) | def __setstate__(self):
    method __repr__ (line 90) | def __repr__(self):
    method state_dict (line 93) | def state_dict(self):
    method load_state_dict (line 96) | def load_state_dict(self, state_dict):
    method zero_grad (line 99) | def zero_grad(self):
    method add_param_group (line 102) | def add_param_group(self, param_group):

FILE: apex/apex/amp/rnn_compat.py
  function _gen_VF_wrapper (line 7) | def _gen_VF_wrapper(name):
  class VariableFunctionsShim (line 17) | class VariableFunctionsShim(object):
    method __init__ (line 18) | def __init__(self):
  function has_old_rnns (line 24) | def has_old_rnns():
  function whitelist_rnn_cells (line 31) | def whitelist_rnn_cells(handle, verbose):

FILE: apex/apex/amp/scaler.py
  function scale_check_overflow_python (line 6) | def scale_check_overflow_python(model_grad, master_grad, scale, check_ov...
  function axpby_check_overflow_python (line 19) | def axpby_check_overflow_python(model_grad, stashed_grad, master_grad, s...
  class LossScaler (line 34) | class LossScaler(object):
    method __init__ (line 39) | def __init__(self,
    method loss_scale (line 74) | def loss_scale(self):
    method unscale_python (line 77) | def unscale_python(self, model_grads, master_grads, scale):
    method unscale (line 95) | def unscale(self, model_grads, master_grads, unused_scale, models_are_...
    method unscale_with_stashed_python (line 125) | def unscale_with_stashed_python(self,
    method unscale_with_stashed (line 149) | def unscale_with_stashed(self,
    method clear_overflow_state (line 184) | def clear_overflow_state(self):
    method update_scale (line 190) | def update_scale(self):

FILE: apex/apex/amp/utils.py
  function get_cuda_version (line 8) | def get_cuda_version():
  function is_fp_tensor (line 11) | def is_fp_tensor(x):
  function is_nested (line 20) | def is_nested(x):
  function should_cache (line 23) | def should_cache(x):
  function collect_fp_tensor_types (line 33) | def collect_fp_tensor_types(args, kwargs):
  function type_string (line 48) | def type_string(x):
  function maybe_half (line 51) | def maybe_half(x, name='', verbose=False):
  function maybe_float (line 62) | def maybe_float(x, name='', verbose=False):
  function casted_args (line 74) | def casted_args(cast_fn, args, kwargs):
  function cached_cast (line 87) | def cached_cast(cast_fn, x, cache):
  function verbosify (line 121) | def verbosify(cast_fn, fn_name, verbose):
  function as_inplace (line 127) | def as_inplace(fns):
  function has_func (line 131) | def has_func(mod, fn):
  function get_func (line 139) | def get_func(mod, fn):
  function set_func (line 147) | def set_func(mod, fn, new_fn):
  function set_func_save (line 155) | def set_func_save(handle, mod, fn, new_fn):
  function synthesize_flattened_rnn_weights (line 174) | def synthesize_flattened_rnn_weights(fp32_weights,
  function new_synthesize_flattened_rnn_weights (line 197) | def new_synthesize_flattened_rnn_weights(fp32_weights,

FILE: apex/apex/amp/wrap.py
  function make_cast_wrapper (line 10) | def make_cast_wrapper(orig_fn, cast_fn, handle,
  function cached_cast (line 31) | def cached_cast(mod, fn, cast_fn, handle,
  function make_promote_wrapper (line 44) | def make_promote_wrapper(orig_fn, cast_fn, handle=None):
  function promote (line 65) | def promote(mod, fn, handle, verbose=False):
  function sequence_promote (line 71) | def sequence_promote(mod, fn, handle, verbose=False):
  function promote_match_arg0 (line 92) | def promote_match_arg0(mod, fn, handle, verbose=False):
  function err_if_any_half (line 114) | def err_if_any_half(mod, fn, handle, custom_err_msg=None):
  function err_if_arg0_half (line 132) | def err_if_arg0_half(mod, fn, handle, verbose=False):
  function rnn_cast (line 157) | def rnn_cast(backend, fn, handle, verbose=False):
  function new_rnn_cast (line 222) | def new_rnn_cast(fn, handle, verbose=False):
  function disable_casts (line 267) | def disable_casts(mod, fn, handle):

FILE: apex/apex/fp16_utils/fp16_optimizer.py
  class FP16_Optimizer (line 13) | class FP16_Optimizer(object):
    method __init__ (line 107) | def __init__(self,
    method maybe_print (line 199) | def maybe_print(self, msg):
    method __getstate__ (line 203) | def __getstate__(self):
    method __setstate__ (line 206) | def __setstate__(self, state):
    method zero_grad (line 209) | def zero_grad(self, set_grads_to_None=False):
    method _master_params_to_model_params (line 249) | def _master_params_to_model_params(self):
    method clip_master_grads (line 274) | def clip_master_grads(self, max_norm, norm_type=2):
    method state_dict (line 298) | def state_dict(self):
    method load_state_dict (line 319) | def load_state_dict(self, state_dict):
    method step (line 361) | def step(self, closure=None): # could add clip option.
    method _step_with_closure (line 423) | def _step_with_closure(self, closure):
    method backward (line 462) | def backward(self, loss, update_master_grads=True, retain_graph=False):
    method update_master_grads (line 525) | def update_master_grads(self):
    method inspect_master_grad_data (line 582) | def inspect_master_grad_data(self):
    method _get_loss_scale (line 617) | def _get_loss_scale(self):
    method _set_loss_scale (line 620) | def _set_loss_scale(self, value):
    method _get_state (line 626) | def _get_state(self):
    method _set_state (line 629) | def _set_state(self, value):
    method _get_param_groups (line 636) | def _get_param_groups(self):
    method _set_param_groups (line 639) | def _set_param_groups(self, value):

FILE: apex/apex/fp16_utils/fp16util.py
  class tofp16 (line 7) | class tofp16(nn.Module):
    method __init__ (line 15) | def __init__(self):
    method forward (line 18) | def forward(self, input):
  function BN_convert_float (line 22) | def BN_convert_float(module):
  function network_to_half (line 35) | def network_to_half(network):
  function convert_module (line 44) | def convert_module(module, dtype):
  function convert_network (line 60) | def convert_network(network, dtype):
  class FP16Model (line 73) | class FP16Model(nn.Module):
    method __init__ (line 78) | def __init__(self, network):
    method forward (line 82) | def forward(self, *inputs):
  function backwards_debug_hook (line 87) | def backwards_debug_hook(grad):
  function prep_param_lists (line 90) | def prep_param_lists(model, flat_master=False):
  function model_grads_to_master_grads (line 136) | def model_grads_to_master_grads(model_params, master_params, flat_master...
  function master_params_to_model_params (line 158) | def master_params_to_model_params(model_params, master_params, flat_mast...
  function to_python_float (line 176) | def to_python_float(t):

FILE: apex/apex/fp16_utils/loss_scaler.py
  function to_python_float (line 4) | def to_python_float(t):
  class LossScaler (line 10) | class LossScaler:
    method __init__ (line 22) | def __init__(self, scale=1):
    method has_overflow (line 26) | def has_overflow(self, params):
    method _has_inf_or_nan (line 30) | def _has_inf_or_nan(x):
    method update_scale (line 33) | def update_scale(self, overflow):
    method loss_scale (line 37) | def loss_scale(self):
    method scale_gradient (line 40) | def scale_gradient(self, module, grad_in, grad_out):
    method backward (line 43) | def backward(self, loss, retain_graph=False):
  class DynamicLossScaler (line 47) | class DynamicLossScaler:
    method __init__ (line 73) | def __init__(self,
    method has_overflow (line 84) | def has_overflow(self, params):
    method _has_inf_or_nan (line 92) | def _has_inf_or_nan(x):
    method update_scale (line 113) | def update_scale(self, overflow):
    method loss_scale (line 124) | def loss_scale(self):
    method scale_gradient (line 127) | def scale_gradient(self, module, grad_in, grad_out):
    method backward (line 130) | def backward(self, loss, retain_graph=False):

FILE: apex/apex/multi_tensor_apply/multi_tensor_apply.py
  class MultiTensorApply (line 3) | class MultiTensorApply(object):
    method __init__ (line 7) | def __init__(self, chunk_size):
    method check_avail (line 16) | def check_avail(self):
    method __call__ (line 24) | def __call__(self, op, noop_flag_buffer, tensor_lists, *args):

FILE: apex/apex/normalization/fused_layer_norm.py
  class FusedLayerNormAffineFunction (line 9) | class FusedLayerNormAffineFunction(torch.autograd.Function):
    method __init__ (line 10) | def __init__(self, normalized_shape, eps=1e-6):
    method forward (line 17) | def forward(self, input, weight, bias):
    method backward (line 26) | def backward(self, grad_output):
  class FusedLayerNormFunction (line 35) | class FusedLayerNormFunction(torch.autograd.Function):
    method __init__ (line 36) | def __init__(self, normalized_shape, eps=1e-6):
    method forward (line 42) | def forward(self, input):
    method backward (line 49) | def backward(self, grad_output):
  function fused_layer_norm_affine (line 58) | def fused_layer_norm_affine(input, normalized_shape, weight, bias, eps=1...
  function fused_layer_norm (line 61) | def fused_layer_norm(input, normalized_shape, eps=1e-6):
  class FusedLayerNorm (line 64) | class FusedLayerNorm(torch.nn.Module):
    method __init__ (line 123) | def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
    method reset_parameters (line 142) | def reset_parameters(self):
    method forward (line 147) | def forward(self, input):
    method extra_repr (line 158) | def extra_repr(self):

FILE: apex/apex/optimizers/fp16_optimizer.py
  class FP16_Optimizer (line 4) | class FP16_Optimizer(object):
    method __init__ (line 31) | def __init__(self,
    method zero_grad (line 88) | def zero_grad(self, set_grads_to_None=True):
    method _compute_grad_norm (line 103) | def _compute_grad_norm(self, fp16_grads_flat, norm_type=2):
    method step (line 130) | def step(self, closure=None):
    method backward (line 163) | def backward(self, loss):
    method _update_scale (line 174) | def _update_scale(self, skip):
    method _get_state (line 193) | def _get_state(self):
    method _set_state (line 196) | def _set_state(self, value):
    method _get_param_groups (line 203) | def _get_param_groups(self):
    method _set_param_groups (line 206) | def _set_param_groups(self, value):
    method state_dict (line 211) | def state_dict(self):
    method load_state_dict (line 234) | def load_state_dict(self, state_dict):

FILE: apex/apex/optimizers/fused_adam.py
  class FusedAdam (line 5) | class FusedAdam(torch.optim.Optimizer):
    method __init__ (line 35) | def __init__(self, params,
    method step (line 50) | def step(self, closure=None, grads=None, output_params=None, scale=1.,...

FILE: apex/apex/parallel/LARC.py
  class LARC (line 6) | class LARC(object):
    method __init__ (line 40) | def __init__(self, optimizer, trust_coefficient=0.02, clip=True, eps=1...
    method __getstate__ (line 47) | def __getstate__(self):
    method __setstate__ (line 50) | def __setstate__(self, state):
    method __repr__ (line 53) | def __repr__(self):
    method state_dict (line 56) | def state_dict(self):
    method load_state_dict (line 59) | def load_state_dict(self, state_dict):
    method zero_grad (line 62) | def zero_grad(self):
    method add_param_group (line 65) | def add_param_group(self, param_group):
    method step (line 68) | def step(self):

FILE: apex/apex/parallel/__init__.py
  function convert_syncbn_model (line 21) | def convert_syncbn_model(module, process_group=None, channel_last=False):
  function create_syncbn_process_group (line 55) | def create_syncbn_process_group(group_size):

FILE: apex/apex/parallel/distributed.py
  function import_flatten_impl (line 13) | def import_flatten_impl():
  function flatten (line 25) | def flatten(bucket):
  function unflatten (line 30) | def unflatten(coalesced, bucket):
  function apply_flat_dist_call (line 36) | def apply_flat_dist_call(bucket, call, extra_args=None):
  function split_half_float_double (line 51) | def split_half_float_double(tensors):
  function split_by_type (line 60) | def split_by_type(tensors):
  function flat_dist_call (line 70) | def flat_dist_call(tensors, call, extra_args=None):
  function extract_tensors (line 78) | def extract_tensors(maybe_tensor, tensor_list):
  class Reducer (line 89) | class Reducer(object):
    method __init__ (line 111) | def __init__(self, module_or_grads_list):
    method reduce (line 121) | def reduce(self):
  class DistributedDataParallel (line 129) | class DistributedDataParallel(Module):
    method __init__ (line 162) | def __init__(self,
    method __setstate__ (line 237) | def __setstate__(self, state):
    method __getstate__ (line 243) | def __getstate__(self):
    method enable_allreduce (line 250) | def enable_allreduce(self):
    method disable_allreduce (line 253) | def disable_allreduce(self):
    method sync_bucket_structure (line 258) | def sync_bucket_structure(self):
    method create_hooks (line 294) | def create_hooks(self):
    method allreduce_bucket (line 378) | def allreduce_bucket(self, bucket):
    method allreduce_maybe_retain (line 401) | def allreduce_maybe_retain(self, bucket, bucket_idx=-1):
    method allreduce_fallback (line 420) | def allreduce_fallback(self):
    method comm_ready_buckets (line 435) | def comm_ready_buckets(self, param):
    method forward (line 475) | def forward(self, *inputs, **kwargs):

FILE: apex/apex/parallel/multiproc.py
  function docstring_hack (line 5) | def docstring_hack():

FILE: apex/apex/parallel/optimized_sync_batchnorm.py
  class SyncBatchNorm (line 9) | class SyncBatchNorm(_BatchNorm):
    method __init__ (line 58) | def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, ...
    method _specify_process_group (line 63) | def _specify_process_group(self, process_group):
    method _specify_channel_last (line 66) | def _specify_channel_last(self, channel_last):
    method forward (line 69) | def forward(self, input):

FILE: apex/apex/parallel/optimized_sync_batchnorm_kernel.py
  class SyncBatchnormFunction (line 7) | class SyncBatchnormFunction(Function):
    method forward (line 10) | def forward(ctx, input, weight, bias, running_mean, running_variance, ...
    method backward (line 70) | def backward(ctx, grad_output):

FILE: apex/apex/parallel/sync_batchnorm.py
  class SyncBatchNorm (line 9) | class SyncBatchNorm(_BatchNorm):
    method __init__ (line 51) | def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, ...
    method _specify_process_group (line 62) | def _specify_process_group(self, process_group):
    method forward (line 65) | def forward(self, input):

FILE: apex/apex/parallel/sync_batchnorm_kernel.py
  class SyncBatchnormFunction (line 7) | class SyncBatchnormFunction(Function):
    method forward (line 10) | def forward(ctx, input, weight, bias, running_mean, running_variance, ...
    method backward (line 33) | def backward(ctx, grad_output):

FILE: apex/apex/reparameterization/__init__.py
  function apply_weight_norm (line 4) | def apply_weight_norm(module, name='', dim=0, hook_child=True):
  function remove_weight_norm (line 50) | def remove_weight_norm(module, name='', remove_all=False):
  function apply_reparameterization (line 64) | def apply_reparameterization(module, reparameterization=None, name='', d...
  function remove_reparameterization (line 96) | def remove_reparameterization(module, reparameterization=Reparameterizat...

FILE: apex/apex/reparameterization/reparameterization.py
  class Reparameterization (line 4) | class Reparameterization(object):
    method __init__ (line 19) | def __init__(self, name, dim, module, retain_forward=True):
    method compute_weight (line 28) | def compute_weight(self, module=None, name=None):
    method reparameterize (line 40) | def reparameterize(self, name, weight, dim):
    method apply (line 57) | def apply(module, name, dim, reparameterization=None, hook_child=True):
    method get_module_and_name (line 105) | def get_module_and_name(module, name):
    method get_params (line 123) | def get_params(self, module):
    method remove (line 127) | def remove(self, module):
    method __call__ (line 139) | def __call__(self, module, inputs):
    method backward_hook (line 147) | def backward_hook(self, module, grad_input, grad_output):

FILE: apex/apex/reparameterization/weight_norm.py
  function _norm (line 8) | def _norm(p, dim):
  class WeightNorm (line 22) | class WeightNorm(Reparameterization):
    method compute_weight (line 39) | def compute_weight(self, module=None, name=None):
    method reparameterize (line 62) | def reparameterize(self, name, weight, dim):

FILE: apex/csrc/amp_C_frontend.cpp
  function PYBIND11_MODULE (line 43) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {

FILE: apex/csrc/flatten_unflatten.cpp
  function flatten (line 5) | at::Tensor flatten(std::vector<at::Tensor> tensors)
  function unflatten (line 10) | std::vector<at::Tensor> unflatten(at::Tensor flat, std::vector<at::Tenso...
  function PYBIND11_MODULE (line 15) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {

FILE: apex/csrc/fused_adam_cuda.cpp
  function adam (line 11) | void adam(at::Tensor & p, at::Tensor & p_copy, at::Tensor & m, at::Tenso...
  function PYBIND11_MODULE (line 26) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {

FILE: apex/csrc/layer_norm_cuda.cpp
  function compute_n1_n2 (line 6) | void compute_n1_n2(
  function check_args (line 42) | void check_args(
  function check_args (line 82) | void check_args(
  function layer_norm (line 120) | std::vector<at::Tensor> layer_norm(
  function layer_norm_affine (line 138) | std::vector<at::Tensor> layer_norm_affine(
  function layer_norm_gradient (line 181) | at::Tensor layer_norm_gradient(
  function layer_norm_gradient_affine (line 204) | std::vector<at::Tensor> layer_norm_gradient_affine(
  function PYBIND11_MODULE (line 234) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {

FILE: apex/csrc/syncbn.cpp
  function PYBIND11_MODULE (line 85) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {

FILE: apex/docs/source/conf.py
  function patched_make_field (line 206) | def patched_make_field(self, types, domain, items, **kw):

FILE: apex/examples/imagenet/main_amp.py
  function fast_collate (line 77) | def fast_collate(batch):
  function main (line 108) | def main():
  class data_prefetcher (line 256) | class data_prefetcher():
    method __init__ (line 257) | def __init__(self, loader):
    method preload (line 268) | def preload(self):
    method next (line 299) | def next(self):
  function train (line 309) | def train(train_loader, model, criterion, optimizer, epoch):
  function validate (line 393) | def validate(val_loader, model, criterion):
  function save_checkpoint (line 455) | def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
  class AverageMeter (line 461) | class AverageMeter(object):
    method __init__ (line 463) | def __init__(self):
    method reset (line 466) | def reset(self):
    method update (line 472) | def update(self, val, n=1):
  function adjust_learning_rate (line 479) | def adjust_learning_rate(optimizer, epoch, step, len_epoch):
  function accuracy (line 499) | def accuracy(output, target, topk=(1,)):
  function reduce_tensor (line 515) | def reduce_tensor(tensor):

FILE: apex/setup.py
  function check_cuda_torch_binary_vs_bare_metal (line 36) | def check_cuda_torch_binary_vs_bare_metal(cuda_dir):

FILE: apex/tests/L0/run_amp/test_add_param_group.py
  class MyModel (line 16) | class MyModel(torch.nn.Module):
    method __init__ (line 17) | def __init__(self, unique):
    method ops (line 24) | def ops(input, weight0, weight1):
    method forward (line 27) | def forward(self, input):
  class TestAddParamGroup (line 34) | class TestAddParamGroup(unittest.TestCase):
    method setUp (line 35) | def setUp(self):
    method tearDown (line 39) | def tearDown(self):
    method zero_grad (line 42) | def zero_grad(self, models, optimizer, how_to_zero):
    method test_add_param_group (line 53) | def test_add_param_group(self):

FILE: apex/tests/L0/run_amp/test_basic_casts.py
  function run_layer_test (line 14) | def run_layer_test(test_case, fns, expected, input_shape, test_backward=...
  class TestBasicCasts (line 23) | class TestBasicCasts(unittest.TestCase):
    method setUp (line 24) | def setUp(self):
    method tearDown (line 28) | def tearDown(self):
    method test_linear_is_half (line 31) | def test_linear_is_half(self):
    method test_conv2d_is_half (line 36) | def test_conv2d_is_half(self):
    method test_softmax_is_float (line 41) | def test_softmax_is_float(self):
    method test_group_norm_is_float (line 46) | def test_group_norm_is_float(self):
    method test_mse_loss_is_float (line 50) | def test_mse_loss_is_float(self):
    method test_relu_is_match (line 58) | def test_relu_is_match(self):
    method test_batch_norm_is_match (line 61) | def test_batch_norm_is_match(self):
  class TestBannedMethods (line 74) | class TestBannedMethods(unittest.TestCase):
    method setUp (line 75) | def setUp(self):
    method tearDown (line 79) | def tearDown(self):
    method bce_common (line 82) | def bce_common(self, assertion):
    method test_bce_raises_by_default (line 92) | def test_bce_raises_by_default(self):
    method test_bce_is_float_with_allow_banned (line 96) | def test_bce_is_float_with_allow_banned(self):
  class TestTensorCasts (line 102) | class TestTensorCasts(unittest.TestCase):
    method setUp (line 103) | def setUp(self):
    method tearDown (line 107) | def tearDown(self):
    method test_matmul_method_is_half (line 110) | def test_matmul_method_is_half(self):
    method test_matmul_op_is_half (line 116) | def test_matmul_op_is_half(self):
    method test_pow_method_is_float (line 122) | def test_pow_method_is_float(self):
    method test_pow_op_is_float (line 126) | def test_pow_op_is_float(self):
    method test_cpu_is_float (line 130) | def test_cpu_is_float(self):
    method test_sum_is_float (line 136) | def test_sum_is_float(self):
  class TestDisabledCasts (line 140) | class TestDisabledCasts(unittest.TestCase):
    method setUp (line 141) | def setUp(self):
    method test_disabled_linear (line 145) | def test_disabled_linear(self):

FILE: apex/tests/L0/run_amp/test_cache.py
  function get_reference_grad (line 15) | def get_reference_grad(i, w, ops):
  class WhitelistModule (line 24) | class WhitelistModule(torch.nn.Module):
    method __init__ (line 25) | def __init__(self, dtype):
    method ops (line 30) | def ops(input, weight):
    method forward (line 33) | def forward(self, input):
  class BlacklistModule (line 37) | class BlacklistModule(torch.nn.Module):
    method __init__ (line 38) | def __init__(self, dtype):
    method ops (line 43) | def ops(input, weight):
    method forward (line 46) | def forward(self, input):
  class PromoteModule (line 50) | class PromoteModule(torch.nn.Module):
    method __init__ (line 51) | def __init__(self, dtype):
    method ops (line 56) | def ops(input, weight):
    method forward (line 59) | def forward(self, input):
  class TestCache (line 62) | class TestCache(unittest.TestCase):
    method setUp (line 63) | def setUp(self):
    method tearDown (line 67) | def tearDown(self):
    method train_eval_train_test (line 70) | def train_eval_train_test(self, module, t):
    method test_whitelist_module_fp16_weight (line 117) | def test_whitelist_module_fp16_weight(self):
    method test_whitelist_module_fp32_weight (line 120) | def test_whitelist_module_fp32_weight(self):
    method test_blacklist_module_fp16_weight (line 123) | def test_blacklist_module_fp16_weight(self):
    method test_blacklist_module_fp32_weight (line 126) | def test_blacklist_module_fp32_weight(self):
    method test_promote_module_fp16_weight (line 129) | def test_promote_module_fp16_weight(self):
    method test_promote_module_fp32_weight (line 132) | def test_promote_module_fp32_weight(self):

FILE: apex/tests/L0/run_amp/test_multi_tensor_axpby.py
  class TestMultiTensorAxpby (line 24) | class TestMultiTensorAxpby(unittest.TestCase):
    method setUp (line 26) | def setUp(self):
    method tearDown (line 36) | def tearDown(self):
    method axpby (line 40) | def axpby(self, sizea, sizeb, applier, repeat_tensors,
    method test_fuzz (line 88) | def test_fuzz(self):

FILE: apex/tests/L0/run_amp/test_multi_tensor_l2norm.py
  class TestMultiTensorL2Norm (line 24) | class TestMultiTensorL2Norm(unittest.TestCase):
    method setUp (line 26) | def setUp(self):
    method tearDown (line 31) | def tearDown(self):
    method l2norm (line 35) | def l2norm(self, sizea, sizeb, applier, repeat_tensors, in_type, per_t...
    method test_fuzz (line 59) | def test_fuzz(self):

FILE: apex/tests/L0/run_amp/test_multi_tensor_scale.py
  class TestMultiTensorScale (line 24) | class TestMultiTensorScale(unittest.TestCase):
    method setUp (line 26) | def setUp(self):
    method tearDown (line 32) | def tearDown(self):
    method downscale (line 36) | def downscale(self, sizea, sizeb, applier, repeat_tensors, in_type, ou...
    method find_inf (line 55) | def find_inf(self, sizea, sizeb, applier, repeat_tensors, in_type, out...
    method test_fuzz (line 88) | def test_fuzz(self):

FILE: apex/tests/L0/run_amp/test_multiple_models_optimizers_losses.py
  class MyModel (line 16) | class MyModel(torch.nn.Module):
    method __init__ (line 17) | def __init__(self, unique):
    method ops (line 24) | def ops(input, weight0, weight1):
    method forward (line 27) | def forward(self, input):
  class TestMultipleModelsOptimizersLosses (line 37) | class TestMultipleModelsOptimizersLosses(unittest.TestCase):
    method setUp (line 38) | def setUp(self):
    method tearDown (line 42) | def tearDown(self):
    method test_2models2losses1optimizer (line 45) | def test_2models2losses1optimizer(self):
    method test_3models2losses1optimizer (line 170) | def test_3models2losses1optimizer(self):
    method test_2models2losses2optimizers (line 326) | def test_2models2losses2optimizers(self):
    method test_3models2losses2optimizers (line 516) | def test_3models2losses2optimizers(self):

FILE: apex/tests/L0/run_amp/test_promotion.py
  class TestPromotion (line 12) | class TestPromotion(unittest.TestCase):
    method setUp (line 13) | def setUp(self):
    method tearDown (line 17) | def tearDown(self):
    method run_binary_promote_test (line 20) | def run_binary_promote_test(self, fns, input_shape, x_inplace=False):
    method test_atan2_matches_widest (line 42) | def test_atan2_matches_widest(self):
    method test_mul_matches_widest (line 47) | def test_mul_matches_widest(self):
    method test_cat_matches_widest (line 52) | def test_cat_matches_widest(self):
    method test_inplace_exp_is_error_for_half (line 62) | def test_inplace_exp_is_error_for_half(self):
    method test_inplace_add_matches_self (line 70) | def test_inplace_add_matches_self(self):

FILE: apex/tests/L0/run_amp/test_rnn.py
  class TestRnnCells (line 10) | class TestRnnCells(unittest.TestCase):
    method setUp (line 11) | def setUp(self):
    method tearDown (line 15) | def tearDown(self):
    method run_cell_test (line 18) | def run_cell_test(self, cell, state_tuple=False):
    method test_rnn_cell_is_half (line 42) | def test_rnn_cell_is_half(self):
    method test_gru_cell_is_half (line 46) | def test_gru_cell_is_half(self):
    method test_lstm_cell_is_half (line 50) | def test_lstm_cell_is_half(self):
  class TestRnns (line 54) | class TestRnns(unittest.TestCase):
    method setUp (line 55) | def setUp(self):
    method tearDown (line 59) | def tearDown(self):
    method run_rnn_test (line 62) | def run_rnn_test(self, rnn, layers, bidir, state_tuple=False):
    method test_rnn_is_half (line 76) | def test_rnn_is_half(self):
    method test_gru_is_half (line 83) | def test_gru_is_half(self):
    method test_lstm_is_half (line 90) | def test_lstm_is_half(self):
    method test_rnn_packed_sequence (line 97) | def test_rnn_packed_sequence(self):

FILE: apex/tests/L0/run_amp/utils.py
  function common_init (line 15) | def common_init(test_case):

FILE: apex/tests/L0/run_fp16util/test_fp16util.py
  class DummyBlock (line 9) | class DummyBlock(nn.Module):
    method __init__ (line 10) | def __init__(self):
    method forward (line 16) | def forward(self, x):
  class DummyNet (line 20) | class DummyNet(nn.Module):
    method __init__ (line 21) | def __init__(self):
    method forward (line 29) | def forward(self, x):
  class DummyNetWrapper (line 38) | class DummyNetWrapper(nn.Module):
    method __init__ (line 39) | def __init__(self):
    method forward (line 45) | def forward(self, x):
  class TestFP16Model (line 49) | class TestFP16Model(unittest.TestCase):
    method setUp (line 50) | def setUp(self):
    method test_params_and_buffers (line 59) | def test_params_and_buffers(self):
    method test_output_is_half (line 72) | def test_output_is_half(self):

FILE: apex/tests/L0/run_fused_layer_norm/test_fused_layer_norm.py
  class TestFusedLayerNorm (line 9) | class TestFusedLayerNorm(unittest.TestCase):
    method setUp (line 10) | def setUp(self):
    method forward_cpu (line 15) | def forward_cpu(self, input_):
    method forward_cuda (line 19) | def forward_cuda(self, input_):
    method test_forward_cuda (line 23) | def test_forward_cuda(self):
    method test_forward_cpu (line 27) | def test_forward_cpu(self):
    method test_same_output (line 31) | def test_same_output(self):
  class TestFusedLayerNormElemWise (line 37) | class TestFusedLayerNormElemWise(TestFusedLayerNorm):
    method setUp (line 38) | def setUp(self):

FILE: apex/tests/L0/run_mixed_adam/test_fp16_optimizer.py
  class TestFP16Optimizer (line 5) | class TestFP16Optimizer(unittest.TestCase):
    method setUp (line 6) | def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
    method get_max_diff (line 22) | def get_max_diff(self, ref_param, tst_param):
    method test_fp16_optimizer (line 33) | def test_fp16_optimizer(self):
    method test_loss_scaling (line 55) | def test_loss_scaling(self):
    method test_parameter_groups (line 76) | def test_parameter_groups(self):
    method test_grad_clip (line 99) | def test_grad_clip(self):
    method test_grad_None (line 121) | def test_grad_None(self):
    method test_weight_decay (line 125) | def test_weight_decay(self):
    method test_group_empty (line 129) | def test_group_empty(self):

FILE: apex/tests/L0/run_mixed_adam/test_mixed_adam.py
  class TestFusedAdam (line 8) | class TestFusedAdam(unittest.TestCase):
    method setUp (line 9) | def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
    method tearDown (line 15) | def tearDown(self):
    method gen_param_optim (line 18) | def gen_param_optim(self, tensors, adam_option):
    method gen_grad (line 30) | def gen_grad(self, ref_param, tst_param):
    method gen_mixed_grad (line 35) | def gen_mixed_grad(self, ref_param, tst_param, scale=1.0):
    method get_max_diff (line 42) | def get_max_diff(self, ref_param, tst_param):
    method gen_single_type_test (line 53) | def gen_single_type_test(self, param_type=torch.float):
    method test_double (line 71) | def test_double(self):
    method test_float (line 74) | def test_float(self):
    method test_half (line 77) | def test_half(self):
    method test_multi_params (line 95) | def test_multi_params(self):
    method test_scale (line 114) | def test_scale(self):
    method test_fp16_output (line 133) | def test_fp16_output(self):
    method test_adam_option (line 158) | def test_adam_option(self):

FILE: apex/tests/L1/common/main_amp.py
  function fast_collate (line 81) | def fast_collate(batch):
  function main (line 117) | def main():
  class data_prefetcher (line 263) | class data_prefetcher():
    method __init__ (line 264) | def __init__(self, loader):
    method preload (line 275) | def preload(self):
    method next (line 292) | def next(self):
  function train (line 300) | def train(train_loader, model, criterion, optimizer, epoch):
  function validate (line 398) | def validate(val_loader, model, criterion):
  function save_checkpoint (line 459) | def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
  class AverageMeter (line 465) | class AverageMeter(object):
    method __init__ (line 467) | def __init__(self):
    method reset (line 470) | def reset(self):
    method update (line 476) | def update(self, val, n=1):
  function adjust_learning_rate (line 483) | def adjust_learning_rate(optimizer, epoch, step, len_epoch):
  function accuracy (line 503) | def accuracy(output, target, topk=(1,)):
  function reduce_tensor (line 519) | def reduce_tensor(tensor):

FILE: apex/tests/distributed/DDP/ddp_race_condition_test.py
  class Model (line 28) | class Model(Module):
    method __init__ (line 29) | def __init__(self):
    method forward (line 33) | def forward(self, input):
  function info (line 57) | def info(name, param, val):

FILE: apex/tests/distributed/synced_batchnorm/single_gpu_unit_test.py
  function compare (line 12) | def compare(desc, inp1, inp2, error):

FILE: apex/tests/distributed/synced_batchnorm/test_groups.py
  function compare (line 9) | def compare(desc, inp1, inp2, error):

FILE: apex/tests/distributed/synced_batchnorm/two_gpu_unit_test.py
  function compare (line 9) | def compare(desc, inp1, inp2, error):

FILE: jukebox/align.py
  function get_alignment (line 15) | def get_alignment(x, zs, labels, prior, fp16, hps):
  function save_alignment (line 85) | def save_alignment(model, device, hps):
  function run (line 100) | def run(model, port=29500, **kwargs):

FILE: jukebox/data/artist_genre_processor.py
  function norm (line 10) | def norm(s):
  function create_reverse_lookup (line 15) | def create_reverse_lookup(atoi):
  class ArtistGenreProcessor (line 27) | class ArtistGenreProcessor():
    method __init__ (line 28) | def __init__(self, v3=False):
    method get_artist_id (line 40) | def get_artist_id(self, artist):
    method get_genre_ids (line 51) | def get_genre_ids(self, genre):
    method get_artist (line 64) | def get_artist(self, artist_id):
    method get_genre (line 67) | def get_genre(self, genre_ids):
    method load_artists (line 75) | def load_artists(self):
    method load_genres (line 84) | def load_genres(self):

FILE: jukebox/data/data_processor.py
  class OffsetDataset (line 9) | class OffsetDataset(Dataset):
    method __init__ (line 10) | def __init__(self, dataset, start, end, test=False):
    method __len__ (line 18) | def __len__(self):
    method __getitem__ (line 21) | def __getitem__(self, item):
  class DataProcessor (line 24) | class DataProcessor():
    method __init__ (line 25) | def __init__(self, hps):
    method set_epoch (line 34) | def set_epoch(self, epoch):
    method create_datasets (line 38) | def create_datasets(self, hps):
    method create_samplers (line 43) | def create_samplers(self, hps):
    method create_data_loaders (line 51) | def create_data_loaders(self, hps):
    method print_stats (line 66) | def print_stats(self, hps):

FILE: jukebox/data/files_dataset.py
  class FilesAudioDataset (line 10) | class FilesAudioDataset(Dataset):
    method __init__ (line 11) | def __init__(self, hps):
    method filter (line 23) | def filter(self, files, durations):
    method init_dataset (line 38) | def init_dataset(self, hps):
    method get_index_offset (line 49) | def get_index_offset(self, item):
    method get_metadata (line 67) | def get_metadata(self, filename, test):
    method get_song_chunk (line 80) | def get_song_chunk(self, index, offset, test=False):
    method get_item (line 91) | def get_item(self, item, test=False):
    method __len__ (line 95) | def __len__(self):
    method __getitem__ (line 98) | def __getitem__(self, item):

FILE: jukebox/data/labels.py
  function get_relevant_lyric_tokens (line 7) | def get_relevant_lyric_tokens(full_tokens, n_tokens, total_length, offse...
  class EmptyLabeller (line 22) | class EmptyLabeller():
    method get_label (line 23) | def get_label(self, artist=None, genre=None, lyrics=None, total_length...
    method get_batch_labels (line 28) | def get_batch_labels(self, metas, device='cpu'):
  class Labeller (line 41) | class Labeller():
    method __init__ (line 42) | def __init__(self, max_genre_words, n_tokens, sample_length, v3=False):
    method get_label (line 50) | def get_label(self, artist, genre, lyrics, total_length, offset):
    method get_y_from_ids (line 65) | def get_y_from_ids(self, artist_id, genre_ids, lyric_tokens, total_len...
    method get_batch_labels (line 76) | def get_batch_labels(self, metas, device='cpu'):
    method set_y_lyric_tokens (line 89) | def set_y_lyric_tokens(self, ys, labels):
    method describe_label (line 107) | def describe_label(self, y):

FILE: jukebox/data/text_processor.py
  class TextProcessor (line 4) | class TextProcessor():
    method __init__ (line 5) | def __init__(self, v3=False):
    method clean (line 19) | def clean(self, text):
    method tokenise (line 25) | def tokenise(self, text):
    method textise (line 28) | def textise(self, tokens):
    method characterise (line 31) | def characterise(self, tokens):

FILE: jukebox/hparams.py
  class Hyperparams (line 4) | class Hyperparams(dict):
    method __getattr__ (line 5) | def __getattr__(self, attr):
    method __setattr__ (line 8) | def __setattr__(self, attr, value):
  function setup_hparams (line 11) | def setup_hparams(hparam_set_names, kwargs):

FILE: jukebox/make_models.py
  function load_checkpoint (line 24) | def load_checkpoint(path):
  function save_checkpoint (line 41) | def save_checkpoint(logger, name, model, opt, metrics, hps):
  function restore_model (line 52) | def restore_model(hps, model, checkpoint_path):
  function restore_opt (line 64) | def restore_opt(opt, shd, checkpoint_path):
  function make_vqvae (line 73) | def make_vqvae(hps, device='cuda'):
  function make_prior (line 112) | def make_prior(hps, vqvae, device='cuda'):
  function make_model (line 189) | def make_model(model, device, hps, levels=None):
  function save_outputs (line 198) | def save_outputs(model, device, hps):
  function run (line 245) | def run(model, port=29500, **kwargs):

FILE: jukebox/prior/autoregressive.py
  function get_normal (line 11) | def get_normal(*shape, std=0.01):
  function roll (line 16) | def roll(x, n):
  function split_chunks (line 19) | def split_chunks(length, chunk_size):
  class PositionEmbedding (line 25) | class PositionEmbedding(nn.Module):
    method __init__ (line 26) | def __init__(self, input_shape, width, init_scale=1.0, pos_init=False):
    method forward (line 41) | def forward(self):
  class ConditionalAutoregressive2D (line 48) | class ConditionalAutoregressive2D(nn.Module):
    method __init__ (line 49) | def __init__(self, input_shape, bins,
    method preprocess (line 101) | def preprocess(self, x):
    method postprocess (line 107) | def postprocess(self, x, sample_tokens=None):
    method forward (line 116) | def forward(self, x, x_cond=None, y_cond=None, encoder_kv=None, fp16=F...
    method get_emb (line 177) | def get_emb(self, sample_t, n_samples, x, x_cond, y_cond):
    method sample (line 199) | def sample(self, n_samples, x_cond=None, y_cond=None, encoder_kv=None,...
    method primed_sample (line 251) | def primed_sample(self, n_samples, x, x_cond=None, y_cond=None, encode...
    method check_sample (line 361) | def check_sample(self, chunk_size):
  function test_prior (line 391) | def test_prior(input_shape, encoder_dims, blocks, heads, chunk_size):

FILE: jukebox/prior/conditioners.py
  class Conditioner (line 8) | class Conditioner(nn.Module):
    method __init__ (line 9) | def __init__(self, input_shape, bins, down_t, stride_t, out_width, ini...
    method preprocess (line 22) | def preprocess(self, x):
    method postprocess (line 26) | def postprocess(self, x):
    method forward (line 30) | def forward(self, x, x_cond=None):
  function flip (line 50) | def flip(x):
  class SimpleEmbedding (line 57) | class SimpleEmbedding(nn.Module):
    method __init__ (line 58) | def __init__(self, bins, out_width, init_scale):
    method forward (line 64) | def forward(self, y):
  class RangeEmbedding (line 70) | class RangeEmbedding(nn.Module):
    method __init__ (line 79) | def __init__(self, n_time, bins, range, out_width, init_scale, clamp=F...
    method forward (line 88) | def forward(self, pos_start, pos_end=None):
  class LabelConditioner (line 113) | class LabelConditioner(nn.Module):
    method __init__ (line 114) | def __init__(self, y_bins, t_bins, sr, min_duration, max_duration, n_t...
    method forward (line 134) | def forward(self, y):

FILE: jukebox/prior/prior.py
  class SimplePrior (line 27) | class SimplePrior(nn.Module):
    method __init__ (line 28) | def __init__(self, z_shapes, l_bins, encoder, decoder, level,
    method get_y (line 140) | def get_y(self, labels, start, get_indices=False):
    method get_z_conds (line 158) | def get_z_conds(self, zs, start, end):
    method prior_preprocess (line 168) | def prior_preprocess(self, xs, conds):
    method prior_postprocess (line 187) | def prior_postprocess(self, z):
    method x_emb (line 205) | def x_emb(self, z_conds):
    method encode (line 213) | def encode(self, x, start_level=None, end_level=None, bs_chunks=1):
    method decode (line 223) | def decode(self, zs, start_level=None, end_level=None, bs_chunks=1):
    method get_cond (line 234) | def get_cond(self, z_conds, y):
    method sample (line 245) | def sample(self, n_samples, z=None, z_conds=None, y=None, fp16=False, ...
    method get_encoder_kv (line 285) | def get_encoder_kv(self, prime, fp16=False, sample=False):
    method get_prime_loss (line 303) | def get_prime_loss(self, encoder_kv, prime_t):
    method z_forward (line 312) | def z_forward(self, z, z_conds=[], y=None, fp16=False, get_preds=False...
    method forward (line 346) | def forward(self, x, y=None, fp16=False, decode=False, get_preds=False):

FILE: jukebox/sample.py
  function sample_partial_window (line 17) | def sample_partial_window(zs, labels, sampling_kwargs, level, prior, tok...
  function sample_single_window (line 31) | def sample_single_window(zs, labels, sampling_kwargs, level, prior, star...
  function sample_level (line 81) | def sample_level(zs, labels, sampling_kwargs, level, prior, total_length...
  function _sample (line 91) | def _sample(zs, labels, sampling_kwargs, priors, sample_levels, hps):
  function ancestral_sample (line 124) | def ancestral_sample(labels, sampling_kwargs, priors, hps):
  function continue_sample (line 131) | def continue_sample(zs, labels, sampling_kwargs, priors, hps):
  function upsample (line 137) | def upsample(zs, labels, sampling_kwargs, priors, hps):
  function primed_sample (line 143) | def primed_sample(x, labels, sampling_kwargs, priors, hps):
  function load_prompts (line 150) | def load_prompts(audio_files, duration, hps):
  function load_codes (line 164) | def load_codes(codes_file, duration, priors, hps):
  function save_samples (line 178) | def save_samples(model, device, hps, sample_hps):
  function run (line 269) | def run(model, mode='ancestral', codes_file=None, audio_file=None, promp...

FILE: jukebox/save_html.py
  function save_html (line 7) | def save_html(logdir, x, zs, labels, alignments, hps):
  function _save_item_html (line 28) | def _save_item_html(item_dir, item_id, item_name, data):

FILE: jukebox/tests/test_sample.py
  function repeat (line 7) | def repeat(x, n, dim):
  class DummyPrior (line 13) | class DummyPrior:
    method __init__ (line 14) | def __init__(self, n_ctx, level, levels):
    method get_y (line 25) | def get_y(self, labels, start):
    method get_z_conds (line 33) | def get_z_conds(self, zs, start, end):
    method ancestral_sample (line 43) | def ancestral_sample(self, n_samples, z_conds=None, y=None):
    method primed_sample (line 53) | def primed_sample(self, n_samples, z, z_conds=None, y=None):
  function _sample (line 67) | def _sample(zs, labels,  priors, sample_levels, hps):
  function test_ancestral_sample (line 77) | def test_ancestral_sample(labels, priors, hps):
  function test_primed_sample (line 90) | def test_primed_sample(labels, priors, hps):
  function check_sample (line 120) | def check_sample():

FILE: jukebox/train.py
  function prepare_aud (line 24) | def prepare_aud(x, hps):
  function log_aud (line 28) | def log_aud(logger, tag, x, hps):
  function log_labels (line 32) | def log_labels(logger, labeller, tag, y, hps):
  function get_ddp (line 42) | def get_ddp(model, hps):
  function get_ema (line 48) | def get_ema(model, hps):
  function get_lr_scheduler (line 62) | def get_lr_scheduler(opt, hps):
  function get_optimizer (line 78) | def get_optimizer(model, hps):
  function log_inputs (line 103) | def log_inputs(orig_model, logger, x_in, y, x_out, hps, tag="train"):
  function sample_prior (line 118) | def sample_prior(orig_model, ema, logger, x_in, y, hps):
  function evaluate (line 153) | def evaluate(model, orig_model, logger, metrics, data_processor, hps):
  function train (line 203) | def train(model, orig_model, opt, shd, scalar, ema, logger, metrics, dat...
  function run (line 294) | def run(hps="teeny", port=29500, **kwargs):

FILE: jukebox/transformer/factored_attention.py
  function repeat (line 10) | def repeat(x, n, dim):
  function get_mask (line 15) | def get_mask(mask, q_l, kv_l, blocks, spread, device, sample, sample_t):
  class FactoredAttention (line 30) | class FactoredAttention(nn.Module):
    method __init__ (line 31) | def __init__(self, n_in, n_ctx, n_state, n_head,
    method _attn (line 82) | def _attn(self, q, k, v, sample):
    method merge_heads (line 110) | def merge_heads(self, x):
    method split_heads (line 115) | def split_heads(self, x, k=False):
    method dense_attn (line 123) | def dense_attn(self, query, key, value, sample):
    method block_attn (line 135) | def block_attn(self, q, k, v, sample):
    method transpose_block_attn (line 152) | def transpose_block_attn(self, q, k, v, sample):
    method prev_block_attn (line 167) | def prev_block_attn(self, q, k, v, sample):
    method summary_attn (line 195) | def summary_attn(self, q, k, v, sample):
    method summary_spread_attn (line 207) | def summary_spread_attn(self, q, k, v, sample):
    method prime_attn (line 220) | def prime_attn(self, q, k, v, sample):
    method decode_attn (line 226) | def decode_attn(self, q, k, v, sample):
    method factored_qkv (line 230) | def factored_qkv(self, x, encoder_kv=None, sample=False):
    method prime_qkv (line 255) | def prime_qkv(self, x, encoder_kv=None, sample=False):
    method decode_qkv (line 273) | def decode_qkv(self, x, encoder_kv=None, sample=False):
    method forward (line 289) | def forward(self, x, encoder_kv=None, sample=False):
    method _prime_len (line 304) | def _prime_len(self):
    method _offset (line 310) | def _offset(self, curr_ctx):
    method _pad_to_block_ctx (line 315) | def _pad_to_block_ctx(self, x, query=False):
    method _cache_len (line 325) | def _cache_len(self):
    method _suff_cache_len (line 328) | def _suff_cache_len(self):
    method _slice_cache (line 355) | def _slice_cache(self, start, end=None):
    method _append_cache (line 359) | def _append_cache(self, key, value):
    method del_cache (line 375) | def del_cache(self):
    method check (line 383) | def check(self):
    method check_cache (line 412) | def check_cache(self, n_samples, sample_t, fp16):
    method check_sample (line 424) | def check_sample(self):
    method check_chunks (line 457) | def check_chunks(self, chunk_size):

FILE: jukebox/transformer/ops.py
  class LayerNorm (line 14) | class LayerNorm(FusedLayerNorm):
    method __init__ (line 15) | def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
    method forward (line 20) | def forward(self, input):
  function gelu (line 26) | def gelu(x):
  function swish (line 30) | def swish(x):
  function quick_gelu (line 34) | def quick_gelu(x):
  function quick_gelu_bwd (line 38) | def quick_gelu_bwd(x, grad_output):
  class QuickGelu (line 42) | class QuickGelu(t.autograd.Function):
    method forward (line 44) | def forward(ctx, x):
    method backward (line 49) | def backward(ctx, grad_output):
  function memory_efficient_quick_gelu (line 52) | def memory_efficient_quick_gelu(x):
  function _move_to_gpu_and_convert_conv_weights_to_fp16 (line 62) | def _move_to_gpu_and_convert_conv_weights_to_fp16(l):
  function _convert_conv_weights_to_fp32 (line 67) | def _convert_conv_weights_to_fp32(l):
  function _convert_conv_weights_to_fp16 (line 71) | def _convert_conv_weights_to_fp16(l):
  function _convert_embedding_weights_to_fp16 (line 75) | def _convert_embedding_weights_to_fp16(l):
  function _convert_embedding_weights_to_fp32 (line 79) | def _convert_embedding_weights_to_fp32(l):
  class Conv1D (line 83) | class Conv1D(nn.Module):
    method __init__ (line 84) | def __init__(self, n_in, n_out, zero_out=False, init_scale=1.0):
    method forward (line 97) | def forward(self, x):
  class Mask (line 104) | class Mask(nn.Module):
    method __init__ (line 105) | def __init__(self, n_ctx):
    method forward (line 109) | def forward(self, w):
  function filter_logits (line 113) | def filter_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):

FILE: jukebox/transformer/transformer.py
  function _convert_mlp_traced (line 11) | def _convert_mlp_traced(l):
  function _convert_mlp_traced_fp16 (line 15) | def _convert_mlp_traced_fp16(l):
  class MLP (line 19) | class MLP(nn.Module):
    method __init__ (line 20) | def __init__(self, n_in, n_state, resid_dropout=0.0, afn='quick_gelu',...
    method forward (line 27) | def forward(self, x):
  class ResAttnBlock (line 32) | class ResAttnBlock(nn.Module):
    method __init__ (line 33) | def __init__(self, n_in, n_ctx, n_head,
    method forward (line 62) | def forward(self, x, encoder_kv, sample=False):
  class Transformer (line 88) | class Transformer(nn.Module):
    method __init__ (line 89) | def __init__(self, n_in, n_ctx, n_head, n_depth,
    method set_record_attn (line 146) | def set_record_attn(self, record_attn):
    method forward (line 169) | def forward(self, x, encoder_kv=None, sample=False, fp16=False, fp16_o...
    method check_cache (line 194) | def check_cache(self, n_samples, sample_t, fp16):
    method del_cache (line 198) | def del_cache(self):
    method check_sample (line 202) | def check_sample(self):

FILE: jukebox/utils/audio_utils.py
  class DefaultSTFTValues (line 8) | class DefaultSTFTValues:
    method __init__ (line 9) | def __init__(self, hps):
  class STFTValues (line 15) | class STFTValues:
    method __init__ (line 16) | def __init__(self, hps, n_fft, hop_length, window_size):
  function calculate_bandwidth (line 22) | def calculate_bandwidth(dataset, hps, duration=600):
  function audio_preprocess (line 58) | def audio_preprocess(x, hps):
  function audio_postprocess (line 79) | def audio_postprocess(x, hps):
  function stft (line 82) | def stft(sig, hps):
  function spec (line 85) | def spec(x, hps):
  function norm (line 88) | def norm(x):
  function squeeze (line 91) | def squeeze(x):
  function spectral_loss (line 99) | def spectral_loss(x_in, x_out, hps):
  function multispectral_loss (line 105) | def multispectral_loss(x_in, x_out, hps):
  function spectral_convergence (line 118) | def spectral_convergence(x_in, x_out, hps, epsilon=2e-3):
  function log_magnitude_loss (line 128) | def log_magnitude_loss(x_in, x_out, hps, epsilon=1e-4):
  function load_audio (line 134) | def load_audio(file, sr, offset, duration, mono=False):
  function save_wav (line 142) | def save_wav(fname, aud, sr):

FILE: jukebox/utils/checkpoint.py
  function checkpoint (line 4) | def checkpoint(func, inputs, params, flag):
  class CheckpointFunction (line 11) | class CheckpointFunction(t.autograd.Function):
    method forward (line 13) | def forward(ctx, run_function, length, *args):
    method backward (line 22) | def backward(ctx, *output_grads):

FILE: jukebox/utils/dist_adapter.py
  class ReduceOp (line 4) | class ReduceOp(Enum):
    method ToDistOp (line 10) | def ToDistOp(self):
  function is_available (line 18) | def is_available():
  function get_rank (line 21) | def get_rank():
  function get_world_size (line 27) | def get_world_size():
  function barrier (line 33) | def barrier():
  function all_gather (line 38) | def all_gather(tensor_list, tensor):
  function all_reduce (line 44) | def all_reduce(tensor, op=ReduceOp.SUM):
  function reduce (line 49) | def reduce(tensor, dst, op=ReduceOp.SUM):
  function broadcast (line 54) | def broadcast(tensor, src):
  function init_process_group (line 59) | def init_process_group(backend, init_method):
  function _get_rank (line 64) | def _get_rank():
  function _barrier (line 67) | def _barrier():
  function _get_world_size (line 70) | def _get_world_size():
  function _all_gather (line 73) | def _all_gather(tensor_list, tensor):
  function _all_reduce (line 76) | def _all_reduce(tensor, op):
  function _reduce (line 79) | def _reduce(tensor, dst, op):
  function _broadcast (line 82) | def _broadcast(tensor, src):
  function _init_process_group (line 85) | def _init_process_group(backend, init_method):

FILE: jukebox/utils/dist_utils.py
  function print_once (line 6) | def print_once(msg):
  function print_all (line 10) | def print_all(msg):
  function allgather (line 16) | def allgather(x):
  function allreduce (line 22) | def allreduce(x, op=dist.ReduceOp.SUM):
  function allgather_lists (line 27) | def allgather_lists(xs):
  function setup_dist_from_mpi (line 42) | def setup_dist_from_mpi(
  function _setup_dist_from_mpi (line 59) | def _setup_dist_from_mpi(master_addr, backend, port, n_attempts, verbose):

FILE: jukebox/utils/ema.py
  class EMA (line 6) | class EMA:
    method __init__ (line 7) | def __init__(self, params, mu=0.999):
    method get_model_state (line 11) | def get_model_state(self, p):
    method step (line 14) | def step(self):
    method swap (line 18) | def swap(self):
  class CPUEMA (line 26) | class CPUEMA:
    method __init__ (line 27) | def __init__(self, params, mu=0.999, freq=1):
    method get_model_state (line 33) | def get_model_state(self, p):
    method step (line 38) | def step(self):
    method swap (line 48) | def swap(self):
  class FusedEMA (line 56) | class FusedEMA:
    method __init__ (line 57) | def __init__(self, params, mu=0.999):
    method get_model_state (line 68) | def get_model_state(self, group):
    method step (line 76) | def step(self):
    method swap (line 80) | def swap(self):

FILE: jukebox/utils/fp16.py
  function adam_step (line 12) | def adam_step(p: torch.Tensor, out_p: torch.Tensor, exp_avg: torch.Tenso...
  function backward (line 39) | def backward(loss, params, scalar, fp16, logger):
  class LossScalar (line 68) | class LossScalar(object):
    method __init__ (line 69) | def __init__(self,
    method get_scale (line 87) | def get_scale(self):
    method update_scale (line 90) | def update_scale(self, overflow):
  function check_overflow (line 101) | def check_overflow(val):
  function grad_norm (line 104) | def grad_norm(params, scale, flat=False):
  function clipped_grad_scale (line 122) | def clipped_grad_scale(grad_norm, max_grad_norm, scale):
  class FP16FusedAdam (line 128) | class FP16FusedAdam(Optimizer):
    method __init__ (line 129) | def __init__(
    method init_state (line 150) | def init_state(self):
    method step (line 165) | def step(self, closure=None, scale=1.0):
  class FusedAdam (line 229) | class FusedAdam(Optimizer):
    method __init__ (line 230) | def __init__(
    method step (line 249) | def step(self, closure=None, scale=1.0):

FILE: jukebox/utils/io.py
  function get_duration_sec (line 6) | def get_duration_sec(file, cache=False):
  function load_audio (line 20) | def load_audio(file, sr, offset, duration, resample=True, approx=False, ...
  function test_simple_loader (line 59) | def test_simple_loader():
  function test_dataset_loader (line 87) | def test_dataset_loader():

FILE: jukebox/utils/logger.py
  function def_tqdm (line 8) | def def_tqdm(x):
  function get_range (line 11) | def get_range(x):
  function init_logging (line 17) | def init_logging(hps, local_rank, rank):
  function get_name (line 30) | def get_name(hps):
  function average_metrics (line 36) | def average_metrics(_metrics):
  class Metrics (line 45) | class Metrics:
    method __init__ (line 46) | def __init__(self):
    method update (line 50) | def update(self, tag, val, batch):
    method avg (line 63) | def avg(self, tag):
    method reset (line 69) | def reset(self):
  class Logger (line 73) | class Logger:
    method __init__ (line 74) | def __init__(self, logdir, rank):
    method step (line 83) | def step(self):
    method flush (line 86) | def flush(self):
    method add_text (line 90) | def add_text(self, tag, text):
    method add_audios (line 94) | def add_audios(self, tag, auds, sample_rate=22050, max_len=None, max_l...
    method add_audio (line 102) | def add_audio(self, tag, aud, sample_rate=22050):
    method add_images (line 106) | def add_images(self, tag, img, dataformats="NHWC"):
    method add_image (line 110) | def add_image(self, tag, img):
    method add_scalar (line 114) | def add_scalar(self, tag, val):
    method get_range (line 118) | def get_range(self, loader):
    method close_range (line 125) | def close_range(self):
    method set_postfix (line 129) | def set_postfix(self, *args, **kwargs):
    method add_reduce_scalar (line 134) | def add_reduce_scalar(self, tag, layer, val):
    method finish_reduce (line 141) | def finish_reduce(self):

FILE: jukebox/utils/remote_utils.py
  function download (line 4) | def download(remote_path, local_path, async_download=False):
  function gs_download (line 13) | def gs_download(gs_path, local_path, async_download=False):
  function gs_upload (line 24) | def gs_upload(local_path, gs_path, async_upload=False):
  function ls (line 37) | def ls(regex):

FILE: jukebox/utils/sample_utils.py
  function split_batch (line 3) | def split_batch(obj, n_samples, split_size):
  function get_starts (line 15) | def get_starts(total_length, n_ctx, hop_length):

FILE: jukebox/utils/torch_utils.py
  function freeze_model (line 4) | def freeze_model(model):
  function unfreeze_model (line 10) | def unfreeze_model(model):
  function zero_grad (line 15) | def zero_grad(model):
  function empty_cache (line 20) | def empty_cache():
  function assert_shape (line 24) | def assert_shape(x, exp_shape):
  function count_parameters (line 27) | def count_parameters(model):
  function count_state (line 30) | def count_state(model):

FILE: jukebox/vqvae/bottleneck.py
  class BottleneckBlock (line 7) | class BottleneckBlock(nn.Module):
    method __init__ (line 8) | def __init__(self, k_bins, emb_width, mu):
    method reset_k (line 16) | def reset_k(self):
    method _tile (line 22) | def _tile(self, x):
    method init_k (line 31) | def init_k(self, x):
    method restore_k (line 43) | def restore_k(self, num_tokens=None, threshold=1.0):
    method update_k (line 55) | def update_k(self, x, x_l):
    method preprocess (line 88) | def preprocess(self, x):
    method postprocess (line 105) | def postprocess(self, x_l, x_d, x_shape):
    method quantise (line 112) | def quantise(self, x):
    method dequantise (line 121) | def dequantise(self, x_l):
    method encode (line 125) | def encode(self, x):
    method decode (line 138) | def decode(self, x_l):
    method forward (line 149) | def forward(self, x, update_k=True):
  class Bottleneck (line 182) | class Bottleneck(nn.Module):
    method __init__ (line 183) | def __init__(self, l_bins, emb_width, mu, levels):
    method encode (line 191) | def encode(self, xs):
    method decode (line 195) | def decode(self, zs, start_level=0, end_level=None):
    method forward (line 201) | def forward(self, xs):
  class NoBottleneckBlock (line 218) | class NoBottleneckBlock(nn.Module):
    method restore_k (line 219) | def restore_k(self):
  class NoBottleneck (line 222) | class NoBottleneck(nn.Module):
    method __init__ (line 223) | def __init__(self, levels):
    method encode (line 230) | def encode(self, xs):
    method decode (line 233) | def decode(self, zs, start_level=0, end_level=None):
    method forward (line 238) | def forward(self, xs):

FILE: jukebox/vqvae/encdec.py
  class EncoderConvBlock (line 6) | class EncoderConvBlock(nn.Module):
    method __init__ (line 7) | def __init__(self, input_emb_width, output_emb_width, down_t,
    method forward (line 25) | def forward(self, x):
  class DecoderConvBock (line 28) | class DecoderConvBock(nn.Module):
    method __init__ (line 29) | def __init__(self, input_emb_width, output_emb_width, down_t,
    method forward (line 45) | def forward(self, x):
  class Encoder (line 48) | class Encoder(nn.Module):
    method __init__ (line 49) | def __init__(self, input_emb_width, output_emb_width, levels, downs_t,
    method forward (line 70) | def forward(self, x):
  class Decoder (line 87) | class Decoder(nn.Module):
    method __init__ (line 88) | def __init__(self, input_emb_width, output_emb_width, levels, downs_t,
    method forward (line 110) | def forward(self, xs, all_levels=True):

FILE: jukebox/vqvae/resnet.py
  class ResConvBlock (line 6) | class ResConvBlock(nn.Module):
    method __init__ (line 7) | def __init__(self, n_in, n_state):
    method forward (line 16) | def forward(self, x):
  class Resnet (line 19) | class Resnet(nn.Module):
    method __init__ (line 20) | def __init__(self, n_in, n_depth, m_conv=1.0):
    method forward (line 24) | def forward(self, x):
  class ResConv1DBlock (line 27) | class ResConv1DBlock(nn.Module):
    method __init__ (line 28) | def __init__(self, n_in, n_state, dilation=1, zero_out=False, res_scal...
    method forward (line 43) | def forward(self, x):
  class Resnet1D (line 46) | class Resnet1D(nn.Module):
    method __init__ (line 47) | def __init__(self, n_in, n_depth, m_conv=1.0, dilation_growth_rate=1, ...
    method forward (line 69) | def forward(self, x):

FILE: jukebox/vqvae/vqvae.py
  function dont_update (line 10) | def dont_update(params):
  function update (line 14) | def update(params):
  function calculate_strides (line 18) | def calculate_strides(strides, downs):
  function _loss_fn (line 21) | def _loss_fn(loss_fn, x_target, x_pred, hps):
  class VQVAE (line 42) | class VQVAE(nn.Module):
    method __init__ (line 43) | def __init__(self, input_shape, levels, downs_t, strides_t,
    method preprocess (line 90) | def preprocess(self, x):
    method postprocess (line 96) | def postprocess(self, x):
    method _decode (line 101) | def _decode(self, zs, start_level=0, end_level=None):
    method decode (line 115) | def decode(self, zs, start_level=0, end_level=None, bs_chunks=1):
    method _encode (line 124) | def _encode(self, x, start_level=0, end_level=None):
    method encode (line 137) | def encode(self, x, start_level=0, end_level=None, bs_chunks=1):
    method sample (line 146) | def sample(self, n_samples):
    method forward (line 150) | def forward(self, x, hps, loss_fn='l1'):

FILE: tensorboardX/examples/chainer/extension_logger/net.py
  function add_noise (line 13) | def add_noise(h, sigma=0.2):
  class Generator (line 21) | class Generator(chainer.Chain):
    method __init__ (line 23) | def __init__(self, n_hidden, bottom_width=4, ch=512, wscale=0.02):
    method make_hidden (line 42) | def make_hidden(self, batchsize):
    method __call__ (line 46) | def __call__(self, z):
  class Discriminator (line 56) | class Discriminator(chainer.Chain):
    method __init__ (line 58) | def __init__(self, bottom_width=4, ch=512, wscale=0.02):
    method __call__ (line 77) | def __call__(self, x):

FILE: tensorboardX/examples/chainer/extension_logger/train_dcgan.py
  function main (line 19) | def main():

FILE: tensorboardX/examples/chainer/extension_logger/updater.py
  class DCGANUpdater (line 10) | class DCGANUpdater(chainer.training.StandardUpdater):
    method __init__ (line 12) | def __init__(self, *args, **kwargs):
    method loss_dis (line 16) | def loss_dis(self, dis, y_fake, y_real):
    method loss_gen (line 24) | def loss_gen(self, gen, y_fake):
    method update_core (line 30) | def update_core(self):

FILE: tensorboardX/examples/chainer/extension_logger/visualize.py
  function out_generated_image (line 13) | def out_generated_image(gen, dis, rows, cols, seed, dst, writer):

FILE: tensorboardX/examples/chainer/extension_logger/writetensorboard.py
  class LogTensorboard (line 13) | class LogTensorboard(extension.Extension):
    method __init__ (line 56) | def __init__(self, keys=None, trigger=(1, 'epoch'), postprocess=None,
    method __call__ (line 66) | def __call__(self, trainer):
    method log (line 111) | def log(self):
    method serialize (line 115) | def serialize(self, serializer):
    method _init_summary (line 128) | def _init_summary(self):

FILE: tensorboardX/examples/chainer/plain_logger/data.py
  function load_mnist (line 18) | def load_mnist(images, labels, num):
  function download_mnist_data (line 34) | def download_mnist_data():
  function load_mnist_data (line 64) | def load_mnist_data():

FILE: tensorboardX/examples/chainer/plain_logger/net.py
  class VAE (line 9) | class VAE(chainer.Chain):
    method __init__ (line 12) | def __init__(self, n_in, n_latent, n_h):
    method __call__ (line 23) | def __call__(self, x, sigmoid=True):
    method encode (line 27) | def encode(self, x):
    method decode (line 33) | def decode(self, z, sigmoid=True):
    method get_loss_func (line 41) | def get_loss_func(self, C=1.0, k=1):

FILE: tensorboardX/examples/chainer/plain_logger/train_vae.py
  function save_images (line 148) | def save_images(x, filename):

FILE: tensorboardX/examples/demo_beholder.py
  function beholder_pytorch (line 35) | def beholder_pytorch():

FILE: tensorboardX/examples/demo_caffe2.py
  function DownloadResource (line 28) | def DownloadResource(url, path):
  function AddInput (line 88) | def AddInput(model, batch_size, db, db_type):
  function AddLeNetModel (line 102) | def AddLeNetModel(model, data):
  function AddAccuracy (line 130) | def AddAccuracy(model, softmax, label):
  function AddTrainingOperators (line 136) | def AddTrainingOperators(model, softmax, label):
  function AddBookkeepingOperators (line 163) | def AddBookkeepingOperators(model):

FILE: tensorboardX/examples/demo_embedding.py
  class M (line 14) | class M(nn.Module):
    method __init__ (line 15) | def __init__(self):
    method forward (line 21) | def forward(self, i):
  function get_data (line 36) | def get_data(value, shape):

FILE: tensorboardX/examples/demo_graph.py
  class LinearInLinear (line 11) | class LinearInLinear(nn.Module):
    method __init__ (line 12) | def __init__(self):
    method forward (line 16) | def forward(self, x):
  class MultipleInput (line 23) | class MultipleInput(nn.Module):
    method __init__ (line 24) | def __init__(self):
    method forward (line 29) | def forward(self, x, y):
  class MultipleOutput (line 35) | class MultipleOutput(nn.Module):
    method __init__ (line 36) | def __init__(self):
    method forward (line 41) | def forward(self, x):
  class MultipleOutput_shared (line 48) | class MultipleOutput_shared(nn.Module):
    method __init__ (line 49) | def __init__(self):
    method forward (line 53) | def forward(self, x):
  class SimpleModel (line 60) | class SimpleModel(nn.Module):
    method __init__ (line 61) | def __init__(self):
    method forward (line 64) | def forward(self, x):
  function conv3x3 (line 75) | def conv3x3(in_planes, out_planes, stride=1):
  class BasicBlock (line 81) | class BasicBlock(nn.Module):
    method __init__ (line 84) | def __init__(self, inplanes, planes, stride=1, downsample=None):
    method forward (line 93) | def forward(self, x):
  class Net1 (line 115) | class Net1(nn.Module):
    method __init__ (line 116) | def __init__(self):
    method forward (line 125) | def forward(self, x):
  class Net2 (line 138) | class Net2(nn.Module):
    method __init__ (line 139) | def __init__(self):
    method forward (line 147) | def forward(self, x):
  class SiameseNetwork (line 169) | class SiameseNetwork(nn.Module):
    method __init__ (line 170) | def __init__(self):
    method forward_once (line 174) | def forward_once(self, x):
    method forward (line 178) | def forward(self, input1, input2):
  class RNN (line 208) | class RNN(nn.Module):
    method __init__ (line 209) | def __init__(self, input_size, hidden_size, output_size):
    method forward (line 226) | def forward(self, category, input, hidden):
    method initHidden (line 236) | def initHidden(self):

FILE: tensorboardX/examples/demo_hparams.py
  function train (line 12) | def train(lr, bsize, n_hidden):

FILE: tensorboardX/examples/demo_multiple_embedding.py
  function main (line 6) | def main():

FILE: tensorboardX/setup.py
  function compileProtoBuf (line 11) | def compileProtoBuf():
  class PostDevelopCommand (line 15) | class PostDevelopCommand(develop):
    method run (line 17) | def run(self):
  class PostInstallCommand (line 22) | class PostInstallCommand(install):
    method run (line 24) | def run(self):

FILE: tensorboardX/tensorboardX/beholder/beholder.py
  class Beholder (line 39) | class Beholder(object):
    method __init__ (line 41) | def __init__(self, logdir):
    method _get_config (line 60) | def _get_config(self):
    method _write_summary (line 74) | def _write_summary(self, frame):
    method stats (line 92) | def stats(tensor_and_name):
    method _get_final_image (line 109) | def _get_final_image(self, config, trainable=None, arrays=None, frame=...
    method _enough_time_has_passed (line 130) | def _enough_time_has_passed(self, FPS):
    method _update_frame (line 138) | def _update_frame(self, trainable, arrays, frame, config):
    method _update_recording (line 145) | def _update_recording(self, frame, config):
    method update (line 163) | def update(self, trainable=None, arrays=None, frame=None):
  class BeholderHook (line 201) | class BeholderHook():

FILE: tensorboardX/tensorboardX/beholder/file_system_tools.py
  function write_file (line 25) | def write_file(contents, path, mode='wb'):
  function write_pickle (line 30) | def write_pickle(obj, path):
  function read_pickle (line 35) | def read_pickle(path, default=None):

FILE: tensorboardX/tensorboardX/beholder/video_writing.py
  class VideoWriter (line 27) | class VideoWriter(object):
    method __init__ (line 34) | def __init__(self, directory, outputs):
    method current_output (line 44) | def current_output(self):
    method write_frame (line 47) | def write_frame(self, np_array):
    method finish (line 75) | def finish(self):
  class VideoOutput (line 84) | class VideoOutput(object):
    method available (line 91) | def available(cls):
    method name (line 95) | def name(cls):
    method emit_frame (line 99) | def emit_frame(self, np_array):
    method close (line 103) | def close(self):
  class PNGVideoOutput (line 107) | class PNGVideoOutput(VideoOutput):
    method available (line 111) | def available(cls):
    method __init__ (line 114) | def __init__(self, directory, frame_shape):
    method emit_frame (line 120) | def emit_frame(self, np_array):
    method _write_image (line 125) | def _write_image(self, im, filename):
    method close (line 129) | def close(self):
  class FFmpegVideoOutput (line 133) | class FFmpegVideoOutput(VideoOutput):
    method available (line 137) | def available(cls):
    method __init__ (line 147) | def __init__(self, directory, frame_shape):
    method _handle_error (line 183) | def _handle_error(self):
    method emit_frame (line 189) | def emit_frame(self, np_array):
    method close (line 197) | def close(self):

FILE: tensorboardX/tensorboardX/caffe2_graph.py
  function _make_unique_name (line 21) | def _make_unique_name(seen, name, min_version=0):
  function _rename_tensorflow_style (line 45) | def _rename_tensorflow_style(shapes, blob_name_tracker, ops):
  function _convert_to_ssa (line 84) | def _convert_to_ssa(shapes, blob_name_tracker, ops):
  function _get_blob_names (line 138) | def _get_blob_names(ops):
  function _remap_keys (line 155) | def _remap_keys(old_dict, rename_fn):
  function _rename_all (line 173) | def _rename_all(shapes, blob_name_tracker, ops, rename_fn):
  function _add_gradient_scope (line 221) | def _add_gradient_scope(shapes, blob_name_tracker, ops):
  function _replace_colons (line 245) | def _replace_colons(shapes, blob_name_tracker, ops, repl):
  function _fill_missing_operator_names (line 267) | def _fill_missing_operator_names(ops):
  function _tf_device (line 299) | def _tf_device(device_option):
  function _add_tf_shape (line 323) | def _add_tf_shape(attr_dict, ints):
  function _set_tf_attr (line 343) | def _set_tf_attr(attr_dict, arg):
  function _operator_to_node (line 389) | def _operator_to_node(shapes, op):
  function _operator_to_node_simp (line 417) | def _operator_to_node_simp(op, inter_blobs, seen):
  function _blob_to_node (line 478) | def _blob_to_node(producing_ops, shapes, name):
  function _clear_debug_info (line 513) | def _clear_debug_info(ops, perform_clear):
  function _check_if_forward (line 536) | def _check_if_forward(blob):
  function _check_if_cpu (line 551) | def _check_if_cpu(blob):
  function _compute_in_out (line 564) | def _compute_in_out(ops):
  function _filter_ops (line 593) | def _filter_ops(ops, filter_fn, perform_filter):
  function _operators_to_graph_def (line 628) | def _operators_to_graph_def(
  function _propagate_device_option (line 717) | def _propagate_device_option(net_def):
  function _try_get_shapes (line 738) | def _try_get_shapes(nets):
  function model_to_graph_def (line 760) | def model_to_graph_def(model, **kwargs):
  function nets_to_graph_def (line 778) | def nets_to_graph_def(nets, shapes=None, **kwargs):
  function protos_to_graph_def (line 801) | def protos_to_graph_def(net_defs, shapes=None, **kwargs):

FILE: tensorboardX/tensorboardX/crc32c.py
  function crc_update (line 85) | def crc_update(crc, data):
  function crc_finalize (line 108) | def crc_finalize(crc):
  function _crc32c (line 122) | def _crc32c(data):

FILE: tensorboardX/tensorboardX/embedding.py
  function make_tsv (line 4) | def make_tsv(metadata, save_path, metadata_header=None):
  function make_sprite (line 24) | def make_sprite(label_img, save_path):
  function append_pbtxt (line 53) | def append_pbtxt(metadata, label_img, save_path, subdir, global_step, tag):
  function make_mat (line 73) | def make_mat(matlist, save_path):

FILE: tensorboardX/tensorboardX/event_file_writer.py
  class EventsWriter (line 32) | class EventsWriter(object):
    method __init__ (line 35) | def __init__(self, file_prefix, filename_suffix=''):
    method write_event (line 51) | def write_event(self, event):
    method _write_serialized_event (line 60) | def _write_serialized_event(self, event_str):
    method flush (line 65) | def flush(self):
    method close (line 72) | def close(self):
  class EventFileWriter (line 80) | class EventFileWriter(object):
    method __init__ (line 88) | def __init__(self, logdir, max_queue_size=10, flush_secs=120, filename...
    method get_logdir (line 115) | def get_logdir(self):
    method reopen (line 119) | def reopen(self):
    method add_event (line 132) | def add_event(self, event):
    method flush (line 141) | def flush(self):
    method close (line 151) | def close(self):
  class _EventLoggerThread (line 163) | class _EventLoggerThread(threading.Thread):
    method __init__ (line 166) | def __init__(self, queue, record_writer, flush_secs):
    method stop (line 185) | def stop(self):
    method run (line 189) | def run(self):

FILE: tensorboardX/tensorboardX/onnx_graph.py
  function load_onnx_graph (line 8) | def load_onnx_graph(fname):
  function parse (line 15) | def parse(graph):

FILE: tensorboardX/tensorboardX/proto_graph.py
  function attr_value_proto (line 8) | def attr_value_proto(dtype, shape, s):
  function tensor_shape_proto (line 23) | def tensor_shape_proto(outputsize):
  function node_proto (line 30) | def node_proto(name,

FILE: tensorboardX/tensorboardX/pytorch_graph.py
  class NodeBase (line 18) | class NodeBase(object):
    method __init__ (line 19) | def __init__(self,
    method __repr__ (line 34) | def __repr__(self):
  class NodePy (line 43) | class NodePy(NodeBase):
    method __init__ (line 44) | def __init__(self, node_cpp, valid_methods):
  class NodePyIO (line 75) | class NodePyIO(NodePy):
    method __init__ (line 76) | def __init__(self, node_cpp, input_or_output=None):
  class NodePyOP (line 93) | class NodePyOP(NodePy):
    method __init__ (line 94) | def __init__(self, node_cpp):
  class GraphPy (line 102) | class GraphPy(object):
    method __init__ (line 123) | def __init__(self):
    method append (line 130) | def append(self, x):
    method printall (line 144) | def printall(self):
    method find_common_root (line 151) | def find_common_root(self):
    method populate_namespace_from_OP_to_IO (line 156) | def populate_namespace_from_OP_to_IO(self):
    method to_proto (line 178) | def to_proto(self):
  function parse (line 207) | def parse(graph, args=None, omit_useless_nodes=True):
  function graph (line 247) | def graph(model, args, verbose=False, **kwargs):

FILE: tensorboardX/tensorboardX/record_writer.py
  function register_writer_factory (line 30) | def register_writer_factory(prefix, factory):
  function directory_check (line 36) | def directory_check(path):
  function open_file (line 47) | def open_file(path):
  class S3RecordWriter (line 57) | class S3RecordWriter(object):
    method __init__ (line 60) | def __init__(self, path):
    method __del__ (line 66) | def __del__(self):
    method bucket_and_path (line 69) | def bucket_and_path(self):
    method write (line 78) | def write(self, val):
    method flush (line 81) | def flush(self):
    method close (line 88) | def close(self):
  class S3RecordWriterFactory (line 92) | class S3RecordWriterFactory(object):
    method open (line 95) | def open(self, path):
    method directory_check (line 98) | def directory_check(self, path):
  class RecordWriter (line 107) | class RecordWriter(object):
    method __init__ (line 108) | def __init__(self, path):
    method write (line 115) | def write(self, data):
    method flush (line 123) | def flush(self):
    method close (line 126) | def close(self):
  function masked_crc32c (line 130) | def masked_crc32c(data):
  function u32 (line 135) | def u32(x):
  function make_valid_tf_name (line 139) | def make_valid_tf_name(name):

FILE: tensorboardX/tensorboardX/summary.py
  function _clean_tag (line 28) | def _clean_tag(name):
  function _draw_single_box (line 46) | def _draw_single_box(image, xmin, ymin, xmax, ymax, display_str, color='...
  function hparams (line 69) | def hparams(hparam_dict=None, metric_dict=None):
  function scalar (line 124) | def scalar(name, scalar, collections=None):
  function histogram_raw (line 145) | def histogram_raw(name, min, max, num, sum, sum_squares, bucket_limits, ...
  function histogram (line 175) | def histogram(name, values, bins, max_bins=None):
  function make_histogram (line 197) | def make_histogram(values, bins, max_bins=None):
  function image (line 243) | def image(tag, tensor, rescale=1, dataformats='CHW'):
  function image_boxes (line 275) | def image_boxes(tag, tensor_image, tensor_boxes, rescale=1, dataformats=...
  function draw_boxes (line 290) | def draw_boxes(disp_image, boxes, labels=None):
  function make_image (line 305) | def make_image(tensor, rescale=1, rois=None, labels=None):
  function video (line 326) | def video(tag, tensor, fps=4):
  function make_video (line 338) | def make_video(tensor, fps):
  function audio (line 374) | def audio(tag, tensor, sample_rate=44100):
  function custom_scalars (line 394) | def custom_scalars(layout):
  function text (line 423) | def text(tag, text):
  function pr_curve_raw (line 434) | def pr_curve_raw(tag, tp, fp, tn, fn, precision, recall, num_thresholds=...
  function pr_curve (line 450) | def pr_curve(tag, labels, predictions, num_thresholds=127, weights=None):
  function compute_curve (line 468) | def compute_curve(labels, predictions, num_thresholds=None, weights=None):
  function _get_tensor_summary (line 499) | def _get_tensor_summary(tag, tensor, content_type, json_config):
  function mesh (line 528) | def mesh(tag, vertices, colors, faces, config_dict=None):

FILE: tensorboardX/tensorboardX/torchvis.py
  class TorchVis (line 19) | class TorchVis:
    method __init__ (line 20) | def __init__(self, *args, **init_kwargs):
    method register (line 32) | def register(self, *args, **init_kwargs):
    method unregister (line 39) | def unregister(self, *args):
    method __getattr__ (line 45) | def __getattr__(self, attr):
    method __del__ (line 55) | def __del__(self):

FILE: tensorboardX/tensorboardX/utils.py
  function figure_to_image (line 2) | def figure_to_image(figures, close=True):
  function graphviz_to_image (line 40) | def graphviz_to_image():
  function _prepare_video (line 44) | def _prepare_video(V):
  function make_grid (line 70) | def make_grid(I, ncols=8):
  function convert_to_HWC (line 98) | def convert_to_HWC(tensor, input_format):  # tensor: numpy array

FILE: tensorboardX/tensorboardX/visdom_writer.py
  function _check_connection (line 13) | def _check_connection(fn):
  class VisdomWriter (line 23) | class VisdomWriter:
    method __init__ (line 24) | def __init__(self, *args, **kwargs):
    method _try_connect (line 38) | def _try_connect(self):
    method add_scalar (line 48) | def add_scalar(self, tag, scalar_value, global_step=None, main_tag='de...
    method add_scalars (line 90) | def add_scalars(self, main_tag, tag_scalar_dict, global_step=None):
    method export_scalars_to_json (line 116) | def export_scalars_to_json(self, path):
    method add_histogram (line 128) | def add_histogram(self, tag, values, global_step=None, bins='tensorflo...
    method add_image (line 142) | def add_image(self, tag, img_tensor, global_step=None, caption=None):
    method add_figure (line 159) | def add_figure(self, tag, figure, global_step=None, close=True):
    method add_video (line 173) | def add_video(self, tag, vid_tensor, global_step=None, fps=4):
    method add_audio (line 213) | def add_audio(self, tag, snd_tensor, global_step=None, sample_rate=441...
    method add_text (line 230) | def add_text(self, tag, text_string, global_step=None):
    method add_onnx_graph (line 247) | def add_onnx_graph(self, prototxt):
    method add_graph (line 252) | def add_graph(self, model, input_to_model=None, verbose=False, **kwargs):
    method add_embedding (line 257) | def add_embedding(self, mat, metadata=None, label_img=None, global_ste...
    method add_pr_curve (line 262) | def add_pr_curve(self, tag, labels, predictions, global_step=None, num...
    method add_pr_curve_raw (line 293) | def add_pr_curve_raw(self, tag, true_positive_counts,
    method close (line 325) | def close(self):

FILE: tensorboardX/tensorboardX/writer.py
  class DummyFileWriter (line 28) | class DummyFileWriter(object):
    method __init__ (line 31) | def __init__(self, logdir):
    method get_logdir (line 34) | def get_logdir(self):
    method add_event (line 38) | def add_event(self, event, step=None, walltime=None):
    method add_summary (line 41) | def add_summary(self, summary, global_step=None, walltime=None):
    method add_graph (line 44) | def add_graph(self, graph_profile, walltime=None):
    method add_onnx_graph (line 47) | def add_onnx_graph(self, graph, walltime=None):
    method flush (line 50) | def flush(self):
    method close (line 53) | def close(self):
    method reopen (line 56) | def reopen(self):
  class FileWriter (line 60) | class FileWriter(object):
    method __init__ (line 70) | def __init__(self, logdir, max_queue=10, flush_secs=120, filename_suff...
    method get_logdir (line 95) | def get_logdir(self):
    method add_event (line 99) | def add_event(self, event, step=None, walltime=None):
    method add_summary (line 115) | def add_summary(self, summary, global_step=None, walltime=None):
    method add_graph (line 130) | def add_graph(self, graph_profile, walltime=None):
    method add_onnx_graph (line 148) | def add_onnx_graph(self, graph, walltime=None):
    method flush (line 159) | def flush(self):
    method close (line 166) | def close(self):
    method reopen (line 172) | def reopen(self):
  class SummaryWriter (line 181) | class SummaryWriter(object):
    method __init__ (line 192) | def __init__(self, logdir=None, comment='', purge_step=None, max_queue...
    method __append_to_scalar_dict (line 271) | def __append_to_scalar_dict(self, tag, scalar_value, global_step,
    method _check_caffe2_blob (line 282) | def _check_caffe2_blob(self, item):
    method _get_file_writer (line 297) | def _get_file_writer(self):
    method add_hparams (line 325) | def add_hparams(self, hparam_dict=None, metric_dict=None):
    method add_scalar (line 361) | def add_scalar(self, tag, scalar_value, global_step=None, walltime=None):
    method add_scalars (line 390) | def add_scalars(self, main_tag, tag_scalar_dict, global_step=None, wal...
    method export_scalars_to_json (line 436) | def export_scalars_to_json(self, path):
    method add_histogram (line 447) | def add_histogram(self, tag, values, global_step=None, bins='tensorflo...
    method add_histogram_raw (line 481) | def add_histogram_raw(self, tag, min, max, num, sum, sum_squares,
    method add_image (line 536) | def add_image(self, tag, img_tensor, global_step=None, walltime=None, ...
    method add_images (line 586) | def add_images(self, tag, img_tensor, global_step=None, walltime=None,...
    method add_image_with_boxes (line 642) | def add_image_with_boxes(self, tag, img_tensor, box_tensor, global_ste...
    method add_figure (line 674) | def add_figure(self, tag, figure, global_step=None, close=True, wallti...
    method add_video (line 691) | def add_video(self, tag, vid_tensor, global_step=None, fps=4, walltime...
    method add_audio (line 709) | def add_audio(self, tag, snd_tensor, global_step=None, sample_rate=441...
    method add_text (line 726) | def add_text(self, tag, text_string, global_step=None, walltime=None):
    method add_onnx_graph (line 742) | def add_onnx_graph(self, prototxt):
    method add_graph (line 745) | def add_graph(self, model, input_to_model=None, verbose=False, **kwargs):
    method _encode (line 798) | def _encode(rawstr):
    method add_embedding (line 806) | def add_embedding(self, mat, metadata=None, label_img=None, global_ste...
    method add_pr_curve (line 868) | def add_pr_curve(self, tag, labels, predictions, global_step=None,
    method add_pr_curve_raw (line 905) | def add_pr_curve_raw(self, tag, true_positive_counts,
    method add_custom_scalars_multilinechart (line 943) | def add_custom_scalars_multilinechart(self, tags, category='default', ...
    method add_custom_scalars_marginchart (line 957) | def add_custom_scalars_marginchart(self, tags, category='default', tit...
    method add_custom_scalars (line 972) | def add_custom_scalars(self, layout):
    method add_mesh (line 993) | def add_mesh(self, tag, vertices, colors=None, faces=None, config_dict...
    method close (line 1047) | def close(self):
    method flush (line 1055) | def flush(self):
    method __enter__ (line 1061) | def __enter__(self):
    method __exit__ (line 1064) | def __exit__(self, exc_type, exc_val, exc_tb):

FILE: tensorboardX/tensorboardX/x2num.py
  function check_nan (line 11) | def check_nan(array):
  function make_np (line 18) | def make_np(x):
  function prepare_pytorch (line 37) | def prepare_pytorch(x):
  function prepare_theano (line 45) | def prepare_theano(x):
  function prepare_caffe2 (line 50) | def prepare_caffe2(x):
  function prepare_mxnet (line 56) | def prepare_mxnet(x):
  function prepare_chainer (line 61) | def prepare_chainer(x):

FILE: tensorboardX/tests/event_file_writer_test.py
  class EventFileWriterTest (line 36) | class EventFileWriterTest(unittest.TestCase):
    method get_temp_dir (line 37) | def get_temp_dir(self):
    method test_event_file_writer_roundtrip (line 41) | def test_event_file_writer_roundtrip(self):
    method test_setting_filename_suffix_works (line 57) | def test_setting_filename_suffix_works(self):
    method test_async_writer_without_write (line 65) | def test_async_writer_without_write(self):
  class AsyncWriterTest (line 78) | class AsyncWriterTest(): #unittest.TestCase):
    method get_temp_dir (line 79) | def get_temp_dir(self):
    method test_async_writer_write_once (line 83) | def test_async_writer_write_once(self):
    method test_async_writer_write_queue_full (line 93) | def test_async_writer_write_queue_full(self):
    method test_async_writer_write_one_slot_queue (line 104) | def test_async_writer_write_one_slot_queue(self):
    method test_async_writer_close_triggers_flush (line 115) | def test_async_writer_close_triggers_flush(self):
    method test_write_after_async_writer_closed (line 124) | def test_write_after_async_writer_closed(self):

FILE: tensorboardX/tests/expect_reader.py
  function removeWhiteChar (line 6) | def removeWhiteChar(string):
  function compare_proto (line 10) | def compare_proto(str_to_compare, function_ptr):
  function write_proto (line 27) | def write_proto(str_to_compare, function_ptr):

FILE: tensorboardX/tests/record_writer_test.py
  class RecordWriterTest (line 29) | class RecordWriterTest(unittest.TestCase):
    method get_temp_dir (line 30) | def get_temp_dir(self):
    method test_expect_bytes_written (line 34) | def test_expect_bytes_written(self):
    method test_empty_record (line 44) | def test_empty_record(self):
    method test_record_writer_roundtrip (line 54) | def test_record_writer_roundtrip(self):

FILE: tensorboardX/tests/test_beholder.py
  class BeholderTest (line 14) | class BeholderTest(unittest.TestCase):
    method test_beholder (line 15) | def test_beholder(self):
    method test_beholder_video (line 27) | def test_beholder_video(self):

FILE: tensorboardX/tests/test_caffe2.py
  class Caffe2Test (line 23) | class Caffe2Test(unittest.TestCase):
    method test_caffe2_np (line 24) | def test_caffe2_np(self):
    method test_that_operators_gets_non_colliding_names (line 29) | def test_that_operators_gets_non_colliding_names(self):
    method test_that_replacing_colons_gives_non_colliding_names (line 37) | def test_that_replacing_colons_gives_non_colliding_names(self):
    method test_that_adding_gradient_scope_does_no_fancy_renaming (line 56) | def test_that_adding_gradient_scope_does_no_fancy_renaming(self):
    method test_that_auto_ssa_gives_non_colliding_names (line 75) | def test_that_auto_ssa_gives_non_colliding_names(self):
    method test_renaming_tensorflow_style (line 99) | def test_renaming_tensorflow_style(self):
    method test_filter_ops (line 176) | def test_filter_ops(self):
    method test_simple_cnnmodel (line 215) | def test_simple_cnnmodel(self):
    method test_simple_model (line 242) | def test_simple_model(self):

FILE: tensorboardX/tests/test_chainer_np.py
  class ChainerTest (line 23) | class ChainerTest(unittest.TestCase):
    method test_chainer_np (line 24) | def test_chainer_np(self):
    method test_chainer_img (line 33) | def test_chainer_img(self):
    method test_chainer_write (line 39) | def test_chainer_write(self):

FILE: tensorboardX/tests/test_crc32c.py
  class CRC32CTest (line 5) | class CRC32CTest(unittest.TestCase):
    method test_crc32c (line 6) | def test_crc32c(self):
    method test_crc32c_python (line 10) | def test_crc32c_python(self):
    method test_crc32c_native (line 14) | def test_crc32c_native(self):

FILE: tensorboardX/tests/test_embedding.py
  class EmbeddingTest (line 6) | class EmbeddingTest(unittest.TestCase):
    method test_embedding (line 7) | def test_embedding(self):
    method test_embedding_64 (line 27) | def test_embedding_64(self):
    method test_embedding_square (line 46) | def test_embedding_square(self):
    method test_embedding_fail (line 56) | def test_embedding_fail(self):

FILE: tensorboardX/tests/test_figure.py
  class FigureTest (line 12) | class FigureTest(unittest.TestCase):
    method test_figure (line 13) | def test_figure(self):
    method test_figure_list (line 32) | def test_figure_list(self):

FILE: tensorboardX/tests/test_numpy.py
  class NumpyTest (line 12) | class NumpyTest(unittest.TestCase):
    method test_scalar (line 13) | def test_scalar(self):
    method test_make_grid (line 25) | def test_make_grid(self):
    method test_numpy_vid (line 28) | def test_numpy_vid(self):
    method test_numpy_vid_uint8 (line 34) | def test_numpy_vid_uint8(self):

FILE: tensorboardX/tests/test_onnx_graph.py
  class ONNXGraphTest (line 6) | class ONNXGraphTest(unittest.TestCase):
    method test_onnx_graph (line 7) | def test_onnx_graph(self):

FILE: tensorboardX/tests/test_pr_curve.py
  class PRCurveTest (line 17) | class PRCurveTest(unittest.TestCase):
    method test_smoke (line 18) | def test_smoke(self):
    method test_pr_purve (line 31) | def test_pr_purve(self):
    method test_pr_purve_raw (line 59) | def test_pr_purve_raw(self):

FILE: tensorboardX/tests/test_pytorch_graph.py
  class PytorchGraphTest (line 7) | class PytorchGraphTest(unittest.TestCase):
    method test_pytorch_graph (line 8) | def test_pytorch_graph(self):
    method test_wrong_input_size (line 22) | def test_wrong_input_size(self):

FILE: tensorboardX/tests/test_pytorch_np.py
  class PyTorchNumpyTest (line 12) | class PyTorchNumpyTest(unittest.TestCase):
    method test_pytorch_np (line 13) | def test_pytorch_np(self):
    method test_pytorch_write (line 34) | def test_pytorch_write(self):
    method test_pytorch_histogram (line 38) | def test_pytorch_histogram(self):
    method test_pytorch_histogram_raw (line 43) | def test_pytorch_histogram_raw(self):

FILE: tensorboardX/tests/test_record_writer.py
  class RecordWriterTest (line 12) | class RecordWriterTest(unittest.TestCase):
    method test_record_writer_s3 (line 14) | def test_record_writer_s3(self):
    method test_make_valid_tf_name (line 24) | def test_make_valid_tf_name(self):

FILE: tensorboardX/tests/test_summary.py
  function tensor_N (line 9) | def tensor_N(shape, dtype=float):
  class SummaryTest (line 14) | class SummaryTest(unittest.TestCase):
    method test_uint8_image (line 15) | def test_uint8_image(self):
    method test_float32_image (line 22) | def test_float32_image(self):
    method test_float_1_converts_to_uint8_255 (line 30) | def test_float_1_converts_to_uint8_255(self):
    method test_list_input (line 38) | def test_list_input(self):
    method test_empty_input (line 42) | def test_empty_input(self):
    method test_image_with_boxes (line 47) | def test_image_with_boxes(self):
    method test_image_with_one_channel (line 52) | def test_image_with_one_channel(self):
    method test_image_with_four_channel (line 55) | def test_image_with_four_channel(self):
    method test_image_with_one_channel_batched (line 58) | def test_image_with_one_channel_batched(self):
    method test_image_with_3_channel_batched (line 61) | def test_image_with_3_channel_batched(self):
    method test_image_with_four_channel_batched (line 64) | def test_image_with_four_channel_batched(self):
    method test_image_without_channel (line 67) | def test_image_without_channel(self):
    method test_video (line 70) | def test_video(self):
    method test_audio (line 79) | def test_audio(self):
    method test_text (line 82) | def test_text(self):
    method test_histogram_auto (line 85) | def test_histogram_auto(self):
    method test_histogram_fd (line 88) | def test_histogram_fd(self):
    method test_histogram_doane (line 91) | def test_histogram_doane(self):
    method test_custom_scalars (line 94) | def test_custom_scalars(self):
    method test_mesh (line 100) | def test_mesh(self):
    method test_hparams (line 122) | def test_hparams(self):
    method test_hparams_smoke (line 127) | def test_hparams_smoke(self):

FILE: tensorboardX/tests/test_summary_writer.py
  class SummaryWriterTest (line 5) | class SummaryWriterTest(unittest.TestCase):
    method test_summary_writer_ctx (line 6) | def test_summary_writer_ctx(self):
    method test_summary_writer_backcomapt (line 12) | def test_summary_writer_backcomapt(self):
    method test_summary_writer_close (line 16) | def test_summary_writer_close(self):
    method test_windowsPath (line 28) | def test_windowsPath(self):
    method test_pathlib (line 35) | def test_pathlib(self):

FILE: tensorboardX/tests/test_test.py
  function test_linting (line 1) | def test_linting():

FILE: tensorboardX/tests/test_utils.py
  class UtilsTest (line 8) | class UtilsTest(unittest.TestCase):
    method test_to_HWC (line 9) | def test_to_HWC(self):
    method test_prepare_video (line 21) | def test_prepare_video(self):

FILE: tensorboardX/tests/test_visdom.py
  class VisdomTest (line 15) | class VisdomTest(unittest.TestCase):
    method test_TorchVis (line 16) | def test_TorchVis(self):

FILE: tensorboardX/tests/test_writer.py
  class WriterTest (line 19) | class WriterTest(unittest.TestCase):
    method test_flush (line 20) | def test_flush(self):
    method test_flush_timer_is_long_so_data_is_not_there (line 32) | def test_flush_timer_is_long_so_data_is_not_there(self):
    method test_flush_after_close (line 45) | def test_flush_after_close(self):
    method test_flush (line 58) | def test_flush(self):
    method test_auto_close (line 71) | def test_auto_close(self):
    method test_writer (line 74) | def test_writer(self):

Download .json

Condensed preview — 319 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (1,876K chars).

[
  {
    "path": ".gitignore",
    "chars": 1824,
    "preview": "# Global\n.DS_Store\n.idea\n\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so"
  },
  {
    "path": "LICENSE",
    "chars": 1462,
    "preview": "Noncommercial Use License\n\nSoftware Copyright (c) 2020 OpenAI\n\nWe don’t claim ownership of the content you create with J"
  },
  {
    "path": "MANIFEST.in",
    "chars": 63,
    "preview": "recursive-include jukebox *.py\nrecursive-include jukebox *.txt\n"
  },
  {
    "path": "README.md",
    "chars": 17714,
    "preview": "**Status:** Archive (code is provided as-is, no updates expected)\n\n# Jukebox\nCode for \"Jukebox: A Generative Model for M"
  },
  {
    "path": "apex/.gitignore",
    "chars": 38,
    "preview": "apex.egg-info\ndist\nbuild\ndocs/build\n*~"
  },
  {
    "path": "apex/.nojekyll",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "apex/LICENSE",
    "chars": 1449,
    "preview": "All rights reserved.\n\nRedistribution and use in source and binary forms, with or without modification, are permitted pro"
  },
  {
    "path": "apex/README.md",
    "chars": 4843,
    "preview": "# Introduction\n\nThis repository holds NVIDIA-maintained utilities to streamline \nmixed precision and distributed trainin"
  },
  {
    "path": "apex/apex/RNN/README.md",
    "chars": 22,
    "preview": "Under construction...\n"
  },
  {
    "path": "apex/apex/RNN/RNNBackend.py",
    "chars": 11578,
    "preview": "import torch\nimport torch.nn as nn\nfrom torch.autograd import Variable\n\nimport torch.nn.functional as F\n\nimport math\n\n\nd"
  },
  {
    "path": "apex/apex/RNN/__init__.py",
    "chars": 71,
    "preview": "from .models import LSTM, GRU, ReLU, Tanh, mLSTM\n\n__all__ = ['models']\n"
  },
  {
    "path": "apex/apex/RNN/cells.py",
    "chars": 2550,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom .RNNBackend import RNNCell\n\nfrom torch.nn._func"
  },
  {
    "path": "apex/apex/RNN/models.py",
    "chars": 2137,
    "preview": "import torch\n\nfrom torch.nn._functions.rnn import LSTMCell, RNNReLUCell, RNNTanhCell, GRUCell\n\nfrom .RNNBackend import b"
  },
  {
    "path": "apex/apex/__init__.py",
    "chars": 632,
    "preview": "from . import parallel\nfrom . import amp\nfrom . import fp16_utils\n\n# For optimizers and normalization there is no Python"
  },
  {
    "path": "apex/apex/amp/README.md",
    "chars": 2107,
    "preview": "# amp: Automatic Mixed Precision\n\n## Annotating User Functions\n\nNearly all PyTorch user code needs nothing more than the"
  },
  {
    "path": "apex/apex/amp/__init__.py",
    "chars": 281,
    "preview": "from .amp import init, half_function, float_function, promote_function,\\\n    register_half_function, register_float_func"
  },
  {
    "path": "apex/apex/amp/__version__.py",
    "chars": 62,
    "preview": "VERSION = (0, 1, 0)\n__version__ = '.'.join(map(str, VERSION))\n"
  },
  {
    "path": "apex/apex/amp/_amp_state.py",
    "chars": 1941,
    "preview": "# This is a \"header object\" that allows different amp modules to communicate.\n# I'm a C++ guy, not a python guy.  I deci"
  },
  {
    "path": "apex/apex/amp/_initialize.py",
    "chars": 12311,
    "preview": "import torch\nfrom torch._six import string_classes\nimport functools\nimport numpy as np\nimport warnings\nfrom ._amp_state "
  },
  {
    "path": "apex/apex/amp/_process_optimizer.py",
    "chars": 17481,
    "preview": "import types\nfrom ..fp16_utils import master_params_to_model_params\nfrom ..multi_tensor_apply import multi_tensor_applie"
  },
  {
    "path": "apex/apex/amp/amp.py",
    "chars": 7266,
    "preview": "from . import compat, rnn_compat, utils, wrap\nfrom .handle import AmpHandle, NoOpHandle\nfrom .lists import functional_ov"
  },
  {
    "path": "apex/apex/amp/compat.py",
    "chars": 1204,
    "preview": "import torch\n\n# True for post-0.4, when Variables/Tensors merged.\ndef variable_is_tensor():\n    v = torch.autograd.Varia"
  },
  {
    "path": "apex/apex/amp/frontend.py",
    "chars": 19598,
    "preview": "import torch\nfrom ._initialize import _initialize\nfrom ._amp_state import _amp_state, warn_or_err, maybe_print\n\n\nclass P"
  },
  {
    "path": "apex/apex/amp/handle.py",
    "chars": 12008,
    "preview": "import contextlib\nimport warnings\nimport torch\n\nfrom . import utils\nfrom .opt import OptimWrapper\nfrom .scaler import Lo"
  },
  {
    "path": "apex/apex/amp/lists/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "apex/apex/amp/lists/functional_overrides.py",
    "chars": 2175,
    "preview": "\n# TODO: think about the following two. They do weird things.\n# - torch.nn.utils.clip_grad (but it should always be fp32"
  },
  {
    "path": "apex/apex/amp/lists/tensor_overrides.py",
    "chars": 1313,
    "preview": "from .. import compat\nfrom . import torch_overrides\n\nimport importlib\n\nimport torch\n\nif compat.variable_is_tensor() and "
  },
  {
    "path": "apex/apex/amp/lists/torch_overrides.py",
    "chars": 1678,
    "preview": "import torch\n\nfrom .. import utils\n\nMODULE = torch\n\nFP16_FUNCS = [\n    # Low level functions wrapped by torch.nn layers."
  },
  {
    "path": "apex/apex/amp/opt.py",
    "chars": 3446,
    "preview": "import contextlib\nimport warnings\n\nfrom .scaler import LossScaler, master_params\nfrom ._amp_state import maybe_print\n\nim"
  },
  {
    "path": "apex/apex/amp/rnn_compat.py",
    "chars": 1995,
    "preview": "from . import utils, wrap\n\nimport torch\n_VF = torch._C._VariableFunctions\nRNN_NAMES = ['rnn_relu', 'rnn_tanh', 'gru', 'l"
  },
  {
    "path": "apex/apex/amp/scaler.py",
    "chars": 9919,
    "preview": "import torch\nfrom ..multi_tensor_apply import multi_tensor_applier\nfrom ._amp_state import _amp_state, master_params, ma"
  },
  {
    "path": "apex/apex/amp/utils.py",
    "chars": 7487,
    "preview": "from . import compat\n\nimport functools\nimport itertools\n\nimport torch\n\ndef get_cuda_version():\n    return tuple(int(x) f"
  },
  {
    "path": "apex/apex/amp/wrap.py",
    "chars": 11242,
    "preview": "from . import compat\nfrom . import utils\nfrom ._amp_state import _amp_state\nfrom . import rnn_compat\n\nimport functools\n\n"
  },
  {
    "path": "apex/apex/fp16_utils/README.md",
    "chars": 1443,
    "preview": "fp16_optimizer.py contains `FP16_Optimizer`, a Python class designed to wrap an existing Pytorch optimizer and automatic"
  },
  {
    "path": "apex/apex/fp16_utils/__init__.py",
    "chars": 367,
    "preview": "from .fp16util import (\n    BN_convert_float,\n    network_to_half,\n    prep_param_lists,\n    model_grads_to_master_grads"
  },
  {
    "path": "apex/apex/fp16_utils/fp16_optimizer.py",
    "chars": 33677,
    "preview": "import torch\r\nfrom torch import nn\r\nfrom torch.autograd import Variable\r\nfrom torch.nn.parameter import Parameter\r\nfrom "
  },
  {
    "path": "apex/apex/fp16_utils/fp16util.py",
    "chars": 7141,
    "preview": "import torch\nimport torch.nn as nn\nfrom torch.autograd import Variable\nfrom torch._utils import _flatten_dense_tensors, "
  },
  {
    "path": "apex/apex/fp16_utils/loss_scaler.py",
    "chars": 7568,
    "preview": "import torch\n\n# item() is a recent addition, so this helps with backward compatibility.\ndef to_python_float(t):\n    if h"
  },
  {
    "path": "apex/apex/multi_tensor_apply/__init__.py",
    "chars": 100,
    "preview": "from .multi_tensor_apply import MultiTensorApply\n\nmulti_tensor_applier = MultiTensorApply(2048*32)\n\n"
  },
  {
    "path": "apex/apex/multi_tensor_apply/multi_tensor_apply.py",
    "chars": 991,
    "preview": "import torch\n\nclass MultiTensorApply(object):\n    available = False\n    warned = False\n\n    def __init__(self, chunk_siz"
  },
  {
    "path": "apex/apex/normalization/__init__.py",
    "chars": 45,
    "preview": "from .fused_layer_norm import FusedLayerNorm\n"
  },
  {
    "path": "apex/apex/normalization/fused_layer_norm.py",
    "chars": 6450,
    "preview": "import math\nimport torch\nimport numbers\nfrom torch.nn.parameter import Parameter\nfrom torch.nn import init\nfrom torch.nn"
  },
  {
    "path": "apex/apex/optimizers/__init__.py",
    "chars": 77,
    "preview": "from .fused_adam import FusedAdam\nfrom .fp16_optimizer import FP16_Optimizer\n"
  },
  {
    "path": "apex/apex/optimizers/fp16_optimizer.py",
    "chars": 12199,
    "preview": "import torch\nfrom torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors\n\nclass FP16_Optimizer(object):\n  "
  },
  {
    "path": "apex/apex/optimizers/fused_adam.py",
    "chars": 6853,
    "preview": "import types\nimport torch\nimport importlib\n\nclass FusedAdam(torch.optim.Optimizer):\n\n    \"\"\"Implements Adam algorithm. C"
  },
  {
    "path": "apex/apex/parallel/LARC.py",
    "chars": 3850,
    "preview": "import torch\nfrom torch import nn\nfrom torch.autograd import Variable\nfrom torch.nn.parameter import Parameter\n\nclass LA"
  },
  {
    "path": "apex/apex/parallel/README.md",
    "chars": 2699,
    "preview": "## Distributed Data Parallel\n\ndistributed.py contains the source code for `apex.parallel.DistributedDataParallel`, a mod"
  },
  {
    "path": "apex/apex/parallel/__init__.py",
    "chars": 3512,
    "preview": "import torch\n\nif hasattr(torch.distributed, 'ReduceOp'):\n    ReduceOp = torch.distributed.ReduceOp\nelif hasattr(torch.di"
  },
  {
    "path": "apex/apex/parallel/distributed.py",
    "chars": 24867,
    "preview": "import torch\nimport torch.distributed as dist\nfrom torch.nn.modules import Module\nfrom torch.autograd import Variable\nfr"
  },
  {
    "path": "apex/apex/parallel/multiproc.py",
    "chars": 884,
    "preview": "import torch\nimport sys\nimport subprocess\n\ndef docstring_hack():\n    \"\"\"\n    Multiproc file which will launch a set of p"
  },
  {
    "path": "apex/apex/parallel/optimized_sync_batchnorm.py",
    "chars": 4246,
    "preview": "import torch\nfrom torch.nn.modules.batchnorm import _BatchNorm\nfrom torch.nn import functional as F\n\nimport syncbn\nfrom "
  },
  {
    "path": "apex/apex/parallel/optimized_sync_batchnorm_kernel.py",
    "chars": 4990,
    "preview": "import torch\nfrom torch.autograd.function import Function\n\nimport syncbn\nfrom apex.parallel import ReduceOp\n\nclass SyncB"
  },
  {
    "path": "apex/apex/parallel/sync_batchnorm.py",
    "chars": 6381,
    "preview": "import torch\nfrom torch.nn.modules.batchnorm import _BatchNorm\nfrom torch.nn import functional as F\n\nfrom .sync_batchnor"
  },
  {
    "path": "apex/apex/parallel/sync_batchnorm_kernel.py",
    "chars": 3761,
    "preview": "import torch\nfrom torch.autograd.function import Function\n\nfrom apex.parallel import ReduceOp\n\n\nclass SyncBatchnormFunct"
  },
  {
    "path": "apex/apex/reparameterization/README.md",
    "chars": 22,
    "preview": "Under construction...\n"
  },
  {
    "path": "apex/apex/reparameterization/__init__.py",
    "chars": 5500,
    "preview": "from .weight_norm import WeightNorm\r\nfrom .reparameterization import Reparameterization\r\n\r\ndef apply_weight_norm(module,"
  },
  {
    "path": "apex/apex/reparameterization/reparameterization.py",
    "chars": 6442,
    "preview": "import torch\r\nfrom torch.nn.parameter import Parameter\r\nimport sys\r\nclass Reparameterization(object):\r\n    \"\"\"\r\n    Clas"
  },
  {
    "path": "apex/apex/reparameterization/weight_norm.py",
    "chars": 3280,
    "preview": "import torch\r\nfrom torch.nn.parameter import Parameter\r\nfrom ..fp16_utils import Fused_Weight_Norm\r\nimport time\r\n\r\nfrom "
  },
  {
    "path": "apex/apex.patch",
    "chars": 2148,
    "preview": "diff --git a/csrc/fused_adam_cuda_kernel.cu b/csrc/fused_adam_cuda_kernel.cu\nindex 34f7aa2..95581d1 100644\n--- a/csrc/fu"
  },
  {
    "path": "apex/csrc/amp_C_frontend.cpp",
    "chars": 1773,
    "preview": "#include <torch/extension.h>\n\nvoid multi_tensor_scale_cuda(\n  int chunk_size,\n  at::Tensor noop_flag,\n  std::vector<std:"
  },
  {
    "path": "apex/csrc/flatten_unflatten.cpp",
    "chars": 584,
    "preview": "#include <torch/extension.h>\n#include <torch/csrc/utils/tensor_flatten.h>\n// https://github.com/pytorch/pytorch/blob/mas"
  },
  {
    "path": "apex/csrc/fused_adam_cuda.cpp",
    "chars": 1623,
    "preview": "#include <torch/extension.h>\n\n// CUDA forward declaration\nvoid fused_adam_cuda(at::Tensor & p, at::Tensor & p_copy, at::"
  },
  {
    "path": "apex/csrc/fused_adam_cuda_kernel.cu",
    "chars": 5019,
    "preview": "#include \"ATen/ATen.h\"\n#include \"ATen/cuda/CUDAContext.h\"\n#include \"ATen/cuda/detail/IndexUtils.cuh\"\n#include <cuda.h>\n#"
  },
  {
    "path": "apex/csrc/layer_norm_cuda.cpp",
    "chars": 6545,
    "preview": "#include <torch/extension.h>\n#include <vector>\n#include <cassert>\n\nnamespace {\nvoid compute_n1_n2(\n    at::Tensor input,"
  },
  {
    "path": "apex/csrc/layer_norm_cuda_kernel.cu",
    "chars": 24501,
    "preview": "#include \"ATen/ATen.h\"\n#include \"ATen/AccumulateType.h\"\n#include \"ATen/cuda/CUDAContext.h\"\n#include <THC/THCDeviceUtils."
  },
  {
    "path": "apex/csrc/multi_tensor_apply.cuh",
    "chars": 4225,
    "preview": "#include <ATen/ATen.h>\n#include <ATen/AccumulateType.h>\n#include <ATen/cuda/CUDAContext.h>\n#include <ATen/cuda/Exception"
  },
  {
    "path": "apex/csrc/multi_tensor_axpby_kernel.cu",
    "chars": 3290,
    "preview": "#include <ATen/ATen.h>\n#include <ATen/AccumulateType.h>\n#include <ATen/cuda/CUDAContext.h>\n#include <ATen/cuda/Exception"
  },
  {
    "path": "apex/csrc/multi_tensor_l2norm_kernel.cu",
    "chars": 4845,
    "preview": "#include <ATen/ATen.h>\n#include <ATen/AccumulateType.h>\n#include <ATen/cuda/CUDAContext.h>\n#include <ATen/cuda/Exception"
  },
  {
    "path": "apex/csrc/multi_tensor_lamb_stage_1.cu",
    "chars": 4183,
    "preview": "#include <ATen/ATen.h>\n#include <ATen/AccumulateType.h>\n#include <ATen/cuda/CUDAContext.h>\n#include <ATen/cuda/Exception"
  },
  {
    "path": "apex/csrc/multi_tensor_lamb_stage_2.cu",
    "chars": 2861,
    "preview": "#include <ATen/ATen.h>\n#include <ATen/AccumulateType.h>\n#include <ATen/cuda/CUDAContext.h>\n#include <ATen/cuda/Exception"
  },
  {
    "path": "apex/csrc/multi_tensor_scale_kernel.cu",
    "chars": 3073,
    "preview": "#include <ATen/ATen.h>\n#include <ATen/AccumulateType.h>\n#include <ATen/cuda/CUDAContext.h>\n#include <ATen/cuda/Exception"
  },
  {
    "path": "apex/csrc/syncbn.cpp",
    "chars": 5440,
    "preview": "#include <torch/extension.h>\n#include <ATen/ATen.h>\n\n#include <vector>\n\n// returns {mean,biased_var}\n// implemented usin"
  },
  {
    "path": "apex/csrc/type_shim.h",
    "chars": 2986,
    "preview": "#include <ATen/ATen.h>\n\n// Forward/backward compatiblity hack around\n// https://github.com/pytorch/pytorch/commit/3aeb78"
  },
  {
    "path": "apex/csrc/welford.cu",
    "chars": 48233,
    "preview": "#include <iostream>\n#include <ATen/ATen.h>\n#include <ATen/AccumulateType.h>\n#include <ATen/cuda/CUDAContext.h>\n\n#include"
  },
  {
    "path": "apex/docs/Makefile",
    "chars": 947,
    "preview": "# Minimal makefile for Sphinx documentation\n#\n\n# You can set these variables from the command line.\nSPHINXOPTS    =\nSPHI"
  },
  {
    "path": "apex/docs/source/_static/css/pytorch_theme.css",
    "chars": 2290,
    "preview": "body {\n    font-family: \"Lato\",\"proxima-nova\",\"Helvetica Neue\",Arial,sans-serif;\n}\n\n/* Default header fonts are ugly */\n"
  },
  {
    "path": "apex/docs/source/_templates/layout.html",
    "chars": 1019,
    "preview": "{% extends \"!layout.html\" %}\n  {% block sidebartitle %} {{ super() }}\n\n  <style>\n    /* Sidebar header (and topbar for m"
  },
  {
    "path": "apex/docs/source/advanced.rst",
    "chars": 9938,
    "preview": ".. role:: hidden\n    :class: hidden-section\n\nAdvanced Amp Usage\n===================================\n\nGANs\n----\n\nGANs are"
  },
  {
    "path": "apex/docs/source/amp.rst",
    "chars": 11259,
    "preview": ".. role:: hidden\n    :class: hidden-section\n\napex.amp\n===================================\n\nThis page documents the updat"
  },
  {
    "path": "apex/docs/source/conf.py",
    "chars": 8119,
    "preview": "#!/usr/bin/env python3\n# -*- coding: utf-8 -*-\n#\n# PyTorch documentation build configuration file, created by\n# sphinx-q"
  },
  {
    "path": "apex/docs/source/fp16_utils.rst",
    "chars": 1962,
    "preview": ".. role:: hidden\n    :class: hidden-section\n\napex.fp16_utils\n===================================\n\nThis submodule contain"
  },
  {
    "path": "apex/docs/source/index.rst",
    "chars": 1300,
    "preview": ".. PyTorch documentation master file, created by\n   sphinx-quickstart on Fri Dec 23 13:31:47 2016.\n   You can adapt this"
  },
  {
    "path": "apex/docs/source/layernorm.rst",
    "chars": 264,
    "preview": ".. role:: hidden\n    :class: hidden-section\n\napex.normalization.fused_layer_norm\n===================================\n\n.."
  },
  {
    "path": "apex/docs/source/optimizers.rst",
    "chars": 233,
    "preview": ".. role:: hidden\n    :class: hidden-section\n\napex.optimizers\n===================================\n\n.. automodule:: apex.o"
  },
  {
    "path": "apex/docs/source/parallel.rst",
    "chars": 431,
    "preview": ".. role:: hidden\n    :class: hidden-section\n\napex.parallel\n===================================\n\n.. automodule:: apex.par"
  },
  {
    "path": "apex/examples/README.md",
    "chars": 446,
    "preview": "This directory contains examples illustrating Apex mixed precision and distributed tools.\n\n**Note for users of the pre-u"
  },
  {
    "path": "apex/examples/dcgan/README.md",
    "chars": 22,
    "preview": "Under construction...\n"
  },
  {
    "path": "apex/examples/docker/Dockerfile",
    "chars": 760,
    "preview": "# Base image must at least have pytorch and CUDA installed.\nARG BASE_IMAGE=nvcr.io/nvidia/pytorch:19.03-py3\nFROM $BASE_I"
  },
  {
    "path": "apex/examples/docker/README.md",
    "chars": 2032,
    "preview": "## Option 1:  Create a new container with Apex\n\n**Dockerfile** installs the latest Apex on top of an existing image.  Ru"
  },
  {
    "path": "apex/examples/imagenet/README.md",
    "chars": 8920,
    "preview": "# Mixed Precision ImageNet Training in PyTorch\n\n`main_amp.py` is based on [https://github.com/pytorch/examples/tree/mast"
  },
  {
    "path": "apex/examples/imagenet/main_amp.py",
    "chars": 19876,
    "preview": "import argparse\nimport os\nimport shutil\nimport time\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.parallel\nimport "
  },
  {
    "path": "apex/examples/simple/distributed/README.md",
    "chars": 762,
    "preview": "**distributed_data_parallel.py** and **run.sh** show an example using Amp with\n[apex.parallel.DistributedDataParallel](h"
  },
  {
    "path": "apex/examples/simple/distributed/distributed_data_parallel.py",
    "chars": 2548,
    "preview": "import torch\nimport argparse\nimport os\nfrom apex import amp\n# FOR DISTRIBUTED: (can also use torch.nn.parallel.Distribut"
  },
  {
    "path": "apex/examples/simple/distributed/run.sh",
    "chars": 95,
    "preview": "#!/bin/bash\npython -m torch.distributed.launch --nproc_per_node=2 distributed_data_parallel.py\n"
  },
  {
    "path": "apex/setup.py",
    "chars": 5844,
    "preview": "import torch\nfrom setuptools import setup, find_packages\nimport subprocess\n\nimport sys\n\nif not torch.cuda.is_available()"
  },
  {
    "path": "apex/tests/L0/run_amp/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "apex/tests/L0/run_amp/test_add_param_group.py",
    "chars": 5831,
    "preview": "import unittest\n\nimport functools as ft\nimport itertools as it\n\nfrom apex import amp\nfrom apex.amp import _amp_state\nimp"
  },
  {
    "path": "apex/tests/L0/run_amp/test_basic_casts.py",
    "chars": 5782,
    "preview": "import unittest\n\nimport functools as ft\nimport itertools as it\n\nfrom apex import amp\nimport torch\nfrom torch import nn\ni"
  },
  {
    "path": "apex/tests/L0/run_amp/test_cache.py",
    "chars": 4833,
    "preview": "import unittest\n\nimport functools as ft\nimport itertools as it\n\nfrom apex import amp\nfrom apex.amp import _amp_state\nimp"
  },
  {
    "path": "apex/tests/L0/run_amp/test_multi_tensor_axpby.py",
    "chars": 4700,
    "preview": "import unittest\n\nimport functools as ft\nimport itertools as it\n\nfrom apex import amp\nimport torch\nfrom torch import nn\ni"
  },
  {
    "path": "apex/tests/L0/run_amp/test_multi_tensor_l2norm.py",
    "chars": 2719,
    "preview": "import unittest\n\nimport functools as ft\nimport itertools as it\n\nfrom apex import amp\nimport torch\nfrom torch import nn\ni"
  },
  {
    "path": "apex/tests/L0/run_amp/test_multi_tensor_scale.py",
    "chars": 4573,
    "preview": "import unittest\n\nimport functools as ft\nimport itertools as it\n\nfrom apex import amp\nimport torch\nfrom torch import nn\ni"
  },
  {
    "path": "apex/tests/L0/run_amp/test_multiple_models_optimizers_losses.py",
    "chars": 36499,
    "preview": "import unittest\n\nimport functools as ft\nimport itertools as it\n\nfrom apex import amp\nfrom apex.amp import _amp_state\nimp"
  },
  {
    "path": "apex/tests/L0/run_amp/test_promotion.py",
    "chars": 2558,
    "preview": "import unittest\n\nimport itertools as it\n\nfrom apex import amp\nimport torch\nfrom torch import nn\nimport torch.nn.function"
  },
  {
    "path": "apex/tests/L0/run_amp/test_rnn.py",
    "chars": 4506,
    "preview": "import unittest\n\nfrom apex import amp\nimport random\nimport torch\nfrom torch import nn\n\nfrom utils import common_init, HA"
  },
  {
    "path": "apex/tests/L0/run_amp/utils.py",
    "chars": 512,
    "preview": "import torch\n\nHALF = 'torch.cuda.HalfTensor'\nFLOAT = 'torch.cuda.FloatTensor'\n\nDTYPES = [torch.half, torch.float]\n\nALWAY"
  },
  {
    "path": "apex/tests/L0/run_fp16util/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "apex/tests/L0/run_fp16util/test_fp16util.py",
    "chars": 2051,
    "preview": "import unittest\n\nimport torch\nimport torch.nn as nn\n\nfrom apex.fp16_utils import FP16Model\n\n\nclass DummyBlock(nn.Module)"
  },
  {
    "path": "apex/tests/L0/run_fused_layer_norm/test_fused_layer_norm.py",
    "chars": 1279,
    "preview": "import unittest\nimport os\nimport random\n\nimport torch\nimport apex\n\n        \nclass TestFusedLayerNorm(unittest.TestCase):"
  },
  {
    "path": "apex/tests/L0/run_mixed_adam/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "apex/tests/L0/run_mixed_adam/test_fp16_optimizer.py",
    "chars": 5318,
    "preview": "import unittest\nimport torch\nimport apex\n\nclass TestFP16Optimizer(unittest.TestCase):\n    def setUp(self, max_abs_diff=1"
  },
  {
    "path": "apex/tests/L0/run_mixed_adam/test_mixed_adam.py",
    "chars": 6970,
    "preview": "import unittest\nimport os\nimport random\n\nimport torch\nimport apex\n\nclass TestFusedAdam(unittest.TestCase):\n    def setUp"
  },
  {
    "path": "apex/tests/L0/run_test.py",
    "chars": 407,
    "preview": "import unittest\nimport sys\n\ntest_dirs = [\"run_amp\", \"run_fp16util\", \"run_mixed_adam\", \"run_fused_layer_norm\"]\n\nrunner = "
  },
  {
    "path": "apex/tests/L1/common/compare.py",
    "chars": 2231,
    "preview": "import argparse\nimport torch\n\nparser = argparse.ArgumentParser(description='Compare')\nparser.add_argument('--opt-level',"
  },
  {
    "path": "apex/tests/L1/common/main_amp.py",
    "chars": 19169,
    "preview": "import argparse\nimport os\nimport shutil\nimport time\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.parallel\nimport "
  },
  {
    "path": "apex/tests/L1/common/run_test.sh",
    "chars": 3543,
    "preview": "#!/bin/bash\n\nprint_banner() {\n  printf \"\\n\\n\\n\\e[30m\\e[42m$1\\e[0m\\n\\n\\n\\n\"\n}\n\nprint_banner \"Distributed status:  $1\"\n\nec"
  },
  {
    "path": "apex/tests/L1/cross_product/run.sh",
    "chars": 213,
    "preview": "#!/bin/bash\n\nDATADIR=\"/home/mcarilli/Desktop/pt18data/apex_stale/examples/imagenet/bare_metal_train_val/\"\n# DATADIR=\"/op"
  },
  {
    "path": "apex/tests/L1/cross_product_distributed/run.sh",
    "chars": 62,
    "preview": "#!/bin/bash\n\ncp ../common/* .\nbash run_test.sh distributed $1\n"
  },
  {
    "path": "apex/tests/distributed/DDP/ddp_race_condition_test.py",
    "chars": 2326,
    "preview": "import torch\nimport torch.distributed as dist\nfrom torch.nn import Parameter\nfrom torch.nn import Module\nfrom apex.paral"
  },
  {
    "path": "apex/tests/distributed/DDP/run_race_test.sh",
    "chars": 119,
    "preview": "#!/bin/bash\n\nCUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 ddp_race_condition_test.py\n"
  },
  {
    "path": "apex/tests/distributed/amp_master_params/amp_master_params.py",
    "chars": 2799,
    "preview": "import torch\nimport argparse\nimport os\nfrom apex import amp\n# FOR DISTRIBUTED: (can also use torch.nn.parallel.Distribut"
  },
  {
    "path": "apex/tests/distributed/amp_master_params/compare.py",
    "chars": 1531,
    "preview": "import torch\n\nmodel_params_rank0 = torch.load(\"rank0model.pth\",\n                           map_location = lambda storage"
  },
  {
    "path": "apex/tests/distributed/amp_master_params/run.sh",
    "chars": 106,
    "preview": "#!/bin/bash\npython -m torch.distributed.launch --nproc_per_node=2 amp_master_params.py\n\npython compare.py\n"
  },
  {
    "path": "apex/tests/distributed/synced_batchnorm/single_gpu_unit_test.py",
    "chars": 7002,
    "preview": "import torch\nimport numpy as np\nimport apex\nif True:\n    print(\"using setup tools\")\n    import syncbn\nelse:\n    print(\"u"
  },
  {
    "path": "apex/tests/distributed/synced_batchnorm/test_groups.py",
    "chars": 6690,
    "preview": "import torch\nimport numpy as np\nimport apex\nimport syncbn\nimport os\nimport argparse\nimport torch.optim as optim\n\ndef com"
  },
  {
    "path": "apex/tests/distributed/synced_batchnorm/two_gpu_unit_test.py",
    "chars": 6259,
    "preview": "import torch\nimport numpy as np\nimport apex\nimport syncbn\nimport os\nimport argparse\nimport torch.optim as optim\n\ndef com"
  },
  {
    "path": "apex/tests/distributed/synced_batchnorm/unit_test.sh",
    "chars": 350,
    "preview": "python single_gpu_unit_test.py\npython -m torch.distributed.launch --nproc_per_node=2 two_gpu_unit_test.py\npython -m torc"
  },
  {
    "path": "apex/tests/docker_extension_builds/run.sh",
    "chars": 1686,
    "preview": "#!/bin/bash\n\nprint_banner() {\n  printf \"\\n\\n\\n\\e[30m\\e[42m$1\\e[0m\\n\\n\\n\\n\"\n}\n\nprint_green() {\n  printf \"\\e[30m\\e[42m$1\\e"
  },
  {
    "path": "jukebox/Interacting_with_Jukebox.ipynb",
    "chars": 31408,
    "preview": "{\n  \"nbformat\": 4,\n  \"nbformat_minor\": 0,\n  \"metadata\": {\n    \"colab\": {\n      \"name\": \"Interacting with Jukebox\",\n     "
  },
  {
    "path": "jukebox/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "jukebox/align.py",
    "chars": 4072,
    "preview": "\"\"\"\nGet alignment from attn values\n1. run a forward pass on each hop, get attn values\n2. concat for all hops\n\"\"\"\nimport "
  },
  {
    "path": "jukebox/data/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "jukebox/data/artist_genre_processor.py",
    "chars": 3520,
    "preview": "import os\nimport re\n\naccepted = frozenset([chr(i) for i in range(ord('a'), ord('z') + 1)] +\n                     [chr(i)"
  },
  {
    "path": "jukebox/data/data_processor.py",
    "chars": 3167,
    "preview": "import torch as t\nimport jukebox.utils.dist_adapter as dist\nfrom torch.utils.data.distributed import DistributedSampler\n"
  },
  {
    "path": "jukebox/data/files_dataset.py",
    "chars": 4969,
    "preview": "import librosa\nimport math\nimport numpy as np\nimport jukebox.utils.dist_adapter as dist\nfrom torch.utils.data import Dat"
  },
  {
    "path": "jukebox/data/ids/v2_artist_ids.txt",
    "chars": 71676,
    "preview": "unknown;0\nvarious;0\n;0\nandr_s_schiff;1\nsonny_terry;2\nnelly;3\nmarkus_schulz;4\nmodest_petrovich_mussorgsky;5\notis_redding;"
  },
  {
    "path": "jukebox/data/ids/v2_genre_ids.txt",
    "chars": 1216,
    "preview": "unknown;0\nclassical;1\nblues;2\nhip;3\nhop;4\ndance;5\nsoul;6\nhard;7\nrock;8\njazz;9\nreggae;10\ncountry;11\nalternative;12\nsoundt"
  },
  {
    "path": "jukebox/data/ids/v3_artist_ids.txt",
    "chars": 140835,
    "preview": "beat farmers;1\naaron sprinkle;2\ndianne reeves;3\nlowe;4\nharry manx;5\nhail of bullets;6\nian gillan;7\nandraé crouch;8\nwides"
  },
  {
    "path": "jukebox/data/ids/v3_genre_ids.txt",
    "chars": 9336,
    "preview": "electroclash;1\nacid rock;2\nchristian metal;3\npop rock;4\ngothic;5\nbig beat;6\npsychedelic rock‎;7\nfunk carioca;8\nbebop;9\nd"
  },
  {
    "path": "jukebox/data/labels.py",
    "chars": 5995,
    "preview": "import torch as t\nimport numpy as np\nfrom jukebox.data.artist_genre_processor import ArtistGenreProcessor\nfrom jukebox.d"
  },
  {
    "path": "jukebox/data/text_processor.py",
    "chars": 1256,
    "preview": "import re\nfrom unidecode import unidecode\n\nclass TextProcessor():\n    def __init__(self, v3=False):\n        if v3:\n     "
  },
  {
    "path": "jukebox/hparams.py",
    "chars": 12034,
    "preview": "HPARAMS_REGISTRY = {}\nDEFAULTS = {}\n\nclass Hyperparams(dict):\n    def __getattr__(self, attr):\n        return self[attr]"
  },
  {
    "path": "jukebox/lyricdict.py",
    "chars": 20924,
    "preview": "# Poems\npoems = {\n'ozymandias': '''\nI met a traveller from an antique land,\nWho said—“Two vast and trunkless legs of sto"
  },
  {
    "path": "jukebox/make_models.py",
    "chars": 11894,
    "preview": "\"\"\"\nMake model classes\nLoad from checkpoints\nTest on dummy outputs to see if everything matches\n\"\"\"\nimport os\nimport num"
  },
  {
    "path": "jukebox/prior/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "jukebox/prior/autoregressive.py",
    "chars": 18151,
    "preview": "import numpy as np\nimport torch as t\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom jukebox.transformer.ops"
  },
  {
    "path": "jukebox/prior/conditioners.py",
    "chars": 7283,
    "preview": "import torch as t\nimport torch.nn as nn\n\nfrom jukebox.transformer.ops import LayerNorm\nfrom jukebox.vqvae.encdec import "
  },
  {
    "path": "jukebox/prior/prior.py",
    "chars": 17788,
    "preview": "import numpy as np\nimport torch as t\nimport torch.nn as nn\nimport jukebox.utils.dist_adapter as dist\n\nfrom jukebox.trans"
  },
  {
    "path": "jukebox/sample.py",
    "chars": 12280,
    "preview": "import os\nimport torch as t\nimport jukebox.utils.dist_adapter as dist\n\nfrom jukebox.hparams import Hyperparams\nfrom juke"
  },
  {
    "path": "jukebox/save_html.py",
    "chars": 6136,
    "preview": "import os\nimport json\nimport numpy as np\nfrom PIL import Image, ImageFilter\nimport soundfile\n\ndef save_html(logdir, x, z"
  },
  {
    "path": "jukebox/tests/test_sample.py",
    "chars": 5750,
    "preview": "import torch as t\nimport numpy as np\nfrom jukebox.sample import sample_level\nfrom jukebox.utils.torch_utils import asser"
  },
  {
    "path": "jukebox/train.py",
    "chars": 12919,
    "preview": "\"\"\"\nAbility to train vq-vae and prior\nFirst try for random inputs\nThen from maestros\n\"\"\"\nimport sys\nimport fire\nimport w"
  },
  {
    "path": "jukebox/transformer/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "jukebox/transformer/factored_attention.py",
    "chars": 24071,
    "preview": "# Factored attention\nimport math\nimport numpy as np\nimport torch as t\nimport torch.nn as nn\nimport torch.nn.functional a"
  },
  {
    "path": "jukebox/transformer/ops.py",
    "chars": 5176,
    "preview": "import math\nimport numpy as np\nimport torch as t\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n# Import FusedLa"
  },
  {
    "path": "jukebox/transformer/transformer.py",
    "chars": 10927,
    "preview": "import functools\nimport numpy as np\nimport torch as t\nimport torch.nn as nn\nimport jukebox.utils.dist_adapter as dist\n\nf"
  },
  {
    "path": "jukebox/utils/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "jukebox/utils/audio_utils.py",
    "chars": 5039,
    "preview": "import numpy as np\nimport torch as t\nimport jukebox.utils.dist_adapter as dist\nimport soundfile\nimport librosa\nfrom juke"
  },
  {
    "path": "jukebox/utils/checkpoint.py",
    "chars": 1245,
    "preview": "# Simple gradient checkpointing. Works with distributed data parallel\nimport torch as t\n\ndef checkpoint(func, inputs, pa"
  },
  {
    "path": "jukebox/utils/dist_adapter.py",
    "chars": 1935,
    "preview": "import torch.distributed as dist\nfrom enum import Enum\n\nclass ReduceOp(Enum):\n    SUM = 0,\n    PRODUCT = 1,\n    MIN = 2,"
  },
  {
    "path": "jukebox/utils/dist_utils.py",
    "chars": 3550,
    "preview": "import os\nfrom time import sleep\nimport torch\nimport jukebox.utils.dist_adapter as dist\n\ndef print_once(msg):\n    if (no"
  },
  {
    "path": "jukebox/utils/ema.py",
    "chars": 3361,
    "preview": "import torch\nfrom torch._utils import _flatten_dense_tensors\nimport numpy as np\n\n# EMA always in float, as accumulation "
  },
  {
    "path": "jukebox/utils/fp16.py",
    "chars": 11158,
    "preview": "# Utils for fp16 training.\nimport importlib\nimport math\nimport numpy as np\nimport torch\nimport jukebox.utils.dist_adapte"
  },
  {
    "path": "jukebox/utils/io.py",
    "chars": 5271,
    "preview": "import numpy as np\nimport av\nimport torch as t\nimport jukebox.utils.dist_adapter as dist\n\ndef get_duration_sec(file, cac"
  },
  {
    "path": "jukebox/utils/logger.py",
    "chars": 4444,
    "preview": "import torch as t\nimport jukebox.utils.dist_adapter as dist\nfrom tqdm import tqdm\nfrom datetime import date\nimport os\nim"
  },
  {
    "path": "jukebox/utils/remote_utils.py",
    "chars": 1333,
    "preview": "import sys\nimport subprocess\n\ndef download(remote_path, local_path, async_download=False):\n    args = ['wget', '-O', loc"
  },
  {
    "path": "jukebox/utils/sample_utils.py",
    "chars": 849,
    "preview": "import torch as t\n\ndef split_batch(obj, n_samples, split_size):\n    n_passes = (n_samples + split_size - 1) // split_siz"
  },
  {
    "path": "jukebox/utils/torch_utils.py",
    "chars": 758,
    "preview": "import gc\nimport torch as t\n\ndef freeze_model(model):\n    model.eval()\n    for params in model.parameters():\n        par"
  },
  {
    "path": "jukebox/vqvae/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "jukebox/vqvae/bottleneck.py",
    "chars": 8718,
    "preview": "import numpy as np\nimport torch as t\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport jukebox.utils.dist_ada"
  },
  {
    "path": "jukebox/vqvae/encdec.py",
    "chars": 5568,
    "preview": "import torch as t\nimport torch.nn as nn\nfrom jukebox.vqvae.resnet import Resnet, Resnet1D\nfrom jukebox.utils.torch_utils"
  },
  {
    "path": "jukebox/vqvae/resnet.py",
    "chars": 2653,
    "preview": "import math\nimport torch.nn as nn\nimport jukebox.utils.dist_adapter as dist\nfrom jukebox.utils.checkpoint import checkpo"
  },
  {
    "path": "jukebox/vqvae/vqvae.py",
    "chars": 8831,
    "preview": "import numpy as np\nimport torch as t\nimport torch.nn as nn\n\nfrom jukebox.vqvae.encdec import Encoder, Decoder, assert_sh"
  },
  {
    "path": "requirements.txt",
    "chars": 108,
    "preview": "fire==0.1.3\ntqdm==4.45.0\nsoundfile==0.10.3.post1\nunidecode==1.1.1\nnumba==0.48.0\nlibrosa==0.7.2\nmpi4py>=3.0.0"
  },
  {
    "path": "setup.py",
    "chars": 438,
    "preview": "import os\n\nimport pkg_resources\nfrom setuptools import setup, find_packages\n\nsetup(\n    name=\"jukebox\",\n    py_modules=["
  },
  {
    "path": "tensorboardX/.codecov.yml",
    "chars": 250,
    "preview": "coverage:\n  status:\n    project:                   # measuring the overall project coverage\n      default:              "
  },
  {
    "path": "tensorboardX/.flake8",
    "chars": 118,
    "preview": "[flake8]\nmax-line-length = 120\nignore = E305,E402,E721,E741,F401,F403,F405,F821,F841,F999\nexclude = tensorboardX/proto"
  },
  {
    "path": "tensorboardX/.github/ISSUE_TEMPLATE/bug_report.md",
    "chars": 734,
    "preview": "---\nname: Bug report\nabout: Create bug report\ntitle: ''\nlabels: ''\nassignees: ''\n\n---\n\n**Describe the bug**\nA clear and "
  },
  {
    "path": "tensorboardX/.github/ISSUE_TEMPLATE/feature-requests-or-general-questions.md",
    "chars": 135,
    "preview": "---\nname: Feature requests or General questions\nabout: Feature requests or general questions\ntitle: ''\nlabels: ''\nassign"
  },
  {
    "path": "tensorboardX/.gitignore",
    "chars": 87,
    "preview": "proto_src/\nprotoc-*.zip\nprotoc/\n__pycache__\ndocs/_*\nbuild\ndist\n*.egg-info\nruns/*\n*.pyc\n"
  },
  {
    "path": "tensorboardX/.travis.yml",
    "chars": 2545,
    "preview": "dist: xenial\nlanguage: python\npython:\n  # We don't actually use the Travis Python, but this keeps it organized.\n  - \"2.7"
  },
  {
    "path": "tensorboardX/HISTORY.rst",
    "chars": 3038,
    "preview": "History\n=======\n1.8 (2019-07-05)\n-----------------\n* Draw label text on image with bounding box provided.\n* crc32c speed"
  },
  {
    "path": "tensorboardX/LICENSE",
    "chars": 1070,
    "preview": "MIT License\n\nCopyright (c) 2017 Tzu-Wei Huang\n\nPermission is hereby granted, free of charge, to any person obtaining a c"
  },
  {
    "path": "tensorboardX/MANIFEST.in",
    "chars": 189,
    "preview": "include HISTORY.rst\ninclude LICENSE\ninclude compile.sh\nrecursive-include tensorboardX/proto *\nrecursive-exclude test *\nr"
  },
  {
    "path": "tensorboardX/README.md",
    "chars": 4037,
    "preview": "# tensorboardX\n\n[![Build Status](https://travis-ci.org/lanpa/tensorboardX.svg?branch=master)](https://travis-ci.org/lanp"
  },
  {
    "path": "tensorboardX/compile.sh",
    "chars": 1392,
    "preview": "#!/bin/bash\n\n# Exit on error\n# set -e\n\nDESIRED_PROTO_VERSION=\"3.6.1\"\n\n# call protoc direclty, if version is not the desi"
  },
  {
    "path": "tensorboardX/docs/Makefile",
    "chars": 609,
    "preview": "# Minimal makefile for Sphinx documentation\n#\n\n# You can set these variables from the command line.\nSPHINXOPTS    =\nSPHI"
  },
  {
    "path": "tensorboardX/docs/conf.py",
    "chars": 5419,
    "preview": "#!/usr/bin/env python3\n# -*- coding: utf-8 -*-\n#\n# tensorboardX documentation build configuration file, created by\n# sph"
  },
  {
    "path": "tensorboardX/docs/index.rst",
    "chars": 509,
    "preview": ".. tensorboardX documentation master file, created by\n   sphinx-quickstart on Wed Aug  9 01:38:01 2017.\n   You can adapt"
  },
  {
    "path": "tensorboardX/docs/tensorboard.rst",
    "chars": 224,
    "preview": "tensorboardX\n===================================\n.. automodule:: tensorboardX\n\n.. autoclass:: SummaryWriter\n    :members"
  },
  {
    "path": "tensorboardX/docs/tutorial.rst",
    "chars": 7601,
    "preview": "Tutorials\n*********\n\nWhat is tensorboard X?\n----------------------\n\nAt first, the package was named tensorboard, and soo"
  },
  {
    "path": "tensorboardX/docs/tutorial_zh.rst",
    "chars": 4433,
    "preview": "Tutorials_zh\n*************\n\n緣起\n------\nGoogle TensorFlow 附加的工具 Tensorboard 是一個很好用的視覺化工具。他可以記錄數字，影像或者是聲音資訊，對於觀察類神經網路訓練的過程非"
  },
  {
    "path": "tensorboardX/docs/utils.rst",
    "chars": 105,
    "preview": "Helper functions\n===================================\n.. autofunction:: tensorboardX.utils.figure_to_image"
  },
  {
    "path": "tensorboardX/examples/RUN_AFTER_PIP_INSTALL",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "tensorboardX/examples/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "tensorboardX/examples/chainer/extension_logger/net.py",
    "chars": 3623,
    "preview": "#!/usr/bin/env python\n\nfrom __future__ import print_function\n\nimport numpy\n\nimport chainer\nfrom chainer import cuda\nimpo"
  },
  {
    "path": "tensorboardX/examples/chainer/extension_logger/train_dcgan.py",
    "chars": 4734,
    "preview": "#!/usr/bin/env python\n\nfrom __future__ import print_function\nimport argparse\nimport os\n\nimport chainer\nfrom chainer impo"
  },
  {
    "path": "tensorboardX/examples/chainer/extension_logger/updater.py",
    "chars": 1441,
    "preview": "#!/usr/bin/env python\n\nfrom __future__ import print_function\n\nimport chainer\nimport chainer.functions as F\nfrom chainer "
  },
  {
    "path": "tensorboardX/examples/chainer/extension_logger/visualize.py",
    "chars": 575,
    "preview": "#!/usr/bin/env python\n\nimport os\n\nimport numpy as np\nfrom PIL import Image\n\nimport chainer\nimport chainer.cuda\nfrom chai"
  },
  {
    "path": "tensorboardX/examples/chainer/extension_logger/writetensorboard.py",
    "chars": 5278,
    "preview": "import json\nimport os\nimport shutil\nimport tempfile\n\nimport six\nfrom chainer import reporter\nfrom chainer import seriali"
  }
]

// ... and 119 more files (download for full content)

About this extraction

This page contains the full source code of the openai/jukebox GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 319 files (1.7 MB), approximately 499.5k tokens, and a symbol index with 1379 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo