Repository: for-ai/TD Branch: master Commit: 877e3b9d1491 Files: 50 Total size: 136.8 KB Directory structure: gitextract_tcky92iz/ ├── .gitignore ├── .travis.yml ├── README.md ├── __init__.py ├── data/ │ ├── __init__.py │ ├── data_generators/ │ │ ├── __init__.py │ │ ├── cifar_generator.py │ │ ├── generator_utils.py │ │ └── mnist_generator.py │ ├── dataset_maps.py │ ├── image_reader.py │ ├── imagenet_augs.py │ └── registry.py ├── hparams/ │ ├── __init__.py │ ├── basic.py │ ├── defaults.py │ ├── lenet.py │ ├── registry.py │ ├── resnet.py │ ├── user.py │ ├── utils.py │ └── vgg.py ├── models/ │ ├── __init__.py │ ├── basic/ │ │ ├── __init__.py │ │ └── basic.py │ ├── lenet/ │ │ ├── __init__.py │ │ └── lenet.py │ ├── registry.py │ ├── resnet/ │ │ ├── __init__.py │ │ └── resnet.py │ ├── utils/ │ │ ├── __init__.py │ │ ├── activations.py │ │ ├── dropouts.py │ │ ├── initializations.py │ │ ├── model_utils.py │ │ └── optimizers.py │ └── vgg/ │ ├── __init__.py │ └── vgg.py ├── requirements.txt ├── scripts/ │ ├── __init__.py │ └── prune/ │ ├── README.md │ ├── __init__.py │ ├── eval.py │ └── prune.py ├── train.py └── training/ ├── __init__.py ├── envs.py ├── flags.py ├── lr_schemes.py └── tpu.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # dotenv .env # virtualenv .venv venv/ ENV/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ tmp runs run # PyCharm .idea/ # macOS metadata .DS_Store .vscode ================================================ FILE: .travis.yml ================================================ language: python python: - "3.6" # command to install dependencies install: - pip install -r requirements.txt # command to run tests script: - export FILES="$(git diff --name-only $TRAVIS_COMMIT_RANGE)" - cd /home/travis/build/for-ai - python3 -m TD.train --eval_steps 1 --eval_every 1 --train_epochs -1 --env local --hparams mnist_basic_no_dropout - python3 -m TD.train --eval_steps 1 --eval_every 1 --train_epochs -1 --env local --hparams cifar_lenet - python3 -m TD.train --eval_steps 1 --eval_every 1 --train_epochs -1 --env local --hparams cifar_lenet_weight - python3 -m TD.train --eval_steps 1 --eval_every 1 --train_epochs -1 --env local --hparams cifar_lenet_trgtd_weight - python3 -m TD.train --eval_steps 1 --eval_every 1 --train_epochs -1 --env local --hparams cifar_lenet_unit - python3 -m TD.train --eval_steps 1 --eval_every 1 --train_epochs -1 --env local --hparams cifar_lenet_trgtd_unit - python3 -m TD.train --eval_steps 1 --eval_every 1 --train_epochs -1 --env local --hparams cifar10_resnet32 - python3 -m TD.train --eval_steps 1 --eval_every 1 --train_epochs -1 --env local --hparams cifar10_resnet32_weight - python3 -m TD.train --eval_steps 1 --eval_every 1 --train_epochs -1 --env local --hparams cifar10_resnet32_trgtd_weight - python3 -m TD.train --eval_steps 1 --eval_every 1 --train_epochs -1 --env local --hparams cifar10_resnet32_unit - python3 -m TD.train --eval_steps 1 --eval_every 1 --train_epochs -1 --env local --hparams cifar10_resnet32_trgtd_unit - python3 -m TD.train --eval_steps 1 --eval_every 1 --train_epochs -1 --env local --hparams cifar100_vgg16_no_dropout - python3 -m TD.train --eval_steps 1 --eval_every 1 --train_epochs -1 --env local --hparams cifar100_vgg16_untargeted_dropout - python3 -m TD.train --eval_steps 1 --eval_every 1 --train_epochs -1 --env local --hparams cifar100_vgg16_targeted_dropout - python3 -m TD.train --eval_steps 1 --eval_every 1 --train_epochs -1 --env local --hparams cifar100_vgg16_untargeted_unit_dropout - python3 -m TD.train --eval_steps 1 --eval_every 1 --train_epochs -1 --env local --hparams cifar100_vgg16_targeted_unit_dropout ================================================ FILE: README.md ================================================ # Targeted Dropout Aidan N. Gomez, Ivan Zhang, Kevin Swersky, Yarin Gal, and Geoffrey E. Hinton ## Table of Contents - [Requirements](#requirements) - [Quick Start](#quick-start) - [Experiments](#experiments) ## Requirements - Python 3 - Tensorflow 1.8 ## Quick Start 1. Train a model: `python -m TD.train --hparams=resnet_default` 2. 
Prune that model: `python -m TD.scripts.prune.eval --hparams=resnet_default --prune_percent 0.0,0.25,0.5,0.75,0.95`

### Flags

- `--env`: one of `local`, `gcp` (GPU instances), or `tpu` (TPU instances). Feel free to add more if necessary.
- `--hparams`: the hparam set you want to run.
- `--hparam_override`: manually specify hparams to be overridden (e.g. `--hparam_override 'drop_rate=0.66'`)


================================================
FILE: __init__.py
================================================
__all__ = ["data", "hparams", "models", "training"]

from .data import *
from .hparams import *
from .models import *
from .training import *


================================================
FILE: data/__init__.py
================================================
__all__ = [
    "image_reader",
    "registry",
    "dataset_maps",
]


================================================
FILE: data/data_generators/__init__.py
================================================
__all__ = [
    "cifar_generator",
    "generator_utils",
    "mnist_generator",
]


================================================
FILE: data/data_generators/cifar_generator.py
================================================
try:
  import cPickle
except ImportError:
  import pickle as cPickle

import os
import random
import sys
import tarfile
import urllib.request

import numpy as np
import tensorflow as tf

from .generator_utils import generate_files
from ...models.utils.model_utils import ModeKeys

FLAGS = tf.app.flags.FLAGS

_URL = "http://www.cs.toronto.edu/~kriz/"

_CIFAR10_TAR = "cifar-10-python.tar.gz"
_CIFAR10_DIR = "cifar-10-batches-py"
_CIFAR10_TRAIN = [
    "data_batch_1", "data_batch_2", "data_batch_3", "data_batch_4",
    "data_batch_5"
]
_CIFAR10_TEST = ["test_batch"]

_CIFAR100_TAR = "cifar-100-python.tar.gz"
_CIFAR100_DIR = "cifar-100-python"
_CIFAR100_TRAIN = ["train"]
_CIFAR100_TEST = ["test"]

_WORKING_DIR = "/tmp/tf_data"


def download(v100):
  archive = _CIFAR100_TAR if v100 else _CIFAR10_TAR
  filepath = os.path.join(_WORKING_DIR, archive)
  if not os.path.exists(_WORKING_DIR):
    os.makedirs(_WORKING_DIR)

  url = _URL + archive
  if not os.path.isfile(filepath):
    print("Downloading " + url)
    urllib.request.urlretrieve(url, filepath)

  print("Extracting " + filepath)
  tar = tarfile.open(filepath, "r:gz")
  tar.extractall(path=_WORKING_DIR)
  tar.close()


def maybe_download(files, v100):
  for file in files:
    filepath = os.path.join(_WORKING_DIR,
                            _CIFAR100_DIR if v100 else _CIFAR10_DIR, file)
    if not os.path.isfile(filepath):
      download(v100)
      break


def read_files(files, v100):
  images = None
  labels = None
  for file in files:
    filename = os.path.join(_WORKING_DIR,
                            _CIFAR100_DIR if v100 else _CIFAR10_DIR, file)
    data = None
    with tf.gfile.Open(filename, "rb") as f:
      if sys.version_info < (3,):
        data = cPickle.load(f)
      else:
        data = cPickle.load(f, encoding="bytes")

    info = np.transpose(data[b"data"].reshape((-1, 3, 32, 32)), (0, 2, 3, 1))
    if images is None:
      images = info
    else:
      images = np.concatenate((images, info))

    info = data[b"fine_labels"] if v100 else data[b"labels"]
    if labels is None:
      labels = info
    else:
      labels = np.concatenate((labels, info))

  return images, labels


def cifar_generator(v100, mode):
  files = None
  if v100:
    files = _CIFAR100_TRAIN if mode != ModeKeys.TEST else _CIFAR100_TEST
  else:
    files = _CIFAR10_TRAIN if mode != ModeKeys.TEST else _CIFAR10_TEST
  maybe_download(files, v100)

  images, labels = read_files(files, v100)

  data = list(zip(images, labels))
  random.shuffle(data)

  samples = len(data)
  if mode == ModeKeys.TRAIN:
    data = data[:int(samples * 0.8)]
  elif mode == ModeKeys.EVAL:
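    # EVAL takes the held-out final 20% of the shuffled examples; TRAIN
    # (above) keeps the first 80%.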
data = data[int(samples * 0.8):] image_ph = tf.placeholder(dtype=tf.uint8, shape=(32, 32, 3)) encoded_ph = tf.image.encode_png(image_ph) sess = tf.Session() for image, label in data: encoded_im = sess.run(encoded_ph, feed_dict={image_ph: image}) yield { "image/encoded": [encoded_im], "image/format": [b"png"], "image/class/label": [label], "image/height": [32], "image/width": [32], "image/channels": [3] } def generate(train_name, eval_name, test_name, hparams): v100 = hparams.data in ["cifar100", "cifar100_tpu"] generate_files( cifar_generator(v100, mode=ModeKeys.TRAIN), train_name, hparams.data_dir, FLAGS.num_shards) generate_files( cifar_generator(v100, mode=ModeKeys.EVAL), eval_name, hparams.data_dir, FLAGS.num_shards) generate_files( cifar_generator(v100, mode=ModeKeys.TEST), test_name, hparams.data_dir, FLAGS.num_shards) ================================================ FILE: data/data_generators/generator_utils.py ================================================ import operator import os import numpy as np import tensorflow as tf tf.flags.DEFINE_boolean("v100", False, "Download CIFAR-100 instead of CIFAR-10.") tf.flags.DEFINE_integer("num_shards", 1, "The number of output shards to write to.") def to_example(dictionary): features = {} for k, v in dictionary.items(): if len(v) == 0: raise Exception("Empty field: %s" % str((k, v))) if isinstance(v[0], (int, np.int8, np.int32, np.int64)): features[k] = tf.train.Feature(int64_list=tf.train.Int64List(value=v)) elif isinstance(v[0], (float, np.float32)): features[k] = tf.train.Feature(float_list=tf.train.FloatList(value=v)) elif isinstance(v[0], (str, bytes)): features[k] = tf.train.Feature(bytes_list=tf.train.BytesList(value=v)) else: raise Exception("Unsupported type: %s" % type(v[0])) return tf.train.Example(features=tf.train.Features(feature=features)) def generate_files(generator, output_name, output_dir, num_shards, max_cases=None): if not tf.gfile.Exists(output_dir): tf.gfile.MakeDirs(output_dir) writers = [] for shard in range(num_shards): output_filename = "%s-%dof%d" % (output_name, shard + 1, num_shards) output_file = os.path.join(output_dir, output_filename) writers.append(tf.python_io.TFRecordWriter(output_file)) counter, shard = 0, 0 for case in generator: if counter % 100 == 0: tf.logging.info("Processed %d examples..." 
% counter) counter += 1 if max_cases and counter > max_cases: break sequence_example = to_example(case) writers[shard].write(sequence_example.SerializeToString()) shard = (shard + 1) % num_shards for writer in writers: writer.close() ================================================ FILE: data/data_generators/mnist_generator.py ================================================ import gzip import os import random import urllib import numpy as np import tensorflow as tf from .generator_utils import generate_files from ...models.utils.model_utils import ModeKeys FLAGS = tf.app.flags.FLAGS tf.logging.set_verbosity(tf.logging.INFO) _TRAIN_IMAGE_COUNT = 60000 _TRAIN_IMAGE_FILE = "train-images-idx3-ubyte.gz" _TRAIN_LABEL_FILE = "train-labels-idx1-ubyte.gz" _TEST_IMAGE_COUNT = 10000 _TEST_IMAGE_FILE = "t10k-images-idx3-ubyte.gz" _TEST_LABEL_FILE = "t10k-labels-idx1-ubyte.gz" _WORKING_DIR = "/tmp/tf_data" def download_files(filenames): """Download files to tmp/data if file does not exist Args: filenames: list of string; list of filenames to check if exist """ if not os.path.exists(_WORKING_DIR): os.makedirs(_WORKING_DIR) for filename in filenames: filepath = os.path.join(_WORKING_DIR, filename) url = "http://yann.lecun.com/exdb/mnist/" + filename if not os.path.isfile(filepath): print("Downloading %s" % (url + filename)) try: urllib.urlretrieve(url, filepath) except AttributeError: urllib.request.urlretrieve(url, filepath) def read_images(filepath, num_images): with gzip.open(filepath) as f: f.read(16) buf = f.read(28 * 28 * num_images) data = np.frombuffer(buf, dtype=np.uint8) data = data.reshape(num_images, 28, 28, 1) return data def read_labels(filepath, num_labels): with gzip.open(filepath) as f: f.read(8) buf = f.read(num_labels) data = np.frombuffer(buf, dtype=np.uint8) return data.astype(np.int64) def mnist_generator(mode): num_images = _TRAIN_IMAGE_COUNT if mode != ModeKeys.TEST else _TEST_IMAGE_COUNT image_filepath = _TRAIN_IMAGE_FILE if mode != ModeKeys.TEST else _TEST_IMAGE_FILE label_filepath = _TRAIN_LABEL_FILE if mode != ModeKeys.TEST else _TEST_LABEL_FILE download_files([image_filepath, label_filepath]) image_filepath = os.path.join(_WORKING_DIR, image_filepath) label_filepath = os.path.join(_WORKING_DIR, label_filepath) images = read_images(image_filepath, num_images) labels = read_labels(label_filepath, num_images) data = list(zip(images, labels)) random.shuffle(data) if mode == ModeKeys.TRAIN: data = data[:5*num_images//6] elif mode == ModeKeys.EVAL: data = data[5*num_images//6:] image_ph = tf.placeholder(dtype=tf.uint8, shape=(28, 28, 1)) encoded_ph = tf.image.encode_png(image_ph) sess = tf.Session() for image, label in data: encoded_im = sess.run(encoded_ph, feed_dict={image_ph: image}) yield { "image/encoded": [encoded_im], "image/format": [b"png"], "image/class/label": [label], "image/height": [28], "image/width": [28] } def generate(train_name, eval_name, test_name, hparams): generate_files( mnist_generator(mode=ModeKeys.TRAIN), train_name, hparams.data_dir, 1) generate_files( mnist_generator(mode=ModeKeys.EVAL), eval_name, hparams.data_dir, 1) generate_files( mnist_generator(mode=ModeKeys.TEST), test_name, hparams.data_dir, 1) ================================================ FILE: data/dataset_maps.py ================================================ import tensorflow as tf from . 
import imagenet_augs

_AUGMENTATIONS = dict()


def register(fn):
  global _AUGMENTATIONS
  _AUGMENTATIONS[fn.__name__] = fn
  return fn


def get_augmentation(name, params, training):

  def fn(*args, **kwargs):
    return _AUGMENTATIONS[name](
        *args, **kwargs, training=training, params=params)

  return fn


@register
def cifar_augmentation(image, label, training, params):
  """Image augmentation suitable for CIFAR-10/100.

  As described in https://arxiv.org/pdf/1608.06993v3.pdf (page 5).

  Args:
    image: an image Tensor.
    label: the corresponding label Tensor.

  Returns:
    An (image, label) tuple; the image has the same shape as the input.
  """
  if training:
    image = tf.image.resize_image_with_crop_or_pad(image, 40, 40)
    image = tf.random_crop(image, [32, 32, 3])
    image = tf.image.random_flip_left_right(image)
  image = tf.image.per_image_standardization(image)
  return image, label


@register
def imagenet_augmentation(image, label, training, params):
  """Imagenet augmentations.

  Args:
    image: an image Tensor.
    label: the corresponding label Tensor.

  Returns:
    An (image, label) tuple; the image has the same shape as the input.
  """
  if training:
    image = imagenet_augs.preprocess_for_train(image, params.input_shape[0])
  else:
    image = imagenet_augs.preprocess_for_eval(image, params.input_shape[0])
  return image, label


@register
def load_images(example, training, params):
  data_fields_to_features = {
      "image/encoded": tf.FixedLenFeature((), tf.string),
      "image/format": tf.FixedLenFeature((), tf.string),
      "image/class/label": tf.FixedLenFeature((), tf.int64)
  }
  example = tf.parse_single_example(example, data_fields_to_features)
  image = example["image/encoded"]
  image = tf.image.decode_png(image, channels=params.channels, dtype=tf.uint8)
  image = tf.to_float(image)
  label = tf.to_int32(example["image/class/label"])
  return image, label


@register
def set_shapes(image, label, training, params):
  image = tf.reshape(image, params.input_shape)
  return image, label


@register
def transpose(image, label, training, params):
  image = tf.transpose(image, [2, 0, 1])
  return image, label


================================================
FILE: data/image_reader.py
================================================
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

from .registry import register
from .dataset_maps import get_augmentation
from .data_generators import cifar_generator, mnist_generator


@register("imagenet", None)
@register("mnist", mnist_generator.generate)
@register("cifar10", cifar_generator.generate)
@register("cifar100", cifar_generator.generate)
def image_reader(data_sources, hparams, training):
  """Input function for image data."""

  def _input_fn(params=None):
    """Input function compatible with Experiment API."""
    if params is not None and "batch_size" in params:
      hparams.batch_size = params["batch_size"]
    dataset = tf.data.TFRecordDataset(
        data_sources, num_parallel_reads=4 if training else 1)
    dataset = dataset.prefetch(5 * hparams.batch_size)
    if hparams.shuffle_data:
      dataset = dataset.shuffle(5 * hparams.batch_size)
    dataset = dataset.map(get_augmentation("load_images", hparams, training))
    if hparams.data_augmentations is not None:
      for augmentation_name in hparams.data_augmentations:
        dataset = dataset.map(
            get_augmentation(augmentation_name, hparams, training))
    dataset = dataset.map(get_augmentation("set_shapes", hparams, training))
    if hparams.data_format == "channels_first":
      dataset = dataset.map(get_augmentation("transpose", hparams, training))
    dataset = dataset.repeat().batch(hparams.batch_size)
    dataset_it = dataset.make_one_shot_iterator()
    images, labels = dataset_it.get_next()
    if params is not None and "batch_size" in params:
      images = tf.reshape(images, [hparams.batch_size] +
                          images.shape.as_list()[1:])
      labels = tf.reshape(labels, [hparams.batch_size] +
                          labels.shape.as_list()[1:])
    return {"inputs": images, "labels": labels}, labels

  return _input_fn


@register("mnist_simple", None)
def mnist_simple(data_source, params, training):
  """Input function for MNIST image data."""
  mnist = input_data.read_data_sets(data_source, one_hot=True)
  data_set = mnist.train if training else mnist.test

  def _input_fn():
    input_images = tf.constant(data_set.images)
    input_labels = tf.constant(
        data_set.labels) if not params.is_ae else tf.constant(data_set.images)
    image, label = tf.train.slice_input_producer([input_images, input_labels])
    imageBatch, labelBatch = tf.train.batch(
        [image, label], batch_size=params.batch_size)
    return {"inputs": imageBatch}, labelBatch

  return _input_fn


@register("fashion", None)
def fashion(data_source, params, training):
  """Input function for Fashion-MNIST image data."""
  mnist = input_data.read_data_sets(
      data_source,
      source_url='http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/',
      one_hot=True)
  data_set = mnist.train if training else mnist.test

  def _input_fn():
    input_images = tf.constant(data_set.images)
    input_labels = tf.constant(data_set.labels)
    image, label = tf.train.slice_input_producer([input_images, input_labels])
    imageBatch, labelBatch = tf.train.batch(
        [image, label], batch_size=params.batch_size)
    return {"inputs": imageBatch}, labelBatch

  return _input_fn


================================================
FILE: data/imagenet_augs.py
================================================
import tensorflow as tf

MEAN_RGB = [0.485, 0.456, 0.406]
STDDEV_RGB = [0.229, 0.224, 0.225]

# The following preprocessing functions were taken from
# cloud_tpu/models/resnet/resnet_preprocessing.py
# ==============================================================================


def _crop(image, offset_height, offset_width, crop_height, crop_width):
  """Crops the given image using the provided offsets and sizes.

  Note that the method doesn't assume we know the input image size but it does
  assume we know the input image rank.

  Args:
    image: `Tensor` image of shape [height, width, channels].
    offset_height: `Tensor` indicating the height offset.
    offset_width: `Tensor` indicating the width offset.
    crop_height: the height of the cropped image.
    crop_width: the width of the cropped image.

  Returns:
    the cropped (and resized) image.

  Raises:
    InvalidArgumentError: if the rank is not 3 or if the image dimensions are
      less than the crop size.
  """
  original_shape = tf.shape(image)

  rank_assertion = tf.Assert(
      tf.equal(tf.rank(image), 3), ["Rank of image must be equal to 3."])
  with tf.control_dependencies([rank_assertion]):
    cropped_shape = tf.stack([crop_height, crop_width, original_shape[2]])

  size_assertion = tf.Assert(
      tf.logical_and(
          tf.greater_equal(original_shape[0], crop_height),
          tf.greater_equal(original_shape[1], crop_width)),
      ["Crop size greater than the image size."])

  offsets = tf.to_int32(tf.stack([offset_height, offset_width, 0]))

  # Use tf.slice instead of crop_to_bounding box as it accepts tensors to
  # define the crop size.
  with tf.control_dependencies([size_assertion]):
    image = tf.slice(image, offsets, cropped_shape)
  return tf.reshape(image, cropped_shape)


def distorted_bounding_box_crop(image,
                                bbox,
                                min_object_covered=0.1,
                                aspect_ratio_range=(0.75, 1.33),
                                area_range=(0.05, 1.0),
                                max_attempts=100,
                                scope=None):
  """Generates cropped_image using one of the bboxes randomly distorted.

  See `tf.image.sample_distorted_bounding_box` for more documentation.
Args: image: `Tensor` of image (it will be converted to floats in [0, 1]). bbox: `Tensor` of bounding boxes arranged `[1, num_boxes, coords]` where each coordinate is [0, 1) and the coordinates are arranged as `[ymin, xmin, ymax, xmax]`. If num_boxes is 0 then use the whole image. min_object_covered: An optional `float`. Defaults to `0.1`. The cropped area of the image must contain at least this fraction of any bounding box supplied. aspect_ratio_range: An optional list of `float`s. The cropped area of the image must have an aspect ratio = width / height within this range. area_range: An optional list of `float`s. The cropped area of the image must contain a fraction of the supplied image within in this range. max_attempts: An optional `int`. Number of attempts at generating a cropped region of the image of the specified constraints. After `max_attempts` failures, return the entire image. scope: Optional `str` for name scope. Returns: (cropped image `Tensor`, distorted bbox `Tensor`). """ with tf.name_scope( scope, default_name="distorted_bounding_box_crop", values=[image, bbox]): # Each bounding box has shape [1, num_boxes, box coords] and # the coordinates are ordered [ymin, xmin, ymax, xmax]. # A large fraction of image datasets contain a human-annotated bounding # box delineating the region of the image containing the object of interest. # We choose to create a new bounding box for the object which is a randomly # distorted version of the human-annotated bounding box that obeys an # allowed range of aspect ratios, sizes and overlap with the human-annotated # bounding box. If no box is supplied, then we assume the bounding box is # the entire image. sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box( tf.shape(image), bounding_boxes=bbox, min_object_covered=min_object_covered, aspect_ratio_range=aspect_ratio_range, area_range=area_range, max_attempts=max_attempts, use_image_if_no_bounding_boxes=True) bbox_begin, bbox_size, distort_bbox = sample_distorted_bounding_box # Crop the image to the specified bounding box. cropped_image = tf.slice(image, bbox_begin, bbox_size) return cropped_image, distort_bbox def _random_crop(image, size): """Make a random crop of (`size` x `size`).""" bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4]) random_image, bbox = distorted_bounding_box_crop( image, bbox, min_object_covered=0.1, aspect_ratio_range=(3. / 4, 4. 
/ 3.),
      area_range=(0.08, 1.0),
      max_attempts=1,
      scope=None)
  bad = _at_least_x_are_true(tf.shape(image), tf.shape(random_image), 3)

  image = tf.cond(
      bad, lambda: _center_crop(_do_scale(image, size), size),
      lambda: tf.image.resize_bicubic([random_image], [size, size])[0])
  return image


def _flip(image):
  """Random horizontal image flip."""
  image = tf.image.random_flip_left_right(image)
  return image


def _at_least_x_are_true(a, b, x):
  """True when at least `x` elements of `a` and `b` are equal."""
  match = tf.equal(a, b)
  match = tf.cast(match, tf.int32)
  return tf.greater_equal(tf.reduce_sum(match), x)


def _do_scale(image, size):
  """Rescale the image by scaling the smaller spatial dimension to `size`."""
  shape = tf.cast(tf.shape(image), tf.float32)
  w_greater = tf.greater(shape[0], shape[1])
  shape = tf.cond(
      w_greater, lambda: tf.cast([shape[0] / shape[1] * size, size], tf.int32),
      lambda: tf.cast([size, shape[1] / shape[0] * size], tf.int32))
  return tf.image.resize_bicubic([image], shape)[0]


def _center_crop(image, size):
  """Crops to center of image with specified `size`."""
  image_height = tf.shape(image)[0]
  image_width = tf.shape(image)[1]

  offset_height = ((image_height - size) + 1) / 2
  offset_width = ((image_width - size) + 1) / 2
  image = _crop(image, offset_height, offset_width, size, size)
  return image


def _normalize(image):
  """Normalize the image to zero mean and unit variance."""
  offset = tf.constant(MEAN_RGB, shape=[1, 1, 3])
  image -= offset

  scale = tf.constant(STDDEV_RGB, shape=[1, 1, 3])
  image /= scale
  return image


def preprocess_for_train(image, image_size=224):
  """Preprocesses the given image for training.

  Args:
    image: `Tensor` representing an image of arbitrary size.
    image_size: int, how large the output image should be.

  Returns:
    A preprocessed image `Tensor`.
  """
  image = _random_crop(image, image_size)
  image = _normalize(image)
  image = _flip(image)
  image = tf.reshape(image, [image_size, image_size, 3])
  return image


def preprocess_for_eval(image, image_size=224):
  """Preprocesses the given image for evaluation.

  Args:
    image: `Tensor` representing an image of arbitrary size.
    image_size: int, how large the output image should be.

  Returns:
    A preprocessed image `Tensor`.
""" image = _do_scale(image, image_size + 32) image = _normalize(image) image = _center_crop(image, image_size) image = tf.reshape(image, [image_size, image_size, 3]) return image ================================================ FILE: data/registry.py ================================================ import os import tensorflow as tf _INPUT_FNS = dict() _GENERATORS = dict() def register(name, generator): def add_to_dict(fn): global _INPUT_FNS global _GENERATORS _INPUT_FNS[name] = fn _GENERATORS[name] = generator return fn return add_to_dict def get_input_fns(hparams, generate=True): train_path = os.path.join(hparams.data_dir, "train*") eval_path = os.path.join(hparams.data_dir, "eval*") test_path = os.path.join(hparams.data_dir, "test*") if generate: if not tf.gfile.Exists(hparams.data_dir): tf.gfile.MakeDirs(hparams.data_dir) # generate if train doesnt exist maybe_generate(train_path, hparams) maybe_generate(eval_path, hparams) maybe_generate(test_path, hparams) train_path = tf.gfile.Glob(train_path) eval_path = tf.gfile.Glob(eval_path) test_path = tf.gfile.Glob(test_path) input_fn = _INPUT_FNS[hparams.data] train_fn = input_fn(train_path, hparams, training=True) eval_fn = None if not eval_path else input_fn( eval_path, hparams, training=False) test_fn = None if not test_path else input_fn( test_path, hparams, training=False) if not (eval_path or test_path): raise Exception("Could not find eval or test files.") return train_fn, eval_fn, test_fn def get_dataset(hparams): train_path = os.path.join(hparams.data_dir, "train*") eval_path = os.path.join(hparams.data_dir, "eval*") test_path = os.path.join(hparams.data_dir, "test*") maybe_generate(train_path, hparams) maybe_generate(eval_path, hparams) maybe_generate(test_path, hparams) return train_path, eval_path, test_path def maybe_generate(check_path, hparams): if not tf.gfile.Glob(check_path): generate_fn = _GENERATORS[hparams.data] if generate_fn: generate_fn("train", "eval", "test", hparams) else: tf.logging.warn( "No generator function. Unable to generate: %s" % check_path) ================================================ FILE: hparams/__init__.py ================================================ __all__ = ["defaults", "registry", "resnet", "lenet", "utils", "vgg", "basic"] from .defaults import * from .resnet import * from .registry import * from .user import * from .utils import * from .lenet import * from .basic import * from .vgg import * from .basic import * ================================================ FILE: hparams/basic.py ================================================ import tensorflow as tf from . 
import defaults from .registry import register # MNIST ========================= @register def mnist_basic_no_dropout(): hps = defaults.default() hps.model = "basic" hps.data = "mnist" hps.activation = "relu" hps.batch_norm = False hps.drop_rate = 0.0 hps.dropout_type = None hps.initializer = "glorot_uniform_initializer" hps.layers = [128, 64, 32] hps.input_shape = [784] hps.output_shape = [10] hps.layer_type = "dense" hps.learning_rate = 0.1 hps.optimizer = "momentum" hps.momentum = 0.0 return hps @register def mnist_basic_trgtd_dropout(): hps = mnist_basic_no_dropout() hps.drop_rate = 0.5 hps.dropout_type = "targeted_weight" hps.targ_rate = 0.5 return hps @register def mnist_basic_untrgtd_dropout(): hps = mnist_basic_no_dropout() hps.drop_rate = 0.25 hps.dropout_type = "untargeted_weight" return hps @register def mnist_basic_trgtd_dropout_random(): hps = mnist_basic_no_dropout() hps.drop_rate = 0.5 hps.dropout_type = "targeted_weight_random" hps.targ_rate = 0.5 return hps @register def mnist_basic_trgtd_unit_dropout(): hps = mnist_basic_no_dropout() hps.drop_rate = 0.5 hps.dropout_type = "targeted_unit" hps.targ_rate = 0.5 return hps @register def mnist_basic_smallify_dropout_1eneg4(): hps = mnist_basic_no_dropout() hps.dropout_type = "smallify_dropout" hps.smallify = 1e-4 hps.smallify_mv = 0.9 hps.smallify_thresh = 0.5 return hps @register def mnist_basic_smallify_dropout_1eneg3(): hps = mnist_basic_smallify_dropout_1eneg4() hps.smallify = 1e-3 return hps @register def mnist_basic_smallify_weight_dropout_1eneg4(): hps = mnist_basic_no_dropout() hps.dropout_type = "smallify_weight_dropout" hps.smallify = 1e-4 hps.smallify_mv = 0.9 hps.smallify_thresh = 0.5 return hps @register def cifar10_basic_no_dropout(): hps = defaults.default() hps.model = "basic" hps.data = "cifar10" hps.activation = "relu" hps.batch_norm = False hps.drop_rate = 0.0 hps.dropout_type = None hps.initializer = "glorot_uniform_initializer" hps.layers = [128, 64, 32] hps.channels = 3 hps.input_shape = [32, 32, 3] hps.output_shape = [10] hps.layer_type = "dense" hps.learning_rate = 0.1 hps.optimizer = "momentum" hps.momentum = 0.0 return hps @register def cifar100_basic_no_dropout(): hps = cifar10_basic_no_dropout() hps.output_shape = [100] hps.data = "cifar100" return hps @register def imagenet32_basic(): hps = defaults.default_imagenet32() hps.model = "basic" hps.activation = "relu" hps.batch_norm = False hps.drop_rate = 0.0 hps.dropout_type = None hps.initializer = "glorot_uniform_initializer" hps.layers = [128, 64, 32] hps.layer_type = "dense" hps.learning_rate = 0.1 hps.optimizer = "momentum" hps.momentum = 0.0 return hps ================================================ FILE: hparams/defaults.py ================================================ import tensorflow as tf from .registry import register from .utils import HParams @register def default(): return HParams( model=None, data=None, shuffle_data=True, data_augmentations=None, train_epochs=256, eval_steps=100, type="image", batch_size=64, learning_rate=0.01, lr_scheme="constant", initializer="glorot_normal_initializer", delay=0, staircased=False, learning_rate_decay_interval=2000, learning_rate_decay_rate=0.1, clip_grad_norm=1.0, l2_loss=0.0, prune_val=0.8, label_smoothing=0.1, use_tpu=False, momentum=0.9, init_scheme="random", warmup_steps=10000, use_nesterov=False, louizos_cost=0.0, l1_norm=0.0, thresh=2.5, fixed=False, var_scale=1, klscale=1.0, ard_cost=0.0, logit_packing=0.0, logit_squeezing=0.0, clp=0.0, logit_bound=None, dropout_type=None, smallify=0.0, 
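      # Note (assumed semantics, inferred from the model fns): `smallify` is
      # the coefficient on the Smallify switch regularizer; when > 0 the model
      # fns add `model_utils.switch_loss() * smallify` to the loss.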
smallify_delay=1000, linear_drop_rate=False, weight_decay_and_noise=False, weight_decay_only_features=True, weight_decay_weight_names=["DW", "kernel", "bias"], dropout_delay_steps=5000, grad_noise_scale=0.0, td_nines=0, targ_cost=1.0, aparams="", channels=1, data_format="channels_last", epoch_size=50000, ) @register def default_cifar10(): hps = default() hps.data = "cifar10" hps.data_augmentations = ["cifar_augmentation"] hps.epoch_size = 50000 # number of images in train set hps.input_shape = [32, 32, 3] hps.output_shape = [10] hps.channels = 3 hps.num_classes = 10 return hps @register def default_cifar100(): hps = default_cifar10() hps.data = "cifar100" hps.output_shape = [100] hps.num_classes = 100 return hps @register def default_imagenet299(): hps = default() hps.data = "imagenet" hps.data_augmentations = ["imagenet_augmentation"] hps.epoch_size = 1281167 hps.input_shape = [299, 299, 3] hps.channels = 3 hps.output_shape = [1001] hps.num_classes = 1001 return hps @register def default_imagenet224(): hps = default_imagenet299() hps.input_shape = [224, 224, 3] return hps @register def default_imagenet64(): hps = default_imagenet299() hps.input_shape = [64, 64, 3] return hps @register def default_imagenet32(): hps = default_imagenet299() hps.input_shape = [32, 32, 3] return hps ================================================ FILE: hparams/lenet.py ================================================ import tensorflow as tf from .defaults import default, default_cifar10 from .registry import register # lenet @register def cifar_lenet(): hps = default_cifar10() hps.model = "lenet" hps.activation = "relu" hps.residual = True hps.initializer = "glorot_normal_initializer" hps.kernel_size = 5 hps.lr_scheme = "constant" hps.batch_size = 128 hps.learning_rate = 0.01 hps.optimizer = "momentum" hps.momentum = 0.9 hps.use_nesterov = True hps.drop_rate = 0.0 hps.dropout_type = None hps.targ_rate = 0.0 hps.axis_aligned_cost = False hps.clp = False hps.logit_squeezing = False return hps @register def cifar_lenet_no_dropout(): hps = cifar_lenet() return hps @register def cifar_lenet_weight(): hps = cifar_lenet_no_dropout() hps.dropout_type = "untargeted_weight" hps.drop_rate = 0.25 return hps @register def cifar_lenet_trgtd_weight(): hps = cifar_lenet_no_dropout() hps.drop_rate = 0.5 hps.targ_rate = 0.5 hps.dropout_type = "targeted_weight" return hps @register def cifar_lenet_unit(): hps = cifar_lenet_no_dropout() hps.drop_rate = 0.25 hps.dropout_type = "untargeted_unit" return hps @register def cifar_lenet_trgtd_unit(): hps = cifar_lenet_no_dropout() hps.drop_rate = 0.5 hps.targ_rate = 0.5 hps.dropout_type = "targeted_unit" return hps @register def cifar_lenet_l1(): hps = cifar_lenet_no_dropout() hps.l1_norm = 0.1 return hps @register def cifar_lenet_trgtd_weight_l1(): hps = cifar_lenet_no_dropout() hps.l1_norm = 0.1 hps.drop_rate = 0.5 hps.targ_rate = 0.5 hps.dropout_type = "targeted_weight" return hps @register def cifar_lenet_trgtd_unit_l1(): hps = cifar_lenet_no_dropout() hps.l1_norm = 0.1 hps.drop_rate = 0.5 hps.targ_rate = 0.5 hps.dropout_type = "targeted_unit" return hps @register def cifar_lenet_trgtd_unit_botk75_33(): hps = cifar_lenet_no_dropout() hps.drop_rate = 0.33 hps.dropout_type = "targeted_unit" hps.targ_rate = 0.75 return hps @register def cifar_lenet_trgtd_unit_botk75_66(): hps = cifar_lenet_no_dropout() hps.drop_rate = 0.66 hps.dropout_type = "targeted_unit" hps.targ_rate = 0.75 return hps @register def cifar_lenet_trgtd_weight_botk75_33(): hps = cifar_lenet_no_dropout() hps.drop_rate 
= 0.33 hps.dropout_type = "targeted_weight" hps.targ_rate = 0.75 return hps @register def cifar_lenet_trgtd_weight_botk75_66(): hps = cifar_lenet_no_dropout() hps.drop_rate = 0.66 hps.dropout_type = "targeted_weight" hps.targ_rate = 0.75 return hps @register def cifar_lenet_louizos_weight_1en3(): hps = cifar_lenet_no_dropout() hps.louizos_beta = 2. / 3. hps.louizos_zeta = 1.1 hps.louizos_gamma = -0.1 hps.louizos_cost = 0.001 hps.dropout_type = "louizos_weight" hps.drop_rate = 0.25 return hps @register def cifar_lenet_louizos_weight_1en1(): hps = cifar_lenet_no_dropout() hps.louizos_beta = 2. / 3. hps.louizos_zeta = 1.1 hps.louizos_gamma = -0.1 hps.louizos_cost = 0.1 hps.dropout_type = "louizos_weight" hps.drop_rate = 0.25 return hps @register def cifar_lenet_louizos_weight_1en2(): hps = cifar_lenet_no_dropout() hps.louizos_beta = 2. / 3. hps.louizos_zeta = 1.1 hps.louizos_gamma = -0.1 hps.louizos_cost = 0.01 hps.dropout_type = "louizos_weight" hps.drop_rate = 0.25 return hps @register def cifar_lenet_louizos_weight_5en3(): hps = cifar_lenet_no_dropout() hps.louizos_beta = 2. / 3. hps.louizos_zeta = 1.1 hps.louizos_gamma = -0.1 hps.louizos_cost = 0.005 hps.dropout_type = "louizos_weight" hps.drop_rate = 0.25 return hps @register def cifar_lenet_louizos_weight_1en4(): hps = cifar_lenet_no_dropout() hps.louizos_beta = 2. / 3. hps.louizos_zeta = 1.1 hps.louizos_gamma = -0.1 hps.louizos_cost = 0.0001 hps.dropout_type = "louizos_weight" hps.drop_rate = 0.25 return hps @register def cifar_lenet_louizos_unit_1en3(): hps = cifar_lenet_no_dropout() hps.louizos_beta = 2. / 3. hps.louizos_zeta = 1.1 hps.louizos_gamma = -0.1 hps.louizos_cost = 0.001 hps.dropout_type = "louizos_unit" hps.drop_rate = 0.25 return hps @register def cifar_lenet_louizos_unit_1en1(): hps = cifar_lenet_no_dropout() hps.louizos_beta = 2. / 3. hps.louizos_zeta = 1.1 hps.louizos_gamma = -0.1 hps.louizos_cost = 0.1 hps.dropout_type = "louizos_unit" hps.drop_rate = 0.25 return hps @register def cifar_lenet_louizos_unit_1en2(): hps = cifar_lenet_no_dropout() hps.louizos_beta = 2. / 3. hps.louizos_zeta = 1.1 hps.louizos_gamma = -0.1 hps.louizos_cost = 0.01 hps.dropout_type = "louizos_unit" hps.drop_rate = 0.25 return hps @register def cifar_lenet_louizos_unit_5en3(): hps = cifar_lenet_no_dropout() hps.louizos_beta = 2. / 3. hps.louizos_zeta = 1.1 hps.louizos_gamma = -0.1 hps.louizos_cost = 0.005 hps.dropout_type = "louizos_unit" hps.drop_rate = 0.25 return hps @register def cifar_lenet_louizos_unit_1en4(): hps = cifar_lenet_no_dropout() hps.louizos_beta = 2. / 3. hps.louizos_zeta = 1.1 hps.louizos_gamma = -0.1 hps.louizos_cost = 0.0001 hps.dropout_type = "louizos_unit" hps.drop_rate = 0.25 return hps @register def cifar_lenet_variational(): hps = cifar_lenet_no_dropout() hps.dropout_type = "variational" hps.var_scale = 1. / 100 hps.drop_rate = 0.75 return hps @register def cifar_lenet_variational_unscaled(): hps = cifar_lenet_no_dropout() hps.dropout_type = "variational" hps.drop_rate = 0.75 return hps @register def cifar_lenet_variational_unit(): hps = cifar_lenet_no_dropout() hps.dropout_type = "variational_unit" hps.var_scale = 1. 
/ 100 hps.drop_rate = 0.75 return hps @register def cifar_lenet_variational_unit_unscaled(): hps = cifar_lenet_no_dropout() hps.dropout_type = "variational_unit" hps.drop_rate = 0.75 return hps @register def cifar_lenet_smallify_neg4(): hps = cifar_lenet_no_dropout() hps.dropout_type = "smallify_dropout" hps.smallify = 1e-4 hps.smallify_mv = 0.9 hps.smallify_thresh = 0.5 hps.smallify_delay = 10000 return hps ================================================ FILE: hparams/registry.py ================================================ import tensorflow as tf _HPARAMS = dict() def register(fn): global _HPARAMS _HPARAMS[fn.__name__] = fn() return fn def get_hparams(hparams_list): """Fetches a merged group of hyperparameter sets (chronological priority).""" final = tf.contrib.training.HParams() for name in hparams_list.split("-"): curr = _HPARAMS[name] final_dict = final.values() for k, v in curr.values().items(): if k not in final_dict: final.add_hparam(k, v) elif final_dict[k] is None: setattr(final, k, v) return final ================================================ FILE: hparams/resnet.py ================================================ import tensorflow as tf from .registry import register from .defaults import * # from https://github.com/tensorflow/models/blob/master/resnet/resnet_main.py @register def resnet_default(): hps = default_cifar10() hps.model = "resnet" hps.residual_filters = [16, 32, 64, 128] hps.residual_units = [5, 5, 5] hps.use_bottleneck = False hps.batch_size = 128 hps.learning_rate = 0.4 hps.lr_scheme = "resnet" hps.weight_decay_rate = 2e-4 hps.optimizer = "momentum" return hps @register def resnet102_imagenet224(): hps = default_imagenet224() hps.model = "resnet" hps.residual_filters = [64, 64, 128, 256, 512] hps.residual_units = [3, 4, 23, 3] hps.use_bottleneck = True hps.batch_size = 128 * 8 hps.learning_rate = 0.128 * hps.batch_size / 256. 
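  # Linear LR scaling rule: keep learning_rate / batch_size fixed at
  # 0.128 / 256 as the global batch size grows.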
hps.lr_scheme = "warmup_cosine" hps.warmup_steps = 10000 hps.weight_decay_rate = 1e-4 hps.optimizer = "momentum" hps.use_nesterov = True hps.initializer = "variance_scaling_initializer" hps.learning_rate_cosine_cycle_steps = 120000 hps.cosine_alpha = 0.0 return hps @register def resnet102_imagenet64(): hps = resnet102_imagenet224() hps.input_shape = [64, 64, 3] return hps @register def resnet50_imagenet224(): hps = resnet102_imagenet224() hps.residual_units = [3, 4, 6, 3] return hps @register def resnet34_imagenet224(): hps = resnet50_imagenet224() hps.use_bottleneck = False return hps @register def resnet_cifar100(): hps = resnet_default() hps.num_classes = 100 return hps @register def cifar10_resnet32(): hps = resnet_default() return hps @register def cifar10_resnet32_no_dropout(): hps = cifar10_resnet32() hps.drop_rate = 0.0 return hps @register def cifar10_resnet32_trgtd_weight(): hps = cifar10_resnet32_no_dropout() hps.drop_rate = 0.5 hps.dropout_type = "targeted_weight" hps.targ_rate = 0.5 return hps @register def cifar10_resnet32_weight(): hps = cifar10_resnet32_no_dropout() hps.drop_rate = 0.25 hps.dropout_type = "untargeted_weight" return hps @register def cifar10_resnet32_weight_50(): hps = cifar10_resnet32_weight() hps.drop_rate = 0.50 return hps @register def cifar10_resnet32_trgtd_unit(): hps = cifar10_resnet32_no_dropout() hps.drop_rate = 0.5 hps.dropout_type = "targeted_unit" hps.targ_rate = 0.5 return hps @register def cifar10_resnet32_trgtd_ard(): hps = cifar10_resnet32_no_dropout() hps.drop_rate = 0.25 hps.dropout_type = "targeted_ard" hps.targ_rate = 0.5 return hps @register def cifar10_resnet32_unit(): hps = cifar10_resnet32_no_dropout() hps.drop_rate = 0.25 hps.dropout_type = "untargeted_unit" return hps @register def cifar10_resnet32_unit_50(): hps = cifar10_resnet32_unit() hps.drop_rate = 0.50 return hps @register def cifar10_resnet32_l1_1eneg3(): hps = cifar10_resnet32_no_dropout() hps.l1_norm = 0.001 return hps @register def cifar10_resnet32_l1_1eneg2(): hps = cifar10_resnet32_no_dropout() hps.l1_norm = 0.01 return hps @register def cifar10_resnet32_l1_1eneg1(): hps = cifar10_resnet32_no_dropout() hps.l1_norm = 0.1 return hps @register def cifar10_resnet32_trgted_weight_l1(): hps = cifar10_resnet32_no_dropout() hps.drop_rate = 0.5 hps.dropout_type = "targeted_weight" hps.targ_rate = 0.5 hps.l1_norm = 0.1 return hps @register def cifar10_resnet32_targeted_unit_l1(): hps = cifar10_resnet32_no_dropout() hps.drop_rate = 0.5 hps.dropout_type = "targeted_unit" hps.targ_rate = 0.5 hps.l1_norm = 0.1 return hps @register def cifar10_resnet32_trgtd_unit_botk75_33(): hps = cifar10_resnet32_no_dropout() hps.drop_rate = 0.33 hps.dropout_type = "targeted_unit" hps.targ_rate = 0.75 return hps @register def cifar10_resnet32_trgtd_unit_botk75_66(): hps = cifar10_resnet32_no_dropout() hps.drop_rate = 0.66 hps.dropout_type = "targeted_unit" hps.targ_rate = 0.75 return hps @register def cifar10_resnet32_trgtd_weight_botk75_33(): hps = cifar10_resnet32_no_dropout() hps.drop_rate = 0.33 hps.dropout_type = "targeted_weight" hps.targ_rate = 0.75 return hps @register def cifar10_resnet32_trgtd_weight_botk75_66(): hps = cifar10_resnet32_no_dropout() hps.drop_rate = 0.66 hps.dropout_type = "targeted_weight" hps.targ_rate = 0.75 return hps @register def cifar10_resnet32_trgtd_unit_ramping_botk90_99(): hps = cifar10_resnet32_no_dropout() hps.drop_rate = 0.99 hps.dropout_type = "targeted_unit_piecewise" hps.targ_rate = 0.90 return hps @register def 
cifar10_resnet32_trgtd_weight_ramping_botk99_99(): hps = cifar10_resnet32_no_dropout() hps.drop_rate = 0.99 hps.dropout_type = "targeted_weight_piecewise" hps.targ_rate = 0.99 hps.linear_drop_rate = True return hps @register def cifar10_resnet32_louizos_weight_1en3(): hps = cifar10_resnet32_no_dropout() hps.louizos_beta = 2. / 3. hps.louizos_zeta = 1.1 hps.louizos_gamma = -0.1 hps.louizos_cost = 0.001 hps.dropout_type = "louizos_weight" hps.drop_rate = 0.001 return hps @register def cifar10_resnet32_louizos_weight_1en1(): hps = cifar10_resnet32_louizos_weight_1en3() hps.louizos_cost = 0.1 hps.dropout_type = "louizos_weight" return hps @register def cifar10_resnet32_louizos_weight_1en2(): hps = cifar10_resnet32_louizos_weight_1en3() hps.louizos_cost = 0.01 return hps @register def cifar10_resnet32_louizos_weight_5en3(): hps = cifar10_resnet32_louizos_weight_1en3() hps.louizos_cost = 0.005 return hps @register def cifar10_resnet32_louizos_weight_1en4(): hps = cifar10_resnet32_louizos_weight_1en3() hps.louizos_cost = 0.0001 return hps @register def cifar10_resnet32_louizos_unit_1en3(): hps = cifar10_resnet32_no_dropout() hps.louizos_beta = 2. / 3. hps.louizos_zeta = 1.1 hps.louizos_gamma = -0.1 hps.louizos_cost = 0.001 hps.dropout_type = "louizos_unit" hps.drop_rate = 0.001 return hps @register def cifar10_resnet32_louizos_unit_1en1(): hps = cifar10_resnet32_louizos_unit_1en3() hps.louizos_cost = 0.1 return hps @register def cifar10_resnet32_louizos_unit_1en2(): hps = cifar10_resnet32_louizos_unit_1en3() hps.louizos_cost = 0.01 return hps @register def cifar10_resnet32_louizos_unit_5en3(): hps = cifar10_resnet32_louizos_unit_1en3() hps.louizos_cost = 0.005 return hps @register def cifar10_resnet32_louizos_unit_1en4(): hps = cifar10_resnet32_louizos_unit_1en3() hps.louizos_cost = 0.0001 return hps @register def cifar10_resnet32_louizos_unit_1en5(): hps = cifar10_resnet32_louizos_unit_1en3() hps.louizos_cost = 0.00001 return hps @register def cifar10_resnet32_louizos_unit_1en6(): hps = cifar10_resnet32_louizos_unit_1en3() hps.louizos_cost = 0.000001 return hps @register def cifar10_resnet32_variational_weight(): hps = cifar10_resnet32_no_dropout() hps.dropout_type = "variational" hps.drop_rate = 0.75 hps.thresh = 3 hps.var_scale = 1. / 100 hps.weight_decay_rate = None return hps @register def cifar10_resnet32_variational_weight_unscaled(): hps = cifar10_resnet32_no_dropout() hps.dropout_type = "variational" hps.drop_rate = 0.75 hps.thresh = 3 hps.var_scale = 1 hps.weight_decay_rate = None return hps @register def cifar10_resnet32_variational_unit(): hps = cifar10_resnet32_no_dropout() hps.dropout_type = "variational_unit" hps.drop_rate = 0.75 hps.thresh = 3 hps.var_scale = 1. 
/ 100
  hps.weight_decay_rate = None
  return hps


@register
def cifar10_resnet32_variational_unit_unscaled():
  hps = cifar10_resnet32_no_dropout()
  hps.dropout_type = "variational_unit"
  hps.drop_rate = 0.75
  hps.thresh = 3
  hps.var_scale = 1
  hps.weight_decay_rate = None
  return hps


@register
def cifar10_resnet32_smallify_1eneg4():
  hps = cifar10_resnet32_no_dropout()
  hps.dropout_type = "smallify_dropout"
  hps.smallify = 1e-4
  hps.smallify_mv = 0.9
  hps.smallify_thresh = 0.5
  return hps


@register
def cifar10_resnet32_smallify_1eneg3():
  hps = cifar10_resnet32_smallify_1eneg4()
  hps.smallify = 1e-3
  return hps


@register
def cifar10_resnet32_smallify_1eneg5():
  hps = cifar10_resnet32_smallify_1eneg4()
  hps.smallify = 1e-5
  return hps


@register
def cifar10_resnet32_smallify_1eneg6():
  hps = cifar10_resnet32_smallify_1eneg4()
  hps.smallify = 1e-6
  return hps


@register
def cifar10_resnet32_smallify_weight_1eneg4():
  hps = cifar10_resnet32_no_dropout()
  hps.dropout_type = "smallify_weight_dropout"
  hps.smallify = 1e-4
  hps.smallify_mv = 0.9
  hps.smallify_thresh = 0.5
  return hps


@register
def cifar10_resnet32_smallify_weight_1eneg3():
  hps = cifar10_resnet32_smallify_weight_1eneg4()
  hps.smallify = 1e-3
  return hps


@register
def cifar10_resnet32_smallify_weight_1eneg5():
  hps = cifar10_resnet32_smallify_weight_1eneg3()
  hps.smallify = 1e-5
  return hps


@register
def cifar10_resnet32_smallify_weight_1eneg6():
  hps = cifar10_resnet32_smallify_weight_1eneg3()
  hps.smallify = 1e-6
  return hps


# ================================


================================================
FILE: hparams/user.py
================================================
import tensorflow as tf

from .defaults import default
from .registry import register

# Add experimental hparams below


================================================
FILE: hparams/utils.py
================================================
import tensorflow as tf


class HParams(tf.contrib.training.HParams):
  """Override of TensorFlow's HParams.

  Replaces HParams.add_hparam(name, value) with simple attribute assignment.
  I.e. there is no need to explicitly add an hparam:
    Replace: `hparams.add_hparam("learning_rate", 0.1)`
    With:    `hparams.learning_rate = 0.1`
  """

  def __setattr__(self, name, value):
    """Adds {name, value} pair to hyperparameters.

    Args:
      name: Name of the hyperparameter.
      value: Value of the hyperparameter. Can be one of the following types:
        int, float, string, int list, float list, or string list.

    Raises:
      ValueError: if one of the arguments is invalid.
    """
    # Keys in kwargs are unique, but 'name' could be the name of a pre-existing
    # attribute of this object. In that case we refuse to use it as a
    # hyperparameter name.
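    # Names with a leading underscore are stored as ordinary object
    # attributes rather than registered as hparams.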
if name[0] == "_": object.__setattr__(self, name, value) return if isinstance(value, (list, tuple)): if not value: raise ValueError( 'Multi-valued hyperparameters cannot be empty: %s' % name) self._hparam_types[name] = (type(value[0]), True) else: self._hparam_types[name] = (type(value), False) object.__setattr__(self, name, value) ================================================ FILE: hparams/vgg.py ================================================ import tensorflow as tf from .registry import register from .defaults import default, default_cifar10 # from https://github.com/tensorflow/models/blob/master/resnet/resnet_main.py @register def vgg16_default(): vgg_default = default_cifar10() vgg_default.initializer = "glorot_uniform_initializer" vgg_default.model = "vgg" vgg_default.learning_rate = 0.01 vgg_default.lr_scheme = "constant" vgg_default.weight_decay_rate = 0.0005 vgg_default.num_classes = 10 vgg_default.optimizer = "adam" vgg_default.adam_epsilon = 1e-6 vgg_default.beta1 = 0.85 vgg_default.beta2 = 0.997 vgg_default.input_shape = [32, 32, 3] vgg_default.output_shape = [10] return vgg_default @register def cifar10_vgg16(): hps = vgg16_default() hps.data = "cifar10" return hps @register def cifar100_vgg16_no_dropout(): hps = vgg16_default() hps.data = "cifar100" hps.input_shape = [32, 32, 3] hps.output_shape = [100] hps.num_classes = 100 hps.channels = 3 hps.learning_rate = 0.0001 return hps @register def cifar10_vgg16_no_dropout(): hps = vgg16_default() hps.data = "cifar10" hps.input_shape = [32, 32, 3] hps.output_shape = [10] hps.num_classes = 10 hps.channels = 3 hps.learning_rate = 0.0001 return hps @register def cifar100_vgg16_targeted_dropout(): hps = cifar100_vgg16_no_dropout() hps.drop_rate = 0.5 hps.dropout_type = "targeted_weight" hps.targ_rate = 0.5 return hps @register def cifar100_vgg16_untargeted_dropout(): hps = cifar100_vgg16_no_dropout() hps.drop_rate = 0.25 hps.dropout_type = "untargeted_weight" return hps @register def cifar100_vgg16_untargeted_unit_dropout(): hps = cifar100_vgg16_no_dropout() hps.drop_rate = 0.25 hps.dropout_type = "untargeted_unit" return hps @register def cifar100_vgg16_targeted_unit_dropout(): hps = cifar100_vgg16_no_dropout() hps.drop_rate = 0.5 hps.dropout_type = "targeted_unit" hps.targ_rate = 0.5 return hps @register def cifar100_vgg16_targeted_unit_dropout_botk75_66(): hps = cifar100_vgg16_targeted_unit_dropout() hps.drop_rate = 0.66 hps.targ_rate = 0.75 return hps @register def cifar100_vgg16_louizos_unit(): hps = cifar100_vgg16_no_dropout() hps.louizos_beta = 2. / 3. hps.louizos_zeta = 1.1 hps.louizos_gamma = -0.1 hps.louizos_cost = 0.001 hps.dropout_type = "louizos_unit" hps.drop_rate = 0.25 return hps @register def cifar100_vgg16_louizos_weight(): hps = cifar100_vgg16_louizos_unit() hps.dropout_type = "louizos_weight" return hps @register def cifar100_vgg16_variational_unscaled(): hps = cifar100_vgg16_no_dropout() hps.dropout_type = "variational" hps.drop_rate = 0.75 hps.thresh = 3 hps.var_scale = 1 hps.weight_decay_rate = 0.0 return hps @register def cifar100_vgg16_variational(): hps = cifar100_vgg16_variational_unscaled() hps.var_scale = 1. / 100 return hps @register def cifar100_vgg16_variational_unit_unscaled(): hps = cifar100_vgg16_variational_unscaled() hps.dropout_type = "variational_unit" return hps @register def cifar100_vgg16_variational_unit(): hps = cifar100_vgg16_variational_unit_unscaled() hps.var_scale = 1. 
/ 100
  return hps


@register
def cifar100_vgg16_smallify_1eneg4():
  hps = cifar100_vgg16_no_dropout()
  hps.dropout_type = "smallify_dropout"
  hps.smallify = 1e-4
  hps.smallify_mv = 0.9
  hps.smallify_thresh = 0.5
  return hps


@register
def cifar100_vgg16_smallify_weight_1eneg5():
  hps = cifar100_vgg16_smallify_1eneg4()
  hps.dropout_type = "smallify_weight_dropout"
  hps.smallify = 1e-5
  return hps


================================================
FILE: models/__init__.py
================================================
__all__ = ["basic", "registry", "resnet", "lenet", "vgg"]

from .basic import *
from .resnet import *
from .registry import *
from .lenet import *
from .vgg import *


================================================
FILE: models/basic/__init__.py
================================================
__all__ = ["basic"]

from .basic import *


================================================
FILE: models/basic/basic.py
================================================
import tensorflow as tf

from ..registry import register
from ..utils.activations import get_activation
from ..utils.initializations import get_init
from ..utils.optimizers import get_optimizer
from ..utils import model_utils


@register("basic")
def get_basic(params, lr):
  """Callable model function compatible with Experiment API.

  Args:
    params: a HParams object containing the model hyperparameters.
    lr: learning rate variable.
  """

  def basic(features, labels, mode, _):
    """The basic neural net template.

    Args:
      features: a dict containing key "inputs"
      mode: training, evaluation or infer
    """
    with tf.variable_scope("basic", initializer=get_init(params)):
      is_training = mode == tf.estimator.ModeKeys.TRAIN
      actvn = get_activation(params)

      x = features["inputs"]
      batch_size = tf.shape(x)[0]

      nonzero = 0
      activations = []
      for i, feature_count in enumerate(params.layers):
        with tf.variable_scope("layer_%d" % i):
          if params.layer_type == "dense":
            x, w = model_utils.collect_vars(
                lambda: model_utils.dense(x, feature_count, params, is_training)
            )
          elif params.layer_type == "conv":
            x, w = model_utils.collect_vars(lambda: tf.layers.conv2d(
                x, feature_count, params.kernel_size, padding="SAME"))

          if params.batch_norm:
            x = tf.layers.batch_normalization(x, training=is_training)
          x = actvn(x)
          activations.append(x)

      x = tf.reshape(x, [batch_size, params.layers[-1]])

      with tf.variable_scope('logit'):
        x = tf.layers.dense(x, params.output_shape[0], use_bias=False)

      if mode in [model_utils.ModeKeys.PREDICT, model_utils.ModeKeys.ATTACK]:
        predictions = {
            'classes': tf.argmax(x, axis=1),
            'logits': x,
            'probabilities': tf.nn.softmax(x, name='softmax_tensor'),
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

      loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=x)

      if params.smallify > 0.0:
        loss += model_utils.switch_loss() * params.smallify

      # Summaries
      # ========================
      if not params.use_tpu:
        tf.summary.scalar("nonzero", model_utils.nonzero_count())
        tf.summary.scalar("percent_sparsity", model_utils.percent_sparsity())
      # ========================

      return model_utils.model_top(labels, tf.nn.softmax(x, -1), loss, lr,
                                   mode, params)

  return basic


================================================
FILE: models/lenet/__init__.py
================================================
__all__ = ["lenet"]


================================================
FILE: models/lenet/lenet.py
================================================
import tensorflow as tf

from ..registry import register
from ..utils.activations import get_activation
from ..utils.dropouts import get_dropout
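# `get_dropout` maps `hparams.dropout_type` (e.g. "targeted_weight") to the
# dropout function that `_conv` below applies directly to its conv kernels.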
from ..utils.initializations import get_init
from ..utils.optimizers import get_optimizer
from ..utils import model_utils


@register("lenet")
def get_lenet(hparams, lr):
  """Callable model function compatible with Experiment API.

  Args:
    hparams: a HParams object containing the model hyperparameters.
    lr: learning rate variable.
  """

  def _conv(name, x, filter_size, in_filters, out_filters, strides, mode):
    """Convolution."""
    with tf.variable_scope(name):
      kernel = tf.get_variable(
          'DW', [filter_size, filter_size, in_filters, out_filters],
          tf.float32)
      is_training = mode == tf.estimator.ModeKeys.TRAIN
      if hparams.dropout_type is not None:
        dropout_fn = get_dropout(hparams.dropout_type)
        kernel = dropout_fn(kernel, hparams, is_training)

      # special case for variational
      if hparams.dropout_type and "variational" in hparams.dropout_type:
        kernel, log_alpha = kernel[0], kernel[1]
        if is_training:
          conved_mu = tf.nn.conv2d(x, kernel, strides=strides, padding='VALID')
          conved_si = tf.sqrt(
              tf.nn.conv2d(
                  tf.square(x),
                  tf.exp(log_alpha) * tf.square(kernel),
                  strides=strides,
                  padding='VALID') + 1e-8)
          return conved_mu + tf.random_normal(
              tf.shape(conved_mu)) * conved_si, tf.count_nonzero(kernel)

      return tf.nn.conv2d(x, kernel, strides, padding='VALID')

  def lenet(features, labels, mode, params):
    """The lenet neural net template.

    Args:
      features: a dict containing key "inputs"
      mode: training, evaluation or infer
    """
    with tf.variable_scope("lenet", initializer=get_init(hparams)):
      is_training = mode == tf.estimator.ModeKeys.TRAIN
      actvn = get_activation(hparams)

      if hparams.use_tpu and 'batch_size' in params.keys():
        hparams.batch_size = params['batch_size']

      # input layer
      x = features["inputs"]
      x = model_utils.standardize_images(x)

      # unflatten
      x = tf.reshape(x, [hparams.batch_size] + hparams.input_shape)

      # conv1
      b_conv1 = tf.get_variable(
          "Variable", initializer=tf.constant_initializer(0.1), shape=[6])
      h_conv1 = _conv('conv1', x, 5, 3, 6, [1, 1, 1, 1], mode) + b_conv1
      h_conv1 = tf.nn.relu(h_conv1)
      h_pool1 = tf.nn.max_pool(
          h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

      # conv2
      b_conv2 = tf.get_variable(
          "Variable_1", initializer=tf.constant_initializer(0.1), shape=[16])
      h_conv2 = _conv('conv2', h_pool1, 5, 6, 16, [1, 1, 1, 1], mode) + b_conv2
      h_conv2 = tf.nn.relu(h_conv2)
      h_pool2 = tf.nn.max_pool(
          h_conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

      # flatten for fc
      h_pool2_flat = tf.reshape(h_pool2, [hparams.batch_size, -1])

      # fc1
      with tf.variable_scope('fc1'):
        h_fc1 = tf.nn.relu(
            model_utils.dense(h_pool2_flat, 500, hparams, is_training))

      # fc2
      with tf.variable_scope('fc2'):
        y = model_utils.dense(h_fc1, 10, hparams, is_training, dropout=False)

      if mode in [model_utils.ModeKeys.PREDICT, model_utils.ModeKeys.ATTACK]:
        predictions = {
            'classes': tf.argmax(y, axis=1),
            'logits': y,
            'probabilities': tf.nn.softmax(y, name='softmax_tensor'),
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

      loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=y)

      if hparams.axis_aligned_cost:
        negativity_cost, axis_alignedness_cost, one_bound = model_utils.axis_aligned_cost(
            y, hparams)
        masked_max = tf.abs(y) * (
            1 - tf.one_hot(tf.argmax(tf.abs(y), -1), hparams.num_classes))
        tf.summary.scalar(
            "logit_prior",
            tf.reduce_mean(
                tf.to_float(
                    tf.logical_and(masked_max >= 0.0, masked_max <= 0.1))))
        tf.summary.scalar("avg_max",
                          tf.reduce_mean(tf.reduce_max(tf.abs(y), axis=-1)))
        loss += hparams.axis_aligned_cost * tf.reduce_mean(
            negativity_cost + axis_alignedness_cost + 20.
* one_bound) if hparams.logit_squeezing: loss += hparams.logit_squeezing * tf.reduce_mean(y**2) if hparams.clp: loss += hparams.clp * tf.reduce_mean( (y[:hparams.batch_size // 2] - y[hparams.batch_size // 2:])**2) if hparams.dropout_type and "variational" in hparams.dropout_type: # prior DKL part of the ELBO graph = tf.get_default_graph() node_defs = [ n for n in graph.as_graph_def().node if 'log_alpha' in n.name ] log_alphas = [ graph.get_tensor_by_name(n.name + ":0") for n in node_defs ] divergences = [model_utils.dkl_qp(la) for la in log_alphas] # combine to form the ELBO N = float(50000) dkl = tf.reduce_sum(tf.stack(divergences)) warmup_steps = 50000 inv_base = tf.exp(tf.log(0.01) / warmup_steps) inv_decay = inv_base**( warmup_steps - tf.to_float(tf.train.get_global_step())) loss += (1. / N) * dkl * inv_decay * hparams.var_scale if hparams.smallify > 0.0: loss += model_utils.switch_loss() * hparams.smallify return model_utils.model_top(labels, tf.nn.softmax(y, -1), loss, lr, mode, hparams) return lenet ================================================ FILE: models/registry.py ================================================ from ..training.lr_schemes import get_lr import tensorflow as tf _MODELS = dict() def register(name): def add_to_dict(fn): global _MODELS _MODELS[name] = fn return fn return add_to_dict def get_model(hparams): def model_fn(features, labels, mode, params=None): lr = tf.constant(0.0) if mode == tf.estimator.ModeKeys.TRAIN: lr = get_lr(hparams) return _MODELS[hparams.model](hparams, lr)(features, labels, mode, params) return model_fn ================================================ FILE: models/resnet/__init__.py ================================================ __all__ = ["resnet"] ================================================ FILE: models/resnet/resnet.py ================================================ import tensorflow as tf import numpy as np from ..utils import dropouts from ..utils.activations import get_activation from ..utils.dropouts import get_dropout, smallify_dropout from ..utils.initializations import get_init from ..registry import register from ..utils import model_utils from ..utils.model_utils import ModeKeys from ...training import tpu @register("resnet") def get_resnet(hparams, lr): """Callable model function compatible with Experiment API. 
Args: params: a HParams object containing values for fields: use_bottleneck: bool to bottleneck the network num_residual_units: number of residual units num_classes: number of classes batch_size: batch size weight_decay_rate: weight decay rate """ def resnet(features, labels, mode, params): if hparams.use_tpu and 'batch_size' in params.keys(): hparams.batch_size = params['batch_size'] is_training = mode == tf.estimator.ModeKeys.TRAIN def _residual(x, out_filter, stride, projection=False): """Residual unit with 2 sub layers.""" is_variational = hparams.dropout_type is not None and "variational" in hparams.dropout_type orig_x = x if not is_variational: x = model_utils.batch_norm(x, hparams, is_training) x = tf.nn.relu(x) if projection: orig_x = model_utils.conv( x, 1, out_filter, hparams, is_training=is_training, strides=stride, name="shortcut") with tf.variable_scope('sub1'): x = model_utils.conv( x, 3, out_filter, hparams, is_training=is_training, strides=stride, name='conv1') x = model_utils.batch_norm(x, hparams, is_training) x = tf.nn.relu(x) with tf.variable_scope('sub2'): x = model_utils.conv( x, 3, out_filter, hparams, is_training=is_training, strides=[1, 1, 1, 1], name='conv2') x += orig_x return x def _bottleneck_residual(x, out_filter, stride, projection=False): """Residual unit with 3 sub layers.""" is_variational = hparams.dropout_type is not None and "variational" in hparams.dropout_type orig_x = x if not is_variational: x = model_utils.batch_norm(x, hparams, is_training) x = tf.nn.relu(x) if projection: orig_x = model_utils.conv( x, 1, 4 * out_filter, hparams, is_training=is_training, strides=stride, name="shortcut") with tf.variable_scope('sub1'): x = model_utils.conv( x, 1, out_filter, hparams, is_training=is_training, strides=[1, 1, 1, 1], name='conv1') x = model_utils.batch_norm(x, hparams, is_training) x = tf.nn.relu(x) with tf.variable_scope('sub2'): x = model_utils.conv( x, 3, out_filter, hparams, is_training=is_training, strides=stride, name='conv2') x = model_utils.batch_norm(x, hparams, is_training) x = tf.nn.relu(x) with tf.variable_scope('sub3'): x = model_utils.conv( x, 1, 4 * out_filter, hparams, is_training=is_training, strides=[1, 1, 1, 1], name='conv3') return orig_x + x def _l1(): """L1 weight decay loss.""" if hparams.l1_norm == 0: return 0 costs = [] for var in tf.trainable_variables(): if "DW" in var.name and "logit" not in var.name: costs.append(tf.reduce_mean(tf.abs(var))) return tf.multiply(hparams.l1_norm, tf.add_n(costs)) def _fully_connected(x, out_dim): """FullyConnected layer for final output.""" prev_dim = np.product(x.get_shape().as_list()[1:]) x = tf.reshape(x, [hparams.batch_size, prev_dim]) w = tf.get_variable('DW', [prev_dim, out_dim]) b = tf.get_variable( 'biases', [out_dim], initializer=tf.zeros_initializer()) return tf.nn.xw_plus_b(x, w, b) def _global_avg_pool(x): assert x.get_shape().ndims == 4 if hparams.data_format == "channels_last": return tf.reduce_mean(x, [1, 2]) return tf.reduce_mean(x, [2, 3]) def _stride_arr(stride): """Map a stride scalar to the stride array for tf.nn.conv2d.""" if hparams.data_format == "channels_last": return [1, stride, stride, 1] return [1, 1, stride, stride] if mode == ModeKeys.PREDICT or mode == ModeKeys.ATTACK: if "labels" in features: labels = features["labels"] with tf.variable_scope("resnet", initializer=get_init(hparams)): hparams.mode = mode strides = [1, 2, 2, 2] res_func = (_residual if not hparams.use_bottleneck else _bottleneck_residual) filters = hparams.residual_filters large_input = 
hparams.input_shape[0] > 32 # 3 and 16 picked from example implementation with tf.variable_scope('init'): x = features["inputs"] stride = _stride_arr(2) if large_input else _stride_arr(1) x = model_utils.conv( x, 7, filters[0], hparams, strides=stride, dropout=False, name='init_conv') if large_input: x = tf.layers.max_pooling2d( inputs=x, pool_size=3, strides=2, padding="SAME", data_format=hparams.data_format) with tf.variable_scope('unit_1_0'): x = res_func(x, filters[1], _stride_arr(strides[0]), True) for i in range(1, hparams.residual_units[0]): with tf.variable_scope('unit_1_%d' % i): x = res_func(x, filters[1], _stride_arr(1), False) with tf.variable_scope('unit_2_0'): x = res_func(x, filters[2], _stride_arr(strides[1]), True) for i in range(1, hparams.residual_units[1]): with tf.variable_scope('unit_2_%d' % i): x = res_func(x, filters[2], _stride_arr(1), False) with tf.variable_scope('unit_3_0'): x = res_func(x, filters[3], _stride_arr(strides[2]), True) for i in range(1, hparams.residual_units[2]): with tf.variable_scope('unit_3_%d' % i): x = res_func(x, filters[3], _stride_arr(1), False) if len(filters) == 5: with tf.variable_scope('unit_4_0'): x = res_func(x, filters[4], _stride_arr(strides[3]), True) for i in range(1, hparams.residual_units[3]): with tf.variable_scope('unit_4_%d' % i): x = res_func(x, filters[4], _stride_arr(1), False) x = model_utils.batch_norm(x, hparams, is_training) x = tf.nn.relu(x) with tf.variable_scope('unit_last'): x = _global_avg_pool(x) with tf.variable_scope('logit'): logits = _fully_connected(x, hparams.num_classes) predictions = tf.nn.softmax(logits) if mode in [ModeKeys.PREDICT, ModeKeys.ATTACK]: return tf.estimator.EstimatorSpec( mode=mode, predictions={ 'classes': tf.argmax(predictions, axis=1), 'logits': logits, 'probabilities': predictions, }) with tf.variable_scope('costs'): xent = tf.losses.sparse_softmax_cross_entropy( labels=labels, logits=logits) cost = tf.reduce_mean(xent, name='xent') if is_training: cost += model_utils.weight_decay(hparams) cost += _l1() if hparams.dropout_type is not None: if "louizos" in hparams.dropout_type: cost += hparams.louizos_cost * model_utils.louizos_complexity_cost( hparams) / 50000 if "variational" in hparams.dropout_type: # prior DKL part of the ELBO graph = tf.get_default_graph() node_defs = [ n for n in graph.as_graph_def().node if 'log_alpha' in n.name ] log_alphas = [ graph.get_tensor_by_name(n.name + ":0") for n in node_defs ] print([ n.name for n in graph.as_graph_def().node if 'log_alpha' in n.name ]) print("found %i logalphas" % len(log_alphas)) divergences = [dropouts.dkl_qp(la) for la in log_alphas] # combine to form the ELBO N = float(50000) dkl = tf.reduce_sum(tf.stack(divergences)) warmup_steps = 50000 dkl = (1. 
/ N) * dkl * tf.minimum( 1.0, tf.to_float(tf.train.get_global_step()) / warmup_steps) * hparams.var_scale cost += dkl tf.summary.scalar("dkl", dkl) if hparams.ard_cost > 0.0: cost += model_utils.ard_cost() * hparams.ard_cost if hparams.smallify > 0.0: cost += model_utils.switch_loss() * hparams.smallify # Summaries # ======================== tf.summary.scalar("total_nonzero", model_utils.nonzero_count()) all_weights = tf.concat( [ tf.reshape(v, [-1]) for v in tf.trainable_variables() if "DW" in v.name ], axis=0) tf.summary.histogram("weights", all_weights) # ======================== return model_utils.model_top(labels, predictions, cost, lr, mode, hparams) return resnet ================================================ FILE: models/utils/__init__.py ================================================ __all__ = [ "activations", "dropouts", "initializations", "model_utils", "optimizers" ] from .activations import * from .dropouts import * from .initializations import * from .model_utils import * from .optimizers import * ================================================ FILE: models/utils/activations.py ================================================ import tensorflow as tf _ACTIVATION = dict() def register(name): def add_to_dict(fn): global _ACTIVATION _ACTIVATION[name] = fn return fn return add_to_dict def get_activation(params): return _ACTIVATION[params.activation](params) @register("relu") def relu(params): return tf.nn.relu @register("brelu") def brelu(params): def fn(a): idx = tf.range(a.shape[-1]) idx = tf.mod(idx, 2) idx = tf.cast(idx, tf.bool) even = tf.nn.relu(a) odd = -tf.nn.relu(-a) return tf.where(idx, odd, even) return fn @register("selu") def selu(params): return tf.nn.selu @register("elu") def elu(params): return tf.nn.elu @register("sigmoid") def sigmoid(params): return tf.nn.sigmoid @register("swish") def swish(params): return lambda x: tf.nn.sigmoid(x) * x @register("tanh") def tanh(params): return tf.nn.tanh ================================================ FILE: models/utils/dropouts.py ================================================ import numpy as np import tensorflow as tf _DROPOUTS = dict() def register(name): def add_to_dict(fn): global _DROPOUTS _DROPOUTS[name] = fn return fn return add_to_dict def get_dropout(name): return _DROPOUTS[name] @register("targeted_weight") def targeted_weight_dropout(w, params, is_training): drop_rate = params.drop_rate targ_perc = params.targ_rate w_shape = w.shape w = tf.reshape(w, [-1, w_shape[-1]]) norm = tf.abs(w) idx = tf.to_int32(targ_perc * tf.to_float(tf.shape(w)[0])) threshold = tf.contrib.framework.sort(norm, axis=0)[idx] mask = norm < threshold[None, :] if not is_training: w = (1. - tf.to_float(mask)) * w w = tf.reshape(w, w_shape) return w mask = tf.to_float( tf.logical_and(tf.random_uniform(tf.shape(w)) < drop_rate, mask)) w = (1. - mask) * w w = tf.reshape(w, w_shape) return w @register("targeted_weight_random") def targeted_weight_random(w, params, is_training): drop_rate = params.drop_rate targ_perc = params.targ_rate w_shape = w.shape w = tf.reshape(w, [-1, w_shape[-1]]) switch = tf.get_variable( "mask", w.shape, initializer=tf.random_uniform_initializer(), trainable=False) if is_training: mask = tf.logical_and(switch < targ_perc, tf.random_uniform(w.shape) < drop_rate) else: mask = switch < targ_perc mask = 1. 
- tf.to_float(mask) mask = tf.stop_gradient(mask) w = mask * w w = tf.reshape(w, w_shape) return w @register("ramping_targeted_weight_random") def ramping_targeted_weight_random(w, params, is_training): drop_rate = params.drop_rate targ_perc = 0.95 * params.targ_rate * tf.minimum( 1.0, tf.to_float(tf.train.get_global_step()) / 20000.) targ_perc = targ_perc + 0.05 * params.targ_rate * tf.maximum( 0.0, tf.minimum(1.0, (tf.to_float(tf.train.get_global_step()) - 20000.) / 20000.)) w_shape = w.shape w = tf.reshape(w, [-1, w_shape[-1]]) switch = tf.get_variable( "mask", w.shape, initializer=tf.random_uniform_initializer(), trainable=False) if is_training: mask = tf.logical_and(switch < targ_perc, tf.random_uniform(w.shape) < drop_rate) else: mask = switch < (targ_perc * drop_rate) mask = 1. - tf.to_float(mask) mask = tf.stop_gradient(mask) w = mask * w w = tf.reshape(w, w_shape) return w @register("targeted_weight_piecewise") def targeted_weight_piecewise_dropout(w, params, is_training): drop_rate = params.drop_rate * tf.minimum( 1.0, tf.to_float(tf.train.get_global_step()) / 40000.) targ_perc = 0.95 * params.targ_rate * tf.minimum( 1.0, tf.to_float(tf.train.get_global_step()) / 20000.) targ_perc = targ_perc + 0.05 * params.targ_rate * tf.maximum( 0.0, tf.minimum(1.0, (tf.to_float(tf.train.get_global_step()) - 20000.) / 20000.)) w_shape = w.shape w = tf.reshape(w, [-1, w_shape[-1]]) norm = tf.abs(w) idx = tf.to_int32(targ_perc * tf.to_float(tf.shape(w)[0])) threshold = tf.contrib.framework.sort(norm, axis=0)[idx] mask = norm < threshold[None, :] if not is_training: w = w * (1 - tf.to_float(mask)) return tf.reshape(w, w_shape) mask = tf.where( tf.logical_and((1. - drop_rate) < tf.random_uniform(tf.shape(w)), mask), tf.ones_like(w, dtype=tf.float32), tf.zeros_like(w, dtype=tf.float32)) w = (1 - mask) * w w = tf.reshape(w, w_shape) return w @register("targeted_unit_piecewise") def targeted_unit_piecewise(w, params, is_training): drop_rate = params.drop_rate * tf.minimum( 1.0, tf.to_float(tf.train.get_global_step()) / 40000.) targ_perc = 0.95 * params.targ_rate * tf.minimum( 1.0, tf.to_float(tf.train.get_global_step()) / 20000.) targ_perc = targ_perc + 0.05 * params.targ_rate * tf.maximum( 0.0, tf.minimum(1.0, (tf.to_float(tf.train.get_global_step()) - 20000.) / 20000.)) w_shape = w.shape w = tf.reshape(w, [-1, w.shape[-1]]) norm = tf.norm(w, axis=0) idx = tf.to_int32(targ_perc * tf.to_float(w.shape[1])) sorted_norms = tf.contrib.framework.sort(norm) threshold = sorted_norms[idx] mask = (norm < threshold)[None, :] if not is_training: w = w * (1 - tf.to_float(mask)) return tf.reshape(w, w_shape) mask = tf.tile(mask, [w.shape[0], 1]) mask = tf.where( tf.logical_and((1. 
- drop_rate) < tf.random_uniform(tf.shape(w)), mask), tf.ones_like(w, dtype=tf.float32), tf.zeros_like(w, dtype=tf.float32)) w = tf.reshape((1 - mask) * w, w_shape) return w @register("delayed_targeted_weight_prune") def delayed_targeted_weight(w, params, is_training): orig_w = w targ_perc = params.targ_rate w_shape = w.shape w = tf.reshape(w, [-1, w_shape[-1]]) norm = tf.abs(w) idx = tf.to_int32(targ_perc * tf.to_float(tf.shape(w)[0])) threshold = tf.contrib.framework.sort(norm, axis=0)[idx] mask = norm >= threshold[None, :] w = w * tf.to_float(mask) cond = tf.to_float(tf.train.get_global_step() >= params.dropout_delay_steps) return cond * tf.reshape(w, w_shape) + (1 - cond) * orig_w @register("delayed_targeted_unit_prune") def delayed_targeted_unit(x, params, is_training): orig_x = x w = tf.reshape(x, [-1, x.shape[-1]]) norm = tf.norm(w, axis=0) idx = int(params.targ_rate * int(w.shape[1])) sorted_norms = tf.contrib.framework.sort(norm) threshold = sorted_norms[idx] mask = (norm >= threshold)[None, None] w = w * tf.to_float(mask) return tf.cond( tf.greater(tf.train.get_global_step(), params.dropout_delay_steps), lambda: tf.reshape(w, x.shape), lambda: orig_x) @register("untargeted_weight") def untargeted_weight(w, params, is_training): if not is_training: return w return tf.nn.dropout(w, keep_prob=(1. - params.drop_rate)) @register("targeted_unit") def targeted_unit_dropout(x, params, is_training): w = tf.reshape(x, [-1, x.shape[-1]]) norm = tf.norm(w, axis=0) idx = int(params.targ_rate * int(w.shape[1])) sorted_norms = tf.contrib.framework.sort(norm) threshold = sorted_norms[idx] mask = (norm < threshold)[None, :] mask = tf.tile(mask, [w.shape[0], 1]) if not is_training: w = (1. - tf.to_float(mask)) * w w = tf.reshape(w, x.shape) return w mask = tf.where( tf.logical_and((1. - params.drop_rate) < tf.random_uniform(tf.shape(w)), mask), tf.ones_like(w, dtype=tf.float32), tf.zeros_like(w, dtype=tf.float32)) x = tf.reshape((1 - mask) * w, x.shape) return x @register("targeted_unit_random") def targeted_unit_random(w, params, is_training): drop_rate = params.drop_rate targ_perc = params.targ_rate w_shape = w.shape w = tf.reshape(w, [-1, w_shape[-1]]) switch = tf.get_variable( "mask", w.shape[-1], initializer=tf.random_uniform_initializer(), trainable=False) if is_training: mask = tf.logical_and(switch < targ_perc, tf.random_uniform(switch.shape) < drop_rate) else: mask = switch < targ_perc mask = 1. - tf.to_float(mask) mask = tf.stop_gradient(mask[None, :]) w = mask * w w = tf.reshape(w, w_shape) return w @register("targeted_ard") def targeted_ard_dropout(w, x, params, is_training): if not is_training: return w x = tf.reshape(x, [-1, x.shape[-1]]) activation_norms = tf.reduce_mean(tf.abs(x), axis=0) w_shape = w.shape w = tf.reshape(w, [-1, w_shape[-2], w_shape[-1]]) norm = tf.norm(w, axis=(0, 2)) * activation_norms idx = int(params.targ_rate * int(w.shape[1])) sorted_norms = tf.contrib.framework.sort(norm) threshold = sorted_norms[idx] mask = (norm < threshold)[None, :, None] mask = tf.tile(mask, [w.shape[0], 1, w.shape[-1]]) mask = tf.where( tf.logical_and((1. 
- params.drop_rate) < tf.random_uniform(tf.shape(w)), mask), tf.ones_like(w, dtype=tf.float32), tf.zeros_like(w, dtype=tf.float32)) w = tf.reshape((1 - mask) * w, w_shape) return w @register("untargeted_unit") def unit_dropout(w, params, is_training): if not is_training: return w w_shape = w.shape w = tf.reshape(w, [-1, w.shape[-1]]) mask = tf.to_float( tf.random_uniform([int(w.shape[1])]) > params.drop_rate)[None, :] w = tf.reshape(mask * w, w_shape) return w / (1 - params.drop_rate) @register("louizos_weight") def louizos_weight_dropout(w, params, is_training): with tf.variable_scope("louizos"): EPS = 1e-8 noise = (1 - EPS) * tf.random_uniform(w.shape) + (EPS / 2) rate = np.log(1 - params.drop_rate) - np.log(params.drop_rate) gates = tf.get_variable( "gates", shape=w.shape, initializer=tf.random_normal_initializer(mean=rate, stddev=0.01)) if is_training: s = tf.nn.sigmoid( (gates + tf.log(noise / (1. - noise))) / params.louizos_beta) s_bar = s * ( params.louizos_zeta - params.louizos_gamma) + params.louizos_gamma else: s = tf.nn.sigmoid(gates) s_bar = s * ( params.louizos_zeta - params.louizos_gamma) + params.louizos_gamma mask = tf.minimum(1., tf.maximum(0., s_bar)) return mask * w @register("louizos_unit") def louizos_unit_dropout(w, params, is_training): with tf.variable_scope("louizos"): EPS = 1e-8 noise = (1 - EPS) * \ tf.random_uniform([w.shape.as_list()[-1]]) + (EPS / 2) rate = np.log(1 - params.drop_rate) - np.log(params.drop_rate) gates = tf.get_variable( "gates", shape=[w.shape.as_list()[-1]], initializer=tf.random_normal_initializer(mean=rate, stddev=0.01)) if is_training: s = tf.nn.sigmoid( (gates + tf.log(noise / (1. - noise))) / params.louizos_beta) s_bar = s * ( params.louizos_zeta - params.louizos_gamma) + params.louizos_gamma else: s = tf.nn.sigmoid(gates) s_bar = s * ( params.louizos_zeta - params.louizos_gamma) + params.louizos_gamma mask = tf.minimum(1., tf.maximum(0., s_bar)) return mask * w # from https://github.com/BayesWatch/tf-variational-dropout/blob/master/variational_dropout.py def log_sigma2_variable(shape, ard_init=-10.): return tf.get_variable( "log_sigma2", shape=shape, initializer=tf.constant_initializer(ard_init)) # from https://github.com/BayesWatch/tf-variational-dropout/blob/master/variational_dropout.py def get_log_alpha(log_sigma2, w): log_alpha = clip(log_sigma2 - paranoid_log(tf.square(w))) return tf.identity(log_alpha, name='log_alpha') # from https://github.com/BayesWatch/tf-variational-dropout/blob/master/variational_dropout.py def paranoid_log(x, eps=1e-8): v = tf.log(x + eps) return v # from https://github.com/BayesWatch/tf-variational-dropout/blob/master/variational_dropout.py def clip(x): return tf.clip_by_value(x, -8., 8.) 
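# Illustrative sketch (an exposition aid, not part of the original file):
# variational dropout assigns each weight a dropout rate
# alpha = sigma^2 / w^2, so log_alpha = log_sigma2 - log(w^2), which is what
# get_log_alpha computes above via paranoid_log and clip for numerical
# stability. The same quantity in plain NumPy, for inspecting learned alphas
# outside the graph:
def log_alpha_numpy(log_sigma2, w, eps=1e-8):
  """NumPy mirror of get_log_alpha; at eval, weights with log_alpha >= 3 are masked."""
  return np.clip(log_sigma2 - np.log(np.square(w) + eps), -8., 8.)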
def dkl_qp(log_alpha): k1, k2, k3 = 0.63576, 1.8732, 1.48695 C = -k1 mdkl = k1 * tf.nn.sigmoid(k2 + k3 * log_alpha) - 0.5 * tf.log1p( tf.exp(-log_alpha)) + C return -tf.reduce_sum(mdkl) @register("variational") def variational_dropout(w, _, is_training): with tf.variable_scope("variational"): log_sigma2 = log_sigma2_variable(w.get_shape()) log_alpha = get_log_alpha(log_sigma2, w) select_mask = tf.cast(tf.less(log_alpha, 3), tf.float32) if is_training: return w, log_alpha return w * select_mask, log_alpha @register("variational_unit") def variational_unit_dropout(w, _, is_training): with tf.variable_scope("variational"): log_sigma2 = log_sigma2_variable(int(w.shape[-1])) log_sigma2 = tf.reshape(log_sigma2, [1, 1, 1, -1]) log_sigma2 = tf.tile(log_sigma2, [w.shape[0], w.shape[1], w.shape[2], 1]) log_alpha = get_log_alpha(log_sigma2, w) select_mask = tf.cast(tf.less(log_alpha, 3), tf.float32) if is_training: return w, log_alpha return w * select_mask, log_alpha @register("smallify_dropout") def smallify_dropout(x, hparams, is_training): with tf.variable_scope("smallify", reuse=tf.AUTO_REUSE): switch = tf.get_variable( "switch", shape=[1] * (len(x.shape) - 1) + [x.shape[-1]], initializer=tf.random_uniform_initializer()) mask = tf.get_variable( initializer=lambda: tf.ones_like(switch.initialized_value()), name="mask", trainable=False) exp_avg = tf.get_variable( initializer=lambda: tf.sign(switch.initialized_value()), name="exp_avg", trainable=False) exp_std = tf.get_variable( initializer=lambda: tf.zeros_like(switch.initialized_value()), name="exp_std", trainable=False) gates = switch * mask batch_sign = tf.sign(switch) diff = batch_sign - exp_avg new_mask = tf.cast(tf.less(exp_std, hparams.smallify_thresh), tf.float32) if not is_training: return tf.identity(x * gates, name="smallified") with tf.control_dependencies([ tf.assign(mask, mask * new_mask), tf.assign( exp_std, hparams.smallify_mv * exp_std + (1 - hparams.smallify_mv) * diff**2), tf.assign( exp_avg, hparams.smallify_mv * exp_avg + (1 - hparams.smallify_mv) * batch_sign) ]): return tf.identity(x * gates, name="smallified") @register("smallify_weight_dropout") def smallify_weight_dropout(x, hparams, is_training): with tf.variable_scope("smallify"): switch = tf.get_variable( "switch", shape=x.shape, initializer=tf.random_uniform_initializer()) mask = tf.get_variable( initializer=lambda: tf.ones_like(switch.initialized_value()), name="mask", trainable=False) exp_avg = tf.get_variable( initializer=lambda: tf.sign(switch.initialized_value()), name="exp_avg", trainable=False) exp_std = tf.get_variable( initializer=lambda: tf.zeros_like(switch.initialized_value()), name="exp_std", trainable=False) gates = switch * mask batch_sign = tf.sign(switch) diff = batch_sign - exp_avg new_mask = tf.cast(tf.less(exp_std, hparams.smallify_thresh), tf.float32) if not is_training: return tf.identity(x * gates, name="smallified") with tf.control_dependencies([ tf.assign(mask, mask * new_mask), tf.assign( exp_std, hparams.smallify_mv * exp_std + (1 - hparams.smallify_mv) * diff**2), tf.assign( exp_avg, hparams.smallify_mv * exp_avg + (1 - hparams.smallify_mv) * batch_sign) ]): return tf.identity(x * gates, name="smallified") ================================================ FILE: models/utils/initializations.py ================================================ import tensorflow as tf _INIT = dict() def register(name): def add_to_dict(fn): global _INIT _INIT[name] = fn return fn return add_to_dict def get_init(params): return _INIT[params.initializer](params) 
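# Example (illustrative addition): new initializers plug into the same
# registry, and a config selects one by setting hparams.initializer to the
# registered name, which get_init resolves above. The name
# "orthogonal_example" below is hypothetical and not used by any hparams set
# in this repository.
@register("orthogonal_example")
def orthogonal_example(params):
  return tf.orthogonal_initializer()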
@register("normal") def normal(params): return tf.random_normal_initializer(mean=params.mean, stddev=params.sd) @register("constant") def constant(params): return tf.constant_initializer(0.1, tf.float32) @register("uniform_unit_scaling") def uniform_unit_scaling(params): return tf.uniform_unit_scaling_initializer() @register("glorot_normal_initializer") def glorot_normal_initializer(params): return tf.glorot_normal_initializer() @register("glorot_uniform_initializer") def glorot_uniform_initializer(params): return tf.glorot_uniform_initializer() @register("variance_scaling_initializer") def variance_scaling_initializer(params): return tf.variance_scaling_initializer() class RandomUnitScaling(tf.keras.initializers.Initializer): def __call__(self, shape, dtype=None, partition_info=None): if len(shape) == 2: dim = (shape[0] + shape[1]) / 2. elif len(shape) == 4: dim = shape[0] * shape[1] * (shape[2] + shape[3]) / 2. m = tf.sqrt(3 / tf.to_float(dim)) init = m * (2 * tf.random_uniform(shape) - 1) return init class RandomHadamardConstant(tf.keras.initializers.Initializer): def __call__(self, shape, dtype=None, partition_info=None): dim = (shape[0] + shape[1]) / 2. flip = 2 * tf.round(tf.random_uniform(shape)) - 1 m = tf.pow(dim, -1 / 2.) return m * flip class RandomHadamardUnscaled(tf.keras.initializers.Initializer): def __call__(self, shape, dtype=None, partition_info=None): return 2 * tf.round(tf.random_uniform(shape)) - 1 class RandomWarpedUniform(tf.keras.initializers.Initializer): def __init__(self, k=2): self.k = k def __call__(self, shape, dtype=None, partition_info=None): if len(shape) == 2: dim = (shape[0] + shape[1]) / 2. elif len(shape) == 4: dim = shape[0] * shape[1] * (shape[2] + shape[3]) / 2. m = tf.sqrt(3 / tf.to_float(dim)) eps = 1e-10 unif = (1 - eps) * tf.random_uniform(shape) + eps / 2 skew_unif = tf.nn.sigmoid(self.k * tf.log(unif / (1 - unif))) init = m * (2 * skew_unif - 1) return init @register("warped_unif") def warped_unif(params): return RandomWarpedUniform(params.k) @register("unit_scaling") def unit_scaling(params): return RandomUnitScaling() @register("hadamard_constant") def hadamard_constant(params): return RandomHadamardConstant() @register("hadamard_unscaled") def hadamard_unscaled(params): return RandomHadamardUnscaled() ================================================ FILE: models/utils/model_utils.py ================================================ import operator from functools import reduce import tensorflow as tf from tensorflow.contrib.tpu.python.tpu import tpu_estimator from . import dropouts from .optimizers import get_optimizer from ...training import tpu class ModeKeys(object): TRAIN = tf.estimator.ModeKeys.TRAIN EVAL = tf.estimator.ModeKeys.EVAL TEST = "test" PREDICT = tf.estimator.ModeKeys.PREDICT ATTACK = "attack" def collect_vars(fn): """Collect all new variables created within `fn`. Args: fn: a function that takes no arguments and creates trainable tf.Variable objects. Returns: outputs: the outputs of `fn()`. new_vars: a list of the newly created variables. 
""" previous_vars = set(tf.trainable_variables()) outputs = fn() current_vars = set(tf.trainable_variables()) new_vars = current_vars.difference(previous_vars) return outputs, list(new_vars) def dense(x, units, hparams, is_training, dropout=True): with tf.variable_scope(None, default_name="dense") as scope: w = tf.get_variable("kernel", shape=[x.shape[1], units], dtype=tf.float32) b = tf.get_variable( "bias", shape=[units], dtype=tf.float32, initializer=tf.zeros_initializer()) if dropout and hparams.dropout_type is not None and is_training: w = dropouts.get_dropout(hparams.dropout_type)(w, hparams, is_training) w = tf.identity(w, name="post_dropout") y = tf.matmul(x, w) + b return y def conv(x, filter_size, out_filters, hparams, strides=[1, 1, 1, 1], padding="SAME", is_training=False, activation=None, dropout=True, name=None, schit_layer=False): """Convolution.""" with tf.variable_scope(name, default_name="conv2d"): if hparams.data_format == "channels_last": in_filters = x.shape[-1] else: in_filters = x.shape[1] kernel = tf.get_variable( 'DW', [filter_size, filter_size, in_filters, out_filters], tf.float32) use_dropout = hparams.dropout_type is not None and dropout # schit layer if schit_layer: scale = tf.get_variable( 'scale', kernel.shape[-1], tf.float32, initializer=tf.zeros_initializer()) kernel = hparams.lipschitz_constant * tf.nn.sigmoid( scale) * kernel / tf.norm( tf.reshape(kernel, shape=[-1, kernel.shape[-1]]), axis=0) if use_dropout: dropout_fn = dropouts.get_dropout(hparams.dropout_type) if hparams.dropout_type == "targeted_ard": kernel = dropout_fn(kernel, x, hparams, is_training) else: kernel = dropout_fn(kernel, hparams, is_training) # special case for variational if "variational" in hparams.dropout_type: kernel, log_alpha = kernel[0], kernel[1] if is_training: conved_mu = tf.nn.conv2d(x, kernel, strides=strides, padding=padding) conved_si = tf.sqrt( tf.nn.conv2d( tf.square(x), tf.exp(log_alpha) * tf.square(kernel), strides=strides, padding=padding) + 1e-8) conved = conved_mu + tf.random_normal( tf.shape(conved_mu)) * conved_si conved = tf.identity(conved, name="post_dropout") return conved data_format = "NHWC" if hparams.data_format == "channels_last" else "NCHW" conv = tf.nn.conv2d( x, kernel, strides, padding=padding, data_format=data_format) if activation: conv = activation(conv) conv = tf.identity(conv, name="post_dropout") return conv def weight_decay_and_noise(loss, hparams, learning_rate, var_list=None): """Apply weight decay and weight noise.""" weight_decay_loss = weight_decay(hparams) tf.summary.scalar("losses/weight_decay", weight_decay_loss) weight_noise_ops = weight_noise(hparams, learning_rate) with tf.control_dependencies(weight_noise_ops): loss = tf.identity(loss) loss += weight_decay_loss return loss def weight_noise(hparams, learning_rate): """Apply weight noise to vars in var_list.""" if not hparams.weight_noise_rate: return [tf.no_op()] tf.logging.info("Applying weight noise scaled by learning rate, " "noise_rate: %0.5f", hparams.weight_noise_rate) noise_ops = [] noise_vars = [v for v in tf.trainable_variables() if "/body/" in v.name] for v in var_list: with tf.device(v._ref().device): # pylint: disable=protected-access scale = hparams.weight_noise_rate * learning_rate * 0.001 tf.summary.scalar("weight_noise_scale", scale) noise = tf.truncated_normal(v.shape) * scale noise_op = v.assign_add(noise) noise_ops.append(noise_op) return noise_ops def weight_decay(hparams): """Apply weight decay to vars in var_list.""" if not hparams.weight_decay_rate: return 0. 
only_features = hparams.weight_decay_only_features var_list = [v for v in tf.trainable_variables()] weight_decays = [] for v in var_list: # Weight decay. is_feature = any(n in v.name for n in hparams.weight_decay_weight_names) if (not only_features) or is_feature: if hparams.initializer == "hadamard_unscaled": v_loss = tf.reduce_sum((tf.abs(v) - 1)**2) / 2 else: v_loss = tf.nn.l2_loss(v) weight_decays.append(v_loss) return tf.reduce_sum(weight_decays, axis=0) * hparams.weight_decay_rate def axis_aligned_cost(logits, hparams): negativity_cost = tf.nn.relu(-logits) max_mask = tf.one_hot(tf.argmax(tf.abs(logits), -1), hparams.num_classes) min_logits = tf.abs(logits) * (1 - max_mask) max_logit = tf.abs(logits) * max_mask one_bound = tf.nn.relu(logits - hparams.logit_bound) axis_alignedness_cost = tf.nn.relu(min_logits - 0.1 * hparams.logit_bound) logits_packed = tf.reduce_all(tf.less(max_logit, hparams.logit_bound), -1) logits_packed = tf.logical_and(logits_packed, tf.reduce_all( tf.less(min_logits, 0.1 * hparams.logit_bound), -1)) logits_packed = tf.reduce_mean(tf.to_float(logits_packed)) tf.summary.scalar("logits_packed", logits_packed) tf.summary.scalar( "logits_max", tf.to_float(tf.shape(max_logit)[-1]) * tf.reduce_mean(max_logit)) return negativity_cost, axis_alignedness_cost, one_bound def ard_cost(): with tf.variable_scope("ard_cost"): cost = 0 for v in tf.trainable_variables(): if "kernel" in v.name or "DW" in v.name: rv = tf.reshape(v, [-1, int(v.shape[-1])]) sq_rv = tf.square(rv) sum_sq = tf.reduce_sum(sq_rv, axis=1, keepdims=True) ard = sq_rv / (sum_sq / tf.cast(tf.shape(sq_rv)[1], tf.float32) ) - 0.5 * tf.log(sum_sq) cost += tf.reduce_sum(ard) return cost def shape_list(x): """Return list of dims, statically where possible.""" x = tf.convert_to_tensor(x) # If unknown rank, return dynamic shape if x.get_shape().dims is None: return tf.shape(x) static = x.get_shape().as_list() shape = tf.shape(x) ret = [] for i, dim in enumerate(static): if dim is None: dim = shape[i] ret.append(dim) return ret def standardize_images(x): """Image standardization on batches.""" with tf.name_scope("standardize_images", [x]): x = tf.to_float(x) x_mean = tf.reduce_mean(x, axis=[1, 2, 3], keep_dims=True) x_variance = tf.reduce_mean( tf.square(x - x_mean), axis=[1, 2, 3], keep_dims=True) x_shape = shape_list(x) num_pixels = tf.to_float(x_shape[1] * x_shape[2] * x_shape[3]) x = (x - x_mean) / tf.maximum(tf.sqrt(x_variance), tf.rsqrt(num_pixels)) return x def batch_norm(inputs, hparams, training): """Performs a batch normalization using a standard set of parameters.""" # We set fused=True for a significant performance boost. See # https://www.tensorflow.org/performance/performance_guide#common_fused_ops if hparams.data_format == "channels_first": axis = 1 else: axis = -1 return tf.layers.batch_normalization( inputs=inputs, axis=axis, momentum=0.997, epsilon=0.001, center=True, scale=True, training=training, fused=True) def louizos_complexity_cost(params): gates = { w.name.strip(":0"): w for w in tf.trainable_variables() if "gates" in w.name } names = list(gates.keys()) concat_gates = tf.concat([tf.reshape(gates[name], [-1]) for name in names], 0) if params.dropout_type == "louizos_weight": complexity_cost = tf.nn.sigmoid( concat_gates - params.louizos_beta * tf. 
log(-1 * params.louizos_gamma / params.louizos_zeta)) elif params.dropout_type == "louizos_unit": reshaped_gates = [ tf.reshape(gates[name], [-1, gates[name].shape[-1]]) for name in names ] parameters = [] for name in names: g_name = name[:-len("louizos/gates")] + "DW" g = tf.contrib.framework.get_unique_variable(g_name) parameters.extend( [reduce(operator.mul, g.shape.as_list()[:-1], 1)] * g.shape.as_list()[-1]) group_sizes = tf.constant(parameters) assert group_sizes.shape[0] == concat_gates.shape[0], "{} != {}".format( group_sizes.shape[0], concat_gates.shape[0]) complexity_cost = tf.cast(group_sizes, tf.float32) * tf.nn.sigmoid( concat_gates - params.louizos_beta * tf. log(-1 * params.louizos_gamma / params.louizos_zeta)) return tf.reduce_sum(complexity_cost) def switch_loss(): losses = 0 for v in tf.trainable_variables(): if "switch" in v.name: losses += tf.reduce_sum(tf.abs(v)) tf.summary.scalar("switch_loss", losses) return losses def nonzero_count(): nonzeroes = 0 for op in tf.get_default_graph().get_operations(): if "post_dropout" in op.name: v = tf.get_default_graph().get_tensor_by_name(op.name + ":0") count = tf.to_float(tf.equal(v, 0.)) count = tf.reduce_sum(1 - count) nonzeroes += count return nonzeroes def percent_sparsity(): nonzeroes = 0 total = 0 for op in tf.get_default_graph().get_operations(): if "post_dropout" in op.name: v = tf.get_default_graph().get_tensor_by_name(op.name + ":0") count = tf.to_float(tf.equal(v, 0.)) count = tf.reduce_sum(1 - count) nonzeroes += count total += tf.size(v) return tf.to_float(nonzeroes) / tf.to_float(total) def convert(num, base, length=None): ''' Converter from decimal to numeral systems from base 2 to base 10 ''' num = int(num) base = int(base) result = [] if num == 0: result.append(0) else: while (num > 0): result.append(num % base) num //= base # Reverse from LSB to MSB result = result[::-1] if length is not None: n_to_fill = length - len(result) if n_to_fill > 0: result = [0] * n_to_fill + result return result def equal_mult(size, num_branches): return [ tf.constant(1.0 / num_branches, shape=[size, 1, 1, 1], dtype=tf.float32) for _ in range(num_branches) ] def uniform(size, num_branches): return [ tf.random_uniform([size, 1, 1, 1], minval=0, maxval=1, dtype=tf.float32) for _ in range(num_branches) ] def bernoulli(size, num_branches): random = tf.random_uniform([size], maxval=num_branches, dtype=tf.int32) bernoulli = tf.one_hot(random, depth=num_branches) rand = tf.split(bernoulli, [1] * num_branches, 1) rand = [tf.reshape(x, [-1, 1, 1, 1]) for x in rand] return rand def combine(rand_uniform, rand_bernoulli, num_branches): return [ tf.concat([rand_uniform[i], rand_bernoulli[i]], axis=0) for i in range(num_branches) ] def model_top(labels, preds, cost, lr, mode, hparams): tf.summary.scalar("acc", tf.reduce_mean( tf.to_float( tf.equal(labels, tf.argmax( preds, axis=-1, output_type=tf.int32))))) tf.summary.scalar("loss", cost) gs = tf.train.get_global_step() if hparams.weight_decay_and_noise: cost = weight_decay_and_noise(cost, hparams, lr) cost = tf.identity(cost, name="total_loss") optimizer = get_optimizer(lr, hparams) train_op = tf.contrib.layers.optimize_loss( name="training", loss=cost, global_step=gs, learning_rate=lr, clip_gradients=hparams.clip_grad_norm or None, gradient_noise_scale=hparams.grad_noise_scale or None, optimizer=optimizer, colocate_gradients_with_ops=True) if hparams.use_tpu: def metric_fn(l, p): return { "acc": tf.metrics.accuracy( labels=l, predictions=tf.argmax(p, -1, output_type=tf.int32)), } host_call = 
None if hparams.tpu_summarize: host_call = tpu.create_host_call(hparams.output_dir) tpu.remove_summaries() if mode == tf.estimator.ModeKeys.EVAL: return tpu_estimator.TPUEstimatorSpec( mode=mode, predictions=preds, loss=cost, eval_metrics=(metric_fn, [labels, preds]), host_call=host_call) return tpu_estimator.TPUEstimatorSpec( mode=mode, loss=cost, train_op=train_op, host_call=host_call) return tf.estimator.EstimatorSpec( mode, eval_metric_ops={ "acc": tf.metrics.accuracy( labels=labels, predictions=tf.argmax(preds, axis=-1, output_type=tf.int32)), }, loss=cost, train_op=train_op) ================================================ FILE: models/utils/optimizers.py ================================================ import tensorflow as tf _OPTIMIZER = dict() def register(name): def add_to_dict(fn): global _OPTIMIZER _OPTIMIZER[name] = fn return fn return add_to_dict def get_optimizer(lr, params): optimizer = _OPTIMIZER[params.optimizer](lr, params) if params.use_tpu: optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) return optimizer @register("sgd") def sgd(lr, params): return tf.train.GradientDescentOptimizer(lr) @register("adam") def adam(lr, params): return tf.train.AdamOptimizer(lr, beta1=params.beta1, beta2=params.beta2) @register("adagrad") def adagrad(lr, params): return tf.train.AdagradOptimizer(lr) @register("momentum") def momentum(lr, params): return tf.train.MomentumOptimizer( lr, momentum=params.momentum, use_nesterov=params.use_nesterov) ================================================ FILE: models/vgg/__init__.py ================================================ __all__ = ["vgg"] ================================================ FILE: models/vgg/vgg.py ================================================ import tensorflow as tf from ..utils.activations import get_activation from ..utils.dropouts import get_dropout from ..utils.initializations import get_init from ..utils.optimizers import get_optimizer from ..registry import register from ..utils import model_utils from ..utils import dropouts from ...training import tpu import six import numpy as np from tensorflow.contrib.tpu.python.tpu import tpu_estimator, tpu_optimizer def metric_fn(labels, predictions): return { "acc": tf.metrics.accuracy( labels=tf.argmax(labels, -1), predictions=tf.argmax(predictions, -1)), } @register("vgg") def get_vgg(hparams, lr): """Callable model function compatible with Experiment API.""" def vgg(features, labels, mode, params): if hparams.use_tpu and 'batch_size' in params.keys(): hparams.batch_size = params['batch_size'] is_training = mode == tf.estimator.ModeKeys.TRAIN inputs = features["inputs"] with tf.variable_scope("vgg", initializer=get_init(hparams)): total_nonzero = 0 conv1_1 = model_utils.conv( inputs, 3, 64, hparams, name="conv1_1", is_training=is_training) conv1_1 = model_utils.batch_norm(conv1_1, hparams, is_training) conv1_1 = tf.nn.relu(conv1_1) conv1_2 = model_utils.conv( conv1_1, 3, 64, hparams, name="conv1_2", is_training=is_training) conv1_2 = model_utils.batch_norm(conv1_2, hparams, is_training) conv1_2 = tf.nn.relu(conv1_2) pool1 = tf.layers.max_pooling2d( conv1_2, 2, 2, padding="SAME", name='pool1') conv2_1 = model_utils.conv( pool1, 3, 128, hparams, name="conv2_1", is_training=is_training) conv2_1 = model_utils.batch_norm(conv2_1, hparams, is_training) conv2_1 = tf.nn.relu(conv2_1) conv2_2 = model_utils.conv( conv2_1, 3, 128, hparams, name="conv2_2", is_training=is_training) conv2_2 = model_utils.batch_norm(conv2_2, hparams, is_training) conv2_2 = tf.nn.relu(conv2_2) pool2 
= tf.layers.max_pooling2d( conv2_2, 2, 2, padding="SAME", name='pool2') conv3_1 = model_utils.conv( pool2, 3, 256, hparams, name="conv3_1", is_training=is_training) conv3_1 = model_utils.batch_norm(conv3_1, hparams, is_training) conv3_1 = tf.nn.relu(conv3_1) conv3_2 = model_utils.conv( conv3_1, 3, 256, hparams, name="conv3_2", is_training=is_training) conv3_2 = model_utils.batch_norm(conv3_2, hparams, is_training) conv3_2 = tf.nn.relu(conv3_2) conv3_3 = model_utils.conv( conv3_2, 3, 256, hparams, name="conv3_3", is_training=is_training) conv3_3 = model_utils.batch_norm(conv3_3, hparams, is_training) conv3_3 = tf.nn.relu(conv3_3) pool3 = tf.layers.max_pooling2d( conv3_3, 2, 2, padding="SAME", name='pool3') conv4_1 = model_utils.conv( pool3, 3, 512, hparams, name="conv4_1", is_training=is_training) conv4_1 = model_utils.batch_norm(conv4_1, hparams, is_training) conv4_1 = tf.nn.relu(conv4_1) conv4_2 = model_utils.conv( conv4_1, 3, 512, hparams, name="conv4_2", is_training=is_training) conv4_2 = model_utils.batch_norm(conv4_2, hparams, is_training) conv4_2 = tf.nn.relu(conv4_2) conv4_3 = model_utils.conv( conv4_2, 3, 512, hparams, name="conv4_3", is_training=is_training) conv4_3 = model_utils.batch_norm(conv4_3, hparams, is_training) conv4_3 = tf.nn.relu(conv4_3) pool4 = tf.layers.max_pooling2d( conv4_3, 2, 2, padding="SAME", name='pool4') conv5_1 = model_utils.conv( pool4, 3, 512, hparams, name="conv5_1", is_training=is_training) conv5_1 = model_utils.batch_norm(conv5_1, hparams, is_training) conv5_1 = tf.nn.relu(conv5_1) conv5_2 = model_utils.conv( conv5_1, 3, 512, hparams, name="conv5_2", is_training=is_training) conv5_2 = model_utils.batch_norm(conv5_2, hparams, is_training) conv5_2 = tf.nn.relu(conv5_2) conv5_3 = model_utils.conv( conv5_2, 3, 512, hparams, name="conv5_3", is_training=is_training) conv5_3 = model_utils.batch_norm(conv5_3, hparams, is_training) conv5_3 = tf.nn.relu(conv5_3) pool5 = tf.layers.max_pooling2d( conv5_3, 2, 2, padding="SAME", name='pool5') flat_x = tf.reshape(pool5, [hparams.batch_size, 512]) fc6 = model_utils.batch_norm( model_utils.dense(flat_x, 4096, hparams, is_training), hparams, is_training) fc7 = model_utils.batch_norm( model_utils.dense(fc6, 4096, hparams, is_training), hparams, is_training) logits = tf.layers.dense(fc7, hparams.num_classes, name="logits") probs = tf.nn.softmax(logits, axis=-1) if mode in [model_utils.ModeKeys.PREDICT, model_utils.ModeKeys.ATTACK]: return tf.estimator.EstimatorSpec( mode=mode, predictions={ 'classes': tf.argmax(probs, axis=1), 'logits': logits, 'probabilities': probs, }) xent = tf.losses.sparse_softmax_cross_entropy( labels=labels, logits=logits) cost = tf.reduce_mean(xent, name='xent') cost += model_utils.weight_decay(hparams) tf.summary.scalar("total_nonzero", model_utils.nonzero_count()) tf.summary.scalar("percent_sparsity", model_utils.percent_sparsity()) if hparams.dropout_type is not None: if "louizos" in hparams.dropout_type: cost += hparams.louizos_cost * model_utils.louizos_complexity_cost( hparams) / 50000 if "variational" in hparams.dropout_type: # prior DKL part of the ELBO graph = tf.get_default_graph() node_defs = [ n for n in graph.as_graph_def().node if 'log_alpha' in n.name ] log_alphas = [ graph.get_tensor_by_name(n.name + ":0") for n in node_defs ] print([ n.name for n in graph.as_graph_def().node if 'log_alpha' in n.name ]) print("found %i logalphas" % len(log_alphas)) divergences = [dropouts.dkl_qp(la) for la in log_alphas] # combine to form the ELBO N = float(50000) dkl = 
tf.reduce_sum(tf.stack(divergences)) warmup_steps = 50000 dkl = (1. / N) * dkl * tf.minimum( 1.0, tf.to_float(tf.train.get_global_step()) / warmup_steps) * hparams.var_scale cost += dkl tf.summary.scalar("dkl", dkl) if hparams.ard_cost > 0.0: cost += model_utils.ard_cost() * hparams.ard_cost if hparams.smallify > 0.0: cost += model_utils.switch_loss() * hparams.smallify return model_utils.model_top(labels, probs, cost, lr, mode, hparams) return vgg ================================================ FILE: requirements.txt ================================================ tensorflow>=1.9 requests>=2.19.1 dl-cloud>=0.0.4 ================================================ FILE: scripts/__init__.py ================================================ ================================================ FILE: scripts/prune/README.md ================================================ # Library for Pruning ================================================ FILE: scripts/prune/__init__.py ================================================ ================================================ FILE: scripts/prune/eval.py ================================================ import tensorflow as tf import os import numpy as np from ...hparams.registry import get_hparams from ...models.registry import get_model from ...data.registry import get_input_fns from ...training import flags from .prune import get_prune_fn, get_current_weights, get_louizos_masks, get_smallify_masks, prune_weights, is_prunable_weight def init_flags(): tf.flags.DEFINE_string("model", None, "Which model to use.") tf.flags.DEFINE_string("data", None, "Which data to use.") tf.flags.DEFINE_string("env", None, "Which environment to use.") tf.flags.DEFINE_string("hparams", None, "Which hparams to use.") tf.flags.DEFINE_string("hparam_override", "", "Run-specific hparam settings to use.") tf.flags.DEFINE_string("output_dir", None, "The output directory.") tf.flags.DEFINE_string("data_dir", None, "The data directory.") tf.flags.DEFINE_integer("train_steps", 10000, "Number of training steps to perform.") tf.flags.DEFINE_integer("eval_every", 1000, "Number of steps between evaluations.") tf.flags.DEFINE_string( "post_weights_dir", "", "folder of the weights, if not set defaults to output_dir") tf.flags.DEFINE_string("prune_percent", "0.5", "percent of weights to prune, comma separated") tf.flags.DEFINE_string("prune", "weight", "one_shot or fisher") tf.flags.DEFINE_boolean("variational", False, "use evaluate") tf.flags.DEFINE_string("eval_file", "eval_prune_results", "file to put results") tf.flags.DEFINE_integer("train_epochs", None, "Number of training epochs to perform.") tf.flags.DEFINE_integer("eval_steps", None, "Number of evaluation steps to perform.") def eval_model(FLAGS, hparam_name): hparams = get_hparams(hparam_name) hparams = hparams.parse(FLAGS.hparam_override) hparams = flags.update_hparams(FLAGS, hparams) model_fn = get_model(hparams) _, _, test_input_fn = get_input_fns(hparams, generate=False) features, labels = test_input_fn() sess = tf.Session() tf.train.create_global_step() model_fn(features, labels, tf.estimator.ModeKeys.TRAIN) saver = tf.train.Saver() ckpt_dir = tf.train.latest_checkpoint(hparams.output_dir) print("Loading model from...", ckpt_dir) saver.restore(sess, ckpt_dir) evals = [] prune_percents = [float(i) for i in FLAGS.prune_percent.split(",")] mode = "standard" orig_weights = get_current_weights(sess) louizos_masks, smallify_masks = None, None if "louizos" in hparam_name: louizos_masks = get_louizos_masks(sess, orig_weights) mode = 
"louizos" elif "smallify" in hparam_name: smallify_masks = get_smallify_masks(sess, orig_weights) elif "variational" in hparam_name: mode = "variational" for prune_percent in prune_percents: if prune_percent > 0.0: prune_fn = get_prune_fn(FLAGS.prune)(mode, k=prune_percent) w_copy = dict(orig_weights) sm_copy = dict(smallify_masks) if smallify_masks is not None else None lm_copy = dict(louizos_masks) if louizos_masks is not None else None post_weights_pruned, weight_counts = prune_weights( prune_fn, w_copy, louizos_masks=lm_copy, smallify_masks=sm_copy, hparams=hparams) print("current weight counts at {}: {}".format(prune_percent, weight_counts)) print("there are ", len(tf.trainable_variables()), " weights") for v in tf.trainable_variables(): if is_prunable_weight(v): assign_op = v.assign( np.reshape(post_weights_pruned[v.name.strip(":0")], v.shape)) sess.run(assign_op) saver.save(sess, os.path.join(hparams.output_dir, "tmp", "model")) estimator = tf.estimator.Estimator( model_fn=tf.contrib.estimator.replicate_model_fn(model_fn), model_dir=os.path.join(hparams.output_dir, "tmp")) print( f"Processing pruning {prune_percent} of weights for {hparams.eval_steps} steps" ) acc = estimator.evaluate(test_input_fn, hparams.eval_steps)['acc'] print(f"Accuracy @ prune {100*prune_percent}% is {acc}") evals.append(acc) return evals def _run(FLAGS): eval_file = open(FLAGS.eval_file, "w") hparams_list = FLAGS.hparams.split(",") total_evals = {} for hparam_name in hparams_list: evals = eval_model(FLAGS, hparam_name) print(hparam_name, ":", evals) eval_file.writelines("{}:{}\n".format(hparam_name, evals)) total_evals[hparam_name] = evals tf.reset_default_graph() print("processed results:", total_evals) eval_file.close() if __name__ == "__main__": init_flags() FLAGS = tf.app.flags.FLAGS _run(FLAGS) ================================================ FILE: scripts/prune/prune.py ================================================ import numpy as np import tensorflow as tf import statistics from ...models.utils import model_utils _PRUNE_FN = dict() def register(fn): global _PRUNE_FN _PRUNE_FN[fn.__name__] = fn return fn def get_prune_fn(name): return _PRUNE_FN[name] @register def weight(mode, k=0.5): if mode == "standard": def prune(weight_dict, weight_key): weights = weight_dict[weight_key] w = weights.copy() if len(weights.shape) == 4: w = w.reshape([-1, weights.shape[-1]]) abs_w = np.abs(w) idx = int(k * abs_w.shape[0]) med = np.sort(abs_w, axis=0)[idx:idx + 1] mask = (abs_w >= med).astype(float) pruned_w = mask * w return pruned_w, mask elif mode == "variational": def prune(weight_dict, weight_key): weights = weight_dict[weight_key] if k == 0.0: return weights, None log_alpha = weight_dict[weight_key.strip("DW") + "variational/log_alpha"] w = weights.copy() la = log_alpha.copy() if len(weights.shape) == 4: w = w.reshape([-1, weights.shape[-1]]) la = la.reshape([-1, weights.shape[-1]]) idx = int((1 - k) * la.shape[0]) med = np.sort(la, axis=0)[idx:idx + 1] mask = (la < med).astype(float) pruned_w = mask * w return pruned_w, mask elif mode == "louizos": def prune(weight_dict, weight_key): weights = weight_dict[weight_key] w = weights.copy() if len(weights.shape) == 4: w = w.reshape([-1, weights.shape[-1]]) idx = int(k * w.shape[0]) med = np.sort(w, axis=0)[idx:idx + 1] mask = (w >= med).astype(float) pruned_w = mask * w return pruned_w, mask return prune @register def unit(mode, k=0.5): if mode == "standard" or mode == "variational": def prune(weight_dict, weight_key): weights = weight_dict[weight_key] w = 
weights.copy() if len(weights.shape) == 4: w = w.reshape([-1, weights.shape[-1]]) norm = np.linalg.norm(w, axis=0) idx = int(k * norm.shape[0]) med = np.sort(norm, axis=0)[idx] mask = (norm >= med).astype(float) pruned_w = mask * w return pruned_w, mask elif mode == "louizos": def prune(weight_dict, weight_key): weights = weight_dict[weight_key] w = weights.copy() assert len(weights.shape) == 1 idx = int(k * w.shape[0]) med = np.sort(w, axis=0)[idx] mask = (w >= med).astype(float) pruned_w = mask * w return pruned_w, mask return prune @register def ard(k=0.5): def prune(weight_dict, weight_key): weights = weight_dict[weight_key] w = weights.copy() if len(weights.shape) == 4: w = w.reshape([-1, weights.shape[-1]]) norm = np.linalg.norm(w, axis=1, keepdims=True) idx = int(k * norm.shape[0]) med = np.sort(norm, axis=0)[idx] mask = (norm >= med).astype(float) pruned_w = mask * w return pruned_w, mask return prune def prune_weights(prune_fn, weights, louizos_masks=None, smallify_masks=None, hparams=None): weights_pruned = {} pre_prune_nonzero = 0 pre_prune_total = 0 if louizos_masks: orig_weights = dict(weights) for weight_name in weights: if weight_name not in louizos_masks.keys(): print("WARN louizos: mask not found for {}".format(weight_name)) continue weights[weight_name] = louizos_masks[weight_name] elif smallify_masks: orig_weights = dict(weights) for weight_name in weights: if weight_name not in smallify_masks.keys(): print("WARN smallify: not pruning {}".format(weight_name)) continue mask = smallify_masks[weight_name] weights[weight_name] = weights[weight_name] * mask for weight_name in weights: if "variational" in weight_name: print("WARN variational: not pruning {}".format(weight_name)) continue pre_prune_nonzero += np.count_nonzero(weights[weight_name]) pre_prune_total += weights[weight_name].size weights_pruned[weight_name], mask = prune_fn(weights, weight_name) if louizos_masks or smallify_masks: print("applied masks to", weight_name) weights_pruned[weight_name] = mask * orig_weights[weight_name].reshape( [-1, orig_weights[weight_name].shape[-1]]) return weights_pruned, { "pre_prune_nonzero": pre_prune_nonzero, "pre_prune_total": pre_prune_total } def get_louizos_masks(sess, weights): masks = {} for weight_name in weights: m_name = weight_name.strip("DW") + "louizos/gates" m = tf.contrib.framework.get_variables_by_name(m_name) assert len(m) == 1 m = m[0] masks[weight_name] = sess.run(m) return masks def get_smallify_masks(sess, weights): masks = {} for weight_name in weights: switch_name = weight_name.strip("DW") + "smallify/switch" mask_name = weight_name.strip("DW") + "smallify/mask" switch = tf.contrib.framework.get_variables_by_name(switch_name) mask = tf.contrib.framework.get_variables_by_name(mask_name) assert len(switch) == 1 and len(mask) == 1 switch, mask = switch[0], mask[0] switch, mask = sess.run((switch, mask)) masks[weight_name] = switch * mask return masks def is_prunable_weight(weight): necessary_tokens = ["kernel", "DW", "variational"] blacklisted_tokens = ["logit", "fc", "init", "switch", "mask", "log_sigma"] contains_a_necessary_token = any(t in weight.name for t in necessary_tokens) contains_a_blacklisted_token = any( t in weight.name for t in blacklisted_tokens) is_prunable = contains_a_necessary_token and not contains_a_blacklisted_token if not is_prunable: print("WARN: not pruning %s" % weight.name) return is_prunable def get_current_weights(sess): weights = {} variables = {} for v in tf.trainable_variables(): if is_prunable_weight(v): name = 
v.name.strip(":0") variables[name] = v graph = tf.get_default_graph() node_defs = [n for n in graph.as_graph_def().node if 'log_alpha' in n.name] for n in node_defs: weights[n.name] = sess.run(graph.get_tensor_by_name(n.name + ":0")) for weight_name, w in variables.items(): weights[weight_name] = sess.run(w) return weights def prune_sess_weights(sess, prune_percent, FLAGS, hparams): current_weights = get_current_weights(sess) prune_fn = get_prune_fn(FLAGS.prune)(k=prune_percent) current_weights_pruned = prune_weights(prune_fn, current_weights, None, hparams) print("there are ", len(tf.trainable_variables()), " weights") for v in tf.trainable_variables(): if is_prunable_weight(v): assign_op = v.assign( np.reshape(current_weights_pruned[v.name.strip(":0")], v.shape)) sess.run(assign_op) ================================================ FILE: train.py ================================================ import cloud import os import sys import subprocess import random import tensorflow as tf import numpy as np import time import logging from .hparams.registry import get_hparams from .models.registry import get_model from .data.registry import get_input_fns from .training.lr_schemes import get_lr from .training.envs import get_env from .training import flags from tensorflow.contrib.tpu.python.tpu import tpu_config from tensorflow.contrib.tpu.python.tpu import tpu_estimator def init_flags(): tf.flags.DEFINE_string("env", None, "Which environment to use.") # required tf.flags.DEFINE_string("hparams", None, "Which hparams to use.") # required # Utility flags tf.flags.DEFINE_string("hparam_override", "", "Run-specific hparam settings to use.") tf.flags.DEFINE_boolean("fresh", False, "Remove output_dir before running.") tf.flags.DEFINE_integer("seed", None, "Random seed.") tf.flags.DEFINE_integer("train_epochs", None, "Number of training epochs to perform.") tf.flags.DEFINE_integer("eval_steps", None, "Number of evaluation steps to perform.") # TPU flags tf.flags.DEFINE_string("tpu_name", "", "Name of TPU(s)") tf.flags.DEFINE_integer( "tpu_iterations_per_loop", 1000, "The number of training steps to run on TPU before" "returning control to CPU.") tf.flags.DEFINE_integer( "tpu_shards", 8, "The number of TPU shards in the system " "(a single Cloud TPU has 8 shards.") tf.flags.DEFINE_boolean( "tpu_summarize", False, "Save summaries for TensorBoard. 
" "Warning: this will slow down execution.") tf.flags.DEFINE_boolean("tpu_dedicated", False, "Do not use preemptible TPUs.") tf.flags.DEFINE_string("data_dir", None, "The data directory.") tf.flags.DEFINE_string("output_dir", None, "The output directory.") tf.flags.DEFINE_integer("eval_every", 1000, "Number of steps between evaluations.") tf.logging.set_verbosity(tf.logging.INFO) FLAGS = None def init_random_seeds(): tf.set_random_seed(FLAGS.seed) random.seed(FLAGS.seed) np.random.seed(FLAGS.seed) def init_model(hparams_name): flags.validate_flags(FLAGS) tf.reset_default_graph() hparams = get_hparams(hparams_name) hparams = hparams.parse(FLAGS.hparam_override) hparams = flags.update_hparams(FLAGS, hparams, hparams_name) # set larger eval_every for TPUs to improve utilization if FLAGS.env == "tpu": FLAGS.eval_every = max(FLAGS.eval_every, 5000) hparams.tpu_summarize = FLAGS.tpu_summarize tf.logging.warn("\n-----------------------------------------\n" "BEGINNING RUN:\n" "\t hparams: %s\n" "\t output_dir: %s\n" "\t data_dir: %s\n" "-----------------------------------------\n" % (hparams_name, hparams.output_dir, hparams.data_dir)) return hparams def construct_estimator(model_fn, hparams, tpu=None): if hparams.use_tpu: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( tpu=tpu.name) master = tpu_cluster_resolver.get_master() config = tpu_config.RunConfig( master=master, evaluation_master=master, model_dir=hparams.output_dir, session_config=tf.ConfigProto( allow_soft_placement=True, log_device_placement=True), tpu_config=tpu_config.TPUConfig( iterations_per_loop=FLAGS.tpu_iterations_per_loop, num_shards=FLAGS.tpu_shards), save_checkpoints_steps=FLAGS.eval_every) estimator = tpu_estimator.TPUEstimator( use_tpu=hparams.use_tpu, model_fn=model_fn, model_dir=hparams.output_dir, config=config, train_batch_size=hparams.batch_size, eval_batch_size=hparams.batch_size) else: gpu_config = tf.ConfigProto(allow_soft_placement=True) gpu_config.gpu_options.allow_growth = True run_config = tf.estimator.RunConfig( save_checkpoints_steps=FLAGS.eval_every, session_config=gpu_config) estimator = tf.estimator.Estimator( model_fn=tf.contrib.estimator.replicate_model_fn(model_fn), model_dir=hparams.output_dir, config=run_config) return estimator def _run(hparams_name): """Run training, evaluation and inference.""" hparams = init_model(hparams_name) original_batch_size = hparams.batch_size if tf.gfile.Exists(hparams.output_dir) and FLAGS.fresh: tf.gfile.DeleteRecursively(hparams.output_dir) if not tf.gfile.Exists(hparams.output_dir): tf.gfile.MakeDirs(hparams.output_dir) model_fn = get_model(hparams) train_input_fn, eval_input_fn, test_input_fn = get_input_fns(hparams) tpu = None if hparams.use_tpu: cloud.instance.tpu.clean() tpu = cloud.instance.tpu.get(preemptible=not FLAGS.tpu_dedicated) estimator = construct_estimator(model_fn, hparams, tpu) if not hparams.use_tpu: features, labels = train_input_fn() sess = tf.Session() tf.train.get_or_create_global_step() model_fn(features, labels, tf.estimator.ModeKeys.TRAIN) sess.run(tf.global_variables_initializer()) # output metadata about the run with tf.gfile.GFile(os.path.join(hparams.output_dir, 'hparams.txt'), 'w') as hparams_file: hparams_file.write("{}\n".format(time.time())) hparams_file.write("{}\n".format(str(hparams))) def loop(steps=FLAGS.eval_every): estimator.train(train_input_fn, steps=steps) if eval_input_fn: estimator.evaluate(eval_input_fn, steps=hparams.eval_steps, name="eval") if test_input_fn: estimator.evaluate(test_input_fn, 
================================================
FILE: training/__init__.py
================================================
__all__ = ["lr_schemes", "tpu", "flags"]

from .lr_schemes import *
from .tpu import *
from .flags import *


================================================
FILE: training/envs.py
================================================
_ENVS = dict()


def register(cls):
  global _ENVS
  _ENVS[cls.__name__.lower()] = cls()
  return cls


def get_env(name):
  return _ENVS[name]


@register
class GCP(object):
  data_dir = "/path/to/your/data"
  output_dir = "/path/to/your/output"


@register
class TPU(object):
  data_dir = "/path/to/your/data"
  output_dir = "/path/to/your/output"


@register
class Local(object):
  data_dir = "/tmp/data"
  output_dir = "/tmp/runs"


================================================
FILE: training/flags.py
================================================
import getpass
import os
import subprocess

import tensorflow as tf

from .envs import get_env


def validate_flags(FLAGS):
  messages = []
  if not FLAGS.env:
    messages.append("Missing required flag --env")
  if not FLAGS.hparams:
    messages.append("Missing required flag --hparams")
  if len(messages) > 0:
    raise Exception("\n".join(messages))
  return FLAGS


def update_hparams(FLAGS, hparams, hparams_name):
  hparams.env = FLAGS.env
  hparams.use_tpu = hparams.env == "tpu"
  hparams.train_epochs = FLAGS.train_epochs or hparams.train_epochs
  hparams.eval_steps = FLAGS.eval_steps or hparams.eval_steps

  env = get_env(FLAGS.env)
  hparams.data_dir = os.path.join(FLAGS.data_dir or env.data_dir, hparams.data)
  hparams.output_dir = os.path.join(env.output_dir, FLAGS.hparams)
  return hparams
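The environment registry keys each class by its lowercased class name, so adding a new run target is just a decorated class. A minimal sketch; the `Scratch` class and its paths are hypothetical, and the `TD.training.envs` import path assumes the repo is importable as `TD`:

```python
from TD.training.envs import register, get_env


@register
class Scratch(object):  # hypothetical environment; looked up as "scratch"
  data_dir = "/scratch/data"
  output_dir = "/scratch/runs"


assert get_env("scratch").data_dir == "/scratch/data"
```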
================================================
FILE: training/lr_schemes.py
================================================
import tensorflow as tf

_LR = dict()


def register(name):

  def add_to_dict(fn):
    global _LR
    _LR[name] = fn
    return fn

  return add_to_dict


def get_lr(params):
  gs = tf.train.get_global_step()
  return _LR[params.lr_scheme](gs, params)


@register("constant")
def constant(gs, params):
  return tf.constant(params.learning_rate)


@register("exp")
def exponential_decay(gs, params, delay=0):
  gs -= delay
  return tf.train.exponential_decay(
      params.learning_rate,
      gs,
      params.learning_rate_decay_interval,
      params.learning_rate_decay_rate,
      staircase=params.staircased)


@register("lin")
def linear_decay(gs, params, delay=0):
  gs -= delay
  return (params.learning_rate -
          (tf.to_float(gs) / (params.train_steps - delay)) *
          params.learning_rate)


@register("delay_exp")
def delayed_exponential_decay(gs, params):
  d = params.delay
  return tf.cond(
      tf.greater(gs, d), lambda: exponential_decay(gs - d, params, delay=d),
      lambda: params.learning_rate)


@register("delay_lin")
def delayed_linear_decay(gs, params):
  d = params.delay
  return tf.cond(
      tf.greater(gs, d), lambda: linear_decay(gs - d, params, delay=d),
      lambda: params.learning_rate)


@register("warmup_resnet")
def warmup_resnet(gs, params):
  warmup_steps = params.warmup_steps
  inv_base = tf.exp(tf.log(0.01) / warmup_steps)
  inv_decay = inv_base**(warmup_steps - tf.to_float(gs))
  epoch = params.epoch_size // params.batch_size
  boundaries = [epoch * 30, epoch * 60, epoch * 80, epoch * 90]
  values = [1e0, 1e-1, 1e-2, 1e-3, 1e-4]
  lr = tf.train.piecewise_constant(
      gs - warmup_steps, boundaries=boundaries, values=values)
  return tf.cond(
      tf.greater(gs, warmup_steps), lambda: lr,
      lambda: inv_decay * params.learning_rate)


@register("resnet")
def resnet(gs, params):
  return tf.cond(
      tf.less(gs, 40000), lambda: params.learning_rate,
      lambda: tf.cond(
          tf.less(gs, 60000), lambda: params.learning_rate * 0.1,
          lambda: tf.cond(
              tf.less(gs, 80000), lambda: params.learning_rate * 0.01,
              lambda: params.learning_rate * 0.001)))


@register("lenet")
def lenet(gs, _):
  return tf.cond(
      tf.less(gs, 80000), lambda: 0.05,
      lambda: tf.cond(tf.less(gs, 120000), lambda: 0.005, lambda: 0.0005))


@register("steps")
def stepped_lr(gs, params):
  lr = params.lr_values[-1]
  for step, value in reversed(list(zip(params.lr_steps, params.lr_values))):
    lr = tf.cond(tf.greater(gs, step), lambda: lr, lambda: value)
  return lr


@register("warmup_linear_decay")
def warmup_linear_decay(gs, params):
  d = params.delay
  warmup_steps = params.warmup_steps
  inv_base = tf.exp(tf.log(0.01) / warmup_steps)
  inv_decay = inv_base**(warmup_steps - tf.to_float(gs))
  return tf.cond(
      tf.greater(gs, warmup_steps), lambda: linear_decay(gs, params, delay=d),
      lambda: inv_decay * params.learning_rate)


@register("warmup_constant")
def warmup_constant(gs, params):
  warmup_steps = params.warmup_steps
  inv_base = tf.exp(tf.log(0.01) / warmup_steps)
  inv_decay = inv_base**(warmup_steps - tf.to_float(gs))
  return tf.cond(
      tf.greater(gs, warmup_steps), lambda: constant(gs, params),
      lambda: inv_decay * params.learning_rate)


@register("warmup_exponential_decay")
def warmup_exponential_decay(gs, params):
  d = params.delay
  warmup_steps = params.warmup_steps
  inv_base = tf.exp(tf.log(0.01) / warmup_steps)
  inv_decay = inv_base**(warmup_steps - tf.to_float(gs))
  return tf.cond(
      tf.greater(gs, warmup_steps),
      lambda: exponential_decay(gs, params, delay=d),
      lambda: inv_decay * params.learning_rate)


@register("warmup_cosine")
def warmup_cosine(gs, params):
  from numpy import pi
  warmup_steps = params.warmup_steps
  inv_base = tf.exp(tf.log(0.01) / warmup_steps)
  inv_decay = inv_base**(warmup_steps - tf.to_float(gs))
  gs = tf.minimum(gs - warmup_steps, params.learning_rate_cosine_cycle_steps)
  cosine_decay = 0.5 * (1 + tf.cos(
      pi * tf.to_float(gs) / params.learning_rate_cosine_cycle_steps))
  decayed = (1 - params.cosine_alpha) * cosine_decay + params.cosine_alpha
  lr = params.learning_rate * decayed
  return tf.cond(
      tf.greater(gs, warmup_steps), lambda: lr,
      lambda: inv_decay * params.learning_rate)


@register("cosine")
def cosine_annealing(gs, params):
  from numpy import pi
  gs = tf.minimum(gs, params.learning_rate_cosine_cycle_steps)
  cosine_decay = 0.5 * (1 + tf.cos(
      pi * tf.to_float(gs) / params.learning_rate_cosine_cycle_steps))
  decayed = (1 - params.cosine_alpha) * cosine_decay + params.cosine_alpha
  decayed_learning_rate = params.learning_rate * decayed
  return decayed_learning_rate
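Since every scheme is a `(global_step, params) -> tensor` function, a registered schedule can be probed directly in a session. A minimal sketch, reaching into the private `_LR` table for brevity and using a namedtuple as a stand-in for the hparams object:

```python
from collections import namedtuple

import tensorflow as tf

Params = namedtuple("Params", ["learning_rate"])  # only field "resnet" reads
params = Params(learning_rate=0.1)

gs = tf.placeholder(tf.int64, [])
lr = _LR["resnet"](gs, params)

with tf.Session() as sess:
  for step in [1000, 50000, 70000, 90000]:
    print(step, sess.run(lr, feed_dict={gs: step}))
# Expected: 0.1, 0.01, 0.001, 0.0001 as each step boundary is crossed.
```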
================================================
FILE: training/tpu.py
================================================
import collections

import six
import tensorflow as tf


def remove_summaries():
  g = tf.get_default_graph()
  key = tf.GraphKeys.SUMMARIES
  del g.get_collection_ref(key)[:]
  assert not g.get_collection(key)


# From Tensor2Tensor
def create_host_call(model_dir):
  """Construct a host_call writing scalar summaries.

  Args:
    model_dir: String containing path to train

  Returns:
    (fn, args) Pair to be called by TPUEstimator as the host_call.
  """
  graph = tf.get_default_graph()
  summaries = graph.get_collection(tf.GraphKeys.SUMMARIES)
  gs_t = tf.reshape(tf.to_int32(tf.train.get_global_step()), [1])
  summary_kwargs = collections.OrderedDict()
  for t in summaries:
    if t.op.type not in ["ScalarSummary", "HistogramSummary"]:
      tf.logging.warn("Ignoring unsupported tf.Summary type %s" % t.op.type)
      continue

    name = t.op.name
    tensor = t.op.inputs[1]
    if t.op.type == "ScalarSummary":
      assert tensor.shape.is_compatible_with([])
      if tensor.dtype == tf.int64:
        tensor = tf.to_int32(tensor)
      summary_kwargs["ScalarSummary" + name] = tf.reshape(tensor, [1])
    elif t.op.type == "HistogramSummary":
      summary_kwargs["HistogramSummary" + name] = tf.reshape(tensor, [-1])

  # When no supported summaries are found, don't create host_call. Otherwise,
  # TPU outfeed queue would enqueue global_step while host_call doesn't dequeue
  # it, eventually causing hang.
  if not summary_kwargs:
    return None
  summary_kwargs["global_step"] = gs_t

  def host_call_fn(**kwargs):
    """Training host call. Creates summaries for training metrics.

    Args:
      **kwargs: Dict of {str: Tensor}, with `Tensor` of shape `[batch]`.
        Must contain key "global_step" with value of current global_step
        Tensor.

    Returns:
      List of summary ops to run on the CPU host.
    """
    gs = tf.to_int64(kwargs.pop("global_step")[0])
    with tf.contrib.summary.create_file_writer(model_dir).as_default():
      with tf.contrib.summary.always_record_summaries():
        # We need to use tf.contrib.summary in order to feed the `step`.
        for name, value in sorted(six.iteritems(kwargs)):
          if name.startswith("ScalarSummary"):
            name = name[len("ScalarSummary"):]
            tf.contrib.summary.scalar(
                name, tf.reduce_mean(tf.to_float(value)), step=gs)
          elif name.startswith("HistogramSummary"):
            name = name[len("HistogramSummary"):]
            tf.contrib.summary.histogram(name, value, step=gs)
          elif name.startswith("ImageSummary"):
            name = name[len("ImageSummary"):]
            tf.contrib.summary.image(name, value, step=gs)

    return tf.contrib.summary.all_summary_ops()

  return (host_call_fn, summary_kwargs)
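These two helpers are meant to be used together when building a `TPUEstimatorSpec`: summary ops cannot run on the TPU, so they are captured into a host_call and then stripped from the graph. A sketch of how a model_fn might wire them up; the `finalize_tpu_spec` helper and its arguments are hypothetical, not part of this repo:

```python
from tensorflow.contrib.tpu.python.tpu import tpu_estimator


def finalize_tpu_spec(mode, loss, train_op, model_dir, tpu_summarize):
  """Hypothetical model_fn tail: route summaries through a host_call."""
  host_call = create_host_call(model_dir) if tpu_summarize else None
  # Drop summary ops from the TPU graph; create_host_call has already
  # captured their input tensors for replay on the CPU host.
  remove_summaries()
  return tpu_estimator.TPUEstimatorSpec(
      mode=mode, loss=loss, train_op=train_op, host_call=host_call)
```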