[
  {
    "path": ".gitignore",
    "content": "*.pyc\n*.swp\n"
  },
  {
    "path": "README.md",
    "content": "# Generative Moment Matching Networks (GMMNs)\nThis is the code we used for the following paper:\n* Yujia Li, Kevin Swersky, Richard Zemel.  *Generative moment matching networks*.  In International Conference on Machine Learning (ICML), 2015.\n\nIf you use this code in your research you should cite the above paper.\n\n### Dependencies\nTo use the code you need to install some dependencies first:\n* Standard Python packages like **numpy, scipy, matplotlib**.  matplotlib is only needed for visualization.  You may also need sklearn for some features.\n* [**gnumpy**](http://www.cs.toronto.edu/~tijmen/gnumpy.html).  If you have an NVIDIA GPU, gnumpy can speed up your computation significantly.  To use GPUs you need to install [**cudamat**](https://github.com/cudamat/cudamat) first.  If you don't have a GPU you can use [**npmat**](http://www.cs.toronto.edu/~ilya/npmat.py) as a replacement for cudamat; all computations will then be done on the CPU.\n* The authors' lightweight neural network and optimization packages [**pynn**](https://github.com/yujiali/pynn) and [**pyopt**](https://github.com/yujiali/pyopt).\n\nOnce you get all dependencies ready, try to run `python test.py`.  If you are running this with npmat then all tests should pass.  If you are running this on a GPU with cudamat then some tests will fail - this is expected because of the low numeric precision supported by cudamat (`float32` everywhere), but all tests should run and finish properly.\n\n### Prepare data\nPrepare the MNIST and TFD data, then go into the `dataio` directory and change the paths to the datasets in `mnist.py` and `tfd.py`.\n\n### Train the models\nUse `python train.py -m <mode>` to train the corresponding model.  
`<mode>` can be `mnistinput`, `mnistcode`, `tfdinput`, `tfdcode`, corresponding to the input space model and autoencoder code space model for the two datasets.\n\n##### Other resources\nThere is a TensorFlow implementation of GMMN provided by Siddharth Agrawal: https://github.com/siddharth-agrawal/Generative-Moment-Matching-Networks\n"
  },
  {
    "path": "core/__init__.py",
    "content": ""
  },
  {
    "path": "core/generative.py",
    "content": "\"\"\"\nGenerative model using MMD objective.\n\nYujia Li, 09/2014\n\"\"\"\n\nimport pynn.nn as nn\nimport pynn.loss as ls\nimport pynn.learner as learner\nimport gnumpy as gnp\nimport numpy as np\nimport math\nimport util\nimport time\nimport scipy.optimize as spopt\n\nclass UnsupervisedMmdLoss(ls.Loss):\n    \"\"\"\n    MMD loss for unsupervised learning.\n\n    This loss measures the discrepancy between a distribution given by a \n    neural net model with a data distribution.\n    \"\"\"\n    def __init__(self, **kwargs):\n        super(UnsupervisedMmdLoss, self).__init__(**kwargs)\n        self.sigma = kwargs.get('sigma', 1)\n\n    def load_target(self, target, **kwargs):\n        \"\"\"\n        target is the target data distribution, n_cases * n_dims matrix.\n        \"\"\"\n        if isinstance(target, gnp.garray):\n            self.target = target\n        else:\n            self.target = gnp.garray(target)\n\n        self.n_target = target.shape[0]\n\n    def _make_s_mat(self, n_pred, n_target):\n        \"\"\"\n        Create the S matrix that will be used in loss computation.\n        \"\"\"\n        s = gnp.zeros((n_pred + n_target, 2))\n        s[:n_pred, 0] = 1.0 / n_pred\n        s[n_pred:, 1] = 1.0 / n_target\n        s -= 1.0 / (n_pred + n_target)\n        return s\n\n    def compute_not_weighted_loss_and_grad(self, pred, compute_grad=False):\n        if not isinstance(pred, gnp.garray):\n            pred = gnp.garray(pred)\n\n        n_pred = pred.shape[0]\n        W = self._make_s_mat(n_pred, self.n_target)\n        X = gnp.concatenate((pred, self.target), axis=0)\n\n        XX = X.dot(X.T)\n        if XX.shape[0] > 4000:  # this special case is due to a weird bug in gnumpy\n            x = gnp.garray(np.diag(XX.asarray()))\n        else:\n            x = XX.diag()\n\n        K = gnp.exp(-1.0 / (2 * self.sigma) * (-2 * XX + x + x[:,gnp.newaxis]))\n        A = W.dot(W.T) * K\n\n        loss = A.sum()\n        a = A.sum(axis=1)\n  
      grad = 2.0 / self.sigma * (A.dot(X) - X * a[:,gnp.newaxis])\n\n        return loss, grad[:n_pred,:]\n\n    def get_name(self):\n        return 'mmdgen'\n\n    def get_id(self):\n        return 201\n\n    def __repr__(self):\n        return 'Loss <%s> w=%g, sigma=%g' % (\n                self.get_name(), self.weight, self.sigma)\n\nls.register_loss(UnsupervisedMmdLoss())\n\nclass UnsupervisedMmdLossMultiScale(ls.Loss):\n    \"\"\"\n    Multi-scale MMD loss for unsupervised learning.\n\n    This loss measures the discrepancy between a distribution given by a \n    neural net model with a data distribution.\n    \"\"\"\n    def __init__(self, sigma=[1.0], scale_weight=None, **kwargs):\n        super(UnsupervisedMmdLossMultiScale, self).__init__(**kwargs)\n        self.sigma = [float(s) for s in sigma]\n        self.n_scales = len(sigma)\n\n        if scale_weight is None:\n            self.scale_weight = [1.0] * self.n_scales\n        else:\n            assert(len(scale_weight) == len(sigma))\n            self.scale_weight = [float(w) for w in scale_weight]\n\n    def load_target(self, target, **kwargs):\n        \"\"\"\n        target is the target data distribution, n_cases * n_dims matrix.\n        \"\"\"\n        if isinstance(target, gnp.garray):\n            self.target = target\n        else:\n            self.target = gnp.garray(target)\n\n        self.n_target = target.shape[0]\n\n    def _make_s_mat(self, n_pred, n_target):\n        \"\"\"\n        Create the S matrix that will be used in loss computation.\n        \"\"\"\n        s = gnp.zeros((n_pred + n_target, 2))\n        s[:n_pred, 0] = 1.0 / n_pred\n        s[n_pred:, 1] = 1.0 / n_target\n        s -= 1.0 / (n_pred + n_target)\n        return s\n\n    def compute_not_weighted_loss_and_grad(self, pred, compute_grad=False):\n        if not isinstance(pred, gnp.garray):\n            pred = gnp.garray(pred)\n\n        n_pred = pred.shape[0]\n        W = self._make_s_mat(n_pred, self.n_target)\n      
  X = gnp.concatenate((pred, self.target), axis=0)\n\n        XX = X.dot(X.T)\n        if XX.shape[0] > 4000:  # this special case is due to a weird bug in gnumpy\n            x = gnp.garray(np.diag(XX.asarray()))\n        else:\n            x = XX.diag()\n\n        prod_mat = XX - 0.5 * x - 0.5 * x[:,gnp.newaxis]\n        ww = W.dot(W.T)\n\n        loss = 0\n        grad = None\n        for i in range(self.n_scales):\n            K = gnp.exp(1.0 / self.sigma[i] * prod_mat)\n            A = self.scale_weight[i] * ww * K\n            loss += A.sum()\n            a = A.sum(axis=1)\n            if grad is None:\n                grad = 2.0 / self.sigma[i] * (A.dot(X) - X * a[:,gnp.newaxis])\n            else:\n                grad += 2.0 / self.sigma[i] * (A.dot(X) - X * a[:,gnp.newaxis])\n\n        return loss, grad[:n_pred,:]\n\n    def get_name(self):\n        return 'mmdgen_multiscale'\n\n    def get_id(self):\n        return 202\n\n    def __repr__(self):\n        return 'Loss <%s> w=%g, sigma=%s, scale_weight=%s' % (\n                self.get_name(), self.weight, str(self.sigma), str(self.scale_weight))\n\nls.register_loss(UnsupervisedMmdLossMultiScale())\n\nclass LinearTimeUnsupervisedMmdLoss(ls.Loss):\n    \"\"\"\n    MMD loss for unsupervised learning.\n\n    This loss measures the discrepancy between a distribution given by a \n    neural net model with a data distribution.\n\n    This is the linear time estimator proposed by Gretton et al.\n    \"\"\"\n    def __init__(self, **kwargs):\n        super(LinearTimeUnsupervisedMmdLoss, self).__init__(**kwargs)\n        self.use_modified_loss = kwargs.get('use_modified_loss', False)\n        self.use_absolute_value = kwargs.get('use_absolute_value', True)\n        self.sigma = kwargs.get('sigma', 1)\n\n    def load_target(self, target, **kwargs):\n        \"\"\"\n        target is the target data distribution, n_cases * n_dims matrix.\n        \"\"\"\n        if isinstance(target, gnp.garray):\n            
self.target = target\n        else:\n            self.target = gnp.garray(target)\n\n        self.n_target = target.shape[0]\n\n    def compute_not_weighted_loss_and_grad(self, pred, compute_grad=False):\n        if not isinstance(pred, gnp.garray):\n            pred = gnp.garray(pred)\n\n        n_pred = pred.shape[0]\n        assert n_pred == self.n_target\n        assert n_pred % 2 == 0\n\n        n_half = n_pred / 2\n\n        X = pred[:n_half]\n        X_N = pred[n_half:]\n        Y = self.target[:n_half]\n        Y_N = self.target[n_half:]\n\n        diff_x_xn = X - X_N\n        diff_x_yn = X - Y_N\n        diff_xn_y = X_N - Y\n        diff_y_yn = Y - Y_N\n\n\n        factor = -0.5 / self.sigma\n\n        k_x_xn = gnp.exp(factor * (diff_x_xn**2).sum(axis=1))\n        k_y_yn = gnp.exp(factor * (diff_y_yn**2).sum(axis=1))\n        k_x_yn = gnp.exp(factor * (diff_x_yn**2).sum(axis=1))\n        k_xn_y = gnp.exp(factor * (diff_xn_y**2).sum(axis=1))\n\n        loss = 1.0 / n_pred * (k_x_xn.sum() + k_y_yn.sum() - k_x_yn.sum() - k_xn_y.sum())\n        grad_x = 1.0 / (n_pred * self.sigma) * (k_x_yn[:,gnp.newaxis] * diff_x_yn - k_x_xn[:,gnp.newaxis] * diff_x_xn)\n        grad_xn = 1.0 / (n_pred * self.sigma) * (k_xn_y[:,gnp.newaxis] * diff_xn_y + k_x_xn[:,gnp.newaxis] * diff_x_xn)\n\n        if self.use_modified_loss:\n            diff_x_y = X - Y\n            diff_xn_yn = X_N - Y_N\n            k_x_y = gnp.exp(factor * (diff_x_y**2).sum(axis=1))\n            k_xn_yn = gnp.exp(factor * (diff_xn_yn**2).sum(axis=1))\n\n            loss += 1.0 / n_pred * (k_x_xn.sum() + k_y_yn.sum() - k_x_y.sum() - k_xn_yn.sum())\n            grad_x += 1.0 / (n_pred * self.sigma) * (k_x_y[:,gnp.newaxis] * diff_x_y - k_x_xn[:,gnp.newaxis] * diff_x_xn)\n            grad_xn += 1.0 / (n_pred * self.sigma) * (k_xn_yn[:,gnp.newaxis] * diff_xn_yn + k_x_xn[:,gnp.newaxis] * diff_x_xn)\n\n        grad = gnp.concatenate([grad_x, grad_xn], axis=0)\n\n        if self.use_absolute_value and loss < 0:\n 
           loss = -loss\n            grad = -grad\n\n        return loss, grad\n\n    def get_name(self):\n        return 'linear_time_mmdgen'\n\n    def get_id(self):\n        return 203\n\n    def __repr__(self):\n        return 'Loss <%s> w=%g, sigma=%g' % (\n                self.get_name(), self.weight, self.sigma)\n\nls.register_loss(LinearTimeUnsupervisedMmdLoss())\n\nclass LinearTimeMinibatchUnsupervisedMmdLoss(ls.Loss):\n    \"\"\"\n    MMD loss for unsupervised learning.\n\n    This loss measures the discrepancy between a distribution given by a \n    neural net model with a data distribution.\n\n    This is a version where the full MMD is only computed on minibatches,\n    therefore the time complexity for a set of N pairs of data points and\n    minibatch size M is O(N/M * M^2) = O(NM)\n    \"\"\"\n    def __init__(self, **kwargs):\n        super(LinearTimeMinibatchUnsupervisedMmdLoss, self).__init__(**kwargs)\n        self.sigma = kwargs.get('sigma', 1)\n        self.minibatch_size = kwargs.get('minibatch_size', 100)\n\n    def load_target(self, target, **kwargs):\n        \"\"\"\n        target is the target data distribution, n_cases * n_dims matrix.\n        \"\"\"\n        if isinstance(target, gnp.garray):\n            self.target = target\n        else:\n            self.target = gnp.garray(target)\n\n        self.n_target = target.shape[0]\n\n    def _make_s_mat(self, n_pred, n_target):\n        \"\"\"\n        Create the S matrix that will be used in loss computation.\n        \"\"\"\n        s = gnp.zeros((n_pred + n_target, 2))\n        s[:n_pred, 0] = 1.0 / n_pred\n        s[n_pred:, 1] = 1.0 / n_target\n        s -= 1.0 / (n_pred + n_target)\n        return s\n\n    def compute_not_weighted_loss_and_grad(self, pred, compute_grad=False):\n        if not isinstance(pred, gnp.garray):\n            pred = gnp.garray(pred)\n\n        n_pred = pred.shape[0]\n        assert n_pred == self.n_target\n\n        W_Full = self._make_s_mat(n_pred, 
self.n_target)\n\n        loss = 0\n        grad = []\n\n        n_batches = (n_pred + self.minibatch_size - 1) / self.minibatch_size\n        for i_batch in range(n_batches):\n            i_start = i_batch * self.minibatch_size\n            if i_batch < n_batches - 1:\n                i_end = i_start + self.minibatch_size\n            else:\n                i_end = n_pred\n\n            X = gnp.concatenate((pred[i_start:i_end], self.target[i_start:i_end]), axis=0)\n            W = self._make_s_mat(i_end - i_start, i_end - i_start)\n            \n            XX = X.dot(X.T)\n            if XX.shape[0] > 4000:  # this special case is due to a weird bug in gnumpy\n                x = gnp.garray(np.diag(XX.asarray()))\n            else:\n                x = XX.diag()\n\n            K = gnp.exp(-1.0 / (2 * self.sigma) * (-2 * XX + x + x[:,gnp.newaxis]))\n            A = W.dot(W.T) * K\n\n            loss += A.sum()\n            a = A.sum(axis=1)\n            grad.append((2.0 / self.sigma * (A.dot(X) - X * a[:,gnp.newaxis]))[:(i_end - i_start)])\n\n        return loss / n_batches, gnp.concatenate(grad, axis=0) / n_batches\n\n    def get_name(self):\n        return 'linear_time_minibatch_mmdgen'\n\n    def get_id(self):\n        return 204\n\n    def __repr__(self):\n        return 'Loss <%s> w=%g, sigma=%g, minibatch_size=%d' % (\n                self.get_name(), self.weight, self.sigma, self.minibatch_size)\n\nls.register_loss(LinearTimeMinibatchUnsupervisedMmdLoss())\n\nclass RandomFeatureMmdLoss(ls.Loss):\n    \"\"\"\n    MMD loss for unsupervised learning.\n\n    This loss measures the discrepancy between a distribution given by a \n    neural net model with a data distribution.\n\n    This is a version where the kernel k(x,y) is estimated by product of random\n    features.\n    \"\"\"\n    def __init__(self, sigma=[1.0], scale_weight=None, n_features=1024, **kwargs):\n        super(RandomFeatureMmdLoss, self).__init__(**kwargs)\n        self.original_sigma = 
sigma\n        self.sigma = [np.sqrt(float(s)) for s in sigma]\n        self.n_scales = len(sigma)\n\n        if scale_weight is None:\n            self.scale_weight = [1.0] * self.n_scales\n        else:\n            assert(len(scale_weight) == len(sigma))\n            self.scale_weight = [float(w) for w in scale_weight]\n\n        self.n_features = n_features\n\n    def _generate_random_matrix(self, n_features, n_dims, sigma):\n        \"\"\"\n        return a list of random matrices each of size n_features x n_dims\n        \"\"\"\n        w = []\n        for i in range(len(sigma)):\n            w.append(gnp.randn(n_features, n_dims) / sigma[i])\n        return w\n\n    def _generate_random_features(self, x, w):\n        return gnp.cos(x.dot(w.T)) / np.sqrt(self.n_features), \\\n                gnp.sin(x.dot(w.T)) / np.sqrt(self.n_features)\n\n    def load_target(self, target, **kwargs):\n        \"\"\"\n        target is the target data distribution, n_cases * n_dims matrix.\n        \"\"\"\n        # actually target does not need to be stored\n        if isinstance(target, gnp.garray):\n            self.target = target\n        else:\n            self.target = gnp.garray(target)\n\n        self.n_target = target.shape[0]\n        self.w = self._generate_random_matrix(self.n_features, target.shape[1], self.sigma)\n\n        self.v_target = []\n        for w in self.w:\n            t_c, t_s = self._generate_random_features(target, w)\n            self.v_target.append((t_c.mean(axis=0), t_s.mean(axis=0)))\n\n    def compute_not_weighted_loss_and_grad(self, pred, compute_grad=False):\n        if not isinstance(pred, gnp.garray):\n            pred = gnp.garray(pred)\n\n        loss = 0\n        grad = None\n        for i in range(self.n_scales):\n            w = self.w[i]\n            x_c, x_s = self._generate_random_features(pred, w)\n            d_c = x_c.mean(axis=0) - self.v_target[i][0]\n            d_s = x_s.mean(axis=0) - self.v_target[i][1]\n\n            
loss += ((d_c**2).sum() + (d_s**2).sum()) * self.scale_weight[i]\n            s_c = 2.0 / pred.shape[0] * d_c \n            s_s = 2.0 / pred.shape[0] * d_s\n\n            g = (-x_s * s_c + x_c * s_s).dot(w) * self.scale_weight[i]\n\n            if grad is None:\n                grad = g\n            else:\n                grad += g\n\n        return loss, grad\n\n    def get_name(self):\n        return 'random_feature_mmdgen'\n\n    def get_id(self):\n        return 205\n\n    def __repr__(self):\n        return 'Loss <%s> w=%g, nf=%d, sigma=%s, scale_weight=%s' % (\n                self.get_name(), self.weight, self.n_features, str(self.original_sigma),\n                str(self.scale_weight))\n\nls.register_loss(RandomFeatureMmdLoss())\n\nclass PairMmdLossMultiScale(ls.Loss):\n    \"\"\"\n    Multi-scale MMD loss for unsupervised learning.\n\n    This loss measures the discrepancy between a distribution given by a \n    neural net model with a data distribution.\n\n    This class considers only a pair of distributions, rather than a set of \n    multiple distributions.\n    \"\"\"\n    def __init__(self, sigma=[1.0], scale_weight=None, **kwargs):\n        super(PairMmdLossMultiScale, self).__init__(**kwargs)\n        self.sigma = [float(s) for s in sigma]\n        self.n_scales = len(sigma)\n\n        if scale_weight is None:\n            self.scale_weight = [1.0] * self.n_scales\n        else:\n            assert(len(scale_weight) == len(sigma))\n            self.scale_weight = [float(w) for w in scale_weight]\n\n    def load_target(self, target, **kwargs):\n        \"\"\"\n        target is the target data distribution, n_cases * n_dims matrix.\n        \"\"\"\n        if isinstance(target, gnp.garray):\n            self.target = target\n        else:\n            self.target = gnp.garray(target)\n\n        self.n_target = target.shape[0]\n\n    def _make_s_mat(self, n_pred, n_target):\n        \"\"\"\n        Create the S matrix that will be used in loss 
computation.\n        \"\"\"\n        s = gnp.zeros((n_pred + n_target, 1))\n        s[:n_pred] = 1\n        s = s / n_pred - (1 - s) / n_target\n        return s\n\n    def compute_not_weighted_loss_and_grad(self, pred, compute_grad=False):\n        if not isinstance(pred, gnp.garray):\n            pred = gnp.garray(pred)\n\n        n_pred = pred.shape[0]\n        W = self._make_s_mat(n_pred, self.n_target)\n        X = gnp.concatenate((pred, self.target), axis=0)\n\n        XX = X.dot(X.T)\n        if XX.shape[0] > 4000:  # this special case is due to a weird bug in gnumpy\n            x = gnp.garray(np.diag(XX.asarray()))\n        else:\n            x = XX.diag()\n\n        prod_mat = XX - 0.5 * x - 0.5 * x[:,gnp.newaxis]\n        ww = W.dot(W.T)\n\n        loss = 0\n        grad = None\n        for i in range(self.n_scales):\n            K = gnp.exp(1.0 / self.sigma[i] * prod_mat)\n            A = self.scale_weight[i] * ww * K\n            loss += A.sum()\n            a = A.sum(axis=1)\n            if grad is None:\n                grad = 2.0 / self.sigma[i] * (A.dot(X) - X * a[:,gnp.newaxis])\n            else:\n                grad += 2.0 / self.sigma[i] * (A.dot(X) - X * a[:,gnp.newaxis])\n\n        return loss, grad[:n_pred,:]\n\n    def get_name(self):\n        return 'mmdgen_multiscale_pair'\n\n    def get_id(self):\n        return 206\n\n    def __repr__(self):\n        return 'Loss <%s> w=%g, sigma=%s, scale_weight=%s' % (\n                self.get_name(), self.weight, str(self.sigma), str(self.scale_weight))\n\nls.register_loss(PairMmdLossMultiScale())\n\n############################################################\n# Some extensions to the loss\n############################################################\n\nclass DifferentiableKernelMmdLoss(ls.Loss):\n    \"\"\"\n    Base class for MMD loss with kernels that can be backpropagated through.\n    \"\"\"\n    def __init__(self, **kwargs):\n        super(DifferentiableKernelMmdLoss, 
self).__init__(**kwargs)\n\n    def load_target(self, target, **kwargs):\n        \"\"\"\n        target is the target data batch that we want our model to match.\n        \"\"\"\n        self.target = util.to_garray(target)\n        self.n_target = self.target.shape[0]\n\n    def _make_s_mat(self, n_pred, n_target):\n        \"\"\"\n        Make the S matrix. Here it is only a single vector as we have only two\n        domains.\n\n        The full set of data is always assumed to have the samples (pred) first\n        and then the real data (target).\n        \"\"\"\n        s = gnp.zeros((n_pred + n_target, 1))\n        s[:n_pred] = 1\n        s = s / n_pred - (1 - s) / n_target\n        return s\n\n        #s = gnp.zeros((n_pred + n_target, 2))\n        #s[:n_pred, 0] = 1.0 / n_pred\n        #s[n_pred:, 1] = 1.0 / n_target\n        #s -= 1.0 / (n_pred + n_target)\n        #return s\n\n    def compute_not_weighted_loss_and_grad(self, pred, compute_grad=False):\n        \"\"\"\n        Return loss and gradient\n        \"\"\"\n        raise NotImplementedError()\n\nclass MultiScaleDifferentiableKernelMmdLoss(DifferentiableKernelMmdLoss):\n    \"\"\"\n    Base class for MMD loss with kernels on multiple scales.\n    \"\"\"\n    def __init__(self, sigma=[1.0], scale_weight=None, **kwargs):\n        super(MultiScaleDifferentiableKernelMmdLoss, self).__init__(**kwargs)\n        if not isinstance(sigma, list):\n            sigma = [sigma]\n        self.sigma = [float(s) for s in sigma]\n        self.n_scales = len(sigma)\n\n        if scale_weight is None:\n            self.scale_weight = [1.0] * self.n_scales\n        else:\n            if not isinstance(scale_weight, list):\n                scale_weight = [scale_weight]\n            assert(len(scale_weight) == len(sigma))\n            self.scale_weight = [float(w) for w in scale_weight]\n\nclass GaussianKernelMmdLoss(MultiScaleDifferentiableKernelMmdLoss):\n    \"\"\"\n    k(x,y) = exp(-|x-y|^2 / (2 sigma))\n\n    
Multi-scale MMD loss with Gaussian kernels.  Essentially reimplementing \n    PairMmdLossMultiScale / UnsupervisedMmdLoss.\n    \"\"\"\n    def __init__(self, sigma=[1.0], scale_weight=None, **kwargs):\n        super(GaussianKernelMmdLoss, self).__init__(sigma=sigma, \n                scale_weight=scale_weight, **kwargs)\n\n    def compute_not_weighted_loss_and_grad(self, pred, compute_grad=False):\n        pred = util.to_garray(pred)\n        n_pred = pred.shape[0]\n\n        W = self._make_s_mat(n_pred, self.n_target)\n        X = gnp.concatenate((pred, self.target), axis=0)\n\n        XX = X.dot(X.T)\n        if XX.shape[0] > 4000:  # this special case is due to a weird bug in gnumpy\n            x = gnp.garray(np.diag(XX.asarray()))\n        else:\n            x = XX.diag()\n\n        prod_mat = XX - 0.5 * x - 0.5 * x[:,gnp.newaxis]\n        ww = W.dot(W.T)\n\n        loss = 0\n        grad = None\n        for i in range(self.n_scales):\n            K = gnp.exp(1.0 / self.sigma[i] * prod_mat)\n            A = self.scale_weight[i] * ww * K\n            loss += A.sum()\n            a = A.sum(axis=1)\n            if grad is None:\n                grad = 2.0 / self.sigma[i] * (A.dot(X) - X * a[:,gnp.newaxis])\n            else:\n                grad += 2.0 / self.sigma[i] * (A.dot(X) - X * a[:,gnp.newaxis])\n\n        return loss, grad[:n_pred,:]\n\n    def get_name(self):\n        return 'mmdgen_gaussian'\n\n    def get_id(self):\n        return 301\n\n    def __repr__(self):\n        return 'Loss <%s> w=%g, sigma=%s, scale_weight=%s' % (\n                self.get_name(), self.weight, str(self.sigma), str(self.scale_weight))\n\nls.register_loss(GaussianKernelMmdLoss())\n\nclass LaplacianKernelMmdLoss(MultiScaleDifferentiableKernelMmdLoss):\n    \"\"\"\n    k(x,y) = exp(-|x-y|_2/sigma)\n    \"\"\"\n    def __init__(self, sigma=[1.0], scale_weight=None, **kwargs):\n        super(LaplacianKernelMmdLoss, self).__init__(sigma=sigma, \n                
scale_weight=scale_weight, **kwargs)\n\n    def compute_not_weighted_loss_and_grad(self, pred, compute_grad=False):\n        pred = util.to_garray(pred)\n        n_pred = pred.shape[0]\n\n        W = self._make_s_mat(n_pred, self.n_target)\n        X = gnp.concatenate((pred, self.target), axis=0)\n\n        ww = W.dot(W.T)\n\n        XX = X.dot(X.T)\n        if XX.shape[0] > 4000:  # this special case is due to a weird bug in gnumpy\n            x = gnp.garray(np.diag(XX.asarray()))\n        else:\n            x = XX.diag()\n\n        idx = np.arange(X.shape[0])\n        zv = gnp.zeros(idx.size)\n\n        # handle numeric problems\n        _R = x + x[:,gnp.newaxis] - 2 * XX\n        _R_min = _R.min()\n        if _R_min < 1e-4:\n            _R = _R - _R_min + 1e-4\n            _R[idx,idx] = zv\n\n        R = gnp.sqrt(_R)\n\n        loss = 0\n        grad = None\n\n        for i in range(self.n_scales):\n            K = gnp.exp(-1.0 / self.sigma[i] * R)\n            L = self.scale_weight[i] * ww * K\n            loss += L.sum()\n            A = L / (R + gnp.eye(L.shape[0]))\n            A[idx,idx] = zv\n            a = A.sum(axis=1)\n            if grad is None:\n                grad = 2.0 / self.sigma[i] * (A.dot(X) - X * a[:,gnp.newaxis])\n            else:\n                grad += 2.0 / self.sigma[i] * (A.dot(X) - X * a[:,gnp.newaxis])\n        \n        return loss, grad[:n_pred,:]\n\n    def get_name(self):\n        return 'mmdgen_laplacian'\n\n    def get_id(self):\n        return 302\n\n    def __repr__(self):\n        return 'Loss <%s> w=%g, sigma=%s, scale_weight=%s' % (\n                self.get_name(), self.weight, str(self.sigma), str(self.scale_weight))\n\nls.register_loss(LaplacianKernelMmdLoss())\n\nclass LaplacianL1KernelMmdLoss(MultiScaleDifferentiableKernelMmdLoss):\n    \"\"\"\n    k(x,y) = exp(-|x-y|/sigma)\n    \"\"\"\n    def __init__(self, sigma=[1.0], scale_weight=None, **kwargs):\n        super(LaplacianL1KernelMmdLoss, 
self).__init__(sigma=sigma, \n                scale_weight=scale_weight, **kwargs)\n\n    def compute_not_weighted_loss_and_grad(self, pred, compute_grad=False):\n        pred = util.to_garray(pred)\n        n_pred = pred.shape[0]\n\n        W = self._make_s_mat(n_pred, self.n_target)\n        X = gnp.concatenate((pred, self.target), axis=0)\n\n        ww = W.dot(W.T)\n\n        loss = 0\n        grad = None\n        \n        for i in range(X.shape[0]):\n            v = X[i]\n            w = ww[i]\n            diff = X - v\n\n            a = diff.abs().sum(axis=1)\n            for i_scale in range(self.n_scales):\n                k = gnp.exp(-a / self.sigma[i_scale])\n                loss += self.scale_weight[i_scale] * (w * k).sum()\n\n                g = (self.scale_weight[i_scale] * w * k / self.sigma[i_scale])[:,gnp.newaxis] * ((diff < 0) - (diff > 0)) \n                g[i] = -g.sum(axis=0)\n                if grad is None:\n                    grad = g\n                else:\n                    grad += g\n\n        return loss, grad[:n_pred,:]\n\n    def get_name(self):\n        return 'mmdgen_laplacian_l1'\n\n    def get_id(self):\n        return 303\n\n    def __repr__(self):\n        return 'Loss <%s> w=%g, sigma=%s, scale_weight=%s' % (\n                self.get_name(), self.weight, str(self.sigma), str(self.scale_weight))\n\nls.register_loss(LaplacianL1KernelMmdLoss())\n\nclass SqrtGaussianKernelMmdLoss(GaussianKernelMmdLoss):\n    \"\"\"\n    k(x,y) = sqrt{exp(-|x-y|^2 / (2 sigma))}\n\n    Multi-scale MMD loss with Gaussian kernels.  
Essentially reimplementing \n    PairMmdLossMultiScale / UnsupervisedMmdLoss.\n    \"\"\"\n    def __init__(self, sigma=[1.0], scale_weight=None, **kwargs):\n        super(SqrtGaussianKernelMmdLoss, self).__init__(sigma=sigma, \n                scale_weight=scale_weight, **kwargs)\n\n    def compute_not_weighted_loss_and_grad(self, pred, compute_grad=False):\n        loss, grad = super(SqrtGaussianKernelMmdLoss, self).compute_not_weighted_loss_and_grad(pred, compute_grad=compute_grad)\n        sqrt_loss = math.sqrt(loss)\n        return sqrt_loss, grad / (2 * sqrt_loss + 1e-10)\n\n    def get_name(self):\n        return 'mmdgen_sqrt_gaussian'\n\n    def get_id(self):\n        return 304\n\n    def __repr__(self):\n        return 'Loss <%s> w=%g, sigma=%s, scale_weight=%s' % (\n                self.get_name(), self.weight, str(self.sigma), str(self.scale_weight))\n\nls.register_loss(SqrtGaussianKernelMmdLoss())\n\nclass CpuDifferentiableKernelMmdLoss(ls.Loss):\n    \"\"\"\n    Base class for MMD loss with kernels that can be backpropagated through.\n    \"\"\"\n    def __init__(self, **kwargs):\n        super(CpuDifferentiableKernelMmdLoss, self).__init__(**kwargs)\n\n    def load_target(self, target, **kwargs):\n        \"\"\"\n        target is the target data batch that we want our model to match.\n        \"\"\"\n        self.target = util.to_nparray(target)\n        self.n_target = self.target.shape[0]\n\n    def _make_s_mat(self, n_pred, n_target):\n        \"\"\"\n        Make the S matrix. 
Here it is only a single vector as we have only two\n        domains.\n\n        The full set of data is always assumed to have the samples (pred) first\n        and then the real data (target).\n        \"\"\"\n        s = np.zeros((n_pred + n_target, 1), dtype=np.float32)\n        s[:n_pred] = 1\n        s = s / n_pred - (1 - s) / n_target\n        return s\n\n    def compute_not_weighted_loss_and_grad(self, pred, compute_grad=False):\n        \"\"\"\n        Return loss and gradient\n        \"\"\"\n        raise NotImplementedError()\n\nclass CpuMultiScaleDifferentiableKernelMmdLoss(CpuDifferentiableKernelMmdLoss):\n    \"\"\"\n    Base class for MMD loss with kernels on multiple scales.\n    \"\"\"\n    def __init__(self, sigma=[1.0], scale_weight=None, **kwargs):\n        super(CpuMultiScaleDifferentiableKernelMmdLoss, self).__init__(**kwargs)\n        if not isinstance(sigma, list):\n            sigma = [sigma]\n        self.sigma = [float(s) for s in sigma]\n        self.n_scales = len(sigma)\n\n        if scale_weight is None:\n            self.scale_weight = [1.0] * self.n_scales\n        else:\n            if not isinstance(scale_weight, list):\n                scale_weight = [scale_weight]\n            assert(len(scale_weight) == len(sigma))\n            self.scale_weight = [float(w) for w in scale_weight]\n\nclass CpuGaussianKernelMmdLoss(CpuMultiScaleDifferentiableKernelMmdLoss):\n    \"\"\"\n    k(x,y) = exp(-|x-y|^2 / (2 sigma))\n\n    Multi-scale MMD loss with Gaussian kernels.  
Essentially reimplementing \n    PairMmdLossMultiScale / UnsupervisedMmdLoss.\n    \"\"\"\n    def __init__(self, sigma=[1.0], scale_weight=None, **kwargs):\n        super(CpuGaussianKernelMmdLoss, self).__init__(sigma=sigma, \n                scale_weight=scale_weight, **kwargs)\n\n    def compute_not_weighted_loss_and_grad(self, pred, compute_grad=False):\n        pred = util.to_nparray(pred)\n        n_pred = pred.shape[0]\n\n        W = self._make_s_mat(n_pred, self.n_target)\n        X = np.concatenate((pred, self.target), axis=0)\n\n        XX = X.dot(X.T)\n        x = np.diag(XX)\n\n        prod_mat = XX - 0.5 * x - 0.5 * x[:,np.newaxis]\n        ww = W.dot(W.T)\n\n        loss = 0\n        grad = None\n\n        K = self.scale_weight[0] * np.exp(1.0 / self.sigma[0] * prod_mat)\n        scaled_K = K / self.sigma[0]\n        for i in range(1, self.n_scales):\n            T = self.scale_weight[i] * np.exp(1.0 / self.sigma[i] * prod_mat)\n            K += T\n            scaled_K += T / self.sigma[i]\n\n        loss = (ww * K).sum()\n        A = ww * scaled_K\n        a = A.sum(axis=1)\n\n        grad = 2.0 * (A[:n_pred,:].dot(X) - X[:n_pred,:] * a[:n_pred,np.newaxis])\n\n        return loss, util.to_garray(grad)\n\n    def get_name(self):\n        return 'cpu_mmdgen_gaussian'\n\n    def get_id(self):\n        return 305\n\n    def __repr__(self):\n        return 'Loss <%s> w=%g, sigma=%s, scale_weight=%s' % (\n                self.get_name(), self.weight, str(self.sigma), str(self.scale_weight))\n\nls.register_loss(CpuGaussianKernelMmdLoss())\n\nclass CpuSqrtGaussianKernelMmdLoss(CpuGaussianKernelMmdLoss):\n    \"\"\"\n    k(x,y) = sqrt{exp(-|x-y|^2 / (2 sigma))}\n\n    Multi-scale MMD loss with Gaussian kernels.  
Essentially reimplementing \n    PairMmdLossMultiScale / UnsupervisedMmdLoss.\n    \"\"\"\n    def __init__(self, sigma=[1.0], scale_weight=None, **kwargs):\n        super(CpuSqrtGaussianKernelMmdLoss, self).__init__(sigma=sigma, \n                scale_weight=scale_weight, **kwargs)\n\n    def compute_not_weighted_loss_and_grad(self, pred, compute_grad=False):\n        loss, grad = super(CpuSqrtGaussianKernelMmdLoss, self).compute_not_weighted_loss_and_grad(pred, compute_grad=compute_grad)\n        sqrt_loss = math.sqrt(loss)\n        return sqrt_loss, grad / (2 * sqrt_loss + 1e-10)\n\n    def get_name(self):\n        return 'cpu_mmdgen_sqrt_gaussian'\n\n    def get_id(self):\n        return 306\n\n    def __repr__(self):\n        return 'Loss <%s> w=%g, sigma=%s, scale_weight=%s' % (\n                self.get_name(), self.weight, str(self.sigma), str(self.scale_weight))\n\nls.register_loss(CpuSqrtGaussianKernelMmdLoss())\n\nclass CpuPerExampleSqrtGaussianKernelMmdLoss(ls.Loss):\n    \"\"\"\n    Each batch contains multiple examples, MMD is applied on a per example basis.\n    \"\"\"\n    def __init__(self, sigma=[1.0], scale_weight=None, pred_per_example=1, **kwargs):\n        super(CpuPerExampleSqrtGaussianKernelMmdLoss, self).__init__(**kwargs)\n        self.mmd_loss = CpuSqrtGaussianKernelMmdLoss(sigma=sigma, scale_weight=scale_weight, **kwargs)\n        self.pred_per_example = pred_per_example\n\n    def load_target(self, target, **kwargs):\n        \"\"\"\n        target is the target data batch that we want our model to match.\n\n        target is a list of target matrices, each correspond to the targets for\n        one prediction/one group of predictions.\n        \"\"\"\n        self.target = [util.to_nparray(t) if len(t.shape) > 1 else util.to_nparray(t)[np.newaxis,:] for t in target]\n\n    def compute_not_weighted_loss_and_grad(self, pred, compute_grad=False):\n        \"\"\"\n        Return loss and gradient\n        \"\"\"\n        pred = 
util.to_nparray(pred)\n\n        loss = 0\n        grad = gnp.zeros(pred.shape)\n\n        assert pred.shape[0] % self.pred_per_example == 0\n        n_groups = pred.shape[0] / self.pred_per_example\n        assert n_groups == len(self.target)\n\n        for i_group in range(n_groups):\n            i_start = i_group * self.pred_per_example\n            i_end = i_start + self.pred_per_example\n\n            self.mmd_loss.load_target(self.target[i_group])\n            t_loss, grad[i_start:i_end] = self.mmd_loss.compute_not_weighted_loss_and_grad(pred[i_start:i_end], compute_grad=True)\n            loss += t_loss\n\n        return loss / n_groups, grad / n_groups\n\n    def get_name(self):\n        return 'cpu_per_example_mmdgen_sqrt_gaussian'\n\n    def get_id(self):\n        return 307\n\n    def __repr__(self):\n        return 'Loss <%s> w=%g, sigma=%s, scale_weight=%s, pred_per_example=%s' % (\n                self.get_name(), self.mmd_loss.weight, str(self.mmd_loss.sigma), str(self.mmd_loss.scale_weight), str(self.pred_per_example))\n\nls.register_loss(CpuPerExampleSqrtGaussianKernelMmdLoss())\n\n############################################################\n# Learners, samplers, and others\n############################################################\n\nclass StochasticGenerativeNet(nn.NeuralNet):\n    \"\"\"\n    A generative feed-forward neural net with a layer of stochastic hidden \n    units at the top (or bottom depending on how you orient the network), and\n    a deterministic top-down mapping given by the neural net.\n\n    The hidden units are fixed to have a uniform distribution over the space\n    [-1,1]^out_dim.\n    \"\"\"\n    def __init__(self, in_dim=0, out_dim=0):\n        super(StochasticGenerativeNet, self).__init__(in_dim, out_dim)\n\n    def sample_hiddens(self, n_samples):\n        \"\"\"\n        Generate specified number of samples of hidden units.\n        \"\"\"\n        return 2 * gnp.rand(n_samples, self.in_dim) - 1\n        # return 
gnp.randn(n_samples, self.in_dim)\n\n    def generate_samples(self, z=None, n_samples=100, sample_batch_size=1000):\n        \"\"\"\n        Generate samples of visibles units. The provided z will be used for\n        propagating samples if given, otherwise new samples of z will be\n        generated using sample_hiddens.\n        \"\"\"\n        if z is not None:\n            return self.forward_prop(z, add_noise=False, compute_loss=False)\n\n        samples = gnp.zeros((n_samples, self.out_dim))\n        n_batches = (n_samples + sample_batch_size - 1) / sample_batch_size\n\n        for i_batch in range(n_batches):\n            i_start = i_batch * sample_batch_size\n            i_end = (i_start + sample_batch_size) if i_batch + 1 < n_batches else n_samples\n            n_samples_in_batch = i_end - i_start\n\n            z = self.sample_hiddens(n_samples_in_batch)\n            samples[i_start:i_end] = self.forward_prop(z, add_noise=False, compute_loss=False)\n\n        # return self.forward_prop(z, add_noise=False, compute_loss=False)\n        return samples\n\nclass StochasticGenerativeNetWithAutoencoder(StochasticGenerativeNet):\n    \"\"\"\n    A StochasticGenerativeNet together with an autoencoder. 
The stochastic \n    generative network is used in the code layer of the autoencoder.\n    \"\"\"\n    def __init__(self, in_dim=0, out_dim=0, autoencoder=None):\n        super(StochasticGenerativeNetWithAutoencoder, self).__init__(in_dim, out_dim)\n        self.autoencoder = autoencoder\n\n    def _generate_code_samples(self, z=None, n_samples=100, sample_batch_size=1000):\n        return super(StochasticGenerativeNetWithAutoencoder, self).generate_samples(\n                z=z, n_samples=n_samples, sample_batch_size=sample_batch_size)\n\n    def generate_samples(self, z=None, n_samples=100, sample_batch_size=1000):\n        return self.autoencoder.decoder.forward_prop(self._generate_code_samples(\n                z=z, n_samples=n_samples, sample_batch_size=sample_batch_size))\n\n    def load_target(self, target, *args, **kwargs):\n        \"\"\"\n        Need to first transform target into the code space using encoder.\n        \"\"\"\n        super(StochasticGenerativeNetWithAutoencoder, self).load_target(\n                self.autoencoder.encode(target), *args, **kwargs)\n\nclass StochasticGenerativeNetWithAutoencoderContainer(object):\n    \"\"\"\n    A container used to combine a net with an autoencoder after training - for\n    generating samples.\n    \"\"\"\n    def __init__(self, net, autoencoder):\n        self.net = net\n        self.autoencoder = autoencoder\n\n    def generate_samples(self, z=None, n_samples=100, sample_batch_size=1000):\n        return self.autoencoder.decoder.forward_prop(\n                self.net.generate_samples(z, n_samples, sample_batch_size))\n\nclass SampleFilter(object):\n    \"\"\"\n    Used to filter samples.\n    \"\"\"\n    def __init__(self):\n        pass\n\n    def filter(self, x):\n        \"\"\"\n        x: n x D is a matrix of examples\n\n        Return a matrix n' x D, with n' <= n, such that it contains all 'good'\n        samples.\n        \"\"\"\n        raise NotImplementedError()\n\nclass 
BlankSampleFilter(SampleFilter):\n    \"\"\"\n    Place holder for debugging, this class does nothing.\n    \"\"\"\n    def filter(self, x):\n        return x\n\nclass ClassifierSampleFilter(SampleFilter):\n    \"\"\"\n    Applies a classifier to judge whether a sample is good.\n    \"\"\"\n    def __init__(self, classifier, threshold, prev=None):\n        \"\"\"\n        The classifier makes probabilistic predictions and has a function \n        predict_proba that outputs a prediction matrix with elements between 0\n        and 1.  p[i][0] close to 1 indicates that a sample is good, close to 0\n        indicates a sample is bad. p[i][1] should always be 1-p[i][0].\n\n        prev: allows multiple filters to be chained together.\n        \"\"\"\n        self.classifier = classifier\n        self.threshold = threshold\n        self.prev = prev\n\n    def filter(self, x):\n        if self.prev is not None:\n            x = self.prev.filter(x)\n        is_garray = isinstance(x, gnp.garray)\n        if is_garray:\n            x = x.asarray()\n        p = self.classifier.predict_proba(x)\n        idx = np.arange(p.shape[0])[p[:,0] > self.threshold]\n        x = x[idx]\n        if is_garray:\n            x = gnp.garray(x)\n        return x\n\nclass ClassifierSampleStochasticFilter(SampleFilter):\n    \"\"\"\n    Same as above, but filter out samples probabilistically rather than\n    deterministically using a hard threshold.\n    \"\"\"\n    def __init__(self, classifier, prev=None):\n        \"\"\"\n        The classifier should support probabilistic outputs.\n        \"\"\"\n        self.classifier = classifier\n        self.prev = prev\n\n    def filter(self, x):\n        if self.prev is not None:\n            x = self.prev.filter(x)\n        is_garray = isinstance(x, gnp.garray)\n        if is_garray:\n            x = x.asarray()\n        p = self.classifier.predict_proba(x)\n\n        # TODO: implement probabilistic filtering\n        idx = 
np.arange(p.shape[0])[p[:,0] > np.random.rand(p.shape[0])]\n        x = x[idx]\n        if is_garray:\n            x = gnp.garray(x)\n\n        return x\n\n\nclass StochasticGenerativeNetWithFilter(object):\n    \"\"\"\n    This is a class used purely for generating samples, it is required to have\n    a method called generate_samples.\n\n    StochasticGenerativeNet can be used as a subclass of this one.\n    \"\"\"\n    def __init__(self, net, sample_filter):\n        \"\"\"\n        net can be StochasticGenerativeNet, or StochasticGenerativeNetWithFilter,\n        which allows multiple filtered nets to be chained together.\n        \"\"\"\n        self.net = net\n        self.sample_filter = sample_filter\n\n    def generate_samples(self, z=None, n_samples=100):\n        \"\"\"\n        Generate samples from the StochasticGenerativeNet and then filter out\n        bad samples using the sample filter.\n        \"\"\"\n        factor = 2\n        x = self.sample_filter.filter(self.net.generate_samples(z, n_samples * factor))[:n_samples]\n        gnp.free_reuse_cache()\n        is_garray = isinstance(x, gnp.garray)\n        while x.shape[0] < n_samples:\n            # factor *= 2   # this will explode in high threshold settings\n            y = self.sample_filter.filter(self.net.generate_samples(z, (n_samples - x.shape[0]) * factor))\n            if is_garray:\n                x = gnp.concatenate([x, y[:n_samples - x.shape[0]]], axis=0)\n            else:\n                x = np.r_[x, y[:n_samples - x.shape[0]]]\n            gnp.free_reuse_cache()\n\n        return x\n\nclass StochasticGenerativeNetLearner(learner.Learner):\n    \"\"\"\n    Used for learning the StochasticGenerativeNet model.\n    \"\"\"\n    def __init__(self, net):\n        super(StochasticGenerativeNetLearner, self).__init__(net)\n        self.n_samples_per_update = 100\n        self.n_sample_update_iters = 1\n        self.i_sample_update_iter = 0\n\n        self.set_output_dir('.')\n\n    def 
load_data(self, x_train):\n        self.x_train = util.to_garray(x_train)\n\n    def load_train_target(self):\n        self.net.load_target(self.x_train)\n\n    def sample_hiddens(self):\n        self.z = self.net.sample_hiddens(self.n_samples_per_update)\n\n    def f_and_fprime(self, w):\n        self.net.set_param_from_vec(w)\n        self.net.clear_gradient()\n        if self.i_sample_update_iter % self.n_sample_update_iters == 0:\n            self.sample_hiddens()\n        self.i_sample_update_iter = (self.i_sample_update_iter + 1) % self.n_sample_update_iters\n        self.net.forward_prop(self.z, add_noise=True, compute_loss=True)\n        loss = self.net.get_loss() / self.z.shape[0]\n        self.net.backward_prop()\n        grad = self.net.get_grad_vec() / self.z.shape[0]\n        return loss, grad\n\n    def create_minibatch_generator(self, minibatch_size):\n        self.minibatch_generator = learner.MiniBatchGenerator(\n                self.x_train, minibatch_size=minibatch_size, random_order=True)\n\n    def f_and_fprime_minibatch(self, w):\n        self.net.set_param_from_vec(w)\n        self.net.clear_gradient()\n\n        if self.i_sample_update_iter % self.n_sample_update_iters == 0:\n            if self.minibatch_load_target:\n                x = self.minibatch_generator.next()\n                self.net.load_target(x)\n            self.sample_hiddens()\n\n        self.i_sample_update_iter = (self.i_sample_update_iter + 1) % self.n_sample_update_iters\n\n        self.net.forward_prop(self.z, add_noise=True, compute_loss=True)\n        loss = self.net.get_loss() / self.z.shape[0]\n        self.net.backward_prop()\n        grad = self.net.get_grad_vec() / self.z.shape[0]\n\n        return loss, grad\n\n    def train_stochastic_lbfgs(self, **kwargs):\n        self._prepare_for_training()\n        if 'minibatch_size' in kwargs:\n            minibatch_size = kwargs['minibatch_size']\n            del kwargs['minibatch_size']\n        else:\n            
minibatch_size = 100\n\n        self.create_minibatch_generator(minibatch_size)\n        self._process_options(kwargs)\n        #self.print_options(kwargs)\n        self.best_w, self.best_obj, d = spopt.fmin_l_bfgs_b(self.f_and_fprime_minibatch, self.init_w, **kwargs)\n        self.best_grad = d['grad']\n        return self.f_post_training()\n\n    def f_info(self, w):\n        \"\"\"\n        train_loss = None\n\n        w_0 = self.net.get_param_vec()\n        self.net.set_noiseless_param_from_vec(w)\n\n        y = self.net.forward_prop(self.x_train, add_noise=False, compute_loss=True)\n        train_loss = self.net.get_loss() / self.x_train.shape[0]\n        train_acc = (self.t_train == y.argmax(axis=1)).mean()\n\n        if self.use_validation:\n            y = self.net.forward_prop(self.x_val, add_noise=False, compute_loss=False)\n            val_acc = (self.t_val == y.argmax(axis=1)).mean()\n\n            s = 'train loss %.4f, acc %.4f, val acc ' % (train_loss, train_acc)\n            if self.best_obj is None or val_acc > self.best_obj:\n                self.best_obj = val_acc \n                self.best_w = w.copy()\n                s += co.good_colored_str('%.4f' % val_acc)\n            else:\n                s += '%.4f' % val_acc\n        else:\n            s = 'train loss %.4f, acc ' % train_loss\n            if self.best_obj is None or train_acc < self.best_obj:\n                self.best_obj = train_acc\n                self.best_w = w.copy()\n                s += co.good_colored_str('%.4f' % train_acc)\n            else:\n                s += '%.4f' % train_acc\n\n        self.net.set_param_from_vec(w_0)\n        return s\n        \"\"\"\n        return '<place holder>'\n\n    def _process_options(self, kwargs):\n        if 'n_samples_per_update' in kwargs:\n            self.n_samples_per_update = kwargs['n_samples_per_update']\n            del kwargs['n_samples_per_update']\n        if 'n_sample_update_iters' in kwargs:\n            
self.n_sample_update_iters = kwargs['n_sample_update_iters']\n            del kwargs['n_sample_update_iters']\n\n        self.i_sample_update_iter = 0\n\n        if 'minibatch_size' in kwargs:\n            minibatch_size = kwargs['minibatch_size']\n            del kwargs['minibatch_size']\n        else:\n            minibatch_size = 100\n\n        self.create_minibatch_generator(minibatch_size)\n\n        self.minibatch_load_target = True\n        if 'minibatch_load_target' in kwargs:\n            self.minibatch_load_target = kwargs['minibatch_load_target']\n            del kwargs['minibatch_load_target']\n\n    def f_post_training(self):\n        # self.net.set_param_from_vec(self.best_w)\n        if hasattr(self, 'best_grad') and hasattr(self, 'best_obj'):\n            return self.best_obj, self.best_grad\n\n    def save_model(self):\n        # self.net.save_model_to_file(self.output_dir + '/gen_%s.pdata' % (time.strftime('%Y%m%d_%H%M%S', time.localtime())))\n        self.net.save_model_to_file(self.output_dir + '/gen_end.pdata')\n\n    def save_checkpoint(self, label):\n        self.net.save_model_to_file(self.output_dir + '/checkpoint_%s.pdata' % str(label))\n\nclass StochasticGenerativeNetLearnerAutoScale(learner.Learner):\n    \"\"\"\n    Used for learning the StochasticGenerativeNet model with MMD loss.  
The\n    scale parameter will be automatically tuned.\n    \"\"\"\n    def __init__(self, net):\n        super(StochasticGenerativeNetLearnerAutoScale, self).__init__(net)\n        self.n_samples_per_update = 100\n        self.n_sample_update_iters = 1\n        self.i_sample_update_iter = 0\n        self.i_scale_update_iter = 0\n        self.n_scale_update_iters = 0\n        self.n_scale_update_samples = 2000\n        self._scale_selection_range = np.logspace(0, 8, 30)\n\n        self.set_output_dir('.')\n\n    def load_data(self, x_train):\n        self.x_train = util.to_garray(x_train)\n\n    def load_train_target(self):\n        self.net.load_target(self.x_train)\n\n    def sample_hiddens(self):\n        self.z = self.net.sample_hiddens(self.n_samples_per_update)\n\n    def update_loss_scale(self):\n        \"\"\"\n        Automatically set the scale of the loss.\n        \"\"\"\n        n_data_samples = min(self.x_train.shape[0], self.n_scale_update_samples)\n        data = self.x_train[np.random.permutation(self.x_train.shape[0])[:n_data_samples]]\n        samples = self.net.generate_samples(n_samples=self.n_scale_update_samples)\n\n        max_loss = 0\n        max_sigma = 1\n        for s in self._scale_selection_range:\n            mmd = ls.get_loss_from_type_name(self.net.loss.get_name(), sigma=s, scale_weight=self.net.loss.scale_weight[0])\n            mmd.load_target(data)\n            loss = mmd.compute_not_weighted_loss_and_grad(samples, compute_grad=False)[0]\n            if loss > max_loss:\n                max_loss = loss\n                max_sigma = s\n\n        print '>>> Reset loss...'\n        self.net.loss.sigma = [float(max_sigma)]\n        self.net.loss.scale_weight = [float(self.net.loss.scale_weight[0])]\n        print '>>>',\n        print self.net.loss\n\n    def f_and_fprime(self, w):\n        self.net.set_param_from_vec(w)\n        self.net.clear_gradient()\n\n        # resample if necessary\n        if self.i_sample_update_iter % 
self.n_sample_update_iters == 0:\n            self.sample_hiddens()\n        self.i_sample_update_iter = (self.i_sample_update_iter + 1) % self.n_sample_update_iters\n\n        # update scale of the loss if necessary\n        if self.n_scale_update_iters > 0:\n            if self.i_scale_update_iter % self.n_scale_update_iters == 0:\n                self.update_loss_scale()\n            self.i_scale_update_iter = (self.i_scale_update_iter + 1) % self.n_scale_update_iters\n\n        self.net.forward_prop(self.z, add_noise=True, compute_loss=True)\n        loss = self.net.get_loss() / self.z.shape[0]\n        self.net.backward_prop()\n        grad = self.net.get_grad_vec() / self.z.shape[0]\n        return loss, grad\n\n    def create_minibatch_generator(self, minibatch_size):\n        self.minibatch_generator = learner.MiniBatchGenerator(\n                self.x_train, minibatch_size=minibatch_size, random_order=True)\n\n    def f_and_fprime_minibatch(self, w):\n        self.net.set_param_from_vec(w)\n        self.net.clear_gradient()\n\n        if self.i_sample_update_iter % self.n_sample_update_iters == 0:\n            if self.minibatch_load_target:\n                x = self.minibatch_generator.next()\n                self.net.load_target(x)\n            self.sample_hiddens()\n\n        self.i_sample_update_iter = (self.i_sample_update_iter + 1) % self.n_sample_update_iters\n\n        if self.n_scale_update_iters > 0:\n            if self.i_scale_update_iter % self.n_scale_update_iters == 0:\n                self.update_loss_scale()\n            self.i_scale_update_iter = (self.i_scale_update_iter + 1) % self.n_scale_update_iters\n\n        self.net.forward_prop(self.z, add_noise=True, compute_loss=True)\n        loss = self.net.get_loss() / self.z.shape[0]\n        self.net.backward_prop()\n        grad = self.net.get_grad_vec() / self.z.shape[0]\n\n        return loss, grad\n\n    def train_stochastic_lbfgs(self, **kwargs):\n        
self._prepare_for_training()\n        if 'minibatch_size' in kwargs:\n            minibatch_size = kwargs['minibatch_size']\n            del kwargs['minibatch_size']\n        else:\n            minibatch_size = 100\n\n        self.create_minibatch_generator(minibatch_size)\n        self._process_options(kwargs)\n        #self.print_options(kwargs)\n        self.best_w, self.best_obj, d = spopt.fmin_l_bfgs_b(self.f_and_fprime_minibatch, self.init_w, **kwargs)\n        self.best_grad = d['grad']\n        return self.f_post_training()\n\n    def f_info(self, w):\n        \"\"\"\n        train_loss = None\n\n        w_0 = self.net.get_param_vec()\n        self.net.set_noiseless_param_from_vec(w)\n\n        y = self.net.forward_prop(self.x_train, add_noise=False, compute_loss=True)\n        train_loss = self.net.get_loss() / self.x_train.shape[0]\n        train_acc = (self.t_train == y.argmax(axis=1)).mean()\n\n        if self.use_validation:\n            y = self.net.forward_prop(self.x_val, add_noise=False, compute_loss=False)\n            val_acc = (self.t_val == y.argmax(axis=1)).mean()\n\n            s = 'train loss %.4f, acc %.4f, val acc ' % (train_loss, train_acc)\n            if self.best_obj is None or val_acc > self.best_obj:\n                self.best_obj = val_acc \n                self.best_w = w.copy()\n                s += co.good_colored_str('%.4f' % val_acc)\n            else:\n                s += '%.4f' % val_acc\n        else:\n            s = 'train loss %.4f, acc ' % train_loss\n            if self.best_obj is None or train_acc < self.best_obj:\n                self.best_obj = train_acc\n                self.best_w = w.copy()\n                s += co.good_colored_str('%.4f' % train_acc)\n            else:\n                s += '%.4f' % train_acc\n\n        self.net.set_param_from_vec(w_0)\n        return s\n        \"\"\"\n        return '<place holder>'\n\n    def _process_options(self, kwargs):\n        if 'n_samples_per_update' in kwargs:\n     
       self.n_samples_per_update = kwargs['n_samples_per_update']\n            del kwargs['n_samples_per_update']\n        if 'n_sample_update_iters' in kwargs:\n            self.n_sample_update_iters = kwargs['n_sample_update_iters']\n            del kwargs['n_sample_update_iters']\n\n        self.i_sample_update_iter = 0\n\n        if 'minibatch_size' in kwargs:\n            minibatch_size = kwargs['minibatch_size']\n            del kwargs['minibatch_size']\n        else:\n            minibatch_size = 100\n\n        self.create_minibatch_generator(minibatch_size)\n\n        self.minibatch_load_target = True\n        if 'minibatch_load_target' in kwargs:\n            self.minibatch_load_target = kwargs['minibatch_load_target']\n            del kwargs['minibatch_load_target']\n\n        if 'i_scale_update' in kwargs:\n            self.n_scale_update_iters = kwargs['i_scale_update']\n            del kwargs['i_scale_update']\n        else:\n            self.n_scale_update_iters = 0\n\n        if 'n_scale_update_samples' in kwargs:\n            self.n_scale_update_samples = kwargs['n_scale_update_samples']\n            del kwargs['n_scale_update_samples']\n        else:\n            self.n_scale_update_samples = 2000\n\n    def f_post_training(self):\n        # self.net.set_param_from_vec(self.best_w)\n        if hasattr(self, 'best_grad') and hasattr(self, 'best_obj'):\n            return self.best_obj, self.best_grad\n\n    def save_model(self):\n        # self.net.save_model_to_file(self.output_dir + '/gen_%s.pdata' % (time.strftime('%Y%m%d_%H%M%S', time.localtime())))\n        self.net.save_model_to_file(self.output_dir + '/gen_end.pdata')\n\n    def save_checkpoint(self, label):\n        self.net.save_model_to_file(self.output_dir + '/checkpoint_%s.pdata' % str(label))\n\n\n\n\n"
  },
  {
    "path": "core/kernels.py",
    "content": "\"\"\"\nImplementation of different kernel functions.\n\nYujia Li, 11/2014\n\"\"\"\n\nimport numpy as np\nimport gnumpy as gnp\n\ndef safe_diag(x):\n    \"\"\"\n    Return the main diagonal of the square matrix x, which must be either a\n    numpy array or a garray.\n\n    garrays with more than 4000 rows are converted to numpy to extract the\n    diagonal and then converted back.\n    \"\"\"\n    if isinstance(x, np.ndarray):\n        return x.diagonal()\n    if isinstance(x, gnp.garray):\n        if x.shape[0] > 4000:\n            return gnp.garray(x.asarray().diagonal())\n        else:\n            return x.diag()\n\n    # TypeError is a subclass of Exception, so callers catching Exception\n    # still work, but the failure now says what went wrong.\n    raise TypeError('safe_diag expects a numpy array or a garray, got %s' % type(x))\n\nclass Kernel(object):\n    \"\"\"\n    Base class of all kernels.\n    \"\"\"\n    def __init__(self):\n        pass\n\n    def compute_kernel_matrix(self, x):\n        \"\"\"\n        x: n_examples * n_dims input data matrix\n\n        Return: n_examples * n_examples kernel matrix\n        \"\"\"\n        return self.compute_kernel_transformation(x, x)\n\n    def compute_kernel_transformation(self, x_base, x_new):\n        \"\"\"\n        x_base: n_examples_1 * n_dims data matrix\n        x_new: n_examples_2 * n_dims data matrix\n\n        For each example in x_new, compute its kernel distance with each of the\n        examples in x_base, return a n_examples_2 * n_examples_1 matrix as the\n        transformed representation of x_new.\n        \"\"\"\n        raise NotImplementedError()\n\n    def get_name(self):\n        raise NotImplementedError()\n\nclass GaussianKernel(Kernel):\n    \"\"\"\n    k(x,y) = exp(-|x-y|^2 / (2 sigma^2))\n    \"\"\"\n    def __init__(self, sigma):\n        self.sigma = sigma\n\n    def compute_kernel_matrix(self, x):\n        x = x if isinstance(x, gnp.garray) else gnp.garray(x)\n        xx = x.dot(x.T)\n        x_diag = safe_diag(xx)\n\n        return gnp.exp(-1.0 / (2 * self.sigma**2) * (-2 * xx + x_diag + x_diag[:,gnp.newaxis]))\n\n    def compute_kernel_transformation(self, x_base, x_new):\n        x_base = x_base if isinstance(x_base, gnp.garray) else gnp.garray(x_base)\n        x_new = x_new if isinstance(x_new, gnp.garray) else gnp.garray(x_new)\n\n        xx = x_new.dot(x_base.T)\n        xx_base = (x_base**2).sum(axis=1)\n        xx_new = (x_new**2).sum(axis=1)\n        return gnp.exp(-1.0 / (2 * self.sigma**2) * (-2 * xx + xx_base + xx_new[:,gnp.newaxis]))\n\n    def get_name(self):\n        return 'gaussian_kernel'\n\nclass EuclideanKernel(Kernel):\n    \"\"\"\n    k(x,y) = |x-y|^2, the squared Euclidean distance.\n    \"\"\"\n    def __init__(self):\n        pass\n\n    def compute_kernel_matrix(self, x):\n        x = x if isinstance(x, gnp.garray) else gnp.garray(x)\n        xx = x.dot(x.T)\n        x_diag = safe_diag(xx)\n\n        return (-2 * xx + x_diag + x_diag[:,gnp.newaxis])\n\n    def compute_kernel_transformation(self, x_base, x_new):\n        x_base = x_base if isinstance(x_base, gnp.garray) else gnp.garray(x_base)\n        x_new = x_new if isinstance(x_new, gnp.garray) else gnp.garray(x_new)\n\n        xx = x_new.dot(x_base.T)\n        xx_base = (x_base**2).sum(axis=1)\n        xx_new = (x_new**2).sum(axis=1)\n\n        return (-2 * xx + xx_base + xx_new[:,gnp.newaxis])\n\n    def get_name(self):\n        return 'euclidean_kernel'\n\nclass CPUGaussianKernel(Kernel):\n    \"\"\"\n    Place holder, the kernel computation is not implemented yet:\n    compute_kernel_matrix does nothing and returns None.\n    \"\"\"\n    def __init__(self, sigma):\n        self.sigma = sigma\n\n    def compute_kernel_matrix(self, x):\n        pass\n\nclass LinearKernel(Kernel):\n    \"\"\"\n    k(x,y) = <x,y>\n    \"\"\"\n    def compute_kernel_matrix(self, x):\n        x = x if isinstance(x, gnp.garray) else gnp.garray(x)\n        return x.dot(x.T)\n\n    def compute_kernel_transformation(self, x_base, x_new):\n        x_base = x_base if isinstance(x_base, gnp.garray) else gnp.garray(x_base)\n        x_new = x_new if isinstance(x_new, gnp.garray) else gnp.garray(x_new)\n\n        return x_new.dot(x_base.T)\n\n    def get_name(self):\n        return 'linear_kernel'\n\nclass CosineKernel(Kernel):\n    \"\"\"\n    k(x,y) = <x,y> / (|x| |y|)\n    \"\"\"\n    def compute_kernel_matrix(self, x):\n        x = x if isinstance(x, gnp.garray) else gnp.garray(x)\n        x_norm = gnp.sqrt((x**2).sum(axis=1))\n        # Cosine similarity divides by the product (not the sum) of the two\n        # norms; the small constant avoids dividing by zero for all-zero rows.\n        x_norm = x_norm[:,gnp.newaxis] * x_norm[gnp.newaxis,:] + 1e-20\n\n        return x.dot(x.T) / x_norm\n\n    def compute_kernel_transformation(self, x_base, x_new):\n        x_base = x_base if isinstance(x_base, gnp.garray) else gnp.garray(x_base)\n        x_new = x_new if isinstance(x_new, gnp.garray) else gnp.garray(x_new)\n\n        # use the norms themselves rather than the squared norms, so that this\n        # agrees with compute_kernel_matrix when x_base and x_new are the same\n        base_norm = gnp.sqrt((x_base**2).sum(axis=1))\n        new_norm = gnp.sqrt((x_new**2).sum(axis=1))\n\n        return x_new.dot(x_base.T) / (base_norm * new_norm[:,gnp.newaxis] + 1e-20)\n\n    def get_name(self):\n        return 'cosine_kernel'\n\n"
  },
  {
    "path": "core/util.py",
    "content": "\"\"\"\nSome helpful utility functions.\n\nYujia Li, 09/2014\n\"\"\"\n\nimport gnumpy as gnp\nimport numpy as np\n\ndef to_garray(x):\n    \"\"\"\n    Return x as a garray, x itself is returned if it already is one.\n    \"\"\"\n    return x if isinstance(x, gnp.garray) else gnp.garray(x)\n\ndef to_nparray(x):\n    \"\"\"\n    Return x as a numpy array.  numpy arrays are returned as is, objects with\n    an asarray() method (garrays) are converted with it, and anything else\n    (lists, tuples, scalars) goes through np.asarray.\n    \"\"\"\n    if isinstance(x, np.ndarray):\n        return x\n    if hasattr(x, 'asarray'):\n        return x.asarray()\n    return np.asarray(x)\n\ndef _make_label_matrix(t, K, off_value):\n    \"\"\"\n    Build the n_cases x K numpy matrix where the t[i]th element on the ith row\n    is 1 and all others on that row are off_value.  K defaults to max(t) + 1.\n    \"\"\"\n    # Labels may come as an N x 1 matrix or with a non-integer dtype; flatten\n    # and cast them so that they can be used as column indices.\n    t = to_nparray(t).ravel().astype(int)\n    n_cases = t.size\n    if K is None:\n        K = int(t.max()) + 1\n\n    t_mat = np.empty((n_cases, K))\n    t_mat.fill(off_value)\n    t_mat[np.arange(n_cases), t] = 1\n    return t_mat\n\ndef to_one_of_K(t, K=None):\n    \"\"\"\n    Convert the 1-D label vector into a matrix where the t[i]th element on the\n    ith row is 1 and all others on that row is 0.\n\n    K is the number of columns, max(t) + 1 is used when it is not given.\n    \"\"\"\n    return gnp.garray(_make_label_matrix(t, K, 0))\n\ndef to_plus_minus_of_K(t, K=None):\n    \"\"\"\n    Convert the 1-D label vector into a matrix where the t[i]th element on the\n    ith row is 1 and all others on that row is -1.\n\n    K is the number of columns, max(t) + 1 is used when it is not given.\n    \"\"\"\n    return gnp.garray(_make_label_matrix(t, K, -1))\n\n"
  },
  {
    "path": "dataio/__init__.py",
    "content": ""
  },
  {
    "path": "dataio/mnist.py",
    "content": "\"\"\"\nData I/O for mnist dataset.\n\nYujia Li, 01/2015\n\"\"\"\n\nimport cPickle as pickle\nimport numpy as np\n\n# Fill in the path to your mnist data here\n#\n# This data file is supposed to be a pickled dictionary containing numpy arrays\n# including train_data, test_data, train_label, test_label.  For train_data\n# and test_data, they are matrices of size NxD, where N is the number of data\n# points and D=784 (28x28) is the size of the image.  Each row is a data point,\n# assumed to be already normalized to [0,1].  For train_label and test_label,\n# they are matrices of size Nx1, each label is an integer from 0 to 9.\n_DATA_FILE_PATH = 'path/to/your/mnist/data'\n\ndef _load_data_dict():\n    \"\"\"\n    Unpickle and return the data dictionary stored at _DATA_FILE_PATH.\n    \"\"\"\n    # Binary mode: pickles written with a binary protocol are corrupted by\n    # newline translation when they are read in text mode on some platforms.\n    # NOTE: unpickling can run arbitrary code, only load files you trust.\n    with open(_DATA_FILE_PATH, 'rb') as f:\n        return pickle.load(f)\n\ndef _get_fixed_permutation(size):\n    \"\"\"\n    Return a permutation of range(size) generated with the fixed seed 0, the\n    state of the global numpy random number generator is left untouched.\n    \"\"\"\n    # keep current state of random number generator\n    rand_state = np.random.get_state()\n    try:\n        np.random.seed(0)\n        return np.random.permutation(size)\n    finally:\n        # restore the state of random number generator, even on failure\n        np.random.set_state(rand_state)\n\ndef load_raw_data():\n    \"\"\"\n    Return the original train/test split.\n    \"\"\"\n    d = _load_data_dict()\n    return d['train_data'], d['test_data']\n\ndef load_data(n_val=5000):\n    \"\"\"\n    Split part of training data to be used as validation data.\n\n    n_val: number of training examples moved into the validation set.\n\n    Return x_train, x_val, x_test.\n    \"\"\"\n    d = _load_data_dict()\n\n    x_train = d['train_data']\n    x_test  = d['test_data']\n\n    idx = _get_fixed_permutation(x_train.shape[0])\n\n    x_val = x_train[idx[:n_val]]\n    x_train = x_train[idx[n_val:]]\n\n    return x_train, x_val, x_test\n\ndef load_labeled_data(n_val=5000):\n    \"\"\"\n    Load both the data and the labels.\n\n    n_val: number of training examples moved into the validation set.  The\n    split is the same as the one used by load_data for the same n_val.\n\n    Return x_train, t_train, x_val, t_val, x_test, t_test.\n    \"\"\"\n    d = _load_data_dict()\n\n    x_train = d['train_data']\n    t_train = d['train_label']\n\n    x_test = d['test_data']\n    t_test = d['test_label']\n\n    idx = _get_fixed_permutation(x_train.shape[0])\n\n    x_val = x_train[idx[:n_val]]\n    t_val = t_train[idx[:n_val]]\n    x_train = x_train[idx[n_val:]]\n    t_train = t_train[idx[n_val:]]\n\n    return x_train, t_train, x_val, t_val, x_test, t_test\n\n\n"
  },
  {
    "path": "dataio/tfd.py",
    "content": "\"\"\"\nThe Toronto Face Database, Charlie/Marc'Aurelio's version\n\nYujia Li, 01/2015\n\"\"\"\n\nimport scipy.io as sio\nimport numpy as np\n\n# Fill in your TFD path here\n_TFD_DATA_PATH_FORMAT = 'path/to/your/TFD_ranzato_%dx%d.mat'\n\ndef _load_raw_data(image_size=48):\n    \"\"\"\n    Load the .mat file for the given image size and return\n    images, folds, labs_id, labs_ex.\n    \"\"\"\n    d = sio.loadmat(_TFD_DATA_PATH_FORMAT % (image_size, image_size))\n    return d['images'], d['folds'], d['labs_id'].squeeze(), d['labs_ex'].squeeze()\n\ndef get_fixed_rand_permutation(size, seed=1):\n    \"\"\"\n    Return a permutation of range(size) that only depends on size and seed.\n    The state of the global numpy random number generator is left untouched.\n    \"\"\"\n    rand_state = np.random.get_state()\n    try:\n        np.random.seed(seed)\n        idx = np.random.permutation(size)\n    finally:\n        # restore the generator state even if drawing the permutation fails\n        np.random.set_state(rand_state)\n\n    return idx\n\nclass TFD(object):\n    \"\"\"\n    The TFD data for one image size, with access to the individual folds.\n    \"\"\"\n    def __init__(self, image_size=48):\n        self.images, self.folds, self.labs_id, self.labs_ex = _load_raw_data(image_size)\n\n        # number of validation examples in each of the 5 folds, and the\n        # offsets used by get_proper_fold to cut out chunks of those sizes\n        self._val_sizes = [(self.folds[:,fold] == 2).sum() for fold in range(5)]\n        self._val_idx_start = np.array([0] + self._val_sizes).cumsum()\n\n    def _get_masked_data(self, data_mask, center, scale):\n        \"\"\"\n        Return images, labs_id, and labs_ex for the examples selected by the\n        boolean vector data_mask.\n\n        images are converted to float32.  With both center and scale set,\n        127.5 is subtracted and the result is divided by 127.5; with center\n        only 127.5 is subtracted; with scale only images are divided by 255.\n        \"\"\"\n        images = self.images[data_mask].astype(np.float32)\n        labs_id = self.labs_id[data_mask]\n        labs_ex = self.labs_ex[data_mask]\n\n        if center and scale:\n            images -= 127.5\n            images /= 127.5\n        elif center:\n            images -= 127.5\n        elif scale:\n            images /= 255.0\n\n        return images, labs_id, labs_ex\n\n    def get_fold(self, fold, set_name, center=False, scale=False):\n        \"\"\"\n        0 <= fold < 5\n        set_name should be one of {train, val, test, unlabeled}\n\n        Return images, labs_id, and labs_ex.\n\n        There are two labels available: identity and expression.  For\n        unsupervised learning tasks these labels are not useful though.  The\n        quality of these labels is also not very high.\n        \"\"\"\n        set_map = {'unlabeled': 0, 'train' : 1, 'val': 2, 'test': 3}\n        set_id = set_map[set_name]\n        data_mask = (self.folds[:,fold] == set_id)\n\n        return self._get_masked_data(data_mask, center, scale)\n\n    def get_proper_fold(self, fold, set_name, center=False, scale=False):\n        \"\"\"\n        Same as get_fold, except that the validation sets across folds will be\n        disjoint from test sets and training sets - so validation is proper.\n\n        The validation set of a fold is a fixed chunk of the examples that\n        are unlabeled in that fold, the unlabeled set is what remains after\n        the chunks for all folds are taken out.\n        \"\"\"\n        set_map = {'unlabeled': 0, 'train' : 1, 'val': 2, 'test': 3}\n        set_id = set_map[set_name]\n\n        if set_id == 0 or set_id == 2:\n            data_mask = (self.folds[:,fold] == 0)\n            unlabeled_idx = np.arange(self.folds.shape[0])[data_mask]\n            idx = get_fixed_rand_permutation(unlabeled_idx.size)\n            if set_id == 2:\n                idx = idx[self._val_idx_start[fold]:self._val_idx_start[fold+1]]\n            else:\n                idx = idx[self._val_idx_start[-1]:]\n            # idx holds positions within unlabeled_idx, not rows of the full\n            # data set, so map it back before building the mask\n            data_mask = np.zeros(data_mask.size, dtype=bool)\n            data_mask[unlabeled_idx[idx]] = True\n        else:\n            data_mask = (self.folds[:,fold] == set_id)\n\n        return self._get_masked_data(data_mask, center, scale)\n\n\n\n_tfd = {48: None, 96: None}\n\ndef _get_tfd(image_size):\n    \"\"\"\n    Return the cached TFD object for image_size, loading it on first use.\n    \"\"\"\n    if image_size != 48 and image_size != 96:\n        raise Exception('image_size has to be either 48 or 96!')\n\n    if _tfd[image_size] is None:\n        _tfd[image_size] = TFD(image_size) # load data the first time we use it\n\n    return _tfd[image_size]\n\ndef load_fold(fold, set_name, center=False, scale=False, image_size=48):\n    \"\"\"\n    Module level shortcut for TFD.get_fold, the data is loaded only once.\n    \"\"\"\n    return _get_tfd(image_size).get_fold(fold, set_name, center, scale)\n\ndef load_proper_fold(fold, set_name, center=False, scale=False, image_size=48):\n    \"\"\"\n    Module level shortcut for TFD.get_proper_fold, the data is loaded only once.\n    \"\"\"\n    return _get_tfd(image_size).get_proper_fold(fold, set_name, center, scale)\n\n"
  },
  {
    "path": "eval_mmd_generative_model.py",
    "content": "\"\"\"\nModule for evaluating MMD generative models.\n\nYujia Li, 11/2014\n\"\"\"\n\nimport cPickle as pickle\nimport time\nimport numpy as np\nimport gnumpy as gnp\nimport core.generative as gen\nimport core.kernels as ker\n\ndef load_tfd_fold(fold=0):\n    \"\"\"\n    Return train, val, test data for the particular fold.\n    \"\"\"\n    import dataio.tfd as tfd\n    # note that the training set used here is the 'unlabeled' set in TFD\n    x_train, _, _ = tfd.load_fold(fold, 'unlabeled', scale=True)\n    x_val,   _, _ = tfd.load_fold(fold, 'val', scale=True)\n    x_test,  _, _ = tfd.load_fold(fold, 'test', scale=True)\n\n    imsz = np.prod(x_train.shape[1:])\n\n    return x_train.reshape(x_train.shape[0], imsz), \\\n            x_val.reshape(x_val.shape[0], imsz), \\\n            x_test.reshape(x_test.shape[0], imsz)\n\ndef linear_classifier_discrimination(model, data, C_range=[1], verbose=True, samples=None):\n    \"\"\"\n    Compute the logistic regression classification accuracy.\n    \"\"\"\n    import sklearn.linear_model as lm\n    n_examples = data.shape[0]\n    if samples is None:\n        gnp.seed_rand(8)\n        samples = model.generate_samples(n_samples=n_examples).asarray()\n\n    x = np.r_[data, samples]\n    t = np.r_[np.zeros(n_examples, dtype=np.int), np.ones(samples.shape[0], dtype=np.int)]\n\n    best_acc = 0\n    best_classifier = None\n\n    for C in C_range:\n        t_start = time.time()\n        lr = lm.LogisticRegression(C=C, dual=False, random_state=8)\n        lr.fit(x,t)\n        acc = (lr.predict(x) == t).mean()\n\n        if verbose:\n            print 'C=%g  acc=%.4f' % (C, acc),\n        if acc > best_acc:\n            best_acc = acc\n            best_classifier = lr \n            if verbose:\n                print '*',\n        else:\n            if verbose:\n                print ' ',\n\n        if verbose:\n            print 'time=%.2f' % (time.time() - t_start)\n\n    return best_acc, best_classifier\n\ndef 
eval_filter_thresholds(model, data, thres_range=np.arange(0, 0.9, 0.1)):\n    \"\"\"\n    Evaluate the discrimination performance at different threshold levels.\n    \"\"\"\n    #data = data[:1000]\n    n_thres = len(thres_range)\n\n    # base classifier\n    acc, c = linear_classifier_discrimination(model, data, verbose=False)\n    acc_rec = np.zeros(n_thres, dtype=np.float)\n    acc_best = 1\n    for i in range(n_thres):\n        t_start = time.time()\n        ftr = gen.ClassifierSampleFilter(c, thres_range[i])\n        ftnet = gen.StochasticGenerativeNetWithFilter(model, ftr)\n        s = ftnet.generate_samples(n_samples=data.shape[0]).asarray()\n        acc_ftr, c_ftr = linear_classifier_discrimination(None, data, verbose=False, samples=s)\n        print 'thres=%.2f, acc=%.4f, time=%.2f' % (thres_range[i], acc_ftr, time.time() - t_start)\n        acc_rec[i] = acc_ftr\n        if acc_ftr < acc_best:\n            acc_best = acc_ftr\n            best_ftnet = ftnet\n\n    return best_ftnet\n\ndef get_filtered_model(net, data):\n    acc, lr = linear_classifier_discrimination(net, data, C_range=[1, 10, 100, 1000], verbose=True)\n    filter = gen.ClassifierSampleFilter(lr, threshold=0.8)\n    filtered_net = gen.StochasticGenerativeNetWithFilter(net, filter)\n    return filtered_net, lr\n\ndef test_single_filter_old(net, data, base_samples, base_classifier, threshold, base_filter=None):\n    \"\"\"\n    net: the base net\n    data: training data\n    base_samples: samples generated by the base model with the base filter\n    base_classifier: classifier trained to discriminate data from base_samples\n    threshold: threshold used for the filter\n    \"\"\"\n    if base_classifier is None:\n        ftr = gen.BlankSampleFilter()\n    else:\n        ftr = gen.ClassifierSampleFilter(base_classifier, threshold, prev=base_filter)\n    s = ftr.filter(base_samples)\n\n    n_base = base_samples.shape[0]\n    n_s = s.shape[0]\n\n    print 'Filtered out %d/%d ~ %%%.1f' % (n_base - 
n_s, n_base, 100.0 * (n_base - n_s) / n_base)\n\n    ftnet = gen.StochasticGenerativeNetWithFilter(net, ftr)\n    ss = ftnet.generate_samples(n_samples=data.shape[0]).asarray()\n\n    acc, c = linear_classifier_discrimination(net, data, samples=ss)\n\n    return ftr, c\n\ndef test_single_filter(net, data, threshold, base_samples=None, base_classifier=None, base_filter=None):\n    pass\n\ndef log_exp_sum_1d(x):\n    \"\"\"\n    This computes log(exp(x_1) + exp(x_2) + ... + exp(x_n)) as \n    x* + log(exp(x_1-x*) + exp(x_2-x*) + ... + exp(x_n-x*)), where x* is the\n    max over all x_i.  This can avoid numerical problems.\n    \"\"\"\n    x_max = x.max()\n    if isinstance(x, gnp.garray):\n        return x_max + gnp.log(gnp.exp(x - x_max).sum())\n    else:\n        return x_max + np.log(np.exp(x - x_max).sum())\n\ndef log_exp_sum(x, axis=1):\n    x_max = x.max(axis=axis)\n    if isinstance(x, gnp.garray):\n        return (x_max + gnp.log(gnp.exp(x - x_max[:,gnp.newaxis]).sum(axis=axis))).asarray()\n    else:\n        return x_max + np.log(np.exp(x - x_max[:,np.newaxis]).sum(axis=axis))\n\nclass KDE(object):\n    \"\"\"\n    Kernel density estimation.\n    \"\"\"\n    def __init__(self, data, sigma):\n        self.x = gnp.garray(data) if not isinstance(data, gnp.garray) else data\n        self.sigma = sigma\n        self.N = self.x.shape[0]\n        self.d = self.x.shape[1]\n        self._ek =  ker.EuclideanKernel()\n\n        self.factor = float(-np.log(self.N) - self.d / 2.0 * np.log(2 * np.pi * self.sigma**2))\n\n    def _log_likelihood(self, data):\n        return log_exp_sum(-self._ek.compute_kernel_transformation(self.x, data) / (2 * self.sigma**2), axis=1) + self.factor\n\n    def log_likelihood(self, data, batch_size=1000):\n        n_cases = data.shape[0]\n        if n_cases <= batch_size:\n            return self._log_likelihood(data)\n        else:\n            n_batches = (n_cases + batch_size - 1) / batch_size\n            log_like = np.zeros(n_cases, 
dtype=np.float)\n\n            for i_batch in range(n_batches):\n                i_start = i_batch * batch_size\n                i_end = n_cases if (i_batch + 1 == n_batches) else (i_start + batch_size)\n                log_like[i_start:i_end] = self._log_likelihood(data[i_start:i_end])\n\n            return log_like\n\n    def likelihood(self, data):\n        \"\"\"\n        data is a n_example x n_dims matrix.\n        \"\"\"\n        return np.exp(self.log_likelihood(data))\n\n    def average_likelihood(self, data):\n        return self.likelihood(data).mean()\n\n    def average_log_likelihood(self, data, batch_size=1000):\n        return self.log_likelihood(data, batch_size=batch_size).mean()\n\n    def average_std_log_likelihood(self, data, batch_size=1000):\n        l = self.log_likelihood(data)\n        return l.mean(), l.std()\n\n    def average_se_log_likelihood(self, data, batch_size=1000):\n        l = self.log_likelihood(data)\n        return l.mean(), l.std() / np.sqrt(data.shape[0])\n\nclass AlternativeKDE(object):\n    \"\"\"\n    Kernel density estimation.\n    \"\"\"\n    def __init__(self, data, sigma):\n        self.x = data if not isinstance(data, gnp.garray) else data.asarray()\n        self.sigma = sigma\n        self.N = self.x.shape[0]\n        self.d = self.x.shape[1]\n\n    def _compute_log_prob(self, data, batch_size=1000):\n        \"\"\"\n        Break down data into smaller pieces so large matrix will also work.\n        \"\"\"\n        data = data if not isinstance(data, gnp.garray) else data.asarray()\n        n_cases = data.shape[0]\n        K = np.zeros((n_cases, self.N), dtype=np.float)\n        log_prob = np.zeros(n_cases, dtype=np.float)\n        for i in range(n_cases):\n            K[i] = -((self.x - data[i])**2).sum(axis=1) / (2 * self.sigma**2)\n            log_prob[i] = log_exp_sum_1d(K[i]) - np.log(self.N) - self.d / 2.0 * (np.log(2 * np.pi) + 2 * np.log(self.sigma))\n\n        return log_prob\n\n    def likelihood(self, 
data):\n        \"\"\"\n        data is a n_example x n_dims matrix.\n        \"\"\"\n        return np.exp(self._compute_log_prob(data))\n\n    def average_likelihood(self, data):\n        return self.likelihood(data).mean()\n\n    def log_likelihood(self, data):\n        # return np.log(self._compute_kde(data) + 1e-50)# - self.d / 2.0 * (np.log(2 * np.pi) + 2 * np.log(self.sigma))\n        return self._compute_log_prob(data)\n\n    def average_log_likelihood(self, data):\n        return self.log_likelihood(data).mean()\n\n\ndef kde_evaluation(test_data, samples, sigma_range=np.arange(0.1, 0.3, 0.01), verbose=True):\n    best_log_likelihood = float('-inf')\n    for sigma in sigma_range:\n        log_likelihood = KDE(samples, sigma).average_log_likelihood(test_data)\n        if log_likelihood > best_log_likelihood:\n            best_log_likelihood = log_likelihood\n        if verbose:\n            print 'sigma=%g, log_likelihood=%.2f' % (sigma, log_likelihood)\n\n    if verbose:\n        print '===================='\n        print 'Best log_likelihood=%.2f' % best_log_likelihood\n        print ''\n    return best_log_likelihood\n\ndef kde_evaluation_tfd(test_data, samples, sigma_range=np.arange(0.05, 0.25, 0.01), verbose=True):\n    return kde_evaluation(test_data, samples, sigma_range, verbose)\n\ndef kde_evaluation_all_folds(test_data, samples, sigma_range=np.arange(0.05, 0.25, 0.01), verbose=True):\n    n_folds = len(samples)\n    best_log_likelihood = float('-inf')\n    for sigma in sigma_range:\n        log_likelihood = [KDE(samples[i], sigma).average_log_likelihood(test_data[i]) for i in range(n_folds)]\n        avg_log_likelihood = sum(log_likelihood) / float(n_folds)\n        if avg_log_likelihood > best_log_likelihood:\n            best_log_likelihood = avg_log_likelihood\n        if verbose:\n            print 'sigma=%5g, log_likelihood=%8.2f   [%s]' % (sigma, avg_log_likelihood, ', '.join(['%8.2f' % l for l in log_likelihood]))\n\n    if verbose:\n       
 print '===================='\n        print 'Best log_likelihood=%.2f' % best_log_likelihood\n        print ''\n    return best_log_likelihood\n\ndef generate_fold_samples(net, fold_model_format, ae=None, fold_ae_format=None, n_samples=10000, n_folds=5):\n    samples = []\n    for fold in range(n_folds):\n        net.load_model_from_file(fold_model_format % fold)\n        if ae is not None:\n            ae.load_model_from_file(fold_ae_format % fold)\n            net.autoencoder = ae\n        samples.append(net.generate_samples(n_samples=n_samples))\n\n    return samples\n\ndef get_fold_data(set_name, n_folds=5):\n    data = []\n    for i_fold in range(n_folds):\n        x_train, x_val, x_test = load_tfd_fold(i_fold)\n        if set_name == 'train':\n            data.append(x_train)\n        elif set_name == 'val':\n            data.append(x_val)\n        elif set_name == 'test':\n            data.append(x_test)\n    return data\n\ndef kde_eval_mnist(net, test_data, n_samples=10000, sigma_range=np.arange(0.1, 0.3, 0.01), verbose=True):\n    s = net.generate_samples(n_samples=n_samples)\n    best_log_likelihood = float('-inf')\n    best_se = 0\n    best_sigma = 0\n    for sigma in sigma_range:\n        log_likelihood, se = KDE(s, sigma).average_se_log_likelihood(test_data)\n        if log_likelihood > best_log_likelihood:\n            best_log_likelihood = log_likelihood\n            best_se = se \n            best_sigma = sigma\n        if verbose:\n            print 'sigma=%g, log_likelihood=%.2f (%.2f)' % (sigma, log_likelihood, se)\n\n    if verbose:\n        print '===================='\n        print 'Best log_likelihood=%.2f (%.2f)' % (best_log_likelihood, best_se)\n        print ''\n    return best_log_likelihood, best_se, best_sigma\n\ndef kde_eval_tfd(net, test_data_all_folds, n_samples=10000, sigma_range=np.arange(0.05, 0.25, 0.01), verbose=True):\n    s = net.generate_samples(n_samples=n_samples)\n    best_log_likelihood = float('-inf')\n    n_folds = 
len(test_data_all_folds)\n    for sigma in sigma_range:\n        kde = KDE(s, sigma)\n        log_likelihood = [kde.average_log_likelihood(test_data_all_folds[i]) for i in range(n_folds)]\n        avg_log_likelihood = sum(log_likelihood) / float(n_folds)\n        avg_se = np.array(log_likelihood).std() / np.sqrt(n_folds)\n        if avg_log_likelihood > best_log_likelihood:\n            best_log_likelihood = avg_log_likelihood\n            best_se = avg_se\n            best_sigma = sigma\n        if verbose:\n            print 'sigma=%5g, log_likelihood=%8.2f (%.2f)  [%s]' % (sigma, avg_log_likelihood, avg_se, ', '.join(['%8.2f' % l for l in log_likelihood]))\n\n    if verbose:\n        print '===================='\n        print 'Best log_likelihood=%.2f (%.2f)' % (best_log_likelihood, best_se)\n        print ''\n    return best_log_likelihood, best_se, best_sigma\n\n\n"
  },
  {
    "path": "generate_sample_figures.py",
    "content": "\"\"\"\nScript used for generating sample figures used in the paper.\n\nYujia Li, 02/2015\n\"\"\"\n\nimport core.generative as gen\nimport pynn.nn as nn\nimport matplotlib.pyplot as plt\nimport vistools as vt\nimport visualize as vis\nimport dataio.tfd as tfd\nimport dataio.mnist as mnistio\nimport gnumpy as gnp\nimport numpy as np\nimport os\n\nplt.ion()\n\n# fill in the paths to the model files here\nBEST_MNIST_INPUT_SPACE_MODEL = ''\nBEST_MNIST_AUTOENCODER = ''\nBEST_MNIST_CODE_SPACE_MODEL = ''\nBEST_TFD_INPUT_SPACE_MODEL = ''\nBEST_TFD_AUTOENCODER = ''\nBEST_TFD_CODE_SPACE_MODEL = ''\n\ndef get_mnist_input_space_model():\n    net = gen.StochasticGenerativeNet()\n    net.load_model_from_file(BEST_MNIST_INPUT_SPACE_MODEL)\n    return net\n\ndef get_mnist_code_space_model():\n    ae = nn.AutoEncoder()\n    ae.load_model_from_file(BEST_MNIST_AUTOENCODER)\n    net = gen.StochasticGenerativeNetWithAutoencoder()\n    net.load_model_from_file(BEST_MNIST_CODE_SPACE_MODEL)\n    net.autoencoder = ae\n    return net\n\ndef get_tfd_input_space_model():\n    net = gen.StochasticGenerativeNet()\n    net.load_model_from_file(BEST_TFD_INPUT_SPACE_MODEL)\n    return net\n\ndef get_tfd_code_space_model():\n    ae = nn.AutoEncoder()\n    ae.load_model_from_file(BEST_TFD_AUTOENCODER)\n    net = gen.StochasticGenerativeNetWithAutoencoder()\n    net.load_model_from_file(BEST_TFD_CODE_SPACE_MODEL)\n    net.autoencoder = ae\n    return net\n\ndef get_model(dataset='mnist', mode='input_space'):\n    if dataset == 'mnist':\n        if mode == 'input_space':\n            return get_mnist_input_space_model()\n        elif mode == 'code_space':\n            return get_mnist_code_space_model()\n    elif dataset == 'tfd':\n        if mode == 'input_space':\n            return get_tfd_input_space_model()\n        elif mode == 'code_space':\n            return get_tfd_code_space_model()\n\ndef generate_samples(dataset='mnist', mode='input_space'):\n    imsz = [28,28] if 
dataset=='mnist' else [48,48]\n    net = get_model(dataset=dataset, mode=mode)\n    plt.figure()\n    vt.bwpatchview(net.generate_samples(n_samples=30).asarray(), imsz, 5, gridintensity=1)\n    if not os.path.exists('figs'):\n        os.makedirs('figs')\n    plt.savefig('figs/samples_%s_%s.pdf' % (dataset, mode), bbox_inches='tight')\n\ndef generate_all_samples():\n    generate_samples(dataset='mnist', mode='input_space')\n    generate_samples(dataset='mnist', mode='code_space')\n    #generate_samples(dataset='tfd', mode='input_space')\n    #generate_samples(dataset='tfd', mode='code_space')\n\ndef load_train_data(dataset='mnist'):\n    if dataset == 'mnist':\n        train_data, _, _ = mnistio.load_data()\n    elif dataset == 'tfd':\n        train_data, _, _ = tfd.load_proper_fold(0, 'unlabeled', scale=True)\n        train_data = train_data.reshape(train_data.shape[0], np.prod(train_data.shape[1:]))\n\n    return train_data\n\ndef get_nearest_neighbor(dataset='mnist', mode='input_space'):\n    imsz = [28,28] if dataset=='mnist' else [48,48]\n    net = get_model(dataset=dataset, mode=mode)\n    train_data = load_train_data(dataset=dataset)\n\n    if not os.path.exists('figs'):\n        os.makedirs('figs')\n    vis.nn_search(net.generate_samples(n_samples=12), train_data, top_k=1, imsz=imsz,\n            orientation='horizontal', output_file='figs/nn_%s_%s.pdf' % (dataset, mode), pad=0.1)\n\ndef get_all_nearest_neighbors():\n    get_nearest_neighbor(dataset='mnist', mode='input_space')\n    get_nearest_neighbor(dataset='mnist', mode='code_space')\n    #get_nearest_neighbor(dataset='tfd', mode='input_space')\n    #get_nearest_neighbor(dataset='tfd', mode='code_space')\n\ndef get_morphing_figure(dataset='mnist', mode='input_space'):\n    imsz = [28,28] if dataset=='mnist' else [48,48]\n    net = get_model(dataset=dataset, mode=mode)\n    plt.figure()\n    gnp.seed_rand(8)\n    vis.generation_on_a_line(net, n_points=24, imsz=imsz, nrows=10, 
h_seeds=net.sample_hiddens(5))\n\n    if not os.path.exists('figs'):\n        os.makedirs('figs')\n    plt.savefig('figs/morphing_%s_%s.pdf' % (dataset, mode), bbox_inches='tight')\n\ndef get_all_morphing_figures():\n    get_morphing_figure(dataset='mnist', mode='code_space')\n    #get_morphing_figure(dataset='tfd', mode='code_space')\n\nif __name__ == '__main__':\n    generate_all_samples()\n    get_all_nearest_neighbors()\n    get_all_morphing_figures()\n"
  },
  {
    "path": "test.py",
    "content": "\"\"\"\nDebug tests for the datasetbias project.\n\nYujia Li, 09/2014\n\"\"\"\nimport os\nos.environ['GNUMPY_CPU_PRECISION'] = '64'\n\nimport pynn.nn as nn\nimport pynn.layer as ly\nimport pynn.loss as ls\nimport gnumpy as gnp\nimport numpy as np\nimport time\nimport math\n\nimport core.generative as gen\n\n_GRAD_CHECK_EPS = 1e-6\n_FDIFF_EPS = 1e-8\n\n_TEMP_FILE_NAME = '_temp_.pdata'\n\n_GOOD_COLOR_BEGINS = '\\033[42m'\n_BAD_COLOR_BEGINS = '\\033[41m'\n_COLOR_RESET = '\\033[0m'\n\ndef good_colored_str(txt):\n    return _GOOD_COLOR_BEGINS + txt + _COLOR_RESET\n\ndef bad_colored_str(txt):\n    return _BAD_COLOR_BEGINS + txt + _COLOR_RESET\n\ndef vec_str(v):\n    s = '[ '\n    for i in range(len(v)):\n        s += '%11.8f ' % v[i]\n    s += ']'\n    return s\n\ndef test_vec_pair(v1, msg1, v2, msg2, error_thres=_GRAD_CHECK_EPS):\n    print msg1 + ' : ' + vec_str(v1)\n    print msg2 + ' : ' + vec_str(v2)\n    n_space = len(msg2) - len('diff')\n    print ' ' * n_space + 'diff' + ' : ' + vec_str(v1 - v2)\n    err = np.sqrt(((v1 - v2)**2).sum())\n    print 'err : %.8f' % err\n\n    success = err < error_thres\n    print good_colored_str('** SUCCESS **') if success else \\\n            bad_colored_str('** FAIL **')\n\n    return success\n\ndef finite_difference_gradient(f, x):\n    grad = x * 0\n    for i in range(len(x)):\n        x_0 = x[i]\n        x[i] = x_0 + _FDIFF_EPS\n        f_plus = f(x)\n        x[i] = x_0 - _FDIFF_EPS\n        f_minus = f(x)\n        grad[i] = (f_plus - f_minus) / (2 * _FDIFF_EPS)\n        x[i] = x_0\n\n    return grad\n\ndef fdiff_grad_generator(net, x, t, add_noise=False, seed=None):\n    if t is not None:\n        net.load_target(t)\n\n    def f(w):\n        if add_noise and seed is not None:\n            gnp.seed_rand(seed)\n        w_0 = net.get_param_vec()\n        net.set_param_from_vec(w)\n        net.forward_prop(x, add_noise=add_noise, compute_loss=True)\n        loss = net.get_loss()\n        
net.set_param_from_vec(w_0)\n\n        return loss\n\n    return f\n\ndef test_net_io(f_create, f_create_void):\n    net1 = f_create()\n    print 'Testing %s I/O' % net1.__class__.__name__\n\n    net1.save_model_to_file(_TEMP_FILE_NAME)\n\n    net2 = f_create_void()\n    net2.load_model_from_file(_TEMP_FILE_NAME)\n\n    os.remove(_TEMP_FILE_NAME)\n\n    print 'Net #1: \\n' + str(net1)\n    print 'Net #2: \\n' + str(net2)\n    test_passed = (str(net1) == str(net2))\n\n    test_passed = test_passed and test_vec_pair(net1.get_param_vec(), 'Net #1',\n            net2.get_param_vec(), 'Net #2')\n    return test_passed\n\ndef test_databias_loss(loss_type, **kwargs):\n    print 'Testing Loss <' + loss_type + '> ' \\\n            + ', '.join([str(k) + '=' + str(v) for k, v in kwargs.iteritems()])\n\n    n_cases = 5\n    n_datasets = 3\n    in_dim = 2\n    \n    x = gnp.randn(n_cases, in_dim)\n    s = np.arange(n_cases) % n_datasets\n\n    loss = ls.get_loss_from_type_name(loss_type)\n    loss.load_target(s, K=n_datasets, **kwargs)\n\n    def f(w):\n        return loss.compute_loss_and_grad(w.reshape(x.shape), compute_grad=True)[0]\n\n    backprop_grad = loss.compute_loss_and_grad(x, compute_grad=True)[1].asarray().ravel()\n    fdiff_grad = finite_difference_gradient(f, x.asarray().ravel())\n\n    test_passed = test_vec_pair(fdiff_grad, 'Finite Difference Gradient',\n            backprop_grad, '  Backpropagation Gradient')\n    print ''\n    return test_passed\n\ndef create_databias_net(dropout_rate):\n    net = nn.NeuralNet(3, 2)\n    net.add_layer(2, nonlin_type=ly.NONLIN_NAME_TANH, dropout=dropout_rate)\n    net.add_layer(0, nonlin_type=ly.NONLIN_NAME_LINEAR, dropout=0)\n    return net\n\ndef test_databias_loss_with_net(add_noise, loss_type, **kwargs):\n    print 'Testing Loss <' + loss_type + '> with network, '\\\n            + ('with noise' if add_noise else 'without noise') + ', ' \\\n            + ', '.join([str(k) + '=' + str(v) for k, v in kwargs.iteritems()])\n    
n_cases = 5\n    n_datasets = 3\n    seed = 8\n    dropout_rate = 0.5 if add_noise else 0\n\n    net = create_databias_net(dropout_rate)\n    net.set_loss(loss_type)\n    print net\n    x = gnp.randn(n_cases, net.in_dim)\n    s = np.arange(n_cases) % n_datasets\n\n    net.load_target(s, K=n_datasets, **kwargs)\n\n    if add_noise:\n        gnp.seed_rand(seed)\n    net.clear_gradient()\n    net.forward_prop(x, add_noise=add_noise, compute_loss=True)\n    net.backward_prop()\n\n    backprop_grad = net.get_grad_vec()\n\n    f = fdiff_grad_generator(net, x, None, add_noise=add_noise, seed=seed)\n    fdiff_grad = finite_difference_gradient(f, net.get_param_vec())\n\n    test_passed = test_vec_pair(fdiff_grad, 'Finite Difference Gradient',\n            backprop_grad, '  Backpropagation Gradient')\n    print ''\n\n    gnp.seed_rand(int(time.time()))\n    return test_passed\n\ndef test_generative_mmd_loss(sigma=1):\n    print 'Testing generative MMD loss, sigma=%g' % sigma\n    n_dims = 3\n    n_target = 5\n    n_pred = 4\n\n    target = gnp.randn(n_target, n_dims)\n    pred = gnp.randn(n_pred, n_dims)\n\n    mmd = ls.get_loss_from_type_name(ls.LOSS_NAME_MMDGEN, sigma=sigma)\n    mmd.load_target(target)\n\n    def f(w):\n        return mmd.compute_loss_and_grad(w.reshape(pred.shape), compute_grad=False)[0]\n\n    backprop_grad = mmd.compute_loss_and_grad(pred, compute_grad=True)[1].asarray().ravel()\n    fdiff_grad = finite_difference_gradient(f, pred.asarray().ravel())\n\n    test_passed = test_vec_pair(fdiff_grad, 'Finite Difference Gradient',\n            backprop_grad, '  Backpropagation Gradient')\n    print ''\n    return test_passed\n\ndef test_generative_multi_scale_mmd_loss(sigma=[1, 10], scale_weight=None):\n    print 'Testing generative multi-scale MMD loss, sigma=%s' % str(sigma)\n    n_dims = 3\n    n_target = 5\n    n_pred = 4\n\n    target = gnp.randn(n_target, n_dims)\n    pred = gnp.randn(n_pred, n_dims)\n\n    mmd = 
ls.get_loss_from_type_name(ls.LOSS_NAME_MMDGEN_MULTISCALE, sigma=sigma, scale_weight=scale_weight)\n    mmd.load_target(target)\n\n    def f(w):\n        return mmd.compute_loss_and_grad(w.reshape(pred.shape), compute_grad=False)[0]\n\n    backprop_grad = mmd.compute_loss_and_grad(pred, compute_grad=True)[1].asarray().ravel()\n    fdiff_grad = finite_difference_gradient(f, pred.asarray().ravel())\n\n    test_passed = test_vec_pair(fdiff_grad, 'Finite Difference Gradient',\n            backprop_grad, '  Backpropagation Gradient')\n    print ''\n    return test_passed\n\ndef test_linear_time_mmd_loss(sigma=1.0, use_modified_loss=False, use_absolute_value=False):\n    print 'Testing linear time MMD loss, sigma=%s' % str(sigma)\n    n_dims = 3\n    n_target = 4\n    n_pred = 4\n\n    target = gnp.randn(n_target, n_dims)\n    pred = gnp.randn(n_pred, n_dims)\n\n    mmd = ls.get_loss_from_type_name(ls.LOSS_NAME_LINEAR_TIME_MMDGEN, sigma=sigma,\n            use_modified_loss=use_modified_loss, use_absolute_value=use_absolute_value)\n    mmd.load_target(target)\n\n    def f(w):\n        return mmd.compute_loss_and_grad(w.reshape(pred.shape), compute_grad=False)[0]\n\n    backprop_grad = mmd.compute_loss_and_grad(pred, compute_grad=True)[1].asarray().ravel()\n    fdiff_grad = finite_difference_gradient(f, pred.asarray().ravel())\n\n    test_passed = test_vec_pair(fdiff_grad, 'Finite Difference Gradient',\n            backprop_grad, '  Backpropagation Gradient')\n    print ''\n    return test_passed\n\ndef test_linear_time_minibatch_mmd_loss(sigma=1.0, minibatch_size=100):\n    print 'Testing linear time minibatch MMD loss'\n    n_dims = 3\n    n_target = 10\n    n_pred = 10\n\n    target = gnp.randn(n_target, n_dims)\n    pred = gnp.randn(n_pred, n_dims)\n\n    mmd = ls.get_loss_from_type_name(ls.LOSS_NAME_LINEAR_TIME_MINIBATCH_MMDGEN,\n            sigma=sigma, minibatch_size=minibatch_size)\n    mmd.load_target(target)\n    print mmd\n\n    def f(w):\n        return 
mmd.compute_loss_and_grad(w.reshape(pred.shape), compute_grad=False)[0]\n\n    backprop_grad = mmd.compute_loss_and_grad(pred, compute_grad=True)[1].asarray().ravel()\n    fdiff_grad = finite_difference_gradient(f, pred.asarray().ravel())\n\n    test_passed = test_vec_pair(fdiff_grad, 'Finite Difference Gradient',\n            backprop_grad, '  Backpropagation Gradient')\n    print ''\n    return test_passed\n\ndef test_random_feature_mmd_loss(sigma=[1,10], scale_weight=[0.5, 1], n_features=3):\n    print 'Testing random feature MMD loss'\n    n_dims = 2\n    n_target = 5\n    n_pred = 5 \n\n    target = gnp.randn(n_target, n_dims)\n    pred = gnp.randn(n_pred, n_dims)\n\n    mmd = ls.get_loss_from_type_name(ls.LOSS_NAME_RANDOM_FEATURE_MMDGEN,\n            sigma=sigma, scale_weight=scale_weight, n_features=n_features)\n    mmd.load_target(target)\n    print mmd\n\n    def f(w):\n        return mmd.compute_loss_and_grad(w.reshape(pred.shape), compute_grad=False)[0]\n\n    backprop_grad = mmd.compute_loss_and_grad(pred, compute_grad=True)[1].asarray().ravel()\n    fdiff_grad = finite_difference_gradient(f, pred.asarray().ravel())\n\n    test_passed = test_vec_pair(fdiff_grad, 'Finite Difference Gradient',\n            backprop_grad, '  Backpropagation Gradient')\n    print ''\n    return test_passed\n\ndef test_random_feature_mmd_loss_approximation(sigma=[1,10], scale_weight=[0.5,1],\n        n_features=3):\n    print 'Testing random feature MMD loss approximation error'\n\n    n_dims = 2\n    n_target = 5\n    n_pred = 5 \n\n    target = gnp.rand(n_target, n_dims)\n    pred = gnp.rand(n_pred, n_dims)\n\n    rand_mmd = ls.get_loss_from_type_name(ls.LOSS_NAME_RANDOM_FEATURE_MMDGEN,\n            sigma=sigma, scale_weight=scale_weight, n_features=n_features)\n    rand_mmd.load_target(target)\n    print rand_mmd\n\n    mmd = ls.get_loss_from_type_name(ls.LOSS_NAME_MMDGEN_MULTISCALE_PAIR,\n            sigma=sigma, scale_weight=scale_weight)\n    
mmd.load_target(target)\n\n    rand_loss, rand_grad = rand_mmd.compute_loss_and_grad(pred, compute_grad=True)\n    true_loss, true_grad = mmd.compute_loss_and_grad(pred, compute_grad=True)\n\n    test_passed = test_vec_pair(rand_grad.asarray().ravel(), 'Approximate Gradient',\n            true_grad.asarray().ravel(), '       True Gradient', error_thres=1e-2)\n    test_passed = test_vec_pair(np.array([rand_loss]), 'Approximate Loss',\n            np.array([true_loss]), '       True Loss', error_thres=1e-2) \\\n            and test_passed\n    print ''\n    return test_passed\n\ndef test_pair_mmd_loss_multiscale(sigma=[1, 10], scale_weight=None):\n    print 'Testing generative pair multi-scale MMD loss'\n    n_dims = 3\n    n_target = 5\n    n_pred = 4\n\n    target = gnp.randn(n_target, n_dims)\n    pred = gnp.randn(n_pred, n_dims)\n\n    mmd = ls.get_loss_from_type_name(ls.LOSS_NAME_MMDGEN_MULTISCALE_PAIR, sigma=sigma, scale_weight=scale_weight)\n    mmd.load_target(target)\n    print mmd\n\n    def f(w):\n        return mmd.compute_loss_and_grad(w.reshape(pred.shape), compute_grad=False)[0]\n\n    backprop_grad = mmd.compute_loss_and_grad(pred, compute_grad=True)[1].asarray().ravel()\n    fdiff_grad = finite_difference_gradient(f, pred.asarray().ravel())\n\n    test_passed = test_vec_pair(fdiff_grad, 'Finite Difference Gradient',\n            backprop_grad, '  Backpropagation Gradient')\n    print ''\n    return test_passed\n\ndef test_diff_kernel_mmd_loss(sigma=[1], scale_weight=[1], loss_name=None):\n    assert loss_name is not None\n\n    print 'Testing differentiable kernel MMD loss <%s>' % loss_name\n\n    n_dims = 3\n    n_target = 5\n    n_pred = 4\n\n    target = gnp.randn(n_target, n_dims)\n    pred = gnp.randn(n_pred, n_dims)\n\n    mmd = ls.get_loss_from_type_name(loss_name, sigma=sigma, scale_weight=scale_weight)\n    mmd.load_target(target)\n    print mmd\n\n    def f(w):\n        return mmd.compute_loss_and_grad(w.reshape(pred.shape), 
compute_grad=False)[0]\n\n    backprop_grad = mmd.compute_loss_and_grad(pred, compute_grad=True)[1].asarray().ravel()\n    fdiff_grad = finite_difference_gradient(f, pred.asarray().ravel())\n\n    test_passed = test_vec_pair(fdiff_grad, 'Finite Difference Gradient',\n            backprop_grad, '  Backpropagation Gradient')\n    print ''\n    return test_passed\n\ndef test_diff_kernel_per_example_mmd_loss(sigma=[1], scale_weight=[1], pred_per_example=1, target_per_example=[1], loss_name=None):\n    assert loss_name is not None\n\n    print 'Testing differentiable kernel per example MMD loss <%s>' % loss_name\n\n    if len(target_per_example) == 1:\n        target_per_example = target_per_example * 3\n\n    n_dims = 3\n    n_target = sum(target_per_example)\n    n_pred = len(target_per_example) * pred_per_example\n\n    pred = gnp.randn(n_pred, n_dims)\n    target = []\n    for i_target in target_per_example:\n        target.append(gnp.randn(i_target, n_dims))\n\n    mmd = ls.get_loss_from_type_name(loss_name, sigma=sigma, scale_weight=scale_weight, pred_per_example=pred_per_example)\n    mmd.load_target(target)\n    print mmd\n\n    def f(w):\n        return mmd.compute_loss_and_grad(w.reshape(pred.shape), compute_grad=False)[0]\n\n    backprop_grad = mmd.compute_loss_and_grad(pred, compute_grad=True)[1].asarray().ravel()\n    fdiff_grad = finite_difference_gradient(f, pred.asarray().ravel())\n\n    test_passed = test_vec_pair(fdiff_grad, 'Finite Difference Gradient',\n            backprop_grad, '  Backpropagation Gradient')\n    print ''\n    return test_passed\n\ndef test_all_diff_kernel_per_example_mmd_loss():\n    print ''\n    print '==============================================================='\n    print 'Testing differentiable kernel per example MMD loss (new design)'\n    print '==============================================================='\n    print ''\n\n    sigma_list = [1, 10]\n    scale_weight_list = [1.0, 3.0]\n    target_per_example_list = [[1], 
[2], [1,2,3]]\n    pred_per_example_list = [1,2,3]\n    loss_list = [ls.LOSS_NAME_CPU_PER_EXAMPLE_MMDGEN_SQRT_GAUSSIAN]\n\n    n_success = 0\n    n_tests = 0\n    for loss_name in loss_list:\n        for sigma, scale_weight, target_per_example, pred_per_example in zip(sigma_list, scale_weight_list,\n                target_per_example_list[:len(sigma_list)], pred_per_example_list[:len(sigma_list)]):\n            if test_diff_kernel_per_example_mmd_loss([sigma], [scale_weight], pred_per_example, target_per_example, loss_name):\n                n_success += 1\n            n_tests += 1\n\n        if test_diff_kernel_per_example_mmd_loss(sigma_list, scale_weight_list, pred_per_example_list[-1], target_per_example_list[-1], loss_name):\n            n_success += 1\n\n        n_tests += 1\n\n    print '=============='\n    print 'Test finished: %d/%d success, %d failed' % (n_success, n_tests, n_tests - n_success)\n    print ''\n\n    return n_success, n_tests\n\n\n\ndef test_all_diff_kernel_mmd_loss():\n    print ''\n    print '==================================================='\n    print 'Testing differentiable kernel MMD loss (new design)'\n    print '==================================================='\n    print ''\n\n    sigma_list = [1, 2.5, 10]\n    scale_weight_list = [1.0, 2, 3.0]\n    loss_list = [ls.LOSS_NAME_MMDGEN_GAUSSIAN, ls.LOSS_NAME_MMDGEN_LAPLACIAN,\n            ls.LOSS_NAME_MMDGEN_LAPLACIAN_L1, ls.LOSS_NAME_MMDGEN_SQRT_GAUSSIAN,\n            ls.LOSS_NAME_CPU_MMDGEN_GAUSSIAN, ls.LOSS_NAME_CPU_MMDGEN_SQRT_GAUSSIAN]\n\n    n_success = 0\n    n_tests = 0\n    for loss_name in loss_list:\n        for sigma, scale_weight in zip(sigma_list, scale_weight_list):\n            if test_diff_kernel_mmd_loss([sigma], [scale_weight], loss_name):\n                n_success += 1\n            n_tests += 1\n\n        if test_diff_kernel_mmd_loss(sigma_list, scale_weight_list, loss_name):\n            n_success += 1\n        n_tests += 1\n\n    print '=============='\n    
print 'Test finished: %d/%d success, %d failed' % (n_success, n_tests, n_tests - n_success)\n    print ''\n\n    return n_success, n_tests\n\ndef test_all_generative_mmd_loss():\n    print ''\n    print '========================'\n    print 'Testing data bias losses'\n    print '========================'\n    print ''\n\n    n_success = 0\n    if test_generative_mmd_loss(sigma=1):\n        n_success += 1\n    if test_generative_mmd_loss(sigma=1e-1):\n        n_success += 1\n    if test_generative_multi_scale_mmd_loss(sigma=[1], scale_weight=[1.0]):\n        n_success += 1\n    if test_generative_multi_scale_mmd_loss(sigma=[10], scale_weight=[2.0]):\n        n_success += 1\n    if test_generative_multi_scale_mmd_loss(sigma=[100], scale_weight=[2.0]):\n        n_success += 1\n    if test_generative_multi_scale_mmd_loss(sigma=[1, 10, 100], scale_weight=[1.0, 2.0, 3.0]):\n        n_success += 1\n    if test_linear_time_mmd_loss(sigma=1):\n        n_success += 1\n    if test_linear_time_mmd_loss(sigma=0.1):\n        n_success += 1\n    if test_linear_time_mmd_loss(sigma=1, use_modified_loss=True):\n        n_success += 1\n    if test_linear_time_mmd_loss(sigma=0.1, use_modified_loss=True):\n        n_success += 1\n    if test_linear_time_mmd_loss(sigma=1, use_modified_loss=True, use_absolute_value=True):\n        n_success += 1\n    if test_linear_time_mmd_loss(sigma=0.1, use_modified_loss=True, use_absolute_value=True):\n        n_success += 1\n    if test_linear_time_minibatch_mmd_loss(sigma=1.0, minibatch_size=2):\n        n_success += 1\n    if test_linear_time_minibatch_mmd_loss(sigma=0.1, minibatch_size=3):\n        n_success += 1\n    if test_pair_mmd_loss_multiscale(sigma=[1], scale_weight=[1.0]):\n        n_success += 1\n    if test_pair_mmd_loss_multiscale(sigma=[10], scale_weight=[2.0]):\n        n_success += 1\n    if test_pair_mmd_loss_multiscale(sigma=[100], scale_weight=[2.0]):\n        n_success += 1\n    if test_pair_mmd_loss_multiscale(sigma=[1, 10, 
100], scale_weight=[1.0, 2.0, 3.0]):\n        n_success += 1\n    if test_random_feature_mmd_loss(sigma=[1], scale_weight=[1.0], n_features=3):\n        n_success += 1\n    if test_random_feature_mmd_loss(sigma=[1], scale_weight=[1.0], n_features=10):\n        n_success += 1\n    if test_random_feature_mmd_loss(sigma=[1, 10, 100], scale_weight=[1.0, 2.0, 3.0], n_features=3):\n        n_success += 1\n    if test_random_feature_mmd_loss(sigma=[1, 10, 100], scale_weight=[1.0, 2.0, 3.0], n_features=10):\n        n_success += 1\n    if test_random_feature_mmd_loss_approximation(sigma=[5], scale_weight=[1.0], n_features=1024):\n        n_success += 1\n    if test_random_feature_mmd_loss_approximation(sigma=[5, 10, 80], scale_weight=[1.0, 2.0, 3.0], n_features=1024):\n        n_success += 1\n\n    n_tests = 24 \n\n    print '=============='\n    print 'Test finished: %d/%d success, %d failed' % (n_success, n_tests, n_tests - n_success)\n    print ''\n\n    return n_success, n_tests\n\ndef run_all_tests():\n    gnp.seed_rand(int(time.time()))\n\n    n_success = 0\n    n_tests = 0\n\n    test_list = [test_all_generative_mmd_loss,\n            test_all_diff_kernel_mmd_loss, \n            test_all_diff_kernel_per_example_mmd_loss]\n    for batch_test in test_list:\n        success_in_batch, tests_in_batch = batch_test()\n        n_success += success_in_batch\n        n_tests += tests_in_batch\n\n    print ''\n    print '==================='\n    print 'All tests finished: %d/%d success, %d failed' % (n_success, n_tests, n_tests - n_success)\n    print ''\n\nif __name__ == '__main__':\n    run_all_tests()\n\n"
  },
  {
    "path": "train.py",
    "content": "\"\"\"\nTraining script for MNIST/TFD.\n\nYujia Li, 01/2015\n\"\"\"\n\nimport argparse\n\nimport cPickle as pickle\nimport pynn.nn as nn\nimport pynn.layer as ly\nimport pynn.loss as ls\nimport pynn.learner as learner\nimport core.generative as gen\nimport gnumpy as gnp\nimport numpy as np\nimport time\nimport dataio.mnist as mnistio\nimport dataio.tfd as tfd\n\nimport eval_mmd_generative_model as ev\n\n# You may want to change this\nOUTPUT_BASE_DIR = 'output'\n\ndef write_config(file_name, config):\n    \"\"\"\n    file_name: output config file name\n    config: dict containing all the configs.\n    \"\"\"\n    with open(file_name, 'w') as f:\n        for k, v in sorted(config.items(), key=lambda t: t[0]):\n            f.write(str(k) + '=' + str(v) + '\\n')\n\ndef cat_list(lst):\n    return '_'.join([str(v) for v in lst])\n\n\ndef load_tfd_fold(fold=0):\n    \"\"\"\n    Return train, val, test data for the particular fold.\n    \"\"\"\n    # note that the training set used here is the 'unlabeled' set in TFD\n    x_train, _, _ = tfd.load_proper_fold(fold, 'unlabeled', scale=True)\n    x_val,   _, _ = tfd.load_proper_fold(fold, 'val', scale=True)\n    x_test,  _, _ = tfd.load_proper_fold(fold, 'test', scale=True)\n\n    imsz = np.prod(x_train.shape[1:])\n\n    return x_train.reshape(x_train.shape[0], imsz), \\\n            x_val.reshape(x_val.shape[0], imsz), \\\n            x_test.reshape(x_test.shape[0], imsz)\n\ndef load_tfd_all_folds(set_name='val', n_folds=5):\n    x = []\n    for i_fold in range(n_folds):\n        #xx, _, _ = tfd.load_fold(i_fold, set_name, scale=True)\n        xx, _, _ = tfd.load_proper_fold(i_fold, set_name, scale=True)\n        x.append(xx.reshape(xx.shape[0], np.prod(xx.shape[1:])))\n    return x\n\ndef mnist_mmd_input_space(n_hids=[10,64,256,256,1024], sigma=[2,5,10,20,40,80], learn_rate=2, momentum=0.9):\n    \"\"\"\n    n_hids: number of hidden units on all layers (top-down) in the generative network.\n    sigma: a list 
of scales used for the kernel\n    learn_rate, momentum: parameters for the learning process\n\n    return: KDE log_likelihood on validation set.\n    \"\"\"\n    gnp.seed_rand(8)\n\n    x_train, x_val, x_test = mnistio.load_data()\n\n    print ''\n    print 'Training data: %d x %d' % x_train.shape\n\n    in_dim = n_hids[0]\n    out_dim = x_train.shape[1]\n\n    net = gen.StochasticGenerativeNet(in_dim, out_dim)\n    for i in range(1, len(n_hids)):\n        net.add_layer(n_hids[i], nonlin_type=ly.NONLIN_NAME_RELU, dropout=0)\n    net.add_layer(0, nonlin_type=ly.NONLIN_NAME_SIGMOID, dropout=0)\n\n    # place holder loss\n    net.set_loss(ls.LOSS_NAME_MMDGEN, loss_after_nonlin=True, sigma=80, loss_weight=1000)\n\n    print ''\n    print '========'\n    print 'Training'\n    print '========'\n    print ''\n    print net\n    print ''\n\n    mmd_learner = gen.StochasticGenerativeNetLearner(net)\n    mmd_learner.load_data(x_train)\n\n    output_base = OUTPUT_BASE_DIR + '/mnist/input_space'\n\n    #sigma = [2,5,10,20,40,80]\n    sigma_weights = [1,1,1,1,1,1]\n    #learn_rate = 1\n    #momentum = 0.9\n\n    minibatch_size = 1000\n    n_sample_update_iters = 1\n    max_iters = 40000\n    i_checkpoint = 2000\n\n    output_dir = output_base + '/nhids_%s_sigma_%s_lr_%s_m_%s' % (\n            '_'.join([str(nh) for nh in n_hids]), '_'.join([str(s) for s in sigma]), str(learn_rate), str(momentum))\n\n    print ''\n    print '>>>> output_dir = %s' % output_dir\n    print ''\n\n    mmd_learner.set_output_dir(output_dir)\n    #net.set_loss(ls.LOSS_NAME_MMDGEN_MULTISCALE, loss_after_nonlin=True, sigma=sigma, scale_weight=sigma_weights, loss_weight=1000)\n    net.set_loss(ls.LOSS_NAME_MMDGEN_SQRT_GAUSSIAN, loss_after_nonlin=True, sigma=sigma, scale_weight=sigma_weights, loss_weight=1000)\n\n    print '**********************************'\n    print net.loss\n    print '**********************************'\n    print ''\n\n    def f_checkpoint(i_iter, w):\n        
mmd_learner.save_checkpoint('%d' % i_iter)\n\n    mmd_learner.train_sgd(minibatch_size=minibatch_size, n_samples_per_update=minibatch_size, \n            n_sample_update_iters=n_sample_update_iters, learn_rate=learn_rate, momentum=momentum, \n            weight_decay=0, learn_rate_schedule={10000:learn_rate/10.0},\n            momentum_schedule={10000:1-(1-momentum)/10.0},\n            learn_rate_drop_iters=0, decrease_type='linear', adagrad_start_iter=0,\n            max_iters=max_iters, iprint=100, i_exe=i_checkpoint, f_exe=f_checkpoint)\n\n    mmd_learner.save_model()\n\n    print ''\n    print '===================='\n    print 'Evaluating the model'\n    print '===================='\n    print ''\n\n    log_prob, std, sigma = ev.kde_eval_mnist(net, x_val, verbose=False)\n    test_log_prob, test_std, _ = ev.kde_eval_mnist(net, x_test, sigma_range=[sigma], verbose=False)\n\n    print 'Validation: %.2f (%.2f)' % (log_prob, std)\n    print 'Test      : %.2f (%.2f)' % (test_log_prob, test_std)\n    print ''\n\n    write_config(output_dir + '/params_and_results.cfg', { 'n_hids': n_hids,\n        'sigma': sigma, 'sigma_weights': sigma_weights, 'learn_rate': learn_rate,\n        'momentum': momentum, 'minibatch_size': minibatch_size, \n        'n_sample_update_iters': n_sample_update_iters, 'max_iters': max_iters,\n        'i_checkpoint': i_checkpoint, 'val_log_prob': log_prob, 'val_std': std, \n        'test_log_prob': test_log_prob, 'test_std': test_std })\n\n    print '>>>> output_dir = %s' % output_dir\n    print ''\n\n    return log_prob\n\n\ndef mnist_mmd_code_space(\n        ae_n_hids=[1024, 32], \n        ae_dropout=[0.2, 0.5],\n        ae_learn_rate=1e-1, \n        ae_momentum=0.9,\n        mmd_n_hids=[10, 64, 256, 256, 1024], \n        mmd_sigma=1,\n        mmd_learn_rate=2,\n        mmd_momentum=0.9):\n    \"\"\"\n    ae_n_hids: #hid for the encoder, bottom-up\n    ae_dropout: the amount of dropout for each layer in the encoder, same order\n    
ae_learn_rate, ae_momentum: .\n    mmd_n_hids: #hid for the generative net, top-down\n    mmd_sigma: scale of the kernel\n    mmd_learn_rate, mmd_momentum: .\n\n    Return KDE log_likelihood on the validation set.\n    \"\"\"\n    gnp.seed_rand(8)\n    x_train, x_val, x_test = mnistio.load_data()\n\n    common_output_base = OUTPUT_BASE_DIR + '/mnist/code_space'\n    output_base = common_output_base + '/aeh_%s_dr_%s_aelr_%s_aem_%s_nh_%s_s_%s_lr_%s_m_%s' % (\n            cat_list(ae_n_hids), cat_list(ae_dropout), str(ae_learn_rate), str(ae_momentum),\n            cat_list(mmd_n_hids), str(mmd_sigma), str(mmd_learn_rate), str(mmd_momentum))\n\n    #######################\n    # Auto-encoder training\n    #######################\n\n    n_dims = x_train.shape[1]\n    h_dim = ae_n_hids[-1]\n\n    encoder = nn.NeuralNet(n_dims, h_dim)\n    for i in range(len(ae_n_hids) - 1):\n        encoder.add_layer(ae_n_hids[i], nonlin_type=ly.NONLIN_NAME_SIGMOID, dropout=ae_dropout[i])\n    encoder.add_layer(0, nonlin_type=ly.NONLIN_NAME_SIGMOID, dropout=ae_dropout[-1])\n\n    decoder = nn.NeuralNet(h_dim, n_dims)\n    for i in range(len(ae_n_hids) - 1)[::-1]:\n        decoder.add_layer(ae_n_hids[i], nonlin_type=ly.NONLIN_NAME_SIGMOID, dropout=0)\n    decoder.add_layer(0, nonlin_type=ly.NONLIN_NAME_SIGMOID, dropout=0)\n    decoder.set_loss(ls.LOSS_NAME_BINARY_CROSSENTROPY, loss_weight=1)\n\n    autoenc = nn.AutoEncoder(encoder=encoder, decoder=decoder)\n\n    print ''\n    print autoenc\n    print ''\n\n    learn_rate = ae_learn_rate\n    final_momentum = ae_momentum\n    max_iters = 3000\n\n    nn_pretrainer = learner.AutoEncoderPretrainer(autoenc)\n    nn_pretrainer.load_data(x_train)\n    nn_pretrainer.pretrain_network(learn_rate=1e-1, momentum=0.5, weight_decay=0, minibatch_size=100,\n            max_grad_norm=10, max_iters=max_iters, iprint=100)\n\n    nn_learner = learner.Learner(autoenc)\n    nn_learner.set_output_dir(output_base + '/ae')\n    nn_learner.load_data(x_train, 
x_train)\n\n    def f_checkpoint(i_iter, w):\n        nn_learner.save_checkpoint('%d' % i_iter)\n\n    nn_learner.train_sgd(learn_rate=learn_rate, momentum=0, weight_decay=0, minibatch_size=100,\n            learn_rate_schedule=None, momentum_schedule={50:0.5, 200:final_momentum}, \n            max_grad_norm=10, learn_rate_drop_iters=0, decrease_type='linear', adagrad_start_iter=0,\n            max_iters=max_iters, iprint=100, i_exe=2000, f_exe=f_checkpoint)\n    nn_learner.save_checkpoint('best')\n\n    ##################\n    # Training MMD net\n    ##################\n\n    n_hids = mmd_n_hids\n\n    in_dim = n_hids[0]\n    out_dim = autoenc.encoder.out_dim\n\n    net = gen.StochasticGenerativeNetWithAutoencoder(in_dim, out_dim, autoenc)\n    for i in range(1, len(n_hids)):\n        net.add_layer(n_hids[i], nonlin_type=ly.NONLIN_NAME_RELU, dropout=0)\n    net.add_layer(0, nonlin_type=ly.NONLIN_NAME_SIGMOID, dropout=0)\n\n    print ''\n    print '========'\n    print 'Training'\n    print '========'\n    print ''\n    print net\n    print ''\n\n    mmd_learner = gen.StochasticGenerativeNetLearner(net)\n    mmd_learner.load_data(x_train)\n\n    sigma = [mmd_sigma]\n    sigma_weights = [1]\n    learn_rate = mmd_learn_rate\n    momentum = mmd_momentum\n\n    minibatch_size = 1000\n    n_sample_update_iters = 1\n    max_iters = 40000\n    i_checkpoint = 2000\n\n    mmd_learner.set_output_dir(output_base + '/mmd')\n    #net.set_loss(ls.LOSS_NAME_MMDGEN_MULTISCALE, loss_after_nonlin=True, sigma=sigma, scale_weight=sigma_weights, loss_weight=1000)\n    net.set_loss(ls.LOSS_NAME_MMDGEN_SQRT_GAUSSIAN, loss_after_nonlin=True, sigma=sigma, scale_weight=sigma_weights, loss_weight=1000)\n\n    print '**********************************'\n    print net.loss\n    print '**********************************'\n    print ''\n\n    def f_checkpoint(i_iter, w):\n        mmd_learner.save_checkpoint('%d' % i_iter)\n\n    mmd_learner.train_sgd(minibatch_size=minibatch_size, 
n_samples_per_update=minibatch_size, \n            n_sample_update_iters=n_sample_update_iters, learn_rate=learn_rate, momentum=momentum, \n            weight_decay=0, learn_rate_schedule={10000:learn_rate/10.0},\n            momentum_schedule={10000:1-(1-momentum)/10.0},\n            learn_rate_drop_iters=0, decrease_type='linear', adagrad_start_iter=0,\n            max_iters=max_iters, iprint=100, i_exe=i_checkpoint, f_exe=f_checkpoint)\n    mmd_learner.save_model()\n\n    # Evaluation\n\n    print ''\n    print '===================='\n    print 'Evaluating the model'\n    print '===================='\n    print ''\n\n    log_prob, std, sigma = ev.kde_eval_mnist(net, x_val, verbose=False)\n    test_log_prob, test_std, _ = ev.kde_eval_mnist(net, x_test, sigma_range=[sigma], verbose=False)\n\n    print 'Validation: %.2f (%.2f)' % (log_prob, std)\n    print 'Test      : %.2f (%.2f)' % (test_log_prob, test_std)\n    print ''\n\n    write_config(output_base + '/params_and_results.cfg', { \n        'ae_n_hids' : ae_n_hids, 'ae_dropout' : ae_dropout, 'ae_learn_rate' : ae_learn_rate,\n        'ae_momentum' : ae_momentum, 'mmd_n_hids': mmd_n_hids,\n        'mmd_sigma': mmd_sigma, 'mmd_sigma_weights': sigma_weights, 'mmd_learn_rate': mmd_learn_rate,\n        'mmd_momentum': mmd_momentum, 'mmd_minibatch_size': minibatch_size, \n        'mmd_n_sample_update_iters': n_sample_update_iters, 'mmd_max_iters': max_iters,\n        'mmd_i_checkpoint': i_checkpoint, 'val_log_prob': log_prob, 'val_std': std, \n        'test_log_prob': test_log_prob, 'test_std': test_std })\n\n    print '>>>> output_dir = %s' % output_base\n    print ''\n\n    return log_prob\n\ndef tfd_mmd_input_space(n_hids=[10,64,256,256,1024], sigma=[5,10,20,40,80,160], learn_rate=2, momentum=0.9):\n    \"\"\"\n    return validation log prob.\n    \"\"\"\n    gnp.seed_rand(8)\n\n    # train on only one fold - that's enough as the training set is the same across folds\n    x_train, x_val, x_test = 
load_tfd_fold(0)\n\n    print ''\n    print 'Training data: %d x %d' % x_train.shape\n\n    in_dim = n_hids[0]\n    out_dim = x_train.shape[1]\n\n    net = gen.StochasticGenerativeNet(in_dim, out_dim)\n    for i in range(1, len(n_hids)):\n        net.add_layer(n_hids[i], nonlin_type=ly.NONLIN_NAME_RELU, dropout=0)\n    net.add_layer(0, nonlin_type=ly.NONLIN_NAME_SIGMOID, dropout=0)\n\n    # place holder loss\n    net.set_loss(ls.LOSS_NAME_MMDGEN, loss_after_nonlin=True, sigma=80, loss_weight=1000)\n\n    print ''\n    print '========'\n    print 'Training'\n    print '========'\n    print ''\n    print net\n    print ''\n\n    mmd_learner = gen.StochasticGenerativeNetLearner(net)\n    mmd_learner.load_data(x_train)\n\n    output_base = OUTPUT_BASE_DIR + '/tfd/input_space'\n\n    #sigma = [2,5,10,20,40,80]\n    sigma_weights = [1,1,1,1,1,1]\n    #learn_rate = 1\n    #momentum = 0.9\n\n    minibatch_size = 1000\n    n_sample_update_iters = 1\n    max_iters = 48000\n    i_checkpoint = 2000\n\n    output_dir = output_base + '/nhids_%s_sigma_%s_lr_%s_m_%s' % (\n            '_'.join([str(nh) for nh in n_hids]), '_'.join([str(s) for s in sigma]), str(learn_rate), str(momentum))\n\n    print ''\n    print '>>>> output_dir = %s' % output_dir\n    print ''\n\n    mmd_learner.set_output_dir(output_dir)\n    #net.set_loss(ls.LOSS_NAME_MMDGEN_MULTISCALE, loss_after_nonlin=True, sigma=sigma, scale_weight=sigma_weights, loss_weight=1000)\n    net.set_loss(ls.LOSS_NAME_MMDGEN_SQRT_GAUSSIAN, loss_after_nonlin=True, sigma=sigma, scale_weight=sigma_weights, loss_weight=1000)\n\n    print '**********************************'\n    print net.loss\n    print '**********************************'\n    print ''\n\n    def f_checkpoint(i_iter, w):\n        mmd_learner.save_checkpoint('%d' % i_iter)\n\n    mmd_learner.train_sgd(minibatch_size=minibatch_size, n_samples_per_update=minibatch_size, \n            n_sample_update_iters=n_sample_update_iters, learn_rate=learn_rate, 
momentum=momentum, \n            weight_decay=0, learn_rate_schedule={10000:learn_rate/10.0},\n            momentum_schedule={10000:1-(1-momentum)/10.0},\n            learn_rate_drop_iters=0, decrease_type='linear', adagrad_start_iter=0,\n            max_iters=max_iters, iprint=100, i_exe=i_checkpoint, f_exe=f_checkpoint)\n\n    mmd_learner.save_model()\n\n    print ''\n    print '===================='\n    print 'Evaluating the model'\n    print '===================='\n    print ''\n\n    x_val = load_tfd_all_folds('val')\n    x_test = load_tfd_all_folds('test')\n\n    log_prob, std, sigma = ev.kde_eval_tfd(net, x_val, verbose=False)\n    test_log_prob, test_std, _ = ev.kde_eval_tfd(net, x_test, sigma_range=[sigma], verbose=False)\n\n    print 'Validation: %.2f (%.2f)' % (log_prob, std)\n    print 'Test      : %.2f (%.2f)' % (test_log_prob, test_std)\n    print ''\n\n    write_config(output_dir + '/params_and_results.cfg', { 'n_hids': n_hids,\n        'sigma': sigma, 'sigma_weights': sigma_weights, 'learn_rate': learn_rate,\n        'momentum': momentum, 'minibatch_size': minibatch_size, \n        'n_sample_update_iters': n_sample_update_iters, 'max_iters': max_iters,\n        'i_checkpoint': i_checkpoint, 'val_log_prob': log_prob, 'val_std': std, \n        'test_log_prob': test_log_prob, 'test_std': test_std })\n\n    print '>>>> output_dir = %s' % output_dir\n    print ''\n\n    return log_prob\n\n\ndef tfd_mmd_code_space(\n        ae_n_hids=[512, 512, 128], \n        ae_dropout=[0.1, 0.1, 0.1],\n        ae_learn_rate=1e-1, \n        ae_momentum=0,\n        mmd_n_hids=[10, 64, 256, 256, 1024], \n        mmd_sigma=[1,2,5,10,20,40],\n        mmd_learn_rate=1e-1,\n        mmd_momentum=0.9):\n    \"\"\"\n    ae_n_hids: #hid for the encoder, bottom-up\n    ae_dropout: the amount of dropout for each layer in the encoder, same order\n    ae_learn_rate, ae_momentum: .\n    mmd_n_hids: #hid for the generative net, top-down\n    mmd_sigma: scale of the kernel\n    
mmd_learn_rate, mmd_momentum: .\n\n    Return KDE log_likelihood on the validation set.\n    \"\"\"\n    gnp.seed_rand(8)\n    x_train, x_val, x_test = load_tfd_fold(0)\n\n    common_output_base = OUTPUT_BASE_DIR + '/tfd/code_space'\n    output_base = common_output_base + '/aeh_%s_dr_%s_aelr_%s_aem_%s_nh_%s_s_%s_lr_%s_m_%s' % (\n            cat_list(ae_n_hids), cat_list(ae_dropout), str(ae_learn_rate), str(ae_momentum),\n            cat_list(mmd_n_hids), cat_list(mmd_sigma), str(mmd_learn_rate), str(mmd_momentum))\n\n    #######################\n    # Auto-encoder training\n    #######################\n\n    n_dims = x_train.shape[1]\n    h_dim = ae_n_hids[-1]\n\n    encoder = nn.NeuralNet(n_dims, h_dim)\n    for i in range(len(ae_n_hids) - 1):\n        encoder.add_layer(ae_n_hids[i], nonlin_type=ly.NONLIN_NAME_SIGMOID, dropout=ae_dropout[i])\n    encoder.add_layer(0, nonlin_type=ly.NONLIN_NAME_SIGMOID, dropout=ae_dropout[-1])\n\n    decoder = nn.NeuralNet(h_dim, n_dims)\n    for i in range(len(ae_n_hids) - 1)[::-1]:\n        decoder.add_layer(ae_n_hids[i], nonlin_type=ly.NONLIN_NAME_SIGMOID, dropout=0)\n    decoder.add_layer(0, nonlin_type=ly.NONLIN_NAME_SIGMOID, dropout=0)\n    decoder.set_loss(ls.LOSS_NAME_BINARY_CROSSENTROPY, loss_weight=1)\n\n    autoenc = nn.AutoEncoder(encoder=encoder, decoder=decoder)\n\n    print ''\n    print autoenc\n    print ''\n\n    learn_rate = ae_learn_rate\n    final_momentum = ae_momentum\n    max_iters = 15000\n    #max_iters = 200\n\n    nn_pretrainer = learner.AutoEncoderPretrainer(autoenc)\n    nn_pretrainer.load_data(x_train)\n    nn_pretrainer.pretrain_network(learn_rate=1e-1, momentum=0.5, weight_decay=0, minibatch_size=100,\n            max_grad_norm=10, max_iters=max_iters, iprint=100)\n\n    nn_learner = learner.Learner(autoenc)\n    nn_learner.set_output_dir(output_base + '/ae')\n    nn_learner.load_data(x_train, x_train)\n\n    def f_checkpoint(i_iter, w):\n        nn_learner.save_checkpoint('%d' % i_iter)\n\n    
nn_learner.train_sgd(learn_rate=learn_rate, momentum=0, weight_decay=0, minibatch_size=100,\n            learn_rate_schedule=None, momentum_schedule={50:0.5, 200:final_momentum}, \n            max_grad_norm=10, learn_rate_drop_iters=0, decrease_type='linear', adagrad_start_iter=0,\n            max_iters=max_iters, iprint=100, i_exe=2000, f_exe=f_checkpoint)\n    nn_learner.save_checkpoint('best')\n\n    ##################\n    # Training MMD net\n    ##################\n\n    n_hids = mmd_n_hids\n\n    in_dim = n_hids[0]\n    out_dim = autoenc.encoder.out_dim\n\n    net = gen.StochasticGenerativeNetWithAutoencoder(in_dim, out_dim, autoenc)\n    for i in range(1, len(n_hids)):\n        net.add_layer(n_hids[i], nonlin_type=ly.NONLIN_NAME_RELU, dropout=0)\n    net.add_layer(0, nonlin_type=ly.NONLIN_NAME_SIGMOID, dropout=0)\n\n    print ''\n    print '========'\n    print 'Training'\n    print '========'\n    print ''\n    print net\n    print ''\n\n    mmd_learner = gen.StochasticGenerativeNetLearner(net)\n    mmd_learner.load_data(x_train)\n\n    sigma = mmd_sigma\n    sigma_weights = [1] * len(sigma)\n    learn_rate = mmd_learn_rate\n    momentum = mmd_momentum\n\n    minibatch_size = 1000\n    n_sample_update_iters = 1\n    max_iters = 48000\n    #max_iters = 200\n    i_checkpoint = 2000\n\n    mmd_learner.set_output_dir(output_base + '/mmd')\n    #net.set_loss(ls.LOSS_NAME_MMDGEN_MULTISCALE, loss_after_nonlin=True, sigma=sigma, scale_weight=sigma_weights, loss_weight=1000)\n    net.set_loss(ls.LOSS_NAME_MMDGEN_SQRT_GAUSSIAN, loss_after_nonlin=True, sigma=sigma, scale_weight=sigma_weights, loss_weight=1000)\n\n    print '**********************************'\n    print net.loss\n    print '**********************************'\n    print ''\n\n    def f_checkpoint(i_iter, w):\n        mmd_learner.save_checkpoint('%d' % i_iter)\n\n    mmd_learner.train_sgd(minibatch_size=minibatch_size, n_samples_per_update=minibatch_size, \n            
n_sample_update_iters=n_sample_update_iters, learn_rate=learn_rate, momentum=momentum, \n            weight_decay=0, learn_rate_schedule={10000:learn_rate/10.0},\n            momentum_schedule={10000:1-(1-momentum)/10.0},\n            learn_rate_drop_iters=0, decrease_type='linear', adagrad_start_iter=0,\n            max_iters=max_iters, iprint=100, i_exe=i_checkpoint, f_exe=f_checkpoint)\n    mmd_learner.save_model()\n\n    # Evaluation\n\n    print ''\n    print '===================='\n    print 'Evaluating the model'\n    print '===================='\n    print ''\n\n    x_val = load_tfd_all_folds('val')\n    x_test = load_tfd_all_folds('test')\n\n    log_prob, std, sigma = ev.kde_eval_tfd(net, x_val, verbose=False)\n    test_log_prob, test_std, _ = ev.kde_eval_tfd(net, x_test, sigma_range=[sigma], verbose=False)\n\n    print 'Validation: %.2f (%.2f)' % (log_prob, std)\n    print 'Test      : %.2f (%.2f)' % (test_log_prob, test_std)\n    print ''\n\n    write_config(output_base + '/params_and_results.cfg', { \n        'ae_n_hids' : ae_n_hids, 'ae_dropout' : ae_dropout, 'ae_learn_rate' : ae_learn_rate,\n        'ae_momentum' : ae_momentum, 'mmd_n_hids': mmd_n_hids,\n        'mmd_sigma': mmd_sigma, 'mmd_sigma_weights': sigma_weights, 'mmd_learn_rate': mmd_learn_rate,\n        'mmd_momentum': mmd_momentum, 'mmd_minibatch_size': minibatch_size, \n        'mmd_n_sample_update_iters': n_sample_update_iters, 'mmd_max_iters': max_iters,\n        'mmd_i_checkpoint': i_checkpoint, 'val_log_prob': log_prob, 'val_std': std, \n        'test_log_prob': test_log_prob, 'test_std': test_std })\n\n    print '>>>> output_dir = %s' % output_base\n    print ''\n\n    return log_prob\n\nif __name__ == '__main__':\n    parser = argparse.ArgumentParser(description='Model parameter tuning')\n    parser.add_argument('-m', '--mode', choices=['mnistinput', 'mnistcode', 'tfdinput', 'tfdcode'])\n    args = parser.parse_args()\n\n    print ''\n    print '************************'\n    print 
'Testing %s' % args.mode\n    print '************************'\n    print ''\n\n    if args.mode == 'mnistinput':\n        mnist_mmd_input_space()\n    elif args.mode == 'mnistcode':\n        mnist_mmd_code_space()\n    elif args.mode == 'tfdinput':\n        tfd_mmd_input_space()\n    elif args.mode == 'tfdcode':\n        tfd_mmd_code_space()\n\n\n"
  },
  {
    "path": "vistools.py",
    "content": "\"\"\"This module contains useful tools that makes data visualization easier.\n\nYujia Li, 03/2013\n\"\"\"\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\ndef bwpatchview(data, imsz, nrows, gridwidth=1, gridintensity=0, rowmajor=True, ax=None):\n    \"\"\"Display a list of images in grid view.\n\n    data: N*D matrix, each row is an image\n    imsz: 2-D tuple, size of the images\n    nrows: number of rows to arrange the images in a plot\n    gridwidth: number of pixels to use for the grid\n    gridintensity: the intensity value for the grid\n    rowmajor: are the images stored in a row-major order or coloumn-major order\n    ax: if provided, the image will be shown on the given axis.\n\n    The images are orgainzed in rows from left to right.\n    \"\"\"\n\n    N, D = data.shape\n    sx, sy = imsz\n\n    ncols = N // nrows\n    if N % nrows:\n        ncols += 1\n\n    img = np.ones((sx * nrows + gridwidth * (nrows + 1), \n        sy * ncols + gridwidth * (ncols + 1))) * gridintensity\n\n    for ix in range(0, nrows):\n        for iy in range(0, ncols):\n            idx = ix * ncols + iy\n            if idx >= N:\n                break\n            xstart = gridwidth + ix * (sx + gridwidth)\n            xend = xstart + sx\n            ystart = gridwidth + iy * (sy + gridwidth)\n            yend = ystart + sy\n\n            if rowmajor:\n                img[xstart:xend, ystart:yend] = data[idx].reshape(imsz)\n            else:\n                img[xstart:xend, ystart:yend] = data[idx].reshape((imsz[1], imsz[0])).T\n\n    if ax != None:\n        ax.imshow(img, cmap='gray', interpolation='nearest')\n        ax.axis('off')\n    else:\n        plt.imshow(img, cmap='gray', interpolation='nearest')\n        plt.axis('off')\n    plt.show()\n\ndef cpatchview(data, imsz, nrows, gridwidth=1, gridintensity=0, rowmajor=True, ax=None, normalize=False):\n    \"\"\"Display a list of color images in grid view.\n\n    data: N*(3*D) matrix, each row is a 
color image\n    imsz: 2-D tuple, size of the images, should have prod(imsz)=D\n    nrows: number of rows to arrange the images in a plot\n    gridwidth: number of pixels to use for the grid\n    gridintensity: the intensity value for the grid\n    rowmajor: specify whether the images are stored in row-major order or \n        column-major order\n    ax: if provided, the image will be shown on the given axis.\n    normalize: if set and data is real valued, data is normalized to within [0,1]\n    \n    The images are organized in rows from left to right.\n    \"\"\"\n    N, D = data.shape\n    D = D / 3\n    sx, sy = imsz\n\n    ncols = N / nrows\n    if N % nrows:\n        ncols += 1\n\n    img = np.ones((sx * nrows + gridwidth * (nrows + 1), \n        sy * ncols + gridwidth * (ncols + 1), 3), dtype=data.dtype) * gridintensity\n\n    for ix in range(0, nrows):\n        for iy in range(0, ncols):\n            idx = ix * ncols + iy\n            if idx >= N:\n                break\n            xstart = gridwidth + ix * (sx + gridwidth)\n            xend = xstart + sx\n            ystart = gridwidth + iy * (sy + gridwidth)\n            yend = ystart + sy\n\n            if rowmajor:\n                img[xstart:xend, ystart:yend, :] = data[idx].reshape((3,sx,sy)).transpose((1,2,0))\n            else:\n                img[xstart:xend, ystart:yend] = data[idx].reshape((3,sy,sx)).transpose((2,1,0))\n\n    if ax != None:\n        if normalize:\n            ax.imshow((img - img.min()) / (img.max() - img.min() + 1e-20), interpolation='nearest')\n        else:\n            ax.imshow(img, interpolation='nearest')\n        ax.axis('off')\n    else:\n        if normalize:\n            plt.imshow((img - img.min()) / (img.max() - img.min() + 1e-20), interpolation='nearest')\n        else:\n            plt.imshow(img, interpolation='nearest')\n        plt.axis('off')\n    plt.show()\n\ndef listpatchview(data, nrows, gridwidth=1, gridintensity=0, ax=None):\n    \"\"\"Display a list of 
images in grid view.\n\n    data: a list of images of the same size, can be either color or gray\n        images, but should be consistent.\n    nrows: number of rows to arrange the images in a plot\n    gridwidth: number of pixels to use for the grid\n    gridintensity: the intensity value for the grid\n    ax: if provided, the image will be shown on the given axis\n    \n    The images are organized in rows from left to right.\n    \"\"\"\n    N = len(data)\n    sx, sy = data[0].shape[:2]\n    D = sx * sy\n\n    ncols = N / nrows\n    if N % nrows:\n        ncols += 1\n\n    if len(data[0].shape) < 3 or data[0].shape[2] == 1:\n        n_color = 1\n        img = np.ones((sx * nrows + gridwidth * (nrows + 1), \n            sy * ncols + gridwidth * (ncols + 1)),dtype=data[0].dtype) * gridintensity\n    else:\n        n_color = 3\n        assert(data[0].shape[2] == n_color)\n        img = np.ones((sx * nrows + gridwidth * (nrows + 1), \n            sy * ncols + gridwidth * (ncols + 1), n_color),dtype=data[0].dtype) * gridintensity\n\n    for ix in range(0, nrows):\n        for iy in range(0, ncols):\n            idx = ix * ncols + iy\n            if idx >= N:\n                break\n            xstart = gridwidth + ix * (sx + gridwidth)\n            xend = xstart + sx\n            ystart = gridwidth + iy * (sy + gridwidth)\n            yend = ystart + sy\n\n            if n_color == 3:\n                img[xstart:xend, ystart:yend, :] = data[idx]\n            else:\n                img[xstart:xend, ystart:yend] = data[idx]\n\n    if ax == None:\n        ax = plt\n    if n_color == 3:\n        ax.imshow(img, interpolation='nearest')\n    else:\n        ax.imshow(img, cmap='gray', interpolation='nearest')\n    ax.axis('off')\n    plt.show()\n\ndef plot2dgaussian(mu, sigma, npoints=100, linespec=None, linewidth=1, ax=None, *args, **kwargs):\n    \"\"\"Plot a 2D Gaussian distribution. 
Showing on the plot are the mean of \n    the Gaussian and an ellipse corresponding to 1 standard deviation (not\n    strictly speaking standard deviation, but similar).\n    \"\"\"\n    eig, Q = np.linalg.eig(sigma)\n    scale = np.sqrt(eig).reshape(1,2)\n\n    x = np.zeros((npoints + 1, 2))\n\n    for n in range(npoints):\n        angle = 2 * np.pi * n / npoints\n        x[n,:] = mu + (scale * np.array([[np.cos(angle), np.sin(angle)]])).dot(Q.T)\n\n    x[npoints,:] = x[0,:]\n\n    if ax == None:\n        ax = plt\n\n    if linespec:\n        ax.plot(x[:,0], x[:,1], linespec, linewidth=linewidth, *args, **kwargs)\n    else:\n        ax.plot(x[:,0], x[:,1], linewidth=linewidth, *args, **kwargs)\n    plt.show()\n\ndef intarray_to_rgb(x, cmap):\n    \"\"\"\n    x: MxN is an array of int indices into the cmap\n    cmap: int->(r,g,b) mapping\n\n    Return converted y of shape MxNx3\n    \"\"\"\n    y = np.zeros((x.shape[0], x.shape[1], 3), dtype=np.uint8)\n\n    if isinstance(cmap, dict):\n        for c in cmap:\n            y[x == c] = cmap[c]\n    elif isinstance(cmap, np.ndarray):\n        for i in range(cmap.shape[0]):\n            y[x == i] = cmap[i]\n\n    return y\n\ndef pil_png_cmap_to_dict(pil_palette):\n    \"\"\"\n    // cmap is a color map from PIL after loading a color png file. Format: list\n    // of (rgb, idx) tuples. rgb is an integer representation of the RGB value.\n\n    pil_palette is a list of palette values. Should be 3xC where C is the \n    number of colors.\n\n    Return a dict of (idx -> (r,g,b)).\n    \"\"\"\n    cm = {}\n    p = np.array(pil_palette, dtype=np.uint8).reshape(len(pil_palette)/3, 3)\n    for i in range(p.shape[0]):\n        cm[i] = p[i]\n    return cm\n\n"
  },
  {
    "path": "visualize.py",
    "content": "import matplotlib.pyplot as plt\nimport numpy as np\nimport gnumpy as gnp\nimport vistools as vt\nimport core.generative as gen\nimport os\nimport time\nimport core.util as util\nimport scipy.misc as misc\n\nfrom mpl_toolkits.axes_grid1 import AxesGrid\n\nplt.ion()\n\ndef nn_search(samples, database, top_k=1, imsz=[28,28], orientation='horizontal', output_file=None, pad=0.1):\n    \"\"\"Plot samples together with their nearest neighbors in a database.\n\n    samples: N x D matrix (numpy or gnumpy array), one sample per row\n    database: M x D matrix (numpy or gnumpy array) to search in\n    top_k: number of nearest neighbors to retrieve for each sample, ranked by\n        squared Euclidean distance\n    imsz: size of each image, passed on to vt.bwpatchview\n    orientation: 'horizontal' or 'vertical', selects how the neighbors are\n        arranged in the second plot\n    output_file: if provided, the figure is saved to this file\n    pad: padding between the two plots\n\n    Prints an error message and returns without plotting if orientation is\n    not one of the two supported values.\n    \"\"\"\n    if orientation not in ['horizontal', 'vertical']:\n        print('[Error] orientation must be either horizontal or vertical')\n        return\n\n    g_samples = util.to_garray(samples)\n    g_database = util.to_garray(database)\n\n    if isinstance(database, gnp.garray):\n        database = database.asarray()\n    if isinstance(samples, gnp.garray):\n        samples = samples.asarray()\n\n    n_samples, n_dims = samples.shape\n    # np.float was just an alias of the builtin float (i.e. float64) and is\n    # no longer available in recent numpy releases.\n    nn = np.empty((n_samples * top_k, n_dims), dtype=np.float64)\n\n    for i in range(n_samples):\n        v = g_samples[i]\n        d = ((g_database - v)**2).sum(axis=1)\n        idx = d.asarray().argsort()\n        top_candidates = database[idx[:top_k]]\n        if orientation == 'horizontal':\n            # the k-th neighbor of sample i is stored in row i + k * n_samples\n            nn[np.arange(i, i+n_samples*top_k, n_samples)] = top_candidates\n        elif orientation == 'vertical':\n            # the neighbors of sample i are stored in consecutive rows\n            nn[i*top_k:(i+1)*top_k] = top_candidates\n\n    f = plt.figure()\n    grid = AxesGrid(f, 111, nrows_ncols=(2,1), axes_pad=pad)\n\n    vt.bwpatchview(samples, imsz, 1, gridintensity=1, ax=grid[0])\n    if orientation == 'horizontal':\n        vt.bwpatchview(nn, imsz, top_k, gridintensity=1, ax=grid[1])\n    elif orientation == 'vertical':\n        vt.bwpatchview(nn, imsz, n_samples, gridintensity=1, ax=grid[1])\n\n    if output_file is not None:\n        plt.savefig(output_file, bbox_inches='tight')\n\ndef view_checkpoints(model_dir, sigma, imsz=[28,28], figid=101):\n    \"\"\"Step through saved checkpoints and show the last layer weights.\n\n    checkpoint files should have a name matching the following:\n    <model_dir>/checkpoint_<sigma>_<iter>.pdata\n\n    Checkpoints are visited in increasing order of <iter>.  For each one the\n    first 400 rows of the last layer weight matrix are displayed in figure\n    figid, with imsz passed on to vt.bwpatchview.  Files whose <iter> part is\n    not a plain non-negative integer are ignored.\n    \"\"\"\n    name_prefix = 'checkpoint_%s_' % str(sigma)\n    name_suffix = '.pdata'\n\n    # Take <iter> as whatever sits between the known prefix and suffix instead\n    # of splitting the name on '.', which gives wrong numbers as soon as\n    # <sigma> itself contains a decimal point.  The trailing '_' in the prefix\n    # keeps sigma=1 from also matching the files written for sigma=10.\n    checkpoints = []\n    for fname in os.listdir(model_dir):\n        if fname.startswith(name_prefix) and fname.endswith(name_suffix):\n            iter_str = fname[len(name_prefix):len(fname) - len(name_suffix)]\n            if iter_str.isdigit():\n                # keep the real file name so that the file found on disk is\n                # the one loaded, even if <iter> is zero padded\n                checkpoints.append((int(iter_str), fname))\n    checkpoints.sort()\n\n    net = gen.StochasticGenerativeNet()\n\n    plt.figure(figid, figsize=(10,8))\n    ax = plt.subplot(111)\n\n    for i, fname in checkpoints:\n        net.load_model_from_file('%s/%s' % (model_dir, fname))\n        w = net.layers[-1].params.W.asarray()\n        ax.cla()\n        vt.bwpatchview(w[:400], imsz, int(np.sqrt(w[:400].shape[0])), rowmajor=True, gridintensity=1, ax=ax)\n        plt.draw()\n        plt.show()\n        print('Checkpoint %d' % i)\n        time.sleep(0.04)\n\ndef _interpolate_seeds(h_seeds, n_points):\n    \"\"\"Linearly interpolate between consecutive rows of h_seeds.\n\n    Row i is connected to row (i+1) % n_seeds with n_points evenly spaced\n    points that include both end points, so the last seed is connected back\n    to the first one.  Returns a garray with n_points * n_seeds rows.\n    \"\"\"\n    n_seeds = h_seeds.shape[0]\n    z = gnp.zeros((n_points * n_seeds, h_seeds.shape[1]))\n    # n_points == 1 would otherwise divide by zero; in that case only the\n    # starting seed of every segment is emitted.\n    n_steps = max(n_points - 1, 1)\n    for i in range(n_seeds):\n        h0 = h_seeds[i]\n        h1 = h_seeds[(i+1) % n_seeds]\n        step = (h1 - h0) / n_steps\n        for j in range(n_points):\n            z[i*n_points+j] = h0 + step * j\n    return z\n\ndef generation_on_a_line(net, n_points=100, imsz=[28,28], nrows=10, h_seeds=None):\n    \"\"\"Show samples generated from points on line segments in hidden space.\n\n    If h_seeds is None, two hidden vectors are drawn with net.sample_hiddens\n    and n_points evenly spaced points on the segment between them are used.\n    Otherwise consecutive rows of h_seeds (wrapping around from the last to\n    the first) are connected with n_points points each.\n\n    The generated samples are displayed with vt.bwpatchview, which receives\n    imsz and nrows.\n    \"\"\"\n    if h_seeds is None:\n        h = net.sample_hiddens(2)\n        z = gnp.zeros((n_points, h.shape[1]))\n        diff = h[1] - h[0]\n        # guard against division by zero when n_points == 1\n        step = diff / max(n_points - 1, 1)\n        for i in range(n_points):\n            z[i] = h[0] + step * i\n    else:\n        z = _interpolate_seeds(h_seeds, n_points)\n\n    x = net.generate_samples(z=z)\n    vt.bwpatchview(x.asarray(), imsz, nrows, rowmajor=True, gridintensity=1)\n\ndef generate_morphing_video(net, h_seeds, n_points=100, imsz=[28,28], output_dir='video'):\n    \"\"\"Save the frames of a morphing sequence between hidden seeds.\n\n    Consecutive rows of h_seeds (wrapping around from the last to the first)\n    are connected with n_points interpolated points each.  The sample\n    generated from the i-th point is reshaped to imsz and written to\n    <output_dir>/<i>.png.  output_dir is created if it does not exist.\n    \"\"\"\n    if not os.path.exists(output_dir):\n        os.makedirs(output_dir)\n\n    z = _interpolate_seeds(h_seeds, n_points)\n\n    x = net.generate_samples(z=z).asarray()\n    for i in range(x.shape[0]):\n        # NOTE(review): scipy.misc.imsave is not available in SciPy >= 1.2,\n        # so this needs an older SciPy - confirm the pinned version.\n        misc.imsave(output_dir + '/%d.png' % i, x[i].reshape(imsz))\n\n###################################\n# For old experiments\n###################################\n\ndef plot_dataset(x, t, ax=None):\n    \"\"\"Plot the 2-D points in x for the two classes t == 0 and t == 1.\n\n    A new figure is created if ax is not given.  The axis limits are set to\n    the range of the data extended by 10% on each side.\n\n    Return the axis used.\n    \"\"\"\n    if ax is None:\n        plt.figure()\n        ax = plt.subplot(111)\n    ax.plot(x[t==0,0], x[t==0,1], 'o')\n    ax.plot(x[t==1,0], x[t==1,1], 'o')\n\n    x_min = x[:,0].min()\n    x_max = x[:,0].max()\n    y_min = x[:,1].min()\n    y_max = x[:,1].max()\n    ax.set_xlim([x_min - (x_max - x_min) / 10.0, x_max + (x_max - x_min) / 10.0])\n    ax.set_ylim([y_min - (y_max - y_min) / 10.0, y_max + (y_max - y_min) / 10.0])\n\n    plt.show()\n\n    return ax\n\ndef plot_decision_boundary(f, x_range, y_range, density, ax=None, **kwargs):\n    \"\"\"Draw the zero level set of f evaluated on a 2-D grid.\n\n    f: maps a K x 2 matrix of points to K values\n    x_range, y_range: (start, stop) of the grid along each axis\n    density: spacing between neighboring grid points\n    ax: axis to draw on, a new figure is created if not given\n    kwargs: passed on to ax.contour\n\n    Return the axis used.\n    \"\"\"\n    if ax is None:\n        plt.figure()\n        ax = plt.subplot(111)\n\n    x, y = np.meshgrid(np.arange(x_range[0], x_range[1], density),\n            np.arange(y_range[0], y_range[1], density))\n\n    data = np.c_[x.reshape(x.size,1), y.reshape(y.size,1)]\n    z = f(data).reshape(x.shape)\n\n    ax.contour(x, y, z, levels=[0], **kwargs)\n\n    plt.show()\n\n    return ax\n"
  }
]