Repository: LiDan456/MAD-GANs
Branch: master
Commit: 3139a73a4112
Files: 45
Total size: 249.0 KB

Directory structure:
gitextract_fejrc7rh/

├── .gitattributes
├── AD.py
├── AD_Invert.py
├── DR_discriminator.py
├── README.md
├── RGAN.py
├── data_utils.py
├── differential_privacy/
│   ├── dp_sgd/
│   │   └── dp_optimizer/
│   │       ├── dp_optimizer.py
│   │       ├── sanitizer.py
│   │       └── utils.py
│   └── privacy_accountant/
│       └── tf/
│           └── accountant.py
├── eugenium_mmd.py
├── eval.py
├── experiments/
│   ├── parameters/
│   │   ├── kdd99_30_0.npy
│   │   ├── kdd99_30_1.npy
│   │   ├── kdd99_30_10.npy
│   │   ├── kdd99_30_11.npy
│   │   ├── kdd99_30_12.npy
│   │   ├── kdd99_30_13.npy
│   │   ├── kdd99_30_14.npy
│   │   ├── kdd99_30_15.npy
│   │   ├── kdd99_30_16.npy
│   │   ├── kdd99_30_17.npy
│   │   ├── kdd99_30_18.npy
│   │   ├── kdd99_30_19.npy
│   │   ├── kdd99_30_2.npy
│   │   ├── kdd99_30_20.npy
│   │   ├── kdd99_30_21.npy
│   │   ├── kdd99_30_22.npy
│   │   ├── kdd99_30_3.npy
│   │   ├── kdd99_30_4.npy
│   │   ├── kdd99_30_5.npy
│   │   ├── kdd99_30_6.npy
│   │   ├── kdd99_30_7.npy
│   │   ├── kdd99_30_8.npy
│   │   └── kdd99_30_9.npy
│   ├── plots/
│   │   └── gs/
│   │       └── kdd99_gs_real.npy
│   └── settings/
│       ├── kdd99.txt
│       └── kdd99_test.txt
├── mmd.py
├── mod_core_rnn_cell_impl.py
├── model.py
├── plotting.py
├── tf_ops.py
└── utils.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitattributes
================================================
# Auto detect text files and perform LF normalization
* text=auto


================================================
FILE: AD.py
================================================
import tensorflow as tf
import numpy as np
import pdb
import json
import model
from mod_core_rnn_cell_impl import LSTMCell  # modified to allow initializing bias in lstm

import utils
import eval
import DR_discriminator
import data_utils

# from pyod.utils.utility import *
from sklearn.utils.validation import *
from sklearn.metrics.classification import *
from sklearn.metrics.ranking import *
from time import time

begin = time()

"""
Here, only the discriminator was used to do the anomaly detection
"""

# --- get settings --- #
# parse command line arguments, or use defaults
parser = utils.rgan_options_parser()
settings = vars(parser.parse_args())
# if a settings file is specified, it overrides command line arguments/defaults
if settings['settings_file']: settings = utils.load_settings_from_file(settings)

# --- get data, split --- #
data_path = './experiments/data/' + settings['data_load_from'] + '.data.npy'
print('Loading data from', data_path)
settings["eval_single"] = False
settings["eval_an"] = False
samples, labels, index = data_utils.get_data(settings["data"], settings["seq_length"], settings["seq_step"],
                                             settings["num_signals"], settings["sub_id"], settings["eval_single"],
                                             settings["eval_an"], data_path)
# --- save settings, data --- #
# no need
print('Ready to run with settings:')
for (k, v) in settings.items(): print(v, '\t', k)
# add the settings to local environment
# WARNING: at this point a lot of variables appear
locals().update(settings)
json.dump(settings, open('./experiments/settings/' + identifier + '.txt', 'w'), indent=0)

class myADclass():
    def __init__(self, epoch, settings=settings, samples=samples, labels=labels, index=index):
        self.epoch = epoch
        self.settings = settings
        self.samples = samples
        self.labels = labels
        self.index = index
    def ADfunc(self):
        num_samples_t = self.samples.shape[0]
        print('sample_shape:', self.samples.shape[0])
        print('num_samples_t', num_samples_t)

        # -- only discriminate one batch for one time -- #
        D_test = np.empty([num_samples_t, self.settings['seq_length'], 1])
        DL_test = np.empty([num_samples_t, self.settings['seq_length'], 1])
        L_mb = np.empty([num_samples_t, self.settings['seq_length'], 1])
        I_mb = np.empty([num_samples_t, self.settings['seq_length'], 1])
        batch_times = num_samples_t // self.settings['batch_size']
        for batch_idx in range(0, num_samples_t // self.settings['batch_size']):
            # print('batch_idx:{}
            # display batch progress
            model.display_batch_progression(batch_idx, batch_times)
            start_pos = batch_idx * self.settings['batch_size']
            end_pos = start_pos + self.settings['batch_size']
            T_mb = self.samples[start_pos:end_pos, :, :]
            L_mmb = self.labels[start_pos:end_pos, :, :]
            I_mmb = self.index[start_pos:end_pos, :, :]
            para_path = './experiments/parameters/' + self.settings['sub_id'] + '_' + str(
                self.settings['seq_length']) + '_' + str(self.epoch) + '.npy'
            D_t, L_t = DR_discriminator.dis_trained_model(self.settings, T_mb, para_path)
            D_test[start_pos:end_pos, :, :] = D_t
            DL_test[start_pos:end_pos, :, :] = L_t
            L_mb[start_pos:end_pos, :, :] = L_mmb
            I_mb[start_pos:end_pos, :, :] = I_mmb

        start_pos = (num_samples_t // self.settings['batch_size']) * self.settings['batch_size']
        end_pos = start_pos + self.settings['batch_size']
        size = samples[start_pos:end_pos, :, :].shape[0]
        fill = np.ones([self.settings['batch_size'] - size, samples.shape[1], samples.shape[2]])
        batch = np.concatenate([samples[start_pos:end_pos, :, :], fill], axis=0)
        para_path = './experiments/parameters/' + self.settings['sub_id'] + '_' + str(
            self.settings['seq_length']) + '_' + str(self.epoch) + '.npy'
        D_t, L_t = DR_discriminator.dis_trained_model(self.settings, batch, para_path)
        L_mmb = self.labels[start_pos:end_pos, :, :]
        I_mmb = self.index[start_pos:end_pos, :, :]
        D_test[start_pos:end_pos, :, :] = D_t[:size, :, :]
        DL_test[start_pos:end_pos, :, :] = L_t[:size, :, :]
        L_mb[start_pos:end_pos, :, :] = L_mmb
        I_mb[start_pos:end_pos, :, :] = I_mmb

        results = np.zeros([18, 4])
        for i in range(2, 8):
            tao = 0.1 * i
            Accu2, Pre2, Rec2, F12 = DR_discriminator.detection_Comb(
                DL_test, L_mb, I_mb, self.settings['seq_step'], tao)
            print('seq_length:', self.settings['seq_length'])
            print('Comb-logits-based-Epoch: {}; tao={:.1}; Accu: {:.4}; Pre: {:.4}; Rec: {:.4}; F1: {:.4}'
                  .format(self.epoch, tao, Accu2, Pre2, Rec2, F12))
            results[i - 2, :] = [Accu2, Pre2, Rec2, F12]

            Accu3, Pre3, Rec3, F13 = DR_discriminator.detection_Comb(
                D_test, L_mb, I_mb, self.settings['seq_step'], tao)
            print('seq_length:', self.settings['seq_length'])
            print('Comb-statistic-based-Epoch: {}; tao={:.1}; Accu: {:.4}; Pre: {:.4}; Rec: {:.4}; F1: {:.4}'
                  .format(self.epoch, tao, Accu3, Pre3, Rec3, F13))
            results[i - 2+6, :] = [Accu3, Pre3, Rec3, F13]

            Accu5, Pre5, Rec5, F15 = DR_discriminator.sample_detection(D_test, L_mb, tao)
            print('seq_length:', self.settings['seq_length'])
            print('sample-wise-Epoch: {}; tao={:.1}; Accu: {:.4}; Pre: {:.4}; Rec: {:.4}; F1: {:.4}'
                  .format(self.epoch, tao, Accu5, Pre5, Rec5, F15))
            results[i - 2+12, :] = [Accu5, Pre5, Rec5, F15]

        return results

if __name__ == "__main__":
    print('Main Starting...')

    Results = np.empty([settings['num_epochs'], 18, 4])

    for epoch in range(settings['num_epochs']):
    # for epoch in range(50, 60):
        ob = myADclass(epoch)
        Results[epoch, :, :] = ob.ADfunc()

    # res_path = './experiments/plots/Results' + '_' + settings['sub_id'] + '_' + str(
    #     settings['seq_length']) + '.npy'
    # np.save(res_path, Results)

    print('Main Terminating...')
    end = time() - begin
    print('Testing terminated | Training time=%d s' % (end))

================================================
FILE: AD_Invert.py
================================================
import tensorflow as tf
import numpy as np
import pdb
import json
from mod_core_rnn_cell_impl import LSTMCell  # modified to allow initializing bias in lstm

import data_utils
import plotting
import model
import mmd
import utils
import eval
import DR_discriminator

from differential_privacy.dp_sgd.dp_optimizer import dp_optimizer
from differential_privacy.dp_sgd.dp_optimizer import sanitizer
from differential_privacy.privacy_accountant.tf import accountant

"""
Here, both the discriminator and generator were used to do the anomaly detection
"""

# --- get settings --- #
# parse command line arguments, or use defaults
parser = utils.rgan_options_parser()
settings = vars(parser.parse_args())
# if a settings file is specified, it overrides command line arguments/defaults
if settings['settings_file']: settings = utils.load_settings_from_file(settings)

# --- get data, split --- #
data_path = './experiments/data/' + settings['data_load_from'] + '.data.npy'
print('Loading data from', data_path)
samples, labels, index = data_utils.get_data(settings["data"], settings["seq_length"], settings["seq_step"],
                                             settings["num_signals"], settings["sub_id"], settings["eval_single"],
                                             settings["eval_an"], data_path)
# --- save settings, data --- #
# no need
print('Ready to run with settings:')
for (k, v) in settings.items(): print(v, '\t', k)
# add the settings to local environment
# WARNING: at this point a lot of variables appear
locals().update(settings)
json.dump(settings, open('./experiments/settings/' + identifier + '.txt', 'w'), indent=0)

class myADclass():
    def __init__(self, epoch, settings=settings, samples=samples, labels=labels, index=index):
        self.epoch = epoch
        self.settings = settings
        self.samples = samples
        self.labels = labels
        self.index = index
    def ADfunc(self):
        num_samples_t = self.samples.shape[0]
        t_size = 500
        T_index = np.random.choice(num_samples_t, size=t_size, replace=False)
        print('sample_shape:', self.samples.shape[0])
        print('num_samples_t', num_samples_t)

        # -- only discriminate one batch for one time -- #
        D_test = np.empty([t_size, self.settings['seq_length'], 1])
        DL_test = np.empty([t_size, self.settings['seq_length'], 1])
        GG = np.empty([t_size, self.settings['seq_length'], self.settings['num_signals']])
        T_samples = np.empty([t_size, self.settings['seq_length'], self.settings['num_signals']])
        L_mb = np.empty([t_size, self.settings['seq_length'], 1])
        I_mb = np.empty([t_size, self.settings['seq_length'], 1])
        for batch_idx in range(0, t_size):
            # print('epoch:{}'.format(self.epoch))
            # print('batch_idx:{}'.format(batch_idx))
            # display batch progress
            model.display_batch_progression(batch_idx, t_size)
            T_mb = self.samples[T_index[batch_idx], :, :]
            L_mmb = self.labels[T_index[batch_idx], :, :]
            I_mmb = self.index[T_index[batch_idx], :, :]
            para_path = './experiments/parameters/' + self.settings['sub_id'] + '_' + str(
                self.settings['seq_length']) + '_' + str(self.epoch) + '.npy'
            D_t, L_t = DR_discriminator.dis_D_model(self.settings, T_mb, para_path)
            Gs, Zs, error_per_sample, heuristic_sigma = DR_discriminator.invert(self.settings, T_mb, para_path,
                                                                                g_tolerance=None,
                                                                                e_tolerance=0.1, n_iter=None,
                                                                                max_iter=1000,
                                                                                heuristic_sigma=None)
            GG[batch_idx, :, :] = Gs
            T_samples[batch_idx, :, :] = T_mb
            D_test[batch_idx, :, :] = D_t
            DL_test[batch_idx, :, :] = L_t
            L_mb[batch_idx, :, :] = L_mmb
            I_mb[batch_idx, :, :] = I_mmb

        # -- use self-defined evaluation functions -- #
        # -- test different tao values for the detection function -- #
        results = np.empty([5, 5])
        # for i in range(2, 8):
        #     tao = 0.1 * i
        tao = 0.5
        lam = 0.8
        Accu1, Pre1, Rec1, F11, FPR1, D_L1 = DR_discriminator.detection_D_I(DL_test, L_mb, I_mb, self.settings['seq_step'], tao)
        print('seq_length:', self.settings['seq_length'])
        print('D:Comb-logits-based-Epoch: {}; tao={:.1}; Accu: {:.4}; Pre: {:.4}; Rec: {:.4}; F1: {:.4}; FPR: {:.4}'
              .format(self.epoch, tao, Accu1, Pre1, Rec1, F11, FPR1))
        results[0, :] = [Accu1, Pre1, Rec1, F11, FPR1]

        Accu2, Pre2, Rec2, F12, FPR2, D_L2 = DR_discriminator.detection_D_I(D_test, L_mb, I_mb, self.settings['seq_step'], tao)
        print('seq_length:', self.settings['seq_length'])
        print('D:Comb-statistic-based-Epoch: {}; tao={:.1}; Accu: {:.4}; Pre: {:.4}; Rec: {:.4}; F1: {:.4}; FPR: {:.4}'
              .format(self.epoch, tao, Accu2, Pre2, Rec2, F12, FPR2))
        results[1, :] = [Accu2, Pre2, Rec2, F12, FPR2]

        Accu3, Pre3, Rec3, F13, FPR3, D_L3 = DR_discriminator.detection_R_D_I(DL_test, GG, T_samples, L_mb, self.settings['seq_step'], tao, lam)
        print('seq_length:', self.settings['seq_length'])
        print('RD:Comb-logits_based-Epoch: {}; tao={:.1}; Accu: {:.4}; Pre: {:.4}; Rec: {:.4}; F1: {:.4}; FPR: {:.4}'
            .format(self.epoch, tao, Accu3, Pre3, Rec3, F13, FPR3))
        results[2, :] = [Accu3, Pre3, Rec3, F13, FPR3]

        Accu4, Pre4, Rec4, F14, FPR4, D_L4 = DR_discriminator.detection_R_D_I(D_test, GG, T_samples, L_mb, self.settings['seq_step'], tao, lam)
        print('seq_length:', self.settings['seq_length'])
        print('RD:Comb-statistic-based-Epoch: {}; tao={:.1}; Accu: {:.4}; Pre: {:.4}; Rec: {:.4}; F1: {:.4}; FPR: {:.4}'
              .format(self.epoch, tao, Accu4, Pre4, Rec4, F14, FPR4))
        results[3, :] = [Accu4, Pre4, Rec4, F14, FPR4]

        Accu5, Pre5, Rec5, F15, FPR5, D_L5 = DR_discriminator.detection_R_I(GG, T_samples, L_mb, self.settings['seq_step'],tao)
        print('seq_length:', self.settings['seq_length'])
        print('G:Comb-sample-based-Epoch: {}; tao={:.1}; Accu: {:.4}; Pre: {:.4}; Rec: {:.4}; F1: {:.4}; FPR: {:.4}'
              .format(self.epoch, tao, Accu5, Pre5, Rec5, F15, FPR5))
        results[4, :] = [Accu5, Pre5, Rec5, F15, FPR5]

        return results, GG, D_test, DL_test


if __name__ == "__main__":
    print('Main Starting...')

    Results = np.empty([settings['num_epochs'], 5, 5])

    t_size = 500
    D_test = np.empty([settings['num_epochs'], t_size, settings['seq_length'], 1])
    DL_test = np.empty([settings['num_epochs'], t_size, settings['seq_length'], 1])
    GG = np.empty([settings['num_epochs'], t_size, settings['seq_length'], settings['num_signals']])

    for epoch in range(settings['num_epochs']):
    # for epoch in range(1):
        ob = myADclass(epoch)
        Results[epoch, :, :], GG[epoch, :, :, :], D_test[epoch, :, :, :], DL_test[epoch, :, :, :] = ob.ADfunc()

    res_path = './experiments/plots/Results_Invert' + '_' + settings['sub_id'] + '_' + str(
        settings['seq_length']) + '.npy'
    np.save(res_path, Results)

    dg_path = './experiments/plots/DG_Invert' + '_' + settings['sub_id'] + '_' + str(
        settings['seq_length']) + '_'
    np.save(dg_path + 'D_test.npy', D_test)
    np.save(dg_path + 'DL_test.npy', DL_test)
    np.save(dg_path + 'GG.npy', DL_test)

    print('Main Terminating...')

================================================
FILE: DR_discriminator.py
================================================
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from matplotlib.colors import hsv_to_rgb
import model
import mmd
from mod_core_rnn_cell_impl import LSTMCell
from sklearn.metrics import precision_recall_fscore_support

def anomaly_detection_plot(D_test, T_mb, L_mb, D_L, epoch, identifier):

    aa = D_test.shape[0]
    bb = D_test.shape[1]
    D_L = D_L.reshape([aa, bb, -1])

    x_points = np.arange(bb)

    fig, ax = plt.subplots(4, 4, sharex=True)
    for m in range(4):
        for n in range(4):
            D = D_test[n * 4 + m, :, :]
            T = T_mb[n * 4 + m, :, :]
            L = L_mb[n * 4 + m, :, :]
            DL = D_L[n * 4 + m, :, :]
            ax[m, n].plot(x_points, D, '--g', label='Pro')
            ax[m, n].plot(x_points, T, 'b', label='Data')
            ax[m, n].plot(x_points, L, 'k', label='Label')
            ax[m, n].plot(x_points, DL, 'r', label='Label')
            ax[m, n].set_ylim(-1, 1)
    for n in range(4):
        ax[-1, n].xaxis.set_ticks(range(0, bb, int(bb/6)))
    fig.suptitle(epoch)
    fig.subplots_adjust(hspace=0.15)
    fig.savefig("./experiments/plots/DR_dis/" + identifier + "_epoch" + str(epoch).zfill(4) + ".png")
    plt.clf()
    plt.close()

    return True

def detection_Comb(Label_test, L_mb, I_mb, seq_step, tao):
    aa = Label_test.shape[0]
    bb = Label_test.shape[1]

    LL = (aa-1)*seq_step+bb

    Label_test = abs(Label_test.reshape([aa, bb]))
    L_mb = L_mb .reshape([aa, bb])
    I_mb = I_mb .reshape([aa, bb])
    D_L = np.zeros([LL, 1])
    L_L = np.zeros([LL, 1])
    Count = np.zeros([LL, 1])
    for i in range(0, aa):
        for j in range(0, bb):
            # print('index:', i*10+j)
            D_L[i*seq_step+j] += Label_test[i, j]
            L_L[i * seq_step + j] += L_mb[i, j]
            Count[i * seq_step + j] += 1

    D_L /= Count
    L_L /= Count

    TP, TN, FP, FN = 0, 0, 0, 0

    for i in range(LL):
        if D_L[i] > tao:
            # true/negative
            D_L[i] = 0
        else:
            # false/positive
            D_L[i] = 1

    cc = (D_L == L_L)
    # print('D_L:', D_L)
    # print('L_L:', L_L)
    cc = list(cc.reshape([-1]))
    N = cc.count(True)

    print('N:', N)

    Accu = float((N / LL) * 100)

    precision, recall, f1, _ = precision_recall_fscore_support(L_L, D_L, average='binary')

    return Accu, precision, recall, f1,


def detection_logits_I(DL_test, L_mb, I_mb, seq_step, tao):
    aa = DL_test.shape[0]
    bb = DL_test.shape[1]

    LL = (aa-1)*seq_step+bb

    DL_test = abs(DL_test.reshape([aa, bb]))
    L_mb = L_mb .reshape([aa, bb])
    I_mb = I_mb .reshape([aa, bb])
    D_L = np.zeros([LL, 1])
    L_L = np.zeros([LL, 1])
    Count = np.zeros([LL, 1])
    for i in range(0, aa):
        for j in range(0, bb):
            # print('index:', i*10+j)
            D_L[i*seq_step+j] += DL_test[i, j]
            L_L[i * seq_step + j] += L_mb[i, j]
            Count[i * seq_step + j] += 1

    D_L /= Count
    L_L /= Count

    TP, TN, FP, FN = 0, 0, 0, 0

    for i in range(LL):
        if D_L[i] > tao:
            # true/negative
            D_L[i] = 0
        else:
            # false/positive
            D_L[i] = 1

        A = D_L[i]
        B = L_L[i]
        if A == 1 and B == 1:
            TP += 1
        elif A == 1 and B == 0:
            FP += 1
        elif A == 0 and B == 0:
            TN += 1
        elif A == 0 and B == 1:
            FN += 1


    cc = (D_L == L_L)
    # print('D_L:', D_L)
    # print('L_L:', L_L)
    cc = list(cc.reshape([-1]))
    N = cc.count(True)

    print('N:', N)

    Accu = float((N / LL) * 100)

    precision, recall, f1, _ = precision_recall_fscore_support(L_L, D_L, average='binary')

    # true positive among all the detected positive
    # Pre = (100 * TP) / (TP + FP + 1)
    # # true positive among all the real positive
    # Rec = (100 * TP) / (TP + FN + 1)
    # # The F1 score is the harmonic average of the precision and recall,
    # # where an F1 score reaches its best value at 1 (perfect precision and recall) and worst at 0.
    # F1 = (2 * Pre * Rec) / (100 * (Pre + Rec + 1))
    # False positive rate--false alarm rate
    FPR = (100 * FP) / (FP + TN+1)

    return Accu, precision, recall, f1, FPR, D_L

def detection_statistic_I(D_test, L_mb, I_mb, seq_step, tao):
    # point-wise detection for one dimension

    aa = D_test.shape[0]
    bb = D_test.shape[1]

    LL = (aa-1) * seq_step + bb
    # print('aa:', aa)
    # print('bb:', bb)
    # print('LL:', LL)

    D_test = D_test.reshape([aa, bb])
    L_mb = L_mb.reshape([aa, bb])
    I_mb = I_mb.reshape([aa, bb])
    D_L = np.zeros([LL, 1])
    L_L = np.zeros([LL, 1])
    Count = np.zeros([LL, 1])
    for i in range(0, aa):
        for j in range(0, bb):
            # print('index:', i * 10 + j)
            D_L[i * seq_step + j] += D_test[i, j]
            L_L[i * seq_step + j] += L_mb[i, j]
            Count[i * seq_step + j] += 1

    D_L /= Count
    L_L /= Count

    TP, TN, FP, FN = 0, 0, 0, 0

    for i in range(LL):
        if D_L[i] > tao:
            # true/negative
            D_L[i] = 0
        else:
            # false/positive
            D_L[i] = 1

        A = D_L[i]
        B = L_L[i]
        if A == 1 and B == 1:
            TP += 1
        elif A == 1 and B == 0:
            FP += 1
        elif A == 0 and B == 0:
            TN += 1
        elif A == 0 and B == 1:
            FN += 1

    cc = (D_L == L_L)
    cc = list(cc.reshape([-1]))
    N = cc.count(True)
    Accu = float((N / LL) * 100)

    precision, recall, f1, _ = precision_recall_fscore_support(L_L, D_L, average='binary')

    # true positive among all the detected positive
    # Pre = (100 * TP) / (TP + FP + 1)
    # # true positive among all the real positive
    # Rec = (100 * TP) / (TP + FN + 1)
    # # The F1 score is the harmonic average of the precision and recall,
    # # where an F1 score reaches its best value at 1 (perfect precision and recall) and worst at 0.
    # F1 = (2 * Pre * Rec) / (100 * (Pre + Rec + 1))
    # False positive rate--false alarm rate
    FPR = (100 * FP) / (FP + TN)

    return Accu, precision, recall, f1, FPR, D_L

def detection_D_I(DD, L_mb, I_mb, seq_step, tao):
    # point-wise detection for one dimension

    aa = DD.shape[0]
    bb = DD.shape[1]

    LL = (aa-1)*seq_step+bb

    DD = abs(DD.reshape([aa, bb]))
    L_mb = L_mb .reshape([aa, bb])
    I_mb = I_mb .reshape([aa, bb])
    D_L = np.zeros([LL, 1])
    L_L = np.zeros([LL, 1])
    Count = np.zeros([LL, 1])
    for i in range(0, aa):
        for j in range(0, bb):
            # print('index:', i*10+j)
            D_L[i*10+j] += DD[i, j]
            L_L[i * 10 + j] += L_mb[i, j]
            Count[i * 10 + j] += 1

    D_L /= Count
    L_L /= Count

    TP, TN, FP, FN = 0, 0, 0, 0

    for i in range(LL):
        if D_L[i] > tao:
            # true/negative
            D_L[i] = 0
        else:
            # false/positive
            D_L[i] = 1

        A = D_L[i]
        B = L_L[i]
        if A == 1 and B == 1:
            TP += 1
        elif A == 1 and B == 0:
            FP += 1
        elif A == 0 and B == 0:
            TN += 1
        elif A == 0 and B == 1:
            FN += 1


    cc = (D_L == L_L)
    # print('D_L:', D_L)
    # print('L_L:', L_L)
    cc = list(cc.reshape([-1]))
    N = cc.count(True)

    print('N:', N)

    Accu = float((N / LL) * 100)

    # true positive among all the detected positive
    Pre = (100 * TP) / (TP + FP + 1)
    # true positive among all the real positive
    Rec = (100 * TP) / (TP + FN + 1)
    # The F1 score is the harmonic average of the precision and recall,
    # where an F1 score reaches its best value at 1 (perfect precision and recall) and worst at 0.
    F1 = (2 * Pre * Rec) / (100 * (Pre + Rec + 1))
    # False positive rate--false alarm rate
    FPR = (100 * FP) / (FP + TN+1)

    return Accu, Pre, Rec, F1, FPR, D_L

def detection_R_D_I(DD, Gs, T_mb, L_mb, seq_step, tao, lam):
    # point-wise detection for one dimension
    # (1-lambda)*R(x)+lambda*D(x)
    # lambda=0.5?
    # D_test, Gs, T_mb, L_mb  are of same size

    R = np.absolute(Gs - T_mb)
    R = np.mean(R, axis=2)
    aa = DD.shape[0]
    bb = DD.shape[1]

    LL = (aa - 1) * seq_step + bb

    DD = abs(DD.reshape([aa, bb]))
    DD = 1-DD
    L_mb = L_mb.reshape([aa, bb])
    R = R.reshape([aa, bb])

    D_L = np.zeros([LL, 1])
    R_L = np.zeros([LL, 1])
    L_L = np.zeros([LL, 1])
    L_pre = np.zeros([LL, 1])
    Count = np.zeros([LL, 1])
    for i in range(0, aa):
        for j in range(0, bb):
            # print('index:', i*10+j)
            D_L[i * 10 + j] += DD[i, j]
            L_L[i * 10 + j] += L_mb[i, j]
            R_L[i * 10 + j] += R[i, j]
            Count[i * 10 + j] += 1
    D_L /= Count
    L_L /= Count
    R_L /= Count

    TP, TN, FP, FN = 0, 0, 0, 0

    for i in range(LL):
        if (1-lam)*R_L[i] + lam*D_L[i] > tao:
            # false
            L_pre[i] = 1
        else:
            # true
            L_pre[i] = 0

        A = L_pre[i]
        # print('A:', A)
        B = L_L[i]
        # print('B:', B)
        if A == 1 and B == 1:
            TP += 1
        elif A == 1 and B == 0:
            FP += 1
        elif A == 0 and B == 0:
            TN += 1
        elif A == 0 and B == 1:
            FN += 1

    cc = (L_pre == L_L)
    cc = list(cc.reshape([-1]))
    N = cc.count(True)
    Accu = float((N / (aa*bb)) * 100)

    # true positive among all the detected positive
    Pre = (100 * TP) / (TP + FP + 1)
    # true positive among all the real positive
    Rec = (100 * TP) / (TP + FN + 1)
    # The F1 score is the harmonic average of the precision and recall,
    # where an F1 score reaches its best value at 1 (perfect precision and recall) and worst at 0.
    F1 = (2 * Pre * Rec) / (100 * (Pre + Rec + 1))
    # False positive rate
    FPR = (100 * FP) / (FP + TN+1)

    return Accu, Pre, Rec, F1, FPR, L_pre

def detection_R_I(Gs, T_mb, L_mb, seq_step, tao):
    # point-wise detection for one dimension
    # (1-lambda)*R(x)+lambda*D(x)
    # lambda=0.5?
    # D_test, Gs, T_mb, L_mb  are of same size

    R = np.absolute(Gs - T_mb)
    R = np.mean(R, axis=2)
    aa = R.shape[0]
    bb = R.shape[1]

    LL = (aa - 1) * seq_step + bb

    L_mb = L_mb.reshape([aa, bb])
    R = R.reshape([aa, bb])

    L_L = np.zeros([LL, 1])
    R_L = np.zeros([LL, 1])
    L_pre = np.zeros([LL, 1])
    Count = np.zeros([LL, 1])
    for i in range(0, aa):
        for j in range(0, bb):
            # print('index:', i*10+j)
            L_L[i * 10 + j] += L_mb[i, j]
            R_L[i * 10 + j] += R[i, j]
            Count[i * 10 + j] += 1
    L_L /= Count
    R_L /= Count

    TP, TN, FP, FN = 0, 0, 0, 0

    for i in range(LL):
        if R_L[i] > tao:
            # false
            L_pre[i] = 1
        else:
            # true
            L_pre[i] = 0

        A = L_pre[i]
        B = L_L[i]
        if A == 1 and B == 1:
            TP += 1
        elif A == 1 and B == 0:
            FP += 1
        elif A == 0 and B == 0:
            TN += 1
        elif A == 0 and B == 1:
            FN += 1

    cc = (L_pre == L_L)
    cc = list(cc.reshape([-1]))
    N = cc.count(True)
    Accu = float((N / (aa*bb)) * 100)

    # true positive among all the detected positive
    Pre = (100 * TP) / (TP + FP + 1)
    # true positive among all the real positive
    Rec = (100 * TP) / (TP + FN + 1)
    # The F1 score is the harmonic average of the precision and recall,
    # where an F1 score reaches its best value at 1 (perfect precision and recall) and worst at 0.
    F1 = (2 * Pre * Rec) / (100 * (Pre + Rec + 1))
    # False positive rate
    FPR = (100 * FP) / (FP + TN+1)

    return Accu, Pre, Rec, F1, FPR, L_pre


def sample_detection(D_test, L_mb, tao):
    # sample-wise detection for one dimension

    aa = D_test.shape[0]
    bb = D_test.shape[1]

    D_test = D_test.reshape([aa, bb])
    L_mb = L_mb.reshape([aa, bb])
    L = np.sum(L_mb, 1)
    # NN = 0-10
    L[L > 0] = 1

    D_L = np.empty([aa, ])

    for i in range(aa):
        if np.mean(D_test[i, :]) > tao:
            # true/negative
            D_L[i] = 0
        else:
            # false/positive
            D_L[i] = 1

    cc = (D_L == L)
    # cc = list(cc)
    N = list(cc).count(True)
    Accu = float((N / (aa)) * 100)

    precision, recall, f1, _ = precision_recall_fscore_support(L, D_L, average='binary')

    return Accu, precision, recall, f1


def CUSUM_det(spe_n, spe_a, labels):

    mu = np.mean(spe_n)
    sigma = np.std(spe_n)

    kk = 3*sigma
    H = 15*sigma
    print('H:', H)

    tar = np.mean(spe_a)

    mm = spe_a.shape[0]

    SH = np.empty([mm, ])
    SL = np.empty([mm, ])

    for i in range(mm):
        SH[-1] = 0
        SL[-1] = 0
        SH[i] = max(0, SH[i-1]+spe_a[i]-(tar+kk))
        SL[i] = min(0, SL[i-1]+spe_a[i]-(tar-kk))


    count = np.empty([mm, ])
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    for i in range(mm):
        A = SH[i]
        B = SL[i]
        AA = H
        BB = -H
        if A <= AA and B >= BB:
            count[i] = 0
        else:
            count[i] = 1

        C = count[i]
        D = labels[i]
        if C == 1 and D == 1:
            TP += 1
        elif C == 1 and D == 0:
            FP += 1
        elif C == 0 and D == 0:
            TN += 1
        elif C == 0 and D == 1:
            FN += 1

    cc = (count == labels)
    # cc = list(cc)
    N = list(cc).count(True)
    Accu = float((N / (mm)) * 100)

    # true positive among all the detected positive
    Pre = (100 * TP) / (TP + FP + 1)
    # true positive among all the real positive
    Rec = (100 * TP) / (TP + FN)
    # The F1 score is the harmonic average of the precision and recall,
    # where an F1 score reaches its best value at 1 (perfect precision and recall) and worst at 0.
    F1 = (2 * Pre * Rec) / (100 * (Pre + Rec + 1))
    # False positive rate
    FPR = (100 * FP) / (FP + TN)

    return Accu, Pre, Rec, F1, FPR


def SPE(X, pc):
    a = X.shape[0]
    b = X.shape[1]

    spe = np.empty([a])
    # Square Prediction Error (square of residual distance)
    #  spe = X'(I-PP')X
    I = np.identity(b, float) - np.matmul(pc.transpose(1, 0), pc)
    # I = np.matmul(I, I)
    for i in range(a):
        x = X[i, :].reshape([b, 1])
        y = np.matmul(x.transpose(1, 0), I)
        spe[i] = np.matmul(y, x)

    return spe


def generator_o(z, hidden_units_g, seq_length, batch_size, num_generated_features, reuse=False, parameters=None, learn_scale=True):
    """
    If parameters are supplied, initialise as such
    """
    # It is important to specify different variable scopes for the LSTM cells.
    with tf.variable_scope("generator_o") as scope:

        W_out_G_initializer = tf.constant_initializer(value=parameters['generator/W_out_G:0'])
        b_out_G_initializer = tf.constant_initializer(value=parameters['generator/b_out_G:0'])
        try:
            scale_out_G_initializer = tf.constant_initializer(value=parameters['generator/scale_out_G:0'])
        except KeyError:
            scale_out_G_initializer = tf.constant_initializer(value=1)
            assert learn_scale
        lstm_initializer = tf.constant_initializer(value=parameters['generator/rnn/lstm_cell/weights:0'])
        bias_start = parameters['generator/rnn/lstm_cell/biases:0']

        W_out_G = tf.get_variable(name='W_out_G', shape=[hidden_units_g, num_generated_features], initializer=W_out_G_initializer)
        b_out_G = tf.get_variable(name='b_out_G', shape=num_generated_features, initializer=b_out_G_initializer)
        scale_out_G = tf.get_variable(name='scale_out_G', shape=1, initializer=scale_out_G_initializer, trainable=False)

        inputs = z

        cell = LSTMCell(num_units=hidden_units_g,
                        state_is_tuple=True,
                        initializer=lstm_initializer,
                        bias_start=bias_start,
                        reuse=reuse)
        rnn_outputs, rnn_states = tf.nn.dynamic_rnn(
            cell=cell,
            dtype=tf.float32,
            sequence_length=[seq_length] * batch_size,
            inputs=inputs)
        rnn_outputs_2d = tf.reshape(rnn_outputs, [-1, hidden_units_g])
        logits_2d = tf.matmul(rnn_outputs_2d, W_out_G) + b_out_G #out put weighted sum
        output_2d = tf.nn.tanh(logits_2d) # logits operation [-1, 1]
        output_3d = tf.reshape(output_2d, [-1, seq_length, num_generated_features])
    return output_3d


def discriminator_o(x, hidden_units_d, reuse=False, parameters=None):

    with tf.variable_scope("discriminator_0") as scope:

        W_out_D_initializer = tf.constant_initializer(value=parameters['discriminator/W_out_D:0'])
        b_out_D_initializer = tf.constant_initializer(value=parameters['discriminator/b_out_D:0'])

        W_out_D = tf.get_variable(name='W_out_D', shape=[hidden_units_d, 1],  initializer=W_out_D_initializer)
        b_out_D = tf.get_variable(name='b_out_D', shape=1, initializer=b_out_D_initializer)


        inputs = x

        cell = tf.contrib.rnn.LSTMCell(num_units=hidden_units_d, state_is_tuple=True, reuse=reuse)

        rnn_outputs, rnn_states = tf.nn.dynamic_rnn(cell=cell, dtype=tf.float32, inputs=inputs)


        logits = tf.einsum('ijk,km', rnn_outputs, W_out_D) + b_out_D # output weighted sum

        output = tf.nn.sigmoid(logits) # y = 1 / (1 + exp(-x)). output activation [0, 1]. Probability??
        # sigmoid output ([0,1]), Probability?

    return output, logits


def invert(settings, samples, para_path, g_tolerance=None, e_tolerance=0.1,
           n_iter=None, max_iter=10000, heuristic_sigma=None):
    """
    Return the latent space points corresponding to a set of a samples
    ( from gradient descent )
    Note: this function is designed for ONE sample generation
    """
    # num_samples = samples.shape[0]
    # cast samples to float32

    samples = np.float32(samples)

    # get the model
    # if settings is a string, assume it's an identifier and load
    if type(settings) == str:
        settings = json.load(open('./experiments/settings/' + settings + '.txt', 'r'))


    # print('Inverting', 1, 'samples using model', settings['identifier'], 'at epoch', epoch,)
    # if not g_tolerance is None:
    #     print('until gradient norm is below', g_tolerance)
    # else:
    #     print('until error is below', e_tolerance)


    # get parameters
    parameters = model.load_parameters(para_path)
    # # assertions
    # assert samples.shape[2] == settings['num_generated_features']
    # create VARIABLE Z
    Z = tf.get_variable(name='Z', shape=[1, settings['seq_length'],
                                         settings['latent_dim']],
                        initializer=tf.random_normal_initializer())
    # create outputs

    G_samples = generator_o(Z, settings['hidden_units_g'], settings['seq_length'],
                          1, settings['num_generated_features'],
                          reuse=False, parameters=parameters)
    # generator_vars = ['hidden_units_g', 'seq_length', 'batch_size', 'num_generated_features', 'cond_dim', 'learn_scale']
    # generator_settings = dict((k, settings[k]) for k in generator_vars)
    # G_samples = model.generator(Z, **generator_settings, reuse=True)

    fd = None

    # define loss mmd-based loss
    if heuristic_sigma is None:
        heuristic_sigma = mmd.median_pairwise_distance_o(samples)  # this is noisy
        print('heuristic_sigma:', heuristic_sigma)
    samples = tf.reshape(samples, [1, settings['seq_length'], settings['num_generated_features']])
    Kxx, Kxy, Kyy, wts = mmd._mix_rbf_kernel(G_samples, samples, sigmas=tf.constant(value=heuristic_sigma, shape=(1, 1)))
    similarity_per_sample = tf.diag_part(Kxy)
    reconstruction_error_per_sample = 1 - similarity_per_sample
    # reconstruction_error_per_sample = tf.reduce_sum((tf.nn.l2_normalize(G_samples, dim=1) - tf.nn.l2_normalize(samples, dim=1))**2, axis=[1,2])
    similarity = tf.reduce_mean(similarity_per_sample)
    reconstruction_error = 1 - similarity
    # updater
    #    solver = tf.train.AdamOptimizer().minimize(reconstruction_error_per_sample, var_list=[Z])
    # solver = tf.train.RMSPropOptimizer(learning_rate=500).minimize(reconstruction_error, var_list=[Z])
    solver = tf.train.RMSPropOptimizer(learning_rate=0.1).minimize(reconstruction_error_per_sample, var_list=[Z])
    # solver = tf.train.MomentumOptimizer(learning_rate=0.1, momentum=0.9).minimize(reconstruction_error_per_sample, var_list=[Z])

    grad_Z = tf.gradients(reconstruction_error_per_sample, Z)[0]
    grad_per_Z = tf.norm(grad_Z, axis=(1, 2))
    grad_norm = tf.reduce_mean(grad_per_Z)
    # solver = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(reconstruction_error, var_list=[Z])
    print('Finding latent state corresponding to samples...')

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        error = sess.run(reconstruction_error, feed_dict=fd)
        g_n = sess.run(grad_norm, feed_dict=fd)
        # print(g_n)
        i = 0
        if not n_iter is None:
            while i < n_iter:
                _ = sess.run(solver, feed_dict=fd)
                error = sess.run(reconstruction_error, feed_dict=fd)
                i += 1
        else:
            if not g_tolerance is None:
                while g_n > g_tolerance:
                    _ = sess.run(solver, feed_dict=fd)
                    error, g_n = sess.run([reconstruction_error, grad_norm], feed_dict=fd)
                    i += 1
                    print(error, g_n)
                    if i > max_iter:
                        break
            else:
                while np.abs(error) > e_tolerance:
                    _ = sess.run(solver, feed_dict=fd)
                    error = sess.run(reconstruction_error, feed_dict=fd)
                    i += 1
                    # print(error)
                    if i > max_iter:
                        break
        Zs = sess.run(Z, feed_dict=fd)
        Gs = sess.run(G_samples, feed_dict={Z: Zs})
        error_per_sample = sess.run(reconstruction_error_per_sample, feed_dict=fd)
        print('Z found in', i, 'iterations with final reconstruction error of', error)
    tf.reset_default_graph()

    return Gs, Zs, error_per_sample, heuristic_sigma


def dis_trained_model(settings, samples, para_path):
    """
    Return the discrimination results of  num_samples testing samples from a trained model described by settings dict
    Note: this function is designed for ONE sample discrimination
    """

    # if settings is a string, assume it's an identifier and load
    if type(settings) == str:
        settings = json.load(open('./experiments/settings/' + settings + '.txt', 'r'))

    num_samples = samples.shape[0]
    samples = np.float32(samples)
    num_variables = samples.shape[2]
    # samples = np.reshape(samples, [1, settings['seq_length'], settings['num_generated_features']])

    # get the parameters, get other variables
    # parameters = model.load_parameters(settings['sub_id'] + '_' + str(settings['seq_length']) + '_' + str(epoch))
    parameters = model.load_parameters(para_path)
    # settings['sub_id'] + '_' + str(settings['seq_length']) + '_' + str(epoch)

    # create placeholder, T samples
    # T = tf.placeholder(tf.float32, [settings['batch_size'], settings['seq_length'], settings['num_generated_features']])

    T = tf.placeholder(tf.float32, [num_samples, settings['seq_length'], num_variables])

    # create the discriminator (GAN)
    # normal GAN
    D_t, L_t = discriminator_o(T, settings['hidden_units_d'], reuse=False, parameters=parameters)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # with tf.device('/gpu:1'):
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        D_t, L_t = sess.run([D_t, L_t], feed_dict={T: samples})

    tf.reset_default_graph()
    return D_t, L_t

def dis_D_model(settings, samples, para_path):
    """
    Return the discrimination results of  num_samples testing samples from a trained model described by settings dict
    Note: this function is designed for ONE sample discrimination
    """

    # if settings is a string, assume it's an identifier and load
    if type(settings) == str:
        settings = json.load(open('./experiments/settings/' + settings + '.txt', 'r'))

    # num_samples = samples.shape[0]
    samples = np.float32(samples)
    samples = np.reshape(samples, [1, settings['seq_length'], settings['num_generated_features']])

    # get the parameters, get other variables
    parameters = model.load_parameters(para_path)
    # create placeholder, T samples

    T = tf.placeholder(tf.float32, [1, settings['seq_length'], settings['num_generated_features']])

    # create the discriminator (GAN or CGAN)
    # normal GAN
    D_t, L_t = discriminator_o(T, settings['hidden_units_d'], reuse=False, parameters=parameters)
    # D_t, L_t = model.discriminator(T, settings['hidden_units_d'], settings['seq_length'], num_samples, reuse=False,
    #               parameters=parameters, cond_dim=0, c=None, batch_mean=False)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        D_t, L_t = sess.run([D_t, L_t], feed_dict={T: samples})

    tf.reset_default_graph()
    return D_t, L_t

================================================
FILE: README.md
================================================
# -- Multivariate Anomaly Detection for Time Series Data with GANs -- #

# MAD-GAN

This repository contains code for the paper, _[MAD-GAN: Multivariate Anomaly Detection for Time Series Data with Generative Adversarial Networks](https://arxiv.org/pdf/1901.04997.pdf)_, by Dan Li, Dacheng Chen, Jonathan Goh, and See-Kiong Ng.

MAD-GAN is a refined version of GAN-AD at _[Anomaly Detection with Generative Adversarial Networks for Multivariate Time Series](https://arxiv.org/pdf/1809.04758.pdf)_ The code can be found at https://github.com/LiDan456/GAN-AD

(We are still working on this topic, will upload the completed version later...)

## Overview

We used generative adversarial networks (GANs) to do anomaly detection for time series data.
The GAN framework was **R**GAN, whihc was taken from the paper, _[Real-valued (Medical) Time Series Generation with Recurrent Conditional GANs](https://arxiv.org/abs/1706.02633).
Please refer to https://github.com/ratschlab/RGAN for the original code.

## Quickstart

- Python3

- Please unpack the data.7z file in the data folder before run RGAN.py and AD.py

- To train the model:
  
  """ python RGAN.py --settings_file kdd99 """

- To do anomaly detection:

  """ python AD.py --settings_file kdd99_test"""
  
  """ python AD_Invert.py --settings_file kdd99_test"""

## Data

We apply our method on the SWaT and WADI datasets in the paper, however, we didn't upload the data in this repository. Please refer to https://itrust.sutd.edu.sg/ and send request to iTrust is you want to try the data.

In this repository we used kdd cup 1999 dataset as an example (please unpack the data.7z file in the data folder before run RGAN.py and AD.py). You can also down load the original data at http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html


================================================
FILE: RGAN.py
================================================
import numpy as np
import tensorflow as tf
import pdb
import random
import json
from scipy.stats import mode

import data_utils
import plotting
import model
import utils
import eval
import DR_discriminator

from time import time
from math import floor
from mmd import rbf_mmd2, median_pairwise_distance, mix_rbf_mmd2_and_ratio

begin = time()

tf.logging.set_verbosity(tf.logging.ERROR)

# --- get settings --- #
# parse command line arguments, or use defaults
parser = utils.rgan_options_parser()
settings = vars(parser.parse_args())
# if a settings file is specified, it overrides command line arguments/defaults
if settings['settings_file']: settings = utils.load_settings_from_file(settings)

# --- get data, split --- #
# samples, pdf, labels = data_utils.get_data(settings)
data_path = './experiments/data/' + settings['data_load_from'] + '.data.npy'
print('Loading data from', data_path)
settings["eval_an"] = False
settings["eval_single"] = False
samples, labels, index = data_utils.get_data(settings["data"], settings["seq_length"], settings["seq_step"],
                                             settings["num_signals"], settings['sub_id'], settings["eval_single"],
                                             settings["eval_an"], data_path)
print('samples_size:',samples.shape)
# -- number of variables -- #
num_variables = samples.shape[2]
print('num_variables:', num_variables)
# --- save settings, data --- #
print('Ready to run with settings:')
for (k, v) in settings.items(): print(v, '\t', k)
# add the settings to local environment
# WARNING: at this point a lot of variables appear
locals().update(settings)
json.dump(settings, open('./experiments/settings/' + identifier + '.txt', 'w'), indent=0)

# --- build model --- #
# preparation: data placeholders and model parameters
Z, X, T = model.create_placeholders(batch_size, seq_length, latent_dim, num_variables)
discriminator_vars = ['hidden_units_d', 'seq_length', 'batch_size', 'batch_mean']
discriminator_settings = dict((k, settings[k]) for k in discriminator_vars)
generator_vars = ['hidden_units_g', 'seq_length', 'batch_size', 'learn_scale']
generator_settings = dict((k, settings[k]) for k in generator_vars)
generator_settings['num_signals'] = num_variables

# model: GAN losses
D_loss, G_loss = model.GAN_loss(Z, X, generator_settings, discriminator_settings)
D_solver, G_solver, priv_accountant = model.GAN_solvers(D_loss, G_loss, learning_rate, batch_size,
                                                        total_examples=samples.shape[0],
                                                        l2norm_bound=l2norm_bound,
                                                        batches_per_lot=batches_per_lot, sigma=dp_sigma, dp=dp)
# model: generate samples for visualization
G_sample = model.generator(Z, **generator_settings, reuse=True)


# # --- evaluation settings--- #
#
# # frequency to do visualisations
# num_samples = samples.shape[0]
# vis_freq = max(6600 // num_samples, 1)
# eval_freq = max(6600// num_samples, 1)
#
# # get heuristic bandwidth for mmd kernel from evaluation samples
# heuristic_sigma_training = median_pairwise_distance(samples)
# best_mmd2_so_far = 1000
#
# # optimise sigma using that (that's t-hat)
# batch_multiplier = 5000 // batch_size
# eval_size = batch_multiplier * batch_size
# eval_eval_size = int(0.2 * eval_size)
# eval_real_PH = tf.placeholder(tf.float32, [eval_eval_size, seq_length, num_generated_features])
# eval_sample_PH = tf.placeholder(tf.float32, [eval_eval_size, seq_length, num_generated_features])
# n_sigmas = 2
# sigma = tf.get_variable(name='sigma', shape=n_sigmas, initializer=tf.constant_initializer(
#     value=np.power(heuristic_sigma_training, np.linspace(-1, 3, num=n_sigmas))))
# mmd2, that = mix_rbf_mmd2_and_ratio(eval_real_PH, eval_sample_PH, sigma)
# with tf.variable_scope("SIGMA_optimizer"):
#     sigma_solver = tf.train.RMSPropOptimizer(learning_rate=0.05).minimize(-that, var_list=[sigma])
#     # sigma_solver = tf.train.AdamOptimizer().minimize(-that, var_list=[sigma])
#     # sigma_solver = tf.train.AdagradOptimizer(learning_rate=0.1).minimize(-that, var_list=[sigma])
# sigma_opt_iter = 2000
# sigma_opt_thresh = 0.001
# sigma_opt_vars = [var for var in tf.global_variables() if 'SIGMA_optimizer' in var.name]


# --- run the program --- #
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
# sess = tf.Session()
sess.run(tf.global_variables_initializer())

# # -- plot the real samples -- #
vis_real_indices = np.random.choice(len(samples), size=16)
vis_real = np.float32(samples[vis_real_indices, :, :])
plotting.save_plot_sample(vis_real, 0, identifier + '_real', n_samples=16, num_epochs=num_epochs)
plotting.save_samples_real(vis_real, identifier)

# --- train --- #
train_vars = ['batch_size', 'D_rounds', 'G_rounds', 'use_time', 'seq_length', 'latent_dim']
train_settings = dict((k, settings[k]) for k in train_vars)
train_settings['num_signals'] = num_variables

t0 = time()
MMD = np.zeros([num_epochs, ])

for epoch in range(num_epochs):
# for epoch in range(1):
    # -- train epoch -- #
    D_loss_curr, G_loss_curr = model.train_epoch(epoch, samples, labels, sess, Z, X, D_loss, G_loss,
                                                 D_solver, G_solver, **train_settings)

    # # -- eval -- #
    # # visualise plots of generated samples, with/without labels
    # # choose which epoch to visualize
    #
    # # random input vectors for the latent space, as the inputs of generator
    # vis_ZZ = model.sample_Z(batch_size, seq_length, latent_dim, use_time)
    #
    # # # -- generate samples-- #
    # vis_sample = sess.run(G_sample, feed_dict={Z: vis_ZZ})
    # # # -- visualize the generated samples -- #
    # plotting.save_plot_sample(vis_sample, epoch, identifier, n_samples=16, num_epochs=None, ncol=4)
    # # plotting.save_plot_sample(vis_sample, 0, identifier + '_real', n_samples=16, num_epochs=num_epochs)
    # # # save the generated samples in cased they might be useful for comparison
    # plotting.save_samples(vis_sample, identifier, epoch)

    # -- print -- #
    print('epoch, D_loss_curr, G_loss_curr, seq_length')
    print('%d\t%.4f\t%.4f\t%d' % (epoch, D_loss_curr, G_loss_curr, seq_length))

    # # -- compute mmd2 and if available, prob density -- #
    # if epoch % eval_freq == 0:
    #     # how many samples to evaluate with?
    #     eval_Z = model.sample_Z(eval_size, seq_length, latent_dim, use_time)
    #     eval_sample = np.empty(shape=(eval_size, seq_length, num_signals))
    #     for i in range(batch_multiplier):
    #         eval_sample[i * batch_size:(i + 1) * batch_size, :, :] = sess.run(G_sample, feed_dict={ Z: eval_Z[i * batch_size:(i + 1) * batch_size]})
    #     eval_sample = np.float32(eval_sample)
    #     eval_real = np.float32(samples['vali'][np.random.choice(len(samples['vali']), size=batch_multiplier * batch_size), :, :])
    #
    #     eval_eval_real = eval_real[:eval_eval_size]
    #     eval_test_real = eval_real[eval_eval_size:]
    #     eval_eval_sample = eval_sample[:eval_eval_size]
    #     eval_test_sample = eval_sample[eval_eval_size:]
    #
    #     # MMD
    #     # reset ADAM variables
    #     sess.run(tf.initialize_variables(sigma_opt_vars))
    #     sigma_iter = 0
    #     that_change = sigma_opt_thresh * 2
    #     old_that = 0
    #     while that_change > sigma_opt_thresh and sigma_iter < sigma_opt_iter:
    #         new_sigma, that_np, _ = sess.run([sigma, that, sigma_solver],
    #                                          feed_dict={eval_real_PH: eval_eval_real, eval_sample_PH: eval_eval_sample})
    #         that_change = np.abs(that_np - old_that)
    #         old_that = that_np
    #         sigma_iter += 1
    #     opt_sigma = sess.run(sigma)
    #     try:
    #         mmd2, that_np = sess.run(mix_rbf_mmd2_and_ratio(eval_test_real, eval_test_sample, biased=False, sigmas=sigma))
    #     except ValueError:
    #         mmd2 = 'NA'
    #         that = 'NA'
    #
    #     MMD[epoch, ] = mmd2

    # -- save model parameters -- #
    model.dump_parameters(sub_id + '_' + str(seq_length) + '_' + str(epoch), sess)

np.save('./experiments/plots/gs/' + identifier + '_' + 'MMD.npy', MMD)

end = time() - begin
print('Training terminated | Training time=%d s' %(end) )

print("Training terminated | training time = %ds  " % (time() - begin))

================================================
FILE: data_utils.py
================================================
import numpy as np
import pandas as pd
import pdb
import re
from time import time
import json
import random

import model

from scipy.spatial.distance import pdist, squareform
from scipy.stats import multivariate_normal, invgamma, mode
from scipy.special import gamma
from scipy.misc.pilutil import imresize
from functools import partial
from math import ceil

from sklearn.metrics.pairwise import rbf_kernel
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing


# --- deal with the SWaT data --- #
def swat(seq_length, seq_step, num_signals, randomize=False):
    """ Load and serialise """
    # train = np.load('./data/swat.npy')
    # print('Loaded swat from .npy')
    train = np.loadtxt(open('./data/swat.csv'), delimiter=',')
    print('Loaded swat from .csv')
    m, n = train.shape # m=496800, n=52
    for i in range(n - 1):
        A = max(train[:, i])
        if A != 0:
            train[:, i] /= max(train[:, i])
            # scale from -1 to 1
            train[:, i] = 2 * train[:, i] - 1
        else:
            train[:, i] = train[:, i]

    samples = train[21600:, 0:n-1]
    labels = train[21600:, n-1]    # the last colummn is label
    #############################
    # -- choose variable for uni-variate GAN-AD -- #
    # samples = samples[:, [1, 8, 18, 28]]
    ############################
    # -- apply PCA dimension reduction for multi-variate GAN-AD -- #
    from sklearn.decomposition import PCA
    # ALL SENSORS IDX
    # XS = [0, 1, 5, 6, 7, 8, 16, 17, 18, 25, 26, 27, 28, 33, 34, 35, 36, 37, 38, 39, 40, 41, 44, 45, 46, 47]
    # X_n = samples[:, XS]
    # X_a = samples_a[:, XS]
    # All VARIABLES
    X_n = samples
    ####################################
    ###################################
    # -- the best PC dimension is chosen pc=5 -- #
    n_components = num_signals
    pca = PCA(n_components, svd_solver='full')
    pca.fit(X_n)
    ex_var = pca.explained_variance_ratio_
    pc = pca.components_

    # projected values on the principal component
    T_n = np.matmul(X_n, pc.transpose(1, 0))
    samples = T_n

    # # only for one-dimensional
    # samples = T_n.reshape([samples.shape[0], ])
    ###########################################
    ###########################################
    # seq_length = 7200
    num_samples = (samples.shape[0]-seq_length)//seq_step
    print("num_samples:", num_samples)
    print("num_signals:", num_signals)
    aa = np.empty([num_samples, seq_length, num_signals])
    bb = np.empty([num_samples, seq_length, 1])

    for j in range(num_samples):
       bb[j, :, :] = np.reshape(labels[(j * seq_step):(j * seq_step + seq_length)], [-1,1])
       for i in range(num_signals):
           aa[j, :, i] = samples[(j * seq_step):(j*seq_step + seq_length), i]

    # samples = aa[:, 0:7200:200, :]
    # labels = bb[:, 0:7200:200, :]
    samples = aa
    labels = bb

    return samples, labels

def swat_birgan(seq_length, seq_step, num_signals, randomize=False):
    """ Load and serialise """
    # train = np.load('./data/swat.npy')
    # print('Loaded swat from .npy')
    train = np.loadtxt(open('./data/swat.csv'), delimiter=',')
    print('Loaded swat from .csv')
    m, n = train.shape # m=496800, n=52
    for i in range(n - 1):
        A = max(train[:, i])
        if A != 0:
            train[:, i] /= max(train[:, i])
            # scale from -1 to 1
            train[:, i] = 2 * train[:, i] - 1
        else:
            train[:, i] = train[:, i]

    samples = train[21600:, 0:n-1]
    labels = train[21600:, n-1]    # the last colummn is label
    #############################
    # # -- choose variable for uni-variate GAN-AD -- #
    # # samples = samples[:, [1, 8, 18, 28]]
    ###########################################
    ###########################################
    nn = samples.shape[1]
    num_samples = (samples.shape[0]-seq_length)//seq_step
    aa = np.empty([num_samples, nn, nn])
    AA = np.empty([seq_length, nn])
    bb = np.empty([num_samples, seq_length, 1])

    print('Pre-process training data...')
    for j in range(num_samples):
       # display batch progress
       model_bigan.display_batch_progression(j, num_samples)
       bb[j, :, :] = np.reshape(labels[(j * seq_step):(j * seq_step + seq_length)], [-1,1])
       for i in range(nn):
           AA[:, i] = samples[(j * seq_step):(j * seq_step + seq_length), i]
       aa[j, :, :] = np.cov(AA.T)

    samples = aa
    labels = bb

    return samples, labels

def swat_test(seq_length, seq_step, num_signals, randomize=False):
    """ Load and serialise """
    # test = np.load('./data/swat_a.npy')
    # print('Loaded swat_a from .npy')
    test = np.loadtxt(open('./data/swat_a.csv'), delimiter=',')
    print('Loaded swat_a from .csv')
    m, n = test.shape  # m1=449919, n1=52
    for i in range(n - 1):
        B = max(test[:, i])
        if B != 0:
            test[:, i] /= max(test[:, i])
            # scale from -1 to 1
            test[:, i] = 2 * test[:, i] - 1
        else:
            test[:, i] = test[:, i]

    samples = test[:, 0:n - 1]
    labels = test[:, n - 1]
    idx = np.asarray(list(range(0, m)))  # record the idx of each point
    #############################
    # -- choose variable for uni-variate GAN-AD -- #
    # samples = samples[:, [1,2,3,4]]
    # samples_a = samples_a[:, [1,2,3,4]]
    ############################
    ############################
    # -- apply PCA dimension reduction for multi-variate GAN-AD -- #
    from sklearn.decomposition import PCA
    import DR_discriminator as dr
    # ALL SENSORS IDX
    # XS = [0, 1, 5, 6, 7, 8, 16, 17, 18, 25, 26, 27, 28, 33, 34, 35, 36, 37, 38, 39, 40, 41, 44, 45, 46, 47]
    # X_n = samples[:, XS]
    # X_a = samples_a[:, XS]
    # All VARIABLES
    X_a = samples
    ####################################
    ###################################
    # -- the best PC dimension is chosen pc=5 -- #
    n_components = num_signals
    pca_a = PCA(n_components, svd_solver='full')
    pca_a.fit(X_a)
    pc_a = pca_a.components_
    # projected values on the principal component
    T_a = np.matmul(X_a, pc_a.transpose(1, 0))

    samples = T_a
    # # only for one-dimensional
    # samples = T_a.reshape([samples.shape[0], ])
    ###########################################
    ###########################################
    num_samples_t = (samples.shape[0] - seq_length) // seq_step
    aa = np.empty([num_samples_t, seq_length, num_signals])
    bb = np.empty([num_samples_t, seq_length, 1])
    bbb = np.empty([num_samples_t, seq_length, 1])

    for j in range(num_samples_t):
        bb[j, :, :] = np.reshape(labels[(j * seq_step):(j * seq_step + seq_length)], [-1, 1])
        bbb[j, :, :] = np.reshape(idx[(j * seq_step):(j * seq_step + seq_length)], [-1, 1])
        for i in range(num_signals):
            aa[j, :, i] = samples[(j * seq_step):(j * seq_step + seq_length), i]

    samples = aa
    labels = bb
    index = bbb

    return samples, labels, index


def swat_birgan_test(seq_length, seq_step, num_signals, randomize=False):
    """ Load and serialise """
    # train = np.load('./data/swat.npy')
    # print('Loaded swat from .npy')
    test = np.loadtxt(open('./data/swat_a.csv'), delimiter=',')
    print('Loaded swat_a from .csv')
    m, n = test.shape  # m1=449919, n1=52
    for i in range(n - 1):
        B = max(test[:, i])
        if B != 0:
            test[:, i] /= max(test[:, i])
            # scale from -1 to 1
            test[:, i] = 2 * test[:, i] - 1
        else:
            test[:, i] = test[:, i]

    samples = test[:, 0:n - 1]
    labels = test[:, n - 1]
    # idx = np.asarray(list(range(0, m)))  # record the idx of each point
    #############################
    # # -- choose variable for uni-variate GAN-AD -- #
    # # samples = samples[:, [1, 8, 18, 28]]
    ###########################################
    ###########################################
    nn = samples.shape[1]
    num_samples = (samples.shape[0]-seq_length)//seq_step
    aa = np.empty([num_samples, nn, nn])
    AA = np.empty([seq_length, nn])
    bb = np.empty([num_samples, seq_length, 1])

    print('Pre-process testing data...')
    for j in range(num_samples):
       # display batch progress
       model_bigan.display_batch_progression(j, num_samples)
       bb[j, :, :] = np.reshape(labels[(j * seq_step):(j * seq_step + seq_length)], [-1,1])
       for i in range(nn):
           AA[:, i] = samples[(j * seq_step):(j * seq_step + seq_length), i]
       aa[j, :, :] = np.cov(AA.T)

    samples = aa
    labels = bb

    return samples, labels


def wadi(seq_length, seq_step, num_signals, randomize=False):
    train = np.load('./data/wadi.npy')
    print('Loaded wadi from .npy')
    m, n = train.shape  # m=1048571, n=119
    for i in range(n-1):
        A = max(train[:, i])
        if A != 0:
            train[:, i] /= max(train[:, i])
            # scale from -1 to 1
            train[:, i] = 2 * train[:, i] - 1
        else:
            train[:, i] = train[:, i]

    samples = train[259200:, 0:n-1]  # normal
    labels = train[259200:, n-1]
    #############################
    samples = samples[:, [0, 3, 6, 17]]
    # samples = samples[:, 0]
    ############################
    # # -- apply PCA dimension reduction for multi-variate GAN-AD -- #
    # from sklearn.decomposition import PCA
    # import DR_discriminator as dr
    # X_n = samples
    # ####################################
    # ###################################
    # # -- the best PC dimension is chosen pc=8 -- #
    # n_components = num_signals
    # pca = PCA(n_components, svd_solver='full')
    # pca.fit(X_n)
    # pc = pca.components_
    # # projected values on the principal component
    # T_n = np.matmul(X_n, pc.transpose(1, 0))
    #
    # samples = T_n
    # # # only for one-dimensional
    # # samples = T_n.reshape([samples.shape[0], ])
    ###########################################
    ###########################################
    seq_length = 10800
    num_samples = (samples.shape[0] - seq_length) // seq_step
    print("num_samples:", num_samples)
    print("num_signals:", num_signals)
    aa = np.empty([num_samples, seq_length, num_signals])
    bb = np.empty([num_samples, seq_length, 1])

    for j in range(num_samples):
        bb[j, :, :] = np.reshape(labels[(j * seq_step):(j * seq_step + seq_length)], [-1, 1])
        # aa[j, :, :] = np.reshape(samples[(j * seq_step):(j * seq_step + seq_length)], [-1, 1])
        for i in range(num_signals):
            aa[j, :, i] = samples[(j * seq_step):(j * seq_step + seq_length), i]

    samples = aa[:, 0:10800:300, :]
    labels = bb[:, 0:10800:300, :]

    return samples, labels


def wadi_test(seq_length, seq_step, num_signals, randomize=False):
    test = np.load('./data/wadi_a.npy')
    print('Loaded wadi_a from .npy')
    m, n = test.shape  # m1=172801, n1=119

    for i in range(n - 1):
        B = max(test[:, i])
        if B != 0:
            test[:, i] /= max(test[:, i])
            # scale from -1 to 1
            test[:, i] = 2 * test[:, i] - 1
        else:
            test[:, i] = test[:, i]

    samples = test[:, 0:n - 1]
    labels = test[:, n - 1]
    idx = np.asarray(list(range(0, m)))  # record the idx of each point
    #############################
    ############################
    # -- apply PCA dimension reduction for multi-variate GAN-AD -- #
    from sklearn.decomposition import PCA
    import DR_discriminator as dr
    X_a = samples
    ####################################
    ###################################
    # -- the best PC dimension is chosen pc=8 -- #
    n_components = num_signals
    pca_a = PCA(n_components, svd_solver='full')
    pca_a.fit(X_a)
    pc_a = pca_a.components_
    # projected values on the principal component
    T_a = np.matmul(X_a, pc_a.transpose(1, 0))

    samples = T_a
    # # only for one-dimensional
    # samples = T_a.reshape([samples.shape[0], ])
    ###########################################
    ###########################################
    num_samples_t = (samples.shape[0] - seq_length) // seq_step
    aa = np.empty([num_samples_t, seq_length, num_signals])
    bb = np.empty([num_samples_t, seq_length, 1])
    bbb = np.empty([num_samples_t, seq_length, 1])

    for j in range(num_samples_t):
        bb[j, :, :] = np.reshape(labels[(j * 10):(j * seq_step + seq_length)], [-1, 1])
        bbb[j, :, :] = np.reshape(idx[(j * seq_step):(j * seq_step + seq_length)], [-1, 1])
        for i in range(num_signals):
            aa[j, :, i] = samples[(j * seq_step):(j * seq_step + seq_length), i]

    samples = aa
    labels = bb
    index = bbb

    return samples, labels, index

def kdd99(seq_length, seq_step, num_signals):
    train = np.load('./data/kdd99_train.npy')
    print('load kdd99_train from .npy')
    m, n = train.shape  # m=562387, n=35
    # normalization
    for i in range(n - 1):
        # print('i=', i)
        A = max(train[:, i])
        # print('A=', A)
        if A != 0:
            train[:, i] /= max(train[:, i])
            # scale from -1 to 1
            train[:, i] = 2 * train[:, i] - 1
        else:
            train[:, i] = train[:, i]

    samples = train[:, 0:n - 1]
    labels = train[:, n - 1]  # the last colummn is label
    #############################
    ############################
    # -- apply PCA dimension reduction for multi-variate GAN-AD -- #
    from sklearn.decomposition import PCA
    X_n = samples
    ####################################
    ###################################
    # -- the best PC dimension is chosen pc=6 -- #
    n_components = num_signals
    pca = PCA(n_components, svd_solver='full')
    pca.fit(X_n)
    ex_var = pca.explained_variance_ratio_
    pc = pca.components_
    # projected values on the principal component
    T_n = np.matmul(X_n, pc.transpose(1, 0))
    samples = T_n
    # # only for one-dimensional
    # samples = T_n.reshape([samples.shape[0], ])
    ###########################################
    ###########################################
    num_samples = (samples.shape[0] - seq_length) // seq_step
    aa = np.empty([num_samples, seq_length, num_signals])
    bb = np.empty([num_samples, seq_length, 1])

    for j in range(num_samples):
        bb[j, :, :] = np.reshape(labels[(j * seq_step):(j * seq_step + seq_length)], [-1, 1])
        for i in range(num_signals):
            aa[j, :, i] = samples[(j * seq_step):(j * seq_step + seq_length), i]

    samples = aa
    labels = bb

    return samples, labels

def kdd99_test(seq_length, seq_step, num_signals):
    test = np.load('./data/kdd99_test.npy')
    print('load kdd99_test from .npy')

    m, n = test.shape  # m1=494021, n1=35

    for i in range(n - 1):
        B = max(test[:, i])
        if B != 0:
            test[:, i] /= max(test[:, i])
            # scale from -1 to 1
            test[:, i] = 2 * test[:, i] - 1
        else:
            test[:, i] = test[:, i]

    samples = test[:, 0:n - 1]
    labels = test[:, n - 1]
    idx = np.asarray(list(range(0, m)))  # record the idx of each point
    #############################
    ############################
    # -- apply PCA dimension reduction for multi-variate GAN-AD -- #
    from sklearn.decomposition import PCA
    import DR_discriminator as dr
    X_a = samples
    ####################################
    ###################################
    # -- the best PC dimension is chosen pc=6 -- #
    n_components = num_signals
    pca_a = PCA(n_components, svd_solver='full')
    pca_a.fit(X_a)
    pc_a = pca_a.components_
    # projected values on the principal component
    T_a = np.matmul(X_a, pc_a.transpose(1, 0))
    samples = T_a
    # # only for one-dimensional
    # samples = T_a.reshape([samples.shape[0], ])
    ###########################################
    ###########################################
    num_samples_t = (samples.shape[0] - seq_length) // seq_step
    aa = np.empty([num_samples_t, seq_length, num_signals])
    bb = np.empty([num_samples_t, seq_length, 1])
    bbb = np.empty([num_samples_t, seq_length, 1])

    for j in range(num_samples_t):
        bb[j, :, :] = np.reshape(labels[(j * seq_step):(j * seq_step + seq_length)], [-1, 1])
        bbb[j, :, :] = np.reshape(idx[(j * seq_step):(j * seq_step + seq_length)], [-1, 1])
        for i in range(num_signals):
            aa[j, :, i] = samples[(j * seq_step):(j * seq_step + seq_length), i]

    samples = aa
    labels = bb
    index = bbb

    return samples, labels, index


# ############################ data pre-processing #################################
# --- to do with loading --- #
# --- to do with loading --- #
def get_samples_and_labels(settings):
    """
    Parse settings options to load or generate correct type of data,
    perform test/train split as necessary, and reform into 'samples' and 'labels'
    dictionaries.
    """
    if settings['data_load_from']:
        data_path = './experiments/data/' + settings['data_load_from'] + '.data.npy'
        print('Loading data from', data_path)
        samples, pdf, labels = get_data('load', data_path)
        train, vali, test = samples['train'], samples['vali'], samples['test']
        train_labels, vali_labels, test_labels = labels['train'], labels['vali'], labels['test']
        del samples, labels
    else:
        # generate the data
        data_vars = ['num_samples', 'num_samples_t','seq_length', 'seq_step', 'num_signals', 'freq_low',
                'freq_high', 'amplitude_low', 'amplitude_high', 'scale', 'full_mnist']
        data_settings = dict((k, settings[k]) for k in data_vars if k in settings.keys())
        samples, pdf, labels = get_data(settings['data'], settings['seq_length'], settings['seq_step'], settings['num_signals'], settings['sub_id'])
        if 'multivariate_mnist' in settings and settings['multivariate_mnist']:
            seq_length = samples.shape[1]
            samples = samples.reshape(-1, int(np.sqrt(seq_length)), int(np.sqrt(seq_length)))
        if 'normalise' in settings and settings['normalise']: # TODO this is a mess, fix
            print(settings['normalise'])
            norm = True
        else:
            norm = False
        if labels is None:
            train, vali, test = split(samples, [0.6, 0.2, 0.2], normalise=norm)
            train_labels, vali_labels, test_labels = None, None, None
        else:
            train, vali, test, labels_list = split(samples, [0.6, 0.2, 0.2], normalise=norm, labels=labels)
            train_labels, vali_labels, test_labels = labels_list

    labels = dict()
    labels['train'], labels['vali'], labels['test'] = train_labels, vali_labels, test_labels

    samples = dict()
    samples['train'], samples['vali'], samples['test'] = train, vali, test

    # futz around with labels
    # TODO refactor cause this is messy
    if 'one_hot' in settings and settings['one_hot'] and not settings['data_load_from']:
        if len(labels['train'].shape) == 1:
            # ASSUME labels go from 0 to max_val inclusive, find max-val
            max_val = int(np.max([labels['train'].max(), labels['test'].max(), labels['vali'].max()]))
            # now we have max_val + 1 dimensions
            print('Setting cond_dim to', max_val + 1, 'from', settings['cond_dim'])
            settings['cond_dim'] = max_val + 1
            print('Setting max_val to 1 from', settings['max_val'])
            settings['max_val'] = 1

            labels_oh = dict()
            for (k, v) in labels.items():
                A = np.zeros(shape=(len(v), settings['cond_dim']))
                A[np.arange(len(v)), (v).astype(int)] = 1
                labels_oh[k] = A
            labels = labels_oh
        else:
            assert settings['max_val'] == 1
            # this is already one-hot!

    if 'predict_labels' in settings and settings['predict_labels']:
        samples, labels = data_utils.make_predict_labels(samples, labels)
        print('Setting cond_dim to 0 from', settings['cond_dim'])
        settings['cond_dim'] = 0

    # update the settings dictionary to update erroneous settings
    # (mostly about the sequence length etc. - it gets set by the data!)
    settings['seq_length'] = samples['train'].shape[1]
    settings['num_samples'] = samples['train'].shape[0] + samples['vali'].shape[0] + samples['test'].shape[0]
    settings['num_signals'] = samples['train'].shape[2]

    return samples, pdf, labels


def get_data(data_type, seq_length, seq_step, num_signals, sub_id, eval_single, eval_an, data_options=None):
    """
    Helper/wrapper function to get the requested data.
    """
    print('data_type')
    labels = None
    index = None
    if data_type == 'load':
        data_dict = np.load(data_options).item()
        samples = data_dict['samples']
        pdf = data_dict['pdf']
        labels = data_dict['labels']
    elif data_type == 'swat':
        samples, labels = swat(seq_length, seq_step, num_signals)
    elif data_type == 'swat_test':
        samples, labels, index = swat_test(seq_length, seq_step, num_signals)
    elif data_type == 'kdd99':
        samples, labels = kdd99(seq_length, seq_step, num_signals)
    elif data_type == 'kdd99_test':
        samples, labels, index = kdd99_test(seq_length, seq_step, num_signals)
    elif data_type == 'wadi':
        samples, labels = wadi(seq_length, seq_step, num_signals)
    elif data_type == 'wadi_test':
        samples, labels, index = wadi_test(seq_length, seq_step, num_signals)
    else:
        raise ValueError(data_type)
    print('Generated/loaded', len(samples), 'samples from data-type', data_type)
    return samples, labels, index


def get_batch(samples, batch_size, batch_idx, labels=None):
    start_pos = batch_idx * batch_size
    end_pos = start_pos + batch_size
    if labels is None:
        return samples[start_pos:end_pos], None
    else:
        if type(labels) == tuple: # two sets of labels
            assert len(labels) == 2
            return samples[start_pos:end_pos], labels[0][start_pos:end_pos], labels[1][start_pos:end_pos]
        else:
            assert type(labels) == np.ndarray
            return samples[start_pos:end_pos], labels[start_pos:end_pos]


def split(samples, proportions, normalise=False, scale=False, labels=None, random_seed=None):
    """
    Return train/validation/test split.
    """
    if random_seed != None:
        random.seed(random_seed)
        np.random.seed(random_seed)
    assert np.sum(proportions) == 1
    n_total = samples.shape[0]
    n_train = ceil(n_total * proportions[0])
    n_test = ceil(n_total * proportions[2])
    n_vali = n_total - (n_train + n_test)
    # permutation to shuffle the samples
    shuff = np.random.permutation(n_total)
    train_indices = shuff[:n_train]
    vali_indices = shuff[n_train:(n_train + n_vali)]
    test_indices = shuff[(n_train + n_vali):]
    # TODO when we want to scale we can just return the indices
    assert len(set(train_indices).intersection(vali_indices)) == 0
    assert len(set(train_indices).intersection(test_indices)) == 0
    assert len(set(vali_indices).intersection(test_indices)) == 0
    # split up the samples
    train = samples[train_indices]
    vali = samples[vali_indices]
    test = samples[test_indices]
    # apply the same normalisation scheme to all parts of the split
    if normalise:
        if scale: raise ValueError(normalise, scale)  # mutually exclusive
        train, vali, test = normalise_data(train, vali, test)
    elif scale:
        train, vali, test = scale_data(train, vali, test)
    if labels is None:
        return train, vali, test
    else:
        print('Splitting labels...')
        if type(labels) == np.ndarray:
            train_labels = labels[train_indices]
            vali_labels = labels[vali_indices]
            test_labels = labels[test_indices]
            labels_split = [train_labels, vali_labels, test_labels]
        elif type(labels) == dict:
            # more than one set of labels!  (weird case)
            labels_split = dict()
            for (label_name, label_set) in labels.items():
                train_labels = label_set[train_indices]
                vali_labels = label_set[vali_indices]
                test_labels = label_set[test_indices]
                labels_split[label_name] = [train_labels, vali_labels, test_labels]
        else:
            raise ValueError(type(labels))
        return train, vali, test, labels_split


================================================
FILE: differential_privacy/dp_sgd/dp_optimizer/dp_optimizer.py
================================================
# (originally from https://github.com/tensorflow/models/tree/master/research/differential_privacy,
# possibly with some small edits by @corcra)

# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Differentially private optimizers.
"""
from __future__ import division

import tensorflow as tf

from differential_privacy.dp_sgd.dp_optimizer import utils
#from differential_privacy.dp_sgd.per_example_gradients import per_example_gradients

import pdb

class DPGradientDescentOptimizer(tf.train.GradientDescentOptimizer):
  """Differentially private gradient descent optimizer.
  """

  def __init__(self, learning_rate, eps_delta, sanitizer,
               sigma=None, use_locking=False, name="DPGradientDescent",
               batches_per_lot=1):
    """Construct a differentially private gradient descent optimizer.

    The optimizer uses fixed privacy budget for each batch of training.

    Args:
      learning_rate: for GradientDescentOptimizer.
      eps_delta: EpsDelta pair for each epoch.
      sanitizer: for sanitizing the graident.
      sigma: noise sigma. If None, use eps_delta pair to compute sigma;
        otherwise use supplied sigma directly.
      use_locking: use locking.
      name: name for the object.
      batches_per_lot: Number of batches in a lot.
    """

    super(DPGradientDescentOptimizer, self).__init__(learning_rate,
                                                     use_locking, name)
    # Also, if needed, define the gradient accumulators
    self._batches_per_lot = batches_per_lot
    self._grad_accum_dict = {}
    if batches_per_lot > 1:
      self._batch_count = tf.Variable(1, dtype=tf.int32, trainable=False,
                                      name="batch_count")
      var_list = tf.trainable_variables()
      with tf.variable_scope("grad_acc_for"):
        for var in var_list:
          v_grad_accum = tf.Variable(tf.zeros_like(var),
                                     trainable=False,
                                     name=utils.GetTensorOpName(var))
          self._grad_accum_dict[var.name] = v_grad_accum

    self._eps_delta = eps_delta
    self._sanitizer = sanitizer
    self._sigma = sigma

  def compute_sanitized_gradients(self, loss, var_list=None,
                                  add_noise=True):
    """Compute the sanitized gradients.

    Args:
      loss: the loss tensor.
      var_list: the optional variables.
      add_noise: if true, then add noise. Always clip.
    Returns:
      a pair of (list of sanitized gradients) and privacy spending accumulation
      operations.
    Raises:
      TypeError: if var_list contains non-variable.
    """

    self._assert_valid_dtypes([loss])

    xs = [tf.convert_to_tensor(x) for x in var_list]
    # TODO check this change
    loss_list = tf.unstack(loss, axis=0)
    px_grads_byexample = [tf.gradients(l, xs) for l in loss_list]
    px_grads = [[x[v] for x in px_grads_byexample] for v in range(len(xs))]
    #px_grads = tf.gradients(loss, xs)
    # add a dummy 0th dimension to reflect the fact that we have a batch size of 1...
  #  px_grads = [tf.expand_dims(x, 0) for x in px_grads]
#    px_grads = per_example_gradients.PerExampleGradients(loss, xs)
    sanitized_grads = []
    for px_grad, v in zip(px_grads, var_list):
      tensor_name = utils.GetTensorOpName(v)
      sanitized_grad = self._sanitizer.sanitize(
          px_grad, self._eps_delta, sigma=self._sigma,
          tensor_name=tensor_name, add_noise=add_noise,
          num_examples=self._batches_per_lot * tf.slice(
              tf.shape(px_grad), [0], [1]))
      sanitized_grads.append(sanitized_grad)

    return sanitized_grads

  def minimize(self, loss, global_step=None, var_list=None,
               name=None):
    """Minimize using sanitized gradients.

    This gets a var_list which is the list of trainable variables.
    For each var in var_list, we defined a grad_accumulator variable
    during init. When batches_per_lot > 1, we accumulate the gradient
    update in those. At the end of each lot, we apply the update back to
    the variable. This has the effect that for each lot we compute
    gradients at the point at the beginning of the lot, and then apply one
    update at the end of the lot. In other words, semantically, we are doing
    SGD with one lot being the equivalent of one usual batch of size
    batch_size * batches_per_lot.
    This allows us to simulate larger batches than our memory size would permit.

    The lr and the num_steps are in the lot world.

    Args:
      loss: the loss tensor.
      global_step: the optional global step.
      var_list: the optional variables.
      name: the optional name.
    Returns:
      the operation that runs one step of DP gradient descent.
    """

    # First validate the var_list

    if var_list is None:
      var_list = tf.trainable_variables()
    for var in var_list:
      if not isinstance(var, tf.Variable):
        raise TypeError("Argument is not a variable.Variable: %s" % var)

    # Modification: apply gradient once every batches_per_lot many steps.
    # This may lead to smaller error

    if self._batches_per_lot == 1:
      sanitized_grads = self.compute_sanitized_gradients(
          loss, var_list=var_list)

      grads_and_vars = list(zip(sanitized_grads, var_list))
      self._assert_valid_dtypes([v for g, v in grads_and_vars if g is not None])

      apply_grads = self.apply_gradients(grads_and_vars,
                                         global_step=global_step, name=name)
      return apply_grads

    # Condition for deciding whether to accumulate the gradient
    # or actually apply it.
    # we use a private self_batch_count to keep track of number of batches.
    # global step will count number of lots processed.

    update_cond = tf.equal(tf.constant(0),
                           tf.mod(self._batch_count,
                                  tf.constant(self._batches_per_lot)))

    # Things to do for batches other than last of the lot.
    # Add non-noisy clipped grads to shadow variables.

    def non_last_in_lot_op(loss, var_list):
      """Ops to do for a typical batch.

      For a batch that is not the last one in the lot, we simply compute the
      sanitized gradients and apply them to the grad_acc variables.

      Args:
        loss: loss function tensor
        var_list: list of variables
      Returns:
        A tensorflow op to do the updates to the gradient accumulators
      """
      sanitized_grads = self.compute_sanitized_gradients(
          loss, var_list=var_list, add_noise=False)

      update_ops_list = []
      for var, grad in zip(var_list, sanitized_grads):
        grad_acc_v = self._grad_accum_dict[var.name]
        update_ops_list.append(grad_acc_v.assign_add(grad))
      update_ops_list.append(self._batch_count.assign_add(1))
      return tf.group(*update_ops_list)

    # Things to do for last batch of a lot.
    # Add noisy clipped grads to accumulator.
    # Apply accumulated grads to vars.

    def last_in_lot_op(loss, var_list, global_step):
      """Ops to do for last batch in a lot.

      For the last batch in the lot, we first add the sanitized gradients to
      the gradient acc variables, and then apply these
      values over to the original variables (via an apply gradient)

      Args:
        loss: loss function tensor
        var_list: list of variables
        global_step: optional global step to be passed to apply_gradients
      Returns:
        A tensorflow op to push updates from shadow vars to real vars.
      """

      # We add noise in the last lot. This is why we need this code snippet
      # that looks almost identical to the non_last_op case here.
      sanitized_grads = self.compute_sanitized_gradients(
          loss, var_list=var_list, add_noise=True)

      normalized_grads = []
      for var, grad in zip(var_list, sanitized_grads):
        grad_acc_v = self._grad_accum_dict[var.name]
        # To handle the lr difference per lot vs per batch, we divide the
        # update by number of batches per lot.
        normalized_grad = tf.div(grad_acc_v.assign_add(grad),
                                 tf.to_float(self._batches_per_lot))

        normalized_grads.append(normalized_grad)

      with tf.control_dependencies(normalized_grads):
        grads_and_vars = list(zip(normalized_grads, var_list))
        self._assert_valid_dtypes(
            [v for g, v in grads_and_vars if g is not None])
        apply_san_grads = self.apply_gradients(grads_and_vars,
                                               global_step=global_step,
                                               name="apply_grads")

      # Now reset the accumulators to zero
      resets_list = []
      with tf.control_dependencies([apply_san_grads]):
        for _, acc in self._grad_accum_dict.items():
          reset = tf.assign(acc, tf.zeros_like(acc))
          resets_list.append(reset)
      resets_list.append(self._batch_count.assign_add(1))

      last_step_update = tf.group(*([apply_san_grads] + resets_list))
      return last_step_update
    # pylint: disable=g-long-lambda
    update_op = tf.cond(update_cond,
                        lambda: last_in_lot_op(
                            loss, var_list,
                            global_step),
                        lambda: non_last_in_lot_op(
                            loss, var_list))
    return tf.group(update_op)


================================================
FILE: differential_privacy/dp_sgd/dp_optimizer/sanitizer.py
================================================
# (originally from https://github.com/tensorflow/models/tree/master/research/differential_privacy,
# possibly with some small edits by @corcra)
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Defines Sanitizer class for sanitizing tensors.

A sanitizer first limits the sensitivity of a tensor and then adds noise
to the tensor. The parameters are determined by the privacy_spending and the
other parameters. It also uses an accountant to keep track of the privacy
spending.
"""
from __future__ import division

import collections

import tensorflow as tf

from differential_privacy.dp_sgd.dp_optimizer import utils

import pdb

ClipOption = collections.namedtuple("ClipOption",
                                    ["l2norm_bound", "clip"])


class AmortizedGaussianSanitizer(object):
  """Sanitizer with Gaussian noise and amoritzed privacy spending accounting.

  This sanitizes a tensor by first clipping the tensor, summing the tensor
  and then adding appropriate amount of noise. It also uses an amortized
  accountant to keep track of privacy spending.
  """

  def __init__(self, accountant, default_option):
    """Construct an AmortizedGaussianSanitizer.

    Args:
      accountant: the privacy accountant. Expect an amortized one.
      default_option: the default ClipOptoin.
    """

    self._accountant = accountant
    self._default_option = default_option
    self._options = {}

  def set_option(self, tensor_name, option):
    """Set options for an individual tensor.

    Args:
      tensor_name: the name of the tensor.
      option: clip option.
    """

    self._options[tensor_name] = option

  def sanitize(self, x, eps_delta, sigma=None,
               option=ClipOption(None, None), tensor_name=None,
               num_examples=None, add_noise=True):
    """Sanitize the given tensor.

    This santize a given tensor by first applying l2 norm clipping and then
    adding Gaussian noise. It calls the privacy accountant for updating the
    privacy spending.

    Args:
      x: the tensor to sanitize.
      eps_delta: a pair of eps, delta for (eps,delta)-DP. Use it to
        compute sigma if sigma is None.
      sigma: if sigma is not None, use sigma.
      option: a ClipOption which, if supplied, used for
        clipping and adding noise.
      tensor_name: the name of the tensor.
      num_examples: if None, use the number of "rows" of x.
      add_noise: if True, then add noise, else just clip.
    Returns:
      a pair of sanitized tensor and the operation to accumulate privacy
      spending.
    """
    if sigma is None:
      # pylint: disable=unpacking-non-sequence
      eps, delta = eps_delta
      with tf.control_dependencies(
          [tf.Assert(tf.greater(eps, 0),
                     ["eps needs to be greater than 0"]),
           tf.Assert(tf.greater(delta, 0),
                     ["delta needs to be greater than 0"])]):
        # The following formula is taken from
        #   Dwork and Roth, The Algorithmic Foundations of Differential
        #   Privacy, Appendix A.
        #   http://www.cis.upenn.edu/~aaroth/Papers/privacybook.pdf
        sigma = tf.sqrt(2.0 * tf.log(1.25 / delta)) / eps

    l2norm_bound, clip = option
    if l2norm_bound is None:
      l2norm_bound, clip = self._default_option
      if ((tensor_name is not None) and
          (tensor_name in self._options)):
        l2norm_bound, clip = self._options[tensor_name]
    if clip:
      x = utils.BatchClipByL2norm(x, l2norm_bound)

    if add_noise:
      if num_examples is None:
        num_examples = tf.slice(tf.shape(x), [0], [1])
      privacy_accum_op = self._accountant.accumulate_privacy_spending(
          eps_delta, sigma, num_examples)
      with tf.control_dependencies([privacy_accum_op]):
        saned_x = utils.AddGaussianNoise(tf.reduce_sum(x, 0),
                                         sigma * l2norm_bound)
    else:
      saned_x = tf.reduce_sum(x, 0)
    return saned_x


================================================
FILE: differential_privacy/dp_sgd/dp_optimizer/utils.py
================================================
# (originally from https://github.com/tensorflow/models/tree/master/research/differential_privacy,
# possibly with some small edits by @corcra)

# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Utils for building and training NN models.
"""
from __future__ import division

import math

import numpy
import tensorflow as tf


class LayerParameters(object):
  """class that defines a non-conv layer."""
  def __init__(self):
    self.name = ""
    self.num_units = 0
    self._with_bias = False
    self.relu = False
    self.gradient_l2norm_bound = 0.0
    self.bias_gradient_l2norm_bound = 0.0
    self.trainable = True
    self.weight_decay = 0.0


class ConvParameters(object):
  """class that defines a conv layer."""
  def __init__(self):
    self.patch_size = 5
    self.stride = 1
    self.in_channels = 1
    self.out_channels = 0
    self.with_bias = True
    self.relu = True
    self.max_pool = True
    self.max_pool_size = 2
    self.max_pool_stride = 2
    self.trainable = False
    self.in_size = 28
    self.name = ""
    self.num_outputs = 0
    self.bias_stddev = 0.1


# Parameters for a layered neural network.
class NetworkParameters(object):
  """class that define the overall model structure."""
  def __init__(self):
    self.input_size = 0
    self.projection_type = 'NONE'  # NONE, RANDOM, PCA
    self.projection_dimensions = 0
    self.default_gradient_l2norm_bound = 0.0
    self.layer_parameters = []  # List of LayerParameters
    self.conv_parameters = []  # List of ConvParameters


def GetTensorOpName(x):
  """Get the name of the op that created a tensor.

  Useful for naming related tensors, as ':' in name field of op is not permitted

  Args:
    x: the input tensor.
  Returns:
    the name of the op.
  """

  t = x.name.rsplit(":", 1)
  if len(t) == 1:
    return x.name
  else:
    return t[0]


def BuildNetwork(inputs, network_parameters):
  """Build a network using the given parameters.

  Args:
    inputs: a Tensor of floats containing the input data.
    network_parameters: NetworkParameters object
      that describes the parameters for the network.
  Returns:
    output, training_parameters: where the outputs (a tensor) is the output
      of the network, and training_parameters (a dictionary that maps the
      name of each variable to a dictionary of parameters) is the parameters
      used during training.
  """

  training_parameters = {}
  num_inputs = network_parameters.input_size
  outputs = inputs
  projection = None

  # First apply convolutions, if needed
  for conv_param in network_parameters.conv_parameters:
    outputs = tf.reshape(
        outputs,
        [-1, conv_param.in_size, conv_param.in_size,
         conv_param.in_channels])
    conv_weights_name = "%s_conv_weight" % (conv_param.name)
    conv_bias_name = "%s_conv_bias" % (conv_param.name)
    conv_std_dev = 1.0 / (conv_param.patch_size
                          * math.sqrt(conv_param.in_channels))
    conv_weights = tf.Variable(
        tf.truncated_normal([conv_param.patch_size,
                             conv_param.patch_size,
                             conv_param.in_channels,
                             conv_param.out_channels],
                            stddev=conv_std_dev),
        trainable=conv_param.trainable,
        name=conv_weights_name)
    conv_bias = tf.Variable(
        tf.truncated_normal([conv_param.out_channels],
                            stddev=conv_param.bias_stddev),
        trainable=conv_param.trainable,
        name=conv_bias_name)
    training_parameters[conv_weights_name] = {}
    training_parameters[conv_bias_name] = {}
    conv = tf.nn.conv2d(outputs, conv_weights,
                        strides=[1, conv_param.stride,
                                 conv_param.stride, 1],
                        padding="SAME")
    relud = tf.nn.relu(conv + conv_bias)
    mpd = tf.nn.max_pool(relud, ksize=[1,
                                       conv_param.max_pool_size,
                                       conv_param.max_pool_size, 1],
                         strides=[1, conv_param.max_pool_stride,
                                  conv_param.max_pool_stride, 1],
                         padding="SAME")
    outputs = mpd
    num_inputs = conv_param.num_outputs
    # this should equal
    # in_size * in_size * out_channels / (stride * max_pool_stride)

  # once all the convs are done, reshape to make it flat
  outputs = tf.reshape(outputs, [-1, num_inputs])

  # Now project, if needed
  if network_parameters.projection_type is not "NONE":
    projection = tf.Variable(tf.truncated_normal(
        [num_inputs, network_parameters.projection_dimensions],
        stddev=1.0 / math.sqrt(num_inputs)), trainable=False, name="projection")
    num_inputs = network_parameters.projection_dimensions
    outputs = tf.matmul(outputs, projection)

  # Now apply any other layers

  for layer_parameters in network_parameters.layer_parameters:
    num_units = layer_parameters.num_units
    hidden_weights_name = "%s_weight" % (layer_parameters.name)
    hidden_weights = tf.Variable(
        tf.truncated_normal([num_inputs, num_units],
                            stddev=1.0 / math.sqrt(num_inputs)),
        name=hidden_weights_name, trainable=layer_parameters.trainable)
    training_parameters[hidden_weights_name] = {}
    if layer_parameters.gradient_l2norm_bound:
      training_parameters[hidden_weights_name]["gradient_l2norm_bound"] = (
          layer_parameters.gradient_l2norm_bound)
    if layer_parameters.weight_decay:
      training_parameters[hidden_weights_name]["weight_decay"] = (
          layer_parameters.weight_decay)

    outputs = tf.matmul(outputs, hidden_weights)
    if layer_parameters.with_bias:
      hidden_biases_name = "%s_bias" % (layer_parameters.name)
      hidden_biases = tf.Variable(tf.zeros([num_units]),
                                  name=hidden_biases_name)
      training_parameters[hidden_biases_name] = {}
      if layer_parameters.bias_gradient_l2norm_bound:
        training_parameters[hidden_biases_name][
            "bias_gradient_l2norm_bound"] = (
                layer_parameters.bias_gradient_l2norm_bound)

      outputs += hidden_biases
    if layer_parameters.relu:
      outputs = tf.nn.relu(outputs)
    # num_inputs for the next layer is num_units in the current layer.
    num_inputs = num_units

  return outputs, projection, training_parameters


def VaryRate(start, end, saturate_epochs, epoch):
  """Compute a linearly varying number.

  Decrease linearly from start to end until epoch saturate_epochs.

  Args:
    start: the initial number.
    end: the end number.
    saturate_epochs: after this we do not reduce the number; if less than
      or equal to zero, just return start.
    epoch: the current learning epoch.
  Returns:
    the caculated number.
  """
  if saturate_epochs <= 0:
    return start

  step = (start - end) / (saturate_epochs - 1)
  if epoch < saturate_epochs:
    return start - step * epoch
  else:
    return end


def BatchClipByL2norm(t, upper_bound, name=None):
  """Clip an array of tensors by L2 norm.

  Shrink each dimension-0 slice of tensor (for matrix it is each row) such
  that the l2 norm is at most upper_bound. Here we clip each row as it
  corresponds to each example in the batch.

  Args:
    t: the input tensor.
    upper_bound: the upperbound of the L2 norm.
    name: optional name.
  Returns:
    the clipped tensor.
  """

  assert upper_bound > 0
  with tf.name_scope(values=[t, upper_bound], name=name,
                     default_name="batch_clip_by_l2norm") as name:
    saved_shape = tf.shape(t)
    batch_size = tf.slice(saved_shape, [0], [1])
    t2 = tf.reshape(t, tf.concat(axis=0, values=[batch_size, [-1]]))
    upper_bound_inv = tf.fill(tf.slice(saved_shape, [0], [1]),
                              tf.constant(1.0/upper_bound))
    # Add a small number to avoid divide by 0
    l2norm_inv = tf.rsqrt(tf.reduce_sum(t2 * t2, [1]) + 0.000001)
    scale = tf.minimum(l2norm_inv, upper_bound_inv) * upper_bound
    clipped_t = tf.matmul(tf.diag(scale), t2)
    clipped_t = tf.reshape(clipped_t, saved_shape, name=name)
  return clipped_t


def SoftThreshold(t, threshold_ratio, name=None):
  """Soft-threshold a tensor by the mean value.

  Softthreshold each dimension-0 vector (for matrix it is each column) by
  the mean of absolute value multiplied by the threshold_ratio factor. Here
  we soft threshold each column as it corresponds to each unit in a layer.

  Args:
    t: the input tensor.
    threshold_ratio: the threshold ratio.
    name: the optional name for the returned tensor.
  Returns:
    the thresholded tensor, where each entry is soft-thresholded by
    threshold_ratio times the mean of the aboslute value of each column.
  """

  assert threshold_ratio >= 0
  with tf.name_scope(values=[t, threshold_ratio], name=name,
                     default_name="soft_thresholding") as name:
    saved_shape = tf.shape(t)
    t2 = tf.reshape(t, tf.concat(axis=0, values=[tf.slice(saved_shape, [0], [1]), -1]))
    t_abs = tf.abs(t2)
    t_x = tf.sign(t2) * tf.nn.relu(t_abs -
                                   (tf.reduce_mean(t_abs, [0],
                                                   keep_dims=True) *
                                    threshold_ratio))
    return tf.reshape(t_x, saved_shape, name=name)


def AddGaussianNoise(t, sigma, name=None):
  """Add i.i.d. Gaussian noise (0, sigma^2) to every entry of t.

  Args:
    t: the input tensor.
    sigma: the stddev of the Gaussian noise.
    name: optional name.
  Returns:
    the noisy tensor.
  """

  with tf.name_scope(values=[t, sigma], name=name,
                     default_name="add_gaussian_noise") as name:
    noisy_t = t + tf.random_normal(tf.shape(t), stddev=sigma)
  return noisy_t


def GenerateBinomialTable(m):
  """Generate binomial table.

  Args:
    m: the size of the table.
  Returns:
    A two dimensional array T where T[i][j] = (i choose j),
    for 0<= i, j <=m.
  """

  table = numpy.zeros((m + 1, m + 1), dtype=numpy.float64)
  for i in range(m + 1):
    table[i, 0] = 1
  for i in range(1, m + 1):
    for j in range(1, m + 1):
      v = table[i - 1, j] + table[i - 1, j -1]
      assert not math.isnan(v) and not math.isinf(v)
      table[i, j] = v
  return tf.convert_to_tensor(table)


================================================
FILE: differential_privacy/privacy_accountant/tf/accountant.py
================================================
# (originally from https://github.com/tensorflow/models/tree/master/research/differential_privacy,
# possibly with some small edits by @corcra)

# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Defines Accountant class for keeping track of privacy spending.

A privacy accountant keeps track of privacy spendings. It has methods
accumulate_privacy_spending and get_privacy_spent. Here we only define
AmortizedAccountant which tracks the privacy spending in the amortized
way. It uses privacy amplication via sampling to compute the privacy
spending for each batch and strong composition (specialized for Gaussian
noise) for accumulate the privacy spending.
"""
from __future__ import division

import abc
import collections
import math
import sys

import numpy
import tensorflow as tf

from differential_privacy.dp_sgd.dp_optimizer import utils

EpsDelta = collections.namedtuple("EpsDelta", ["spent_eps", "spent_delta"])

import pdb

# TODO(liqzhang) To ensure the same API for AmortizedAccountant and
# MomentsAccountant, we pass the union of arguments to both, so we
# have unused_sigma for AmortizedAccountant and unused_eps_delta for
# MomentsAccountant. Consider to revise the API to avoid the unused
# arguments.  It would be good to use @abc.abstractmethod, etc, to
# define the common interface as a base class.
class AmortizedAccountant(object):
  """Keep track of privacy spending in an amortized way.

  AmortizedAccountant accumulates the privacy spending by assuming
  all the examples are processed uniformly at random so the spending is
  amortized among all the examples. And we assume that we use Gaussian noise
  so the accumulation is on eps^2 and delta, using advanced composition.
  """

  def __init__(self, total_examples):
    """Initialization. Currently only support amortized tracking.

    Args:
      total_examples: total number of examples.
    """

    assert total_examples > 0
    self._total_examples = total_examples
    self._eps_squared_sum = tf.Variable(tf.zeros([1]), trainable=False,
                                        name="eps_squared_sum")
    self._delta_sum = tf.Variable(tf.zeros([1]), trainable=False,
                                  name="delta_sum")

  def accumulate_privacy_spending(self, eps_delta, unused_sigma,
                                  num_examples):
    """Accumulate the privacy spending.

    Currently only support approximate privacy. Here we assume we use Gaussian
    noise on randomly sampled batch so we get better composition: 1. the per
    batch privacy is computed using privacy amplication via sampling bound;
    2. the composition is done using the composition with Gaussian noise.
    TODO(liqzhang) Add a link to a document that describes the bounds used.

    Args:
      eps_delta: EpsDelta pair which can be tensors.
      unused_sigma: the noise sigma. Unused for this accountant.
      num_examples: the number of examples involved.
    Returns:
      a TensorFlow operation for updating the privacy spending.
    """

    eps, delta = eps_delta
    with tf.control_dependencies(
        [tf.Assert(tf.greater(delta, 0),
                   ["delta needs to be greater than 0"])]):
      amortize_ratio = (tf.cast(num_examples, tf.float32) * 1.0 /
                        self._total_examples)
      # Use privacy amplification via sampling bound.
      # See Lemma 2.2 in http://arxiv.org/pdf/1405.7085v2.pdf
      # TODO(liqzhang) Add a link to a document with formal statement
      # and proof.
      amortize_eps = tf.reshape(tf.log(1.0 + amortize_ratio * (
          tf.exp(eps) - 1.0)), [1])
      amortize_delta = tf.reshape(amortize_ratio * delta, [1])
      return tf.group(*[tf.assign_add(self._eps_squared_sum,
                                      tf.square(amortize_eps)),
                        tf.assign_add(self._delta_sum, amortize_delta)])

  def get_privacy_spent(self, sess, target_eps=None):
    """Report the spending so far.

    Args:
      sess: the session to run the tensor.
      target_eps: the target epsilon. Unused.
    Returns:
      the list containing a single EpsDelta, with values as Python floats (as
      opposed to numpy.float64). This is to be consistent with
      MomentAccountant which can return a list of (eps, delta) pair.
    """

    # pylint: disable=unused-argument
    unused_target_eps = target_eps
    eps_squared_sum, delta_sum = sess.run([self._eps_squared_sum,
                                           self._delta_sum])
    return [EpsDelta(math.sqrt(eps_squared_sum), float(delta_sum))]


class MomentsAccountant(object):
  """Privacy accountant which keeps track of moments of privacy loss.

  Note: The constructor of this class creates tf.Variables that must
  be initialized with tf.global_variables_initializer() or similar calls.

  MomentsAccountant accumulates the high moments of the privacy loss. It
  requires a method for computing differenital moments of the noise (See
  below for the definition). So every specific accountant should subclass
  this class by implementing _differential_moments method.

  Denote by X_i the random variable of privacy loss at the i-th step.
  Consider two databases D, D' which differ by one item. X_i takes value
  log Pr[M(D')==x]/Pr[M(D)==x] with probability Pr[M(D)==x].
  In MomentsAccountant, we keep track of y_i(L) = log E[exp(L X_i)] for some
  large enough L. To compute the final privacy spending,  we apply Chernoff
  bound (assuming the random noise added at each step is independent) to
  bound the total privacy loss Z = sum X_i as follows:
    Pr[Z > e] = Pr[exp(L Z) > exp(L e)]
              < E[exp(L Z)] / exp(L e)
              = Prod_i E[exp(L X_i)] / exp(L e)
              = exp(sum_i log E[exp(L X_i)]) / exp(L e)
              = exp(sum_i y_i(L) - L e)
  Hence the mechanism is (e, d)-differentially private for
    d =  exp(sum_i y_i(L) - L e).
  We require d < 1, i.e. e > sum_i y_i(L) / L. We maintain y_i(L) for several
  L to compute the best d for any give e (normally should be the lowest L
  such that 2 * sum_i y_i(L) / L < e.

  We further assume that at each step, the mechanism operates on a random
  sample with sampling probability q = batch_size / total_examples. Then
    E[exp(L X)] = E[(Pr[M(D)==x / Pr[M(D')==x])^L]
  By distinguishing two cases of whether D < D' or D' < D, we have
  that
    E[exp(L X)] <= max (I1, I2)
  where
    I1 = (1-q) E ((1-q) + q P(X+1) / P(X))^L + q E ((1-q) + q P(X) / P(X-1))^L
    I2 = E (P(X) / ((1-q) + q P(X+1)))^L

  In order to compute I1 and I2, one can consider to
    1. use an asymptotic bound, which recovers the advance composition theorem;
    2. use the closed formula (like GaussianMomentsAccountant);
    3. use numerical integration or random sample estimation.

  Dependent on the distribution, we can often obtain a tigher estimation on
  the moments and hence a more accurate estimation of the privacy loss than
  obtained using generic composition theorems.

  """

  __metaclass__ = abc.ABCMeta

  def __init__(self, total_examples, moment_orders=32):
    """Initialize a MomentsAccountant.

    Args:
      total_examples: total number of examples.
      moment_orders: the order of moments to keep.
    """

    assert total_examples > 0
    self._total_examples = total_examples
    self._moment_orders = (moment_orders
                           if isinstance(moment_orders, (list, tuple))
                           else range(1, moment_orders + 1))
    self._max_moment_order = max(self._moment_orders)
    assert self._max_moment_order < 100, "The moment order is too large."
    self._log_moments = [tf.Variable(numpy.float64(0.0),
                                     trainable=False,
                                     name=("log_moments-%d" % moment_order))
                         for moment_order in self._moment_orders]

  @abc.abstractmethod
  def _compute_log_moment(self, sigma, q, moment_order):
    """Compute high moment of privacy loss.

    Args:
      sigma: the noise sigma, in the multiples of the sensitivity.
      q: the sampling ratio.
      moment_order: the order of moment.
    Returns:
      log E[exp(moment_order * X)]
    """
    pass

  def accumulate_privacy_spending(self, unused_eps_delta,
                                  sigma, num_examples):
    """Accumulate privacy spending.

    In particular, accounts for privacy spending when we assume there
    are num_examples, and we are releasing the vector
    (sum_{i=1}^{num_examples} x_i) + Normal(0, stddev=l2norm_bound*sigma)
    where l2norm_bound is the maximum l2_norm of each example x_i, and
    the num_examples have been randomly selected out of a pool of
    self.total_examples.

    Args:
      unused_eps_delta: EpsDelta pair which can be tensors. Unused
        in this accountant.
      sigma: the noise sigma, in the multiples of the sensitivity (that is,
        if the l2norm sensitivity is k, then the caller must have added
        Gaussian noise with stddev=k*sigma to the result of the query).
      num_examples: the number of examples involved.
    Returns:
      a TensorFlow operation for updating the privacy spending.
    """
    q = tf.cast(num_examples, tf.float64) * 1.0 / self._total_examples

    moments_accum_ops = []
    for i in range(len(self._log_moments)):
      moment = self._compute_log_moment(sigma, q, self._moment_orders[i])
      moments_accum_ops.append(tf.assign_add(self._log_moments[i], moment))
    return tf.group(*moments_accum_ops)

  def _compute_delta(self, log_moments, eps):
    """Compute delta for given log_moments and eps.

    Args:
      log_moments: the log moments of privacy loss, in the form of pairs
        of (moment_order, log_moment)
      eps: the target epsilon.
    Returns:
      delta
    """
    min_delta = 1.0
    for moment_order, log_moment in log_moments:
      if math.isinf(log_moment) or math.isnan(log_moment):
        sys.stderr.write("The %d-th order is inf or Nan\n" % moment_order)
        continue
      if log_moment < moment_order * eps:
        min_delta = min(min_delta,
                        math.exp(log_moment - moment_order * eps))
    return min_delta

  def _compute_eps(self, log_moments, delta):
    min_eps = float("inf")
    for moment_order, log_moment in log_moments:
      if math.isinf(log_moment) or math.isnan(log_moment):
        sys.stderr.write("The %d-th order is inf or Nan\n" % moment_order)
        continue
      min_eps = min(min_eps, (log_moment - math.log(delta)) / moment_order)
    return min_eps

  def get_privacy_spent(self, sess, target_eps=None, target_deltas=None):
    """Compute privacy spending in (e, d)-DP form for a single or list of eps.

    Args:
      sess: the session to run the tensor.
      target_eps: a list of target epsilon's for which we would like to
        compute corresponding delta value.
      target_deltas: a list of target deltas for which we would like to
        compute the corresponding eps value. Caller must specify
        either target_eps or target_delta.
    Returns:
      A list of EpsDelta pairs.
    """
    assert (target_eps is None) ^ (target_deltas is None)
    eps_deltas = []
    log_moments = sess.run(self._log_moments)
    log_moments_with_order = numpy.array(list(zip(self._moment_orders, log_moments)))
    if target_eps is not None:
      for eps in target_eps:
        delta = self._compute_delta(log_moments_with_order, eps)
        eps_deltas.append(EpsDelta(eps, delta))
    else:
      assert target_deltas
      for delta in target_deltas:
        eps_deltas.append(
            EpsDelta(self._compute_eps(log_moments_with_order, delta), delta))
    return eps_deltas


class GaussianMomentsAccountant(MomentsAccountant):
  """MomentsAccountant which assumes Gaussian noise.

  GaussianMomentsAccountant assumes the noise added is centered Gaussian
  noise N(0, sigma^2 I). In this case, we can compute the differential moments
  accurately using a formula.

  For asymptotic bound, for Gaussian noise with variance sigma^2, we can show
  for L < sigma^2,  q L < sigma,
    log E[exp(L X)] = O(q^2 L^2 / sigma^2).
  Using this we derive that for training T epoches, with batch ratio q,
  the Gaussian mechanism with variance sigma^2 (with q < 1/sigma) is (e, d)
  private for d = exp(T/q q^2 L^2 / sigma^2 - L e). Setting L = sigma^2,
  Tq = e/2, the mechanism is (e, exp(-e sigma^2/2))-DP. Equivalently, the
  mechanism is (e, d)-DP if sigma = sqrt{2 log(1/d)}/e, q < 1/sigma,
  and T < e/(2q). This bound is better than the bound obtained using general
  composition theorems, by an Omega(sqrt{log k}) factor on epsilon, if we run
  k steps. Since we use direct estimate, the obtained privacy bound has tight
  constant.

  For GaussianMomentAccountant, it suffices to compute I1, as I1 >= I2,
  which reduce to computing E(P(x+s)/P(x+s-1) - 1)^i for s = 0 and 1. In the
  companion gaussian_moments.py file, we supply procedure for computing both
  I1 and I2 (the computation of I2 is through multi-precision integration
  package). It can be verified that indeed I1 >= I2 for wide range of parameters
  we have tried, though at the moment we are unable to prove this claim.

  We recommend that when using this accountant, users independently verify
  using gaussian_moments.py that for their parameters, I1 is indeed larger
  than I2. This can be done by following the instructions in
  gaussian_moments.py.
  """

  def __init__(self, total_examples, moment_orders=32):
    """Initialization.

    Args:
      total_examples: total number of examples.
      moment_orders: the order of moments to keep.
    """
    super(self.__class__, self).__init__(total_examples, moment_orders)
    self._binomial_table = utils.GenerateBinomialTable(self._max_moment_order)

  def _differential_moments(self, sigma, s, t):
    """Compute 0 to t-th differential moments for Gaussian variable.

        E[(P(x+s)/P(x+s-1)-1)^t]
      = sum_{i=0}^t (t choose i) (-1)^{t-i} E[(P(x+s)/P(x+s-1))^i]
      = sum_{i=0}^t (t choose i) (-1)^{t-i} E[exp(-i*(2*x+2*s-1)/(2*sigma^2))]
      = sum_{i=0}^t (t choose i) (-1)^{t-i} exp(i(i+1-2*s)/(2 sigma^2))
    Args:
      sigma: the noise sigma, in the multiples of the sensitivity.
      s: the shift.
      t: 0 to t-th moment.
    Returns:
      0 to t-th moment as a tensor of shape [t+1].
    """
    assert t <= self._max_moment_order, ("The order of %d is out "
                                         "of the upper bound %d."
                                         % (t, self._max_moment_order))
    binomial = tf.slice(self._binomial_table, [0, 0],
                        [t + 1, t + 1])
    signs = numpy.zeros((t + 1, t + 1), dtype=numpy.float64)
    for i in range(t + 1):
      for j in range(t + 1):
        signs[i, j] = 1.0 - 2 * ((i - j) % 2)
    exponents = tf.constant([j * (j + 1.0 - 2.0 * s) / (2.0 * sigma * sigma)
                             for j in range(t + 1)], dtype=tf.float64)
    # x[i, j] = binomial[i, j] * signs[i, j] = (i choose j) * (-1)^{i-j}
    x = tf.multiply(binomial, signs)
    # y[i, j] = x[i, j] * exp(exponents[j])
    #         = (i choose j) * (-1)^{i-j} * exp(j(j-1)/(2 sigma^2))
    # Note: this computation is done by broadcasting pointwise multiplication
    # between [t+1, t+1] tensor and [t+1] tensor.
    y = tf.multiply(x, tf.exp(exponents))
    # z[i] = sum_j y[i, j]
    #      = sum_j (i choose j) * (-1)^{i-j} * exp(j(j-1)/(2 sigma^2))
    z = tf.reduce_sum(y, 1)
    return z

  def _compute_log_moment(self, sigma, q, moment_order):
    """Compute high moment of privacy loss.

    Args:
      sigma: the noise sigma, in the multiples of the sensitivity.
      q: the sampling ratio.
      moment_order: the order of moment.
    Returns:
      log E[exp(moment_order * X)]
    """
    assert moment_order <= self._max_moment_order, ("The order of %d is out "
                                                    "of the upper bound %d."
                                                    % (moment_order,
                                                       self._max_moment_order))
    binomial_table = tf.slice(self._binomial_table, [moment_order, 0],
                              [1, moment_order + 1])
    # qs = [1 q q^2 ... q^L] = exp([0 1 2 ... L] * log(q))
    qs = tf.exp(tf.constant([i * 1.0 for i in range(moment_order + 1)],
                            dtype=tf.float64) * tf.cast(
                                tf.log(q), dtype=tf.float64))
    moments0 = self._differential_moments(sigma, 0.0, moment_order)
    term0 = tf.reduce_sum(binomial_table * qs * moments0)
    moments1 = self._differential_moments(sigma, 1.0, moment_order)
    term1 = tf.reduce_sum(binomial_table * qs * moments1)
    return tf.squeeze(tf.log(tf.cast(q * term0 + (1.0 - q) * term1,
                                     tf.float64)))


class DummyAccountant(object):
  """An accountant that does no accounting."""

  def accumulate_privacy_spending(self, *unused_args):
    return tf.no_op()

  def get_privacy_spent(self, unused_sess, **unused_kwargs):
    return [EpsDelta(numpy.inf, 1.0)]


================================================
FILE: eugenium_mmd.py
================================================
'''
Code taken from: https://github.com/eugenium/mmd
(modified slightly for efficiency/PEP by Stephanie Hyland)

Python implementation of MMD and Covariance estimates for Relative MMD

Some code is based on code from Vincent Van Asch 
which is based on matlab code from Arthur Gretton


Eugene Belilovsky
eugene.belilovsky@inria.fr
'''
import numpy as np
import scipy as sp
from numpy import sqrt
from sklearn.metrics.pairwise import rbf_kernel
from functools import partial
import pdb

def my_kernel(X, Y, sigma):
    gamma = 1 / (2 * sigma**2)
    if len(X.shape) == 2:
        X_sqnorms = np.einsum('...i,...i', X, X)
        Y_sqnorms = np.einsum('...i,...i', Y, Y)
        XY = np.einsum('ia,ja', X, Y)
    elif len(X.shape) == 3:
        X_sqnorms = np.einsum('...ij,...ij', X, X)
        Y_sqnorms = np.einsum('...ij,...ij', Y, Y)
        XY = np.einsum('iab,jab', X, Y)
    else:
        pdb.set_trace()
    Kxy = np.exp(-gamma*(X_sqnorms.reshape(-1, 1) - 2*XY + Y_sqnorms.reshape(1, -1)))
    return Kxy

def MMD_3_Sample_Test(X, Y, Z, sigma=-1, SelectSigma=True, computeMMDs=False):
    '''Performs the relative MMD test which returns a test statistic for whether Y is closer to X or than Z.
    See http://arxiv.org/pdf/1511.04581.pdf
    The bandwith heuristic is based on the median heuristic (see Smola,Gretton).
    '''
    if(sigma<0):
        #Similar heuristics
        if SelectSigma:
            siz=np.min((1000, X.shape[0]))
            sigma1=kernelwidthPair(X[0:siz], Y[0:siz]);
            sigma2=kernelwidthPair(X[0:siz], Z[0:siz]);
            sigma=(sigma1+sigma2)/2.
        else:
            siz=np.min((1000, X.shape[0]*3))
            Zem=np.r_[X[0:siz/3], Y[0:siz/3], Z[0:siz/3]]
            sigma=kernelwidth(Zem);

    #kernel = partial(rbf_kernel, gamma=1.0/(sigma**2))
    kernel = partial(my_kernel, sigma=sigma)
    #kernel = partial(grbf, sigma=sigma)

    Kyy = kernel(Y, Y)
    Kzz = kernel(Z, Z)
    Kxy = kernel(X, Y)
    Kxz = kernel(X, Z)

    Kyynd = Kyy-np.diag(np.diagonal(Kyy))
    Kzznd = Kzz-np.diag(np.diagonal(Kzz))
    m = Kxy.shape[0];
    n = Kyy.shape[0];
    r = Kzz.shape[0];    

    
    u_yy=np.sum(Kyynd)*( 1./(n*(n-1)) )
    u_zz=np.sum(Kzznd)*( 1./(r*(r-1)) )
    u_xy=np.sum(Kxy)/(m*n)
    u_xz=np.sum(Kxz)/(m*r)
    #Compute the test statistic
    t=u_yy - 2.*u_xy - (u_zz-2.*u_xz)
    Diff_Var, Diff_Var_z2, data=MMD_Diff_Var(Kyy, Kzz, Kxy, Kxz)

    pvalue=sp.stats.norm.cdf(-t/np.sqrt((Diff_Var)))
  #  pvalue_z2=sp.stats.norm.cdf(-t/np.sqrt((Diff_Var_z2)))
    tstat=t/sqrt(Diff_Var)
    
    if(computeMMDs):
         Kxx = kernel(X, X)
         Kxxnd = Kxx-np.diag(np.diagonal(Kxx))
         u_xx=np.sum(Kxxnd)*( 1./(m*(m-1)) )
         MMDXY=u_xx+u_yy-2.*u_xy
         MMDXZ=u_xx+u_zz-2.*u_xz
    else:
         MMDXY=None
         MMDXZ=None
    return pvalue, tstat, sigma, MMDXY, MMDXZ
    
def MMD_Diff_Var(Kyy, Kzz, Kxy, Kxz):
    '''
    Compute the variance of the difference statistic MMDXY-MMDXZ
    See http://arxiv.org/pdf/1511.04581.pdf Appendix for derivations
    '''
    m = Kxy.shape[0];
    n = Kyy.shape[0];
    r = Kzz.shape[0];
    
    
    Kyynd = Kyy-np.diag(np.diagonal(Kyy));
    Kzznd = Kzz-np.diag(np.diagonal(Kzz));
    
    u_yy=np.sum(Kyynd)*( 1./(n*(n-1)) );
    u_zz=np.sum(Kzznd)*( 1./(r*(r-1)) );
    u_xy=np.sum(Kxy)/(m*n);
    u_xz=np.sum(Kxz)/(m*r);
    
    #compute zeta1
    t1=(1./n**3)*np.sum(Kyynd.T.dot(Kyynd))-u_yy**2;
    t2=(1./(n**2*m))*np.sum(Kxy.T.dot(Kxy))-u_xy**2;
    t3=(1./(n*m**2))*np.sum(Kxy.dot(Kxy.T))-u_xy**2;
    t4=(1./r**3)*np.sum(Kzznd.T.dot(Kzznd))-u_zz**2;
    t5=(1./(r*m**2))*np.sum(Kxz.dot(Kxz.T))-u_xz**2;
    t6=(1./(r**2*m))*np.sum(Kxz.T.dot(Kxz))-u_xz**2;
    t7=(1./(n**2*m))*np.sum(Kyynd.dot(Kxy.T))-u_yy*u_xy;
    t8=(1./(n*m*r))*np.sum(Kxy.T.dot(Kxz))-u_xz*u_xy;
    t9=(1./(r**2*m))*np.sum(Kzznd.dot(Kxz.T))-u_zz*u_xz;
    
    zeta1=(t1+t2+t3+t4+t5+t6-2.*(t7+t8+t9)); 
    
    zeta2=(1/m/(m-1))*np.sum((Kyynd-Kzznd-Kxy.T-Kxy+Kxz+Kxz.T)**2)-(u_yy - 2.*u_xy - (u_zz-2.*u_xz))**2;
    
    
    data=dict({'t1':t1,
               't2':t2,
               't3':t3,
               't4':t4,
               't5':t5,
               't6':t6,
               't7':t7,
               't8':t8,
               't9':t9,
               'zeta1':zeta1,
               'zeta2':zeta2,
                })
    #TODO more precise version for zeta2 
    #    xx=(1/m^2)*sum(sum(Kxxnd.*Kxxnd))-u_xx^2;
    # yy=(1/n^2)*sum(sum(Kyynd.*Kyynd))-u_yy^2;
    #xy=(1/(n*m))*sum(sum(Kxy.*Kxy))-u_xy^2;
    #xxy=(1/(n*m^2))*sum(sum(Kxxnd*Kxy))-u_xx*u_xy;
    #yyx=(1/(n^2*m))*sum(sum(Kyynd*Kxy'))-u_yy*u_xy;
    #zeta2=(xx+yy+xy+xy-2*(xxy+xxy +yyx+yyx))
    
    
    Var=(4.*(m-2)/(m*(m-1)))*zeta1;
    Var_z2=Var+(2./(m*(m-1)))*zeta2;

    return Var, Var_z2, data
def grbf(x1, x2, sigma):
    '''Calculates the Gaussian radial base function kernel'''
    n, nfeatures = x1.shape
    m, mfeatures = x2.shape
    
    k1 = np.sum((x1*x1), 1)
    q = np.tile(k1, (m, 1)).transpose()
    del k1
    
    k2 = np.sum((x2*x2), 1)
    r = np.tile(k2.T, (n, 1))
    del k2
    
    h = q + r
    del q, r
    
    # The norm
    h = h - 2*np.dot(x1, x2.transpose())
    h = np.array(h, dtype=float)
    
    return np.exp(-1.*h/(2.*pow(sigma, 2)))
    
    
def kernelwidthPair(x1, x2):
    '''Implementation of the median heuristic. See Gretton 2012
       Pick sigma such that the exponent of exp(- ||x-y|| / (2*sigma2)),
       in other words ||x-y|| / (2*sigma2),  equals 1 for the median distance x
       and y of all distances between points from both data sets X and Y.
    '''
    n, nfeatures = x1.shape
    m, mfeatures = x2.shape
    
    k1 = np.sum((x1*x1), 1)
    q = np.tile(k1, (m, 1)).transpose()
    del k1
    
    k2 = np.sum((x2*x2), 1)
    r = np.tile(k2, (n, 1))
    del k2
    
    h= q + r
    del q, r
    
    # The norm
    h = h - 2*np.dot(x1, x2.transpose())
    h = np.array(h, dtype=float)
    
    mdist = np.median([i for i in h.flat if i])
    
    sigma = sqrt(mdist/2.0)
    if not sigma: sigma = 1
    
    return sigma
def kernelwidth(Zmed):
    '''Alternative median heuristic when we cant partition the points
    '''
    m= Zmed.shape[0]
    k1 = np.expand_dims(np.sum((Zmed*Zmed), axis=1), 1)
    q = np.kron(np.ones((1, m)), k1)
    r = np.kron(np.ones((m, 1)), k1.T)
    del k1
    
    h= q + r
    del q, r
    
    # The norm
    h = h - 2.*Zmed.dot(Zmed.T)
    h = np.array(h, dtype=float)
    
    mdist = np.median([i for i in h.flat if i])
    
    sigma = sqrt(mdist/2.0)
    if not sigma: sigma = 1
    
    return sigma
    
    
def MMD_unbiased(Kxx, Kyy, Kxy):
#The estimate when distribution of x is not equal to y
    m = Kxx.shape[0]
    n = Kyy.shape[0]
    
    t1 = (1./(m*(m-1)))*np.sum(Kxx - np.diag(np.diagonal(Kxx)))
    t2 = (2./(m*n)) * np.sum(Kxy)
    t3 = (1./(n*(n-1)))* np.sum(Kyy - np.diag(np.diagonal(Kyy)))
    
    MMDsquared = (t1-t2+t3)
    
    return MMDsquared


================================================
FILE: eval.py
================================================
#!/usr/bin/env ipython
# Evaluation of models
#

import json
import pdb
import numpy as np
import pandas as pd
from eugenium_mmd import MMD_3_Sample_Test
from scipy.stats import ks_2samp
import mmd
from sklearn.svm import SVC
from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score, roc_auc_score, average_precision_score
from sklearn.ensemble import RandomForestClassifier
import sklearn
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

# for keras
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.backend import clear_session

import model
import data_utils
import plotting

import pickle

def assert_same_data(A, B):
    # case 0, both loaded
    if A['data'] == 'load' and B['data'] == 'load':
        assert A['data_load_from'] == B['data_load_from']
        data_path = './experiments/data/' + A['data_load_from']
    elif A['data'] == 'load' and (not B['data'] == 'load'):
        assert A['data_load_from'] == B['identifier']
        data_path = './experiments/data/' + A['data_load_from']
    elif (not A['data'] == 'load') and B['data'] == 'load':
        assert B['data_load_from'] == A['identifier']
        data_path = './experiments/data/' + A['identifier']
    else:
        raise ValueError(A['data'], B['data'])
    return data_path

def model_memorisation(identifier, epoch, max_samples=2000, tstr=False):
    """
    Compare samples from a model against training set and validation set in mmd
    """
    if tstr:
        print('Loading data from TSTR experiment (not sampling from model)')
        # load pre-generated samples
        synth_data = np.load('./experiments/tstr/' + identifier + '_' + str(epoch) + '.data.npy').item()
        model_samples = synth_data['samples']
        synth_labels = synth_data['labels']
        # load real data used in that experiment
        real_data = np.load('./experiments/data/' + identifier + '.data.npy').item()
        real_samples = real_data['samples']
        train = real_samples['train']
        test = real_samples['test']
        n_samples = test.shape[0]
        if model_samples.shape[0] > n_samples:
            model_samples = np.random.permutation(model_samples)[:n_samples]
        print('Data loaded successfully!')
    else:
        if identifier == 'cristobal_eICU':
            model_samples = pickle.load(open('REDACTED', 'rb'))
            samples, labels = data_utils.eICU_task()
            train = samples['train'].reshape(-1,16,4)
            vali = samples['vali'].reshape(-1,16,4)
            test = samples['test'].reshape(-1,16,4)
            #train_targets = labels['train']
            #vali_targets = labels['vali']
            #test_targets = labels['test']
            train, vali, test = data_utils.scale_data(train, vali, test)
            n_samples = test.shape[0]
            if n_samples > max_samples:
                n_samples = max_samples
                test = np.random.permutation(test)[:n_samples]
            if model_samples.shape[0] > n_samples:
                model_samples = np.random.permutation(model_samples)[:n_samples]
        elif identifier == 'cristobal_MNIST':
            the_dir = 'REDACTED'
            # pick a random one
            which = np.random.choice(['NEW_OK_', '_r4', '_r5', '_r6', '_r7'])
            model_samples, model_labels = pickle.load(open(the_dir + 'synth_mnist_minist_cdgan_1_2_100_multivar_14_nolr_rdim3_0_2_' + which + '_190.pk', 'rb'))
            # get test and train...
            # (generated with fixed seed...)
            mnist_resized_dim = 14
            samples, labels = data_utils.load_resized_mnist(mnist_resized_dim)
            proportions = [0.6, 0.2, 0.2]
            train, vali, test, labels_split = data_utils.split(samples, labels=labels, random_seed=1, proportions=proportions)
            np.random.seed()
            train = train.reshape(-1, 14, 14)
            test = test.reshape(-1, 14, 14)
            vali = vali.reshape(-1, 14, 14)
            n_samples = test.shape[0]
            if n_samples > max_samples:
                n_samples = max_samples
                test = np.random.permutation(test)[:n_samples]
            if model_samples.shape[0] > n_samples:
                model_samples = np.random.permutation(model_samples)[:n_samples]
        else:
            settings = json.load(open('./experiments/settings/' + identifier + '.txt', 'r'))
            # get the test, train sets
            data = np.load('./experiments/data/' + identifier + '.data.npy').item()
            train = data['samples']['train']
            test = data['samples']['test']
            n_samples = test.shape[0]
            if n_samples > max_samples:
                n_samples = max_samples
                test = np.random.permutation(test)[:n_samples]
            model_samples = model.sample_trained_model(settings, epoch, n_samples)
    all_samples = np.vstack([train, test, model_samples])
    heuristic_sigma = mmd.median_pairwise_distance(all_samples)
    print('heuristic sigma:', heuristic_sigma)
    pvalue, tstat, sigma, MMDXY, MMDXZ = MMD_3_Sample_Test(model_samples, test, np.random.permutation(train)[:n_samples], sigma=heuristic_sigma, computeMMDs=False)
    #pvalue, tstat, sigma, MMDXY, MMDXZ = MMD_3_Sample_Test(model_samples, np.random.permutation(train)[:n_samples], test, sigma=heuristic_sigma, computeMMDs=False)
#    if pvalue < 0.05:
#        print('At confidence level 0.05, we reject the null hypothesis that MMDXY <= MMDXZ, and conclude that the test data has a smaller MMD with the true data than the generated data')
        # the function takes (X, Y, Z) as its first arguments, it's testing if MMDXY (i.e. MMD between model and train) is less than MMDXZ (MMd between model and test)
#    else:
#        print('We have failed to reject the null hypothesis that MMDXY <= MMDXZ, and cannot conclu#de that the test data has a smaller MMD with the true data than the generated data')
    return pvalue, tstat, sigma

def model_comparison(identifier_A, identifier_B, epoch_A=99, epoch_B=99):
    """
    Compare two models using relative MMD test
    """
    # make sure they used the same data
    settings_A = json.load(open('./experiments/settings/' + identifier_A + '.txt', 'r'))
    settings_B = json.load(open('./experiments/settings/' + identifier_B + '.txt', 'r'))
    data_path = assert_same_data(settings_A, settings_B)
    # now load the data
    data = np.load(data_path + '.data.npy').item()['samples']['vali']
    n_samples = data.shape[0]
    A_samples = model.sample_trained_model(settings_A, epoch_A, n_samples)
    B_samples = model.sample_trained_model(settings_B, epoch_B, n_samples)
    # do the comparison
    # TODO: support multiple signals
    ## some notes about this test:
    ## MMD_3_Sample_Test(X, Y, Z) tests the hypothesis that Px is closer to Pz than Py
    ## that is, test the null hypothesis H0:
    ##   MMD(F, Px, Py) <= MMD(F, Px, Pz)
    ## versus the alternate hypothesis:
    ##   MMD(F, Px, Py) > MMD(F, Px, Pz)
    ## at significance level that we select later (just the threshold on the p-value)
    pvalue, tstat, sigma, MMDXY, MMDXZ = MMD_3_Sample_Test(data[:, :, 0], A_samples[:, :, 0], B_samples[:, :, 0], computeMMDs=True)
    print(pvalue, tstat, sigma)
    if pvalue < 0.05:
        print('At confidence level 0.05, we reject the null hypothesis that MMDXY <= MMDXZ, and conclude that', identifier_B, 'has a smaller MMD with the true data than', identifier_A)
    else:
        print('We have failed to reject the null hypothesis that MMDXY <= MMDXZ, and cannot conclude that', identifier_B, 'has a smaller MMD with the true data than', identifier_A)
    return pvalue, tstat, sigma, MMDXY, MMDXZ

# --- to do with reconstruction --- #

def get_reconstruction_errors(identifier, epoch, g_tolerance=0.05, max_samples=1000, rerun=False, tstr=False):
    """
    Get the reconstruction error of every point in the training set of a given
    experiment.
    """
    settings = json.load(open('./experiments/settings/' + identifier + '.txt', 'r'))
    if settings['data_load_from']:
        data_dict = np.load('./experiments/data/' + settings['data_load_from'] + '.data.npy').item()
    else:
        data_dict = np.load('./experiments/data/' + identifier + '.data.npy').item()
    samples = data_dict['samples']
    train = samples['train']
    vali = samples['vali']
    test = samples['test']
    labels = data_dict['labels']
    train_labels, test_labels, synth_labels, vali_labels = None, None, None, None
    try:
        if rerun:
            raise FileNotFoundError
        errors = np.load('./experiments/eval/' + identifier + '_' + str(epoch) + '_' + str(g_tolerance) + '.reconstruction_errors.npy').item()
        train_errors = errors['train']
        test_errors = errors['test']
        generated_errors = errors['generated']
        noisy_errors = errors['noisy']
        print('Loaded precomputed errors')
    except FileNotFoundError:
        if tstr:
            synth_data = np.load('./experiments/tstr/' + identifier + '_' + str(epoch) + '.data.npy').item()
            generated = synth_data['samples']
            synth_labels = synth_data['labels']
            train_labels = labels['train']
            test_labels = labels['test']
            vali_labels = labels['vali']
        else:
            # generate new data
            n_eval = 500
            # generate "easy" samples from the distribution
            generated = model.sample_trained_model(settings, epoch, n_eval)
            # generate "hard' random samples, not from train/test distribution
            # TODO: use original validation examples, add noise etc.
        ##    random_samples = np.random.normal(size=generated.shape)
        #    random_samples -= np.mean(random_samples, axis=0) 
        #    random_samples += np.mean(vali, axis=0)
        #    random_samples /= np.std(random_samples, axis=0)
        #    random_samples *= np.std(vali, axis=0)

        # get all the errors
        print('Getting reconstruction errors on train set')
        if train.shape[0] > max_samples:
            index_subset = np.random.permutation(train.shape[0])[:max_samples]
            train = train[index_subset]
            if train_labels is not None:
                train_labels = train_labels[index_subset]
        train_errors = error_per_sample(identifier, epoch, train, n_rep=5, g_tolerance=g_tolerance, C_samples=train_labels)
        print('Getting reconstruction errors on test set')
        if test.shape[0] > max_samples:
            index_subset = np.random.permutation(test.shape[0])[:max_samples]
            test = test[index_subset]
            if test_labels is not None:
                test_labels = test_labels[index_subset]
        test_errors = error_per_sample(identifier, epoch, test, n_rep=5, g_tolerance=g_tolerance, C_samples=test_labels)
        D_test, p_test = ks_2samp(train_errors, test_errors)
        print('KS statistic and p-value for train v. test erors:', D_test, p_test)
        pdb.set_trace()
        print('Getting reconstruction errors on generated set')
        generated_errors = error_per_sample(identifier, epoch, generated, n_rep=5, g_tolerance=g_tolerance, C_samples=synth_labels)
        D_gen, p_gen = ks_2samp(generated_errors, train_errors)
        print('KS statistic and p-value for train v. gen erors:', D_gen, p_gen)
        D_gentest, p_gentest = ks_2samp(generated_errors, test_errors)
        print('KS statistic and p-value for gen v. test erors:', D_gentest, p_gentest)
#        print('Getting reconstruction errors on noisy set')
#        alpha = 0.5
#        noisy_samples = alpha*vali + (1-alpha)*np.random.permutation(vali)
#        noisy_errors = error_per_sample(identifier, epoch, noisy_samples, n_rep=5, g_tolerance=g_tolerance, C_samples=vali_labels)
        noisy_errors = None
        # save!
        errors = {'train': train_errors, 'test': test_errors, 'generated': generated_errors, 'noisy': noisy_errors}
        np.save('./experiments/eval/' + identifier + '_' + str(epoch) + '_' + str(g_tolerance) + '.reconstruction_errors.npy', errors)
    # do two-sample Kolomogorov-Smirnov test for equality
    D_test, p_test = ks_2samp(train_errors, test_errors)
    print('KS statistic and p-value for train v. test erors:', D_test, p_test)
    D_gen, p_gen = ks_2samp(generated_errors, train_errors)
    print('KS statistic and p-value for train v. gen erors:', D_gen, p_gen)
    D_gentest, p_gentest = ks_2samp(generated_errors, test_errors)
    print('KS statistic and p-value for gen v. test erors:', D_gentest, p_gentest)
    # visualise distribution of errors for train and test
    plotting.reconstruction_errors(identifier + '_' + str(epoch) + '_' + str(g_tolerance), train_errors, test_errors, generated_errors, noisy_errors)
    # visualise the "hardest" and "easiest" samples from train
    ranking_train = np.argsort(train_errors)
    easiest_train = ranking_train[:6]
    hardest_train = ranking_train[-6:]
    plotting.save_plot_sample(train[easiest_train], epoch, identifier + '_easytrain', n_samples=6, num_epochs=None, ncol=2)
    plotting.save_plot_sample(train[hardest_train], epoch, identifier + '_hardtrain', n_samples=6, num_epochs=None, ncol=2)
    # visualise the "hardest" and "easiest" samples from random
#    ranking_random = np.argsort(noisy_errors)
#    easiest_random = ranking_random[:6]
#    hardest_random = ranking_random[-6:]
#    plotting.save_plot_sample(random_samples[easiest_random], epoch, identifier + '_easyrandom', n_samples=6, num_epochs=None, ncol=2)
#    plotting.save_plot_sample(random_samples[hardest_random], epoch, identifier + '_hardrandom', n_samples=6, num_epochs=None, ncol=2)
    return True

def error_per_sample(identifier, epoch, samples, n_rep=3, n_iter=None, g_tolerance=0.025, use_min=True, C_samples=None):
    """
    Get (average over a few runs) of the reconstruction error per sample
    """
    n_samples = samples.shape[0]
    heuristic_sigma = np.float32(mmd.median_pairwise_distance(samples))
    errors = np.zeros(shape=(n_samples, n_rep))
    for rep in range(n_rep):
        Z, rep_errors, sigma = model.invert(identifier, epoch, samples, n_iter=n_iter, heuristic_sigma=heuristic_sigma, g_tolerance=g_tolerance, C_samples=C_samples)
        errors[:, rep] = rep_errors
    # return min, or average?
    if use_min:
        errors = np.min(errors, axis=1)
    else:
        # use mean
        errors = np.mean(errors, axis=1)
    return errors

# --- visualisation evaluation --- #

def view_digit(identifier, epoch, digit, n_samples=6):
    """
    Generate a bunch of MNIST digits from a CGAN, view them
    """
    settings = json.load(open('./experiments/settings/' + identifier + '.txt', 'r'))
    if settings['one_hot']:
        assert settings['max_val'] == 1
        assert digit <= settings['cond_dim']
        C_samples = np.zeros(shape=(n_samples, settings['cond_dim']))
        C_samples[:, digit] = 1
    else:
        assert settings['cond_dim'] == 1
        assert digit <= settings['max_val']
        C_samples = np.array([digit]*n_samples).reshape(-1, 1)
    digit_samples = model.sample_trained_model(settings, epoch, n_samples, Z_samples=None, cond_dim=settings['cond_dim'], C_samples=C_samples)
    digit_samples = digit_samples.reshape(n_samples, -1, 1)
    # visualise
    plotting.save_mnist_plot_sample(digit_samples, digit, identifier + '_' + str(epoch) + '_digit_', n_samples)
    return True

def view_interpolation(identifier, epoch, n_steps=6, input_samples=None, e_tolerance=0.01, sigma=3.29286853021):
    """
    If samples: generate interpolation between real points
    Else:
        Sample two points in the latent space, view a linear interpolation between them.
    """
    settings = json.load(open('./experiments/settings/' + identifier + '.txt', 'r'))
    if input_samples is None:
        # grab two trainng examples
        data = np.load('./experiments/data/' + identifier + '.data.npy').item()
        train = data['samples']['train']
        input_samples = np.random.permutation(train)[:2]
#        Z_sampleA, Z_sampleB = model.sample_Z(2, settings['seq_length'], settings['latent_dim'], 
#                                          settings['use_time'])
        if sigma is None:
            ## gotta get a sigma somehow
            sigma = mmd.median_pairwise_distance(train)
            print('Calcualted heuristic sigma from training data:', sigma)
    Zs, error, _ = model.invert(settings, epoch, input_samples, e_tolerance=e_tolerance)
    Z_sampleA, Z_sampleB = Zs
    Z_samples = plotting.interpolate(Z_sampleA, Z_sampleB, n_steps=n_steps)
    samples = model.sample_trained_model(settings, epoch, Z_samples.shape[0], Z_samples)
    # get distances from generated samples to target samples
    d_A, d_B = [], []
    for sample in samples:
        d_A.append(sample_distance(sample, samples[0], sigma))
        d_B.append(sample_distance(sample, samples[-1], sigma))
    distances = pd.DataFrame({'dA': d_A, 'dB': d_B})
    plotting.save_plot_interpolate(input_samples, samples, epoch, settings['identifier'] + '_epoch' + str(epoch), distances=distances, sigma=sigma)
    return True

def view_latent_vary(identifier, epoch, n_steps=6):
    settings = json.load(open('./experiments/settings/' + identifier + '.txt', 'r'))
    Z_sample = model.sample_Z(1, settings['seq_length'], settings['latent_dim'], 
                                      settings['use_time'])[0]
    samples_dim = []
    for dim in range(settings['latent_dim']):
        Z_samples_dim = plotting.vary_latent_dimension(Z_sample, dim, n_steps)
        samples_dim.append(model.sample_trained_model(settings, epoch, Z_samples_dim.shape[0], Z_samples_dim))
    plotting.save_plot_vary_dimension(samples_dim, epoch, settings['identifier'] + '_varydim', n_dim=settings['latent_dim'])
    return True

def view_reconstruction(identifier, epoch, real_samples, tolerance=1):
    """
    Given a set of real samples, find the "closest" latent space points 
    corresponding to them, generate samples from these, visualise!
    """
    settings = json.load(open('./experiments/settings/' + identifier + '.txt', 'r'))
    Zs, error, sigma = model.invert(settings, epoch, real_samples, tolerance=tolerance)
    plotting.visualise_latent(Zs[0], identifier+'_' + str(epoch) + '_0')
    plotting.visualise_latent(Zs[1], identifier+'_' + str(epoch) + '_1')
    model_samples = model.sample_trained_model(settings, epoch, Zs.shape[0], Zs)
    plotting.save_plot_reconstruct(real_samples, model_samples, settings['identifier'])
    return True

def view_fixed(identifier, epoch, n_samples=6, dim=None):
    """ What happens when we give the same point at each time step? """
    settings = json.load(open('./experiments/settings/' + identifier + '.txt', 'r'))
    Z_samples = model.sample_Z(n_samples, settings['seq_length'], settings['latent_dim'], 
                                      settings['use_time'])
    # now, propagate forward the value at time 0 (which time doesn't matter)
    for i in range(1, settings['seq_length']):
        if dim is None:
            Z_samples[:, i, :] = Z_samples[:, 0, :]
        else:
            Z_samples[:, i, dim] = Z_samples[:, 0, dim]
    # now generate
    samples = model.sample_trained_model(settings, epoch, n_samples, Z_samples)
    # now visualise
    plotting.save_plot_sample(samples, epoch, identifier + '_fixed', n_samples)
    return True

def view_params(identifier, epoch):
    """ Visualise weight matrices in the GAN """
    settings = json.load(open('./experiments/settings/' + identifier + '.txt', 'r'))
    parameters = model.load_parameters(identifier + '_' + str(epoch))
    plotting.plot_parameters(parameters, identifier + '_' + str(epoch))
    return True

# --- to do with samples --- #

def sample_distance(sampleA, sampleB, sigma):
    """
    I know this isn't the best distance measure, alright.
    """
    # RBF!
    gamma = 1 / (2 * sigma**2)
    similarity = np.exp(-gamma*(np.linalg.norm(sampleA - sampleB)**2))
    distance = 1 - similarity
    return distance

### --- TSTR ---- ###

def train_CNN(train_X, train_Y, vali_X, vali_Y, test_X):
    """
    Train a CNN (code copied/adapted from Cristobal's mnist_keras_trts_0_2)
	(ONLY MNIST, ONLY 14x14)
    (ONLY DIGITS UP TO 3)
    """
    print('Training CNN!')
    input_shape = (14,14,1)
    batch_size = 128
    num_classes = 3
    epochs = 1000

    m = Sequential()
    m.add(Conv2D(16, kernel_size=(3, 3),
                        activation='relu',
                        input_shape=input_shape))
    m.add(Conv2D(32, (3, 3), activation='relu'))
    m.add(MaxPooling2D(pool_size=(2, 2)))
    m.add(Dropout(0.25))
    m.add(Flatten())
    m.add(Dense(128, activation='relu'))
    m.add(Dropout(0.5))
    m.add(Dense(num_classes, activation='softmax'))

    m.compile(loss=keras.losses.categorical_crossentropy,
            optimizer=keras.optimizers.Adadelta(),
            metrics=['accuracy'])

    earlyStopping=keras.callbacks.EarlyStopping(monitor='val_loss', patience=0, verbose=1, mode='auto')
    m.fit(np.expand_dims(train_X, axis=-1), train_Y,
            batch_size=batch_size,
            epochs=epochs,
            verbose=1,
            validation_data=(np.expand_dims(vali_X, axis=-1), vali_Y),
            callbacks=[earlyStopping])
    test_predictions = m.predict(np.expand_dims(test_X, axis=-1))
    return test_predictions

def TSTR_mnist(identifier, epoch, generate=True, duplicate_synth=1, vali=True, CNN=False, reverse=False):
    """
    Either load or generate synthetic training, real test data...
    Load synthetic training, real test data, do multi-class SVM
    (basically just this: http://scikit-learn.org/stable/auto_examples/classification/plot_digits_classification.html)

    If reverse = True: do TRTS
    """
    print('Running TSTR on', identifier, 'at epoch', epoch)
    if vali:
        test_set = 'vali'
    else:
        test_set = 'test'
    if generate:
        data = np.load('./experiments/data/' + identifier + '.data.npy').item()
        samples = data['samples']
        train_X = samples['train']
        test_X = samples[test_set]
        labels = data['labels']
        train_Y = labels['train']
        test_Y = labels[test_set]
        # now sample from the model
        synth_Y = np.tile(train_Y, [duplicate_synth, 1])
        synth_X = model.sample_trained_model(identifier, epoch, num_samples=synth_Y.shape[0], C_samples=synth_Y)
        # for use in TRTS
        synth_testX = model.sample_trained_model(identifier, epoch, num_samples=test_Y.shape[0], C_samples=test_Y)
        synth_data = {'samples': synth_X, 'labels': synth_Y, 'test_samples': synth_testX, 'test_labels': test_Y}
        np.save('./experiments/tstr/' + identifier + '_' + str(epoch) + '.data.npy', synth_data)
    else:
        print('Loading synthetic data from pre-sampled model')
        exp_data = np.load('./experiments/tstr/' + identifier + '_' + str(epoch) + '.data.npy').item()
        test_X, test_Y = exp_data['test_data'], exp_data['test_labels']
        train_X, train_Y = exp_data['train_data'], exp_data['train_labels']
        synth_X, synth_Y = exp_data['synth_data'], exp_data['synth_labels']
    if reverse:
        which_setting = 'trts'
        print('Swapping synthetic test set in for real, to do TRTS!')
        test_X = synth_testX
    else:
        print('Doing normal TSTR')
        which_setting = 'tstr'
    # make classifier
    if not CNN:
        model_choice = 'RF'
         # if multivariate, reshape
        if len(test_X.shape) == 3:
            test_X = test_X.reshape(test_X.shape[0], -1)
        if len(train_X.shape) == 3:
            train_X = train_X.reshape(train_X.shape[0], -1)
        if len(synth_X.shape) == 3:
            synth_X = synth_X.reshape(synth_X.shape[0], -1)
        # if one hot, fix
        if len(synth_Y.shape) > 1 and not synth_Y.shape[1] == 1:
            synth_Y = np.argmax(synth_Y, axis=1)
            train_Y = np.argmax(train_Y, axis=1)
            test_Y = np.argmax(test_Y, axis=1)
       # random forest
        #synth_classifier = SVC(gamma=0.001)
        #real_classifier = SVC(gamma=0.001)
        synth_classifier = RandomForestClassifier(n_estimators=500)
        real_classifier = RandomForestClassifier(n_estimators=500)
        # fit
        real_classifier.fit(train_X, train_Y)
        synth_classifier.fit(synth_X, synth_Y)
        # test on real
        synth_predY = synth_classifier.predict(test_X)
        real_predY = real_classifier.predict(test_X)
    else:
        model_choice = 'CNN'
        synth_predY = train_CNN(synth_X, synth_Y, samples['vali'], labels['vali'], test_X)
        clear_session()
        real_predY = train_CNN(train_X, train_Y, samples['vali'], labels['vali'], test_X)
        clear_session()
        # CNN setting is all 'one-hot'
        test_Y = np.argmax(test_Y, axis=1)
        synth_predY = np.argmax(synth_predY, axis=1)
        real_predY = np.argmax(real_predY, axis=1)
    
    # report on results
    synth_prec, synth_recall, synth_f1, synth_support = precision_recall_fscore_support(test_Y, synth_predY, average='weighted')
    synth_accuracy = accuracy_score(test_Y, synth_predY)
    synth_auprc = 'NaN'
    synth_auroc = 'NaN'
    synth_scores = [synth_prec, synth_recall, synth_f1, synth_accuracy, synth_auprc, synth_auroc]
    real_prec, real_recall, real_f1, real_support = precision_recall_fscore_support(test_Y, real_predY, average='weighted')
    real_accuracy = accuracy_score(test_Y, real_predY)
    real_auprc = 'NaN'
    real_auroc = 'NaN'
    real_scores = [real_prec, real_recall, real_f1, real_accuracy, real_auprc, real_auroc]
    
    all_scores = synth_scores + real_scores

    if vali:
        report_file = open('./experiments/tstr/vali.' + which_setting + '_report.v3.csv', 'a')
        report_file.write('mnist,' + identifier + ',' + model_choice + ',' + str(epoch) + ',' + ','.join(map(str, all_scores)) + '\n')
        report_file.close()
    else:
        report_file = open('./experiments/tstr/' + which_setting + '_report.v3.csv', 'a')
        report_file.write('mnist,' + identifier + ',' + model_choice + ',' + str(epoch) + ',' + ','.join(map(str, all_scores)) + '\n')
        report_file.close()
        # visualise results
        try:
            plotting.view_mnist_eval(identifier + '_' + str(epoch), train_X, train_Y, synth_X, synth_Y, test_X, test_Y, synth_predY, real_predY)
        except ValueError:
            print('PLOTTING ERROR')
            pdb.set_trace()
    print(classification_report(test_Y, synth_predY))
    print(classification_report(test_Y, real_predY))
    return synth_f1, real_f1

def TSTR_eICU(identifier, epoch, generate=True, vali=True, CNN=False, do_OR=False, duplicate_synth=1, reverse=False):
    """
    """
    if vali:
        test_set = 'vali'
    else:
        test_set = 'test'
    data = np.load('./experiments/data/' + identifier + '.data.npy').item()
    samples = data['samples']
    train_X = samples['train']
    test_X = samples[test_set]
    labels = data['labels']
    train_Y = labels['train']
    test_Y = labels[test_set]
    if generate:
        # now sample from the model
        synth_Y = np.tile(train_Y, [duplicate_synth, 1])
        synth_X = model.sample_trained_model(identifier, epoch, num_samples=synth_Y.shape[0], C_samples=synth_Y)
        # for use in TRTS
        synth_testX = model.sample_trained_model(identifier, epoch, num_samples=test_Y.shape[0], C_samples=test_Y)
        synth_data = {'samples': synth_X, 'labels': synth_Y, 'test_samples': synth_testX, 'test_labels': test_Y}
        np.save('./experiments/tstr/' + identifier + '_' + str(epoch) + '.data.npy', synth_data)
    else:
        print('Loading pre-generated data')
        print('WARNING: not implemented for TRTS')
        # get "train" data
        exp_data = np.load('./experiments/tstr/' + identifier + '_' + str(epoch) + '.data.npy').item()
        synth_X = exp_data['samples']
        synth_Y = exp_data['labels']
        n_synth = synth_X.shape[0]
        synth_X = synth_X.reshape(n_synth, -1)
    #    pdb.set_trace()
    #    # ALERT ALERT MODIFYING
    #    synth_X = 2*(synth_X > 0) - 1
    orig_data = np.load('/cluster/home/hyland/eICU_task_data.npy').item()
    if reverse:
        which_setting = 'trts'
    # visualise distribution of errors for train and test
        print('Swapping synthetic test set in for real, to do TRTS!')
        test_X = synth_testX
    else:
        print('Doing normal TSTR')
        which_setting = 'tstr'
#    # get test data
#    test_X = data['test_X']
#    test_Y = data['test_Y']
    if not CNN:
        model_choice = 'RF'
         # if multivariate, reshape
        if len(test_X.shape) == 3:
            test_X = test_X.reshape(test_X.shape[0], -1)
        if len(train_X.shape) == 3:
            train_X = train_X.reshape(train_X.shape[0], -1)
        if len(synth_X.shape) == 3:
            synth_X = synth_X.reshape(synth_X.shape[0], -1)
    else:
        raise ValueError(CNN)
        model_choice = 'CNN'
    # we will select the best validation set epoch based on F1 score, take average across all the tasks
    score_list = []
    for label in range(synth_Y.shape[1]):
        task = orig_data['Y_columns'][label]
        if vali:
            if not task in ['low_sao2', 'high_heartrate', 'low_respiration']:
                print('Skipping task', task, 'because validation evaluation.')
                continue
        print('Evaluating on task:', task)
        #print('(', np.mean(synth_Y[:, label]), 'positive in train, ', np.mean(test_Y[:, label]), 'in test)')
        #m = RandomForestClassifier(n_estimators=50).fit(synth_X, synth_Y[:, label])
        #m = SVC(gamma=0.001).fit(synth_X, synth_Y[:, label])
        synth_classifier = RandomForestClassifier(n_estimators=100).fit(synth_X, synth_Y[:, label])
        synth_predY = synth_classifier.predict(test_X)
        synth_predY_prob = synth_classifier.predict_proba(test_X)[:, 1]
        real_classifier = RandomForestClassifier(n_estimators=100).fit(train_X, train_Y[:, label])
        real_predY = real_classifier.predict(test_X)
        real_predY_prob = real_classifier.predict_proba(test_X)[:, 1]
        #print('(predicted', np.mean(predict), 'positive labels)')
        
        synth_prec, synth_recall, synth_f1, synth_support = precision_recall_fscore_support(test_Y[:, label], synth_predY, average='weighted')
        synth_accuracy = accuracy_score(test_Y[:, label], synth_predY)
        synth_auprc = average_precision_score(test_Y[:, label], synth_predY_prob)
        synth_auroc = roc_auc_score(test_Y[:, label], synth_predY_prob)
        synth_scores = [synth_prec, synth_recall, synth_f1, synth_accuracy, synth_auprc, synth_auroc]

        real_prec, real_recall, real_f1, real_support = precision_recall_fscore_support(test_Y[:, label], real_predY, average='weighted')
        real_accuracy = accuracy_score(test_Y[:, label], real_predY)
        real_auprc = average_precision_score(test_Y[:, label], real_predY_prob)
        real_auroc = roc_auc_score(test_Y[:, label], real_predY_prob)
        real_scores = [real_prec, real_recall, real_f1, real_accuracy, real_auprc, real_auroc]
        
        all_scores = synth_scores + real_scores

        if vali:
            report_file = open('./experiments/tstr/vali.' + which_setting + '_report.v3.csv', 'a')
            report_file.write('eICU_' + task + ',' + identifier + ',' + model_choice + ',' + str(epoch) + ',' + ','.join(map(str, all_scores)) + '\n')
            report_file.close()
        else:
            report_file = open('./experiments/tstr/' + which_setting + '_report.v3.csv', 'a')
            report_file.write('eICU_' + task + ',' + identifier + ',' + model_choice + ',' + str(epoch) + ',' + ','.join(map(str, all_scores)) + '\n')
            report_file.close()
        
        print(classification_report(test_Y[:, label], synth_predY))
        print(classification_report(test_Y[:, label], real_predY))
        if task in ['low_sao2', 'high_heartrate', 'low_respiration']:
            score_list.append(synth_auprc + synth_auroc)

    if do_OR:
        raise NotImplementedError
        # do the OR task
        extreme_heartrate_test = test_Y[:, 1] + test_Y[:, 4]
        extreme_respiration_test = test_Y[:, 2] + test_Y[:, 5]
        extreme_systemicmean_test = test_Y[:, 3] + test_Y[:, 6]
        Y_OR_test = np.vstack([extreme_heartrate_test, extreme_respiration_test, extreme_systemicmean_test]).T
        Y_OR_test = (Y_OR_test > 0)*1

        extreme_heartrate_synth = synth_Y[:, 1] + synth_Y[:, 4]
        extreme_respiration_synth = synth_Y[:, 2] + synth_Y[:, 5]
        extreme_systemicmean_synth = synth_Y[:, 3] + synth_Y[:, 6]
        Y_OR_synth = np.vstack([extreme_heartrate_synth, extreme_respiration_synth, extreme_systemicmean_synth]).T
        Y_OR_synth = (Y_OR_synth > 0)*1

        OR_names = ['extreme heartrate', 'extreme respiration', 'extreme MAP']
        OR_results = []
        for label in range(Y_OR_synth.shape[1]):
            print('task:', OR_names[label])
            print('(', np.mean(Y_OR_synth[:, label]), 'positive in train, ', np.mean(Y_OR_test[:, label]), 'in test)')
            m = RandomForestClassifier(n_estimators=500).fit(synth_X, Y_OR_synth[:, label])
            predict = m.predict(X_test)
            print('(predicted', np.mean(predict), 'positive labels)')
            accuracy = accuracy_score(Y_OR_test[:, label], predict)
            precision = sklearn.metrics.precision_score(Y_OR_test[:, label], predict)
            recall = sklearn.metrics.recall_score(Y_OR_test[:, label], predict)
            print(accuracy, precision, recall)
            OR_results.append([accuracy, precision, recall])
    else:
        OR_results = []

    score_across_tasks = np.mean(np.array(score_list))
    return score_across_tasks

def NIPS_toy_plot(identifier_rbf, epoch_rbf, identifier_sine, epoch_sine, identifier_mnist, epoch_mnist):
    """
    for each experiment:
    - plot a bunch of train examples
    - sample a bunch of generated examples
    - plot all in separate PDFs so i can merge in illustrator

    for sine and rbf, grey background
    MNIST is just MNIST (square though)
    """
    n_samples = 15
    # settings
    settings_rbf = json.load(open('./experiments/settings/' + identifier_rbf + '.txt', 'r'))
    settings_sine = json.load(open('./experiments/settings/' + identifier_sine + '.txt', 'r'))
    settings_mnist = json.load(open('./experiments/settings/' + identifier_mnist + '.txt', 'r'))
    # data
    data_rbf = np.load('./experiments/data/' + identifier_rbf + '.data.npy').item()
    data_sine = np.load('./experiments/data/' + identifier_sine + '.data.npy').item()
    data_mnist = np.load('./experiments/data/' + identifier_mnist + '.data.npy').item()
    train_rbf = data_rbf['samples']['train']
    train_sine = data_sine['samples']['train']
    train_mnist = data_mnist['samples']['train']
    # sample
    samples_rbf = model.sample_trained_model(settings_rbf, epoch_rbf, n_samples)
    samples_sine = model.sample_trained_model(settings_sine, epoch_sine, n_samples)
    samples_mnist = model.sample_trained_model(settings_mnist, epoch_mnist, n_samples)
    # plot them all
    index = 0
    #for sample in np.random.permutation(train_rbf)[:n_samples]:
    #    plotting.nips_plot_rbf(sample, index, 'train')
    #    index += 1
    #for sample in samples_rbf:
    #    plotting.nips_plot_rbf(sample, index, 'GAN')
    #    index += 1
    #for sample in np.random.permutation(train_sine)[:n_samples]:
    #    plotting.nips_plot_sine(sample, index, 'train')
    #    index += 1
    #for sample in samples_sine:
    #    plotting.nips_plot_sine(sample, index, 'GAN')
    #    index += 1
    for sample in np.random.permutation(train_mnist)[:n_samples]:
        plotting.nips_plot_mnist(sample, index, 'train')
        index += 1
    for sample in samples_mnist:
        plotting.nips_plot_mnist(sample, index, 'GAN')
        index += 1
    return True


================================================
FILE: experiments/settings/kdd99.txt
================================================
{
"settings_file": "",
"data": "kdd99",
"seq_length": 30,
"num_signals": 6,
"normalise": false,
"scale": 0.1,
"freq_low": 1.0,
"freq_high": 5.0,
"amplitude_low": 0.1,
"amplitude_high": 0.9,
"multivariate_mnist": false,
"full_mnist": false,
"data_load_from": "",
"resample_rate_in_min": 15,
"hidden_units_g": 100,
"hidden_units_d": 100,
"hidden_units_e": 100,
"kappa": 1,
"latent_dim": 15,
"weight": 0.5,
"degree": 1,
"batch_mean": false,
"learn_scale": false,
"learning_rate": 0.1,
"batch_size": 500,
"num_epochs": 100,
"D_rounds": 1,
"G_rounds": 3,
"E_rounds": 1,
"shuffle": true,
"eval_mul": false,
"eval_an": false,
"eval_single": false,
"wrong_labels": false,
"identifier": "kdd99",
"sub_id": "kdd99",
"dp": false,
"l2norm_bound": 1e-05,
"batches_per_lot": 1,
"dp_sigma": 1e-05,
"use_time": false,
"seq_step": 10,
"num_generated_features": 6
}

================================================
FILE: experiments/settings/kdd99_test.txt
================================================
{
"settings_file": "",
"data": "kdd99_test",
"seq_length": 30,
"num_signals": 6,
"normalise": false,
"scale": 0.1,
"freq_low": 1.0,
"freq_high": 5.0,
"amplitude_low": 0.1,
"amplitude_high": 0.9,
"multivariate_mnist": false,
"full_mnist": false,
"data_load_from": "",
"resample_rate_in_min": 15,
"hidden_units_g": 100,
"hidden_units_d": 100,
"hidden_units_e": 100,
"kappa": 1,
"latent_dim": 15,
"weight": 0.5,
"degree": 1,
"batch_mean": false,
"learn_scale": false,
"learning_rate": 0.1,
"batch_size": 500,
"num_epochs": 100,
"D_rounds": 1,
"G_rounds": 3,
"E_rounds": 1,
"shuffle": true,
"eval_mul": false,
"eval_an": false,
"eval_single": false,
"wrong_labels": false,
"identifier": "kdd99_test",
"sub_id": "kdd99",
"dp": false,
"l2norm_bound": 1e-05,
"batches_per_lot": 1,
"dp_sigma": 1e-05,
"use_time": false,
"seq_step": 10,
"num_generated_features": 6
}

================================================
FILE: mmd.py
================================================
'''
MMD functions implemented in tensorflow.
(from https://github.com/dougalsutherland/opt-mmd/blob/master/gan/mmd.py)
'''
from __future__ import division

import tensorflow as tf

from tf_ops import dot, sq_sum

from scipy.spatial.distance import pdist
from numpy import median, vstack, einsum
import pdb
import numpy as np

_eps=1e-8

################################################################################
### Quadratic-time MMD with Gaussian RBF kernel

def _mix_rbf_kernel(X, Y, sigmas, wts=None):
    """
    """
    if wts is None:
        wts = [1.0] * sigmas.get_shape()[0]

    # debug!
    if len(X.shape) == 2:
        # matrix
        XX = tf.matmul(X, X, transpose_b=True)
        XY = tf.matmul(X, Y, transpose_b=True)
        YY = tf.matmul(Y, Y, transpose_b=True)
    elif len(X.shape) == 3:
        # tensor -- this is computing the Frobenius norm
        XX = tf.tensordot(X, X, axes=[[1, 2], [1, 2]])
        XY = tf.tensordot(X, Y, axes=[[1, 2], [1, 2]])
        YY = tf.tensordot(Y, Y, axes=[[1, 2], [1, 2]])
    else:
        raise ValueError(X)

    X_sqnorms = tf.diag_part(XX)
    Y_sqnorms = tf.diag_part(YY)

    r = lambda x: tf.expand_dims(x, 0)
    c = lambda x: tf.expand_dims(x, 1)

    K_XX, K_XY, K_YY = 0, 0, 0
    for sigma, wt in zip(tf.unstack(sigmas, axis=0), wts):
        gamma = 1 / (2 * sigma**2)
        K_XX += wt * tf.exp(-gamma * (-2 * XX + c(X_sqnorms) + r(X_sqnorms)))
        K_XY += wt * tf.exp(-gamma * (-2 * XY + c(X_sqnorms) + r(Y_sqnorms)))
        K_YY += wt * tf.exp(-gamma * (-2 * YY + c(Y_sqnorms) + r(Y_sqnorms)))

    return K_XX, K_XY, K_YY, tf.reduce_sum(wts)


def rbf_mmd2(X, Y, sigma=1, biased=True):
    return mix_rbf_mmd2(X, Y, sigmas=[sigma], biased=biased)


def mix_rbf_mmd2(X, Y, sigmas=(1,), wts=None, biased=True):
    K_XX, K_XY, K_YY, d = _mix_rbf_kernel(X, Y, sigmas, wts)
    return _mmd2(K_XX, K_XY, K_YY, const_diagonal=d, biased=biased)


def rbf_mmd2_and_ratio(X, Y, sigma=1, biased=True):
    return mix_rbf_mmd2_and_ratio(X, Y, sigmas=[sigma], biased=biased)


def mix_rbf_mmd2_and_ratio(X, Y, sigmas=(1,), wts=None, biased=True):
    K_XX, K_XY, K_YY, d = _mix_rbf_kernel(X, Y, sigmas, wts)
    return _mmd2_and_ratio(K_XX, K_XY, K_YY, const_diagonal=d, biased=biased)


################################################################################
### Helper functions to compute variances based on kernel matrices


def _mmd2(K_XX, K_XY, K_YY, const_diagonal=False, biased=False):
    m = tf.cast(K_XX.get_shape()[0], tf.float32)
    n = tf.cast(K_YY.get_shape()[0], tf.float32)

    if biased:
        mmd2 = (tf.reduce_sum(K_XX) / (m * m)
              + tf.reduce_sum(K_YY) / (n * n)
              - 2 * tf.reduce_sum(K_XY) / (m * n))
    else:
        if const_diagonal is not False:
            trace_X = m * const_diagonal
            trace_Y = n * const_diagonal
        else:
            trace_X = tf.trace(K_XX)
            trace_Y = tf.trace(K_YY)

        mmd2 = ((tf.reduce_sum(K_XX) - trace_X) / (m * (m - 1))
              + (tf.reduce_sum(K_YY) - trace_Y) / (n * (n - 1))
              - 2 * tf.reduce_sum(K_XY) / (m * n))

    return mmd2


def _mmd2_and_ratio(K_XX, K_XY, K_YY, const_diagonal=False, biased=False,
                    min_var_est=_eps):
    mmd2, var_est = _mmd2_and_variance(
        K_XX, K_XY, K_YY, const_diagonal=const_diagonal, biased=biased)
    ratio = mmd2 / tf.sqrt(tf.maximum(var_est, min_var_est))
    return mmd2, ratio


def _mmd2_and_variance(K_XX, K_XY, K_YY, const_diagonal=False, biased=False):
    m = tf.cast(K_XX.get_shape()[0], tf.float32)  # Assumes X, Y are same shape

    ### Get the various sums of kernels that we'll use
    # Kts drop the diagonal, but we don't need to compute them explicitly
    if const_diagonal is not False:
        const_diagonal = tf.cast(const_diagonal, tf.float32)
        diag_X = diag_Y = const_diagonal
        sum_diag_X = sum_diag_Y = m * const_diagonal
        sum_diag2_X = sum_diag2_Y = m * const_diagonal**2
    else:
        diag_X = tf.diag_part(K_XX)
        diag_Y = tf.diag_part(K_YY)

        sum_diag_X = tf.reduce_sum(diag_X)
        sum_diag_Y = tf.reduce_sum(diag_Y)

        sum_diag2_X = sq_sum(diag_X)
        sum_diag2_Y = sq_sum(diag_Y)

    Kt_XX_sums = tf.reduce_sum(K_XX, 1) - diag_X
    Kt_YY_sums = tf.reduce_sum(K_YY, 1) - diag_Y
    K_XY_sums_0 = tf.reduce_sum(K_XY, 0)
    K_XY_sums_1 = tf.reduce_sum(K_XY, 1)

    Kt_XX_sum = tf.reduce_sum(Kt_XX_sums)
    Kt_YY_sum = tf.reduce_sum(Kt_YY_sums)
    K_XY_sum = tf.reduce_sum(K_XY_sums_0)

    Kt_XX_2_sum = sq_sum(K_XX) - sum_diag2_X
    Kt_YY_2_sum = sq_sum(K_YY) - sum_diag2_Y
    K_XY_2_sum  = sq_sum(K_XY)

    if biased:
        mmd2 = ((Kt_XX_sum + sum_diag_X) / (m * m)
              + (Kt_YY_sum + sum_diag_Y) / (m * m)
              - 2 * K_XY_sum / (m * m))
    else:
        mmd2 = ((Kt_XX_sum + sum_diag_X) / (m * (m-1))
              + (Kt_YY_sum + sum_diag_Y) / (m * (m-1))
              - 2 * K_XY_sum / (m * m))

    var_est = (
          2 / (m**2 * (m-1)**2) * (
              2 * sq_sum(Kt_XX_sums) - Kt_XX_2_sum
            + 2 * sq_sum(Kt_YY_sums) - Kt_YY_2_sum)
        - (4*m-6) / (m**3 * (m-1)**3) * (Kt_XX_sum**2 + Kt_YY_sum**2)
        + 4*(m-2) / (m**3 * (m-1)**2) * (
              sq_sum(K_XY_sums_1) + sq_sum(K_XY_sums_0))
        - 4 * (m-3) / (m**3 * (m-1)**2) * K_XY_2_sum
        - (8*m - 12) / (m**5 * (m-1)) * K_XY_sum**2
        + 8 / (m**3 * (m-1)) * (
              1/m * (Kt_XX_sum + Kt_YY_sum) * K_XY_sum
            - dot(Kt_XX_sums, K_XY_sums_1)
            - dot(Kt_YY_sums, K_XY_sums_0))
    )

    return mmd2, var_est


### additions from stephanie, for convenience

def median_pairwise_distance(X, Y=None):
    """
    Heuristic for bandwidth of the RBF. Median pairwise distance of joint data.
    If Y is missing, just calculate it from X:
        this is so that, during training, as Y changes, we can use a fixed
        bandwidth (and save recalculating this each time we evaluated the mmd)
    At the end of training, we do the heuristic "correctly" by including
    both X and Y.

    Note: most of this code is assuming tensorflow, but X and Y are just ndarrays
    """
    if Y is None:
        Y = X       # this is horrendously inefficient, sorry
   
    if len(X.shape) == 2:
        # matrix
        X_sqnorms = einsum('...i,...i', X, X)
        Y_sqnorms = einsum('...i,...i', Y, Y)
        XY = einsum('ia,ja', X, Y)
    elif len(X.shape) == 3:
        # tensor -- this is computing the Frobenius norm
        X_sqnorms = einsum('...ij,...ij', X, X)
        Y_sqnorms = einsum('...ij,...ij', Y, Y)
        XY = einsum('iab,jab', X, Y)
    else:
        raise ValueError(X)

    distances = np.sqrt(X_sqnorms.reshape(-1, 1) - 2*XY + Y_sqnorms.reshape(1, -1))
    return median(distances)


def median_pairwise_distance_o(X, Y=None):
    """
    Heuristic for bandwidth of the RBF. Median pairwise distance of joint data.
    If Y is missing, just calculate it from X:
        this is so that, during training, as Y changes, we can use a fixed
        bandwidth (and save recalculating this each time we evaluated the mmd)
    At the end of training, we do the heuristic "correctly" by including
    both X and Y.

    Note: most of this code is assuming tensorflow, but X and Y are just ndarrays
    """
    if Y is None:
        Y = X  # this is horrendously inefficient, sorry

    if len(X.shape) == 2:
        # matrix
        X_sqnorms = np.einsum('...i,...i', X, X)
        Y_sqnorms = np.einsum('...i,...i', Y, Y)
        XY = np.einsum('ia,ja', X, Y)
    elif len(X.shape) == 3:
        # tensor -- this is computing the Frobenius norm
        X_sqnorms = np.einsum('...ij,...ij', X, X)  # reduce the tensor shape
        Y_sqnorms = np.einsum('...ij,...ij', Y, Y)
        XY = np.einsum('iab,jab', X, Y)  # X*Y^T??
    else:
        raise ValueError(X)

    distances = np.sqrt(X_sqnorms.reshape(-1, 1) - 2 * XY + Y_sqnorms.reshape(1, -1))
    distances = distances.reshape(-1, 1)
    distances = distances[~np.isnan(distances)]
    return np.median(distances)

================================================
FILE: mod_core_rnn_cell_impl.py
================================================
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#modified by Stephanie (@corcra) to enable initializing the bias term in lstm """
# ==============================================================================

"""Module implementing RNN Cells."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import contextlib
import hashlib
import math
import numbers

from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
from tensorflow.python.framework import tensor_util
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import clip_ops
from tensorflow.python.ops import embedding_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import partitioned_variables
from tensorflow.python.ops import random_ops
from tensorflow.python.ops import variable_scope as vs

from tensorflow.python.ops.math_ops import sigmoid
from tensorflow.python.ops.math_ops import tanh
#from tensorflow.python.ops.rnn_cell_impl import _RNNCell as RNNCell
from tensorflow.python.ops.rnn_cell_impl import RNNCell

from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util import nest


_BIAS_VARIABLE_NAME = "biases"
_WEIGHTS_VARIABLE_NAME = "weights"


@contextlib.contextmanager
def _checked_scope(cell, scope, reuse=None, **kwargs):
  if reuse is not None:
    kwargs["reuse"] = reuse
  with vs.variable_scope(scope, **kwargs) as checking_scope:
    scope_name = checking_scope.name
    if hasattr(cell, "_scope"):
      cell_scope = cell._scope  # pylint: disable=protected-access
      if cell_scope.name != checking_scope.name:
        raise ValueError(
            "Attempt to reuse RNNCell %s with a different variable scope than "
            "its first use.  First use of cell was with scope '%s', this "
            "attempt is with scope '%s'.  Please create a new instance of the "
            "cell if you would like it to use a different set of weights.  "
            "If before you were using: MultiRNNCell([%s(...)] * num_layers), "
            "change to: MultiRNNCell([%s(...) for _ in range(num_layers)]).  "
            "If before you were using the same cell instance as both the "
            "forward and reverse cell of a bidirectional RNN, simply create "
            "two instances (one for forward, one for reverse).  "
            "In May 2017, we will start transitioning this cell's behavior "
            "to use existing stored weights, if any, when it is called "
            "with scope=None (which can lead to silent model degradation, so "
            "this error will remain until then.)"
            % (cell, cell_scope.name, scope_name, type(cell).__name__,
               type(cell).__name__))
    else:
      weights_found = False
      try:
        with vs.variable_scope(checking_scope, reuse=True):
          vs.get_variable(_WEIGHTS_VARIABLE_NAME)
        weights_found = True
      except ValueError:
        pass
      if weights_found and reuse is None:
        raise ValueError(
            "Attempt to have a second RNNCell use the weights of a variable "
            "scope that already has weights: '%s'; and the cell was not "
            "constructed as %s(..., reuse=True).  "
            "To share the weights of an RNNCell, simply "
            "reuse it in your second calculation, or create a new one with "
            "the argument reuse=True." % (scope_name, type(cell).__name__))

    # Everything is OK.  Update the cell's scope and yield it.
    cell._scope = checking_scope  # pylint: disable=protected-access
    yield checking_scope


class BasicRNNCell(RNNCell):
  """The most basic RNN cell."""

  def __init__(self, num_units, input_size=None, activation=tanh, reuse=None):
    if input_size is not None:
      logging.warn("%s: The input_size parameter is deprecated.", self)
    self._num_units = num_units
    self._activation = activation
    self._reuse = reuse

  @property
  def state_size(self):
    return self._num_units

  @property
  def output_size(self):
    return self._num_units

  def __call__(self, inputs, state, scope=None):
    """Most basic RNN: output = new_state = act(W * input + U * state + B)."""
    with _checked_scope(self, scope or "basic_rnn_cell", reuse=self._reuse):
      output = self._activation(
          _linear([inputs, state], self._num_units, True))
    return output, output


class GRUCell(RNNCell):
  """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078)."""

  def __init__(self, num_units, input_size=None, activation=tanh, reuse=None):
    if input_size is not None:
      logging.warn("%s: The input_size parameter is deprecated.", self)
    self._num_units = num_units
    self._activation = activation
    self._reuse = reuse

  @property
  def state_size(self):
    return self._num_units

  @property
  def output_size(self):
    return self._num_units

  def __call__(self, inputs, state, scope=None):
    """Gated recurrent unit (GRU) with nunits cells."""
    with _checked_scope(self, scope or "gru_cell", reuse=self._reuse):
      with vs.variable_scope("gates"):  # Reset gate and update gate.
        # We start with bias of 1.0 to not reset and not update.
        value = sigmoid(_linear(
          [inputs, state], 2 * self._num_units, True, 1.0))
        r, u = array_ops.split(
            value=value,
            num_or_size_splits=2,
            axis=1)
      with vs.variable_scope("candidate"):
        c = self._activation(_linear([inputs, r * state],
                                     self._num_units, True))
      new_h = u * state + (1 - u) * c
    return new_h, new_h


_LSTMStateTuple = collections.namedtuple("LSTMStateTuple", ("c", "h"))


class LSTMStateTuple(_LSTMStateTuple):
  """Tuple used by LSTM Cells for `state_size`, `zero_state`, and output state.

  Stores two elements: `(c, h)`, in that order.

  Only used when `state_is_tuple=True`.
  """
  __slots__ = ()

  @property
  def dtype(self):
    (c, h) = self
    if not c.dtype == h.dtype:
      raise TypeError("Inconsistent internal state: %s vs %s" %
                      (str(c.dtype), str(h.dtype)))
    return c.dtype


class BasicLSTMCell(RNNCell):
  """Basic LSTM recurrent network cell.

  The implementation is based on: http://arxiv.org/abs/1409.2329.

  We add forget_bias (default: 1) to the biases of the forget gate in order to
  reduce the scale of forgetting in the beginning of the training.

  It does not allow cell clipping, a projection layer, and does not
  use peep-hole connections: it is the basic baseline.

  For advanced models, please use the full LSTMCell that follows.
  """

  def __init__(self, num_units, forget_bias=1.0, input_size=None,
               state_is_tuple=True, activation=tanh, reuse=None):
    """Initialize the basic LSTM cell.

    Args:
      num_units: int, The number of units in the LSTM cell.
      forget_bias: float, The bias added to forget gates (see above).
      input_size: Deprecated and unused.
      state_is_tuple: If True, accepted and returned states are 2-tuples of
        the `c_state` and `m_state`.  If False, they are concatenated
        along the column axis.  The latter behavior will soon be deprecated.
      activation: Activation function of the inner states.
      reuse: (optional) Python boolean describing whether to reuse variables
        in an existing scope.  If not `True`, and the existing scope already has
        the given variables, an error is raised.
    """
    if not state_is_tuple:
      logging.warn("%s: Using a concatenated state is slower and will soon be "
                   "deprecated.  Use state_is_tuple=True.", self)
    if input_size is not None:
      logging.warn("%s: The input_size parameter is deprecated.", self)
    self._num_units = num_units
    self._forget_bias = forget_bias
    self._state_is_tuple = state_is_tuple
    self._activation = activation
    self._reuse = reuse

  @property
  def state_size(self):
    return (LSTMStateTuple(self._num_units, self._num_units)
            if self._state_is_tuple else 2 * self._num_units)

  @property
  def output_size(self):
    return self._num_units

  def __call__(self, inputs, state, scope=None):
    """Long short-term memory cell (LSTM)."""
    with _checked_scope(self, scope or "basic_lstm_cell", reuse=self._reuse):
      # Parameters of gates are concatenated into one multiply for efficiency.
      if self._state_is_tuple:
        c, h = state
      else:
        c, h = array_ops.split(value=state, num_or_size_splits=2, axis=1)
      concat = _linear([inputs, h], 4 * self._num_units, True)

      # i = input_gate, j = new_input, f = forget_gate, o = output_gate
      i, j, f, o = array_ops.split(value=concat, num_or_size_splits=4, axis=1)

      new_c = (c * sigmoid(f + self._forget_bias) + sigmoid(i) *
               self._activation(j))
      new_h = self._activation(new_c) * sigmoid(o)

      if self._state_is_tuple:
        new_state = LSTMStateTuple(new_c, new_h)
      else:
        new_state = array_ops.concat([new_c, new_h], 1)
      return new_h, new_state


class LSTMCell(RNNCell):
  """Long short-term memory unit (LSTM) recurrent network cell.

  The default non-peephole implementation is based on:

    http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf

  S. Hochreiter and J. Schmidhuber.
  "Long Short-Term Memory". Neural Computation, 9(8):1735-1780, 1997.

  The peephole implementation is based on:

    https://research.google.com/pubs/archive/43905.pdf

  Hasim Sak, Andrew Senior, and Francoise Beaufays.
  "Long short-term memory recurrent neural network architectures for
   large scale acoustic modeling." INTERSPEECH, 2014.

  The class uses optional peep-hole connections, optional cell clipping, and
  an optional projection layer.
  """

  def __init__(self, num_units, input_size=None,
               use_peepholes=False, cell_clip=None,
               initializer=None, bias_start=0.0, num_proj=None, proj_clip=None,
               num_unit_shards=None, num_proj_shards=None,
               forget_bias=1.0, state_is_tuple=True,
               activation=tanh, reuse=None):
    """Initialize the parameters for an LSTM cell.

    Args:
      num_units: int, The number of units in the LSTM cell
      input_size: Deprecated and unused.
      use_peepholes: bool, set True to enable diagonal/peephole connections.
      cell_clip: (optional) A float value, if provided the cell state is clipped
        by this value prior to the cell output activation.
      initializer: (optional) The initializer to use for the weight and
        projection matrices.
      bias_start: (optional) The VALUE to initialize the bias to, in
        the linear call
      num_proj: (optional) int, The output dimensionality for the projection
        matrices.  If None, no projection is performed.
      proj_clip: (optional) A float value.  If `num_proj > 0` and `proj_clip` is
        provided, then the projected values are clipped elementwise to within
        `[-proj_clip, proj_clip]`.
      num_unit_shards: Deprecated, will be removed by Jan. 2017.
        Use a variable_scope partitioner instead.
      num_proj_shards: Deprecated, will be removed by Jan. 2017.
        Use a variable_scope partitioner instead.
      forget_bias: Biases of the forget gate are initialized by default to 1
        in order to reduce the scale of forgetting at the beginning of
        the training.
      state_is_tuple: If True, accepted and returned states are 2-tuples of
        the `c_state` and `m_state`.  If False, they are concatenated
        along the column axis.  This latter behavior will soon be deprecated.
      activation: Activation function of the inner states.
      reuse: (optional) Python boolean describing whether to reuse variables
        in an existing scope.  If not `True`, and the existing scope already has
        the given variables, an error is raised.
    """
    if not state_is_tuple:
      logging.warn("%s: Using a concatenated state is slower and will soon be "
                   "deprecated.  Use state_is_tuple=True.", self)
    if input_size is not None:
      logging.warn("%s: The input_size parameter is deprecated.", self)
    if num_unit_shards is not None or num_proj_shards is not None:
      logging.warn(
          "%s: The num_unit_shards and proj_unit_shards parameters are "
          "deprecated and will be removed in Jan 2017.  "
          "Use a variable scope with a partitioner instead.", self)

    self._num_units = num_units
    self._use_peepholes = use_peepholes
    self._cell_clip = cell_clip
    self._initializer = initializer
    self._bias_start = bias_start
    self._num_proj = num_proj
    self._proj_clip = proj_clip
    self._num_unit_shards = num_unit_shards
    self._num_proj_shards = num_proj_shards
    self._forget_bias = forget_bias
    self._state_is_tuple = state_is_tuple
    self._activation = activation
    self._reuse = reuse

    if num_proj:
      self._state_size = (
          LSTMStateTuple(num_units, num_proj)
          if state_is_tuple else num_units + num_proj)
      self._output_size = num_proj
    else:
      self._state_size = (
          LSTMStateTuple(num_units, num_units)
          if state_is_tuple else 2 * num_units)
      self._output_size = num_units

  @property
  def state_size(self):
    return self._state_size

  @property
  def output_size(self):
    return self._output_size

  def __call__(self, inputs, state, scope=None):
    """Run one step of LSTM.

    Args:
      inputs: input Tensor, 2D, batch x num_units.
      state: if `state_is_tuple` is False, this must be a state Tensor,
        `2-D, batch x state_size`.  If `state_is_tuple` is True, this must be a
        tuple of state Tensors, both `2-D`, with column sizes `c_state` and
        `m_state`.
      scope: VariableScope for the created subgraph; defaults to "lstm_cell".

    Returns:
      A tuple containing:

      - A `2-D, [batch x output_dim]`, Tensor representing the output of the
        LSTM after reading `inputs` when previous state was `state`.
        Here output_dim is:
           num_proj if num_proj was set,
           num_units otherwise.
      - Tensor(s) representing the new state of LSTM after reading `inputs` when
        the previous state was `state`.  Same type and shape(s) as `state`.

    Raises:
      ValueError: If input size cannot be inferred from inputs via
        static shape inference.
    """
    num_proj = self._num_units if self._num_proj is None else self._num_proj

    if self._state_is_tuple:
      (c_prev, m_prev) = state
    else:
      c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
      m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

    dtype = inputs.dtype
    input_size = inputs.get_shape().with_rank(2)[1]
    if input_size.value is None:
      raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
    with _checked_scope(self, scope or "lstm_cell",
                        initializer=self._initializer,
                        reuse=self._reuse) as unit_scope:
      if self._num_unit_shards is not None:
        unit_scope.set_partitioner(
            partitioned_variables.fixed_size_partitioner(
                self._num_unit_shards))
      # i = input_gate, j = new_input, f = forget_gate, o = output_gate
      lstm_matrix = _linear([inputs, m_prev], 4 * self._num_units, bias=True, bias_start=self._bias_start)
      i, j, f, o = array_ops.split(
          value=lstm_matrix, num_or_size_splits=4, axis=1)
      # Diagonal connections
      if self._use_peepholes:
        with vs.variable_scope(unit_scope) as projection_scope:
          if self._num_unit_shards is not None:
            projection_scope.set_partitioner(None)
          w_f_diag = vs.get_variable(
              "w_f_diag", shape=[self._num_units], dtype=dtype)
          w_i_diag = vs.get_variable(
              "w_i_diag", shape=[self._num_units], dtype=dtype)
          w_o_diag = vs.get_variable(
              "w_o_diag", shape=[self._num_units], dtype=dtype)

      if self._use_peepholes:
        c = (sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
             sigmoid(i + w_i_diag * c_prev) * self._activation(j))
      else:
        c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) *
             self._activation(j))

      if self._cell_clip is not None:
        # pylint: disable=invalid-unary-operand-type
        c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
        # pylint: enable=invalid-unary-operand-type
      if self._use_peepholes:
        m = sigmoid(o + w_o_diag * c) * self._activation(c)
      else:
        m = sigmoid(o) * self._activation(c)

      if self._num_proj is not None:
        with vs.variable_scope("projection") as proj_scope:
          if self._num_proj_shards is not None:
            proj_scope.set_partitioner(
                partitioned_variables.fixed_size_partitioner(
                    self._num_proj_shards))
          m = _linear(m, self._num_proj, bias=False)

        if self._proj_clip is not None:
          # pylint: disable=invalid-unary-operand-type
          m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
          # pylint: enable=invalid-unary-operand-type

    new_state = (LSTMStateTuple(c, m) if self._state_is_tuple else
                 array_ops.concat([c, m], 1))
    return m, new_state


class OutputProjectionWrapper(RNNCell):
  """Operator adding an output projection to the given cell.

  Note: in many cases it may be more efficient to not use this wrapper,
  but instead concatenate the whole sequence of your outputs in time,
  do the projection on this batch-concatenated sequence, then split it
  if needed or directly feed into a softmax.
  """

  def __init__(self, cell, output_size, reuse=None):
    """Create a cell with output projection.

    Args:
      cell: an RNNCell, a projection to output_size is added to it.
      output_size: integer, the size of the output after projection.
      reuse: (optional) Python boolean describing whether to reuse variables
        in an existing scope.  If not `True`, and the existing scope already has
        the given variables, an error is raised.

    Raises:
      TypeError: if cell is not an RNNCell.
      ValueError: if output_size is not positive.
    """
    if not isinstance(cell, RNNCell):
      raise TypeError("The parameter cell is not RNNCell.")
    if output_size < 1:
      raise ValueError("Parameter output_size must be > 0: %d." % output_size)
    self._cell = cell
    self._output_size = output_size
    self._reuse = reuse

  @property
  def state_size(self):
    return self._cell.state_size

  @property
  def output_size(self):
    return self._output_size

  def zero_state(self, batch_size, dtype):
    with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
      return self._cell.zero_state(batch_size, dtype)

  def __call__(self, inputs, state, scope=None):
    """Run the cell and output projection on inputs, starting from state."""
    output, res_state = self._cell(inputs, state)
    # Default scope: "OutputProjectionWrapper"
    with _checked_scope(self, scope or "output_projection_wrapper",
                        reuse=self._reuse):
      projected = _linear(output, self._output_size, True)
    return projected, res_state


class InputProjectionWrapper(RNNCell):
  """Operator adding an input projection to the given cell.

  Note: in many cases it may be more efficient to not use this wrapper,
  but instead concatenate the whole sequence of your inputs in time,
  do the projection on this batch-concatenated sequence, then split it.
  """

  def __init__(self, cell, num_proj, input_size=None):
    """Create a cell with input projection.

    Args:
      cell: an RNNCell, a projection of inputs is added before it.
      num_proj: Python integer.  The dimension to project to.
      input_size: Deprecated and unused.

    Raises:
      TypeError: if cell is not an RNNCell.
    """
    if input_size is not None:
      logging.warn("%s: The input_size parameter is deprecated.", self)
    if not isinstance(cell, RNNCell):
      raise TypeError("The parameter cell is not RNNCell.")
    self._cell = cell
    self._num_proj = num_proj

  @property
  def state_size(self):
    return self._cell.state_size

  @property
  def output_size(self):
    return self._cell.output_size

  def zero_state(self, batch_size, dtype):
    with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
      return self._cell.zero_state(batch_size, dtype)

  def __call__(self, inputs, state, scope=None):
    """Run the input projection and then the cell."""
    # Default scope: "InputProjectionWrapper"
    with vs.variable_scope(scope or "input_projection_wrapper"):
      projected = _linear(inputs, self._num_proj, True)
    return self._cell(projected, state)


def _enumerated_map_structure(map_fn, *args, **kwargs):
  ix = [0]
  def enumerated_fn(*inner_args, **inner_kwargs):
    r = map_fn(ix[0], *inner_args, **inner_kwargs)
    ix[0] += 1
    return r
  return nest.map_structure(enumerated_fn, *args, **kwargs)


class DropoutWrapper(RNNCell):
  """Operator adding dropout to inputs and outputs of the given cell."""

  def __init__(self, cell, input_keep_prob=1.0, output_keep_prob=1.0,
               state_keep_prob=1.0, variational_recurrent=False,
               input_size=None, dtype=None, seed=None):
    """Create a cell with added input, state, and/or output dropout.

    If `variational_recurrent` is set to `True` (**NOT** the default behavior),
    then the the same dropout mask is applied at every step, as described in:

    Y. Gal, Z Ghahramani.  "A Theoretically Grounded Application of Dropout in
    Recurrent Neural Networks".  https://arxiv.org/abs/1512.05287

    Otherwise a different dropout mask is applied at every time step.

    Args:
      cell: an RNNCell, a projection to output_size is added to it.
      input_keep_prob: unit Tensor or float between 0 and 1, input keep
        probability; if it is constant and 1, no input dropout will be added.
      output_keep_prob: unit Tensor or float between 0 and 1, output keep
        probability; if it is constant and 1, no output dropout will be added.
      state_keep_prob: unit Tensor or float between 0 and 1, output keep
        probability; if it is constant and 1, no output dropout will be added.
        State dropout is performed on the *output* states of the cell.
      variational_recurrent: Python bool.  If `True`, then the same
        dropout pattern is applied across all time steps per run call.
        If this parameter is set, `input_size` **must** be provided.
      input_size: (optional) (possibly nested tuple of) `TensorShape` objects
        containing the depth(s) of the input tensors expected to be passed in to
        the `DropoutWrapper`.  Required and used **iff**
         `variational_recurrent = True` and `input_keep_prob < 1`.
      dtype: (optional) The `dtype` of the input, state, and output tensors.
        Required and used **iff** `variational_recurrent = True`.
      seed: (optional) integer, the randomness seed.

    Raises:
      TypeError: if cell is not an RNNCell.
      ValueError: if any of the keep_probs are not between 0 and 1.
    """
    if not isinstance(cell, RNNCell):
      raise TypeError("The parameter cell is not a RNNCell.")
    with ops.name_scope("DropoutWrapperInit"):
      def tensor_and_const_value(v):
        tensor_value = ops.convert_to_tensor(v)
        const_value = tensor_util.constant_value(tensor_value)
        return (tensor_value, const_value)
      for prob, attr in [(input_keep_prob, "input_keep_prob"),
                         (state_keep_prob, "state_keep_prob"),
                         (output_keep_prob, "output_keep_prob")]:
        tensor_prob, const_prob = tensor_and_const_value(prob)
        if const_prob is not None:
          if const_prob < 0 or const_prob > 1:
            raise ValueError("Parameter %s must be between 0 and 1: %d"
                             % (attr, const_prob))
          setattr(self, "_%s" % attr, float(const_prob))
        else:
          setattr(self, "_%s" % attr, tensor_prob)

    # Set cell, variational_recurrent, seed before running the code below
    self._cell = cell
    self._variational_recurrent = variational_recurrent
    self._seed = seed

    self._recurrent_input_noise = None
    self._recurrent_state_noise = None
    self._recurrent_output_noise = None

    if variational_recurrent:
      if dtype is None:
        raise ValueError(
            "When variational_recurrent=True, dtype must be provided")

      def convert_to_batch_shape(s):
        # Prepend a 1 for the batch dimension; for recurrent
        # variational dropout we use the same dropout mask for all
        # batch elements.
        return array_ops.concat(
            ([1], tensor_shape.TensorShape(s).as_list()), 0)

      def batch_noise(s, inner_seed):
        shape = convert_to_batch_shape(s)
        return random_ops.random_uniform(shape, seed=inner_seed, dtype=dtype)

      if (not isinstance(self._input_keep_prob, numbers.Real) or
          self._input_keep_prob < 1.0):
        if input_size is None:
          raise ValueError(
              "When variational_recurrent=True and input_keep_prob < 1.0 or "
              "is unknown, input_size must be provided")
        self._recurrent_input_noise = _enumerated_map_structure(
            lambda i, s: batch_noise(s, inner_seed=self._gen_seed("input", i)),
            input_size)
      self._recurrent_state_noise = _enumerated_map_structure(
          lambda i, s: batch_noise(s, inner_seed=self._gen_seed("state", i)),
          cell.state_size)
      self._recurrent_output_noise = _enumerated_map_structure(
          lambda i, s: batch_noise(s, inner_seed=self._gen_seed("output", i)),
          cell.output_size)

  def _gen_seed(self, salt_prefix, index):
    if self._seed is None:
      return None
    salt = "%s_%d" % (salt_prefix, index)
    string = (str(self._seed) + salt).encode("utf-8")
    return int(hashlib.md5(string).hexdigest()[:8], 16) & 0x7FFFFFFF

  @property
  def state_size(self):
    return self._cell.state_size

  @property
  def output_size(self):
    return self._cell.output_size

  def zero_state(self, batch_size, dtype):
    with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
      return self._cell.zero_state(batch_size, dtype)

  def _variational_recurrent_dropout_value(
      self, index, value, noise, keep_prob):
    """Performs dropout given the pre-calculated noise tensor."""
    # uniform [keep_prob, 1.0 + keep_prob)
    random_tensor = keep_prob + noise

    # 0. if [keep_prob, 1.0) and 1. if [1.0, 1.0 + keep_prob)
    binary_tensor = math_ops.floor(random_tensor)
    ret = math_ops.div(value, keep_prob) * binary_tensor
    ret.set_shape(value.get_shape())
    return ret

  def _dropout(self, values, salt_prefix, recurrent_noise, keep_prob):
    """Decides whether to perform standard dropout or recurrent dropout."""
    if not self._variational_recurrent:
      def dropout(i, v):
        return nn_ops.dropout(
            v, keep_prob=keep_prob, seed=self._gen_seed(salt_prefix, i))
      return _enumerated_map_structure(dropout, values)
    else:
      def dropout(i, v, n):
        return self._variational_recurrent_dropout_value(i, v, n, keep_prob)
      return _enumerated_map_structure(dropout, values, recurrent_noise)

  def __call__(self, inputs, state, scope=None):
    """Run the cell with the declared dropouts."""
    def _should_dropout(p):
      return (not isinstance(p, float)) or p < 1

    if _should_dropout(self._input_keep_prob):
      inputs = self._dropout(inputs, "input",
                             self._recurrent_input_noise,
                             self._input_keep_prob)
    output, new_state = self._cell(inputs, state, scope)
    if _should_dropout(self._state_keep_prob):
      new_state = self._dropout(new_state, "state",
                                self._recurrent_state_noise,
                                self._state_keep_prob)
    if _should_dropout(self._output_keep_prob):
      output = self._dropout(output, "output",
                             self._recurrent_output_noise,
                             self._output_keep_prob)
    return output, new_state


class ResidualWrapper(RNNCell):
  """RNNCell wrapper that ensures cell inputs are added to the outputs."""

  def __init__(self, cell):
    """Constructs a `ResidualWrapper` for `cell`.

    Args:
      cell: An instance of `RNNCell`.
    """
    self._cell = cell

  @property
  def state_size(self):
    return self._cell.state_size

  @property
  def output_size(self):
    return self._cell.output_size

  def zero_state(self, batch_size, dtype):
    with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
      return self._cell.zero_state(batch_size, dtype)

  def __call__(self, inputs, state, scope=None):
    """Run the cell and add its inputs to its outputs.

    Args:
      inputs: cell inputs.
      state: cell state.
      scope: optional cell scope.

    Returns:
      Tuple of cell outputs and new state.

    Raises:
      TypeError: If cell inputs and outputs have different structure (type).
      ValueError: If cell inputs and outputs have different structure (value).
    """
    outputs, new_state = self._cell(inputs, state, scope=scope)
    nest.assert_same_structure(inputs, outputs)
    # Ensure shapes match
    def assert_shape_match(inp, out):
      inp.get_shape().assert_is_compatible_with(out.get_shape())
    nest.map_structure(assert_shape_match, inputs, outputs)
    res_outputs = nest.map_structure(
        lambda inp, out: inp + out, inputs, outputs)
    return (res_outputs, new_state)


class DeviceWrapper(RNNCell):
  """Operator that ensures an RNNCell runs on a particular device."""

  def __init__(self, cell, device):
    """Construct a `DeviceWrapper` for `cell` with device `device`.

    Ensures the wrapped `cell` is called with `tf.device(device)`.

    Args:
      cell: An instance of `RNNCell`.
      device: A device string or function, for passing to `tf.device`.
    """
    self._cell = cell
    self._device = device

  @property
  def state_size(self):
    return self._cell.state_size

  @property
  def output_size(self):
    return self._cell.output_size

  def zero_state(self, batch_size, dtype):
    with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
      return self._cell.zero_state(batch_size, dtype)

  def __call__(self, inputs, state, scope=None):
    """Run the cell on specified device."""
    with ops.device(self._device):
      return self._cell(inputs, state, scope=scope)


class EmbeddingWrapper(RNNCell):
  """Operator adding input embedding to the given cell.

  Note: in many cases it may be more efficient to not use this wrapper,
  but instead concatenate the whole sequence of your inputs in time,
  do the embedding on this batch-concatenated sequence, then split it and
  feed into your RNN.
  """

  def __init__(self, cell, embedding_classes, embedding_size, initializer=None,
               reuse=None):
    """Create a cell with an added input embedding.

    Args:
      cell: an RNNCell, an embedding will be put before its inputs.
      embedding_classes: integer, how many symbols will be embedded.
      embedding_size: integer, the size of the vectors we embed into.
      initializer: an initializer to use when creating the embedding;
        if None, the initializer from variable scope or a default one is used.
      reuse: (optional) Python boolean describing whether to reuse variables
        in an existing scope.  If not `True`, and the existing scope already has
        the given variables, an error is raised.

    Raises:
      TypeError: if cell is not an RNNCell.
      ValueError: if embedding_classes is not positive.
    """
    if not isinstance(cell, RNNCell):
      raise TypeError("The parameter cell is not RNNCell.")
    if embedding_classes <= 0 or embedding_size <= 0:
      raise ValueError("Both embedding_classes and embedding_size must be > 0: "
                       "%d, %d." % (embedding_classes, embedding_size))
    self._cell = cell
    self._embedding_classes = embedding_classes
    self._embedding_size = embedding_size
    self._initializer = initializer
    self._reuse = reuse

  @property
  def state_size(self):
    return self._cell.state_size

  @property
  def output_size(self):
    return self._cell.output_size

  def zero_state(self, batch_size, dtype):
    with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
      return self._cell.zero_state(batch_size, dtype)

  def __call__(self, inputs, state, scope=None):
    """Run the cell on embedded inputs."""
    with _checked_scope(self, scope or "embedding_wrapper", reuse=self._reuse):
      with ops.device("/cpu:0"):
        if self._initializer:
          initializer = self._initializer
        elif vs.get_variable_scope().initializer:
          initializer = vs.get_variable_scope().initializer
        else:
          # Default initializer for embeddings should have variance=1.
          sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
          initializer = init_ops.random_uniform_initializer(-sqrt3, sqrt3)

        if type(state) is tuple:
          data_type = state[0].dtype
        else:
          data_type = state.dtype

        embedding = vs.get_variable(
            "embedding", [self._embedding_classes, self._embedding_size],
            initializer=initializer,
            dtype=data_type)
        embedded = embedding_ops.embedding_lookup(
            embedding, array_ops.reshape(inputs, [-1]))
    return self._cell(embedded, state)


class MultiRNNCell(RNNCell):
  """RNN cell composed sequentially of multiple simple cells."""

  def __init__(self, cells, state_is_tuple=True):
    """Create a RNN cell composed sequentially of a number of RNNCells.

    Args:
      cells: list of RNNCells that will be composed in this order.
      state_is_tuple: If True, accepted and returned states are n-tuples, where
        `n = len(cells)`.  If False, the states are all
        concatenated along the column axis.  This latter behavior will soon be
        deprecated.

    Raises:
      ValueError: if cells is empty (not allowed), or at least one of the cells
        returns a state tuple but the flag `state_is_tuple` is `False`.
    """
    if not cells:
      raise ValueError("Must specify at least one cell for MultiRNNCell.")
    if not nest.is_sequence(cells):
      raise TypeError(
          "cells must be a list or tuple, but saw: %s." % cells)

    self._cells = cells
    self._state_is_tuple = state_is_tuple
    if not state_is_tuple:
      if any(nest.is_sequence(c.state_size) for c in self._cells):
        raise ValueError("Some cells return tuples of states, but the flag "
                         "state_is_tuple is not set.  State sizes are: %s"
                         % str([c.state_size for c in self._cells]))

  @property
  def state_size(self):
    if self._state_is_tuple:
      return tuple(cell.state_size for cell in self._cells)
    else:
      return sum([cell.state_size for cell in self._cells])

  @property
  def output_size(self):
    return self._cells[-1].output_size

  def zero_state(self, batch_size, dtype):
    with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
      if self._state_is_tuple:
        return tuple(cell.zero_state(batch_size, dtype) for cell in self._cells)
      else:
        # We know here that state_size of each cell is not a tuple and
        # presumably does not contain TensorArrays or anything else fancy
        return super(MultiRNNCell, self).zero_state(batch_size, dtype)

  def __call__(self, inputs, state, scope=None):
    """Run this multi-layer cell on inputs, starting from state."""
    with vs.variable_scope(scope or "multi_rnn_cell"):
      cur_state_pos = 0
      cur_inp = inputs
      new_states = []
      for i, cell in enumerate(self._cells):
        with vs.variable_scope("cell_%d" % i):
          if self._state_is_tuple:
            if not nest.is_sequence(state):
              raise ValueError(
                  "Expected state to be a tuple of length %d, but received: %s"
                  % (len(self.state_size), state))
            cur_state = state[i]
          else:
            cur_state = array_ops.slice(
                state, [0, cur_state_pos], [-1, cell.state_size])
            cur_state_pos += cell.state_size
          cur_inp, new_state = cell(cur_inp, cur_state)
          new_states.append(new_state)
    new_states = (tuple(new_states) if self._state_is_tuple else
                  array_ops.concat(new_states, 1))
    return cur_inp, new_states


class _SlimRNNCell(RNNCell):
  """A simple wrapper for slim.rnn_cells."""

  def __init__(self, cell_fn):
    """Create a SlimRNNCell from a cell_fn.

    Args:
      cell_fn: a function which takes (inputs, state, scope) and produces the
        outputs and the new_state. Additionally when called with inputs=None and
        state=None it should return (initial_outputs, initial_state).

    Raises:
      TypeError: if cell_fn is not callable
      ValueError: if cell_fn cannot produce a valid initial state.
    """
    if not callable(cell_fn):
      raise TypeError("cell_fn %s needs to be callable", cell_fn)
    self._cell_fn = cell_fn
    self._cell_name = cell_fn.func.__name__
    init_output, init_state = self._cell_fn(None, None)
    output_shape = init_output.get_shape()
    state_shape = init_state.get_shape()
    self._output_size = output_shape.with_rank(2)[1].value
    self._state_size = state_shape.with_rank(2)[1].value
    if self._output_size is None:
      raise ValueError("Initial output created by %s has invalid shape %s" %
                       (self._cell_name, output_shape))
    if self._state_size is None:
      raise ValueError("Initial state created by %s has invalid shape %s" %
                       (self._cell_name, state_shape))

  @property
  def state_size(self):
    return self._state_size

  @property
  def output_size(self):
    return self._output_size

  def __call__(self, inputs, state, scope=None):
    scope = scope or self._cell_name
    output, state = self._cell_fn(inputs, state, scope=scope)
    return output, state


def _linear(args, output_size, bias, bias_start=0.0, scope=None):
  """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.

  Args:
    args: a 2D Tensor or a list of 2D, batch x n, Tensors.
    output_size: int, second dimension of W[i].
    bias: boolean, whether to add a bias term or not.
    bias_start: starting value to initialize the bias; 0 by default.

  Returns:
    A 2D Tensor with shape [batch x output_size] equal to
    sum_i(args[i] * W[i]), where W[i]s are newly created matrices.

  Raises:
    ValueError: if some of the arguments has unspecified or wrong shape.
  """
  if args is None or (nest.is_sequence(args) and not args):
    raise ValueError("`args` must be specified")
  if not nest.is_sequence(args):
    args = [args]

  # Calculate the total size of arguments on dimension 1.
  total_arg_size = 0
  shapes = [a.get_shape() for a in args]
  for shape in shapes:
    if shape.ndims != 2:
      raise ValueError("linear is expecting 2D arguments: %s" % shapes)
    if shape[1].value is None:
      raise ValueError("linear expects shape[1] to be provided for shape %s, "
                       "but saw %s" % (shape, shape[1]))
    else:
      total_arg_size += shape[1].value

  dtype = [a.dtype for a in args][0]

  # Now the computation.
  scope = vs.get_variable_scope()
  with vs.variable_scope(scope) as outer_scope:
    weights = vs.get_variable(
        _WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size], dtype=dtype)
    if len(args) == 1:
      res = math_ops.matmul(args[0], weights)
    else:
      res = math_ops.matmul(array_ops.concat(args, 1), weights)
    if not bias:
      return res
    with vs.variable_scope(outer_scope) as inner_scope:
      inner_scope.set_partitioner(None)
      biases = vs.get_variable(
          _BIAS_VARIABLE_NAME, [output_size],
          dtype=dtype,
          initializer=init_ops.constant_initializer(bias_start, dtype=dtype))
    return nn_ops.bias_add(res, biases)


================================================
FILE: model.py
================================================
import tensorflow as tf
import numpy as np
# from data_utils import get_batch
import data_utils
import pdb
import json
import sys
from mod_core_rnn_cell_impl import LSTMCell  # modified to allow initializing bias in lstm

# from tensorflow.contrib.rnn import LSTMCell
tf.logging.set_verbosity(tf.logging.ERROR)
import mmd

from differential_privacy.dp_sgd.dp_optimizer import dp_optimizer
from differential_privacy.dp_sgd.dp_optimizer import sanitizer
from differential_privacy.privacy_accountant.tf import accountant

# ------------------------------- #
"""
Most of the models are copied from https://github.com/ratschlab/RGAN
"""

# --- to do with latent space --- #

def sample_Z(batch_size, seq_length, latent_dim, use_time=False, use_noisy_time=False):
    sample = np.float32(np.random.normal(size=[batch_size, seq_length, latent_dim]))
    if use_time:
        print('WARNING: use_time has different semantics')
        sample[:, :, 0] = np.linspace(0, 1.0 / seq_length, num=seq_length)
    return sample

# --- samples for testing ---#

def sample_T(batch_size, batch_idx):
    samples_aaa = np.load('./data/samples_aa.npy')
    num_samples_t = samples_aaa.shape[0]
    labels_aaa = np.load('./data/labels_aa.npy')
    idx_aaa = np.load('./data/idx_aa.npy')
    start_pos = batch_idx * batch_size
    end_pos = start_pos + batch_size
    T_mb = samples_aaa[start_pos:end_pos, :, :]
    L_mb = labels_aaa[start_pos:end_pos, :, :]
    I_mb = idx_aaa[start_pos:end_pos, :, :]
    return T_mb, L_mb, I_mb, num_samples_t

def sample_TT(batch_size):
    samples_aaa = np.load('./data/samples_aa.npy')
    labels_aaa = np.load('./data/labels_aa.npy')
    idx_aaa = np.load('./data/idx_aa.npy')
    T_indices = np.random.choice(len(samples_aaa), size=batch_size, replace=False)
    T_mb = samples_aaa[T_indices, :, :]
    L_mb = labels_aaa[T_indices, :, :]
    I_mb = idx_aaa[T_indices, :, :]
    return T_mb, L_mb, I_mb

# --- to do with training --- #
def train_epoch(epoch, samples, labels, sess, Z, X, D_loss, G_loss, D_solver, G_solver,
                batch_size, use_time, D_rounds, G_rounds, seq_length,
                latent_dim, num_signals):
    """
    Train generator and discriminator for one epoch.
    """
    # for batch_idx in range(0, int(len(samples) / batch_size) - (D_rounds + (cond_dim > 0) * G_rounds), D_rounds + (cond_dim > 0) * G_rounds):
    for batch_idx in range(0, int(len(samples) / batch_size) - (D_rounds + G_rounds), D_rounds + G_rounds):
        # update the discriminator
        X_mb, Y_mb = data_utils.get_batch(samples, batch_size, batch_idx, labels)
        Z_mb = sample_Z(batch_size, seq_length, latent_dim, use_time)
        for d in range(D_rounds):
            # run the discriminator solver
            _ = sess.run(D_solver, feed_dict={X: X_mb, Z: Z_mb})

        # update the generator
        for g in range(G_rounds):
            # run the generator solver
            _ = sess.run(G_solver, feed_dict={Z: sample_Z(batch_size, seq_length, latent_dim, use_time=use_time)})

    # at the end, get the loss
    D_loss_curr, G_loss_curr = sess.run([D_loss, G_loss], feed_dict={X: X_mb,
                                                                     Z: sample_Z(batch_size, seq_length, latent_dim,
                                                                                 use_time=use_time)})
    D_loss_curr = np.mean(D_loss_curr)
    G_loss_curr = np.mean(G_loss_curr)


    return D_loss_curr, G_loss_curr


def GAN_loss(Z, X, generator_settings, discriminator_settings):

    # normal GAN
    G_sample = generator(Z, **generator_settings)

    D_real, D_logit_real = discriminator(X, **discriminator_settings)

    D_fake, D_logit_fake = discriminator(G_sample, reuse=True, **discriminator_settings)

    # Measures the probability error in discrete classification tasks in which each class is independent
    # and not mutually exclusive.
    # logits: predicted labels??

    D_loss_real = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=D_logit_real, labels=tf.ones_like(D_logit_real)), 1)
    D_loss_fake = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=D_logit_fake, labels=tf.zeros_like(D_logit_fake)), 1)

    D_loss = D_loss_real + D_loss_fake


    # G_loss = tf.reduce_mean(tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=D_logit_fake, labels=tf.ones_like(D_logit_fake)), axis=1))
    G_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=D_logit_fake, labels=tf.ones_like(D_logit_fake)), 1)


    return D_loss, G_loss


def GAN_solvers(D_loss, G_loss, learning_rate, batch_size, total_examples, l2norm_bound, batches_per_lot, sigma, dp=False):
    """
    Optimizers
    """
    discriminator_vars = [v for v in tf.trainable_variables() if v.name.startswith('discriminator')]
    generator_vars = [v for v in tf.trainable_variables() if v.name.startswith('generator')]
    if dp:
        print('Using differentially private SGD to train discriminator!')
        eps = tf.placeholder(tf.float32)
        delta = tf.placeholder(tf.float32)
        priv_accountant = accountant.GaussianMomentsAccountant(total_examples)
        clip = True
        l2norm_bound = l2norm_bound / batch_size
        batches_per_lot = 1
        gaussian_sanitizer = sanitizer.AmortizedGaussianSanitizer(
            priv_accountant,
            [l2norm_bound, clip])

        # the trick is that we need to calculate the gradient with respect to
        # each example in the batch, during the DP SGD step
        D_solver = dp_optimizer.DPGradientDescentOptimizer(learning_rate,
                                                           [eps, delta],
                                                           sanitizer=gaussian_sanitizer,
                                                           sigma=sigma,
                                                           batches_per_lot=batches_per_lot).minimize(D_loss, var_list=discriminator_vars)
    else:
        D_loss_mean_over_batch = tf.reduce_mean(D_loss)
        D_solver = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(D_loss_mean_over_batch, var_list=discriminator_vars)
        priv_accountant = None
    G_loss_mean_over_batch = tf.reduce_mean(G_loss)
    G_solver = tf.train.AdamOptimizer().minimize(G_loss_mean_over_batch, var_list=generator_vars)
    return D_solver, G_solver, priv_accountant


# --- to do with the model --- #

def create_placeholders(batch_size, seq_length, latent_dim, num_signals):

    Z = tf.placeholder(tf.float32, [batch_size, seq_length, latent_dim])
    X = tf.placeholder(tf.float32, [batch_size, seq_length, num_signals])
    T = tf.placeholder(tf.float32, [batch_size, seq_length, num_signals])
    return Z, X, T

def generator(z, hidden_units_g, seq_length, batch_size, num_signals, reuse=False, parameters=None, learn_scale=True):

    """
    If parameters are supplied, initialise as such
    """
    with tf.variable_scope("generator") as scope:
        if reuse:
            scope.reuse_variables()
        if parameters is None:
            W_out_G_initializer = tf.truncated_normal_initializer()
            b_out_G_initializer = tf.truncated_normal_initializer()
            scale_out_G_initializer = tf.constant_initializer(value=1.0)
            lstm_initializer = None
            bias_start = 1.0
        else:
            W_out_G_initializer = tf.constant_initializer(value=parameters['generator/W_out_G:0'])
            b_out_G_initializer = tf.constant_initializer(value=parameters['generator/b_out_G:0'])
            try:
                scale_out_G_initializer = tf.constant_initializer(value=parameters['generator/scale_out_G:0'])
            except KeyError:
                scale_out_G_initializer = tf.constant_initializer(value=1)
                assert learn_scale
            lstm_initializer = tf.constant_initializer(value=parameters['generator/rnn/lstm_cell/weights:0'])
            bias_start = parameters['generator/rnn/lstm_cell/biases:0']

        W_out_G = tf.get_variable(name='W_out_G', shape=[hidden_units_g, num_signals],
                                  initializer=W_out_G_initializer)
        b_out_G = tf.get_variable(name='b_out_G', shape=num_signals, initializer=b_out_G_initializer)
        scale_out_G = tf.get_variable(name='scale_out_G', shape=1, initializer=scale_out_G_initializer,
                                      trainable=learn_scale)
        # inputs
        inputs = z

        cell = LSTMCell(num_units=hidden_units_g,
                        state_is_tuple=True,
                        initializer=lstm_initializer,
                        bias_start=bias_start,
                        reuse=reuse)
        rnn_outputs, rnn_states = tf.nn.dynamic_rnn(
            cell=cell,
            dtype=tf.float32,
            sequence_length=[seq_length] * batch_size,
            inputs=inputs)
        rnn_outputs_2d = tf.reshape(rnn_outputs, [-1, hidden_units_g])
        logits_2d = tf.matmul(rnn_outputs_2d, W_out_G) + b_out_G #out put weighted sum
        #        output_2d = tf.multiply(tf.nn.tanh(logits_2d), scale_out_G)
        output_2d = tf.nn.tanh(logits_2d) # logits operation [-1, 1]
        output_3d = tf.reshape(output_2d, [-1, seq_length, num_signals])

    return output_3d


def discriminator(x, hidden_units_d, seq_length, batch_size, reuse=False, parameters=None, batch_mean=False):
    with tf.variable_scope("discriminator") as scope:
        if reuse:
            scope.reuse_variables()
        if parameters is None:
            W_out_D = tf.get_variable(name='W_out_D', shape=[hidden_units_d, 1],
                                      initializer=tf.truncated_normal_initializer())
            b_out_D = tf.get_variable(name='b_out_D', shape=1,
                                      initializer=tf.truncated_normal_initializer())

        else:
            W_out_D = tf.constant_initializer(value=parameters['discriminator/W_out_D:0'])
            b_out_D = tf.constant_initializer(value=parameters['discriminator/b_out_D:0'])

        # inputs
        inputs = x

        # add the average of the inputs to the inputs (mode collapse?
        if batch_mean:
            mean_over_batch = tf.stack([tf.reduce_mean(x, axis=0)] * batch_size, axis=0)
            inputs = tf.concat([x, mean_over_batch], axis=2)

        cell = tf.contrib.rnn.LSTMCell(num_units=hidden_units_d,
                                       state_is_tuple=True,
                                       reuse=reuse)
        rnn_outputs, rnn_states = tf.nn.dynamic_rnn(
            cell=cell,
            dtype=tf.float32,
            inputs=inputs)
        # logit_final = tf.matmul(rnn_outputs[:, -1], W_final_D) + b_final_D
        logits = tf.einsum('ijk,km', rnn_outputs, W_out_D) + b_out_D # output weighted sum
        # real logits or actual output layer?
        # logit is a function that maps probabilities ([0,1]) to ([-inf,inf]) ?

        output = tf.nn.sigmoid(logits) # y = 1 / (1 + exp(-x)). output activation [0, 1]. Probability??
        # sigmoid output ([0,1]), Probability?

    return output, logits


# --- display ----#
def display_batch_progression(j, id_max):
    '''
    See epoch progression
    '''
    batch_progression = int((j / id_max) * 100)
    sys.stdout.write(str(batch_progression) + ' % epoch' + chr(13))
    _ = sys.stdout.flush


# --- to do with saving/loading --- #

def dump_parameters(identifier, sess):
    """
    Save model parmaters to a numpy file
    """
    # dump_path = './experiments/parameters/' + identifier + '.npy'
    dump_path = './experiments/parameters/' + identifier + '.npy'
    model_parameters = dict()
    for v in tf.trainable_variables():
        model_parameters[v.name] = sess.run(v)
    np.save(dump_path, model_parameters)
    print('Recorded', len(model_parameters), 'parameters to', dump_path)
    return True


def load_parameters(identifier):
    """
    Load parameters from a numpy file
    """
    # load_path = './experiments/plots/parameters/' + identifier + '.npy'
    # load_path = './experiments/plots/parameters/parameters_60/' + identifier + '.npy'
    # load_path = './experiments/parameters/' + identifier + '.npy'
    # load_path = './experiments/parameters/' + identifier + '.npy'
    model_parameters = np.load(identifier).item()
    return model_parameters


================================================
FILE: plotting.py
================================================
import numpy as np
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import pdb
from time import time
from matplotlib.colors import hsv_to_rgb
from pandas import read_table, read_hdf

def plot_label(label, id):

    fig, ax = plt.subplots(1, 1)
    ax.plot(label)
    fig.savefig("./Figs/Label_" + str(id).zfill(2) + ".png")
    plt.clf()
    plt.close(fig)

    return

def visualise_at_epoch(vis_sample, data, predict_labels, epoch,
        identifier, num_epochs, resample_rate_in_min, multivariate_mnist,
        seq_length, labels):
    # TODO: what's with all these arguments
    if data == 'mnist':
        if predict_labels:
            n_labels = 1
            if one_hot:
                n_labels = 6
                lab_votes = np.argmax(vis_sample[:, :, -n_labels:], axis=2)
            else:
                lab_votes = vis_sample[:, :, -n_labels:]
            labs, _ = mode(lab_votes, axis=1)
            samps = vis_sample[:, :, :-n_labels]
        else:
            labs = labels
            samps = vis_sample
        if multivariate_mnist:
            save_mnist_plot_sample(samps.reshape(-1, seq_length**2, 1), epoch, identifier, n_samples=6, labels=labs)
        else:
            save_mnist_plot_sample(samps, epoch, identifier, n_samples=6, labels=labs)
    else:
        save_plot_sample(vis_sample, epoch, identifier, n_samples=6,
                num_epochs=num_epochs)

    return True


def save_plot_sample(samples, idx, identifier, n_samples=16, num_epochs=None, ncol=4):
    assert n_samples <= samples.shape[0]
    assert n_samples % ncol == 0
    sample_length = samples.shape[1]
  
    if not num_epochs is None:
        col = hsv_to_rgb((1, 1.0*(idx)/num_epochs, 0.8)) #convert hsv values in a numpy array to rgb values all values assumed to be in range [0, 1].
    else:
        col = 'grey'

    x_points = np.arange(sample_length) #Return evenly spaced values within a given interval.

    nrow = int(n_samples/ncol)
    fig, axarr = plt.subplots(nrow, ncol, sharex=True, figsize=(6, 6))
    for m in range(nrow):
        for n in range(ncol):
            # first column
            sample = samples[n*nrow + m, :, :]
            axarr[m, n].plot(x_points, sample, color=col)
            # axarr[m, n].set_ylim(-1, 1)
    for n in range(ncol):
        axarr[-1, n].xaxis.set_ticks(range(0, sample_length, int(sample_length/4)))
    fig.suptitle(idx)
    fig.subplots_adjust(hspace = 0.15)
    fig.savefig("./experiments/plots/gs/" + identifier + "_epoch" + str(idx).zfill(4) + ".png")
    plt.clf()
    plt.close()
    return

def save_plot_interpolate(input_samples, samples, idx, identifier,  num_epochs=None, distances=None, sigma=1):
    """ very boilerplate, unsure how to make nicer """
    n_samples = samples.shape[0]
    sample_length = samples.shape[1]
  
    if not num_epochs is None:
        col = hsv_to_rgb((1, 1.0*(idx)/num_epochs, 0.8))
    else:
        col = 'grey'

    x_points = np.arange(sample_length)
    if distances is None:
        nrow = n_samples
    else:
        nrow = n_samples + 1
    ncol = 1
    fig, axarr = plt.subplots(nrow, ncol, figsize=(3, 9))
    if distances is None:
        startat = 0
    else:
        startat = 1
        axarr[0].plot(distances.dA, color='green', label='distance from A', linestyle='--', marker='o', markersize=4)
        axarr[0].plot(distances.dB, color='orange', label='distance from B', linestyle='dotted', marker='o', markersize=4)
        axarr[0].get_xaxis().set_visible(False)
        axarr[0].set_title('distance from endpoints')
    for m in range(startat, nrow):
        sample = samples[m-startat, :, 0]
        axarr[m].plot(x_points, sample, color=col)
    for m in range(startat, nrow):
        axarr[m].set_ylim(-1.1, 1.1)
        axarr[m].set_xlim(0, sample_length)
        axarr[m].spines["top"].set_visible(False)
        axarr[m].spines["bottom"].set_visible(False)
        axarr[m].spines["right"].set_visible(False)
        axarr[m].spines["left"].set_visible(False)
        axarr[m].tick_params(bottom='off', left='off')
        axarr[m].get_xaxis().set_visible(False)
        axarr[m].get_yaxis().set_visible(False)
        axarr[m].set_facecolor((0.96, 0.96, 0.96))
    if not input_samples is None:
        # now do the real samples
        axarr[startat].plot(x_points, input_samples[0], color='green', linestyle='--')
        axarr[-1].plot(x_points, input_samples[1], color='green', linestyle='--')
    
    axarr[-1].xaxis.set_ticks(range(0, sample_length, int(sample_length/4)))
    fig.suptitle(idx)
    fig.subplots_adjust(hspace = 0.2)
    fig.savefig("./experiments/plots/" + identifier + "_interpolate.png")
    fig.savefig("./experiments/plots/" + identifier + "_interpolate.pdf")
    plt.clf()
    plt.close()
    return

def reconstruction_errors(identifier, train_errors, vali_errors, 
                          generated_errors, random_errors):
    """
    Plot two histogram of the reconstruction errors.
    """
    print(identifier)
    fig, axarr = plt.subplots(4, 1, sharex=True, figsize=(4, 8))
    axarr[0].hist(train_errors, normed=1, color='green', bins=50)
    axarr[0].set_title("train reconstruction errors")
    axarr[1].hist(vali_errors, normed=1, color='blue', bins=50)
    axarr[1].set_title('vali reconstruction errors')
    axarr[2].hist(generated_errors, normed=1, color='pink', bins=50)
    axarr[2].set_title('generated reconstruction errors')
    axarr[3].hist(random_errors, normed=1, color='grey', bins=50)
    axarr[3].set_title('random reconstruction errors')
    for ax in axarr:
        ax.spines["top"].set_visible(False)
        ax.spines["bottom"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.spines["left"].set_visible(False)
        ax.tick_params(bottom='off', left='off')
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
    axarr[3].set_xlim(0, 0.05)
    plt.tight_layout()
    plt.savefig('./experiments/plots/' + identifier + '_reconstruction_errors.png')
    return True

def save_plot_reconstruct(real_samples, model_samples, identifier):
    assert real_samples.shape == model_samples.shape
    sample_length = real_samples.shape[1]
    x_points = np.arange(sample_length)
    nrow = real_samples.shape[0]
    ncol = 2
    fig, axarr = plt.subplots(nrow, ncol, sharex=True, figsize=(6, 6))
    for m in range(nrow):
        real_sample = real_samples[m, :, 0]
        model_sample = model_samples[m, :, 0]
        axarr[m, 0].plot(x_points, real_sample, color='green')
        axarr[m, 1].plot(x_points, model_sample, color='red')
    axarr[-1, 0].xaxis.set_ticks(range(0, sample_length, int(sample_length/4)))
    axarr[-1, 1].xaxis.set_ticks(range(0, sample_length, int(sample_length/4)))
    axarr[0, 0].set_title('real')
    axarr[0, 1].set_title('reconstructed')
    fig.subplots_adjust(hspace = 0.15)
    fig.savefig("./experiments/plots/" + identifier + "_reconstruct.png")
    plt.clf()
    plt.close()
    return

def save_plot_vary_dimension(samples_list, idx, identifier, n_dim):
    """
    """
    assert len(samples_list) == n_dim
    sample_length = samples_list[0].shape[1]
  
    x_points = np.arange(sample_length)

    nrow = samples_list[0].shape[0]
    sidelength = n_dim*1.5
    fig, axarr = plt.subplots(nrow, n_dim, sharex=True, sharey=True, figsize=(sidelength, sidelength))
    for dim in range(n_dim):
        sample_dim = samples_list[dim]
        axarr[0, dim].set_title(dim)
        h = dim*1.0/n_dim       # hue
        for n in range(nrow):
            sample = sample_dim[n, :, 0]
            axarr[n, dim].plot(x_points, sample, color='black')
            axarr[n, dim].spines["top"].set_visible(False)
            axarr[n, dim].spines["bottom"].set_visible(False)
            axarr[n, dim].spines["right"].set_visible(False)
            axarr[n, dim].spines["left"].set_visible(False)
            axarr[n, dim].tick_params(bottom='off', left='off')
            axarr[n, dim].get_xaxis().set_visible(False)
            axarr[n, dim].set_facecolor(hsv_to_rgb((h, 0 + 0.25*n/nrow, 0.96)))
        axarr[-1, dim].xaxis.set_ticks(range(0, sample_length, int(sample_length/4)))
    fig.suptitle(idx)
    fig.subplots_adjust(hspace = 0.11, wspace=0.11)
    fig.savefig("./experiments/plots/" + identifier + "_epoch" + str(idx).zfill(4) + ".png")
    plt.clf()
    plt.close()
    return True

def interpolate(sampleA, sampleB=None, n_steps=6):
    """
    Plot the linear interpolation between two latent space points.
    """
    weights = np.linspace(0, 1, n_steps)
    if sampleB is None:
        # do it "close by"
        sampleB = sampleA + np.random.normal(size=sampleA.shape, scale=0.05)
    samples = np.array([w*sampleB + (1-w)*sampleA for w in weights])
    return samples

def vary_latent_dimension(sample, dimension, n_steps=6):
    """
    """
    assert dimension <= sample.shape[1]
    scale = np.mean(np.abs(sample[:, dimension]))
    deviations = np.linspace(0, 2*scale, n_steps)
    samples = np.array([sample[:, :]]*n_steps)
    for n in range(n_steps):
        samples[n, :, dimension] += deviations[n]
    return samples

def plot_sine_evaluation(real_samples, fake_samples, idx, identifier):
    """ 
    Create histogram of fake (generated) samples frequency, amplitude distribution.
    Also for real samples.
    """
    ### frequency
    seq_length = len(real_samples[0])    # assumes samples are all the same length
    frate = seq_length
    freqs_hz = np.fft.rfftfreq(seq_length)*frate      # this is for labelling the plot
    # TODO, just taking axis 0 for now...
    w_real = np.mean(np.abs(np.fft.rfft(real_samples[:, :, 0])), axis=0)
    w_fake = np.mean(np.abs(np.fft.rfft(fake_samples[:, :, 0])), axis=0)
    ### amplitude
    A_real = np.max(np.abs(real_samples[:, :, 0]), axis=1) 
    A_fake = np.max(np.abs(fake_samples[:, :, 0]), axis=1) 
    ### now plot
    nrow = 2
    ncol = 2
    fig, axarr = plt.subplots(nrow, ncol, sharex='col', figsize=(6, 6))
    # freq
    axarr[0, 0].vlines(freqs_hz, ymin=np.minimum(np.zeros_like(w_real), w_real), ymax=np.maximum(np.zeros_like(w_real), w_real), color='#30ba50')
    axarr[0, 0].set_title("frequency", fontsize=16)
    axarr[0, 0].set_ylabel("real", fontsize=16)
    axarr[1, 0].vlines(freqs_hz, ymin=np.minimum(np.zeros_like(w_fake), w_fake), ymax=np.maximum(np.zeros_like(w_fake), w_fake), color='#ba4730')
    axarr[1, 0].set_ylabel("generated", fontsize=16)
    # amplitude
    axarr[0, 1].hist(A_real, normed=True, color='#30ba50', bins=30)
    axarr[0, 1].set_title("amplitude", fontsize=16)
    axarr[1, 1].hist(A_fake, normed=True, color='#ba4730', bins=30)
   
    fig.savefig('./experiments/plots/' + identifier + '_eval' + str(idx).zfill(4) +'.png')
    plt.clf()
    plt.close()
    return True

def plot_trace(identifier, xmax=250, final=False, dp=False):
    """
    """

    trace_path = './experiments/traces/' + identifier + '.trace.txt'
    da = read_table(trace_path, sep=' ')
    nrow = 3
    if dp:
        trace_dp_path = './experiments/traces/' + identifier + '.dptrace.txt'
        da_dp = read_table(trace_dp_path, sep=' ')
        nrow += 1
    
    ncol=1  
    fig, axarr = plt.subplots(nrow, ncol, sharex='col', figsize=(6, 6))

    # D_loss
    d_handle,  = axarr[0].plot(da.epoch, da.D_loss, color='red', label='discriminator')
    axarr[0].set_ylabel('D loss')
#    axarr[0].set_ylim(0.9, 1.6)
    if final:
        #D_ticks = [1.0, 1.2, 1.5]
        D_ticks = [0.5, 1.0, 1.5]
        axarr[0].get_yaxis().set_ticks(D_ticks)
        for tick in D_ticks:
            axarr[0].plot((-10, xmax+10), (tick, tick), ls='dotted', lw=0.5, color='black', alpha=0.4, zorder=0)
    # G loss
    ax_G = axarr[0].twinx()
    g_handle,  = ax_G.plot(da.epoch, da.G_loss, color='green', ls='dashed', label='generator')
    ax_G.set_ylabel('G loss')
    if final:
        G_ticks = [2.5, 5]
        ax_G.get_yaxis().set_ticks(G_ticks)
#        for tick in G_ticks:
#            axarr[0].plot((-10, xmax+10), (tick, tick), ls='dotted', lw=0.5, color='green', alpha=1.0, zorder=0)

    ax_G.spines["top"].set_visible(False)
    ax_G.spines["bottom"].set_visible(False)
    ax_G.spines["right"].set_visible(False)
    ax_G.spines["left"].set_visible(False)
    ax_G.tick_params(bottom='off', right='off')
    axarr[0].legend(handles=[d_handle, g_handle], labels=['discriminator', 'generator'])

    # mmd
    da_mmd = da.loc[:, ['epoch', 'mmd2']].dropna()
    axarr[1].plot(da_mmd.epoch, da_mmd.mmd2, color='purple')
    axarr[1].set_ylabel('MMD$^2$')
    #axarr[1].set_ylim(0.0, 0.04)

    #ax_that = axarr[1].twinx()
    #ax_that.plot(da.that)
    #ax_that.set_ylabel('$\hat{t}$')
    #ax_that.set_ylim(0, 50)
    if final:
        mmd_ticks = [0.01, 0.02, 0.03]
        axarr[1].get_yaxis().set_ticks(mmd_ticks)
        for tick in mmd_ticks:
            axarr[1].plot((-10, xmax+10), (tick, tick), ls='dotted', lw=0.5, color='black', alpha=0.4, zorder=0)

    # log likelihood
    da_ll = da.loc[:, ['epoch', 'll', 'real_ll']].dropna()
    axarr[2].plot(da_ll.epoch, da_ll.ll, color='orange')
    axarr[2].plot(da_ll.epoch, da_ll.real_ll, color='orange', alpha=0.5)
    axarr[2].set_ylabel('likelihood')
    axarr[2].set_xlabel('epoch')
    axarr[2].set_ylim(-750, 100)
    #axarr[2].set_ylim(-10000000, 500)
    if final:
#        ll_ticks = [-1.0*1e7, -0.5*1e7, 0]
        ll_ticks = [-500 ,-250, 0]
        axarr[2].get_yaxis().set_ticks(ll_ticks)
        for tick in ll_ticks:
            axarr[2].plot((-10, xmax+10), (tick, tick), ls='dotted', lw=0.5, color='black', alpha=0.4, zorder=0)
   
    if dp:
        assert da_dp.columns[0] == 'epoch'
        epochs = da_dp['epoch']
        eps_values = da_dp.columns[1:]
        for eps_string in eps_values:
            if 'eps' in eps_string:
                eps = eps_string[3:]
            else:
                eps = eps_string
            deltas = da_dp[eps_string]
            axarr[3].plot(epochs, deltas, label=eps)
            axarr[3].set_ylabel('delta')
            axarr[3].set_xlabel('epoch')
        axarr[3].legend()

    # beautify
    for ax in axarr:
        #ax.spines["top"].set_visible(True)
        ax.spines["top"].set_color((0, 0, 0, 0.3))
        #ax.spines["bottom"].set_visible(False)
        ax.spines["bottom"].set_color((0, 0, 0, 0.3))
        #ax.spines["right"].set_visible(False)
        ax.spines["right"].set_color((0, 0, 0, 0.3))
        #ax.spines["left"].set_visible(False)
        ax.spines["left"].set_color((0, 0, 0, 0.3))
        ax.tick_params(bottom='off', left='off')
        # make background grey
    #    ax.set_facecolor((0.96, 0.96, 0.96))
        ymin, ymax = ax.get_ylim()
        for x in np.arange(0, xmax+10, 10):
            ax.plot((x, x), (ymin, ymax), ls='dotted', lw=0.5, color='black', alpha=0.40, zorder=0)
        ax.set_xlim(-5, xmax)
        ax.get_yaxis().set_label_coords(-0.11,0.5)

    # bottom one

    fig.savefig('./experiments/traces/' + identifier + '_trace.png')
    fig.savefig('./experiments/traces/' + identifier + '_trace.pdf')
    plt.clf()
    plt.close()
    return True


def save_samples(vis_sample, identifier, epoch):

    np.save('./experiments/plots/gs/' + identifier + '_gs_%s.npy' % epoch, vis_sample)

    return True

def save_samples_real(vis_real, identifier):

    np.save('./experiments/plots/gs/' + identifier + '_gs_real.npy',  vis_real)

    return True

def save_mnist_plot_sample(samples, idx, identifier, n_samples, labels=None):
    """
    Generates a grid showing mnist digits.

    """
    assert n_samples <= samples.shape[0]
    if not labels is None:
        assert n_samples <= len(labels)
        if len(labels.shape) > 1 and not labels.shape[1] == 1:
            # one-hot
            label_titles = np.argmax(labels, axis=1)
        else:
            label_titles = labels
    else:
        label_titles = ['NA']*n_samples
    assert n_samples % 2 == 0
    img_size = int(np.sqrt(samples.shape[1]))
    
    nrow = int(n_samples/2)
    ncol = 2
    fig, axarr = plt.subplots(nrow, ncol, sharex=True, figsize=(8, 8))
    for m in range(nrow):
        # first column
        sample = samples[m, :, 0]
        axarr[m, 0].imshow(sample.reshape([img_size,img_size]), cmap='gray')            
        axarr[m, 0].set_title(str(label_titles[m]))
        # second column
        sample = samples[nrow + m, :, 0]
        axarr[m, 1].imshow(sample.reshape([img_size,img_size]), cmap='gray')           
        axarr[m, 1].set_title(str(label_titles[m + nrow]))
    fig.suptitle(idx)
    fig.suptitle(idx)
    fig.subplots_adjust(hspace = 0.15)
    fig.savefig("./experiments/plots/" + identifier + "_epoch" + str(idx).zfill(4) + ".png")
    plt.clf()
    plt.close()
    return

def visualise_latent(Z, identifier):
    """
    visualise a SINGLE point in the latent space
    """
    seq_length = Z.shape[0]
    latent_dim = Z.shape[1]
    if latent_dim > 2: 
        print('WARNING: Only visualising first two dimensions of latent space.')
    h = np.random.random()
    colours = np.array([hsv_to_rgb((h, i/seq_length, 0.96)) for i in range(seq_length)])
#    plt.plot(Z[:, 0], Z[:, 1], c='grey', alpha=0.5)
    for i in range(seq_length):
        plt.scatter(Z[i, 0], Z[i, 1], marker='o', c=colours[i])
    plt.savefig('./experiments/plots/' + identifier + '_Z.png')
    plt.clf()
    plt.close()
    return True


# --- to do with the model --- #
def plot_parameters(parameters, identifier):
    """
    visualise the parameters of a GAN
    """
    generator_out = parameters['generator/W_out_G:0']
    generator_weights = parameters['generator/rnn/lstm_cell/weights:0'] # split this into four
    generator_matrices = np.split(generator_weights, 4, 1)
    fig, axarr = plt.subplots(5, 1, sharex=True, 
            gridspec_kw = {'height_ratios':[0.2, 1, 1, 1, 1]}, figsize=(3,13))

    axarr[0].matshow(generator_out.T, extent=[0,100,0,100])
    axarr[0].set_title('W_out_G')
    axarr[1].matshow(generator_matrices[0])
    axarr[1].set_title('LSTM weights (1)')
    axarr[2].matshow(generator_matrices[1])
    axarr[2].set_title('LSTM weights (2)')
    axarr[3].matshow(generator_matrices[2])
    axarr[3].set_title('LSTM weights (3)')
    axarr[4].matshow(generator_matrices[3])
    axarr[4].set_title('LSTM weights (4)')
    for a in axarr:
        a.set_xlim(0, 100)
        a.set_ylim(0, 100)
        a.spines["top"].set_visible(False)
        a.spines["bottom"].set_visible(False)
        a.spines["right"].set_visible(False)
        a.spines["left"].set_visible(False)
        a.get_xaxis().set_visible(False)
        a.get_yaxis().set_visible(False)
#        a.tick_params(bottom='off', left='off', top='off')
    plt.tight_layout()
    plt.savefig('./experiments/plots/' + identifier + '_weights.png')
    return True

### TSTR ###
def view_mnist_eval(identifier, train_X, train_Y, synth_X, synth_Y, test_X, test_Y, synth_predY, real_predY):
    """
    Basically just 
    http://scikit-learn.org/stable/auto_examples/classification/plot_digits_classification.html
    """
    # resize everything
    side_length = int(np.sqrt(train_X.shape[1]))
    train_X = train_X.reshape(-1, side_length, side_length)
    synth_X = synth_X.reshape(-1, side_length, side_length)
    test_X = test_X.reshape(-1, side_length, side_length)
    # remember, they're wrecked in the outer function thanks to python
    synth_images_and_labels = list(zip(synth_X, synth_Y))
    for index, (image, label) in enumerate(synth_images_and_labels[:4]):
        plt.subplot(4, 4, index + 1)
        plt.axis('off')
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        if index == 0:
            plt.title('synth train: %i' % label)
        else:
            plt.title('%i' % label)
    train_images_and_labels = list(zip(train_X, train_Y))
    for index, (image, label) in enumerate(train_images_and_labels[:4]):
        plt.subplot(4, 4, index + 5)
        plt.axis('off')
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        if index == 0:
            plt.title('real train: %i' % label)
        else:
            plt.title('%i' % label)
    images_and_synthpreds = list(zip(test_X, synth_predY))
    for index, (image, prediction) in enumerate(images_and_synthpreds[:4]):
        plt.subplot(4, 4, index + 9)
        plt.axis('off')
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        if index == 0:
            plt.title('synth pred: %i' % prediction)
        else:
            plt.title('%i' % prediction)
    images_and_realpreds = list(zip(test_X, real_predY))
    for index, (image, prediction) in enumerate(images_and_realpreds[:4]):
        plt.subplot(4, 4, index + 13)
        plt.axis('off')
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        if index == 0:
            plt.title('real pred: %i' % prediction)
        else:
            plt.title('%i' % prediction)
    plt.tight_layout()
    plt.title(identifier)
    plt.savefig('./experiments/tstr/' + identifier + '_preds.png')
    return True

# --- nips !!! --- #
def nips_plot_rbf(sample, index, which='train'):
    if which == 'train':
#        col = '#167ea0'
        col = '#13af5f'
    else:
        col = 'black'
    sample_length = len(sample)
    sample = sample.reshape(sample_length)
    x_points = np.arange(sample_length)
    fig, axarr = plt.subplots(1, 1, figsize=(2, 2))
    axarr.set_facecolor((0.95, 0.96, 0.96))
    axarr.plot(x_points, sample, color=col)
    axarr.set_ylim(-1.5, 1.5)
    axarr.get_xaxis().set_visible(False)
    axarr.get_yaxis().set_visible(False)
    axarr.spines["top"].set_visible(False)
    axarr.spines["bottom"].set_visible(False)
    axarr.spines["right"].set_visible(False)
    axarr.spines["left"].set_visible(False)
    axarr.tick_params(bottom='off', left='off')
    plt.savefig('./plots/NIPS_rbf_' + which + '_' + str(index) + '.png')
    plt.savefig('./plots/NIPS_rbf_' + which + '_' + str(index) + '.pdf')
    plt.clf()
    plt.close()
    return True

def nips_plot_sine(sample, index, which='train'):
    if which == 'train':
        #col = '#167ea0'
        #col = '#13af5f'
        col = '#1188ad'
    else:
        col = 'black'
    sample_length = len(sample)
    sample = sample.reshape(sample_length)
    sample_length = len(sample)
    sample = sample.reshape(sample_length)
    x_points = np.arange(sample_length)
    fig, axarr = plt.subplots(1, 1, figsize=(2, 2))
    axarr.set_facecolor((0.95, 0.96, 0.96))
    axarr.plot(x_points, sample, color=col)
    axarr.set_ylim(-1.1, 1.1)
    axarr.get_xaxis().set_visible(False)
    axarr.get_yaxis().set_visible(False)
    axarr.spines["top"].set_visible(False)
    axarr.spines["bottom"].set_visible(False)
    axarr.spines["right"].set_visible(False)
    axarr.spines["left"].set_visible(False)
    axarr.tick_params(bottom='off', left='off')
    plt.savefig('./plots/NIPS_sine_' + which + '_' + str(index) + '.png')
    plt.savefig('./plots/NIPS_sine_' + which + '_' + str(index) + '.pdf')
    plt.clf()
    plt.close()
    return True

def nips_plot_mnist(sample, index, which='train'):
    plt.axis('off')
    plt.imshow(sample, cmap=plt.cm.gray, interpolation='nearest')
    plt.savefig('./plots/NIPS_mnist_' + which + '_' + str(index) + '.png')
    plt.savefig('./plots/NIPS_mnist_' + which + '_' + str(index) + '.pdf')
    plt.clf()
    plt.close()
    return True


================================================
FILE: tf_ops.py
================================================
### from https://github.com/eugenium/MMD/blob/master/tf_ops.py
import tensorflow as tf


def sq_sum(t, name=None):
    "The squared Frobenius-type norm of a tensor, sum(t ** 2)."
    with tf.name_scope(name, "SqSum", [t]):
        t = tf.convert_to_tensor(t, name='t')
        return 2 * tf.nn.l2_loss(t)


def dot(x, y, name=None):
    "The dot product of two vectors x and y."
    with tf.name_scope(name, "Dot", [x, y]):
        x = tf.convert_to_tensor(x, name='x')
        y = tf.convert_to_tensor(y, name='y')

        x.get_shape().assert_has_rank(1)
        y.get_shape().assert_has_rank(1)

        return tf.squeeze(tf.matmul(tf.expand_dims(x, 0), tf.expand_dims(y, 1)))


================================================
FILE: utils.py
================================================
#!/usr/bin/env ipython
# Utility functions that don't fit in other scripts
import argparse
import json

def rgan_options_parser():
    """
    Define parser to parse options from command line, with defaults.
    Refer to this function for definitions of various variables.
    """
    parser = argparse.ArgumentParser(description='Train a GAN to generate sequential, real-valued data.')
    # meta-option
    parser.add_argument('--settings_file', help='json file of settings, overrides everything else', type=str, default='')
    # options pertaining to data
    parser.add_argument('--data', help='what kind of data to train with?',
            default='gp_rbf',
            choices=['gp_rbf', 'sine', 'mnist', 'load'])
    # parser.add_argument('--num_samples', type=int, help='how many training examples \
    #                 to generate?', default=28*5*100)
    # parser.add_argument('--num_samples_t', type=int, help='how many testing examples \
    #                     for anomaly detection?', default=28 * 5 * 100)
    parser.add_argument('--seq_length', type=int, default=30)
    parser.add_argument('--num_signals', type=int, default=1)
    parser.add_argument('--normalise', type=bool, default=False, help='normalise the \
            training/vali/test data (during split)?')
    # parser.add_argument('--AD', type=bool, default=False, help='should we conduct anomaly detection?')

    ### for gp_rbf
    parser.add_argument('--scale', type=float, default=0.1)
            ### for sin (should be using subparsers for this...)
    parser.add_argument('--freq_low', type=float, default=1.0)
    parser.add_argument('--freq_high', type=float, default=5.0)
    parser.add_argument('--amplitude_low', type=float, default=0.1)
    parser.add_argument('--amplitude_high', type=float, default=0.9)
            ### for mnist
    parser.add_argument('--multivariate_mnist', type=bool, default=False)
    parser.add_argument('--full_mnist', type=bool, default=False)
            ### for loading
    parser.add_argument('--data_load_from', type=str, default='')
            ### for eICU
    parser.add_argument('--resample_rate_in_min', type=int, default=15)
    # hyperparameters of the model
    parser.add_argument('--hidden_units_g', type=int, default=100)
    parser.add_argument('--hidden_units_d', type=int, default=100)
    parser.add_argument('--hidden_units_e', type=int, default=100)
    parser.add_argument('--kappa', type=float, help='weight between final output \
            and intermediate steps in discriminator cost (1 = all \
            intermediate', default=1)
    parser.add_argument('--latent_dim', type=int, default=5, help='dimensionality \
            of the latent/noise space')
    parser.add_argument('--weight', type=int, default=0.5, help='weight of score')
    parser.add_argument('--degree', type=int, default=1, help='norm degree')
    parser.add_argument('--batch_mean', type=bool, default=False, help='append the mean \
            of the batch to all variables for calculating discriminator loss')
    parser.add_argument('--learn_scale', type=bool, default=False, help='make the \
            "scale" parameter at the output of the generator learnable (else fixed \
            to 1')
    # options pertaining to training
    parser.add_argument('--learning_rate', type=float, default=0.1)
    parser.add_argument('--batch_size', type=int, default=28)
    parser.add_argument('--num_epochs', type=int, default=100)
    parser.add_argument('--D_rounds', type=int, default=5, help='number of rounds \
            of discriminator training')
    parser.add_argument('--G_rounds', type=int, default=1, help='number of rounds \
            of generator training')
    parser.add_argument('--E_rounds', type=int, default=1, help='number of rounds \
               of encoder training')
    # parser.add_argument('--use_time', type=bool, default=False, help='enforce \
    #         latent dimension 0 to correspond to time')
    parser.add_argument('--shuffle', type=bool, default=True)
    parser.add_argument('--eval_mul', type=bool, default=False)
    parser.add_argument('--eval_an', type=bool, default=False)
    parser.add_argument('--eval_single', type=bool, default=False)
    parser.add_argument('--wrong_labels', type=bool, default=False, help='augment \
            discriminator loss with real examples with wrong (~shuffled, sort of) labels')
    # options pertaining to evaluation and exploration
    parser.add_argument('--identifier', type=str, default='test', help='identifier \
            string for output files')
    parser.add_argument('--sub_id', type=str, default='test', help='identifier \
               string for load parameters')
    # options pertaining to differential privacy
    parser.add_argument('--dp', type=bool, default=False, help='train discriminator \
            with differentially private SGD?')
    parser.add_argument('--l2norm_bound', type=float, default=1e-5,
            help='bound on norm of individual gradients for DP training')
    parser.add_argument('--batches_per_lot', type=int, default=1,
            help='number of batches per lot (for DP)')
    parser.add_argument('--dp_sigma', type=float, default=1e-5,
            help='sigma for noise added (for DP)')

    return parser

def load_settings_from_file(settings):
    """
    Handle loading settings from a JSON file, filling in missing settings from
    the command line defaults, but otherwise overwriting them.
    """
    settings_path = './experiments/settings/' + settings['settings_file'] + '.txt'
    print('Loading settings from', settings_path)
    settings_loaded = json.load(open(settings_path, 'r'))
    # check for settings missing in file
    for key in settings.keys():
        if not key in settings_loaded:
            print(key, 'not found in loaded settings - adopting value from command line defaults: ', settings[key])
            # overwrite parsed/default settings with those read from file, allowing for
    # (potentially new) default settings not present in file
    settings.update(settings_loaded)
    return settings