[
  {
    "path": "DeepCCA.py",
    "content": "try:\n    import cPickle as thepickle\nexcept ImportError:\n    import _pickle as thepickle\n\nimport gzip\nimport numpy as np\n\nfrom keras.callbacks import ModelCheckpoint\nfrom utils import load_data, svm_classify\nfrom linear_cca import linear_cca\nfrom models import create_model\n\n\ndef train_model(model, data1, data2, epoch_num, batch_size):\n    \"\"\"\n    trains the model\n    # Arguments\n        data1 and data2: the train, validation, and test data for view 1 and view 2 respectively. data should be packed\n        like ((X for train, Y for train), (X for validation, Y for validation), (X for test, Y for test))\n        epoch_num: number of epochs to train the model\n        batch_size: the size of batches\n    # Returns\n        the trained model\n    \"\"\"\n\n    # Unpacking the data\n    train_set_x1, train_set_y1 = data1[0]\n    valid_set_x1, valid_set_y1 = data1[1]\n    test_set_x1, test_set_y1 = data1[2]\n\n    train_set_x2, train_set_y2 = data2[0]\n    valid_set_x2, valid_set_y2 = data2[1]\n    test_set_x2, test_set_y2 = data2[2]\n\n    # best weights are saved in \"temp_weights.hdf5\" during training\n    # it is done to return the best model based on the validation loss\n    checkpointer = ModelCheckpoint(filepath=\"temp_weights.h5\", verbose=1, save_best_only=True, save_weights_only=True)\n\n    # used dummy Y because labels are not used in the loss function\n    model.fit([train_set_x1, train_set_x2], np.zeros(len(train_set_x1)),\n              batch_size=batch_size, epochs=epoch_num, shuffle=True,\n              validation_data=([valid_set_x1, valid_set_x2], np.zeros(len(valid_set_x1))),\n              callbacks=[checkpointer])\n\n    model.load_weights(\"temp_weights.h5\")\n\n    results = model.evaluate([test_set_x1, test_set_x2], np.zeros(len(test_set_x1)), batch_size=batch_size, verbose=1)\n\n    print('loss on test data: ', results)\n\n    results = model.evaluate([valid_set_x1, valid_set_x2], np.zeros(len(valid_set_x1)), batch_size=batch_size, verbose=1)\n    print('loss on validation data: ', results)\n    return model\n\n\ndef test_model(model, data1, data2, outdim_size, apply_linear_cca):\n    \"\"\"produce the new features by using the trained model\n    # Arguments\n        model: the trained model\n        data1 and data2: the train, validation, and test data for view 1 and view 2 respectively.\n            Data should be packed like\n            ((X for train, Y for train), (X for validation, Y for validation), (X for test, Y for test))\n        outdim_size: dimension of new features\n        apply_linear_cca: if to apply linear CCA on the new features\n    # Returns\n        new features packed like\n            ((new X for train - view 1, new X for train - view 2, Y for train),\n            (new X for validation - view 1, new X for validation - view 2, Y for validation),\n            (new X for test - view 1, new X for test - view 2, Y for test))\n    \"\"\"\n\n    # producing the new features\n    new_data = []\n    for k in range(3):\n        pred_out = model.predict([data1[k][0], data2[k][0]])\n        r = int(pred_out.shape[1] / 2)\n        new_data.append([pred_out[:, :r], pred_out[:, r:], data1[k][1]])\n\n    # based on the DCCA paper, a linear CCA should be applied on the output of the networks because\n    # the loss function actually estimates the correlation when a linear CCA is applied to the output of the networks\n    # however it does not improve the performance significantly\n    if apply_linear_cca:\n        w = [None, 
None]\n        m = [None, None]\n        print(\"Linear CCA started!\")\n        w[0], w[1], m[0], m[1] = linear_cca(new_data[0][0], new_data[0][1], outdim_size)\n        print(\"Linear CCA ended!\")\n\n        # Something done in the original MATLAB implementation of DCCA, do not know exactly why;)\n        # it did not affect the performance significantly on the noisy MNIST dataset\n        #s = np.sign(w[0][0,:])\n        #s = s.reshape([1, -1]).repeat(w[0].shape[0], axis=0)\n        #w[0] = w[0] * s\n        #w[1] = w[1] * s\n        ###\n\n        for k in range(3):\n            data_num = len(new_data[k][0])\n            for v in range(2):\n                new_data[k][v] -= m[v].reshape([1, -1]).repeat(data_num, axis=0)\n                new_data[k][v] = np.dot(new_data[k][v], w[v])\n\n    return new_data\n\n\nif __name__ == '__main__':\n    ############\n    # Parameters Section\n\n    # the path to save the final learned features\n    save_to = './new_features.gz'\n\n    # the size of the new space learned by the model (number of the new features)\n    outdim_size = 10\n\n    # size of the input for view 1 and view 2\n    input_shape1 = 784\n    input_shape2 = 784\n\n    # number of layers with nodes in each one\n    layer_sizes1 = [1024, 1024, 1024, outdim_size]\n    layer_sizes2 = [1024, 1024, 1024, outdim_size]\n\n    # the parameters for training the network\n    learning_rate = 1e-3\n    epoch_num = 100\n    batch_size = 800\n\n    # the regularization parameter of the network\n    # seems necessary to avoid the gradient exploding especially when non-saturating activations are used\n    reg_par = 1e-5\n\n    # specifies if all the singular values should get used to calculate the correlation or just the top outdim_size ones\n    # if one option does not work for a network or dataset, try the other one\n    use_all_singular_values = False\n\n    # if a linear CCA should get applied on the learned features extracted from the networks\n    # it does not affect the performance on noisy MNIST significantly\n    apply_linear_cca = True\n\n    # end of parameters section\n    ############\n\n    # Each view is stored in a gzip file separately. They will get downloaded the first time the code gets executed.\n    # Datasets get stored under the datasets folder of user's Keras folder\n    # normally under [Home Folder]/.keras/datasets/\n    data1 = load_data('noisymnist_view1.gz', 'https://www2.cs.uic.edu/~vnoroozi/noisy-mnist/noisymnist_view1.gz')\n    data2 = load_data('noisymnist_view2.gz', 'https://www2.cs.uic.edu/~vnoroozi/noisy-mnist/noisymnist_view2.gz')\n\n    # Building, training, and producing the new features by DCCA\n    model = create_model(layer_sizes1, layer_sizes2, input_shape1, input_shape2,\n                            learning_rate, reg_par, outdim_size, use_all_singular_values)\n    model.summary()\n    model = train_model(model, data1, data2, epoch_num, batch_size)\n    new_data = test_model(model, data1, data2, outdim_size, apply_linear_cca)\n\n    # Training and testing of SVM with linear kernel on the view 1 with new features\n    [test_acc, valid_acc] = svm_classify(new_data, C=0.01)\n    print(\"Accuracy on view 1 (validation data) is:\", valid_acc * 100.0)\n    print(\"Accuracy on view 1 (test data) is:\", test_acc*100.0)\n\n    # Saving new features in a gzip pickled file specified by save_to\n    print('saving new features ...')\n    f1 = gzip.open(save_to, 'wb')\n    thepickle.dump(new_data, f1)\n    f1.close()\n"
  },
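  {
    "path": "load_features_example.py",
    "content": "\"\"\"\nA hypothetical usage sketch (not part of the original code): it shows how the\nfeatures that DeepCCA.py pickles into save_to (./new_features.gz by default)\ncan be loaded back for downstream use. The file name and helper name here are\nillustrative only.\n\"\"\"\nimport gzip\n\ntry:\n    import cPickle as thepickle\nexcept ImportError:\n    import _pickle as thepickle\n\n\ndef load_new_features(path='./new_features.gz'):\n    \"\"\"loads the features saved by DeepCCA.py\n    # Returns\n        new_data packed like\n            ((new X for train - view 1, new X for train - view 2, Y for train),\n            (new X for validation - view 1, new X for validation - view 2, Y for validation),\n            (new X for test - view 1, new X for test - view 2, Y for test))\n    \"\"\"\n    with gzip.open(path, 'rb') as f:\n        new_data = thepickle.load(f)\n    return new_data\n\n\nif __name__ == '__main__':\n    new_data = load_new_features()\n    train_x1, train_x2, train_y = new_data[0]\n    print('view 1 train features shape:', train_x1.shape)\n"
  },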
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2016 Vahid Noroozi\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE."
  },
  {
    "path": "README.md",
    "content": "# DCCA: Deep Canonical Correlation Analysis\n\nThis is an implementation of Deep Canonical Correlation Analysis (DCCA or Deep CCA) in Python. It needs Theano and Keras libraries to be installed.\n\nDCCA is a non-linear version of CCA which uses neural networks as the mapping functions instead of linear transformers. DCCA is originally proposed in the following paper:\n\nGalen Andrew, Raman Arora, Jeff Bilmes, Karen Livescu, \"[Deep Canonical Correlation Analysis.](http://www.jmlr.org/proceedings/papers/v28/andrew13.pdf)\", ICML, 2013.\n\nIt uses the Keras library with the Theano backend, and does not work on the Tensorflow backend. Because the loss function of the network is written with Theano. The base modeling network can easily get substituted with a more efficient and powerful network like CNN.\n\nMost of the configuration and parameters are set based on the following paper:\n\nWeiran Wang, Raman Arora, Karen Livescu, and Jeff Bilmes. \"[On Deep Multi-View Representation Learning.](http://proceedings.mlr.press/v37/wangb15.pdf)\", ICML, 2015.\n\n### Dataset\nThe model is evaluated on a noisy version of MNIST dataset. I built the dataset exactly like the way it is introduced in the paper. The train/validation/test split is the original split of MNIST.\n\nThe dataset was large and could not get uploaded on GitHub. So it is uploaded on another server. The first time that the code gets executed, the dataset gets downloaded automatically by the code. It will get saved under the datasets folder of user's Keras folder (normally under [Home Folder]/.keras/datasets/).\n\n### Differences with the original paper\nThe following are the differences between my implementation and the original paper (they are small):\n\n * I used RMSProp (an adaptive version of gradient descent) instead of GD with momentum. It was so much faster in converging.\n * Instead of a non-saturating version of sigmoid, I just used the standard sigmoid as the activation functions. Standard sigmoid is used in the MATLAB implementation too. It should not affect the performance significantly. However, if it is needed, it can get substituted by another non-saturating activation function like ReLU.\n * Pre-training is not done in this implementation. However, it is not clear how much it can be useful.\n\n### Other Implementations\nThe following are the other implementations of DCCA in MATLAB and C++ from which I got help for the implementation. These codes are written by the authors of the original paper:\n\n* [C++ implementation](https://homes.cs.washington.edu/~galen/files/dcca.tgz) from Galen Andrew's website (https://homes.cs.washington.edu/~galen/)\n\n* [MATLAB implementation](http://ttic.uchicago.edu/~wwang5/papers/dccae.tgz) from Weiran Wang's website (http://ttic.uchicago.edu/~wwang5/dccae.html)"
  },
  {
    "path": "linear_cca.py",
    "content": "import numpy\n\n\ndef linear_cca(H1, H2, outdim_size):\n    \"\"\"\n    An implementation of linear CCA\n    # Arguments:\n        H1 and H2: the matrices containing the data for view 1 and view 2. Each row is a sample.\n        outdim_size: specifies the number of new features\n    # Returns\n        A and B: the linear transformation matrices \n        mean1 and mean2: the means of data for both views\n    \"\"\"\n    r1 = 1e-4\n    r2 = 1e-4\n\n    m = H1.shape[0]\n    o = H1.shape[1]\n\n    mean1 = numpy.mean(H1, axis=0)\n    mean2 = numpy.mean(H2, axis=0)\n    H1bar = H1 - numpy.tile(mean1, (m, 1))\n    H2bar = H2 - numpy.tile(mean2, (m, 1))\n\n    SigmaHat12 = (1.0 / (m - 1)) * numpy.dot(H1bar.T, H2bar)\n    SigmaHat11 = (1.0 / (m - 1)) * numpy.dot(H1bar.T, H1bar) + r1 * numpy.identity(o)\n    SigmaHat22 = (1.0 / (m - 1)) * numpy.dot(H2bar.T, H2bar) + r2 * numpy.identity(o)\n\n    [D1, V1] = numpy.linalg.eigh(SigmaHat11)\n    [D2, V2] = numpy.linalg.eigh(SigmaHat22)\n    SigmaHat11RootInv = numpy.dot(numpy.dot(V1, numpy.diag(D1 ** -0.5)), V1.T)\n    SigmaHat22RootInv = numpy.dot(numpy.dot(V2, numpy.diag(D2 ** -0.5)), V2.T)\n\n    Tval = numpy.dot(numpy.dot(SigmaHat11RootInv, SigmaHat12), SigmaHat22RootInv)\n\n    [U, D, V] = numpy.linalg.svd(Tval)\n    V = V.T\n    A = numpy.dot(SigmaHat11RootInv, U[:, 0:outdim_size])\n    B = numpy.dot(SigmaHat22RootInv, V[:, 0:outdim_size])\n    D = D[0:outdim_size]\n\n    return A, B, mean1, mean2\n"
  },
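  {
    "path": "linear_cca_example.py",
    "content": "\"\"\"\nA hypothetical usage sketch for linear_cca (not part of the original code):\nit builds two random views that share a latent signal, fits linear CCA, and\nprojects both views with the returned transforms, the same way test_model in\nDeepCCA.py applies them. All names in this file are illustrative only.\n\"\"\"\nimport numpy as np\n\nfrom linear_cca import linear_cca\n\nif __name__ == '__main__':\n    np.random.seed(0)\n\n    # two views sharing a common latent signal plus independent noise\n    latent = np.random.randn(1000, 10)\n    view1 = latent + 0.1 * np.random.randn(1000, 10)\n    view2 = latent + 0.1 * np.random.randn(1000, 10)\n\n    A, B, mean1, mean2 = linear_cca(view1, view2, outdim_size=5)\n\n    # center with the fitted means, then apply the linear transforms\n    proj1 = np.dot(view1 - mean1, A)\n    proj2 = np.dot(view2 - mean2, B)\n    print('projected shapes:', proj1.shape, proj2.shape)  # (1000, 5) (1000, 5)\n"
  },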
  {
    "path": "models.py",
    "content": "from keras.layers import Dense, Merge\nfrom keras.models import Sequential\nfrom keras.optimizers import RMSprop\nfrom keras.regularizers import l2\nfrom objectives import cca_loss\n\n\ndef create_model(layer_sizes1, layer_sizes2, input_size1, input_size2,\n                    learning_rate, reg_par, outdim_size, use_all_singular_values):\n    \"\"\"\n    builds the whole model\n    the structure of each sub-network is defined in build_mlp_net,\n    and it can easily get substituted with a more efficient and powerful network like CNN\n    \"\"\"\n    view1_model = build_mlp_net(layer_sizes1, input_size1, reg_par)\n    view2_model = build_mlp_net(layer_sizes2, input_size2, reg_par)\n\n    model = Sequential()\n    model.add(Merge([view1_model, view2_model], mode='concat'))\n\n    model_optimizer = RMSprop(lr=learning_rate)\n    model.compile(loss=cca_loss(outdim_size, use_all_singular_values), optimizer=model_optimizer)\n\n    return model\n\n\ndef build_mlp_net(layer_sizes, input_size, reg_par):\n    model = Sequential()\n    for l_id, ls in enumerate(layer_sizes):\n        if l_id == 0:\n            input_dim = input_size\n        else:\n            input_dim = []\n        if l_id == len(layer_sizes)-1:\n            activation = 'linear'\n        else:\n            activation = 'sigmoid'\n\n        model.add(Dense(ls, input_dim=input_dim,\n                                activation=activation,\n                                kernel_regularizer=l2(reg_par)))\n    return model\n"
  },
  {
    "path": "objectives.py",
    "content": "import theano.tensor as T\n\n\ndef cca_loss(outdim_size, use_all_singular_values):\n    \"\"\"\n    The main loss function (inner_cca_objective) is wrapped in this function due to\n    the constraints imposed by Keras on objective functions\n    \"\"\"\n    def inner_cca_objective(y_true, y_pred):\n        \"\"\"\n        It is the loss function of CCA as introduced in the original paper. There can be other formulations.\n        It is implemented by Theano tensor operations, and does not work on Tensorflow backend\n        y_true is just ignored\n        \"\"\"\n\n        r1 = 1e-4\n        r2 = 1e-4\n        eps = 1e-12\n        o1 = o2 = y_pred.shape[1]//2\n\n        # unpack (separate) the output of networks for view 1 and view 2\n        H1 = y_pred[:, 0:o1].T\n        H2 = y_pred[:, o1:o1+o2].T\n\n        m = H1.shape[1]\n\n        H1bar = H1 - (1.0 / m) * T.dot(H1, T.ones([m, m]))\n        H2bar = H2 - (1.0 / m) * T.dot(H2, T.ones([m, m]))\n\n        SigmaHat12 = (1.0 / (m - 1)) * T.dot(H1bar, H2bar.T)\n        SigmaHat11 = (1.0 / (m - 1)) * T.dot(H1bar, H1bar.T) + r1 * T.eye(o1)\n        SigmaHat22 = (1.0 / (m - 1)) * T.dot(H2bar, H2bar.T) + r2 * T.eye(o2)\n\n        # Calculating the root inverse of covariance matrices by using eigen decomposition\n        [D1, V1] = T.nlinalg.eigh(SigmaHat11)\n        [D2, V2] = T.nlinalg.eigh(SigmaHat22)\n\n        # Added to increase stability\n        posInd1 = T.gt(D1, eps).nonzero()[0]\n        D1 = D1[posInd1]\n        V1 = V1[:, posInd1]\n        posInd2 = T.gt(D2, eps).nonzero()[0]\n        D2 = D2[posInd2]\n        V2 = V2[:, posInd2]\n\n        SigmaHat11RootInv = T.dot(T.dot(V1, T.nlinalg.diag(D1 ** -0.5)), V1.T)\n        SigmaHat22RootInv = T.dot(T.dot(V2, T.nlinalg.diag(D2 ** -0.5)), V2.T)\n\n        Tval = T.dot(T.dot(SigmaHat11RootInv, SigmaHat12), SigmaHat22RootInv)\n\n        if use_all_singular_values:\n            # all singular values are used to calculate the correlation\n            corr = T.sqrt(T.nlinalg.trace(T.dot(Tval.T, Tval)))\n        else:\n            # just the top outdim_size singular values are used\n            [U, V] = T.nlinalg.eigh(T.dot(Tval.T, Tval))\n            U = U[T.gt(U, eps).nonzero()[0]]\n            U = U.sort()\n            corr = T.sum(T.sqrt(U[0:outdim_size]))\n\n        return -corr\n\n    return inner_cca_objective\n\n"
  },
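  {
    "path": "cca_loss_numpy_sketch.py",
    "content": "\"\"\"\nA minimal NumPy sketch of the CCA objective (not part of the original code):\nit mirrors inner_cca_objective from objectives.py step by step, so the Theano\nloss can be sanity-checked on small arrays. The file and function names are\nillustrative only.\n\"\"\"\nimport numpy as np\n\n\ndef cca_loss_numpy(y_pred, outdim_size, use_all_singular_values, r1=1e-4, r2=1e-4, eps=1e-12):\n    \"\"\"y_pred holds the concatenated outputs of the two networks; rows are samples\"\"\"\n    o = y_pred.shape[1] // 2\n    H1 = y_pred[:, :o].T\n    H2 = y_pred[:, o:].T\n    m = H1.shape[1]\n\n    # center each view\n    H1bar = H1 - H1.mean(axis=1, keepdims=True)\n    H2bar = H2 - H2.mean(axis=1, keepdims=True)\n\n    # regularized covariance estimates\n    SigmaHat12 = (1.0 / (m - 1)) * H1bar.dot(H2bar.T)\n    SigmaHat11 = (1.0 / (m - 1)) * H1bar.dot(H1bar.T) + r1 * np.eye(o)\n    SigmaHat22 = (1.0 / (m - 1)) * H2bar.dot(H2bar.T) + r2 * np.eye(o)\n\n    # root inverses of the covariance matrices, keeping only positive eigenvalues\n    D1, V1 = np.linalg.eigh(SigmaHat11)\n    D2, V2 = np.linalg.eigh(SigmaHat22)\n    V1 = V1[:, D1 > eps]\n    D1 = D1[D1 > eps]\n    V2 = V2[:, D2 > eps]\n    D2 = D2[D2 > eps]\n    SigmaHat11RootInv = V1.dot(np.diag(D1 ** -0.5)).dot(V1.T)\n    SigmaHat22RootInv = V2.dot(np.diag(D2 ** -0.5)).dot(V2.T)\n\n    Tval = SigmaHat11RootInv.dot(SigmaHat12).dot(SigmaHat22RootInv)\n\n    if use_all_singular_values:\n        corr = np.sqrt(np.trace(Tval.T.dot(Tval)))\n    else:\n        # the eigenvalues of Tval.T Tval are the squared singular values of Tval, in ascending order\n        w = np.linalg.eigvalsh(Tval.T.dot(Tval))\n        w = w[w > eps]\n        corr = np.sum(np.sqrt(w[-outdim_size:]))\n    return -corr\n\n\nif __name__ == '__main__':\n    np.random.seed(0)\n    y_pred = np.random.randn(200, 20)  # 200 samples, two 10-dim views\n    print(cca_loss_numpy(y_pred, outdim_size=10, use_all_singular_values=False))\n"
  },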
  {
    "path": "utils.py",
    "content": "import gzip\nfrom sklearn import svm\nfrom sklearn.metrics import accuracy_score\nimport numpy as np\nimport theano\nfrom keras.utils.data_utils import get_file\n\n\ndef load_data(data_file, url):\n    \"\"\"loads the data from the gzip pickled files, and converts to numpy arrays\"\"\"\n    print('loading data ...')\n    path = get_file(data_file, origin=url)\n    f = gzip.open(path, 'rb')\n    train_set, valid_set, test_set = load_pickle(f)\n    f.close()\n\n    train_set_x, train_set_y = make_numpy_array(train_set)\n    valid_set_x, valid_set_y = make_numpy_array(valid_set)\n    test_set_x, test_set_y = make_numpy_array(test_set)\n\n    return [(train_set_x, train_set_y), (valid_set_x, valid_set_y), (test_set_x, test_set_y)]\n\n\ndef make_numpy_array(data_xy):\n    \"\"\"converts the input to numpy arrays\"\"\"\n    data_x, data_y = data_xy\n    data_x = np.asarray(data_x, dtype=theano.config.floatX)\n    data_y = np.asarray(data_y, dtype='int32')\n    return data_x, data_y\n\n\ndef svm_classify(data, C):\n    \"\"\"\n    trains a linear SVM on the data\n    input C specifies the penalty factor of SVM\n    \"\"\"\n    train_data, _, train_label = data[0]\n    valid_data, _, valid_label = data[1]\n    test_data, _, test_label = data[2]\n\n    print('training SVM...')\n    clf = svm.LinearSVC(C=C, dual=False)\n    clf.fit(train_data, train_label.ravel())\n\n    p = clf.predict(test_data)\n    test_acc = accuracy_score(test_label, p)\n    p = clf.predict(valid_data)\n    valid_acc = accuracy_score(valid_label, p)\n\n    return [test_acc, valid_acc]\n\n\ndef load_pickle(f):\n    \"\"\"\n    loads and returns the content of a pickled file\n    it handles the inconsistencies between the pickle packages available in Python 2 and 3\n    \"\"\"\n    try:\n        import cPickle as thepickle\n    except ImportError:\n        import _pickle as thepickle\n\n    try:\n        ret = thepickle.load(f, encoding='latin1')\n    except TypeError:\n        ret = thepickle.load(f)\n\n    return ret\n\n"
  }
]