Repository: VahidooX/DeepCCA Branch: master Commit: b0ef3837a673 Files: 7 Total size: 17.4 KB Directory structure: gitextract_qp_aofnj/ ├── DeepCCA.py ├── LICENSE ├── README.md ├── linear_cca.py ├── models.py ├── objectives.py └── utils.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: DeepCCA.py ================================================ try: import cPickle as thepickle except ImportError: import _pickle as thepickle import gzip import numpy as np from keras.callbacks import ModelCheckpoint from utils import load_data, svm_classify from linear_cca import linear_cca from models import create_model def train_model(model, data1, data2, epoch_num, batch_size): """ trains the model # Arguments data1 and data2: the train, validation, and test data for view 1 and view 2 respectively. data should be packed like ((X for train, Y for train), (X for validation, Y for validation), (X for test, Y for test)) epoch_num: number of epochs to train the model batch_size: the size of batches # Returns the trained model """ # Unpacking the data train_set_x1, train_set_y1 = data1[0] valid_set_x1, valid_set_y1 = data1[1] test_set_x1, test_set_y1 = data1[2] train_set_x2, train_set_y2 = data2[0] valid_set_x2, valid_set_y2 = data2[1] test_set_x2, test_set_y2 = data2[2] # best weights are saved in "temp_weights.hdf5" during training # it is done to return the best model based on the validation loss checkpointer = ModelCheckpoint(filepath="temp_weights.h5", verbose=1, save_best_only=True, save_weights_only=True) # used dummy Y because labels are not used in the loss function model.fit([train_set_x1, train_set_x2], np.zeros(len(train_set_x1)), batch_size=batch_size, epochs=epoch_num, shuffle=True, validation_data=([valid_set_x1, valid_set_x2], np.zeros(len(valid_set_x1))), callbacks=[checkpointer]) model.load_weights("temp_weights.h5") results = 
model.evaluate([test_set_x1, test_set_x2], np.zeros(len(test_set_x1)), batch_size=batch_size, verbose=1) print('loss on test data: ', results) results = model.evaluate([valid_set_x1, valid_set_x2], np.zeros(len(valid_set_x1)), batch_size=batch_size, verbose=1) print('loss on validation data: ', results) return model def test_model(model, data1, data2, outdim_size, apply_linear_cca): """produce the new features by using the trained model # Arguments model: the trained model data1 and data2: the train, validation, and test data for view 1 and view 2 respectively. Data should be packed like ((X for train, Y for train), (X for validation, Y for validation), (X for test, Y for test)) outdim_size: dimension of new features apply_linear_cca: if to apply linear CCA on the new features # Returns new features packed like ((new X for train - view 1, new X for train - view 2, Y for train), (new X for validation - view 1, new X for validation - view 2, Y for validation), (new X for test - view 1, new X for test - view 2, Y for test)) """ # producing the new features new_data = [] for k in range(3): pred_out = model.predict([data1[k][0], data2[k][0]]) r = int(pred_out.shape[1] / 2) new_data.append([pred_out[:, :r], pred_out[:, r:], data1[k][1]]) # based on the DCCA paper, a linear CCA should be applied on the output of the networks because # the loss function actually estimates the correlation when a linear CCA is applied to the output of the networks # however it does not improve the performance significantly if apply_linear_cca: w = [None, None] m = [None, None] print("Linear CCA started!") w[0], w[1], m[0], m[1] = linear_cca(new_data[0][0], new_data[0][1], outdim_size) print("Linear CCA ended!") # Something done in the original MATLAB implementation of DCCA, do not know exactly why;) # it did not affect the performance significantly on the noisy MNIST dataset #s = np.sign(w[0][0,:]) #s = s.reshape([1, -1]).repeat(w[0].shape[0], axis=0) #w[0] = w[0] * s #w[1] = w[1] * s ### 
for k in range(3): data_num = len(new_data[k][0]) for v in range(2): new_data[k][v] -= m[v].reshape([1, -1]).repeat(data_num, axis=0) new_data[k][v] = np.dot(new_data[k][v], w[v]) return new_data if __name__ == '__main__': ############ # Parameters Section # the path to save the final learned features save_to = './new_features.gz' # the size of the new space learned by the model (number of the new features) outdim_size = 10 # size of the input for view 1 and view 2 input_shape1 = 784 input_shape2 = 784 # number of layers with nodes in each one layer_sizes1 = [1024, 1024, 1024, outdim_size] layer_sizes2 = [1024, 1024, 1024, outdim_size] # the parameters for training the network learning_rate = 1e-3 epoch_num = 100 batch_size = 800 # the regularization parameter of the network # seems necessary to avoid the gradient exploding especially when non-saturating activations are used reg_par = 1e-5 # specifies if all the singular values should get used to calculate the correlation or just the top outdim_size ones # if one option does not work for a network or dataset, try the other one use_all_singular_values = False # if a linear CCA should get applied on the learned features extracted from the networks # it does not affect the performance on noisy MNIST significantly apply_linear_cca = True # end of parameters section ############ # Each view is stored in a gzip file separately. They will get downloaded the first time the code gets executed. 
# Datasets get stored under the datasets folder of user's Keras folder # normally under [Home Folder]/.keras/datasets/ data1 = load_data('noisymnist_view1.gz', 'https://www2.cs.uic.edu/~vnoroozi/noisy-mnist/noisymnist_view1.gz') data2 = load_data('noisymnist_view2.gz', 'https://www2.cs.uic.edu/~vnoroozi/noisy-mnist/noisymnist_view2.gz') # Building, training, and producing the new features by DCCA model = create_model(layer_sizes1, layer_sizes2, input_shape1, input_shape2, learning_rate, reg_par, outdim_size, use_all_singular_values) model.summary() model = train_model(model, data1, data2, epoch_num, batch_size) new_data = test_model(model, data1, data2, outdim_size, apply_linear_cca) # Training and testing of SVM with linear kernel on the view 1 with new features [test_acc, valid_acc] = svm_classify(new_data, C=0.01) print("Accuracy on view 1 (validation data) is:", valid_acc * 100.0) print("Accuracy on view 1 (test data) is:", test_acc*100.0) # Saving new features in a gzip pickled file specified by save_to print('saving new features ...') f1 = gzip.open(save_to, 'wb') thepickle.dump(new_data, f1) f1.close() ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2016 Vahid Noroozi Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

================================================
FILE: README.md
================================================
# DCCA: Deep Canonical Correlation Analysis

This is an implementation of Deep Canonical Correlation Analysis (DCCA or
Deep CCA) in Python. It needs the Theano and Keras libraries to be installed.

DCCA is a non-linear version of CCA which uses neural networks as the mapping
functions instead of linear transformations. DCCA is originally proposed in
the following paper:

Galen Andrew, Raman Arora, Jeff Bilmes, Karen Livescu, "[Deep Canonical Correlation Analysis.](http://www.jmlr.org/proceedings/papers/v28/andrew13.pdf)", ICML, 2013.

It uses the Keras library with the Theano backend, and does not work on the
Tensorflow backend, because the loss function of the network is written with
Theano. The base modeling network can easily get substituted with a more
efficient and powerful network like a CNN.

Most of the configuration and parameters are set based on the following paper:

Weiran Wang, Raman Arora, Karen Livescu, and Jeff Bilmes. "[On Deep Multi-View Representation Learning.](http://proceedings.mlr.press/v37/wangb15.pdf)", ICML, 2015.

### Dataset

The model is evaluated on a noisy version of the MNIST dataset. I built the
dataset exactly like the way it is introduced in the paper. The
train/validation/test split is the original split of MNIST.

The dataset was large and could not get uploaded on GitHub, so it is uploaded
on another server.
The first time that the code gets executed, the dataset gets downloaded automatically by the code. It will get saved under the datasets folder of user's Keras folder (normally under [Home Folder]/.keras/datasets/). ### Differences with the original paper The following are the differences between my implementation and the original paper (they are small): * I used RMSProp (an adaptive version of gradient descent) instead of GD with momentum. It was so much faster in converging. * Instead of a non-saturating version of sigmoid, I just used the standard sigmoid as the activation functions. Standard sigmoid is used in the MATLAB implementation too. It should not affect the performance significantly. However, if it is needed, it can get substituted by another non-saturating activation function like ReLU. * Pre-training is not done in this implementation. However, it is not clear how much it can be useful. ### Other Implementations The following are the other implementations of DCCA in MATLAB and C++ from which I got help for the implementation. These codes are written by the authors of the original paper: * [C++ implementation](https://homes.cs.washington.edu/~galen/files/dcca.tgz) from Galen Andrew's website (https://homes.cs.washington.edu/~galen/) * [MATLAB implementation](http://ttic.uchicago.edu/~wwang5/papers/dccae.tgz) from Weiran Wang's website (http://ttic.uchicago.edu/~wwang5/dccae.html) ================================================ FILE: linear_cca.py ================================================ import numpy def linear_cca(H1, H2, outdim_size): """ An implementation of linear CCA # Arguments: H1 and H2: the matrices containing the data for view 1 and view 2. Each row is a sample. 
outdim_size: specifies the number of new features # Returns A and B: the linear transformation matrices mean1 and mean2: the means of data for both views """ r1 = 1e-4 r2 = 1e-4 m = H1.shape[0] o = H1.shape[1] mean1 = numpy.mean(H1, axis=0) mean2 = numpy.mean(H2, axis=0) H1bar = H1 - numpy.tile(mean1, (m, 1)) H2bar = H2 - numpy.tile(mean2, (m, 1)) SigmaHat12 = (1.0 / (m - 1)) * numpy.dot(H1bar.T, H2bar) SigmaHat11 = (1.0 / (m - 1)) * numpy.dot(H1bar.T, H1bar) + r1 * numpy.identity(o) SigmaHat22 = (1.0 / (m - 1)) * numpy.dot(H2bar.T, H2bar) + r2 * numpy.identity(o) [D1, V1] = numpy.linalg.eigh(SigmaHat11) [D2, V2] = numpy.linalg.eigh(SigmaHat22) SigmaHat11RootInv = numpy.dot(numpy.dot(V1, numpy.diag(D1 ** -0.5)), V1.T) SigmaHat22RootInv = numpy.dot(numpy.dot(V2, numpy.diag(D2 ** -0.5)), V2.T) Tval = numpy.dot(numpy.dot(SigmaHat11RootInv, SigmaHat12), SigmaHat22RootInv) [U, D, V] = numpy.linalg.svd(Tval) V = V.T A = numpy.dot(SigmaHat11RootInv, U[:, 0:outdim_size]) B = numpy.dot(SigmaHat22RootInv, V[:, 0:outdim_size]) D = D[0:outdim_size] return A, B, mean1, mean2 ================================================ FILE: models.py ================================================ from keras.layers import Dense, Merge from keras.models import Sequential from keras.optimizers import RMSprop from keras.regularizers import l2 from objectives import cca_loss def create_model(layer_sizes1, layer_sizes2, input_size1, input_size2, learning_rate, reg_par, outdim_size, use_all_singular_values): """ builds the whole model the structure of each sub-network is defined in build_mlp_net, and it can easily get substituted with a more efficient and powerful network like CNN """ view1_model = build_mlp_net(layer_sizes1, input_size1, reg_par) view2_model = build_mlp_net(layer_sizes2, input_size2, reg_par) model = Sequential() model.add(Merge([view1_model, view2_model], mode='concat')) model_optimizer = RMSprop(lr=learning_rate) model.compile(loss=cca_loss(outdim_size, 
use_all_singular_values), optimizer=model_optimizer) return model def build_mlp_net(layer_sizes, input_size, reg_par): model = Sequential() for l_id, ls in enumerate(layer_sizes): if l_id == 0: input_dim = input_size else: input_dim = [] if l_id == len(layer_sizes)-1: activation = 'linear' else: activation = 'sigmoid' model.add(Dense(ls, input_dim=input_dim, activation=activation, kernel_regularizer=l2(reg_par))) return model ================================================ FILE: objectives.py ================================================ import theano.tensor as T def cca_loss(outdim_size, use_all_singular_values): """ The main loss function (inner_cca_objective) is wrapped in this function due to the constraints imposed by Keras on objective functions """ def inner_cca_objective(y_true, y_pred): """ It is the loss function of CCA as introduced in the original paper. There can be other formulations. It is implemented by Theano tensor operations, and does not work on Tensorflow backend y_true is just ignored """ r1 = 1e-4 r2 = 1e-4 eps = 1e-12 o1 = o2 = y_pred.shape[1]//2 # unpack (separate) the output of networks for view 1 and view 2 H1 = y_pred[:, 0:o1].T H2 = y_pred[:, o1:o1+o2].T m = H1.shape[1] H1bar = H1 - (1.0 / m) * T.dot(H1, T.ones([m, m])) H2bar = H2 - (1.0 / m) * T.dot(H2, T.ones([m, m])) SigmaHat12 = (1.0 / (m - 1)) * T.dot(H1bar, H2bar.T) SigmaHat11 = (1.0 / (m - 1)) * T.dot(H1bar, H1bar.T) + r1 * T.eye(o1) SigmaHat22 = (1.0 / (m - 1)) * T.dot(H2bar, H2bar.T) + r2 * T.eye(o2) # Calculating the root inverse of covariance matrices by using eigen decomposition [D1, V1] = T.nlinalg.eigh(SigmaHat11) [D2, V2] = T.nlinalg.eigh(SigmaHat22) # Added to increase stability posInd1 = T.gt(D1, eps).nonzero()[0] D1 = D1[posInd1] V1 = V1[:, posInd1] posInd2 = T.gt(D2, eps).nonzero()[0] D2 = D2[posInd2] V2 = V2[:, posInd2] SigmaHat11RootInv = T.dot(T.dot(V1, T.nlinalg.diag(D1 ** -0.5)), V1.T) SigmaHat22RootInv = T.dot(T.dot(V2, T.nlinalg.diag(D2 ** -0.5)), V2.T) 
Tval = T.dot(T.dot(SigmaHat11RootInv, SigmaHat12), SigmaHat22RootInv) if use_all_singular_values: # all singular values are used to calculate the correlation corr = T.sqrt(T.nlinalg.trace(T.dot(Tval.T, Tval))) else: # just the top outdim_size singular values are used [U, V] = T.nlinalg.eigh(T.dot(Tval.T, Tval)) U = U[T.gt(U, eps).nonzero()[0]] U = U.sort() corr = T.sum(T.sqrt(U[0:outdim_size])) return -corr return inner_cca_objective ================================================ FILE: utils.py ================================================ import gzip from sklearn import svm from sklearn.metrics import accuracy_score import numpy as np import theano from keras.utils.data_utils import get_file def load_data(data_file, url): """loads the data from the gzip pickled files, and converts to numpy arrays""" print('loading data ...') path = get_file(data_file, origin=url) f = gzip.open(path, 'rb') train_set, valid_set, test_set = load_pickle(f) f.close() train_set_x, train_set_y = make_numpy_array(train_set) valid_set_x, valid_set_y = make_numpy_array(valid_set) test_set_x, test_set_y = make_numpy_array(test_set) return [(train_set_x, train_set_y), (valid_set_x, valid_set_y), (test_set_x, test_set_y)] def make_numpy_array(data_xy): """converts the input to numpy arrays""" data_x, data_y = data_xy data_x = np.asarray(data_x, dtype=theano.config.floatX) data_y = np.asarray(data_y, dtype='int32') return data_x, data_y def svm_classify(data, C): """ trains a linear SVM on the data input C specifies the penalty factor of SVM """ train_data, _, train_label = data[0] valid_data, _, valid_label = data[1] test_data, _, test_label = data[2] print('training SVM...') clf = svm.LinearSVC(C=C, dual=False) clf.fit(train_data, train_label.ravel()) p = clf.predict(test_data) test_acc = accuracy_score(test_label, p) p = clf.predict(valid_data) valid_acc = accuracy_score(valid_label, p) return [test_acc, valid_acc] def load_pickle(f): """ loads and returns the content of a pickled file 
it handles the inconsistencies between the pickle packages available in Python 2 and 3 """ try: import cPickle as thepickle except ImportError: import _pickle as thepickle try: ret = thepickle.load(f, encoding='latin1') except TypeError: ret = thepickle.load(f) return ret