Repository: fephsun/neuralnetmusic Branch: master Commit: 1b559a25bcfb Files: 101 Total size: 56.2 MB Directory structure: gitextract_7ps85ir2/ ├── .gitignore ├── DBN.py ├── DeepLearningTutorials/ │ ├── .gitignore │ ├── .hgignore │ ├── .travis.yml │ ├── README.rst │ ├── __init__.py │ ├── code/ │ │ ├── DBN.py │ │ ├── SdA.py │ │ ├── __init__.py │ │ ├── cA.py │ │ ├── convolutional_mlp.py │ │ ├── dA.py │ │ ├── hmc/ │ │ │ ├── __init__.py │ │ │ ├── hmc.py │ │ │ └── test_hmc.py │ │ ├── imdb.py │ │ ├── imdb_preprocess.py │ │ ├── logistic_cg.py │ │ ├── logistic_sgd.py │ │ ├── lstm.py │ │ ├── mlp.py │ │ ├── rbm.py │ │ ├── rnnrbm.py │ │ ├── rnnslu.py │ │ ├── test.py │ │ └── utils.py │ ├── data/ │ │ ├── download.sh │ │ └── training_colorpatches_16x16_demo.mat │ ├── doc/ │ │ ├── .templates/ │ │ │ └── layout.html │ │ ├── DBN.txt │ │ ├── LICENSE.txt │ │ ├── Makefile │ │ ├── SdA.txt │ │ ├── conf.py │ │ ├── contents.txt │ │ ├── dA.txt │ │ ├── deep.txt │ │ ├── gettingstarted.txt │ │ ├── hmc.txt │ │ ├── index.txt │ │ ├── lenet.txt │ │ ├── logreg.txt │ │ ├── lstm.txt │ │ ├── mlp.txt │ │ ├── rbm.txt │ │ ├── references.txt │ │ ├── rnnrbm.txt │ │ ├── rnnslu.txt │ │ ├── scripts/ │ │ │ └── docgen.py │ │ └── utilities.txt │ ├── issues_closed/ │ │ └── 2_RBM_cost_fn.txt │ ├── issues_open/ │ │ ├── 1_SdA_performance.txt │ │ ├── 3_RBM_scan_GPU.txt │ │ ├── 4_RBM_scan.txt │ │ ├── 5_results.txt │ │ └── 6_benchmarking_pybrain.txt │ └── misc/ │ └── do_nightly_build ├── README.md ├── joplin/ │ ├── alabama.xml │ ├── cleopha.xml │ ├── entertainer.xml │ ├── maple_leaf.xml │ ├── searchlight.xml │ ├── strenous.xml │ ├── syncopations.xml │ ├── winners.xml │ └── winners_2.xml ├── joplin-model.pickle ├── joplin_data.pickle ├── midi/ │ ├── DataTypeConverters.py │ ├── EventDispatcher.py │ ├── Icon_ │ ├── MidiFileParser.py │ ├── MidiInFile.py │ ├── MidiInStream.py │ ├── MidiOutFile.py │ ├── MidiOutStream.py │ ├── MidiToText.py │ ├── RawInstreamFile.py │ ├── RawOutstreamFile.py │ ├── __init__.py │ ├── changes.txt 
│ ├── constants.py │ ├── example_mimimal_type0.py │ ├── example_print_channel_0.py │ ├── example_print_events.py │ ├── example_print_file.py │ ├── example_transpose_octave.py │ ├── files.txt │ ├── hallelujah.mid │ ├── license.txt │ ├── readme │ ├── readme.txt │ ├── utils.py │ └── version.txt ├── myparser.py └── neural-plugin/ ├── DoubleTime.js ├── neural-plugin.js ├── neural-plugin.ui └── output-window.ui ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ *.pyc ================================================ FILE: DBN.py ================================================ """ """ import cPickle import os import sys import time import os.path as path import copy import numpy import theano import theano.tensor as T from theano.tensor.shared_randomstreams import RandomStreams from DeepLearningTutorials.code.mlp import HiddenLayer from DeepLearningTutorials.code.rbm import RBM from PIL import Image import myparser from midi.utils import midiwrite # compute_test_value is 'off' by default, meaning this feature is inactive theano.config.compute_test_value = 'off' # Use 'warn' to activate this feature # For switching between 32 and 64 bit systems, because Theano is a little silly # like that. NUMPY_DTYPE = numpy.float64 # start-snippet-1 class AutoencodingDBN(object): """ An autoencoding Deep Belief Network, based on the classifying DBN in the Theano tutorial. (Most of the code is copied over.) """ def __init__(self, numpy_rng, theano_rng=None, n_ins=784, hidden_layers_sizes=[1000, 1000, 1000]): """This class is made to support a variable number of layers. 
:type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_ins: int :param n_ins: dimension of the input to the DBN :type hidden_layers_sizes: list of ints :param hidden_layers_sizes: intermediate layers size, must contain at least one value :type n_outs: int :param n_outs: dimension of the output of the network """ self.sigmoid_layers = [] self.rbm_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) self.layer_sizes = hidden_layers_sizes assert self.n_layers > 0 if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) # allocate symbolic variables for the data self.x = T.matrix('x') # the data is presented as rasterized images self.x_mask = T.matrix('x_mask') # For partial information. # end-snippet-1 # The DBN is an MLP, for which all weights of intermediate # layers are shared with a different RBM. We will first # construct the DBN as a deep multilayer perceptron, and when # constructing each sigmoidal layer we also construct an RBM # that shares weights with that layer. During pretraining we # will train these RBMs (which will lead to chainging the # weights of the MLP as well) During finetuning we will finish # training the DBN by doing stochastic gradient descent on the # MLP. 
for i in xrange(self.n_layers): # construct the sigmoidal layer # the size of the input is either the number of hidden # units of the layer below or the input size if we are on # the first layer if i == 0: input_size = n_ins else: input_size = hidden_layers_sizes[i - 1] # the input to this layer is either the activation of the # hidden layer below or the input of the DBN if you are on # the first layer if i == 0: layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], activation=T.nnet.sigmoid) # add the layer to our list of layers self.sigmoid_layers.append(sigmoid_layer) # Construct an RBM that shared weights with this layer rbm_layer = RBM(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, hbias=sigmoid_layer.b) self.rbm_layers.append(rbm_layer) self.params.extend(rbm_layer.params) # And build the upside-down network. This shares parameters with the # forward network. Except the weights are transposed and stuff. # The "isolated" layers let you run only the upside-down part of the # network, for generation. The non-isolated layers are connected to # the forward, compressing part of the network, and are used for # training. 
def dump_params(self, outLoc):
    """
    Pickle the network's weights to `outLoc` as plain numpy arrays.

    Storing numpy arrays (rather than the Theano shared variables
    themselves) keeps the params portable between GPU machines and CPU
    machines.  To load the params, call load_from_dump, which re-makes
    the DBN.

    The pickled object is a dict keyed by (layer_index, kind):
        (layer, 0) -- W of the forward sigmoid layer
        (layer, 1) -- b of the forward sigmoid layer
        (layer, 2) -- b of the matching reverse layer

    :type outLoc: str
    :param outLoc: path of the file to write; it is opened in binary
                   write mode, so an existing file is overwritten
    """
    dump = {}
    for layer in range(self.n_layers):
        forward = self.sigmoid_layers[layer]
        backward = self.reverse_layers[layer]
        # numpy.array() copies the value, so the dump does not alias the
        # live parameters.
        dump[(layer, 0)] = numpy.array(forward.W.get_value())
        dump[(layer, 1)] = numpy.array(forward.b.get_value())
        dump[(layer, 2)] = numpy.array(backward.b.get_value())

    # A context manager flushes and closes the file even if pickling
    # fails, instead of leaving the handle to the garbage collector.
    with open(outLoc, 'wb') as out_file:
        cPickle.dump(dump, out_file, protocol=cPickle.HIGHEST_PROTOCOL)
def build_finetune_functions(self, train_set_x, batch_size, learning_rate):
    '''Build the functions used to fine-tune the net as an autoencoder.

    Fine-tuning performs plain gradient descent on `self.finetune_cost`
    with respect to `self.params`.

    :type train_set_x: theano shared variable
    :param train_set_x: shared variable holding all training datapoints;
                        minibatches are slices along its first axis

    :type batch_size: int
    :param batch_size: size of a minibatch

    :type learning_rate: float
    :param learning_rate: learning rate used during finetune stage

    :return: a pair `(train_fn, test_score)`.  `train_fn(index)` applies
             one gradient step on minibatch `index` and returns the cost
             on that minibatch.  `test_score()` returns the list of
             costs over every full minibatch of `train_set_x`; note that
             it scores the training data itself, as no separate
             validation or test set is passed in.
    '''
    index = T.lscalar('index')  # index to a [mini]batch

    # Floor division keeps this an int whatever the division semantics;
    # a trailing partial minibatch is never scored.
    n_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size

    # Both compiled functions read the same minibatch slice.
    batch = train_set_x[index * batch_size: (index + 1) * batch_size]

    # compute the gradients with respect to the model parameters
    gparams = T.grad(self.finetune_cost, self.params)

    # one gradient-descent update per parameter
    updates = [(param, param - gparam * learning_rate)
               for param, gparam in zip(self.params, gparams)]

    train_fn = theano.function(
        inputs=[index],
        outputs=self.finetune_cost,
        updates=updates,
        givens={self.x: batch}
    )

    test_score_i = theano.function(
        [index],
        self.finetune_cost,
        givens={self.x: batch}
    )

    # Create a function that scans the entire data set
    def test_score():
        return [test_score_i(i) for i in xrange(n_batches)]

    return train_fn, test_score
""" grad = T.grad(self.l1_cost, self.isolated_reverse_input) # compute list of fine-tuning updates updates = (self.isolated_reverse_input, self.isolated_reverse_input - grad * learning_rate) train_fn = theano.function( inputs=[], outputs=self.l1_cost, updates=[updates], givens={ self.x: to_label, self.x_mask: x_mask, } ) return train_fn def train_dbn(self, data_file, finetune_lr=0.01, pretraining_epochs=100, pretrain_lr=0.05, k=1, training_epochs=1000, batch_size=10): raw_x = cPickle.load(open(data_file, 'rb')).astype(dtype=NUMPY_DTYPE) train_set_x = theano.shared(raw_x) # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size print n_train_batches # start-snippet-2 ######################### # PRETRAINING THE MODEL # ######################### print '... getting the pretraining functions' pretraining_fns = self.pretraining_functions(train_set_x=train_set_x, batch_size=batch_size, k=k) print '... pre-training the model' start_time = time.clock() ## Pre-train layer-wise for i in xrange(self.n_layers - 1): # go through pretraining epochs for epoch in xrange(pretraining_epochs): # go through the training set c = [] for batch_index in xrange(n_train_batches): c.append(pretraining_fns[i](index=batch_index, lr=pretrain_lr)) print 'Pre-training layer %i, epoch %d, cost ' % (i, epoch), print numpy.mean(c) end_time = time.clock() # end-snippet-2 print >> sys.stderr, ('The pretraining code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) # If you'd like to try out different parameters for the fine-tuner only, # you can cache the initial model state, so you don't have to pre-train # every time. cPickle.dump(self, open('initial-model.pickle', 'wb'), protocol=cPickle.HIGHEST_PROTOCOL) ######################## # FINETUNING THE MODEL # ######################## # get the training, validation and testing function for the model print '... 
getting the finetuning functions' use_autoencoder = False if use_autoencoder: train_fn, test_model = self.build_finetune_functions( train_set_x=train_set_x, batch_size=batch_size, learning_rate=finetune_lr ) else: raw_labels = numpy.random.randint(2, size=[raw_x.shape[0], self.layer_sizes[-1]])\ .astype(dtype=numpy.float64) labels = theano.shared(raw_labels) train_fn, test_model = self.build_generative_finetune_fns( train_set_outputs=train_set_x, train_set_labels=labels, batch_size=batch_size, learning_rate=finetune_lr ) print '... finetuning the model' # early-stopping parameters patience = 4 * n_train_batches # look as this many examples regardless patience_increase = 2. # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatches before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf test_score = 0. start_time = time.clock() done_looping = False epoch = 0 while (epoch < training_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): minibatch_avg_cost = train_fn(minibatch_index) iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: validation_losses = test_model() this_validation_loss = numpy.mean(validation_losses) print( 'epoch %i, minibatch %i/%i, validation error %f %%' % ( epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100. 
def sample(self, top_level=None, rootLoc='./', save=True, threshold=0.5,
           filename='test.midi'):
    """
    Run the generative half of the net and return one piano roll.

    top_level is a 10 x [size of top layer] matrix whose rows hold values
    for the top layer; when it is None a random 0/1 matrix of that shape
    is drawn.  Only the first generated row is kept, but data can only be
    processed in increments of batch_size, hence the 10 rows.

    A picture of the piano roll is always written to 'test.png' inside
    rootLoc.  If threshold is not None the roll is then binarised in
    place (values above threshold become 1, the rest 0).  If save is
    true the roll is also written as MIDI to `filename` inside rootLoc.

    Returns the 88 x 64 piano roll taken from the first row.
    """
    if top_level is None:
        # No activations supplied: draw a random binary pattern per row.
        n_top_units = self.layer_sizes[-1]
        random_bits = numpy.random.randint(2, size=[10, n_top_units])
        top_level = random_bits.astype(dtype=NUMPY_DTYPE)

    generated = self.generate(top_level)
    batch = generated.reshape([10, 88*64])
    firstIm = batch[0, :].reshape([88, 64])

    # Makes a little picture of the piano roll (before thresholding).
    png_path = path.join(rootLoc, 'test.png')
    Image.fromarray((firstIm*255).astype('uint8')).save(png_path)

    if threshold is not None:
        firstIm[firstIm > threshold] = 1
        firstIm[firstIm <= threshold] = 0

    if save:
        midi_path = path.join(rootLoc, filename)
        midiwrite(midi_path, firstIm.T, r=(12, 109), dt=64)

    return firstIm
def melody_blocker(snippet):
    """
    Makes a mask where anything above the top line of the snippet is 1.
    Also enforces empty space a major 2nd above and below the melody.
    (This means the optimizer will consider any note above the top line of
    the melody, or too close to the melody, wrong.)

    :param snippet: 2-D array indexed as [pitch, time step]; any nonzero
                    entry counts as a sounding note
    :return: a new array of the same shape (the input is not modified).
             In every time step that has at least one note, the rows from
             the highest occupied row upwards, and the rows within two of
             each occupied row, are set to 1.  Time steps without notes
             are left all zero.
    """
    envelope = numpy.copy(snippet)
    _, length = snippet.shape
    for i in range(length):
        occupied = numpy.flatnonzero(snippet[:, i])
        if occupied.size == 0:
            continue
        # Block the top note and everything above it.
        envelope[occupied.max():, i] = 1
        for pitch in occupied:
            # Clamp the lower bound at 0: a negative start would be read
            # as "count from the end" and turn the slice for the two
            # lowest rows into an empty one.
            envelope[max(pitch - 2, 0):pitch + 3, i] = 1
    return envelope
if __name__ == '__main__':
    # Usage: python DBN.py {train|sample|harmonize}
    #
    # NOTE: this stays at module level (not wrapped in a main() function)
    # on purpose.  label_from_file looks up the module-level name `dbn`,
    # so the model must be bound to a global of exactly that name.
    command = sys.argv[1] if len(sys.argv) > 1 else None

    # Validate the command before doing anything expensive, so that a
    # missing or misspelt argument fails fast instead of raising
    # IndexError or building the whole network first.
    if command not in ('train', 'sample', 'harmonize'):
        print("invalid command")
        sys.exit(1)

    if command == 'train':
        dbn = AutoencodingDBN(numpy_rng=numpy.random.RandomState(),
                              n_ins=88*64,
                              hidden_layers_sizes=[1024, 256, 64])
        # NOTE(review): the repository listing shows 'joplin_data.pickle'
        # (underscore); confirm which file name is intended here.
        dbn.train_dbn('./joplin-data.pickle')
    else:
        dbn = load_from_dump('./joplin-model.pickle')
        if command == 'sample':
            dbn.sample(threshold=0.5)
        else:
            # harmonize: learning rate 0.01, 500 iterations, threshold 0.4
            dbn.label_from_file(path.dirname(sys.argv[0]),
                                './12-days.xml', 0.01, 500, 0.4)
language: c #language: python #python: # - "2.5" # - "2.7" # - "3.2" # command to install dependencies before_install: #zlib1g-dev is needed to allow PIL to uncompress the dataset. - sudo apt-get update - sudo apt-get install -qq libatlas3gf-base libatlas-dev zlib1g-dev zip unzip zlibc libzip-dev libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev python-numpy python-scipy python-pip python-nose python-yaml pyflakes python-imaging install: # - "pip install -q numpy --use-mirrors" # Use Pillow instead of PIL as it is better packaged # - "pip install -q Pillow --use-mirrors" #If we don't install numpy before SciPy 0.10.1, the SciPy installations fails. # - "pip install -q scipy --use-mirrors" - "sudo pip install --no-deps git+git://github.com/Theano/Theano.git" env: - PART="test.py:test_logistic_sgd test.py:test_logistic_cg test.py:test_mlp test.py:test_convolutional_mlp test.py:test_dA" - PART="test.py:test_SdA" - PART="test.py:test_dbn" - PART="test.py:test_rbm test.py:test_rnnrbm" - PART="-e test.py" #i7-2600K CPU @ 3.40GHz #166.572s #8 test.test_rbm OK #155.114s #7 test.test_dbn OK #152.365s #9 test.test_rnnrbm OK #127.286s #6 test.test_SdA OK #39.252s #5 test.test_dA OK #27.56s #4 test.test_convolutional_mlp OK #15.454s #3 test.test_mlp OK #12.732s #1 test.test_logistic_sgd OK #12.638s #2 test.test_logistic_cg OK #i7-920 #296.475s #7 code.test.test_dbn OK #257.272s #6 code.test.test_SdA OK #234.776s #9 code.test.test_rnnrbm OK #233.896s #8 code.test.test_rbm OK #65.737s #5 code.test.test_dA OK #37.658s #4 code.test.test_convolutional_mlp OK #24.172s #3 code.test.test_mlp OK #20.401s #1 code.test.test_logistic_sgd OK #17.546s #2 code.test.test_logistic_cg OK # On Core2 duo E8500 with MRG #308.004s #7 code.test.test_dbn OK #277.268s #6 code.test.test_SdA OK #126.102s #8 code.test.test_rbm OK #123.652s #9 code.test.test_rnnrbm OK #77.101s #5 code.test.test_dA OK #39.75s #4 code.test.test_convolutional_mlp OK #30.406s #3 code.test.test_mlp OK #21.132s #2 
code.test.test_logistic_cg OK #17.945s #1 code.test.test_logistic_sgd OK # Unknown computer with older version of Theano #569.882s #9 code.test.test_rbm OK #298.992s #8 code.test.test_dbn OK #268.901s #7 code.test.test_SdA OK #67.292s #6 code.test.test_dA OK #27.485s #4 code.test.test_mlp OK #26.204s #5 code.test.test_convolutional_mlp OK #14.676s #3 code.test.test_logistic_cg OK #10.66s #2 code.test.test_logistic_sgd OK #5.795s #1 code.hmc.test_hmc.test_hmc OK script: - cd data - ./download.sh - ls - cd ../code - pwd - ls - export THEANO_FLAGS=warn.ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise - python --version - nosetests $PART ================================================ FILE: DeepLearningTutorials/README.rst ================================================ Deep Learning Tutorials ======================= Deep Learning is a new area of Machine Learning research, which has been introduced with the objective of moving Machine Learning closer to one of its original goals: Artificial Intelligence. Deep Learning is about learning multiple levels of representation and abstraction that help to make sense of data such as images, sound, and text. The tutorials presented here will introduce you to some of the most important deep learning algorithms and will also show you how to run them using Theano. Theano is a python library that makes writing deep learning models easy, and gives the option of training them on a GPU. The easiest way to follow the tutorials is to `browse them online <http://deeplearning.net/tutorial/>`_. `Main development <http://github.com/lisa-lab/DeepLearningTutorials>`_ of this project. ..
class DBN(object):
    """Deep Belief Network

    A deep belief network is obtained by stacking several RBMs on top of each
    other. The hidden layer of the RBM at layer `i` becomes the input of the
    RBM at layer `i+1`. The first layer RBM gets as input the input of the
    network, and the hidden layer of the last RBM represents the output. When
    used for classification, the DBN is treated as a MLP, by adding a logistic
    regression layer on top.
    """

    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=(500, 500), n_outs=10):
        """This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `numpy_rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the DBN

        :type hidden_layers_sizes: sequence of ints
        :param hidden_layers_sizes: intermediate layers size, must contain
                               at least one value (the default is an
                               immutable tuple so it cannot be shared and
                               mutated between instances)

        :type n_outs: int
        :param n_outs: dimension of the output of the network
        """
        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector
                                 # of [int] labels

        # The DBN is an MLP, for which all weights of intermediate
        # layers are shared with a different RBM.  We will first
        # construct the DBN as a deep multilayer perceptron, and when
        # constructing each sigmoidal layer we also construct an RBM
        # that shares weights with that layer. During pretraining we
        # will train these RBMs (which will lead to changing the
        # weights of the MLP as well). During finetuning we will finish
        # training the DBN by doing stochastic gradient descent on the
        # MLP.
        for i in xrange(self.n_layers):
            # The first layer reads the network input; every later layer
            # reads the activation of the hidden layer below it.
            if i == 0:
                input_size = n_ins
                layer_input = self.x
            else:
                input_size = hidden_layers_sizes[i - 1]
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)

            # It's arguably a philosophical question... but we are
            # going to only declare that the parameters of the
            # sigmoid_layers are parameters of the DBN. The visible
            # biases in the RBM are parameters of those RBMs, but not
            # of the DBN.
            self.params.extend(sigmoid_layer.params)

            # Construct an RBM that shares weights with this layer
            rbm_layer = RBM(numpy_rng=numpy_rng,
                            theano_rng=theano_rng,
                            input=layer_input,
                            n_visible=input_size,
                            n_hidden=hidden_layers_sizes[i],
                            W=sigmoid_layer.W,
                            hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)

        # compute the cost for second phase of training, defined as the
        # negative log likelihood of the logistic regression (output) layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)

    def pretraining_functions(self, train_set_x, batch_size, k):
        '''Generates a list of functions, for performing one step of
        gradient descent at a given layer. The function will require
        as input the minibatch index, and to train an RBM you just
        need to iterate, calling the corresponding function on all
        minibatch indexes.

        :type train_set_x: theano.tensor.TensorType
        :param train_set_x: Shared var. that contains all datapoints used
                            for training the RBM
        :type batch_size: int
        :param batch_size: size of a [mini]batch
        :param k: number of Gibbs steps to do in CD-k / PCD-k

        :return: list with one compiled function per RBM layer; each takes
                 `index` and an optional `lr` (default 0.1) and returns
                 the cost reported by that RBM
        '''

        index = T.lscalar('index')  # index to a minibatch
        learning_rate = T.scalar('lr')  # learning rate to use

        # beginning of a batch, given `index`
        batch_begin = index * batch_size
        # ending of a batch given `index`
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        for rbm in self.rbm_layers:

            # get the cost and the updates list
            # using CD-k here (persistent=None) for training each RBM.
            # TODO: change cost function to reconstruction error
            cost, updates = rbm.get_cost_updates(learning_rate,
                                                 persistent=None, k=k)

            # compile the theano function
            fn = theano.function(
                inputs=[index, theano.Param(learning_rate, default=0.1)],
                outputs=cost,
                updates=updates,
                givens={
                    self.x: train_set_x[batch_begin:batch_end]
                }
            )
            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns

    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        '''Generates a function `train` that implements one step of
        finetuning, a function `validate` that computes the error on a
        batch from the validation set, and a function `test` that
        computes the error on a batch from the testing set

        :type datasets: list of pairs of theano.tensor.TensorType
        :param datasets: It is a list that contains all the datasets;
                        it has to contain three pairs, `train`,
                        `valid`, `test` in this order, where each pair
                        is formed of two Theano variables, one for the
                        datapoints, the other for the labels
        :type batch_size: int
        :param batch_size: size of a minibatch
        :type learning_rate: float
        :param learning_rate: learning rate used during finetune stage

        :return: `(train_fn, valid_score, test_score)`; the two scorers
                 take no arguments and return a list with the error of
                 every full minibatch of their data set
        '''

        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x, test_set_y) = datasets[2]

        # compute number of minibatches for validation and testing; floor
        # division keeps the counts integral for xrange
        n_valid_batches = (valid_set_x.get_value(borrow=True).shape[0]
                           // batch_size)
        n_test_batches = (test_set_x.get_value(borrow=True).shape[0]
                          // batch_size)

        index = T.lscalar('index')  # index to a [mini]batch

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = [(param, param - gparam * learning_rate)
                   for param, gparam in zip(self.params, gparams)]

        train_fn = theano.function(
            inputs=[index],
            outputs=self.finetune_cost,
            updates=updates,
            givens={
                self.x: train_set_x[
                    index * batch_size: (index + 1) * batch_size
                ],
                self.y: train_set_y[
                    index * batch_size: (index + 1) * batch_size
                ]
            }
        )

        test_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x: test_set_x[
                    index * batch_size: (index + 1) * batch_size
                ],
                self.y: test_set_y[
                    index * batch_size: (index + 1) * batch_size
                ]
            }
        )

        valid_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x: valid_set_x[
                    index * batch_size: (index + 1) * batch_size
                ],
                self.y: valid_set_y[
                    index * batch_size: (index + 1) * batch_size
                ]
            }
        )

        # Create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in xrange(n_valid_batches)]

        # Create a function that scans the entire test set
        def test_score():
            return [test_score_i(i) for i in xrange(n_test_batches)]

        return train_fn, valid_score, test_score
:type finetune_lr: float :param finetune_lr: learning rate used in the finetune stage :type pretraining_epochs: int :param pretraining_epochs: number of epoch to do pretraining :type pretrain_lr: float :param pretrain_lr: learning rate to be used during pre-training :type k: int :param k: number of Gibbs steps in CD/PCD :type training_epochs: int :param training_epochs: maximal number of iterations ot run the optimizer :type dataset: string :param dataset: path the the pickled dataset :type batch_size: int :param batch_size: the size of a minibatch """ datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size # numpy random generator numpy_rng = numpy.random.RandomState(123) print '... building the model' # construct the Deep Belief Network dbn = DBN(numpy_rng=numpy_rng, n_ins=28 * 28, hidden_layers_sizes=[1000, 1000, 1000], n_outs=10) # start-snippet-2 ######################### # PRETRAINING THE MODEL # ######################### print '... getting the pretraining functions' pretraining_fns = dbn.pretraining_functions(train_set_x=train_set_x, batch_size=batch_size, k=k) print '... 
pre-training the model' start_time = time.clock() ## Pre-train layer-wise for i in xrange(dbn.n_layers): # go through pretraining epochs for epoch in xrange(pretraining_epochs): # go through the training set c = [] for batch_index in xrange(n_train_batches): c.append(pretraining_fns[i](index=batch_index, lr=pretrain_lr)) print 'Pre-training layer %i, epoch %d, cost ' % (i, epoch), print numpy.mean(c) end_time = time.clock() # end-snippet-2 print >> sys.stderr, ('The pretraining code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) ######################## # FINETUNING THE MODEL # ######################## # get the training, validation and testing function for the model print '... getting the finetuning functions' train_fn, validate_model, test_model = dbn.build_finetune_functions( datasets=datasets, batch_size=batch_size, learning_rate=finetune_lr ) print '... finetuning the model' # early-stopping parameters patience = 4 * n_train_batches # look as this many examples regardless patience_increase = 2. # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatches before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf test_score = 0. start_time = time.clock() done_looping = False epoch = 0 while (epoch < training_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): minibatch_avg_cost = train_fn(minibatch_index) iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: validation_losses = validate_model() this_validation_loss = numpy.mean(validation_losses) print( 'epoch %i, minibatch %i/%i, validation error %f %%' % ( epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100. 
) ) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if ( this_validation_loss < best_validation_loss * improvement_threshold ): patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = test_model() test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) if patience <= iter: done_looping = True break end_time = time.clock() print( ( 'Optimization complete with best validation score of %f %%, ' 'obtained at iteration %i, ' 'with test performance %f %%' ) % (best_validation_loss * 100., best_iter + 1, test_score * 100.) ) print >> sys.stderr, ('The fine tuning code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) if __name__ == '__main__': test_DBN() ================================================ FILE: DeepLearningTutorials/code/SdA.py ================================================ """ This tutorial introduces stacked denoising auto-encoders (SdA) using Theano. Denoising autoencoders are the building blocks for SdA. They are based on auto-encoders as the ones used in Bengio et al. 2007. An autoencoder takes an input x and first maps it to a hidden representation y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting latent representation y is then mapped back to a "reconstructed" vector z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b'). The weight matrix W' can optionally be constrained such that W' = W^T, in which case the autoencoder is said to have tied weights. The network is trained such that to minimize the reconstruction error (the error between x and z). 
For the denosing autoencoder, during training, first x is corrupted into \tilde{x}, where \tilde{x} is a partially destroyed version of x by means of a stochastic mapping. Afterwards y is computed as before (using \tilde{x}), y = s(W\tilde{x} + b) and z as s(W'y + b'). The reconstruction error is now measured between z and the uncorrupted input x, which is computed as the cross-entropy : - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)] References : - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103, 2008 - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise Training of Deep Networks, Advances in Neural Information Processing Systems 19, 2007 """ import os import sys import time import numpy import theano import theano.tensor as T from theano.tensor.shared_randomstreams import RandomStreams from logistic_sgd import LogisticRegression, load_data from mlp import HiddenLayer from dA import dA # start-snippet-1 class SdA(object): """Stacked denoising auto-encoder class (SdA) A stacked denoising autoencoder model is obtained by stacking several dAs. The hidden layer of the dA at layer `i` becomes the input of the dA at layer `i+1`. The first layer dA gets as input the input of the SdA, and the hidden layer of the last dA represents the output. Note that after pretraining, the SdA is dealt with as a normal MLP, the dAs are only used to initialize the weights. """ def __init__( self, numpy_rng, theano_rng=None, n_ins=784, hidden_layers_sizes=[500, 500], n_outs=10, corruption_levels=[0.1, 0.1] ): """ This class is made to support a variable number of layers. 
:type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_ins: int :param n_ins: dimension of the input to the sdA :type n_layers_sizes: list of ints :param n_layers_sizes: intermediate layers size, must contain at least one value :type n_outs: int :param n_outs: dimension of the output of the network :type corruption_levels: list of float :param corruption_levels: amount of corruption to use for each layer """ self.sigmoid_layers = [] self.dA_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) assert self.n_layers > 0 if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) # allocate symbolic variables for the data self.x = T.matrix('x') # the data is presented as rasterized images self.y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels # end-snippet-1 # The SdA is an MLP, for which all weights of intermediate layers # are shared with a different denoising autoencoders # We will first construct the SdA as a deep multilayer perceptron, # and when constructing each sigmoidal layer we also construct a # denoising autoencoder that shares weights with that layer # During pretraining we will train these autoencoders (which will # lead to chainging the weights of the MLP as well) # During finetunining we will finish training the SdA by doing # stochastich gradient descent on the MLP # start-snippet-2 for i in xrange(self.n_layers): # construct the sigmoidal layer # the size of the input is either the number of hidden units of # the layer below or the input size if we are on the first layer if i == 0: input_size = n_ins else: input_size = hidden_layers_sizes[i - 1] # the input to this layer is either the activation of the hidden # layer below or the input of the SdA if 
you are on the first # layer if i == 0: layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], activation=T.nnet.sigmoid) # add the layer to our list of layers self.sigmoid_layers.append(sigmoid_layer) # its arguably a philosophical question... # but we are going to only declare that the parameters of the # sigmoid_layers are parameters of the StackedDAA # the visible biases in the dA are parameters of those # dA, but not the SdA self.params.extend(sigmoid_layer.params) # Construct a denoising autoencoder that shared weights with this # layer dA_layer = dA(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, bhid=sigmoid_layer.b) self.dA_layers.append(dA_layer) # end-snippet-2 # We now need to add a logistic layer on top of the MLP self.logLayer = LogisticRegression( input=self.sigmoid_layers[-1].output, n_in=hidden_layers_sizes[-1], n_out=n_outs ) self.params.extend(self.logLayer.params) # construct a function that implements one step of finetunining # compute the cost for second phase of training, # defined as the negative log likelihood self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) # compute the gradients with respect to the model parameters # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y self.errors = self.logLayer.errors(self.y) def pretraining_functions(self, train_set_x, batch_size): ''' Generates a list of functions, each of them implementing one step in trainnig the dA corresponding to the layer with same index. The function will require as input the minibatch index, and to train a dA you just need to iterate, calling the corresponding function on all minibatch indexes. 
:type train_set_x: theano.tensor.TensorType :param train_set_x: Shared variable that contains all datapoints used for training the dA :type batch_size: int :param batch_size: size of a [mini]batch :type learning_rate: float :param learning_rate: learning rate used during training for any of the dA layers ''' # index to a [mini]batch index = T.lscalar('index') # index to a minibatch corruption_level = T.scalar('corruption') # % of corruption to use learning_rate = T.scalar('lr') # learning rate to use # begining of a batch, given `index` batch_begin = index * batch_size # ending of a batch given `index` batch_end = batch_begin + batch_size pretrain_fns = [] for dA in self.dA_layers: # get the cost and the updates list cost, updates = dA.get_cost_updates(corruption_level, learning_rate) # compile the theano function fn = theano.function( inputs=[ index, theano.Param(corruption_level, default=0.2), theano.Param(learning_rate, default=0.1) ], outputs=cost, updates=updates, givens={ self.x: train_set_x[batch_begin: batch_end] } ) # append `fn` to the list of functions pretrain_fns.append(fn) return pretrain_fns def build_finetune_functions(self, datasets, batch_size, learning_rate): '''Generates a function `train` that implements one step of finetuning, a function `validate` that computes the error on a batch from the validation set, and a function `test` that computes the error on a batch from the testing set :type datasets: list of pairs of theano.tensor.TensorType :param datasets: It is a list that contain all the datasets; the has to contain three pairs, `train`, `valid`, `test` in this order, where each pair is formed of two Theano variables, one for the datapoints, the other for the labels :type batch_size: int :param batch_size: size of a minibatch :type learning_rate: float :param learning_rate: learning rate used during finetune stage ''' (train_set_x, train_set_y) = datasets[0] (valid_set_x, valid_set_y) = datasets[1] (test_set_x, test_set_y) = datasets[2] # 
compute number of minibatches for training, validation and testing n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_valid_batches /= batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_test_batches /= batch_size index = T.lscalar('index') # index to a [mini]batch # compute the gradients with respect to the model parameters gparams = T.grad(self.finetune_cost, self.params) # compute list of fine-tuning updates updates = [ (param, param - gparam * learning_rate) for param, gparam in zip(self.params, gparams) ] train_fn = theano.function( inputs=[index], outputs=self.finetune_cost, updates=updates, givens={ self.x: train_set_x[ index * batch_size: (index + 1) * batch_size ], self.y: train_set_y[ index * batch_size: (index + 1) * batch_size ] }, name='train' ) test_score_i = theano.function( [index], self.errors, givens={ self.x: test_set_x[ index * batch_size: (index + 1) * batch_size ], self.y: test_set_y[ index * batch_size: (index + 1) * batch_size ] }, name='test' ) valid_score_i = theano.function( [index], self.errors, givens={ self.x: valid_set_x[ index * batch_size: (index + 1) * batch_size ], self.y: valid_set_y[ index * batch_size: (index + 1) * batch_size ] }, name='valid' ) # Create a function that scans the entire validation set def valid_score(): return [valid_score_i(i) for i in xrange(n_valid_batches)] # Create a function that scans the entire test set def test_score(): return [test_score_i(i) for i in xrange(n_test_batches)] return train_fn, valid_score, test_score def test_SdA(finetune_lr=0.1, pretraining_epochs=15, pretrain_lr=0.001, training_epochs=1000, dataset='mnist.pkl.gz', batch_size=1): """ Demonstrates how to train and test a stochastic denoising autoencoder. This is demonstrated on MNIST. 
:type learning_rate: float :param learning_rate: learning rate used in the finetune stage (factor for the stochastic gradient) :type pretraining_epochs: int :param pretraining_epochs: number of epoch to do pretraining :type pretrain_lr: float :param pretrain_lr: learning rate to be used during pre-training :type n_iter: int :param n_iter: maximal number of iterations ot run the optimizer :type dataset: string :param dataset: path the the pickled dataset """ datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_train_batches /= batch_size # numpy random generator # start-snippet-3 numpy_rng = numpy.random.RandomState(89677) print '... building the model' # construct the stacked denoising autoencoder class sda = SdA( numpy_rng=numpy_rng, n_ins=28 * 28, hidden_layers_sizes=[1000, 1000, 1000], n_outs=10 ) # end-snippet-3 start-snippet-4 ######################### # PRETRAINING THE MODEL # ######################### print '... getting the pretraining functions' pretraining_fns = sda.pretraining_functions(train_set_x=train_set_x, batch_size=batch_size) print '... 
pre-training the model' start_time = time.clock() ## Pre-train layer-wise corruption_levels = [.1, .2, .3] for i in xrange(sda.n_layers): # go through pretraining epochs for epoch in xrange(pretraining_epochs): # go through the training set c = [] for batch_index in xrange(n_train_batches): c.append(pretraining_fns[i](index=batch_index, corruption=corruption_levels[i], lr=pretrain_lr)) print 'Pre-training layer %i, epoch %d, cost ' % (i, epoch), print numpy.mean(c) end_time = time.clock() print >> sys.stderr, ('The pretraining code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) # end-snippet-4 ######################## # FINETUNING THE MODEL # ######################## # get the training, validation and testing function for the model print '... getting the finetuning functions' train_fn, validate_model, test_model = sda.build_finetune_functions( datasets=datasets, batch_size=batch_size, learning_rate=finetune_lr ) print '... finetunning the model' # early-stopping parameters patience = 10 * n_train_batches # look as this many examples regardless patience_increase = 2. # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf test_score = 0. 
start_time = time.clock() done_looping = False epoch = 0 while (epoch < training_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): minibatch_avg_cost = train_fn(minibatch_index) iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: validation_losses = validate_model() this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if ( this_validation_loss < best_validation_loss * improvement_threshold ): patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = test_model() test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) if patience <= iter: done_looping = True break end_time = time.clock() print( ( 'Optimization complete with best validation score of %f %%, ' 'on iteration %i, ' 'with test performance %f %%' ) % (best_validation_loss * 100., best_iter + 1, test_score * 100.) ) print >> sys.stderr, ('The training code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) if __name__ == '__main__': test_SdA() ================================================ FILE: DeepLearningTutorials/code/__init__.py ================================================ ================================================ FILE: DeepLearningTutorials/code/cA.py ================================================ """This tutorial introduces Contractive auto-encoders (cA) using Theano. 
They are based on auto-encoders as the ones used in Bengio et al. 2007. An autoencoder takes an input x and first maps it to a hidden representation y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting latent representation y is then mapped back to a "reconstructed" vector z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b'). The weight matrix W' can optionally be constrained such that W' = W^T, in which case the autoencoder is said to have tied weights. The network is trained such that to minimize the reconstruction error (the error between x and z). Adding the squared Frobenius norm of the Jacobian of the hidden mapping h with respect to the visible units yields the contractive auto-encoder: - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)] + \| \frac{\partial h(x)}{\partial x} \|^2 References : - S. Rifai, P. Vincent, X. Muller, X. Glorot, Y. Bengio: Contractive Auto-Encoders: Explicit Invariance During Feature Extraction, ICML-11 - S. Rifai, X. Muller, X. Glorot, G. Mesnil, Y. Bengio, and Pascal Vincent. Learning invariant features through local space contraction. Technical Report 1360, Universite de Montreal - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise Training of Deep Networks, Advances in Neural Information Processing Systems 19, 2007 """ import os import sys import time import numpy import theano import theano.tensor as T from logistic_sgd import load_data from utils import tile_raster_images try: import PIL.Image as Image except ImportError: import Image class cA(object): """ Contractive Auto-Encoder class (cA) The contractive autoencoder tries to reconstruct the input with an additional constraint on the latent space. With the objective of obtaining a robust representation of the input space, we regularize the L2 norm(Froebenius) of the jacobian of the hidden representation with respect to the input. Please refer to Rifai et al.,2011 for more details. 
If x is the input then equation (1) computes the projection of the input into the latent space h. Equation (2) computes the jacobian of h with respect to x. Equation (3) computes the reconstruction of the input, while equation (4) computes the reconstruction error and the added regularization term from Eq.(2). .. math:: h_i = s(W_i x + b_i) (1) J_i = h_i (1 - h_i) * W_i (2) x' = s(W' h + b') (3) L = -sum_{k=1}^d [x_k \log x'_k + (1-x_k) \log( 1-x'_k)] + lambda * sum_{i=1}^d sum_{j=1}^n J_{ij}^2 (4) """ def __init__(self, numpy_rng, input=None, n_visible=784, n_hidden=100, n_batchsize=1, W=None, bhid=None, bvis=None): """Initialize the cA class by specifying the number of visible units (the dimension d of the input), the number of hidden units (the dimension d' of the latent or hidden space) and the contraction level. The constructor also receives symbolic variables for the input, weights and bias. :type numpy_rng: numpy.random.RandomState :param numpy_rng: number random generator used to generate weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type input: theano.tensor.TensorType :param input: a symbolic description of the input or None for standalone cA :type n_visible: int :param n_visible: number of visible units :type n_hidden: int :param n_hidden: number of hidden units :type n_batchsize int :param n_batchsize: number of examples per batch :type W: theano.tensor.TensorType :param W: Theano variable pointing to a set of weights that should be shared belong the dA and another architecture; if dA should be standalone set this to None :type bhid: theano.tensor.TensorType :param bhid: Theano variable pointing to a set of biases values (for hidden units) that should be shared belong dA and another architecture; if dA should be standalone set this to None :type bvis: theano.tensor.TensorType :param bvis: Theano variable pointing 
to a set of biases values (for visible units) that should be shared belong dA and another architecture; if dA should be standalone set this to None """ self.n_visible = n_visible self.n_hidden = n_hidden self.n_batchsize = n_batchsize # note : W' was written as `W_prime` and b' as `b_prime` if not W: # W is initialized with `initial_W` which is uniformely sampled # from -4*sqrt(6./(n_visible+n_hidden)) and # 4*sqrt(6./(n_hidden+n_visible))the output of uniform if # converted using asarray to dtype # theano.config.floatX so that the code is runable on GPU initial_W = numpy.asarray( numpy_rng.uniform( low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)), high=4 * numpy.sqrt(6. / (n_hidden + n_visible)), size=(n_visible, n_hidden) ), dtype=theano.config.floatX ) W = theano.shared(value=initial_W, name='W', borrow=True) if not bvis: bvis = theano.shared(value=numpy.zeros(n_visible, dtype=theano.config.floatX), borrow=True) if not bhid: bhid = theano.shared(value=numpy.zeros(n_hidden, dtype=theano.config.floatX), name='b', borrow=True) self.W = W # b corresponds to the bias of the hidden self.b = bhid # b_prime corresponds to the bias of the visible self.b_prime = bvis # tied weights, therefore W_prime is W transpose self.W_prime = self.W.T # if no input is given, generate a variable representing the input if input is None: # we use a matrix because we expect a minibatch of several # examples, each example being a row self.x = T.dmatrix(name='input') else: self.x = input self.params = [self.W, self.b, self.b_prime] def get_hidden_values(self, input): """ Computes the values of the hidden layer """ return T.nnet.sigmoid(T.dot(input, self.W) + self.b) def get_jacobian(self, hidden, W): """Computes the jacobian of the hidden layer with respect to the input, reshapes are necessary for broadcasting the element-wise product on the right axis """ return T.reshape(hidden * (1 - hidden), (self.n_batchsize, 1, self.n_hidden)) * T.reshape( W, (1, self.n_visible, self.n_hidden)) def 
get_reconstructed_input(self, hidden): """Computes the reconstructed input given the values of the hidden layer """ return T.nnet.sigmoid(T.dot(hidden, self.W_prime) + self.b_prime) def get_cost_updates(self, contraction_level, learning_rate): """ This function computes the cost and the updates for one trainng step of the cA """ y = self.get_hidden_values(self.x) z = self.get_reconstructed_input(y) J = self.get_jacobian(y, self.W) # note : we sum over the size of a datapoint; if we are using # minibatches, L will be a vector, with one entry per # example in minibatch self.L_rec = - T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z), axis=1) # Compute the jacobian and average over the number of samples/minibatch self.L_jacob = T.sum(J ** 2) / self.n_batchsize # note : L is now a vector, where each element is the # cross-entropy cost of the reconstruction of the # corresponding example of the minibatch. We need to # compute the average of all these to get the cost of # the minibatch cost = T.mean(self.L_rec) + contraction_level * T.mean(self.L_jacob) # compute the gradients of the cost of the `cA` with respect # to its parameters gparams = T.grad(cost, self.params) # generate the list of updates updates = [] for param, gparam in zip(self.params, gparams): updates.append((param, param - learning_rate * gparam)) return (cost, updates) def test_cA(learning_rate=0.01, training_epochs=20, dataset='mnist.pkl.gz', batch_size=10, output_folder='cA_plots', contraction_level=.1): """ This demo is tested on MNIST :type learning_rate: float :param learning_rate: learning rate used for training the contracting AutoEncoder :type training_epochs: int :param training_epochs: number of epochs used for training :type dataset: string :param dataset: path to the picked dataset """ datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] / 
batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images if not os.path.isdir(output_folder): os.makedirs(output_folder) os.chdir(output_folder) #################################### # BUILDING THE MODEL # #################################### rng = numpy.random.RandomState(123) ca = cA(numpy_rng=rng, input=x, n_visible=28 * 28, n_hidden=500, n_batchsize=batch_size) cost, updates = ca.get_cost_updates(contraction_level=contraction_level, learning_rate=learning_rate) train_ca = theano.function( [index], [T.mean(ca.L_rec), ca.L_jacob], updates=updates, givens={ x: train_set_x[index * batch_size: (index + 1) * batch_size] } ) start_time = time.clock() ############ # TRAINING # ############ # go through training epochs for epoch in xrange(training_epochs): # go through trainng set c = [] for batch_index in xrange(n_train_batches): c.append(train_ca(batch_index)) c_array = numpy.vstack(c) print 'Training epoch %d, reconstruction cost ' % epoch, numpy.mean( c_array[0]), ' jacobian norm ', numpy.mean(numpy.sqrt(c_array[1])) end_time = time.clock() training_time = (end_time - start_time) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((training_time) / 60.)) image = Image.fromarray(tile_raster_images( X=ca.W.get_value(borrow=True).T, img_shape=(28, 28), tile_shape=(10, 10), tile_spacing=(1, 1))) image.save('cae_filters.png') os.chdir('../') if __name__ == '__main__': test_cA() ================================================ FILE: DeepLearningTutorials/code/convolutional_mlp.py ================================================ """This tutorial introduces the LeNet5 neural network architecture using Theano. LeNet5 is a convolutional neural network, good for classifying images. This tutorial shows how to build the architecture, and comes with all the hyper-parameters you need to reproduce the paper's MNIST results. 
class LeNetConvPoolLayer(object):
    """Convolution + max-pooling layer of a convolutional network."""

    def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2, 2)):
        """
        Build the layer: convolve `input` with a bank of filters, max-pool
        the result, add a per-feature-map bias and apply tanh.

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.dtensor4
        :param input: symbolic image tensor, of shape image_shape

        :type filter_shape: tuple or list of length 4
        :param filter_shape: (number of filters, num input feature maps,
                              filter height, filter width)

        :type image_shape: tuple or list of length 4
        :param image_shape: (batch size, num input feature maps,
                             image height, image width)

        :type poolsize: tuple or list of length 2
        :param poolsize: the downsampling (pooling) factor (#rows, #cols)
        """
        # the image and the filters must agree on the number of input maps
        assert image_shape[1] == filter_shape[1]
        self.input = input

        n_filters = filter_shape[0]
        filter_area = numpy.prod(filter_shape[2:])

        # every hidden unit sees "input maps * filter height * filter width"
        # inputs
        fan_in = numpy.prod(filter_shape[1:])
        # every lower-layer unit gets gradient from
        # "output maps * filter height * filter width" / pooling size
        fan_out = (n_filters * filter_area /
                   numpy.prod(poolsize))

        # filters are drawn uniformly from [-W_bound, W_bound]
        W_bound = numpy.sqrt(6. / (fan_in + fan_out))
        initial_filters = rng.uniform(
            low=-W_bound, high=W_bound, size=filter_shape
        )
        self.W = theano.shared(
            numpy.asarray(initial_filters, dtype=theano.config.floatX),
            borrow=True
        )

        # one bias per output feature map, all starting at zero
        initial_bias = numpy.zeros((n_filters,), dtype=theano.config.floatX)
        self.b = theano.shared(value=initial_bias, borrow=True)

        # filter the input feature maps
        convolved = conv.conv2d(
            input=input,
            filters=self.W,
            filter_shape=filter_shape,
            image_shape=image_shape
        )

        # max-pool every feature map on its own
        pooled = downsample.max_pool_2d(
            input=convolved,
            ds=poolsize,
            ignore_border=True
        )

        # The 1D bias is viewed as (1, n_filters, 1, 1) so that it broadcasts
        # over the minibatch and over feature-map height and width.
        broadcast_bias = self.b.dimshuffle('x', 0, 'x', 'x')
        self.output = T.tanh(pooled + broadcast_bias)

        # trainable parameters of this layer
        self.params = [self.W, self.b]
layer0_input = x.reshape((batch_size, 1, 28, 28)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24) # maxpooling reduces this further to (24/2, 24/2) = (12, 12) # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12) layer0 = LeNetConvPoolLayer( rng, input=layer0_input, image_shape=(batch_size, 1, 28, 28), filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2) ) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8) # maxpooling reduces this further to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4) layer1 = LeNetConvPoolLayer( rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 12, 12), filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2) ) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size, num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4), # or (500, 50 * 4 * 4) = (500, 800) with the default values. 
layer2_input = layer1.output.flatten(2) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer( rng, input=layer2_input, n_in=nkerns[1] * 4 * 4, n_out=500, activation=T.tanh ) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model test_model = theano.function( [index], layer3.errors(y), givens={ x: test_set_x[index * batch_size: (index + 1) * batch_size], y: test_set_y[index * batch_size: (index + 1) * batch_size] } ) validate_model = theano.function( [index], layer3.errors(y), givens={ x: valid_set_x[index * batch_size: (index + 1) * batch_size], y: valid_set_y[index * batch_size: (index + 1) * batch_size] } ) # create a list of all model parameters to be fit by gradient descent params = layer3.params + layer2.params + layer1.params + layer0.params # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i], grads[i]) pairs. updates = [ (param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads) ] train_model = theano.function( [index], cost, updates=updates, givens={ x: train_set_x[index * batch_size: (index + 1) * batch_size], y: train_set_y[index * batch_size: (index + 1) * batch_size] } ) # end-snippet-1 ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: print 'training @ iter = ', iter cost_ij = train_model(minibatch_index) if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [ test_model(i) for i in xrange(n_test_batches) ] test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) if patience <= iter: done_looping = True break end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i, ' 
'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) if __name__ == '__main__': evaluate_lenet5() def experiment(state, channel): evaluate_lenet5(state.learning_rate, dataset=state.dataset) ================================================ FILE: DeepLearningTutorials/code/dA.py ================================================ """ This tutorial introduces denoising auto-encoders (dA) using Theano. Denoising autoencoders are the building blocks for SdA. They are based on auto-encoders as the ones used in Bengio et al. 2007. An autoencoder takes an input x and first maps it to a hidden representation y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting latent representation y is then mapped back to a "reconstructed" vector z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b'). The weight matrix W' can optionally be constrained such that W' = W^T, in which case the autoencoder is said to have tied weights. The network is trained such that to minimize the reconstruction error (the error between x and z). For the denosing autoencoder, during training, first x is corrupted into \tilde{x}, where \tilde{x} is a partially destroyed version of x by means of a stochastic mapping. Afterwards y is computed as before (using \tilde{x}), y = s(W\tilde{x} + b) and z as s(W'y + b'). The reconstruction error is now measured between z and the uncorrupted input x, which is computed as the cross-entropy : - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)] References : - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103, 2008 - Y. Bengio, P. Lamblin, D. Popovici, H. 
class dA(object):
    r"""Denoising Auto-Encoder class (dA)

    A denoising autoencoder tries to reconstruct the input from a corrupted
    version of it by projecting it first in a latent space and reprojecting
    it afterwards back in the input space. Please refer to Vincent et al.,2008
    for more details. If x is the input then equation (1) computes a partially
    destroyed version of x by means of a stochastic mapping q_D. Equation (2)
    computes the projection of the input into the latent space. Equation (3)
    computes the reconstruction of the input, while equation (4) computes the
    reconstruction error.

    .. math::

        \tilde{x} ~ q_D(\tilde{x}|x)                                     (1)

        y = s(W \tilde{x} + b)                                           (2)

        z = s(W' y  + b')                                                (3)

        L(x,z) = -sum_{k=1}^d [x_k \log z_k + (1-x_k) \log( 1-z_k)]      (4)

    """

    def __init__(
        self,
        numpy_rng,
        theano_rng=None,
        input=None,
        n_visible=784,
        n_hidden=500,
        W=None,
        bhid=None,
        bvis=None
    ):
        """
        Initialize the dA class by specifying the number of visible units (the
        dimension d of the input) and the number of hidden units (the
        dimension d' of the latent or hidden space). The constructor also
        receives symbolic variables for the input, weights and bias. Such
        symbolic variables are useful when, for example, the input is
        the result of some computations, or when weights are shared between
        the dA and an MLP layer. When dealing with SdAs this always happens,
        the dA on layer 2 gets as input the output of the dA on layer 1,
        and the weights of the dA are used in the second stage of training
        to construct an MLP.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random generator used to generate weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                     generated based on a seed drawn from `numpy_rng`

        :type input: theano.tensor.TensorType
        :param input: a symbolic description of the input or None for
                      standalone dA

        :type n_visible: int
        :param n_visible: number of visible units

        :type n_hidden: int
        :param n_hidden:  number of hidden units

        :type W: theano.tensor.TensorType
        :param W: Theano variable pointing to a set of weights that should be
                  shared between the dA and another architecture; if dA
                  should be standalone set this to None

        :type bhid: theano.tensor.TensorType
        :param bhid: Theano variable pointing to a set of biases values (for
                     hidden units) that should be shared between dA and
                     another architecture; if dA should be standalone set
                     this to None

        :type bvis: theano.tensor.TensorType
        :param bvis: Theano variable pointing to a set of biases values (for
                     visible units) that should be shared between dA and
                     another architecture; if dA should be standalone set
                     this to None


        """
        self.n_visible = n_visible
        self.n_hidden = n_hidden

        # NOTE: every optional argument is compared with `is None` rather
        # than tested for truthiness; the question is "was it supplied?",
        # not "what does bool() of a symbolic variable evaluate to?".

        # create a Theano random generator that gives symbolic random values
        if theano_rng is None:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # note : W' was written as `W_prime` and b' as `b_prime`
        if W is None:
            # W is initialized with `initial_W` which is uniformly sampled
            # between -4*sqrt(6./(n_visible+n_hidden)) and
            # 4*sqrt(6./(n_hidden+n_visible)); the output of uniform is
            # converted using asarray to dtype theano.config.floatX so
            # that the code is runnable on GPU
            initial_W = numpy.asarray(
                numpy_rng.uniform(
                    low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)),
                    high=4 * numpy.sqrt(6. / (n_hidden + n_visible)),
                    size=(n_visible, n_hidden)
                ),
                dtype=theano.config.floatX
            )
            W = theano.shared(value=initial_W, name='W', borrow=True)

        if bvis is None:
            bvis = theano.shared(
                value=numpy.zeros(
                    n_visible,
                    dtype=theano.config.floatX
                ),
                borrow=True
            )

        if bhid is None:
            bhid = theano.shared(
                value=numpy.zeros(
                    n_hidden,
                    dtype=theano.config.floatX
                ),
                name='b',
                borrow=True
            )

        self.W = W
        # b corresponds to the bias of the hidden
        self.b = bhid
        # b_prime corresponds to the bias of the visible
        self.b_prime = bvis
        # tied weights, therefore W_prime is W transpose
        self.W_prime = self.W.T
        self.theano_rng = theano_rng
        # if no input is given, generate a variable representing the input
        if input is None:
            # we use a matrix because we expect a minibatch of several
            # examples, each example being a row
            self.x = T.dmatrix(name='input')
        else:
            self.x = input

        self.params = [self.W, self.b, self.b_prime]
    # end-snippet-1

    def get_corrupted_input(self, input, corruption_level):
        """This function keeps ``1-corruption_level`` entries of the inputs
        the same and zeroes out a randomly selected subset of size
        ``corruption_level``.

        Note : first argument of theano.rng.binomial is the shape(size) of
               random numbers that it should produce
               second argument is the number of trials
               third argument is the probability of success of any trial

                this will produce an array of 0s and 1s where 1 has a
                probability of 1 - ``corruption_level`` and 0 with
                ``corruption_level``

                The binomial function returns int64 data type by
                default.  int64 multiplied by the input
                type(floatX) always returns float64.  To keep all data
                in floatX when floatX is float32, we set the dtype of
                the binomial to floatX. As in our case the value of
                the binomial is always 0 or 1, this doesn't change the
                result. This is needed to allow the gpu to work
                correctly as it only supports float32 for now.

        """
        return self.theano_rng.binomial(size=input.shape, n=1,
                                        p=1 - corruption_level,
                                        dtype=theano.config.floatX) * input

    def get_hidden_values(self, input):
        """ Computes the values of the hidden layer """
        return T.nnet.sigmoid(T.dot(input, self.W) + self.b)

    def get_reconstructed_input(self, hidden):
        """Computes the reconstructed input given the values of the
        hidden layer

        """
        return T.nnet.sigmoid(T.dot(hidden, self.W_prime) + self.b_prime)

    def get_cost_updates(self, corruption_level, learning_rate):
        """ This function computes the cost and the updates for one training
        step of the dA

        Returns a ``(cost, updates)`` pair: the symbolic mean cross-entropy
        over the minibatch and a list of ``(param, new_value)`` gradient
        descent updates for ``self.params``.
        """

        tilde_x = self.get_corrupted_input(self.x, corruption_level)
        y = self.get_hidden_values(tilde_x)
        z = self.get_reconstructed_input(y)
        # note : we sum over the size of a datapoint; if we are using
        #        minibatches, L will be a vector, with one entry per
        #        example in minibatch
        L = - T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z), axis=1)
        # note : L is now a vector, where each element is the
        #        cross-entropy cost of the reconstruction of the
        #        corresponding example of the minibatch. We need to
        #        compute the average of all these to get the cost of
        #        the minibatch
        cost = T.mean(L)

        # compute the gradients of the cost of the `dA` with respect
        # to its parameters
        gparams = T.grad(cost, self.params)
        # generate the list of updates
        updates = [
            (param, param - learning_rate * gparam)
            for param, gparam in zip(self.params, gparams)
        ]

        return (cost, updates)
epoch %d, cost ' % epoch, numpy.mean(c) end_time = time.clock() training_time = (end_time - start_time) print >> sys.stderr, ('The no corruption code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((training_time) / 60.)) image = Image.fromarray( tile_raster_images(X=da.W.get_value(borrow=True).T, img_shape=(28, 28), tile_shape=(10, 10), tile_spacing=(1, 1))) image.save('filters_corruption_0.png') ##################################### # BUILDING THE MODEL CORRUPTION 30% # ##################################### rng = numpy.random.RandomState(123) theano_rng = RandomStreams(rng.randint(2 ** 30)) da = dA( numpy_rng=rng, theano_rng=theano_rng, input=x, n_visible=28 * 28, n_hidden=500 ) cost, updates = da.get_cost_updates( corruption_level=0.3, learning_rate=learning_rate ) train_da = theano.function( [index], cost, updates=updates, givens={ x: train_set_x[index * batch_size: (index + 1) * batch_size] } ) start_time = time.clock() ############ # TRAINING # ############ # go through training epochs for epoch in xrange(training_epochs): # go through trainng set c = [] for batch_index in xrange(n_train_batches): c.append(train_da(batch_index)) print 'Training epoch %d, cost ' % epoch, numpy.mean(c) end_time = time.clock() training_time = (end_time - start_time) print >> sys.stderr, ('The 30% corruption code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % (training_time / 60.)) image = Image.fromarray(tile_raster_images( X=da.W.get_value(borrow=True).T, img_shape=(28, 28), tile_shape=(10, 10), tile_spacing=(1, 1))) image.save('filters_corruption_30.png') os.chdir('../') if __name__ == '__main__': test_dA() ================================================ FILE: DeepLearningTutorials/code/hmc/__init__.py ================================================ ================================================ FILE: DeepLearningTutorials/code/hmc/hmc.py ================================================ """ TODO """ import numpy from theano import function, 
def kinetic_energy(vel):
    """Compute the kinetic energy of each velocity vector, assuming a mass
    of 1.

    Parameters
    ----------
    vel: theano matrix
        Symbolic matrix whose rows are velocity vectors.

    Returns
    -------
    return: theano vector
        Vector whose i-th entry is the kinetic energy associated with vel[i].
    """
    # squared norm of every row, then the usual factor of one half
    squared_norms = (vel ** 2).sum(axis=1)
    return 0.5 * squared_norms
def simulate_dynamics(initial_pos, initial_vel, stepsize, n_steps, energy_fn):
    """
    Build the symbolic (position, velocity) reached after `n_steps` leapfrog
    updates of Hamiltonian dynamics.

    Parameters
    ----------
    initial_pos: shared theano matrix
        Initial position at which to start the simulation
    initial_vel: shared theano matrix
        Initial velocity of particles
    stepsize: shared theano scalar
        Scalar value controlling amount by which to move
    n_steps: integer
        Number of leapfrog updates; the first one is written out explicitly
        and the remaining `n_steps - 1` are run through `theano.scan`.
    energy_fn: python function
        Python function, operating on symbolic theano variables, used to
        compute the potential energy at a given position.

    Returns
    -------
    rval1: theano matrix
        Final positions obtained after simulation
    rval2: theano matrix
        Final velocity obtained after simulation
    """

    def energy_gradient(pos):
        # gradient of the summed potential energy with respect to `pos`
        return TT.grad(energy_fn(pos).sum(), pos)

    def leapfrog(pos, vel, step):
        """
        Body of the Scan loop: one leapfrog update.

        Parameters
        ----------
        pos: theano matrix
            in leapfrog update equations, represents pos(t), position at time t
        vel: theano matrix
            in leapfrog update equations, represents vel(t - stepsize/2),
            velocity at time (t - stepsize/2)
        step: theano scalar
            scalar value controlling amount by which to move

        Returns
        -------
        rval1: [theano matrix, theano matrix]
            Symbolic theano matrices for new position pos(t + stepsize), and
            velocity vel(t + stepsize/2)
        rval2: dictionary
            Dictionary of updates for the Scan Op
        """
        # vel(t - stepsize/2) -> vel(t + stepsize/2)
        next_vel = vel - step * energy_gradient(pos)
        # pos(t) -> pos(t + stepsize), using the freshly updated velocity
        next_pos = pos + step * next_vel
        return [next_pos, next_vel], {}

    # opening half-step for the velocity: vel(t + stepsize/2)
    vel_half_step = initial_vel - 0.5 * stepsize * energy_gradient(initial_pos)
    # first full step for the position: pos(t + stepsize)
    pos_full_step = initial_pos + stepsize * vel_half_step

    # scan repeatedly computes vel(t + (m-1/2)*stepsize) and
    # pos(t + m*stepsize) for m in [2, n_steps]
    scan_outputs, scan_updates = theano.scan(
        leapfrog,
        outputs_info=[
            dict(initial=pos_full_step),
            dict(initial=vel_half_step),
        ],
        non_sequences=[stepsize],
        n_steps=n_steps - 1)
    pos_history, vel_history = scan_outputs

    # NOTE: Scan always returns an updates dictionary, in case the scanned
    # function draws samples from a RandomStream.  Such updates would have to
    # be handed to the compiled Theano function to avoid drawing the same
    # random numbers on each call.  Here they are deliberately dropped
    # because `leapfrog` returns an empty updates dictionary.
    assert not scan_updates

    final_pos = pos_history[-1]
    # The last velocity produced by scan is
    # vel(t + (n_steps - 1 / 2) * stepsize); one closing half-step brings it
    # to vel(t + n_steps * stepsize).
    lagging_vel = vel_history[-1]
    final_vel = lagging_vel - 0.5 * stepsize * energy_gradient(final_pos)

    # the new proposal state
    return final_pos, final_vel
Returns ------- rval1: boolean True if move is accepted, False otherwise rval2: theano matrix Matrix whose rows contain the proposed "new position" """ # end-snippet-1 start-snippet-2 # sample random velocity initial_vel = s_rng.normal(size=positions.shape) # end-snippet-2 start-snippet-3 # perform simulation of particles subject to Hamiltonian dynamics final_pos, final_vel = simulate_dynamics( initial_pos=positions, initial_vel=initial_vel, stepsize=stepsize, n_steps=n_steps, energy_fn=energy_fn ) # end-snippet-3 start-snippet-4 # accept/reject the proposed move based on the joint distribution accept = metropolis_hastings_accept( energy_prev=hamiltonian(positions, initial_vel, energy_fn), energy_next=hamiltonian(final_pos, final_vel, energy_fn), s_rng=s_rng ) # end-snippet-4 return accept, final_pos # start-snippet-5 def hmc_updates(positions, stepsize, avg_acceptance_rate, final_pos, accept, target_acceptance_rate, stepsize_inc, stepsize_dec, stepsize_min, stepsize_max, avg_acceptance_slowness): """This function is executed after `n_steps` of HMC sampling (`hmc_move` function). It creates the updates dictionary used by the `simulate` function. It takes care of updating: the position (if the move is accepted), the stepsize (to track a given target acceptance rate) and the average acceptance rate (computed as a moving average). Parameters ---------- positions: shared variable, theano matrix Shared theano matrix whose rows contain the old position stepsize: shared variable, theano scalar Shared theano scalar containing current step size avg_acceptance_rate: shared variable, theano scalar Shared theano scalar containing the current average acceptance rate final_pos: shared variable, theano matrix Shared theano matrix whose rows contain the new position accept: theano scalar Boolean-type variable representing whether or not the proposed HMC move should be accepted or not. 
target_acceptance_rate: float The stepsize is modified in order to track this target acceptance rate. stepsize_inc: float Amount by which to increment stepsize when acceptance rate is too high. stepsize_dec: float Amount by which to decrement stepsize when acceptance rate is too low. stepsize_min: float Lower-bound on `stepsize`. stepsize_min: float Upper-bound on `stepsize`. avg_acceptance_slowness: float Average acceptance rate is computed as an exponential moving average. (1-avg_acceptance_slowness) is the weight given to the newest observation. Returns ------- rval1: dictionary-like A dictionary of updates to be used by the `HMC_Sampler.simulate` function. The updates target the position, stepsize and average acceptance rate. """ ## POSITION UPDATES ## # broadcast `accept` scalar to tensor with the same dimensions as # final_pos. accept_matrix = accept.dimshuffle(0, *(('x',) * (final_pos.ndim - 1))) # if accept is True, update to `final_pos` else stay put new_positions = TT.switch(accept_matrix, final_pos, positions) # end-snippet-5 start-snippet-7 ## STEPSIZE UPDATES ## # if acceptance rate is too low, our sampler is too "noisy" and we reduce # the stepsize. If it is too high, our sampler is too conservative, we can # get away with a larger stepsize (resulting in better mixing). 
class HMC_sampler(object):
    """
    Convenience wrapper for performing Hybrid Monte Carlo (HMC).

    It creates the symbolic graph for performing an HMC simulation (using
    `hmc_move` and `hmc_updates`). The graph is then compiled into the
    `simulate` function, a theano function which runs the simulation and
    updates the required shared variables.

    Users should interface with the sampler through the `draw` function,
    which advances the markov chain and returns the current sample by
    calling `simulate` and reading the shared positions in sequence.

    The hyper-parameters are the same as those used by Marc'Aurelio's
    'train_mcRBM.py' file (available on his personal home page).
    """

    def __init__(self, **kwargs):
        # Every keyword argument becomes an attribute of the sampler.
        self.__dict__.update(kwargs)

    @classmethod
    def new_from_shared_positions(
        cls,
        shared_positions,
        energy_fn,
        initial_stepsize=0.01,
        target_acceptance_rate=.9,
        n_steps=20,
        stepsize_dec=0.98,
        stepsize_min=0.001,
        stepsize_max=0.25,
        stepsize_inc=1.02,
        # used in geometric avg. 1.0 would be not moving at all
        avg_acceptance_slowness=0.9,
        seed=12345
    ):
        """
        Build a sampler whose chain state lives in `shared_positions`.

        :param shared_positions: theano ndarray shared var with
            many particle [initial] positions

        :param energy_fn:
            callable such that energy_fn(positions)
            returns theano vector of energies.
            The len of this vector is the batchsize.

            The sum of this energy vector must be differentiable (with
            theano.tensor.grad) with respect to the positions for HMC
            sampling to work.

        The remaining keyword arguments are forwarded to `hmc_move`
        (`n_steps`) and `hmc_updates` (stepsize / acceptance-rate
        settings); `seed` initialises the random stream.
        """
        # allocate shared variables
        stepsize = sharedX(initial_stepsize, 'hmc_stepsize')
        avg_acceptance_rate = sharedX(target_acceptance_rate,
                                      'avg_acceptance_rate')
        s_rng = TT.shared_randomstreams.RandomStreams(seed)

        # define graph for an `n_steps` HMC simulation
        accept, final_pos = hmc_move(
            s_rng,
            shared_positions,
            energy_fn,
            stepsize,
            n_steps)

        # define the dictionary of updates, to apply on every `simulate` call
        simulate_updates = hmc_updates(
            shared_positions,
            stepsize,
            avg_acceptance_rate,
            final_pos=final_pos,
            accept=accept,
            stepsize_min=stepsize_min,
            stepsize_max=stepsize_max,
            stepsize_inc=stepsize_inc,
            stepsize_dec=stepsize_dec,
            target_acceptance_rate=target_acceptance_rate,
            avg_acceptance_slowness=avg_acceptance_slowness)

        # compile theano function
        simulate = function([], [], updates=simulate_updates)

        # create HMC_sampler object with the following attributes ...
        return cls(
            positions=shared_positions,
            stepsize=stepsize,
            stepsize_min=stepsize_min,
            stepsize_max=stepsize_max,
            avg_acceptance_rate=avg_acceptance_rate,
            target_acceptance_rate=target_acceptance_rate,
            s_rng=s_rng,
            _updates=simulate_updates,
            simulate=simulate)

    def draw(self, **kwargs):
        """
        Returns a new position obtained after one call to `simulate`.

        Parameters
        ----------
        kwargs: dictionary
            The `kwargs` dictionary is passed to the shared variable
            (self.positions) `get_value()` function. `borrow` defaults to
            False, so each call returns a copy unless the caller asks
            otherwise (e.g. `borrow=True` to avoid the copy).

        Returns
        -------
        rval: whatever `self.positions.get_value(**kwargs)` returns.
        """
        # Keep the historical default (a fresh copy) while honouring any
        # explicit caller choice, as the documentation promises.
        kwargs.setdefault('borrow', False)
        self.simulate()
        return self.positions.get_value(**kwargs)
def prepare_data(seqs, labels, maxlen=None):
    """Create the padded matrices from the datasets.

    Every sequence is padded with zeros to the length of the longest
    remaining sequence. If `maxlen` is set, sequences whose length is not
    strictly below `maxlen` are dropped (together with their labels).

    This swaps the axes: the returned arrays have shape
    (max sequence length, number of samples).

    :param seqs: list of sequences (each a list of word indices)
    :param labels: list of labels, parallel to `seqs`
    :param maxlen: None, or the exclusive upper bound on sequence length
    :return: (x, x_mask, labels) where `x` is an int64 matrix of word
        indices, `x_mask` is a floatX matrix holding 1 where `x` carries
        real data, and `labels` are the labels of the kept sequences;
        (None, None, None) when no sequence is left.
    """
    lengths = [len(s) for s in seqs]

    if maxlen is not None:
        kept = [(l, s, y) for l, s, y in zip(lengths, seqs, labels)
                if l < maxlen]
        lengths = [l for l, _, _ in kept]
        seqs = [s for _, s, _ in kept]
        labels = [y for _, _, y in kept]

    # Guard every path, not only the filtered one: numpy.max of an empty
    # list raises ValueError.
    if len(lengths) < 1:
        return None, None, None

    n_samples = len(seqs)
    maxlen = numpy.max(lengths)

    x = numpy.zeros((maxlen, n_samples)).astype('int64')
    x_mask = numpy.zeros((maxlen, n_samples)).astype(theano.config.floatX)
    for idx, s in enumerate(seqs):
        x[:lengths[idx], idx] = s
        x_mask[:lengths[idx], idx] = 1.

    return x, x_mask, labels
def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None,
              sort_by_len=True):
    '''Load the dataset and split it into train, validation and test sets.

    :type path: String
    :param path: The path to the dataset (here IMDB)
    :type n_words: int
    :param n_words: The number of words to keep in the vocabulary.
        All extra words are set to unknown (1).
    :type valid_portion: float
    :param valid_portion: The proportion of the full train set used for
        the validation set.
    :type maxlen: None or positive int
    :param maxlen: the max sequence length we use in the train/valid set;
        training sequences of length >= maxlen are dropped.
    :type sort_by_len: bool
    :param sort_by_len: Sort by the sequence length for the train,
        valid and test set. This allows faster execution as it causes
        less padding per minibatch. Another mechanism must be used to
        shuffle the train set at each epoch.

    :return: (train, valid, test), each a tuple (list of sequences,
        list of labels).
    '''

    #############
    # LOAD DATA #
    #############

    # Load the dataset
    path = get_dataset_file(
        path, "imdb.pkl",
        "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")

    if path.endswith(".gz"):
        f = gzip.open(path, 'rb')
    else:
        f = open(path, 'rb')

    # NOTE(review): unpickling runs arbitrary code from the file; the file
    # may have been downloaded over plain HTTP by get_dataset_file.
    try:
        train_set = cPickle.load(f)
        test_set = cPickle.load(f)
    finally:
        # Release the handle even when the pickle is truncated or corrupt.
        f.close()

    if maxlen:
        kept = [(x, y) for x, y in zip(train_set[0], train_set[1])
                if len(x) < maxlen]
        train_set = ([x for x, _ in kept], [y for _, y in kept])

    # split training set into validation set
    train_set_x, train_set_y = train_set
    n_samples = len(train_set_x)
    sidx = numpy.random.permutation(n_samples)
    n_train = int(numpy.round(n_samples * (1. - valid_portion)))
    valid_set_x = [train_set_x[s] for s in sidx[n_train:]]
    valid_set_y = [train_set_y[s] for s in sidx[n_train:]]
    train_set_x = [train_set_x[s] for s in sidx[:n_train]]
    train_set_y = [train_set_y[s] for s in sidx[:n_train]]
    test_set_x, test_set_y = test_set

    def remove_unk(x):
        # Map every out-of-vocabulary index to the "unknown" index 1.
        return [[1 if w >= n_words else w for w in sen] for sen in x]

    train_set_x = remove_unk(train_set_x)
    valid_set_x = remove_unk(valid_set_x)
    test_set_x = remove_unk(test_set_x)

    def len_argsort(seq):
        # Indices that order `seq` by increasing element length.
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))

    if sort_by_len:
        sorted_index = len_argsort(test_set_x)
        test_set_x = [test_set_x[i] for i in sorted_index]
        test_set_y = [test_set_y[i] for i in sorted_index]

        sorted_index = len_argsort(valid_set_x)
        valid_set_x = [valid_set_x[i] for i in sorted_index]
        valid_set_y = [valid_set_y[i] for i in sorted_index]

        sorted_index = len_argsort(train_set_x)
        train_set_x = [train_set_x[i] for i in sorted_index]
        train_set_y = [train_set_y[i] for i in sorted_index]

    train = (train_set_x, train_set_y)
    valid = (valid_set_x, valid_set_y)
    test = (test_set_x, test_set_y)

    return train, valid, test
def build_dict(path):
    """Build the word -> index dictionary from the reviews under `path`.

    The first line of every ``*.txt`` file in ``path/pos`` and
    ``path/neg`` is tokenized with `tokenize`, lower-cased and split on
    whitespace. Words are ranked by decreasing count; the most frequent
    word gets index 2, since 0 and 1 (UNK) are reserved.

    :param path: directory containing the ``pos`` and ``neg`` folders;
        may be absolute or relative.
    :return: dict mapping each word to its integer index.
    """
    sentences = []
    # Glob with the directory prefix instead of chdir-ing into it: chained
    # chdir calls break for a relative `path` and leave the process in the
    # wrong directory if reading a file fails.
    for label in ('pos', 'neg'):
        pattern = os.path.join('%s/%s/' % (path, label), "*.txt")
        for ff in glob.glob(pattern):
            with open(ff, 'r') as f:
                sentences.append(f.readline().strip())

    sentences = tokenize(sentences)

    print('Building dictionary..')
    wordcount = dict()
    for ss in sentences:
        for w in ss.strip().lower().split():
            wordcount[w] = wordcount.get(w, 0) + 1

    # Materialise both views once so they can be indexed in parallel.
    counts = list(wordcount.values())
    keys = list(wordcount.keys())

    sorted_idx = numpy.argsort(counts)[::-1]

    worddict = dict()
    for idx, ss in enumerate(sorted_idx):
        worddict[keys[ss]] = idx + 2  # leave 0 and 1 (UNK)

    print('%s  total words  %s  unique words' % (numpy.sum(counts),
                                                 len(keys)))

    return worddict
class LogisticRegression(object):
    """Multi-class logistic regression with a single flat parameter vector.

    The model is described by a weight matrix W and a bias vector b, both
    of which are views into one shared vector `theta`. Classification
    projects data points onto a set of hyperplanes; the distance to them
    determines a class membership probability.
    """

    def __init__(self, input, n_in, n_out):
        """Initialize the parameters of the logistic regression.

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoint lies

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the target lies
        """
        n_weights = n_in * n_out

        # theta = (W, b) flattened: n_in * n_out weights followed by n_out
        # biases, all starting at zero.
        initial_theta = numpy.zeros(n_weights + n_out,
                                    dtype=theano.config.floatX)
        self.theta = theano.shared(value=initial_theta, name='theta',
                                   borrow=True)

        # W is the first n_in * n_out entries, viewed as a matrix
        self.W = self.theta[0:n_weights].reshape((n_in, n_out))
        # b is the remaining n_out entries
        self.b = self.theta[n_weights:n_weights + n_out]

        # symbolic class-membership probabilities
        activation = T.dot(input, self.W) + self.b
        self.p_y_given_x = T.nnet.softmax(activation)

        # symbolic prediction: the class with maximal probability
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)

    def negative_log_likelihood(self, y):
        """Return the mean negative log-likelihood of the targets `y`.

        For each example the log-probability assigned to its correct label
        is selected; the result is minus the mean of those values.

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label
        """
        log_probs = T.log(self.p_y_given_x)
        rows = T.arange(y.shape[0])
        return -T.mean(log_probs[rows, y])

    def errors(self, y):
        """Return the fraction of examples in the minibatch that are
        misclassified.

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example
                  the correct label
        :raises TypeError: if `y` and the predictions differ in `ndim`
        :raises NotImplementedError: if `y` is not of an integer dtype
        """
        # y must have the same number of dimensions as y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError(
                'y should have the same shape as self.y_pred',
                ('y', y.type, 'y_pred', self.y_pred.type)
            )
        # only integer labels are supported
        if not y.dtype.startswith('int'):
            raise NotImplementedError()
        # T.neq yields 1 for every wrong prediction and 0 otherwise
        return T.mean(T.neq(self.y_pred, y))
:type n_epochs: int :param n_epochs: number of epochs to run the optimizer :type mnist_pkl_gz: string :param mnist_pkl_gz: the path of the mnist training file from http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz """ ############# # LOAD DATA # ############# datasets = load_data(mnist_pkl_gz) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] batch_size = 600 # size of the minibatch n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size n_in = 28 * 28 # number of input units n_out = 10 # number of output units ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # allocate symbolic variables for the data minibatch_offset = T.lscalar() # offset to the start of a [mini]batch x = T.matrix() # the data is presented as rasterized images y = T.ivector() # the labels are presented as 1D vector of # [int] labels # construct the logistic regression class classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10) # the cost we minimize during training is the negative log likelihood of # the model in symbolic format cost = classifier.negative_log_likelihood(y).mean() # compile a theano function that computes the mistakes that are made by # the model on a minibatch test_model = theano.function( [minibatch_offset], classifier.errors(y), givens={ x: test_set_x[minibatch_offset:minibatch_offset + batch_size], y: test_set_y[minibatch_offset:minibatch_offset + batch_size] }, name="test" ) validate_model = theano.function( [minibatch_offset], classifier.errors(y), givens={ x: valid_set_x[minibatch_offset: minibatch_offset + batch_size], y: valid_set_y[minibatch_offset: minibatch_offset + batch_size] }, name="validate" ) # compile a theano function that returns the cost of a minibatch 
batch_cost = theano.function( [minibatch_offset], cost, givens={ x: train_set_x[minibatch_offset: minibatch_offset + batch_size], y: train_set_y[minibatch_offset: minibatch_offset + batch_size] }, name="batch_cost" ) # compile a theano function that returns the gradient of the minibatch # with respect to theta batch_grad = theano.function( [minibatch_offset], T.grad(cost, classifier.theta), givens={ x: train_set_x[minibatch_offset: minibatch_offset + batch_size], y: train_set_y[minibatch_offset: minibatch_offset + batch_size] }, name="batch_grad" ) # creates a function that computes the average cost on the training set def train_fn(theta_value): classifier.theta.set_value(theta_value, borrow=True) train_losses = [batch_cost(i * batch_size) for i in xrange(n_train_batches)] return numpy.mean(train_losses) # creates a function that computes the average gradient of cost with # respect to theta def train_fn_grad(theta_value): classifier.theta.set_value(theta_value, borrow=True) grad = batch_grad(0) for i in xrange(1, n_train_batches): grad += batch_grad(i * batch_size) return grad / n_train_batches validation_scores = [numpy.inf, 0] # creates the validation function def callback(theta_value): classifier.theta.set_value(theta_value, borrow=True) #compute the validation loss validation_losses = [validate_model(i * batch_size) for i in xrange(n_valid_batches)] this_validation_loss = numpy.mean(validation_losses) print('validation error %f %%' % (this_validation_loss * 100.,)) # check if it is better then best validation score got until now if this_validation_loss < validation_scores[0]: # if so, replace the old one, and compute the score on the # testing dataset validation_scores[0] = this_validation_loss test_losses = [test_model(i * batch_size) for i in xrange(n_test_batches)] validation_scores[1] = numpy.mean(test_losses) ############### # TRAIN MODEL # ############### # using scipy conjugate gradient optimizer import scipy.optimize print ("Optimizing using 
class LogisticRegression(object):
    """Multi-class logistic regression.

    The model is fully described by a weight matrix W and a bias vector
    b. Classification projects data points onto a set of hyperplanes; the
    distance to them determines a class membership probability.
    """

    def __init__(self, input, n_in, n_out):
        """Initialize the parameters of the logistic regression.

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie
        """
        # start-snippet-1
        float_type = theano.config.floatX

        # weights W: zero matrix of shape (n_in, n_out)
        zero_weights = numpy.zeros((n_in, n_out), dtype=float_type)
        self.W = theano.shared(value=zero_weights, name='W', borrow=True)

        # biases b: zero vector of n_out entries
        zero_biases = numpy.zeros((n_out,), dtype=float_type)
        self.b = theano.shared(value=zero_biases, name='b', borrow=True)

        # Matrix of class-membership probabilities. Column k of W is the
        # separating hyperplane of class k, row j of `input` is training
        # sample j, and element k of b is the offset of hyperplane k.
        activation = T.dot(input, self.W) + self.b
        self.p_y_given_x = T.nnet.softmax(activation)

        # prediction: the class whose probability is maximal
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)
        # end-snippet-1

        # parameters of the model
        self.params = [self.W, self.b]

    def negative_log_likelihood(self, y):
        """Return the mean negative log-likelihood of the targets `y`.

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label

        Note: the mean is used instead of the sum so that the learning
              rate is less dependent on the batch size.
        """
        # start-snippet-2
        # With n = y.shape[0] examples, `rows` is [0, 1, ..., n-1] and
        # `log_probs` has one row per example and one column per class, so
        # log_probs[rows, y] picks the log-probability of the correct label
        # of each example; the cost is minus the mean of those entries.
        log_probs = T.log(self.p_y_given_x)
        rows = T.arange(y.shape[0])
        return -T.mean(log_probs[rows, y])
        # end-snippet-2

    def errors(self, y):
        """Return the zero-one loss averaged over the minibatch, i.e. the
        fraction of examples that are misclassified.

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label
        :raises TypeError: if `y` and the predictions differ in `ndim`
        :raises NotImplementedError: if `y` is not of an integer dtype
        """
        # y must have the same number of dimensions as y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError(
                'y should have the same shape as self.y_pred',
                ('y', y.type, 'y_pred', self.y_pred.type)
            )
        # only integer labels are supported
        if not y.dtype.startswith('int'):
            raise NotImplementedError()
        # T.neq yields 1 for every wrong prediction and 0 otherwise
        return T.mean(T.neq(self.y_pred, y))
def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000,
                           dataset='mnist.pkl.gz',
                           batch_size=600):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    :type batch_size: int
    :param batch_size: number of examples per minibatch; trailing examples
                       that do not fill a whole minibatch are not used
    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    # (explicit floor division: these are used as loop bounds)
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # generate symbolic variables for input (x and y represent a
    # minibatch)
    x = T.matrix('x')  # data, presented as rasterized images
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    # construct the logistic regression class
    # Each MNIST image has size 28*28
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # compute the gradient of cost with respect to theta = (W,b)
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # start-snippet-3
    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # compiling a Theano function `train_model` that returns the cost, but in
    # the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # end-snippet-3

    ###############
    # TRAIN MODEL #
    ###############
    print('... training the model')
    # early-stopping parameters
    patience = 5000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = time.clock()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            # called for its parameter update; the returned cost is unused
            train_model(minibatch_index)
            # global count of minibatches processed before this one
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if (iteration + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if this_validation_loss < (best_validation_loss *
                                                improvement_threshold):
                        patience = max(patience,
                                       iteration * patience_increase)

                    best_validation_loss = this_validation_loss
                    # test it on the test set
                    test_losses = [test_model(i)
                                   for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print(
                        (
                            '     epoch %i, minibatch %i/%i, test error of'
                            ' best model %f %%'
                        ) %
                        (
                            epoch,
                            minibatch_index + 1,
                            n_train_batches,
                            test_score * 100.
                        )
                    )

            if patience <= iteration:
                done_looping = True
                break

    end_time = time.clock()
    print(
        (
            'Optimization complete with best validation score of %f %%,'
            ' with test performance %f %%'
        )
        % (best_validation_loss * 100., test_score * 100.)
    )
    print('The code run for %d epochs, with %f epochs/sec' % (
        epoch, 1. * epoch / (end_time - start_time)))
    sys.stderr.write('The code for file ' +
                     os.path.split(__file__)[1] +
                     ' ran for %.1fs' % ((end_time - start_time)) + '\n')
""" idx_list = numpy.arange(n, dtype="int32") if shuffle: random.shuffle(idx_list) minibatches = [] minibatch_start = 0 for i in range(n // minibatch_size): minibatches.append(idx_list[minibatch_start: minibatch_start + minibatch_size]) minibatch_start += minibatch_size if (minibatch_start != n): # Make a minibatch out of what is left minibatches.append(idx_list[minibatch_start:]) return zip(range(len(minibatches)), minibatches) def get_dataset(name): return datasets[name][0], datasets[name][1] def zipp(params, tparams): """ When we reload the model. Needed for the GPU stuff. """ for kk, vv in params.iteritems(): tparams[kk].set_value(vv) def unzip(zipped): """ When we pickle the model. Needed for the GPU stuff. """ new_params = OrderedDict() for kk, vv in zipped.iteritems(): new_params[kk] = vv.get_value() return new_params def dropout_layer(state_before, use_noise, trng): proj = tensor.switch(use_noise, (state_before * trng.binomial(state_before.shape, p=0.5, n=1, dtype=state_before.dtype)), state_before * 0.5) return proj def _p(pp, name): return '%s_%s' % (pp, name) def init_params(options): """ Global (not LSTM) parameter. For the embeding and the classifier. 
def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
    """
    Build the symbolic graph of one LSTM layer and return its hidden states.

    :param tparams: dict of Theano shared variables; the entries
        ``<prefix>_W``, ``<prefix>_U`` and ``<prefix>_b`` are used.
    :param state_below: symbolic input sequence; its first axis is time and,
        when it is 3-dimensional, its second axis is the sample axis.
    :param options: model options; only ``options['dim_proj']`` is read.
    :param prefix: name prefix of this layer's parameters.
    :param mask: symbolic mask scanned along its first axis together with
        ``state_below``; required.  Where the mask is 0 the previous hidden
        and cell states are carried over unchanged.
    :return: the hidden state sequence (first output of the scan).

    NOTE: the bias is added exactly once, in the input projection below.
    Earlier revisions added it a second time inside every recurrence step,
    so parameters saved by such a revision correspond to twice the bias
    used here.
    """
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    assert mask is not None

    dim_proj = options['dim_proj']

    def _slice(_x, n, dim):
        # Pick the n-th block of width `dim` along the last axis; the four
        # gate pre-activations are stored side by side.
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m_, x_, h_, c_):
        # x_ already holds W.x + b for this time step, so only the
        # recurrent contribution has to be added here.
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_

        i = tensor.nnet.sigmoid(_slice(preact, 0, dim_proj))  # input gate
        f = tensor.nnet.sigmoid(_slice(preact, 1, dim_proj))  # forget gate
        o = tensor.nnet.sigmoid(_slice(preact, 2, dim_proj))  # output gate
        c = tensor.tanh(_slice(preact, 3, dim_proj))          # candidate

        c = f * c_ + i * c
        # Masked positions keep the previous cell state.
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * tensor.tanh(c)
        # Masked positions keep the previous hidden state.
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c

    # Input projection for all time steps at once (bias included here).
    state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
                   tparams[_p(prefix, 'b')])

    rval, _ = theano.scan(_step,
                          sequences=[mask, state_below],
                          outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                     n_samples,
                                                     dim_proj),
                                        tensor.alloc(numpy_floatX(0.),
                                                     n_samples,
                                                     dim_proj)],
                          name=_p(prefix, '_layers'),
                          n_steps=nsteps)
    return rval[0]
def adadelta(lr, tparams, grads, x, mask, y, cost):
    """
    Build the two Theano functions implementing ADADELTA.

    :param lr: symbolic learning-rate input; accepted by ``f_update`` for a
        uniform optimizer interface but not used in the update rule.
    :param tparams: ordered dict of the model's shared parameters.
    :param grads: symbolic gradients of ``cost``, in ``tparams`` order.
    :param x, mask, y: symbolic inputs of the cost.
    :param cost: symbolic cost.
    :return: ``(f_grad_shared, f_update)``.  ``f_grad_shared(x, mask, y)``
        returns the cost and stores the gradients plus the running average
        of squared gradients; ``f_update(lr)`` applies the parameter step
        and refreshes the running average of squared updates.
    """
    def _zero_shared(suffix):
        # One zero-filled shared variable per parameter, same shape.
        return [theano.shared(param.get_value() * numpy_floatX(0.),
                              name='%s_%s' % (key, suffix))
                for key, param in tparams.iteritems()]

    zipped_grads = _zero_shared('grad')
    running_up2 = _zero_shared('rup2')
    running_grads2 = _zero_shared('rgrad2')

    # Phase 1: remember the gradients and update E[g^2].
    store_grads = list(zip(zipped_grads, grads))
    decay_grads2 = []
    for acc, grad in zip(running_grads2, grads):
        decay_grads2.append((acc, 0.95 * acc + 0.05 * (grad ** 2)))

    f_grad_shared = theano.function([x, mask, y], cost,
                                    updates=store_grads + decay_grads2,
                                    name='adadelta_f_grad_shared')

    # Phase 2: step direction scaled by RMS(update) / RMS(gradient).
    updir = []
    for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2):
        updir.append(-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg)

    decay_up2 = [(acc, 0.95 * acc + 0.05 * (step ** 2))
                 for acc, step in zip(running_up2, updir)]
    apply_step = [(param, param + step)
                  for param, step in zip(tparams.values(), updir)]

    f_update = theano.function([lr], [], updates=decay_up2 + apply_step,
                               on_unused_input='ignore',
                               name='adadelta_f_update')

    return f_grad_shared, f_update
def build_model(tparams, options):
    """
    Assemble the symbolic sentiment model and compile its predictors.

    :param tparams: dict of shared parameters; ``Wemb``, ``U`` and ``b`` are
        read here, the encoder reads its own entries.
    :param options: model options; ``encoder``, ``dim_proj`` and
        ``use_dropout`` are read here.
    :return: ``(use_noise, x, mask, y, f_pred_prob, f_pred, cost)`` where
        ``use_noise`` is the shared dropout switch, ``x``/``mask``/``y`` are
        the symbolic inputs, ``f_pred_prob`` and ``f_pred`` are compiled
        functions of ``(x, mask)`` giving class probabilities and the argmax
        class, and ``cost`` is the symbolic mean negative log-likelihood.
    """
    # Random stream feeding the dropout layer.
    trng = RandomStreams(1234)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    x = tensor.matrix('x', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    y = tensor.vector('y', dtype='int64')

    n_timesteps, n_samples = x.shape[0], x.shape[1]
    encoder_name = options['encoder']

    # Look every word index up in the embedding matrix, then restore the
    # (time, sample, dim_proj) layout.
    embedded = tparams['Wemb'][x.flatten()]
    embedded = embedded.reshape([n_timesteps,
                                 n_samples,
                                 options['dim_proj']])

    encode = get_layer(encoder_name)[1]
    proj = encode(tparams, embedded, options,
                  prefix=encoder_name,
                  mask=mask)
    if encoder_name == 'lstm':
        # Mean of the hidden states over the unmasked time steps.
        summed = (proj * mask[:, :, None]).sum(axis=0)
        proj = summed / mask.sum(axis=0)[:, None]
    if options['use_dropout']:
        proj = dropout_layer(proj, use_noise, trng)

    scores = tensor.dot(proj, tparams['U']) + tparams['b']
    pred = tensor.nnet.softmax(scores)

    f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob')
    f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred')

    # Probability assigned to the true class of each sample; the small
    # constant keeps the log finite.
    true_class_prob = pred[tensor.arange(n_samples), y]
    cost = -tensor.log(true_class_prob + 1e-8).mean()

    return use_noise, x, mask, y, f_pred_prob, f_pred, cost
""" n_samples = len(data[0]) probs = numpy.zeros((n_samples, 2)).astype(config.floatX) n_done = 0 for _, valid_index in iterator: x, mask, y = prepare_data([data[0][t] for t in valid_index], numpy.array(data[1])[valid_index], maxlen=None) pred_probs = f_pred_prob(x, mask) probs[valid_index, :] = pred_probs n_done += len(valid_index) if verbose: print '%d/%d samples classified' % (n_done, n_samples) return probs def pred_error(f_pred, prepare_data, data, iterator, verbose=False): """ Just compute the error f_pred: Theano fct computing the prediction prepare_data: usual prepare_data for that dataset. """ valid_err = 0 for _, valid_index in iterator: x, mask, y = prepare_data([data[0][t] for t in valid_index], numpy.array(data[1])[valid_index], maxlen=None) preds = f_pred(x, mask) targets = numpy.array(data[1])[valid_index] valid_err += (preds == targets).sum() valid_err = 1. - numpy_floatX(valid_err) / len(data[0]) return valid_err def train_lstm( dim_proj=128, # word embeding dimension and LSTM number of hidden units. patience=10, # Number of epoch to wait before early stop if no progress max_epochs=5000, # The maximum number of epoch to run dispFreq=10, # Display to stdout the training progress every N updates decay_c=0., # Weight decay for the classifier applied to the U weights. lrate=0.0001, # Learning rate for sgd (not used for adadelta and rmsprop) n_words=10000, # Vocabulary size optimizer=adadelta, # sgd, adadelta and rmsprop available, sgd very hard to use, not recommanded (probably need momentum and decaying learning rate). encoder='lstm', # TODO: can be removed must be lstm. saveto='lstm_model.npz', # The best model will be saved there validFreq=370, # Compute the validation error after this number of update. saveFreq=1110, # Save the parameters after every saveFreq updates maxlen=100, # Sequence longer then this get ignored batch_size=16, # The batch size during training. valid_batch_size=64, # The batch size used for validation/test set. 
dataset='imdb', # Parameter for extra option noise_std=0., use_dropout=True, # if False slightly faster, but worst test error # This frequently need a bigger model. reload_model="", # Path to a saved model we want to start from. test_size=-1, # If >0, we keep only this number of test example. ): # Model options model_options = locals().copy() print "model options", model_options load_data, prepare_data = get_dataset(dataset) print 'Loading data' train, valid, test = load_data(n_words=n_words, valid_portion=0.05, maxlen=maxlen) if test_size > 0: # The test set is sorted by size, but we want to keep random # size example. So we must select a random selection of the # examples. idx = numpy.arange(len(test[0])) random.shuffle(idx) idx = idx[:test_size] test = ([test[0][n] for n in idx], [test[1][n] for n in idx]) ydim = numpy.max(train[1]) + 1 model_options['ydim'] = ydim print 'Building model' # This create the initial parameters as numpy ndarrays. # Dict name (string) -> numpy ndarray params = init_params(model_options) if reload_model: load_params('lstm_model.npz', params) # This create Theano Shared Variable from the parameters. # Dict name (string) -> Theano Tensor Shared Variable # params and tparams have different copy of the weights. tparams = init_tparams(params) # use_noise is for dropout (use_noise, x, mask, y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options) if decay_c > 0.: decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c') weight_decay = 0. 
weight_decay += (tparams['U'] ** 2).sum() weight_decay *= decay_c cost += weight_decay f_cost = theano.function([x, mask, y], cost, name='f_cost') grads = tensor.grad(cost, wrt=tparams.values()) f_grad = theano.function([x, mask, y], grads, name='f_grad') lr = tensor.scalar(name='lr') f_grad_shared, f_update = optimizer(lr, tparams, grads, x, mask, y, cost) print 'Optimization' kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size) kf_test = get_minibatches_idx(len(test[0]), valid_batch_size) print "%d train examples" % len(train[0]) print "%d valid examples" % len(valid[0]) print "%d test examples" % len(test[0]) history_errs = [] best_p = None bad_count = 0 if validFreq == -1: validFreq = len(train[0]) / batch_size if saveFreq == -1: saveFreq = len(train[0]) / batch_size uidx = 0 # the number of update done estop = False # early stop start_time = time.clock() try: for eidx in xrange(max_epochs): n_samples = 0 # Get new shuffled index for the training set. kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True) for _, train_index in kf: uidx += 1 use_noise.set_value(1.) # Select the random examples for this minibatch y = [train[1][t] for t in train_index] x = [train[0][t]for t in train_index] # Get the data in numpy.ndarray format # This swap the axis! # Return something of shape (minibatch maxlen, n samples) x, mask, y = prepare_data(x, y) n_samples += x.shape[1] cost = f_grad_shared(x, mask, y) f_update(lrate) if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return 1., 1., 1. if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost if saveto and numpy.mod(uidx, saveFreq) == 0: print 'Saving...', if best_p is not None: params = best_p else: params = unzip(tparams) numpy.savez(saveto, history_errs=history_errs, **params) pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1) print 'Done' if numpy.mod(uidx, validFreq) == 0: use_noise.set_value(0.) 
train_err = pred_error(f_pred, prepare_data, train, kf) valid_err = pred_error(f_pred, prepare_data, valid, kf_valid) test_err = pred_error(f_pred, prepare_data, test, kf_test) history_errs.append([valid_err, test_err]) if (uidx == 0 or valid_err <= numpy.array(history_errs)[:, 0].min()): best_p = unzip(tparams) bad_counter = 0 print ('Train ', train_err, 'Valid ', valid_err, 'Test ', test_err) if (len(history_errs) > patience and valid_err >= numpy.array(history_errs)[:-patience, 0].min()): bad_counter += 1 if bad_counter > patience: print 'Early Stop!' estop = True break print 'Seen %d samples' % n_samples if estop: break except KeyboardInterrupt: print "Training interupted" end_time = time.clock() if best_p is not None: zipp(best_p, tparams) else: best_p = unzip(tparams) use_noise.set_value(0.) kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size) train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted) valid_err = pred_error(f_pred, prepare_data, valid, kf_valid) test_err = pred_error(f_pred, prepare_data, test, kf_test) print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err if saveto: numpy.savez(saveto, train_err=train_err, valid_err=valid_err, test_err=test_err, history_errs=history_errs, **best_p) print 'The code run for %d epochs, with %f sec/epochs' % ( (eidx + 1), (end_time - start_time) / (1. * (eidx + 1))) print >> sys.stderr, ('Training took %.1fs' % (end_time - start_time)) return train_err, valid_err, test_err if __name__ == '__main__': # See function train for all possible parameter and there definition. train_lstm( #reload_model="lstm_model.npz", max_epochs=100, test_size=500, ) ================================================ FILE: DeepLearningTutorials/code/mlp.py ================================================ """ This tutorial introduces the multilayer perceptron using Theano. 
class HiddenLayer(object):
    def __init__(self, rng, input, n_in, n_out, W=None, b=None,
                 activation=T.tanh):
        """
        Fully-connected hidden layer of an MLP.

        The layer output is ``activation(dot(input, W) + b)``, or the plain
        affine map when ``activation`` is None.  ``W`` has shape
        (n_in, n_out) and ``b`` has shape (n_out,).

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.dmatrix
        :param input: a symbolic tensor of shape (n_examples, n_in)

        :type n_in: int
        :param n_in: dimensionality of input

        :type n_out: int
        :param n_out: number of hidden units

        :param W: optional shared weight matrix to reuse; created here
                  when None

        :param b: optional shared bias vector to reuse; created here
                  when None

        :type activation: theano.Op or function
        :param activation: Non linearity to be applied in the hidden
                           layer
        """
        self.input = input
        # end-snippet-1

        if W is None:
            # Weights are drawn uniformly from [-bound, bound] with
            # bound = sqrt(6 / (n_in + n_out)), the range used for tanh;
            # for the sigmoid the values are made 4 times larger
            # (see [Xavier10]).  Other activations use the tanh range.
            # The array is cast to theano.config.floatX so that the code
            # is runnable on GPU.
            bound = numpy.sqrt(6. / (n_in + n_out))
            W_values = numpy.asarray(
                rng.uniform(low=-bound, high=bound, size=(n_in, n_out)),
                dtype=theano.config.floatX
            )
            if activation == theano.tensor.nnet.sigmoid:
                W_values *= 4

            W = theano.shared(value=W_values, name='W', borrow=True)

        if b is None:
            # Biases start at zero.
            b = theano.shared(
                value=numpy.zeros((n_out,), dtype=theano.config.floatX),
                name='b',
                borrow=True
            )

        self.W = W
        self.b = b

        lin_output = T.dot(input, self.W) + self.b
        if activation is None:
            self.output = lin_output
        else:
            self.output = activation(lin_output)

        # parameters of the model
        self.params = [self.W, self.b]
""" def __init__(self, rng, input, n_in, n_hidden, n_out): """Initialize the parameters for the multilayer perceptron :type rng: numpy.random.RandomState :param rng: a random number generator used to initialize weights :type input: theano.tensor.TensorType :param input: symbolic variable that describes the input of the architecture (one minibatch) :type n_in: int :param n_in: number of input units, the dimension of the space in which the datapoints lie :type n_hidden: int :param n_hidden: number of hidden units :type n_out: int :param n_out: number of output units, the dimension of the space in which the labels lie """ # Since we are dealing with a one hidden layer MLP, this will translate # into a HiddenLayer with a tanh activation function connected to the # LogisticRegression layer; the activation function can be replaced by # sigmoid or any other nonlinear function self.hiddenLayer = HiddenLayer( rng=rng, input=input, n_in=n_in, n_out=n_hidden, activation=T.tanh ) # The logistic regression layer gets as input the hidden units # of the hidden layer self.logRegressionLayer = LogisticRegression( input=self.hiddenLayer.output, n_in=n_hidden, n_out=n_out ) # end-snippet-2 start-snippet-3 # L1 norm ; one regularization option is to enforce L1 norm to # be small self.L1 = ( abs(self.hiddenLayer.W).sum() + abs(self.logRegressionLayer.W).sum() ) # square of L2 norm ; one regularization option is to enforce # square of L2 norm to be small self.L2_sqr = ( (self.hiddenLayer.W ** 2).sum() + (self.logRegressionLayer.W ** 2).sum() ) # negative log likelihood of the MLP is given by the negative # log likelihood of the output of the model, computed in the # logistic regression layer self.negative_log_likelihood = ( self.logRegressionLayer.negative_log_likelihood ) # same holds for the function computing the number of errors self.errors = self.logRegressionLayer.errors # the parameters of the model are the parameters of the two layer it is # made out of self.params = 
def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             dataset='mnist.pkl.gz', batch_size=20, n_hidden=500):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor applied to the
                          stochastic gradient in each parameter update)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
                   regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
                   regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    :type batch_size: int
    :param batch_size: number of examples in each minibatch

    :type n_hidden: int
    :param n_hidden: number of units in the hidden layer
    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels

    # fixed seed, so weight initialization is the same on every run
    rng = numpy.random.RandomState(1234)

    # construct the MLP class
    classifier = MLP(
        rng=rng,
        input=x,
        n_in=28 * 28,
        n_hidden=n_hidden,
        n_out=10
    )

    # start-snippet-4
    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = (
        classifier.negative_log_likelihood(y)
        + L1_reg * classifier.L1
        + L2_reg * classifier.L2_sqr
    )
    # end-snippet-4

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch; `givens` substitutes the index-th slice
    # of the shared dataset for the symbolic inputs x and y
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        }
    )

    # start-snippet-5
    # compute the gradient of cost with respect to theta (stored in params)
    # the resulting gradients will be stored in a list gparams
    gparams = [T.grad(cost, param) for param in classifier.params]

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs: one plain gradient-descent step
    # per parameter, pairing each parameter with its gradient via zip
    updates = [
        (param, param - learning_rate * gparam)
        for param, gparam in zip(classifier.params, gparams)
    ]

    # compiling a Theano function `train_model` that returns the cost and,
    # at the same time, updates the parameters of the model based on the
    # rules defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # end-snippet-5

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # early-stopping parameters
    patience = 10000  # run at least this many minibatch iterations
    patience_increase = 2  # on a significant new best, patience becomes at
                           # least iter * patience_increase
    improvement_threshold = 0.995  # a new best counts as significant when
                                   # it is below this fraction of the
                                   # previous best validation loss
    validation_frequency = min(n_train_batches, patience / 2)
                                  # number of minibatches between two
                                  # evaluations on the validation set;
                                  # at most one epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number: zero-based count of minibatch updates
            # performed before this one, across all epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if (
                        this_validation_loss < best_validation_loss *
                        improvement_threshold
                    ):
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set; the test score is only
                    # refreshed when the validation loss improves
                    test_losses = [test_model(i) for i
                                   in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print((' epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            # checked after every minibatch: stop once the iteration count
            # reaches the current patience
            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Defines the parameters of the model along with basic operations for inferring hidden from visible (and vice-versa), as well as for performing CD updates. :param input: None for standalone RBMs or symbolic variable if RBM is part of a larger graph. :param n_visible: number of visible units :param n_hidden: number of hidden units :param W: None for standalone RBMs or symbolic variable pointing to a shared weight matrix in case RBM is part of a DBN network; in a DBN, the weights are shared between RBMs and layers of a MLP :param hbias: None for standalone RBMs or symbolic variable pointing to a shared hidden units bias vector in case RBM is part of a different network :param vbias: None for standalone RBMs or a symbolic variable pointing to a shared visible units bias """ self.n_visible = n_visible self.n_hidden = n_hidden if numpy_rng is None: # create a number generator numpy_rng = numpy.random.RandomState(1234) if theano_rng is None: theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) if W is None: # W is initialized with `initial_W` which is uniformely # sampled from -4*sqrt(6./(n_visible+n_hidden)) and # 4*sqrt(6./(n_hidden+n_visible)) the output of uniform if # converted using asarray to dtype theano.config.floatX so # that the code is runable on GPU initial_W = numpy.asarray( numpy_rng.uniform( low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)), high=4 * numpy.sqrt(6. 
/ (n_hidden + n_visible)), size=(n_visible, n_hidden) ), dtype=theano.config.floatX ) # theano shared variables for weights and biases W = theano.shared(value=initial_W, name='W', borrow=True) if hbias is None: # create shared variable for hidden units bias hbias = theano.shared( value=numpy.zeros( n_hidden, dtype=theano.config.floatX ), name='hbias', borrow=True ) if vbias is None: # create shared variable for visible units bias vbias = theano.shared( value=numpy.zeros( n_visible, dtype=theano.config.floatX ), name='vbias', borrow=True ) # initialize input layer for standalone RBM or layer0 of DBN self.input = input if not input: self.input = T.matrix('input') self.W = W self.hbias = hbias self.vbias = vbias self.theano_rng = theano_rng # **** WARNING: It is not a good idea to put things in this list # other than shared variables created in this function. self.params = [self.W, self.hbias, self.vbias] # end-snippet-1 def free_energy(self, v_sample): ''' Function to compute the free energy ''' wx_b = T.dot(v_sample, self.W) + self.hbias vbias_term = T.dot(v_sample, self.vbias) hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1) return -hidden_term - vbias_term def propup(self, vis): '''This function propagates the visible units activation upwards to the hidden units Note that we return also the pre-sigmoid activation of the layer. 
As it will turn out later, due to how Theano deals with optimizations, this symbolic variable will be needed to write down a more stable computational graph (see details in the reconstruction cost function) ''' pre_sigmoid_activation = T.dot(vis, self.W) + self.hbias return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)] def sample_h_given_v(self, v0_sample): ''' This function infers state of hidden units given visible units ''' # compute the activation of the hidden units given a sample of # the visibles pre_sigmoid_h1, h1_mean = self.propup(v0_sample) # get a sample of the hiddens given their activation # Note that theano_rng.binomial returns a symbolic sample of dtype # int64 by default. If we want to keep our computations in floatX # for the GPU we need to specify to return the dtype floatX h1_sample = self.theano_rng.binomial(size=h1_mean.shape, n=1, p=h1_mean, dtype=theano.config.floatX) return [pre_sigmoid_h1, h1_mean, h1_sample] def propdown(self, hid): '''This function propagates the hidden units activation downwards to the visible units Note that we return also the pre_sigmoid_activation of the layer. As it will turn out later, due to how Theano deals with optimizations, this symbolic variable will be needed to write down a more stable computational graph (see details in the reconstruction cost function) ''' pre_sigmoid_activation = T.dot(hid, self.W.T) + self.vbias return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)] def sample_v_given_h(self, h0_sample): ''' This function infers state of visible units given hidden units ''' # compute the activation of the visible given the hidden sample pre_sigmoid_v1, v1_mean = self.propdown(h0_sample) # get a sample of the visible given their activation # Note that theano_rng.binomial returns a symbolic sample of dtype # int64 by default. 
def get_cost_updates(self, lr=0.1, persistent=None, k=1):
    """This function implements one step of CD-k or PCD-k

    :param lr: learning rate used to train the RBM

    :param persistent: None for CD. For PCD, shared variable
        containing old state of Gibbs chain. This must be a shared
        variable of size (batch size, number of hidden units).

    :param k: number of Gibbs steps to do in CD-k/PCD-k

    Returns a proxy for the cost and the updates dictionary. The
    dictionary contains the update rules for weights and biases but
    also an update of the shared variable used to store the persistent
    chain, if one is used.

    """

    # compute positive phase
    pre_sigmoid_ph, ph_mean, ph_sample = self.sample_h_given_v(self.input)

    # decide how to initialize persistent chain:
    # for CD, we use the newly generate hidden sample
    # for PCD, we initialize from the old state of the chain
    use_pcd = persistent is not None
    if use_pcd:
        chain_start = persistent
    else:
        chain_start = ph_sample
    # end-snippet-2
    # perform actual negative phase
    # in order to implement CD-k/PCD-k we need to scan over the
    # function that implements one gibbs step k times.
    # Read Theano tutorial on scan for more information :
    # http://deeplearning.net/software/theano/library/scan.html
    # the scan will return the entire Gibbs chain
    (
        [
            pre_sigmoid_nvs,
            nv_means,
            nv_samples,
            pre_sigmoid_nhs,
            nh_means,
            nh_samples
        ],
        updates
    ) = theano.scan(
        self.gibbs_hvh,
        # the None are place holders, saying that
        # chain_start is the initial state corresponding to the
        # 6th output
        outputs_info=[None, None, None, None, None, chain_start],
        n_steps=k
    )
    # start-snippet-3
    # determine gradients on RBM parameters
    # note that we only need the sample at the end of the chain
    chain_end = nv_samples[-1]

    cost = T.mean(self.free_energy(self.input)) - T.mean(
        self.free_energy(chain_end))
    # We must not compute the gradient through the gibbs sampling
    gparams = T.grad(cost, self.params, consider_constant=[chain_end])
    # end-snippet-3 start-snippet-4
    # constructs the update dictionary
    # make sure that the learning rate is of the right dtype; the cast does
    # not depend on the parameter, so it is built once outside the loop
    lr_cast = T.cast(lr, dtype=theano.config.floatX)
    for gparam, param in zip(gparams, self.params):
        updates[param] = param - gparam * lr_cast
    # Test against None explicitly (same test as for chain_start above)
    # instead of relying on the truth value of a symbolic variable.
    if use_pcd:
        # Note that this works only if persistent is a shared variable
        updates[persistent] = nh_samples[-1]
        # pseudo-likelihood is a better proxy for PCD
        monitoring_cost = self.get_pseudo_likelihood_cost(updates)
    else:
        # reconstruction cross-entropy is a better proxy for CD
        monitoring_cost = self.get_reconstruction_cost(updates,
                                                       pre_sigmoid_nvs[-1])

    return monitoring_cost, updates
    # end-snippet-4
get_pseudo_likelihood_cost(self, updates): """Stochastic approximation to the pseudo-likelihood""" # index of bit i in expression p(x_i | x_{\i}) bit_i_idx = theano.shared(value=0, name='bit_i_idx') # binarize the input image by rounding to nearest integer xi = T.round(self.input) # calculate free energy for the given bit configuration fe_xi = self.free_energy(xi) # flip bit x_i of matrix xi and preserve all other bits x_{\i} # Equivalent to xi[:,bit_i_idx] = 1-xi[:, bit_i_idx], but assigns # the result to xi_flip, instead of working in place on xi. xi_flip = T.set_subtensor(xi[:, bit_i_idx], 1 - xi[:, bit_i_idx]) # calculate free energy with bit flipped fe_xi_flip = self.free_energy(xi_flip) # equivalent to e^(-FE(x_i)) / (e^(-FE(x_i)) + e^(-FE(x_{\i}))) cost = T.mean(self.n_visible * T.log(T.nnet.sigmoid(fe_xi_flip - fe_xi))) # increment bit_i_idx % number as part of updates updates[bit_i_idx] = (bit_i_idx + 1) % self.n_visible return cost def get_reconstruction_cost(self, updates, pre_sigmoid_nv): """Approximation to the reconstruction error Note that this function requires the pre-sigmoid activation as input. To understand why this is so you need to understand a bit about how Theano works. Whenever you compile a Theano function, the computational graph that you pass as input gets optimized for speed and stability. This is done by changing several parts of the subgraphs with others. One such optimization expresses terms of the form log(sigmoid(x)) in terms of softplus. We need this optimization for the cross-entropy since sigmoid of numbers larger than 30. (or even less then that) turn to 1. and numbers smaller than -30. turn to 0 which in terms will force theano to compute log(0) and therefore we will get either -inf or NaN as cost. If the value is expressed in terms of softplus we do not get this undesirable behaviour. This optimization usually works fine, but here we have a special case. The sigmoid is applied inside the scan op, while the log is outside. 
Therefore Theano will only see log(scan(..)) instead of log(sigmoid(..)) and will not apply the wanted optimization. We can not go and replace the sigmoid in scan with something else also, because this only needs to be done on the last step. Therefore the easiest and more efficient way is to get also the pre-sigmoid activation as an output of scan, and apply both the log and sigmoid outside scan such that Theano can catch and optimize the expression. """ cross_entropy = T.mean( T.sum( self.input * T.log(T.nnet.sigmoid(pre_sigmoid_nv)) + (1 - self.input) * T.log(1 - T.nnet.sigmoid(pre_sigmoid_nv)), axis=1 ) ) return cross_entropy def test_rbm(learning_rate=0.1, training_epochs=15, dataset='mnist.pkl.gz', batch_size=20, n_chains=20, n_samples=10, output_folder='rbm_plots', n_hidden=500): """ Demonstrate how to train and afterwards sample from it using Theano. This is demonstrated on MNIST. :param learning_rate: learning rate used for training the RBM :param training_epochs: number of epochs used for training :param dataset: path the the pickled dataset :param batch_size: size of a batch used to train the RBM :param n_chains: number of parallel Gibbs chains to be used for sampling :param n_samples: number of samples to plot for each chain """ datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images rng = numpy.random.RandomState(123) theano_rng = RandomStreams(rng.randint(2 ** 30)) # initialize storage for the persistent chain (state = hidden # layer of chain) persistent_chain = theano.shared(numpy.zeros((batch_size, n_hidden), dtype=theano.config.floatX), borrow=True) # construct the RBM class rbm = RBM(input=x, n_visible=28 * 
28, n_hidden=n_hidden, numpy_rng=rng, theano_rng=theano_rng) # get the cost and the gradient corresponding to one step of CD-15 cost, updates = rbm.get_cost_updates(lr=learning_rate, persistent=persistent_chain, k=15) ################################# # Training the RBM # ################################# if not os.path.isdir(output_folder): os.makedirs(output_folder) os.chdir(output_folder) # start-snippet-5 # it is ok for a theano function to have no output # the purpose of train_rbm is solely to update the RBM parameters train_rbm = theano.function( [index], cost, updates=updates, givens={ x: train_set_x[index * batch_size: (index + 1) * batch_size] }, name='train_rbm' ) plotting_time = 0. start_time = time.clock() # go through training epochs for epoch in xrange(training_epochs): # go through the training set mean_cost = [] for batch_index in xrange(n_train_batches): mean_cost += [train_rbm(batch_index)] print 'Training epoch %d, cost is ' % epoch, numpy.mean(mean_cost) # Plot filters after each training epoch plotting_start = time.clock() # Construct image from the weight matrix image = Image.fromarray( tile_raster_images( X=rbm.W.get_value(borrow=True).T, img_shape=(28, 28), tile_shape=(10, 10), tile_spacing=(1, 1) ) ) image.save('filters_at_epoch_%i.png' % epoch) plotting_stop = time.clock() plotting_time += (plotting_stop - plotting_start) end_time = time.clock() pretraining_time = (end_time - start_time) - plotting_time print ('Training took %f minutes' % (pretraining_time / 60.)) # end-snippet-5 start-snippet-6 ################################# # Sampling from the RBM # ################################# # find out the number of test samples number_of_test_samples = test_set_x.get_value(borrow=True).shape[0] # pick random test examples, with which to initialize the persistent chain test_idx = rng.randint(number_of_test_samples - n_chains) persistent_vis_chain = theano.shared( numpy.asarray( test_set_x.get_value(borrow=True)[test_idx:test_idx + n_chains], 
dtype=theano.config.floatX ) ) # end-snippet-6 start-snippet-7 plot_every = 1000 # define one step of Gibbs sampling (mf = mean-field) define a # function that does `plot_every` steps before returning the # sample for plotting ( [ presig_hids, hid_mfs, hid_samples, presig_vis, vis_mfs, vis_samples ], updates ) = theano.scan( rbm.gibbs_vhv, outputs_info=[None, None, None, None, None, persistent_vis_chain], n_steps=plot_every ) # add to updates the shared variable that takes care of our persistent # chain :. updates.update({persistent_vis_chain: vis_samples[-1]}) # construct the function that implements our persistent chain. # we generate the "mean field" activations for plotting and the actual # samples for reinitializing the state of our persistent chain sample_fn = theano.function( [], [ vis_mfs[-1], vis_samples[-1] ], updates=updates, name='sample_fn' ) # create a space to store the image for plotting ( we need to leave # room for the tile_spacing as well) image_data = numpy.zeros( (29 * n_samples + 1, 29 * n_chains - 1), dtype='uint8' ) for idx in xrange(n_samples): # generate `plot_every` intermediate samples that we discard, # because successive samples in the chain are too correlated vis_mf, vis_sample = sample_fn() print ' ... plotting sample ', idx image_data[29 * idx:29 * idx + 28, :] = tile_raster_images( X=vis_mf, img_shape=(28, 28), tile_shape=(1, n_chains), tile_spacing=(1, 1) ) # construct image image = Image.fromarray(image_data) image.save('samples.png') # end-snippet-7 os.chdir('../') if __name__ == '__main__': test_rbm() ================================================ FILE: DeepLearningTutorials/code/rnnrbm.py ================================================ # Author: Nicolas Boulanger-Lewandowski # University of Montreal (2012) # RNN-RBM deep learning tutorial # More information at http://deeplearning.net/tutorial/rnnrbm.html import glob import os import sys import numpy try: import pylab except ImportError: print ( "pylab isn't available. 
def build_rbm(v, W, bv, bh, k):
    '''Build a k-step Gibbs chain that starts at `v` for an RBM.

    v : Theano vector or matrix
        Start of the chain; with a matrix, several chains are run in
        parallel (batch).
    W : Theano matrix
        Weight matrix of the RBM.
    bv : Theano vector
        Visible bias vector of the RBM.
    bh : Theano vector
        Hidden bias vector of the RBM.
    k : scalar or Theano scalar
        Length of the Gibbs chain.

    Return a (v_sample, cost, monitor, updates) tuple:

    v_sample : Theano vector or matrix with the same shape as `v`
        The generated sample(s), i.e. the last state of the chain.
    cost : Theano scalar
        Free-energy difference between `v` and `v_sample` divided by
        `v.shape[0]`; its gradient with respect to W, bv, bh is the CD-k
        approximation to the log-likelihood gradient.
    monitor : Theano scalar
        Pseudo log-likelihood, also divided by `v.shape[0]`.
    updates : dictionary of Theano variable -> Theano variable
        The `updates` object returned by scan.'''

    def gibbs_step(visible):
        # one full visible -> hidden -> visible sweep; the two binomial
        # draws must stay in this order
        hidden_mean = T.nnet.sigmoid(T.dot(visible, W) + bh)
        hidden = rng.binomial(size=hidden_mean.shape, n=1, p=hidden_mean,
                              dtype=theano.config.floatX)
        visible_mean = T.nnet.sigmoid(T.dot(hidden, W.T) + bv)
        resampled = rng.binomial(size=visible_mean.shape, n=1,
                                 p=visible_mean,
                                 dtype=theano.config.floatX)
        return visible_mean, resampled

    def sample_only(visible):
        # scan only has to carry the sampled visible state forward
        return gibbs_step(visible)[1]

    chain, updates = theano.scan(sample_only, outputs_info=[v], n_steps=k)
    v_sample = chain[-1]

    # one more sweep from the final sample gives the mean used for monitoring
    mean_v, _ = gibbs_step(v_sample)
    cross_entropy_terms = (T.xlogx.xlogy0(v, mean_v) +
                           T.xlogx.xlogy0(1 - v, 1 - mean_v))
    monitor = cross_entropy_terms.sum() / v.shape[0]

    def free_energy(visible):
        bias_term = (visible * bv).sum()
        hidden_term = T.log(1 + T.exp(T.dot(visible, W) + bh)).sum()
        return -bias_term - hidden_term

    energy_gap = free_energy(v) - free_energy(v_sample)
    cost = energy_gap / v.shape[0]

    return v_sample, cost, monitor, updates
Return a (v, v_sample, cost, monitor, params, updates_train, v_t, updates_generate) tuple: v : Theano matrix Symbolic variable holding an input sequence (used during training) v_sample : Theano matrix Symbolic variable holding the negative particles for CD log-likelihood gradient estimation (used during training) cost : Theano scalar Expression whose gradient (considering v_sample constant) corresponds to the LL gradient of the RNN-RBM (used during training) monitor : Theano scalar Frame-level pseudo-likelihood (useful for monitoring during training) params : tuple of Theano shared variables The parameters of the model to be optimized during training. updates_train : dictionary of Theano variable -> Theano variable Update object that should be passed to theano.function when compiling the training function. v_t : Theano matrix Symbolic variable holding a generated sequence (used during sampling) updates_generate : dictionary of Theano variable -> Theano variable Update object that should be passed to theano.function when compiling the generation function.''' W = shared_normal(n_visible, n_hidden, 0.01) bv = shared_zeros(n_visible) bh = shared_zeros(n_hidden) Wuh = shared_normal(n_hidden_recurrent, n_hidden, 0.0001) Wuv = shared_normal(n_hidden_recurrent, n_visible, 0.0001) Wvu = shared_normal(n_visible, n_hidden_recurrent, 0.0001) Wuu = shared_normal(n_hidden_recurrent, n_hidden_recurrent, 0.0001) bu = shared_zeros(n_hidden_recurrent) params = W, bv, bh, Wuh, Wuv, Wvu, Wuu, bu # learned parameters as shared # variables v = T.matrix() # a training sequence u0 = T.zeros((n_hidden_recurrent,)) # initial value for the RNN hidden # units # If `v_t` is given, deterministic recurrence to compute the variable # biases bv_t, bh_t at each time step. If `v_t` is None, same recurrence # but with a separate Gibbs chain at each time step to sample (generate) # from the RNN-RBM. The resulting sample v_t is returned in order to be # passed down to the sequence history. 
def recurrence(v_t, u_tm1): bv_t = bv + T.dot(u_tm1, Wuv) bh_t = bh + T.dot(u_tm1, Wuh) generate = v_t is None if generate: v_t, _, _, updates = build_rbm(T.zeros((n_visible,)), W, bv_t, bh_t, k=25) u_t = T.tanh(bu + T.dot(v_t, Wvu) + T.dot(u_tm1, Wuu)) return ([v_t, u_t], updates) if generate else [u_t, bv_t, bh_t] # For training, the deterministic recurrence is used to compute all the # {bv_t, bh_t, 1 <= t <= T} given v. Conditional RBMs can then be trained # in batches using those parameters. (u_t, bv_t, bh_t), updates_train = theano.scan( lambda v_t, u_tm1, *_: recurrence(v_t, u_tm1), sequences=v, outputs_info=[u0, None, None], non_sequences=params) v_sample, cost, monitor, updates_rbm = build_rbm(v, W, bv_t[:], bh_t[:], k=15) updates_train.update(updates_rbm) # symbolic loop for sequence generation (v_t, u_t), updates_generate = theano.scan( lambda u_tm1, *_: recurrence(None, u_tm1), outputs_info=[None, u0], non_sequences=params, n_steps=200) return (v, v_sample, cost, monitor, params, updates_train, v_t, updates_generate) class RnnRbm: '''Simple class to train an RNN-RBM from MIDI files and to generate sample sequences.''' def __init__( self, n_hidden=150, n_hidden_recurrent=100, lr=0.001, r=(21, 109), dt=0.3 ): '''Constructs and compiles Theano functions for training and sequence generation. n_hidden : integer Number of hidden units of the conditional RBMs. n_hidden_recurrent : integer Number of hidden units of the RNN. lr : float Learning rate r : (integer, integer) tuple Specifies the pitch range of the piano-roll in MIDI note numbers, including r[0] but not r[1], such that r[1]-r[0] is the number of visible units of the RBM at a given time step. The default (21, 109) corresponds to the full range of piano (88 notes). 
dt : float Sampling period when converting the MIDI files into piano-rolls, or equivalently the time difference between consecutive time steps.''' self.r = r self.dt = dt (v, v_sample, cost, monitor, params, updates_train, v_t, updates_generate) = build_rnnrbm( r[1] - r[0], n_hidden, n_hidden_recurrent ) gradient = T.grad(cost, params, consider_constant=[v_sample]) updates_train.update( ((p, p - lr * g) for p, g in zip(params, gradient)) ) self.train_function = theano.function( [v], monitor, updates=updates_train ) self.generate_function = theano.function( [], v_t, updates=updates_generate ) def train(self, files, batch_size=100, num_epochs=200): '''Train the RNN-RBM via stochastic gradient descent (SGD) using MIDI files converted to piano-rolls. files : list of strings List of MIDI files that will be loaded as piano-rolls for training. batch_size : integer Training sequences will be split into subsequences of at most this size before applying the SGD updates. num_epochs : integer Number of epochs (pass over the training set) performed. The user can safely interrupt training with Ctrl+C at any time.''' assert len(files) > 0, 'Training set is empty!' \ ' (did you download the data files?)' dataset = [midiread(f, self.r, self.dt).piano_roll.astype(theano.config.floatX) for f in files] try: for epoch in xrange(num_epochs): numpy.random.shuffle(dataset) costs = [] for s, sequence in enumerate(dataset): for i in xrange(0, len(sequence), batch_size): cost = self.train_function(sequence[i:i + batch_size]) costs.append(cost) print 'Epoch %i/%i' % (epoch + 1, num_epochs), print numpy.mean(costs) sys.stdout.flush() except KeyboardInterrupt: print 'Interrupted by user.' def generate(self, filename, show=True): '''Generate a sample sequence, plot the resulting piano-roll and save it as a MIDI file. filename : string A MIDI file will be created at this location. 
def shuffle(lol, seed):
    '''
    lol :: list of lists, each one is shuffled in place
    seed :: value used to seed the shuffling

    The global random generator is re-seeded with `seed` before every
    single shuffle, so lists of equal length end up in the same order.
    '''
    for sequence in lol:
        # same seed before each call -> same permutation for each list
        random.seed(seed)
        random.shuffle(sequence)
def conlleval(p, g, w, filename, script_path):
    '''
    INPUT:
    p :: predictions
    g :: groundtruth
    w :: corresponding words

    OUTPUT:
    filename :: name of the file where the predictions
    are written. it will be the input of conlleval.pl script
    for computing the performance in terms of precision
    recall and f1 score

    OTHER:
    script_path :: path to the directory containing the
    conlleval.pl script

    Returns whatever get_perf(filename, script_path) returns.
    '''
    # Collect the pieces in a list and join once instead of growing a string
    # with += in the inner loop.
    pieces = []
    for labels, preds, words in zip(g, p, w):
        pieces.append('BOS O O\n')
        # distinct loop names: the word no longer rebinds the parameter `w`
        for label, pred, word in zip(labels, preds, words):
            pieces.append(word + ' ' + label + ' ' + pred + '\n')
        pieces.append('EOS O O\n\n')

    # `with` closes the file even when the write fails
    with open(filename, 'w') as f:
        f.write(''.join(pieces))

    return get_perf(filename, script_path)
RNNSLU(object): ''' elman neural net model ''' def __init__(self, nh, nc, ne, de, cs): ''' nh :: dimension of the hidden layer nc :: number of classes ne :: number of word embeddings in the vocabulary de :: dimension of the word embeddings cs :: word window context size ''' # parameters of the model self.emb = theano.shared(name='embeddings', value=0.2 * numpy.random.uniform(-1.0, 1.0, (ne+1, de)) # add one for padding at the end .astype(theano.config.floatX)) self.wx = theano.shared(name='wx', value=0.2 * numpy.random.uniform(-1.0, 1.0, (de * cs, nh)) .astype(theano.config.floatX)) self.wh = theano.shared(name='wh', value=0.2 * numpy.random.uniform(-1.0, 1.0, (nh, nh)) .astype(theano.config.floatX)) self.w = theano.shared(name='w', value=0.2 * numpy.random.uniform(-1.0, 1.0, (nh, nc)) .astype(theano.config.floatX)) self.bh = theano.shared(name='bh', value=numpy.zeros(nh, dtype=theano.config.floatX)) self.b = theano.shared(name='b', value=numpy.zeros(nc, dtype=theano.config.floatX)) self.h0 = theano.shared(name='h0', value=numpy.zeros(nh, dtype=theano.config.floatX)) # bundle self.params = [self.emb, self.wx, self.wh, self.w, self.bh, self.b, self.h0] # end-snippet-2 # as many columns as context window size # as many lines as words in the sentence # start-snippet-3 idxs = T.imatrix() x = self.emb[idxs].reshape((idxs.shape[0], de*cs)) y_sentence = T.ivector('y_sentence') # labels # end-snippet-3 start-snippet-4 def recurrence(x_t, h_tm1): h_t = T.nnet.sigmoid(T.dot(x_t, self.wx) + T.dot(h_tm1, self.wh) + self.bh) s_t = T.nnet.softmax(T.dot(h_t, self.w) + self.b) return [h_t, s_t] [h, s], _ = theano.scan(fn=recurrence, sequences=x, outputs_info=[self.h0, None], n_steps=x.shape[0]) p_y_given_x_sentence = s[:, 0, :] y_pred = T.argmax(p_y_given_x_sentence, axis=1) # end-snippet-4 # cost and gradients and learning rate # start-snippet-5 lr = T.scalar('lr') sentence_nll = -T.mean(T.log(p_y_given_x_sentence) [T.arange(x.shape[0]), y_sentence]) sentence_gradients = 
def train(self, x, y, window_size, learning_rate):
    '''Apply one gradient update on a single sentence.

    x :: word indexes of the sentence
    y :: labels of the sentence
    window_size :: size of the context window (contextwin asserts that
                   it is odd and >= 1)
    learning_rate :: learning rate passed to the update

    The embeddings are re-normalized (each row divided by its L2 norm)
    after the update.
    '''
    cwords = contextwin(x, window_size)
    # A list comprehension instead of map(lambda x: ...): the lambda argument
    # shadowed the parameter `x`, and map() is lazy under Python 3.
    words = [numpy.asarray(window).astype('int32') for window in cwords]
    labels = y

    self.sentence_train(words, labels, learning_rate)
    self.normalize()
train_lex + valid_lex + test_lex))) nclasses = len(set(reduce(lambda x, y: list(x)+list(y), train_y + test_y + valid_y))) nsentences = len(train_lex) groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y] words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex] groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y] words_test = [map(lambda x: idx2word[x], w) for w in test_lex] # instanciate the model numpy.random.seed(param['seed']) random.seed(param['seed']) rnn = RNNSLU(nh=param['nhidden'], nc=nclasses, ne=vocsize, de=param['emb_dimension'], cs=param['win']) # train with early stopping on validation set best_f1 = -numpy.inf param['clr'] = param['lr'] for e in xrange(param['nepochs']): # shuffle shuffle([train_lex, train_ne, train_y], param['seed']) param['ce'] = e tic = time.time() for i, (x, y) in enumerate(zip(train_lex, train_y)): rnn.train(x, y, param['win'], param['clr']) print '[learning] epoch %i >> %2.2f%%' % ( e, (i + 1) * 100. / nsentences), print 'completed in %.2f (sec) <<\r' % (time.time() - tic), sys.stdout.flush() # evaluation // back into the real world : idx -> words predictions_test = [map(lambda x: idx2label[x], rnn.classify(numpy.asarray( contextwin(x, param['win'])).astype('int32'))) for x in test_lex] predictions_valid = [map(lambda x: idx2label[x], rnn.classify(numpy.asarray( contextwin(x, param['win'])).astype('int32'))) for x in valid_lex] # evaluation // compute the accuracy using conlleval.pl res_test = conlleval(predictions_test, groundtruth_test, words_test, folder + '/current.test.txt', folder) res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid, folder + '/current.valid.txt', folder) if res_valid['f1'] > best_f1: if param['savemodel']: rnn.save(folder) best_rnn = copy.deepcopy(rnn) best_f1 = res_valid['f1'] if param['verbose']: print('NEW BEST: epoch', e, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1']) param['vf1'], param['tf1'] = res_valid['f1'], res_test['f1'] 
def test_logistic_cg():
    """Run the conjugate-gradient logistic regression example.

    The test is skipped (nose SkipTest) when SciPy cannot be imported.
    """
    # Only the availability probe is guarded: an ImportError raised while
    # the example itself runs must fail the test, not be reported as
    # "SciPy not available".
    try:
        import scipy
    except ImportError:
        from nose.plugins.skip import SkipTest
        raise SkipTest(
            'SciPy not available. Needed for the logistic_cg example.')
    logistic_cg.cg_optimization_mnist(n_epochs=10)
# Here is the value for the buildbot on February 3th 2012 with a GTX 285 # sgd, cg mlp conv da # sda dbn rbm # gpu times[3.72957802, 9.94316864, 29.1772666, 9.13857198, 25.91144657, # 18.30802011, 53.38651466, 285.41386175] # expected [3.076634879, 7.555234910, 18.99226785, 9.58915591, 24.130070450, # 24.77524018, 92.66246653, 322.340329170] # sgd, cg mlp conv da # sda dbn rbm #expected/get [0.82492841, 0.75984178, 0.65092691, 1.04930573, 0.93125138 # 1.35324519 1.7356905 1.12937868] expected_times_gpu = numpy.asarray([3.0, 7.55523491, 18.99226785, 5.8, 20.5, 11.8, 47.9, 290.1, 255.4, 72.4, 17.0]) expected_times_64 = [s for idx, s in enumerate(expected_times_64) if to_exec[idx]] expected_times_32 = [s for idx, s in enumerate(expected_times_32) if to_exec[idx]] expected_times_gpu = [s for idx, s in enumerate(expected_times_gpu) if to_exec[idx]] def time_test(m, l, idx, f, **kwargs): if not to_exec[idx]: return print algo[idx] ts = m.call_time try: f(**kwargs) except Exception, e: print >> sys.stderr, 'test', algo[idx], 'FAILED', e l.append(numpy.nan) return te = m.call_time l.append(te - ts) def do_tests(): m = theano.compile.mode.get_default_mode() l = [] time_test(m, l, 0, logistic_sgd.sgd_optimization_mnist, n_epochs=30) time_test(m, l, 1, logistic_cg.cg_optimization_mnist, n_epochs=30) time_test(m, l, 2, mlp.test_mlp, n_epochs=5) time_test(m, l, 3, convolutional_mlp.evaluate_lenet5, n_epochs=5, nkerns=[5, 5]) time_test(m, l, 4, dA.test_dA, training_epochs=2, output_folder='tmp_dA_plots') time_test(m, l, 5, SdA.test_SdA, pretraining_epochs=1, training_epochs=2, batch_size=300) time_test(m, l, 6, DBN.test_DBN, pretraining_epochs=1, training_epochs=2, batch_size=300) time_test(m, l, 7, rbm.test_rbm, training_epochs=1, batch_size=300, n_chains=1, n_samples=1, output_folder='tmp_rbm_plots') time_test(m, l, 8, rnnrbm.test_rnnrbm, num_epochs=1) s = {'fold': 3, # 5 folds 0,1,2,3,4 'data': 'atis', 'lr': 0.0970806646812754, 'verbose': 1, 'decay': True, # decay on the 
learning rate if improvement stops 'win': 7, # number of words in the context window 'nhidden': 200, # number of hidden units 'seed': 345, 'emb_dimension': 50, # dimension of word embedding 'nepochs': 1, # 60 is recommended 'savemodel': False} time_test(m, l, 9, rnnslu.main, param=s) time_test(m, l, 10, lstm.train_lstm, max_epochs=1, test_size=1000, saveto='') return numpy.asarray(l) #test in float64 in FAST_RUN mode on the cpu import theano if do_float64: theano.config.floatX = 'float64' theano.config.mode = 'FAST_RUN' float64_times = do_tests() print >> sys.stderr, algo_executed print >> sys.stderr, 'float64 times', float64_times print >> sys.stderr, 'float64 expected', expected_times_64 print >> sys.stderr, 'float64 % expected/get', ( expected_times_64 / float64_times) #test in float32 in FAST_RUN mode on the cpu theano.config.floatX = 'float32' if do_float32: float32_times = do_tests() print >> sys.stderr, algo_executed print >> sys.stderr, 'float32 times', float32_times print >> sys.stderr, 'float32 expected', expected_times_32 print >> sys.stderr, 'float32 % expected/get', ( expected_times_32 / float32_times) if do_float64: print >> sys.stderr, 'float64/float32', ( float64_times / float32_times) print >> sys.stderr print >> sys.stderr, ('Duplicate the timing to have everything ' 'in one place') print >> sys.stderr, algo_executed print >> sys.stderr, 'float64 times', float64_times print >> sys.stderr, 'float64 expected', expected_times_64 print >> sys.stderr, 'float64 % expected/get', ( expected_times_64 / float64_times) print >> sys.stderr, 'float32 times', float32_times print >> sys.stderr, 'float32 expected', expected_times_32 print >> sys.stderr, 'float32 % expected/get', ( expected_times_32 / float32_times) print >> sys.stderr, 'float64/float32', ( float64_times / float32_times) print >> sys.stderr, 'expected float64/float32', ( expected_times_64 / float32_times) #test in float32 in FAST_RUN mode on the gpu import theano.sandbox.cuda if do_gpu: 
theano.sandbox.cuda.use('gpu') gpu_times = do_tests() print >> sys.stderr, algo_executed print >> sys.stderr, 'gpu times', gpu_times print >> sys.stderr, 'gpu expected', expected_times_gpu print >> sys.stderr, 'gpu % expected/get', ( expected_times_gpu / gpu_times) if do_float64: print >> sys.stderr, 'float64/gpu', float64_times / gpu_times if (do_float64 + do_float32 + do_gpu) > 1: print >> sys.stderr print >> sys.stderr, ('Duplicate the timing to have everything ' 'in one place') print >> sys.stderr, algo_executed if do_float64: print >> sys.stderr, 'float64 times', float64_times print >> sys.stderr, 'float64 expected', expected_times_64 print >> sys.stderr, 'float64 % expected/get', ( expected_times_64 / float64_times) if do_float32: print >> sys.stderr, 'float32 times', float32_times print >> sys.stderr, 'float32 expected', expected_times_32 print >> sys.stderr, 'float32 % expected/get', ( expected_times_32 / float32_times) if do_gpu: print >> sys.stderr, 'gpu times', gpu_times print >> sys.stderr, 'gpu expected', expected_times_gpu print >> sys.stderr, 'gpu % expected/get', ( expected_times_gpu / gpu_times) print if do_float64 and do_float32: print >> sys.stderr, 'float64/float32', ( float64_times / float32_times) print >> sys.stderr, 'expected float64/float32', ( expected_times_64 / float32_times) if do_float64 and do_gpu: print >> sys.stderr, 'float64/gpu', float64_times / gpu_times print >> sys.stderr, 'expected float64/gpu', ( expected_times_64 / gpu_times) if do_float32 and do_gpu: print >> sys.stderr, 'float32/gpu', float32_times / gpu_times print >> sys.stderr, 'expected float32/gpu', ( expected_times_32 / gpu_times) def compare(x, y): ratio = x / y # If there is more then 5% difference between the expected # time and the real time, we consider this an error. 
def scale_to_unit_interval(ndar, eps=1e-8):
    """Return a copy of ``ndar`` rescaled so its values lie between 0 and 1.

    The minimum is shifted to 0 and the result is divided by ``max + eps``;
    ``eps`` keeps the division finite when every element is equal.

    :param ndar: ndarray to rescale; the argument itself is not modified.
    :param eps: small constant added to the denominator.
    :returns: a new array. Floating-point inputs keep their dtype; any other
        dtype (e.g. integer images) is converted to float64 first.
    """
    if ndar.dtype.kind == 'f':
        ndar = ndar.copy()
    else:
        # The in-place float arithmetic below would truncate (or be rejected
        # by NumPy's casting rules) on integer storage, so work in float64.
        ndar = ndar.astype('float64')
    ndar -= ndar.min()
    ndar *= 1.0 / (ndar.max() + eps)
    return ndar


def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0),
                       scale_rows_to_unit_interval=True,
                       output_pixel_vals=True):
    """Lay out flattened images as tiles of a single 2-D (or RGBA) array.

    Transform an array with one flattened image per row into an array in
    which the images are reshaped and laid out like tiles on a floor.
    Useful for visualizing datasets whose rows are images, and also columns
    of matrices for transforming those rows (such as the first layer of a
    neural net).

    :type X: a 2-D ndarray, or a tuple of 4 channels whose elements are
        2-D ndarrays or None
    :param X: a 2-D array in which every row is a flattened image. When a
        tuple is given, each element is tiled separately into one channel of
        an RGBA output; ``None`` channels are filled with 0 (colors) or with
        the opaque value (alpha).

    :type img_shape: tuple; (height, width)
    :param img_shape: the original shape of each image

    :type tile_shape: tuple; (rows, cols)
    :param tile_shape: the number of images to tile (rows, cols)

    :type tile_spacing: tuple; (rows, cols)
    :param tile_spacing: number of blank pixels left between adjacent tiles

    :param scale_rows_to_unit_interval: if True, each image is rescaled with
        ``scale_to_unit_interval`` before being placed in the output

    :param output_pixel_vals: if True the output is uint8 and image values
        are multiplied by 255; otherwise values are stored unchanged

    :returns: array suitable for viewing as an image
        (See:`Image.fromarray`.)
    :rtype: uint8 array when ``output_pixel_vals`` is True; otherwise an
        array with the dtype of X (for a tuple, the common dtype of its
        non-None channels). The array is 2-D, or ``(H, W, 4)`` for a tuple.
    """
    assert len(img_shape) == 2
    assert len(tile_shape) == 2
    assert len(tile_spacing) == 2

    # Per axis: (image size + spacing) * number of tiles - spacing, i.e. no
    # spacing is added after the last tile.
    out_shape = [
        (ishp + tsp) * tshp - tsp
        for ishp, tshp, tsp in zip(img_shape, tile_shape, tile_spacing)
    ]

    if isinstance(X, tuple):
        assert len(X) == 4
        if output_pixel_vals:
            out_dtype = 'uint8'
            # colors default to 0, alpha defaults to fully opaque
            channel_defaults = [0, 0, 0, 255]
        else:
            # A tuple has no ``dtype`` attribute; derive the output dtype
            # from the channels that were actually supplied.
            channel_dtypes = [chan.dtype for chan in X if chan is not None]
            if channel_dtypes:
                out_dtype = numpy.result_type(*channel_dtypes)
            else:
                out_dtype = numpy.dtype('float64')
            channel_defaults = [0., 0., 0., 1.]

        out_array = numpy.zeros((out_shape[0], out_shape[1], 4),
                                dtype=out_dtype)

        for i in range(4):
            if X[i] is None:
                # missing channel: constant plane holding the default value
                out_array[:, :, i] = channel_defaults[i]
            else:
                # recurse on the single channel and store it in its plane
                out_array[:, :, i] = tile_raster_images(
                    X[i], img_shape, tile_shape, tile_spacing,
                    scale_rows_to_unit_interval, output_pixel_vals)
        return out_array

    # Single-channel case.
    H, W = img_shape
    Hs, Ws = tile_spacing

    dt = 'uint8' if output_pixel_vals else X.dtype
    out_array = numpy.zeros(out_shape, dtype=dt)
    # factor turning unit-interval values into pixel intensities
    c = 255 if output_pixel_vals else 1

    for tile_row in range(tile_shape[0]):
        for tile_col in range(tile_shape[1]):
            row_idx = tile_row * tile_shape[1] + tile_col
            if row_idx >= X.shape[0]:
                # fewer images than tiles: leave the remaining tiles blank
                continue
            this_img = X[row_idx].reshape(img_shape)
            if scale_rows_to_unit_interval:
                this_img = scale_to_unit_interval(this_img)
            # copy the image into its slot of the output array
            out_array[
                tile_row * (H + Hs): tile_row * (H + Hs) + H,
                tile_col * (W + Ws): tile_col * (W + Ws) + W
            ] = this_img * c
    return out_array
if [ "$WGET" -eq 0 ]; then DL_CMD="wget -c" elif [ "$CURL" -eq 0 ]; then DL_CMD="curl -C - -O" else echo "You need wget or curl installed to download" exit 1 fi $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist_py3k.pkl.gz $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl.gz && gunzip imdb.pkl.gz $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/Nottingham.zip && unzip -u Nottingham.zip $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/midi.zip && unzip -u midi.zip -d ../code && echo "extracted Modified Python MIDI package (GPL)" $DL_CMD http://www-etud.iro.umontreal.ca/~mesnilgr/atis/atis.fold0.pkl.gz $DL_CMD http://www-etud.iro.umontreal.ca/~mesnilgr/atis/atis.fold1.pkl.gz $DL_CMD http://www-etud.iro.umontreal.ca/~mesnilgr/atis/atis.fold2.pkl.gz $DL_CMD http://www-etud.iro.umontreal.ca/~mesnilgr/atis/atis.fold3.pkl.gz $DL_CMD http://www-etud.iro.umontreal.ca/~mesnilgr/atis/atis.fold4.pkl.gz ================================================ FILE: DeepLearningTutorials/doc/.templates/layout.html ================================================ {% extends "!layout.html" %} {%- block extrahead %} {{ super() }} {% endblock %} {% block footer %} {{ super() }} {% endblock %} ================================================ FILE: DeepLearningTutorials/doc/DBN.txt ================================================ .. _DBN: Deep Belief Networks ==================== .. note:: This section assumes the reader has already read through :doc:`logreg` and :doc:`mlp` and :doc:`rbm`. Additionally it uses the following Theano functions and concepts : `T.tanh`_, `shared variables`_, `basic arithmetic ops`_, `T.grad`_, `Random numbers`_, `floatX`_. If you intend to run the code on GPU also read `GPU`_. .. _T.tanh: http://deeplearning.net/software/theano/tutorial/examples.html?highlight=tanh .. 
_shared variables: http://deeplearning.net/software/theano/tutorial/examples.html#using-shared-variables .. _basic arithmetic ops: http://deeplearning.net/software/theano/tutorial/adding.html#adding-two-scalars .. _T.grad: http://deeplearning.net/software/theano/tutorial/examples.html#computing-gradients .. _floatX: http://deeplearning.net/software/theano/library/config.html#config.floatX .. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html .. _Random numbers: http://deeplearning.net/software/theano/tutorial/examples.html#using-random-numbers .. note:: The code for this section is available for download `here`_. .. _here: http://deeplearning.net/tutorial/code/DBN.py Deep Belief Networks ++++++++++++++++++++ [Hinton06]_ showed that RBMs can be stacked and trained in a greedy manner to form so-called Deep Belief Networks (DBN). DBNs are graphical models which learn to extract a deep hierarchical representation of the training data. They model the joint distribution between observed vector :math:`x` and the :math:`\ell` hidden layers :math:`h^k` as follows: .. math:: :label: dbn P(x, h^1, \ldots, h^{\ell}) = \left(\prod_{k=0}^{\ell-2} P(h^k|h^{k+1})\right) P(h^{\ell-1},h^{\ell}) where :math:`x=h^0`, :math:`P(h^{k-1} | h^k)` is a conditional distribution for the visible units conditioned on the hidden units of the RBM at level :math:`k`, and :math:`P(h^{\ell-1}, h^{\ell})` is the visible-hidden joint distribution in the top-level RBM. This is illustrated in the figure below. .. figure:: images/DBN3.png :align: center The principle of greedy layer-wise unsupervised training can be applied to DBNs with RBMs as the building blocks for each layer [Hinton06]_, [Bengio07]_. The process is as follows: 1. Train the first layer as an RBM that models the raw input :math:`x = h^{(0)}` as its visible layer. 2. Use that first layer to obtain a representation of the input that will be used as data for the second layer. Two common solutions exist. 
This representation can be chosen as being the mean activations :math:`p(h^{(1)}=1|h^{(0)})` or samples of :math:`p(h^{(1)}|h^{(0)})`. 3. Train the second layer as an RBM, taking the transformed data (samples or mean activations) as training examples (for the visible layer of that RBM). 4. Iterate (2 and 3) for the desired number of layers, each time propagating upward either samples or mean values. 5. Fine-tune all the parameters of this deep architecture with respect to a proxy for the DBN log-likelihood, or with respect to a supervised training criterion (after adding extra learning machinery to convert the learned representation into supervised predictions, e.g. a linear classifier). In this tutorial, we focus on fine-tuning via supervised gradient descent. Specifically, we use a logistic regression classifier to classify the input :math:`x` based on the output of the last hidden layer :math:`h^{(l)}` of the DBN. Fine-tuning is then performed via supervised gradient descent of the negative log-likelihood cost function. Since the supervised gradient is only non-null for the weights and hidden layer biases of each layer (i.e. null for the visible biases of each RBM), this procedure is equivalent to initializing the parameters of a deep MLP with the weights and hidden layer biases obtained with the unsupervised training strategy. Justifying Greedy-Layer Wise Pre-Training +++++++++++++++++++++++++++++++++++++++++ Why does such an algorithm work? Taking as an example a 2-layer DBN with hidden layers :math:`h^{(1)}` and :math:`h^{(2)}` (with respective weight parameters :math:`W^{(1)}` and :math:`W^{(2)}`), [Hinton06]_ established (see also [Bengio09]_ for a detailed derivation) that :math:`\log p(x)` can be rewritten as, .. math:: :label: dbn_bound \log p(x) = &KL(Q(h^{(1)}|x)||p(h^{(1)}|x)) + H_{Q(h^{(1)}|x)} + \\ &\sum_h Q(h^{(1)}|x)(\log p(h^{(1)}) + \log p(x|h^{(1)})).
:math:`KL(Q(h^{(1)}|x) || p(h^{(1)}|x))` represents the KL divergence between the posterior :math:`Q(h^{(1)}|x)` of the first RBM if it were standalone, and the probability :math:`p(h^{(1)}|x)` for the same layer but defined by the entire DBN (i.e. taking into account the prior :math:`p(h^{(1)},h^{(2)})` defined by the top-level RBM). :math:`H_{Q(h^{(1)}|x)}` is the entropy of the distribution :math:`Q(h^{(1)}|x)`. It can be shown that if we initialize both hidden layers such that :math:`W^{(2)}={W^{(1)}}^T`, then :math:`Q(h^{(1)}|x)=p(h^{(1)}|x)` and the KL divergence term is null. If we learn the first level RBM and then keep its parameters :math:`W^{(1)}` fixed, optimizing Eq. :eq:`dbn_bound` with respect to :math:`W^{(2)}` can thus only increase the likelihood :math:`p(x)`. Also, notice that if we isolate the terms which depend only on :math:`W^{(2)}`, we get: .. math:: \sum_h Q(h^{(1)}|x) \log p(h^{(1)}) Optimizing this with respect to :math:`W^{(2)}` amounts to training a second-stage RBM, using the output of :math:`Q(h^{(1)}|x)` as the training distribution, when :math:`x` is sampled from the training distribution for the first RBM. Implementation ++++++++++++++ To implement DBNs in Theano, we will use the class defined in the :doc:`rbm` tutorial. One can also observe that the code for the DBN is very similar to the one for SdA, because both involve the principle of unsupervised layer-wise pre-training followed by supervised fine-tuning as a deep MLP. The main difference is that we use the RBM class instead of the dA class. We start off by defining the DBN class which will store the layers of the MLP, along with their associated RBMs. Since we take the viewpoint of using the RBMs to initialize an MLP, the code will reflect this by separating as much as possible the RBMs used to initialize the network and the MLP used for classification. ..
literalinclude:: ../code/DBN.py :start-after: start-snippet-1 :end-before: end-snippet-1 ``self.sigmoid_layers`` will store the feed-forward graphs which together form the MLP, while ``self.rbm_layers`` will store the RBMs used to pretrain each layer of the MLP. Next step, we construct ``n_layers`` sigmoid layers (we use the ``HiddenLayer`` class introduced in :ref:`mlp`, with the only modification that we replaced the non-linearity from ``tanh`` to the logistic function :math:`s(x) = \frac{1}{1+e^{-x}}`) and ``n_layers`` RBMs, where ``n_layers`` is the depth of our model. We link the sigmoid layers such that they form an MLP, and construct each RBM such that they share the weight matrix and the hidden bias with its corresponding sigmoid layer. .. literalinclude:: ../code/DBN.py :start-after: # MLP. :end-before: # We now need to add a logistic layer on top of the MLP All that is left is to stack one last logistic regression layer in order to form an MLP. We will use the ``LogisticRegression`` class introduced in :ref:`logreg`. .. literalinclude:: ../code/DBN.py :start-after: # We now need to add a logistic layer on top of the MLP :end-before: def pretraining_functions The class also provides a method which generates training functions for each of the RBMs. They are returned as a list, where element :math:`i` is a function which implements one step of training for the ``RBM`` at layer :math:`i`. .. literalinclude:: ../code/DBN.py :start-after: self.errors = self.logLayer.errors(self.y) :end-before: learning_rate = T.scalar('lr') In order to be able to change the learning rate during training, we associate a Theano variable to it that has a default value. .. literalinclude:: ../code/DBN.py :start-after: index = T.lscalar('index') :end-before: def build_finetune_functions Now any function ``pretrain_fns[i]`` takes as arguments ``index`` and optionally ``lr`` -- the learning rate. Note that the names of the parameters are the names given to the Theano variables (e.g. 
``lr``) when they are constructed and not the names of the Python variables (e.g. ``learning_rate``). Keep this in mind when working with Theano. Optionally, if you provide ``k`` (the number of Gibbs steps to perform in CD or PCD) this will also become an argument of your function. In the same fashion, the DBN class includes a method for building the functions required for finetuning (a ``train_model``, a ``validate_model`` and a ``test_model`` function). .. literalinclude:: ../code/DBN.py :pyobject: DBN.build_finetune_functions Note that the returned ``valid_score`` and ``test_score`` are not Theano functions, but rather Python functions. These loop over the entire validation set and the entire test set to produce a list of the losses obtained over these sets. Putting it all together +++++++++++++++++++++++ The few lines of code below construct the deep belief network: .. literalinclude:: ../code/DBN.py :start-after: # numpy random generator :end-before: start-snippet-2 There are two stages in training this network: (1) a layer-wise pre-training and (2) a fine-tuning stage. For the pre-training stage, we loop over all the layers of the network. For each layer, we use the compiled Theano function which determines the input to the ``i``-th level RBM and performs one step of CD-k within this RBM. This function is applied to the training set for a fixed number of epochs given by ``pretraining_epochs``. .. literalinclude:: ../code/DBN.py :start-after: start-snippet-2 :end-before: end-snippet-2 The fine-tuning loop is very similar to the one in the :ref:`mlp` tutorial, the only difference being that we now use the functions given by ``build_finetune_functions``. Running the Code ++++++++++++++++ The user can run the code by calling: .. code-block:: bash python code/DBN.py With the default parameters, the code runs for 100 pre-training epochs with mini-batches of size 10. This corresponds to performing 500,000 unsupervised parameter updates.
We use an unsupervised learning rate of 0.01, with a supervised learning rate of 0.1. The DBN itself consists of three hidden layers with 1000 units per layer. With early-stopping, this configuration achieved a minimal validation error of 1.27 with corresponding test error of 1.34 after 46 supervised epochs. On an Intel(R) Xeon(R) CPU X5560 running at 2.80GHz, using a multi-threaded MKL library (running on 4 cores), pretraining took 615 minutes with an average of 2.05 mins/(layer * epoch). Fine-tuning took only 101 minutes or approximately 2.20 mins/epoch. Hyper-parameters were selected by optimizing on the validation error. We tested unsupervised learning rates in :math:`\{10^{-1}, ..., 10^{-5}\}` and supervised learning rates in :math:`\{10^{-1}, ..., 10^{-4}\}`. We did not use any form of regularization besides early-stopping, nor did we optimize over the number of pretraining updates. Tips and Tricks +++++++++++++++ One way to improve the running time of your code (given that you have sufficient memory available), is to compute the representation of the entire dataset at layer ``i`` in a single pass, once the weights of the :math:`i-1`-th layers have been fixed. Namely, start by training your first layer RBM. Once it is trained, you can compute the hidden units values for every example in the dataset and store this as a new dataset which is used to train the 2nd layer RBM. Once you trained the RBM for layer 2, you compute, in a similar fashion, the dataset for layer 3 and so on. This avoids calculating the intermediate (hidden layer) representations, ``pretraining_epochs`` times at the expense of increased memory usage. ================================================ FILE: DeepLearningTutorials/doc/LICENSE.txt ================================================ .. _license: LICENSE ======= Copyright (c) 2008--2013, Theano Development Team All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Theano nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ================================================ FILE: DeepLearningTutorials/doc/Makefile ================================================ all: python scripts/docgen.py ================================================ FILE: DeepLearningTutorials/doc/SdA.txt ================================================ .. _SdA: Stacked Denoising Autoencoders (SdA) ==================================== .. note:: This section assumes you have already read through :doc:`logreg` and :doc:`mlp`. 
Additionally it uses the following Theano functions and concepts : `T.tanh`_, `shared variables`_, `basic arithmetic ops`_, `T.grad`_, `Random numbers`_, `floatX`_. If you intend to run the code on GPU also read `GPU`_. .. _T.tanh: http://deeplearning.net/software/theano/tutorial/examples.html?highlight=tanh .. _shared variables: http://deeplearning.net/software/theano/tutorial/examples.html#using-shared-variables .. _basic arithmetic ops: http://deeplearning.net/software/theano/tutorial/adding.html#adding-two-scalars .. _T.grad: http://deeplearning.net/software/theano/tutorial/examples.html#computing-gradients .. _floatX: http://deeplearning.net/software/theano/library/config.html#config.floatX .. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html .. _Random numbers: http://deeplearning.net/software/theano/tutorial/examples.html#using-random-numbers .. note:: The code for this section is available for download `here`_. .. _here: http://deeplearning.net/tutorial/code/SdA.py The Stacked Denoising Autoencoder (SdA) is an extension of the stacked autoencoder [Bengio07]_ and it was introduced in [Vincent08]_. This tutorial builds on the previous tutorial :ref:`dA`. Especially if you do not have experience with autoencoders, we recommend reading it before going any further. .. _stacked_autoencoders: Stacked Autoencoders ++++++++++++++++++++ Denoising autoencoders can be stacked to form a deep network by feeding the latent representation (output code) of the denoising autoencoder found on the layer below as input to the current layer. The **unsupervised pre-training** of such an architecture is done one layer at a time. Each layer is trained as a denoising autoencoder by minimizing the error in reconstructing its input (which is the output code of the previous layer). Once the first :math:`k` layers are trained, we can train the :math:`k+1`-th layer because we can now compute the code or latent representation from the layer below. 
Once all layers are pre-trained, the network goes through a second stage of training called **fine-tuning**. Here we consider **supervised fine-tuning** where we want to minimize prediction error on a supervised task. For this, we first add a logistic regression layer on top of the network (more precisely on the output code of the output layer). We then train the entire network as we would train a multilayer perceptron. At this point, we only consider the encoding parts of each auto-encoder. This stage is supervised, since now we use the target class during training. (See the :ref:`mlp` for details on the multilayer perceptron.) This can be easily implemented in Theano, using the class defined previously for a denoising autoencoder. We can see the stacked denoising autoencoder as having two facades: a list of autoencoders, and an MLP. During pre-training we use the first facade, i.e., we treat our model as a list of autoencoders, and train each autoencoder separately. In the second stage of training, we use the second facade. These two facades are linked because: * the autoencoders and the sigmoid layers of the MLP share parameters, and * the latent representations computed by intermediate layers of the MLP are fed as input to the autoencoders. .. literalinclude:: ../code/SdA.py :start-after: start-snippet-1 :end-before: end-snippet-1 ``self.sigmoid_layers`` will store the sigmoid layers of the MLP facade, while ``self.dA_layers`` will store the denoising autoencoders associated with the layers of the MLP. Next, we construct ``n_layers`` sigmoid layers and ``n_layers`` denoising autoencoders, where ``n_layers`` is the depth of our model. We use the ``HiddenLayer`` class introduced in :ref:`mlp`, with one modification: we replace the ``tanh`` non-linearity with the logistic function :math:`s(x) = \frac{1}{1+e^{-x}}`.
We link the sigmoid layers to form an MLP, and construct the denoising autoencoders such that each shares the weight matrix and the bias of its encoding part with its corresponding sigmoid layer. .. literalinclude:: ../code/SdA.py :start-after: start-snippet-2 :end-before: end-snippet-2 All we need now is to add a logistic layer on top of the sigmoid layers such that we have an MLP. We will use the ``LogisticRegression`` class introduced in :ref:`logreg`. .. literalinclude:: ../code/SdA.py :start-after: end-snippet-2 :end-before: def pretraining_functions The ``SdA`` class also provides a method that generates training functions for the denoising autoencoders in its layers. They are returned as a list, where element :math:`i` is a function that implements one step of training the ``dA`` corresponding to layer :math:`i`. .. literalinclude:: ../code/SdA.py :start-after: self.errors = self.logLayer.errors(self.y) :end-before: corruption_level = T.scalar('corruption') To be able to change the corruption level or the learning rate during training, we associate Theano variables with them. .. literalinclude:: ../code/SdA.py :start-after: index = T.lscalar('index') :end-before: def build_finetune_functions Now any function ``pretrain_fns[i]`` takes as arguments ``index`` and optionally ``corruption``---the corruption level or ``lr``---the learning rate. Note that the names of the parameters are the names given to the Theano variables when they are constructed, not the names of the Python variables (``learning_rate`` or ``corruption_level``). Keep this in mind when working with Theano. In the same fashion we build a method for constructing the functions required during finetuning (``train_fn``, ``valid_score`` and ``test_score``). .. 
literalinclude:: ../code/SdA.py :pyobject: SdA.build_finetune_functions Note that ``valid_score`` and ``test_score`` are not Theano functions, but rather Python functions that loop over the entire validation set and the entire test set, respectively, producing a list of the losses over these sets. Putting it all together +++++++++++++++++++++++ The few lines of code below construct the stacked denoising autoencoder: .. literalinclude:: ../code/SdA.py :start-after: start-snippet-3 :end-before: end-snippet-3 There are two stages of training for this network: layer-wise pre-training followed by fine-tuning. For the pre-training stage, we will loop over all the layers of the network. For each layer we will use the compiled Theano function that implements a SGD step towards optimizing the weights for reducing the reconstruction cost of that layer. This function will be applied to the training set for a fixed number of epochs given by ``pretraining_epochs``. .. literalinclude:: ../code/SdA.py :start-after: start-snippet-4 :end-before: end-snippet-4 The fine-tuning loop is very similar to the one in the :ref:`mlp`. The only difference is that it uses the functions given by ``build_finetune_functions``. Running the Code ++++++++++++++++ The user can run the code by calling: .. code-block:: bash python code/SdA.py By default the code runs 15 pre-training epochs for each layer, with a batch size of 1. The corruption levels are 0.1 for the first layer, 0.2 for the second, and 0.3 for the third. The pretraining learning rate is 0.001 and the finetuning learning rate is 0.1. Pre-training takes 585.01 minutes, with an average of 13 minutes per epoch. Fine-tuning is completed after 36 epochs in 444.2 minutes, with an average of 12.34 minutes per epoch. The final validation score is 1.39% with a testing score of 1.3%. These results were obtained on a machine with an Intel Xeon E5430 @ 2.66GHz CPU, with a single-threaded GotoBLAS. 
Tips and Tricks +++++++++++++++ One way to improve the running time of your code (assuming you have sufficient memory available), is to compute how the network, up to layer :math:`k-1`, transforms your data. Namely, you start by training your first layer dA. Once it is trained, you can compute the hidden units values for every datapoint in your dataset and store this as a new dataset that you will use to train the dA corresponding to layer 2. Once you have trained the dA for layer 2, you compute, in a similar fashion, the dataset for layer 3 and so on. You can see now, that at this point, the dAs are trained individually, and they just provide (one to the other) a non-linear transformation of the input. Once all dAs are trained, you can start fine-tuning the model. ================================================ FILE: DeepLearningTutorials/doc/conf.py ================================================ # -*- coding: utf-8 -*- # # theano documentation build configuration file, created by # sphinx-quickstart on Tue Oct 7 16:34:06 2008. # # This file is execfile()d with the current directory set to its containing dir. # # The contents of this file are pickled, so don't put values in the namespace # that aren't pickleable (module imports are okay, they're removed automatically). # # All configuration values have a default value; values that are commented out # serve to show the default value. import sys, os # If your extensions are in another directory, add it here. If the directory # is relative to the documentation root, use os.path.abspath to make it # absolute, like shown here. #sys.path.append(os.path.abspath('some/directory')) # General configuration # --------------------- # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.todo'] try: from sphinx.ext import pngmath extensions.append('sphinx.ext.pngmath') except ImportError: print >>sys.stderr, 'Warning: could not import sphinx.ext.pngmath' pass # Add any paths that contain templates here, relative to this directory. templates_path = ['.templates'] # The suffix of source filenames. source_suffix = '.txt' # The master toctree document. master_doc = 'contents' # General substitutions. project = 'DeepLearning' copyright = '2008--2010, LISA lab' # The default replacements for |version| and |release|, also used in various # other places throughout the built documents. # # The short X.Y version. version = '0.1' # The full version, including alpha/beta/rc tags. release = '0.1' # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: #today = '' # Else, today_fmt is used as the format for a strftime call. today_fmt = '%B %d, %Y' # List of documents that shouldn't be included in the build. #unused_docs = [] # List of directories, relative to source directories, that shouldn't be searched # for source files. exclude_dirs = ['scripts'] # The reST default role (used for this markup: `text`) to use for all documents. #default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. #add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). #add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. #show_authors = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # Options for HTML output # ----------------------- # The style sheet to use for HTML and HTML Help pages. A file of that name # must exist either in Sphinx' static/ path, or in one of the custom paths # given in html_static_path. 
#html_style = 'default.css' html_theme = 'sphinxdoc' # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". #html_title = None # A shorter title for the navigation bar. Default is the same as html_title. #html_short_title = None # The name of an image file (within the static path) to place at the top of # the sidebar. #html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. #html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". #html_static_path = ['.static', 'images'] html_static_path = ['images'] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. html_use_smartypants = True # Custom sidebar templates, maps document names to template names. #html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. #html_additional_pages = {} # If false, no module index is generated. html_use_modindex = True # If false, no index is generated. html_use_index = True # If true, the index is split into individual pages for each letter. #html_split_index = False # If true, the reST sources are included in the HTML build as _sources/. #html_copy_source = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. #html_use_opensearch = '' # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). 
#html_file_suffix = '' # Output file base name for HTML help builder. htmlhelp_basename = 'deeplearningdoc' # Options for LaTeX output # ------------------------ # The paper size ('letter' or 'a4'). #latex_paper_size = 'letter' # The font size ('10pt', '11pt' or '12pt'). latex_font_size = '11pt' # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, document class [howto/manual]). latex_documents = [ ('contents', 'deeplearning.tex', 'Deep Learning Tutorial', 'LISA lab, University of Montreal', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of # the title page. latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. #latex_use_parts = False # Additional stuff for the LaTeX preamble. #latex_preamble = '' # Documents to append as an appendix to all manuals. #latex_appendices = [] # If false, no module index is generated. #latex_use_modindex = True default_role = 'math' pngmath_divpng_args = ['-gamma 1.5','-D 110'] pngmath_latex_preamble = '\\usepackage{amsmath}\n'+\ '\\usepackage{amsfonts}\n'+\ '\\usepackage{amssymb}\n'+\ '\\def\\E{\\mathbf{E}}\n'+\ '\\def\\F{\\mathbf{F}}\n'+\ '\\def\\x{\\mathbf{x}}\n'+\ '\\def\\h{\\mathbf{h}}\n'+\ '\\def\\v{\\mathbf{v}}\n'+\ '\\def\\nv{\\mathbf{v^{{\bf -}}}}\n'+\ '\\def\\nh{\\mathbf{h^{{\bf -}}}}\n'+\ '\\def\\s{\\mathbf{s}}\n'+\ '\\def\\b{\\mathbf{b}}\n'+\ '\\def\\c{\\mathbf{c}}\n'+\ '\\def\\W{\\mathbf{W}}\n'+\ '\\def\\C{\\mathbf{C}}\n'+\ '\\def\\P{\\mathbf{P}}\n'+\ '\\def\\T{{\\bf \\mathcal T}}\n'+\ '\\def\\B{{\\bf \\mathcal B}}\n' ================================================ FILE: DeepLearningTutorials/doc/contents.txt ================================================ .. _contents: ======== Contents ======== .. 
toctree:: :maxdepth: 2 LICENSE index gettingstarted logreg mlp lenet dA SdA rbm DBN hmc rnnslu lstm rnnrbm utilities references ================================================ FILE: DeepLearningTutorials/doc/dA.txt ================================================ .. _daa: Denoising Autoencoders (dA) =========================== .. note:: This section assumes the reader has already read through :doc:`logreg` and :doc:`mlp`. Additionally it uses the following Theano functions and concepts : `T.tanh`_, `shared variables`_, `basic arithmetic ops`_, `T.grad`_, `Random numbers`_, `floatX`_. If you intend to run the code on GPU also read `GPU`_. .. _T.tanh: http://deeplearning.net/software/theano/tutorial/examples.html?highlight=tanh .. _shared variables: http://deeplearning.net/software/theano/tutorial/examples.html#using-shared-variables .. _basic arithmetic ops: http://deeplearning.net/software/theano/tutorial/adding.html#adding-two-scalars .. _T.grad: http://deeplearning.net/software/theano/tutorial/examples.html#computing-gradients .. _floatX: http://deeplearning.net/software/theano/library/config.html#config.floatX .. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html .. _Random numbers: http://deeplearning.net/software/theano/tutorial/examples.html#using-random-numbers .. note:: The code for this section is available for download `here`_. .. _here: http://deeplearning.net/tutorial/code/dA.py The Denoising Autoencoder (dA) is an extension of a classical autoencoder and it was introduced as a building block for deep networks in [Vincent08]_. We will start the tutorial with a short discussion on :ref:`autoencoders`. .. _autoencoders: Autoencoders +++++++++++++ See section 4.6 of [Bengio09]_ for an overview of auto-encoders. An autoencoder takes an input :math:`\mathbf{x} \in [0,1]^d` and first maps it (with an *encoder)* to a hidden representation :math:`\mathbf{y} \in [0,1]^{d'}` through a deterministic mapping, e.g.: .. 
math:: \mathbf{y} = s(\mathbf{W}\mathbf{x} + \mathbf{b}) Where :math:`s` is a non-linearity such as the sigmoid. The latent representation :math:`\mathbf{y}`, or **code** is then mapped back (with a *decoder*) into a **reconstruction** :math:`\mathbf{z}` of the same shape as :math:`\mathbf{x}`. The mapping happens through a similar transformation, e.g.: .. math:: \mathbf{z} = s(\mathbf{W'}\mathbf{y} + \mathbf{b'}) (Here, the prime symbol does not indicate matrix transposition.) :math:`\mathbf{z}` should be seen as a prediction of :math:`\mathbf{x}`, given the code :math:`\mathbf{y}`. Optionally, the weight matrix :math:`\mathbf{W'}` of the reverse mapping may be constrained to be the transpose of the forward mapping: :math:`\mathbf{W'} = \mathbf{W}^T`. This is referred to as *tied weights*. The parameters of this model (namely :math:`\mathbf{W}`, :math:`\mathbf{b}`, :math:`\mathbf{b'}` and, if one doesn't use tied weights, also :math:`\mathbf{W'}`) are optimized such that the average reconstruction error is minimized. The reconstruction error can be measured in many ways, depending on the appropriate distributional assumptions on the input given the code. The traditional *squared error* :math:`L(\mathbf{x}, \mathbf{z}) = || \mathbf{x} - \mathbf{z} ||^2` can be used. If the input is interpreted as either bit vectors or vectors of bit probabilities, *cross-entropy* of the reconstruction can be used: .. math:: L_{H} (\mathbf{x}, \mathbf{z}) = - \sum^d_{k=1}[\mathbf{x}_k \log \mathbf{z}_k + (1 - \mathbf{x}_k)\log(1 - \mathbf{z}_k)] The hope is that the code :math:`\mathbf{y}` is a *distributed* representation that captures the coordinates along the main factors of variation in the data. This is similar to the way the projection on principal components would capture the main factors of variation in the data. 
Indeed, if there is one linear hidden layer (the *code)* and the mean squared error criterion is used to train the network, then the :math:`k` hidden units learn to project the input in the span of the first :math:`k` principal components of the data. If the hidden layer is non-linear, the auto-encoder behaves differently from PCA, with the ability to capture multi-modal aspects of the input distribution. The departure from PCA becomes even more important when we consider *stacking multiple encoders* (and their corresponding decoders) when building a deep auto-encoder [Hinton06]_. Because :math:`\mathbf{y}` is viewed as a lossy compression of :math:`\mathbf{x}`, it cannot be a good (small-loss) compression for all :math:`\mathbf{x}`. Optimization makes it a good compression for training examples, and hopefully for other inputs as well, but not for arbitrary inputs. That is the sense in which an auto-encoder generalizes: it gives low reconstruction error on test examples from the same distribution as the training examples, but generally high reconstruction error on samples randomly chosen from the input space. We want to implement an auto-encoder using Theano, in the form of a class, that could be afterwards used in constructing a stacked autoencoder. The first step is to create shared variables for the parameters of the autoencoder :math:`\mathbf{W}`, :math:`\mathbf{b}` and :math:`\mathbf{b'}`. (Since we are using tied weights in this tutorial, :math:`\mathbf{W}^T` will be used for :math:`\mathbf{W'}`): .. literalinclude:: ../code/dA.py :start-after: start-snippet-1 :end-before: end-snippet-1 Note that we pass the symbolic ``input`` to the autoencoder as a parameter. This is so that we can concatenate layers of autoencoders to form a deep network: the symbolic output (the :math:`\mathbf{y}` above) of layer :math:`k` will be the symbolic input of layer :math:`k+1`. Now we can express the computation of the latent representation and of the reconstructed signal: .. 
literalinclude:: ../code/dA.py :pyobject: dA.get_hidden_values .. literalinclude:: ../code/dA.py :pyobject: dA.get_reconstructed_input And using these functions we can compute the cost and the updates of one stochastic gradient descent step : .. literalinclude:: ../code/dA.py :pyobject: dA.get_cost_updates We can now define a function that, applied iteratively, will update the parameters ``W``, ``b`` and ``b_prime`` such that the reconstruction cost is approximately minimized. .. literalinclude:: ../code/dA.py :start-after: theano_rng = RandomStreams(rng.randint(2 ** 30)) :end-before: start_time = time.clock() If there is no constraint besides minimizing the reconstruction error, one might expect an auto-encoder with :math:`n` inputs and an encoding of dimension :math:`n` (or greater) to learn the identity function, merely mapping an input to its copy. Such an autoencoder would not differentiate test examples (from the training distribution) from other input configurations. Surprisingly, experiments reported in [Bengio07]_ suggest that, in practice, when trained with stochastic gradient descent, non-linear auto-encoders with more hidden units than inputs (called overcomplete) yield useful representations. (Here, "useful" means that a network taking the encoding as input has low classification error.) A simple explanation is that stochastic gradient descent with early stopping is similar to an L2 regularization of the parameters. To achieve perfect reconstruction of continuous inputs, a one-hidden layer auto-encoder with non-linear hidden units (exactly like in the above code) needs very small weights in the first (encoding) layer, to bring the non-linearity of the hidden units into their linear regime, and very large weights in the second (decoding) layer. With binary inputs, very large weights are also needed to completely minimize the reconstruction error. 
Since the implicit or explicit regularization makes it difficult to reach large-weight solutions, the optimization algorithm finds encodings which only work well for examples similar to those in the training set, which is what we want. It means that the *representation is exploiting statistical regularities present in the training set,* rather than merely learning to replicate the input. There are other ways by which an auto-encoder with more hidden units than inputs could be prevented from learning the identity function, capturing something useful about the input in its hidden representation. One is the addition of *sparsity* (forcing many of the hidden units to be zero or near-zero). Sparsity has been exploited very successfully by many [Ranzato07]_ [Lee08]_. Another is to add randomness in the transformation from input to reconstruction. This technique is used in Restricted Boltzmann Machines (discussed later in :ref:`rbm`), as well as in Denoising Auto-Encoders, discussed below. .. _DA: Denoising Autoencoders ++++++++++++++++++++++ The idea behind denoising autoencoders is simple. In order to force the hidden layer to discover more robust features and prevent it from simply learning the identity, we train the autoencoder to *reconstruct the input from a corrupted version of it*. The denoising auto-encoder is a stochastic version of the auto-encoder. Intuitively, a denoising auto-encoder does two things: try to encode the input (preserve the information about the input), and try to undo the effect of a corruption process stochastically applied to the input of the auto-encoder. The latter can only be done by capturing the statistical dependencies between the inputs. The denoising auto-encoder can be understood from different perspectives (the manifold learning perspective, stochastic operator perspective, bottom-up -- information theoretic perspective, top-down -- generative model perspective), all of which are explained in [Vincent08]_. 
See also section 7.2 of [Bengio09]_ for an overview of auto-encoders. In [Vincent08]_, the stochastic corruption process randomly sets some of the inputs (as many as half of them) to zero. Hence the denoising auto-encoder is trying to *predict the corrupted (i.e. missing) values from the uncorrupted (i.e., non-missing) values*, for randomly selected subsets of missing patterns. Note how being able to predict any subset of variables from the rest is a sufficient condition for completely capturing the joint distribution between a set of variables (this is how Gibbs sampling works). To convert the autoencoder class into a denoising autoencoder class, all we need to do is to add a stochastic corruption step operating on the input. The input can be corrupted in many ways, but in this tutorial we will stick to the original corruption mechanism of randomly masking entries of the input by making them zero. The code below does just that : .. code-block:: python from theano.tensor.shared_randomstreams import RandomStreams def get_corrupted_input(self, input, corruption_level): """ This function keeps ``1-corruption_level`` entries of the inputs the same and zero-out randomly selected subset of size ``coruption_level`` Note : first argument of theano.rng.binomial is the shape(size) of random numbers that it should produce second argument is the number of trials third argument is the probability of success of any trial this will produce an array of 0s and 1s where 1 has a probability of 1 - ``corruption_level`` and 0 with ``corruption_level`` """ return self.theano_rng.binomial(size=input.shape, n=1, p=1 - corruption_level) * input In the stacked autoencoder class (:ref:`stacked_autoencoders`) the weights of the ``dA`` class have to be shared with those of a corresponding sigmoid layer. For this reason, the constructor of the ``dA`` also gets Theano variables pointing to the shared parameters. If those parameters are left to ``None``, new ones will be constructed. 
The final denoising autoencoder class becomes : .. code-block:: python class dA(object): """Denoising Auto-Encoder class (dA) A denoising autoencoders tries to reconstruct the input from a corrupted version of it by projecting it first in a latent space and reprojecting it afterwards back in the input space. Please refer to Vincent et al.,2008 for more details. If x is the input then equation (1) computes a partially destroyed version of x by means of a stochastic mapping q_D. Equation (2) computes the projection of the input into the latent space. Equation (3) computes the reconstruction of the input, while equation (4) computes the reconstruction error. .. math:: \tilde{x} ~ q_D(\tilde{x}|x) (1) y = s(W \tilde{x} + b) (2) x = s(W' y + b') (3) L(x,z) = -sum_{k=1}^d [x_k \log z_k + (1-x_k) \log( 1-z_k)] (4) """ def __init__(self, numpy_rng, theano_rng=None, input=None, n_visible=784, n_hidden=500, W=None, bhid=None, bvis=None): """ Initialize the dA class by specifying the number of visible units (the dimension d of the input ), the number of hidden units ( the dimension d' of the latent or hidden space ) and the corruption level. The constructor also receives symbolic variables for the input, weights and bias. Such a symbolic variables are useful when, for example the input is the result of some computations, or when weights are shared between the dA and an MLP layer. When dealing with SdAs this always happens, the dA on layer 2 gets as input the output of the dA on layer 1, and the weights of the dA are used in the second stage of training to construct an MLP. 
:type numpy_rng: numpy.random.RandomState :param numpy_rng: number random generator used to generate weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type input: theano.tensor.TensorType :paran input: a symbolic description of the input or None for standalone dA :type n_visible: int :param n_visible: number of visible units :type n_hidden: int :param n_hidden: number of hidden units :type W: theano.tensor.TensorType :param W: Theano variable pointing to a set of weights that should be shared belong the dA and another architecture; if dA should be standalone set this to None :type bhid: theano.tensor.TensorType :param bhid: Theano variable pointing to a set of biases values (for hidden units) that should be shared belong dA and another architecture; if dA should be standalone set this to None :type bvis: theano.tensor.TensorType :param bvis: Theano variable pointing to a set of biases values (for visible units) that should be shared belong dA and another architecture; if dA should be standalone set this to None """ self.n_visible = n_visible self.n_hidden = n_hidden # create a Theano random generator that gives symbolic random values if not theano_rng : theano_rng = RandomStreams(rng.randint(2 ** 30)) # note : W' was written as `W_prime` and b' as `b_prime` if not W: # W is initialized with `initial_W` which is uniformely sampled # from -4.*sqrt(6./(n_visible+n_hidden)) and 4.*sqrt(6./(n_hidden+n_visible)) # the output of uniform if converted using asarray to dtype # theano.config.floatX so that the code is runable on GPU initial_W = numpy.asarray(numpy_rng.uniform( low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)), high=4 * numpy.sqrt(6. 
/ (n_hidden + n_visible)), size=(n_visible, n_hidden)), dtype=theano.config.floatX) W = theano.shared(value=initial_W, name='W') if not bvis: bvis = theano.shared(value = numpy.zeros(n_visible, dtype=theano.config.floatX), name='bvis') if not bhid: bhid = theano.shared(value=numpy.zeros(n_hidden, dtype=theano.config.floatX), name='bhid') self.W = W # b corresponds to the bias of the hidden self.b = bhid # b_prime corresponds to the bias of the visible self.b_prime = bvis # tied weights, therefore W_prime is W transpose self.W_prime = self.W.T self.theano_rng = theano_rng # if no input is given, generate a variable representing the input if input == None: # we use a matrix because we expect a minibatch of several examples, # each example being a row self.x = T.dmatrix(name='input') else: self.x = input self.params = [self.W, self.b, self.b_prime] def get_corrupted_input(self, input, corruption_level): """ This function keeps ``1-corruption_level`` entries of the inputs the same and zero-out randomly selected subset of size ``coruption_level`` Note : first argument of theano.rng.binomial is the shape(size) of random numbers that it should produce second argument is the number of trials third argument is the probability of success of any trial this will produce an array of 0s and 1s where 1 has a probability of 1 - ``corruption_level`` and 0 with ``corruption_level`` """ return self.theano_rng.binomial(size=input.shape, n=1, p=1 - corruption_level) * input def get_hidden_values(self, input): """ Computes the values of the hidden layer """ return T.nnet.sigmoid(T.dot(input, self.W) + self.b) def get_reconstructed_input(self, hidden ): """ Computes the reconstructed input given the values of the hidden layer """ return T.nnet.sigmoid(T.dot(hidden, self.W_prime) + self.b_prime) def get_cost_updates(self, corruption_level, learning_rate): """ This function computes the cost and the updates for one trainng step of the dA """ tilde_x = self.get_corrupted_input(self.x, 
corruption_level) y = self.get_hidden_values( tilde_x) z = self.get_reconstructed_input(y) # note : we sum over the size of a datapoint; if we are using minibatches, # L will be a vector, with one entry per example in minibatch L = -T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z), axis=1 ) # note : L is now a vector, where each element is the cross-entropy cost # of the reconstruction of the corresponding example of the # minibatch. We need to compute the average of all these to get # the cost of the minibatch cost = T.mean(L) # compute the gradients of the cost of the `dA` with respect # to its parameters gparams = T.grad(cost, self.params) # generate the list of updates updates = [] for param, gparam in zip(self.params, gparams): updates.append((param, param - learning_rate * gparam)) return (cost, updates) Putting it All Together +++++++++++++++++++++++ It is easy now to construct an instance of our ``dA`` class and train it. .. code-block:: python # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images ###################### # BUILDING THE MODEL # ###################### rng = numpy.random.RandomState(123) theano_rng = RandomStreams(rng.randint(2 ** 30)) da = dA(numpy_rng=rng, theano_rng=theano_rng, input=x, n_visible=28 * 28, n_hidden=500) cost, updates = da.get_cost_updates(corruption_level=0.2, learning_rate=learning_rate) train_da = theano.function([index], cost, updates=updates, givens = {x: train_set_x[index * batch_size: (index + 1) * batch_size]}) start_time = time.clock() ############ # TRAINING # ############ # go through training epochs for epoch in xrange(training_epochs): # go through trainng set c = [] for batch_index in xrange(n_train_batches): c.append(train_da(batch_index)) print 'Training epoch %d, cost ' % epoch, numpy.mean(c) end_time = time.clock training_time = (end_time - start_time) print ('Training took %f minutes' % (pretraining_time / 
60.)) In order to get a feeling of what the network learned we are going to plot the filters (defined by the weight matrix). Bear in mind, however, that this does not provide the entire story, since we neglect the biases and plot the weights up to a multiplicative constant (weights are converted to values between 0 and 1). To plot our filters we will need the help of ``tile_raster_images`` (see :ref:`how-to-plot`) so we urge the reader to familiarize themselves with it. Also, with the help of the Python Imaging Library, the following lines of code will save the filters as an image : .. code-block:: python image = Image.fromarray(tile_raster_images(X=da.W.get_value(borrow=True).T, img_shape=(28, 28), tile_shape=(10, 10), tile_spacing=(1, 1))) image.save('filters_corruption_30.png') Running the Code ++++++++++++++++ To run the code : .. code-block:: bash python dA.py The resulting filters when we do not use any noise are : .. figure:: images/filters_corruption_0.png :align: center The filters for 30 percent noise : .. figure:: images/filters_corruption_30.png :align: center ================================================ FILE: DeepLearningTutorials/doc/deep.txt ================================================ .. _deep: Deep Learning ============= The breakthrough to effective training strategies for deep architectures came in 2006 with the algorithms for training deep belief networks (DBN) [Hinton07]_ and stacked auto-encoders [Ranzato07]_ , [Bengio07]_ . All these methods are based on a similar approach: **greedy layer-wise unsupervised pre-training** followed by **supervised fine-tuning**. The pretraining strategy consists in using unsupervised learning to guide the training of intermediate levels of representation. Each layer is pre-trained with an unsupervised learning algorithm, which attempts to learn a nonlinear transformation of its input, in order to capture its main variations. 
Higher levels of abstraction are created by feeding the output of one layer to the input of the subsequent layer. The resulting architecture can then be seen in two lights: * the pre-trained deep network can be used to initialize the weights of all but the last layer of a deep neural network. The weights are then further adapted to a supervised task (such as classification) through traditional gradient descent (see :ref:`Multilayer perceptron <mlp>`). This is referred to as the fine-tuning step. * the pre-trained deep network can also serve solely as a feature extractor. The output of the last layer is fed to a classifier, such as logistic regression, which is trained independently. Better results can be obtained by concatenating the output of the last layer with the hidden representations of all intermediate layers [Lee09]_. For the purposes of this tutorial, we will focus on the first interpretation, as that is what was first proposed in [Hinton06]_. Deep Coding +++++++++++ Since Deep Belief Networks (DBN) and Stacked Denoising-AutoEncoders (SDA) share much of the same architecture and have very similar training algorithms (in terms of pretraining and fine-tuning stages), it makes sense to implement them in a similar fashion, as part of a "Deep Learning" framework. We thus define a generic interface, which both of these architectures will share. .. code-block:: python class DeepLayerwiseModel(object): def layerwise_pretrain(self, layer_fns, pretrain_amounts): """ """ def finetune(self, datasets, lr, batch_size): """ class DBN(DeepLayerwiseModel): """ """ class StackedDAA(DeepLayerwiseModel): """ """ .. code-block:: python def deep_main(learning_rate=0.1, pretraining_epochs=20, pretrain_lr=0.1, training_epochs=1000, batch_size=20, mnist_file='mnist.pkl.gz'): n_train_examples, train_valid_test = load_mnist(mnist_file) # instantiate model deep_model = ... 
#### #### Phase 1: Pre-training #### # create an array of functions, which will be used for the greedy # layer-wise unsupervised training procedure pretrain_functions = deep_model.pretrain_functions( batch_size=batch_size, train_set_x=train_set_x, learning_rate=pretrain_lr, ... ) # loop over all the layers in our network for layer_idx, pretrain_fn in enumerate(pretrain_functions): # iterate over a certain number of epochs) for i in xrange(pretraining_epochs * n_train_examples / batch_size): # follow one step in the gradient of the unsupervised cost # function, at the given layer layer_fn(i) .. code-block:: python #### #### Phase 2: Fine Tuning #### # create theano functions for fine-tuning, as well as # validation and testing our model. train_fn, valid_scores, test_scores =\ deep_model.finetune_functions( train_valid_test[0][0], # training dataset learning_rate=finetune_lr, # the learning rate batch_size=batch_size) # number of examples to use at once # use these functions as part of the generic early-stopping procedure for i in xrange(patience_max): if i >= patience: break cost_i = train_fn(i) ... ================================================ FILE: DeepLearningTutorials/doc/gettingstarted.txt ================================================ .. _gettingstarted: =============== Getting Started =============== These tutorials do not attempt to make up for a graduate or undergraduate course in machine learning, but we do make a rapid overview of some important concepts (and notation) to make sure that we're on the same page. You'll also need to download the datasets mentioned in this chapter in order to run the example code of the up-coming tutorials. .. _download: .. index:: Download: Download ======== On each learning algorithm page, you will be able to download the corresponding files. If you want to download all of them at the same time, you can clone the git repository of the tutorial:: git clone git://github.com/lisa-lab/DeepLearningTutorials.git .. 
_datasets: .. index:: Datasets Datasets ======== .. index:: MNIST Dataset MNIST Dataset +++++++++++++ (`mnist.pkl.gz `_) The `MNIST `_ dataset consists of handwritten digit images and it is divided in 60,000 examples for the training set and 10,000 examples for testing. In many papers as well as in this tutorial, the official training set of 60,000 is divided into an actual training set of 50,000 examples and 10,000 validation examples (for selecting hyper-parameters like learning rate and size of the model). All digit images have been size-normalized and centered in a fixed size image of 28 x 28 pixels. In the original dataset each pixel of the image is represented by a value between 0 and 255, where 0 is black, 255 is white and anything in between is a different shade of grey. Here are some examples of MNIST digits: |0| |1| |2| |3| |4| |5| .. |0| image:: images/mnist_0.png .. |1| image:: images/mnist_1.png .. |2| image:: images/mnist_2.png .. |3| image:: images/mnist_3.png .. |4| image:: images/mnist_4.png .. |5| image:: images/mnist_5.png For convenience we pickled the dataset to make it easier to use in python. It is available for download `here `_. The pickled file represents a tuple of 3 lists : the training set, the validation set and the testing set. Each of the three lists is a pair formed from a list of images and a list of class labels for each of the images. An image is represented as numpy 1-dimensional array of 784 (28 x 28) float values between 0 and 1 (0 stands for black, 1 for white). The labels are numbers between 0 and 9 indicating which digit the image represents. The code block below shows how to load the dataset. .. code-block:: python import cPickle, gzip, numpy # Load the dataset f = gzip.open('mnist.pkl.gz', 'rb') train_set, valid_set, test_set = cPickle.load(f) f.close() When using the dataset, we usually divide it in minibatches (see :ref:`opt_SGD`). 
We encourage you to store the dataset into shared variables and access it based on the minibatch index, given a fixed and known batch size. The reason behind shared variables is related to using the GPU. There is a large overhead when copying data into the GPU memory. If you copy data on request (each minibatch individually when needed), as the code will do if you do not use shared variables, then due to this overhead the GPU code will not be much faster than the CPU code (maybe even slower). If you have your data in Theano shared variables though, you give Theano the possibility to copy the entire data on the GPU in a single call when the shared variables are constructed. Afterwards the GPU can access any minibatch by taking a slice from these shared variables, without needing to copy any information from the CPU memory and therefore bypassing the overhead. Because the datapoints and their labels are usually of different nature (labels are usually integers while datapoints are real numbers) we suggest using different variables for labels and data. Also we recommend using different variables for the training set, validation set and testing set to make the code more readable (resulting in 6 different shared variables). Since now the data is in one variable, and a minibatch is defined as a slice of that variable, it is more natural to define a minibatch by indicating its index and its size. In our setup the batch size stays constant throughout the execution of the code, therefore a function will actually require only the index to identify on which datapoints to work. The code below shows how to store your data and how to access a minibatch: .. code-block:: python def shared_dataset(data_xy): """ Function that loads the dataset into shared variables The reason we store our dataset in shared variables is to allow Theano to copy it into the GPU memory (when code is run on GPU).
Since copying data into the GPU is slow, copying a minibatch every time it is needed (the default behaviour if the data is not in a shared variable) would lead to a large decrease in performance. """ data_x, data_y = data_xy shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX)) shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX)) # When storing data on the GPU it has to be stored as floats # therefore we will store the labels as ``floatX`` as well # (``shared_y`` does exactly that). But during our computations # we need them as ints (we use labels as index, and if they are # floats it doesn't make sense) therefore instead of returning # ``shared_y`` we will have to cast it to int. This little hack # lets us get around this issue return shared_x, T.cast(shared_y, 'int32') test_set_x, test_set_y = shared_dataset(test_set) valid_set_x, valid_set_y = shared_dataset(valid_set) train_set_x, train_set_y = shared_dataset(train_set) batch_size = 500 # size of the minibatch # accessing the third minibatch of the training set data = train_set_x[2 * 500: 3 * 500] label = train_set_y[2 * 500: 3 * 500] The data has to be stored as floats on the GPU (the right ``dtype`` for storing on the GPU is given by ``theano.config.floatX``). To get around this shortcoming for the labels, we store them as floats, and then cast them to int. .. note:: If you are running your code on the GPU and the dataset you are using is too large to fit in memory the code will crash. In such a case you should not store the whole dataset in a shared variable. You can however store a sufficiently small chunk of your data (several minibatches) in a shared variable and use that during training. Once you have gone through the chunk, update the values it stores. This way you minimize the number of data transfers between CPU memory and GPU memory. .. index:: Notation Notation ======== .. index:: Dataset notation Dataset notation ++++++++++++++++ We label data sets as :math:`\mathcal{D}`.
When the distinction is important, we indicate train, validation, and test sets as: :math:`\mathcal{D}_{train}`, :math:`\mathcal{D}_{valid}` and :math:`\mathcal{D}_{test}`. The validation set is used to perform model selection and hyper-parameter selection, whereas the test set is used to evaluate the final generalization error and compare different algorithms in an unbiased way. The tutorials mostly deal with classification problems, where each data set :math:`\mathcal{D}` is an indexed set of pairs :math:`(x^{(i)},y^{(i)})`. We use superscripts to distinguish training set examples: :math:`x^{(i)} \in \mathcal{R}^D` is thus the i-th training example of dimensionality :math:`D`. Similarly, :math:`y^{(i)} \in \{0, ..., L\}` is the i-th label assigned to input :math:`x^{(i)}`. It is straightforward to extend these examples to ones where :math:`y^{(i)}` has other types (e.g. Gaussian for regression, or groups of multinomials for predicting multiple symbols). .. index:: Math Convetions Math Conventions ++++++++++++++++ * :math:`W`: upper-case symbols refer to a matrix unless specified otherwise * :math:`W_{ij}`: element at i-th row and j-th column of matrix :math:`W` * :math:`W_{i \cdot}, W_i`: vector, i-th row of matrix :math:`W` * :math:`W_{\cdot j}`: vector, j-th column of matrix :math:`W` * :math:`b`: lower-case symbols refer to a vector unless specified otherwise * :math:`b_i`: i-th element of vector :math:`b` .. index:: List of Symbols and acronyms List of Symbols and acronyms ++++++++++++++++++++++++++++ * :math:`D`: number of input dimensions. * :math:`D_h^{(i)}`: number of hidden units in the :math:`i`-th layer. * :math:`f_{\theta}(x)`, :math:`f(x)`: classification function associated with a model :math:`P(Y|x,\theta)`, defined as :math:`{\rm argmax}_k P(Y=k|x,\theta)`. Note that we will often drop the :math:`\theta` subscript. * L: number of labels. 
* :math:`\mathcal{L}(\theta, \cal{D})`: log-likelihood :math:`\cal{D}` of the model defined by parameters :math:`\theta`. * :math:`\ell(\theta, \cal{D})` empirical loss of the prediction function f parameterized by :math:`\theta` on data set :math:`\cal{D}`. * NLL: negative log-likelihood * :math:`\theta`: set of all parameters for a given model .. index:: Python Namespaces Python Namespaces +++++++++++++++++ Tutorial code often uses the following namespaces: .. code-block:: python import theano import theano.tensor as T import numpy A Primer on Supervised Optimization for Deep Learning ===================================================== .. _stoch-grad-label: What's exciting about Deep Learning is largely the use of unsupervised learning of deep networks. But supervised learning also plays an important role. The utility of unsupervised *pre-training* is often evaluated on the basis of what performance can be achieved after supervised *fine-tuning*. This chapter reviews the basics of supervised learning for classification models, and covers the minibatch stochastic gradient descent algorithm that is used to fine-tune many of the models in the Deep Learning Tutorials. Have a look at these `introductory course notes on gradient-based learning `_ for more basics on the notion of optimizing a training criterion using the gradient. .. _opt_learn_classifier: Learning a Classifier +++++++++++++++++++++ .. index:: Zero-One Loss Zero-One Loss ------------- The models presented in these deep learning tutorials are mostly used for classification. The objective in training a classifier is to minimize the number of errors (zero-one loss) on unseen examples. If :math:`f: R^D \rightarrow \{0,...,L\}` is the prediction function, then this loss can be written as: .. 
math:: \ell_{0,1} = \sum_{i=0}^{|\mathcal{D}|} I_{f(x^{(i)}) \neq y^{(i)}} where either :math:`\mathcal{D}` is the training set (during training) or :math:`\mathcal{D} \cap \mathcal{D}_{train} = \emptyset` (to avoid biasing the evaluation of validation or test error). :math:`I` is the indicator function defined as: .. math:: I_x = \left\{\begin{array}{ccc} 1&\mbox{ if $x$ is True} \\ 0&\mbox{ otherwise}\end{array}\right. In this tutorial, :math:`f` is defined as: .. math:: f(x) = {\rm argmax}_k P(Y=k | x, \theta) In Python, using Theano, this can be written as: .. code-block:: python # zero_one_loss is a Theano variable representing a symbolic # expression of the zero one loss ; to get the actual value this # symbolic expression has to be compiled into a Theano function (see # the Theano tutorial for more details) zero_one_loss = T.sum(T.neq(T.argmax(p_y_given_x), y)) .. index:: Negative Log--Likelihood Loss Negative Log-Likelihood Loss ---------------------------- Since the zero-one loss is not differentiable, optimizing it for large models (thousands or millions of parameters) is prohibitively expensive (computationally). We thus maximize the log-likelihood of our classifier given all the labels in a training set. .. math:: \mathcal{L}(\theta, \mathcal{D}) = \sum_{i=0}^{|\mathcal{D}|} \log P(Y=y^{(i)} | x^{(i)}, \theta) The likelihood of the correct class is not the same as the number of right predictions, but from the point of view of a randomly initialized classifier they are pretty similar. Remember that likelihood and zero-one loss are different objectives; you should see that they are correlated on the validation set but sometimes one will rise while the other falls, or vice-versa. Since we usually speak in terms of minimizing a loss function, learning will thus attempt to **minimize** the **negative** log-likelihood (NLL), defined as: ..
math:: NLL(\theta, \mathcal{D}) = - \sum_{i=0}^{|\mathcal{D}|} \log P(Y=y^{(i)} | x^{(i)}, \theta) The NLL of our classifier is a differentiable surrogate for the zero-one loss, and we use the gradient of this function over our training data as a supervised learning signal for deep learning of a classifier. This can be computed using the following line of code : .. code-block:: python # NLL is a symbolic variable ; to get the actual value of NLL, this symbolic # expression has to be compiled into a Theano function (see the Theano # tutorial for more details) NLL = -T.sum(T.log(p_y_given_x)[T.arange(y.shape[0]), y]) # note on syntax: T.arange(y.shape[0]) is a vector of integers [0,1,2,...,len(y)]. # Indexing a matrix M by the two vectors [0,1,...,K], [a,b,...,k] returns the # elements M[0,a], M[1,b], ..., M[K,k] as a vector. Here, we use this # syntax to retrieve the log-probability of the correct labels, y. .. index:: Stochastic Gradient Descent .. _opt_SGD: Stochastic Gradient Descent +++++++++++++++++++++++++++ What is ordinary gradient descent? it is a simple algorithm in which we repeatedly make small steps downward on an error surface defined by a loss function of some parameters. For the purpose of ordinary gradient descent we consider that the training data is rolled into the loss function. Then the pseudocode of this algorithm can be described as : .. code-block:: python # GRADIENT DESCENT while True: loss = f(params) d_loss_wrt_params = ... # compute gradient params -= learning_rate * d_loss_wrt_params if : return params Stochastic gradient descent (SGD) works according to the same principles as ordinary gradient descent, but proceeds more quickly by estimating the gradient from just a few examples at a time instead of the entire training set. In its purest form, we estimate the gradient from just a single example at a time. .. 
code-block:: python # STOCHASTIC GRADIENT DESCENT for (x_i,y_i) in training_set: # imagine an infinite generator # that may repeat examples (if there is only a finite training set) loss = f(params, x_i, y_i) d_loss_wrt_params = ... # compute gradient params -= learning_rate * d_loss_wrt_params if : return params The variant that we recommend for deep learning is a further twist on stochastic gradient descent using so-called "minibatches". Minibatch SGD works identically to SGD, except that we use more than one training example to make each estimate of the gradient. This technique reduces variance in the estimate of the gradient, and often makes better use of the hierarchical memory organization in modern computers. .. code-block:: python for (x_batch,y_batch) in train_batches: # imagine an infinite generator # that may repeat examples loss = f(params, x_batch, y_batch) d_loss_wrt_params = ... # compute gradient using theano params -= learning_rate * d_loss_wrt_params if : return params There is a tradeoff in the choice of the minibatch size :math:`B`. The reduction of variance and use of SIMD instructions helps most when increasing :math:`B` from 1 to 2, but the marginal improvement fades rapidly to nothing. With large :math:`B`, time is wasted in reducing the variance of the gradient estimator, that time would be better spent on additional gradient steps. An optimal :math:`B` is model-, dataset-, and hardware-dependent, and can be anywhere from 1 to maybe several hundreds. In the tutorial we set it to 20, but this choice is almost arbitrary (though harmless). .. note:: If you are training for a fixed number of epochs, the minibatch size becomes important because it controls the number of updates done to your parameters. Training the same model for 10 epochs using a batch size of 1 yields completely different results compared to training for the same 10 epochs but with a batchsize of 20. 
Keep this in mind when switching between batch sizes and be prepared to tweak all the other parameters according to the batch size used. All code-blocks above show pseudocode of what the algorithm looks like. Implementing such an algorithm in Theano can be done as follows: .. code-block:: python # Minibatch Stochastic Gradient Descent # assume loss is a symbolic description of the loss function given # the symbolic variables params (shared variable), x_batch, y_batch; # compute gradient of loss with respect to params d_loss_wrt_params = T.grad(loss, params) # compile the MSGD step into a theano function updates = [(params, params - learning_rate * d_loss_wrt_params)] MSGD = theano.function([x_batch,y_batch], loss, updates=updates) for (x_batch, y_batch) in train_batches: # here x_batch and y_batch are elements of train_batches and # therefore numpy arrays; function MSGD also updates the params print('Current loss is ', MSGD(x_batch, y_batch)) if stopping_condition_is_met: return params .. index:: Regularization Regularization ++++++++++++++ There is more to machine learning than optimization. When we train our model from data we are trying to prepare it to do well on *new* examples, not the ones it has already seen. The training loop above for MSGD does not take this into account, and may overfit the training examples. A way to combat overfitting is through regularization. There are several techniques for regularization; the ones we will explain here are L1/L2 regularization and early-stopping. .. index:: L1 and L2 regularization .. _L1_L2_regularization : L1 and L2 regularization ------------------------ L1 and L2 regularization involve adding an extra term to the loss function, which penalizes certain parameter configurations. Formally, if our loss function is: .. math:: NLL(\theta, \mathcal{D}) = - \sum_{i=0}^{|\mathcal{D}|} \log P(Y=y^{(i)} | x^{(i)}, \theta) then the regularized loss will be: ..
math:: E(\theta, \mathcal{D}) = NLL(\theta, \mathcal{D}) + \lambda R(\theta)\\ or, in our case .. math:: E(\theta, \mathcal{D}) = NLL(\theta, \mathcal{D}) + \lambda||\theta||_p^p where .. math:: ||\theta||_p = \left(\sum_{j=0}^{|\theta|}{|\theta_j|^p}\right)^{\frac{1}{p}} which is the :math:`L_p` norm of :math:`\theta`. :math:`\lambda` is a hyper-parameter which controls the relative importance of the regularization parameter. Commonly used values for p are 1 and 2, hence the L1/L2 nomenclature. If p=2, then the regularizer is also called "weight decay". In principle, adding a regularization term to the loss will encourage smooth network mappings in a neural network (by penalizing large values of the parameters, which decreases the amount of nonlinearity that the network models). More intuitively, the two terms (NLL and :math:`R(\theta)`) correspond to modelling the data well (NLL) and having "simple" or "smooth" solutions (:math:`R(\theta)`). Thus, minimizing the sum of both will, in theory, correspond to finding the right trade-off between the fit to the training data and the "generality" of the solution that is found. To follow Occam's razor principle, this minimization should find us the simplest solution (as measured by our simplicity criterion) that fits the training data. Note that the fact that a solution is "simple" does not mean that it will generalize well. Empirically, it was found that performing such regularization in the context of neural networks helps with generalization, especially on small datasets. The code block below shows how to compute the loss in python when it contains both a L1 regularization term weighted by :math:`\lambda_1` and L2 regularization term weighted by :math:`\lambda_2` .. 
code-block:: python # symbolic Theano variable that represents the L1 regularization term L1 = T.sum(abs(param)) # symbolic Theano variable that represents the squared L2 term L2_sqr = T.sum(param ** 2) # the loss loss = NLL + lambda_1 * L1 + lambda_2 * L2 .. index:: Early-Stopping .. _opt_early_stopping: Early-Stopping -------------- Early-stopping combats overfitting by monitoring the model's performance on a *validation set*. A validation set is a set of examples that we never use for gradient descent, but which is also not a part of the *test set*. The validation examples are considered to be representative of future test examples. We can use them during training because they are not part of the test set. If the model's performance ceases to improve sufficiently on the validation set, or even degrades with further optimization, then the heuristic implemented here gives up on much further optimization. The choice of when to stop is a judgement call and a few heuristics exist, but these tutorials will make use of a strategy based on a geometrically increasing amount of patience. .. code-block:: python # early-stopping parameters patience = 5000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience/2) # go through this many # minibatches before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf test_score = 0. start_time = time.clock() done_looping = False epoch = 0 while (epoch < n_epochs) and (not done_looping): # Report "1" for first epoch, "n_epochs" for last epoch epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): d_loss_wrt_params = ... # compute gradient params -= learning_rate * d_loss_wrt_params # gradient descent # iteration number. 
We want it to start at 0. iter = (epoch - 1) * n_train_batches + minibatch_index # note that if we do `iter % validation_frequency` it will be # true for iter = 0 which we do not want. We want it true for # iter = validation_frequency - 1. if (iter + 1) % validation_frequency == 0: this_validation_loss = ... # compute zero-one loss on validation set if this_validation_loss < best_validation_loss: # improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * improvement_threshold: patience = max(patience, iter * patience_increase) best_params = copy.deepcopy(params) best_validation_loss = this_validation_loss if patience <= iter: done_looping = True break # POSTCONDITION: # best_params refers to the best out-of-sample parameters observed during the optimization If we run out of batches of training data before running out of patience, then we just go back to the beginning of the training set and repeat. .. note:: The ``validation_frequency`` should always be smaller than the ``patience``. The code should check at least two times how it performs before running out of patience. This is the reason we used the formulation ``validation_frequency = min( value, patience/2.)`` .. note:: This algorithm could possibly be improved by using a test of statistical significance rather than the simple comparison, when deciding whether to increase the patience. .. index:: Testing Testing +++++++ After the loop exits, the best_params variable refers to the best-performing model on the validation set. If we repeat this procedure for another model class, or even another random initialization, we should use the same train/valid/test split of the data, and get other best-performing models. If we have to choose what the best model class or the best initialization was, we compare the best_validation_loss for each model. When we have finally chosen the model we think is the best (on validation data), we report that model's test set performance. 
That is the performance we expect on unseen examples. Recap +++++ That's it for the optimization section. The technique of early-stopping requires us to partition the set of examples into three sets (training :math:`\mathcal{D}_{train}`, validation :math:`\mathcal{D}_{valid}`, test :math:`\mathcal{D}_{test}`). The training set is used for minibatch stochastic gradient descent on the differentiable approximation of the objective function. As we perform this gradient descent, we periodically consult the validation set to see how our model is doing on the real objective function (or at least our empirical estimate of it). When we see a good model on the validation set, we save it. When it has been a long time since seeing a good model, we abandon our search and return the best parameters found, for evaluation on the test set. Theano/Python Tips =================== Loading and Saving Models ++++++++++++++++++++++++++ When you're doing experiments, it can take hours (sometimes days!) for gradient-descent to find the best parameters. You will want to save those weights once you find them. You may also want to save your current-best estimates as the search progresses. **Pickle the numpy ndarrays from your shared variables** The best way to save/archive your model's parameters is to use pickle or deepcopy the ndarray objects. So for example, if your parameters are in shared variables ``w, v, u``, then your save command should look something like: .. code-block:: python >>> import cPickle >>> save_file = open('path', 'wb') # this will overwrite current contents >>> cPickle.dump(w.get_value(borrow=True), save_file, -1) # the -1 is for HIGHEST_PROTOCOL >>> cPickle.dump(v.get_value(borrow=True), save_file, -1) # .. and it triggers much more efficient >>> cPickle.dump(u.get_value(borrow=True), save_file, -1) # .. storage than numpy's default >>> save_file.close() Then later, you can load your data back like this: .. 
code-block:: python >>> save_file = open('path') >>> w.set_value(cPickle.load(save_file), borrow=True) >>> v.set_value(cPickle.load(save_file), borrow=True) >>> u.set_value(cPickle.load(save_file), borrow=True) This technique is a bit verbose, but it is tried and true. You will be able to load your data and render it in matplotlib without trouble, years after saving it. **Do not pickle your training or test functions for long-term storage** Theano functions are compatible with Python's deepcopy and pickle mechanisms, but you should not necessarily pickle a Theano function. If you update your Theano folder and one of the internals changes, then you may not be able to un-pickle your model. Theano is still in active development, and the internal APIs are subject to change. So to be on the safe side -- do not pickle your entire training or testing functions for long-term storage. The pickle mechanism is aimed at short-term storage, such as a temp file, or a copy to another machine in a distributed job. Read more about `serialization in Theano`_, or Python's `pickling`_. .. _pickling: http://docs.python.org/library/pickle.html .. _serialization in Theano: http://deeplearning.net/software/theano/tutorial/loading_and_saving.html Plotting Intermediate Results ++++++++++++++++++++++++++++++ Visualizations can be very powerful tools for understanding what your model or training algorithm is doing. You might be tempted to insert ``matplotlib`` plotting commands, or ``PIL`` image-rendering commands into your model-training script. However, later you will observe something interesting in one of those pre-rendered images and want to investigate something that isn't clear from the pictures. You'll wish you had saved the original model. **If you have enough disk space, your training script should save intermediate models and a visualization script should process those saved models.** You already have a model-saving function, right?
Just use it again to save these intermediate models. Libraries you'll want to know about: Python Imaging Library (`PIL`_), `matplotlib`_. .. _PIL: http://www.pythonware.com/products/pil .. _matplotlib: http://matplotlib.sourceforge.net ================================================ FILE: DeepLearningTutorials/doc/hmc.txt ================================================ .. _HMC: Hybrid Monte-Carlo Sampling =========================== .. note:: This is an advanced tutorial, which shows how one can implement Hybrid Monte-Carlo (HMC) sampling using Theano. We assume the reader is already familiar with Theano and energy-based models such as the RBM. .. note:: The code for this section is available for download `here `_. Theory ++++++ Maximum likelihood learning of energy-based models requires a robust algorithm to sample negative phase particles (see Eq.(4) of the :doc:`rbm` tutorial). When training RBMs with CD or PCD, this is typically done with block Gibbs sampling, where the conditional distributions :math:`p(h|v)` and :math:`p(v|h)` are used as the transition operators of the Markov chain. In certain cases however, these conditional distributions might be difficult to sample from (i.e. requiring expensive matrix inversions, as in the case of the "mean-covariance RBM"). Also, even if Gibbs sampling can be done efficiently, it nevertheless operates via a random walk which might not be statistically efficient for some distributions. In this context, and when sampling from continuous variables, Hybrid Monte Carlo (HMC) can prove to be a powerful tool [Duane87]_. It avoids random walk behavior by simulating a physical system governed by Hamiltonian dynamics, potentially avoiding tricky conditional distributions in the process. In HMC, model samples are obtained by simulating a physical system, where particles move about a high-dimensional landscape, subject to potential and kinetic energies.
Adapting the notation from [Neal93]_, particles are characterized by a position vector or state :math:`s \in \mathcal{R}^D` and velocity vector :math:`\phi \in \mathcal{R}^D`. The combined state of a particle is denoted as :math:`\chi=(s,\phi)`. The Hamiltonian is then defined as the sum of potential energy :math:`E(s)` (same energy function defined by energy-based models) and kinetic energy :math:`K(\phi)`, as follows: .. math:: \mathcal{H}(s,\phi) = E(s) + K(\phi) = E(s) + \frac{1}{2} \sum_i \phi_i^2 Instead of sampling :math:`p(s)` directly, HMC operates by sampling from the canonical distribution :math:`p(s,\phi) = \frac{1}{Z} \exp(-\mathcal{H}(s,\phi))=p(s)p(\phi)`. Because the two variables are independent, marginalizing over :math:`\phi` is trivial and recovers the original distribution of interest. **Hamiltonian Dynamics** State :math:`s` and velocity :math:`\phi` are modified such that :math:`\mathcal{H}(s,\phi)` remains constant throughout the simulation. The differential equations are given by: .. math:: :label: ds_dt \frac{ds_i}{dt} &= \frac{\partial \mathcal{H}}{\partial \phi_i} = \phi_i \\ \frac{d\phi_i}{dt} &= - \frac{\partial \mathcal{H}}{\partial s_i} = - \frac{\partial E}{\partial s_i} As shown in [Neal93]_, the above transformation preserves volume and is reversible. The above dynamics can thus be used as transition operators of a Markov chain and will leave :math:`p(s,\phi)` invariant. That chain by itself is not ergodic however, since simulating the dynamics maintains a fixed Hamiltonian :math:`\mathcal{H}(s,\phi)`. HMC thus alternates hamiltonian dynamic steps, with Gibbs sampling of the velocity. Because :math:`p(s)` and :math:`p(\phi)` are independent, sampling :math:`\phi_{new} \sim p(\phi|s)` is trivial since :math:`p(\phi|s)=p(\phi)`, where :math:`p(\phi)` is often taken to be the uni-variate Gaussian. **The Leap-Frog Algorithm** In practice, we cannot simulate Hamiltonian dynamics exactly because of the problem of time discretization. 
There are several ways one can discretize time. To maintain invariance of the Markov chain however, care must be taken to preserve the properties of volume conservation and time reversibility. The **leap-frog algorithm** maintains these properties and operates in 3 steps: .. math:: :label: leap-frog \phi_i(t + \epsilon/2) &= \phi_i(t) - \frac{\epsilon}{2} \frac{\partial{}}{\partial s_i} E(s(t)) \\ s_i(t + \epsilon) &= s_i(t) + \epsilon \phi_i(t + \epsilon/2) \\ \phi_i(t + \epsilon) &= \phi_i(t + \epsilon/2) - \frac{\epsilon}{2} \frac{\partial{}}{\partial s_i} E(s(t + \epsilon)) \\ We thus perform a half-step update of the velocity at time :math:`t+\epsilon/2`, which is then used to compute :math:`s(t + \epsilon)` and :math:`\phi(t + \epsilon)`. **Accept / Reject** In practice, using finite stepsizes :math:`\epsilon` will not preserve :math:`\mathcal{H}(s,\phi)` exactly and will introduce bias in the simulation. Also, rounding errors due to the use of floating point numbers mean that the above transformation will not be perfectly reversible. HMC cancels these effects **exactly** by adding a Metropolis accept/reject stage, after :math:`n` leapfrog steps. The new state :math:`\chi' = (s',\phi')` is accepted with probability :math:`p_{acc}(\chi,\chi')`, defined as: .. math:: p_{acc}(\chi,\chi') = \min \left( 1, \frac{\exp(-\mathcal{H}(s',\phi'))}{\exp(-\mathcal{H}(s,\phi))} \right) **HMC Algorithm** In this tutorial, we obtain a new HMC sample as follows: 1. sample a new velocity from a univariate Gaussian distribution 2. perform :math:`n` leapfrog steps to obtain the new state :math:`\chi'` 3. perform accept/reject move of :math:`\chi'` Implementing HMC Using Theano +++++++++++++++++++++++++++++ In Theano, update dictionaries and shared variables provide a natural way to implement a sampling algorithm. The current state of the sampler can be represented as a Theano shared variable, with HMC updates being implemented by the updates list of a Theano function.
We break down the HMC algorithm into the following sub-components: * `simulate\_dynamics`: a symbolic Python function which, given an initial position and velocity, will perform `n\_steps` leapfrog updates and return the symbolic variables for the proposed state :math:`\chi'`. * `hmc\_move`: a symbolic Python function which given a starting position, generates :math:`\chi` by randomly sampling a velocity vector. It then calls `simulate\_dynamics` and determines whether the transition :math:`\chi \rightarrow \chi'` is to be accepted. * `hmc\_updates`: a Python function which, given the symbolic outputs of `hmc\_move`, generates the list of updates for a single iteration of HMC. * `HMC\_sampler`: a Python helper class which wraps everything together. **simulate_dynamics** To perform :math:`n` leapfrog steps, we first need to define a function over which `Scan` can iterate. Instead of implementing Eq. :eq:`leap-frog` verbatim, notice that we can obtain :math:`s(t + n \epsilon)` and :math:`\phi(t + n \epsilon)` by performing an initial half-step update for :math:`\phi`, followed by :math:`n` full-step updates for :math:`s,\phi` and one last half-step update for :math:`\phi`. In loop form, this gives: .. math:: :label: leap-frog2 & \phi_i(t + \epsilon/2) = \phi_i(t) - \frac{\epsilon}{2} \frac{\partial{}}{\partial s_i} E(s(t)) \\ & s_i(t + \epsilon) = s_i(t) + \epsilon \phi_i(t + \epsilon/2) \\ & \text{For } m \in [2,n]\text{, perform full updates: } \\ & \qquad \phi_i(t + (m - 1/2)\epsilon) = \phi_i(t + (m-3/2)\epsilon) - \epsilon \frac{\partial{}}{\partial s_i} E(s(t + (m-1)\epsilon)) \\ & \qquad s_i(t + m\epsilon) = s_i(t + (m-1)\epsilon) + \epsilon \phi_i(t + (m-1/2)\epsilon) \\ & \phi_i(t + n\epsilon) = \phi_i(t + (n-1/2)\epsilon) - \frac{\epsilon}{2} \frac{\partial{}}{\partial s_i} E(s(t + n\epsilon)) \\ The inner-loop defined above is implemented by the following `leapfrog` function, with `pos`, `vel` and `step` replacing :math:`s,\phi` and :math:`\epsilon` respectively. .. 
literalinclude:: ../code/hmc/hmc.py :pyobject: simulate_dynamics.leapfrog The `simulate_dynamics` function performs the full algorithm of Eqs. :eq:`leap-frog2`. We start with the initial half-step update of :math:`\phi` and full-step of :math:`s`, and then scan over the `leapfrog` method `n\_steps-1` times. .. literalinclude:: ../code/hmc/hmc.py :pyobject: simulate_dynamics A final half-step is performed to compute :math:`\phi(t+n\epsilon)`, and the final proposed state :math:`\chi'` is returned. **hmc_move** The `hmc\_move` function implements the remaining steps (steps 1 and 3) of an HMC move proposal (while wrapping the `simulate\_dynamics` function). Given a matrix of initial states :math:`s \in \mathcal{R}^{N \times D}` (`positions`) and energy function :math:`E(s)` (`energy\_fn`), it defines the symbolic graph for computing `n\_steps` of HMC, using a given `stepsize`. The function prototype is as follows: .. literalinclude:: ../code/hmc/hmc.py :start-after: start-snippet-1 :end-before: end-snippet-1 We start by sampling random velocities, using the provided shared RandomStream object. Velocities are sampled independently for each dimension and for each particle under simulation, yielding a :math:`N \times D` matrix. .. literalinclude:: ../code/hmc/hmc.py :start-after: start-snippet-2 :end-before: end-snippet-2 Since we now have an initial position and velocity, we can now call the `simulate\_dynamics` to obtain the proposal for the new state :math:`\chi'`. .. literalinclude:: ../code/hmc/hmc.py :start-after: start-snippet-3 :end-before: end-snippet-3 We then accept/reject the proposed state based on the Metropolis algorithm. .. literalinclude:: ../code/hmc/hmc.py :start-after: start-snippet-4 :end-before: end-snippet-4 where `metropolis\_hastings\_accept` and `hamiltonian` are helper functions, defined as follows. .. literalinclude:: ../code/hmc/hmc.py :pyobject: metropolis_hastings_accept .. literalinclude:: ../code/hmc/hmc.py :pyobject: hamiltonian .. 
literalinclude:: ../code/hmc/hmc.py :pyobject: kinetic_energy `hmc\_move` finally returns the tuple `(accept, final\_pos)`. `accept` is a symbolic boolean variable indicating whether or not the new state `final_pos` should be used. **hmc_updates** .. _switch: http://deeplearning.net/software/theano/library/tensor/basic.html#tensor.switch .. _clip: http://deeplearning.net/software/theano/library/tensor/basic.html#tensor.clip .. _dimshuffle: http://deeplearning.net/software/theano/library/tensor/basic.html#tensor._tensor_py_operators.dimshuffle The purpose of `hmc\_updates` is to generate the list of updates to perform, whenever our HMC sampling function is called. `hmc\_updates` thus receives as parameters, a series of shared variables to update (`positions`, `stepsize` and `avg\_acceptance\_rate`), and the parameters required to compute their new state. .. literalinclude:: ../code/hmc/hmc.py :start-after: start-snippet-5 :end-before: end-snippet-5 Using the above code, the dictionary `{positions: new\_positions}` can be used to update the state of the sampler with either (1) the new state `final\_pos` if `accept` is True, or (2) the old state if `accept` is False. This conditional assignment is performed by the `switch`_ op. `switch` expects as its first argument, a boolean mask with the same broadcastable dimensions as the second and third argument. Since `accept` is scalar-valued, we must first use `dimshuffle`_ to transform it to a tensor with `final\_pos.ndim` broadcastable dimensions (`accept\_matrix`). `hmc\_updates` additionally implements an adaptive version of HMC, as implemented in the accompanying code to [Ranzato10]_. We start by tracking the average acceptance rate of the HMC move proposals (across many simulations), using an exponential moving average with time constant `1-avg\_acceptance\_slowness`. .. 
literalinclude:: ../code/hmc/hmc.py :start-after: start-snippet-6 :end-before: end-snippet-6 If the average acceptance rate is larger than the `target\_acceptance\_rate`, we increase the `stepsize` by a factor of `stepsize\_inc` in order to increase the mixing rate of our chain. If the average acceptance rate is too low however, `stepsize` is decreased by a factor of `stepsize\_dec`, yielding a more conservative mixing rate. The `clip`_ op allows us to maintain the `stepsize` in the range [`stepsize\_min`, `stepsize\_max`]. .. literalinclude:: ../code/hmc/hmc.py :start-after: start-snippet-7 :end-before: end-snippet-7 The final updates list is then returned. .. literalinclude:: ../code/hmc/hmc.py :start-after: start-snippet-8 :end-before: end-snippet-8 **HMC_sampler** We finally tie everything together using the `HMC\_sampler` class. Its main elements are: * `new\_from\_shared\_positions`: a constructor method which allocates various shared variables and strings together the calls to `hmc\_move` and `hmc\_updates`. It also builds the theano function `simulate`, whose sole purpose is to execute the updates generated by `hmc\_updates`. * `draw`: a convenience method which calls the Theano function `simulate` and returns a copy of the contents of the shared variable `self.positions`. .. literalinclude:: ../code/hmc/hmc.py :pyobject: HMC_sampler Testing our Sampler +++++++++++++++++++ We test our implementation of HMC by sampling from a multi-variate Gaussian distribution. We start by generating a random mean vector `mu` and covariance matrix `cov`, which allows us to define the energy function of the corresponding Gaussian distribution: `gaussian\_energy`. We then initialize the state of the sampler by allocating a `position` shared variable. It is passed to the constructor of `HMC\_sampler` along with our target energy function. 
Following a burn-in period, we then generate a large number of samples and compare the empirical mean and covariance matrix to their true values. .. literalinclude:: ../code/hmc/test_hmc.py :pyobject: sampler_on_nd_gaussian .. literalinclude:: ../code/hmc/test_hmc.py :pyobject: test_hmc The above code can be run using the command: "nosetests -s code/hmc/test\_hmc.py". The output is as follows: .. code-block:: bash [desjagui@atchoum hmc]$ python test_hmc.py ****** TARGET VALUES ****** target mean: [ 6.96469186 2.86139335 2.26851454 5.51314769 7.1946897 ] target cov: [[ 1. 0.66197111 0.71141257 0.55766643 0.35753822] [ 0.66197111 1. 0.31053199 0.45455485 0.37991646] [ 0.71141257 0.31053199 1. 0.62800335 0.38004541] [ 0.55766643 0.45455485 0.62800335 1. 0.50807871] [ 0.35753822 0.37991646 0.38004541 0.50807871 1. ]] ****** EMPIRICAL MEAN/COV USING HMC ****** empirical mean: [ 6.94155164 2.81526039 2.26301715 5.46536853 7.19414496] empirical_cov: [[ 1.05152997 0.68393537 0.76038645 0.59930252 0.37478746] [ 0.68393537 0.97708159 0.37351422 0.48362404 0.3839558 ] [ 0.76038645 0.37351422 1.03797111 0.67342957 0.41529132] [ 0.59930252 0.48362404 0.67342957 1.02865056 0.53613649] [ 0.37478746 0.3839558 0.41529132 0.53613649 0.98721449]] ****** HMC INTERNALS ****** final stepsize 0.460446628091 final acceptance_rate 0.922502043428 As can be seen above, the samples generated by our HMC sampler yield an empirical mean and covariance matrix, which are very close to the true underlying parameters. The adaptive algorithm also seemed to work well as the final acceptance rate is close to our target of `0.9`. References ++++++++++ .. [Alder59] Alder, B. J. and Wainwright, T. E. (1959) "Studies in molecular dynamics. 1. General method", Journal of Chemical Physics, vol. 31, pp. 459-466. .. [Andersen80] Andersen, H.C. (1980) "Molecular dynamics simulations at constant pressure and/or temperature", Journal of Chemical Physics, vol. 72, pp. 2384-2393. .. [Duane87] Duane, S., Kennedy, A. 
D., Pendleton, B. J., and Roweth, D. (1987) "Hybrid Monte Carlo", Physics Letters, vol. 195, pp. 216-222. .. [Neal93] Neal, R. M. (1993) "Probabilistic Inference Using Markov Chain Monte Carlo Methods", Technical Report CRG-TR-93-1, Dept. of Computer Science, University of Toronto, 144 pages ================================================ FILE: DeepLearningTutorials/doc/index.txt ================================================ ======================= Deep Learning Tutorials ======================= Deep Learning is a new area of Machine Learning research, which has been introduced with the objective of moving Machine Learning closer to one of its original goals: Artificial Intelligence. See these course notes for a `brief introduction to Machine Learning for AI `_ and an `introduction to Deep Learning algorithms `_. Deep Learning is about learning multiple levels of representation and abstraction that help to make sense of data such as images, sound, and text. For more about deep learning algorithms, see for example: - The monograph or review paper `Learning Deep Architectures for AI `_ (Foundations & Trends in Machine Learning, 2009). - The ICML 2009 Workshop on Learning Feature Hierarchies `webpage `_ has a `list of references `_. - The LISA `public wiki `_ has a `reading list `_ and a `bibliography `_. - Geoff Hinton has `readings `_ from 2009's `NIPS tutorial `_. The tutorials presented here will introduce you to some of the most important deep learning algorithms and will also show you how to run them using Theano_. Theano is a python library that makes writing deep learning models easy, and gives the option of training them on a GPU. The algorithm tutorials have some prerequisites. You should know some python, and be familiar with numpy. Since this tutorial is about using Theano, you should read over the `Theano basic tutorial`_ first. 
Once you've done that, read through our :ref:`gettingstarted` chapter -- it introduces the notation, and [downloadable] datasets used in the algorithm tutorials, and the way we do optimization by stochastic gradient descent. The purely supervised learning algorithms are meant to be read in order: #. :ref:`Logistic Regression ` - using Theano for something simple #. :ref:`Multilayer perceptron ` - introduction to layers #. :ref:`Deep Convolutional Network ` - a simplified version of LeNet5 The unsupervised and semi-supervised learning algorithms can be read in any order (the auto-encoders can be read independently of the RBM/DBN thread): * :ref:`Auto Encoders, Denoising Autoencoders ` - description of autoencoders * :ref:`Stacked Denoising Auto-Encoders ` - easy steps into unsupervised pre-training for deep nets * :ref:`Restricted Boltzmann Machines ` - single layer generative RBM model * :ref:`Deep Belief Networks ` - unsupervised generative pre-training of stacked RBMs followed by supervised fine-tuning Building towards including the mcRBM model, we have a new tutorial on sampling from energy models: * :ref:`HMC Sampling ` - hybrid (aka Hamiltonian) Monte-Carlo sampling with scan() Building towards including the Contractive auto-encoders tutorial, we have the code for now: * `Contractive auto-encoders`_ code - There is some basic doc in the code. Recurrent neural networks with word embeddings and context window: * :ref:`Semantic Parsing of Speech using Recurrent Net ` LSTM network for sentiment analysis: * :ref:`LSTM network ` Energy-based recurrent neural network (RNN-RBM): * :ref:`Modeling and generating sequences of polyphonic music ` .. _Theano: http://deeplearning.net/software/theano .. _Theano basic tutorial: http://deeplearning.net/software/theano/tutorial .. 
_Contractive auto-encoders: https://github.com/lisa-lab/DeepLearningTutorials/blob/master/code/cA.py ================================================ FILE: DeepLearningTutorials/doc/lenet.txt ================================================ .. _lenet: Convolutional Neural Networks (LeNet) ===================================== .. note:: This section assumes the reader has already read through :doc:`logreg` and :doc:`mlp`. Additionally, it uses the following new Theano functions and concepts: `T.tanh`_, `shared variables`_, `basic arithmetic ops`_, `T.grad`_, `floatX`_, `downsample`_ , `conv2d`_, `dimshuffle`_. If you intend to run the code on GPU also read `GPU`_. To run this example on a GPU, you need a good GPU. It needs at least 1GB of GPU RAM. More may be required if your monitor is connected to the GPU. When the GPU is connected to the monitor, there is a limit of a few seconds for each GPU function call. This is needed as current GPUs can't be used for the monitor while doing computation. Without this limit, the screen would freeze for too long and make it look as if the computer froze. This example hits this limit with medium-quality GPUs. When the GPU isn't connected to a monitor, there is no time limit. You can lower the batch size to fix the time out problem. .. _T.tanh: http://deeplearning.net/software/theano/tutorial/examples.html?highlight=tanh .. _shared variables: http://deeplearning.net/software/theano/tutorial/examples.html#using-shared-variables .. _basic arithmetic ops: http://deeplearning.net/software/theano/tutorial/adding.html#adding-two-scalars .. _T.grad: http://deeplearning.net/software/theano/tutorial/examples.html#computing-gradients .. _floatX: http://deeplearning.net/software/theano/library/config.html#config.floatX .. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html .. _downsample: http://deeplearning.net/software/theano/library/tensor/signal/downsample.html .. 
_conv2d: http://deeplearning.net/software/theano/library/tensor/signal/conv.html#module-conv .. _dimshuffle: http://deeplearning.net/software/theano/library/tensor/basic.html#tensor._tensor_py_operators.dimshuffle .. note:: The code for this section is available for download `here`_ and the `3wolfmoon image`_ .. _here: http://deeplearning.net/tutorial/code/convolutional_mlp.py .. _3wolfmoon image: https://raw.githubusercontent.com/lisa-lab/DeepLearningTutorials/master/doc/images/3wolfmoon.jpg Motivation ++++++++++ Convolutional Neural Networks (CNN) are biologically-inspired variants of MLPs. From Hubel and Wiesel's early work on the cat's visual cortex [Hubel68]_, we know the visual cortex contains a complex arrangement of cells. These cells are sensitive to small sub-regions of the visual field, called a *receptive field*. The sub-regions are tiled to cover the entire visual field. These cells act as local filters over the input space and are well-suited to exploit the strong spatially local correlation present in natural images. Additionally, two basic cell types have been identified: Simple cells respond maximally to specific edge-like patterns within their receptive field. Complex cells have larger receptive fields and are locally invariant to the exact position of the pattern. The animal visual cortex being the most powerful visual processing system in existence, it seems natural to emulate its behavior. Hence, many neurally-inspired models can be found in the literature. To name a few: the NeoCognitron [Fukushima]_, HMAX [Serre07]_ and LeNet-5 [LeCun98]_, which will be the focus of this tutorial. Sparse Connectivity +++++++++++++++++++ CNNs exploit spatially-local correlation by enforcing a local connectivity pattern between neurons of adjacent layers. In other words, the inputs of hidden units in layer **m** are from a subset of units in layer **m-1**, units that have spatially contiguous receptive fields. We can illustrate this graphically as follows: .. 
figure:: images/sparse_1D_nn.png :align: center Imagine that layer **m-1** is the input retina. In the above figure, units in layer **m** have receptive fields of width 3 in the input retina and are thus only connected to 3 adjacent neurons in the retina layer. Units in layer **m+1** have a similar connectivity with the layer below. We say that their receptive field with respect to the layer below is also 3, but their receptive field with respect to the input is larger (5). Each unit is unresponsive to variations outside of its receptive field with respect to the retina. The architecture thus ensures that the learnt "filters" produce the strongest response to a spatially local input pattern. However, as shown above, stacking many such layers leads to (non-linear) "filters" that become increasingly "global" (i.e. responsive to a larger region of pixel space). For example, the unit in hidden layer **m+1** can encode a non-linear feature of width 5 (in terms of pixel space). Shared Weights ++++++++++++++ In addition, in CNNs, each filter :math:`h_i` is replicated across the entire visual field. These replicated units share the same parameterization (weight vector and bias) and form a *feature map*. .. figure:: images/conv_1D_nn.png :align: center In the above figure, we show 3 hidden units belonging to the same feature map. Weights of the same color are shared---constrained to be identical. Gradient descent can still be used to learn such shared parameters, with only a small change to the original algorithm. The gradient of a shared weight is simply the sum of the gradients of the parameters being shared. Replicating units in this way allows for features to be detected *regardless of their position in the visual field.* Additionally, weight sharing increases learning efficiency by greatly reducing the number of free parameters being learnt. The constraints on the model enable CNNs to achieve better generalization on vision problems. 
Details and Notation ++++++++++++++++++++ A feature map is obtained by repeated application of a function across sub-regions of the entire image, in other words, by *convolution* of the input image with a linear filter, adding a bias term and then applying a non-linear function. If we denote the k-th feature map at a given layer as :math:`h^k`, whose filters are determined by the weights :math:`W^k` and bias :math:`b_k`, then the feature map :math:`h^k` is obtained as follows (for :math:`tanh` non-linearities): .. math:: h^k_{ij} = \tanh ( (W^k * x)_{ij} + b_k ). .. Note:: Recall the following definition of convolution for a 1D signal. :math:`o[n] = f[n]*g[n] = \sum_{u=-\infty}^{\infty} f[u] g[n-u] = \sum_{u=-\infty}^{\infty} f[n-u] g[u]`. This can be extended to 2D as follows: :math:`o[m,n] = f[m,n]*g[m,n] = \sum_{u=-\infty}^{\infty} \sum_{v=-\infty}^{\infty} f[u,v] g[m-u,n-v]`. To form a richer representation of the data, each hidden layer is composed of *multiple* feature maps, :math:`\{h^{(k)}, k=0..K\}`. The weights :math:`W` of a hidden layer can be represented in a 4D tensor containing elements for every combination of destination feature map, source feature map, source vertical position, and source horizontal position. The biases :math:`b` can be represented as a vector containing one element for every destination feature map. We illustrate this graphically as follows: .. figure:: images/cnn_explained.png :align: center **Figure 1**: example of a convolutional layer The figure shows two layers of a CNN. **Layer m-1** contains four feature maps. **Hidden layer m** contains two feature maps (:math:`h^0` and :math:`h^1`). Pixels (neuron outputs) in :math:`h^0` and :math:`h^1` (outlined as blue and red squares) are computed from pixels of layer (m-1) which fall within their 2x2 receptive field in the layer below (shown as colored rectangles). Notice how the receptive field spans all four input feature maps. 
The weights :math:`W^0` and :math:`W^1` of :math:`h^0` and :math:`h^1` are thus 3D weight tensors. The leading dimension indexes the input feature maps, while the other two refer to the pixel coordinates. Putting it all together, :math:`W^{kl}_{ij}` denotes the weight connecting each pixel of the k-th feature map at layer m, with the pixel at coordinates (i,j) of the l-th feature map of layer (m-1). The Convolution Operator ++++++++++++++++++++++++ ConvOp is the main workhorse for implementing a convolutional layer in Theano. ConvOp is used by ``theano.tensor.nnet.conv.conv2d``, which takes two symbolic inputs: * a 4D tensor corresponding to a mini-batch of input images. The shape of the tensor is as follows: [mini-batch size, number of input feature maps, image height, image width]. * a 4D tensor corresponding to the weight matrix :math:`W`. The shape of the tensor is: [number of feature maps at layer m, number of feature maps at layer m-1, filter height, filter width] Below is the Theano code for implementing a convolutional layer similar to the one of Figure 1. The input consists of 3 feature maps (an RGB color image) of size 120x160. We use two convolutional filters with 9x9 receptive fields. .. code-block:: python import theano from theano import tensor as T from theano.tensor.nnet import conv import numpy rng = numpy.random.RandomState(23455) # instantiate 4D tensor for input input = T.tensor4(name='input') # initialize shared variable for weights. w_shp = (2, 3, 9, 9) w_bound = numpy.sqrt(3 * 9 * 9) W = theano.shared( numpy.asarray( rng.uniform( low=-1.0 / w_bound, high=1.0 / w_bound, size=w_shp), dtype=input.dtype), name ='W') # initialize shared variable for bias (1D tensor) with random values # IMPORTANT: biases are usually initialized to zero. However in this # particular application, we simply apply the convolutional layer to # an image without learning the parameters. We therefore initialize # them to random values to "simulate" learning. 
b_shp = (2,) b = theano.shared(numpy.asarray( rng.uniform(low=-.5, high=.5, size=b_shp), dtype=input.dtype), name ='b') # build symbolic expression that computes the convolution of input with filters in w conv_out = conv.conv2d(input, W) # build symbolic expression to add bias and apply activation function, i.e. produce neural net layer output # A few words on ``dimshuffle`` : # ``dimshuffle`` is a powerful tool in reshaping a tensor; # what it allows you to do is to shuffle dimension around # but also to insert new ones along which the tensor will be # broadcastable; # dimshuffle('x', 2, 'x', 0, 1) # This will work on 3d tensors with no broadcastable # dimensions. The first dimension will be broadcastable, # then we will have the third dimension of the input tensor as # the second of the resulting tensor, etc. If the tensor has # shape (20, 30, 40), the resulting tensor will have dimensions # (1, 40, 1, 20, 30). (AxBxC tensor is mapped to 1xCx1xAxB tensor) # More examples: # dimshuffle('x') -> make a 0d (scalar) into a 1d vector # dimshuffle(0, 1) -> identity # dimshuffle(1, 0) -> inverts the first and second dimensions # dimshuffle('x', 0) -> make a row out of a 1d vector (N to 1xN) # dimshuffle(0, 'x') -> make a column out of a 1d vector (N to Nx1) # dimshuffle(2, 0, 1) -> AxBxC to CxAxB # dimshuffle(0, 'x', 1) -> AxB to Ax1xB # dimshuffle(1, 'x', 0) -> AxB to Bx1xA output = T.nnet.sigmoid(conv_out + b.dimshuffle('x', 0, 'x', 'x')) # create theano function to compute filtered images f = theano.function([input], output) Let's have a little bit of fun with this... .. code-block:: python import numpy import pylab from PIL import Image # open random image of dimensions 639x516 img = Image.open(open('doc/images/3wolfmoon.jpg')) # dimensions are (height, width, channel) img = numpy.asarray(img, dtype='float64') / 256. 
# put image in 4D tensor of shape (1, 3, height, width) img_ = img.transpose(2, 0, 1).reshape(1, 3, 639, 516) filtered_img = f(img_) # plot original image and first and second components of output pylab.subplot(1, 3, 1); pylab.axis('off'); pylab.imshow(img) pylab.gray(); # recall that the convOp output (filtered image) is actually a "minibatch", # of size 1 here, so we take index 0 in the first dimension: pylab.subplot(1, 3, 2); pylab.axis('off'); pylab.imshow(filtered_img[0, 0, :, :]) pylab.subplot(1, 3, 3); pylab.axis('off'); pylab.imshow(filtered_img[0, 1, :, :]) pylab.show() This should generate the following output. .. image:: images/3wolfmoon_output.png :align: center Notice that a randomly initialized filter acts very much like an edge detector! Note that we use the same weight initialization formula as with the MLP. Weights are sampled randomly from a uniform distribution in the range [-1/sqrt(fan-in), 1/sqrt(fan-in)], where fan-in is the number of inputs to a hidden unit. For MLPs, this was the number of units in the layer below. For CNNs however, we have to take into account the number of input feature maps and the size of the receptive fields. MaxPooling ++++++++++ Another important concept of CNNs is *max-pooling,* which is a form of non-linear down-sampling. Max-pooling partitions the input image into a set of non-overlapping rectangles and, for each such sub-region, outputs the maximum value. Max-pooling is useful in vision for two reasons: #. By eliminating non-maximal values, it reduces computation for upper layers. #. It provides a form of translation invariance. Imagine cascading a max-pooling layer with a convolutional layer. There are 8 directions in which one can translate the input image by a single pixel. If max-pooling is done over a 2x2 region, 3 out of these 8 possible configurations will produce exactly the same output at the convolutional layer. For max-pooling over a 3x3 window, this jumps to 5/8. 
Since it provides additional robustness to position, max-pooling is a "smart" way of reducing the dimensionality of intermediate representations. Max-pooling is done in Theano by way of ``theano.tensor.signal.downsample.max_pool_2d``. This function takes as input an N dimensional tensor (where N >= 2) and a downscaling factor and performs max-pooling over the 2 trailing dimensions of the tensor. An example is worth a thousand words: .. code-block:: python from theano.tensor.signal import downsample input = T.dtensor4('input') maxpool_shape = (2, 2) pool_out = downsample.max_pool_2d(input, maxpool_shape, ignore_border=True) f = theano.function([input],pool_out) invals = numpy.random.RandomState(1).rand(3, 2, 5, 5) print 'With ignore_border set to True:' print 'invals[0, 0, :, :] =\n', invals[0, 0, :, :] print 'output[0, 0, :, :] =\n', f(invals)[0, 0, :, :] pool_out = downsample.max_pool_2d(input, maxpool_shape, ignore_border=False) f = theano.function([input],pool_out) print 'With ignore_border set to False:' print 'invals[1, 0, :, :] =\n ', invals[1, 0, :, :] print 'output[1, 0, :, :] =\n ', f(invals)[1, 0, :, :] This should generate the following output: .. 
code-block:: bash With ignore_border set to True: invals[0, 0, :, :] = [[ 4.17022005e-01 7.20324493e-01 1.14374817e-04 3.02332573e-01 1.46755891e-01] [ 9.23385948e-02 1.86260211e-01 3.45560727e-01 3.96767474e-01 5.38816734e-01] [ 4.19194514e-01 6.85219500e-01 2.04452250e-01 8.78117436e-01 2.73875932e-02] [ 6.70467510e-01 4.17304802e-01 5.58689828e-01 1.40386939e-01 1.98101489e-01] [ 8.00744569e-01 9.68261576e-01 3.13424178e-01 6.92322616e-01 8.76389152e-01]] output[0, 0, :, :] = [[ 0.72032449 0.39676747] [ 0.6852195 0.87811744]] With ignore_border set to False: invals[1, 0, :, :] = [[ 0.01936696 0.67883553 0.21162812 0.26554666 0.49157316] [ 0.05336255 0.57411761 0.14672857 0.58930554 0.69975836] [ 0.10233443 0.41405599 0.69440016 0.41417927 0.04995346] [ 0.53589641 0.66379465 0.51488911 0.94459476 0.58655504] [ 0.90340192 0.1374747 0.13927635 0.80739129 0.39767684]] output[1, 0, :, :] = [[ 0.67883553 0.58930554 0.69975836] [ 0.66379465 0.94459476 0.58655504] [ 0.90340192 0.80739129 0.39767684]] Note that compared to most Theano code, the ``max_pool_2d`` operation is a little *special*. It requires the downscaling factor ``ds`` (tuple of length 2 containing downscaling factors for image width and height) to be known at graph build time. This may change in the near future. The Full Model: LeNet +++++++++++++++++++++ Sparse, convolutional layers and max-pooling are at the heart of the LeNet family of models. While the exact details of the model will vary greatly, the figure below shows a graphical depiction of a LeNet model. .. image:: images/mylenet.png :align: center The lower-layers are composed of alternating convolution and max-pooling layers. The upper-layers however are fully-connected and correspond to a traditional MLP (hidden layer + logistic regression). The input to the first fully-connected layer is the set of all feature maps at the layer below. From an implementation point of view, this means lower-layers operate on 4D tensors. 
These are then flattened to a 2D matrix of rasterized feature maps, to be compatible with our previous MLP implementation. Putting it All Together +++++++++++++++++++++++ We now have all we need to implement a LeNet model in Theano. We start with the LeNetConvPoolLayer class, which implements a {convolution + max-pooling} layer. .. literalinclude:: ../code/convolutional_mlp.py :pyobject: LeNetConvPoolLayer Notice that when initializing the weight values, the fan-in is determined by the size of the receptive fields and the number of input feature maps. Finally, using the LogisticRegression class defined in :doc:`logreg` and the HiddenLayer class defined in :doc:`mlp` , we can instantiate the network as follows. .. literalinclude:: ../code/convolutional_mlp.py :start-after: start-snippet-1 :end-before: end-snippet-1 We leave out the code that performs the actual training and early-stopping, since it is exactly the same as with an MLP. The interested reader can nevertheless access the code in the 'code' folder of DeepLearningTutorials. Running the Code ++++++++++++++++ The user can then run the code by calling: .. code-block:: bash python code/convolutional_mlp.py The following output was obtained with the default parameters on a Core i7-2600K CPU clocked at 3.40GHz and using flags 'floatX=float32': .. code-block:: bash Optimization complete. Best validation score of 0.910000 % obtained at iteration 17800,with test performance 0.920000 % The code for file convolutional_mlp.py ran for 380.28m Using a GeForce GTX 285, we obtained the following: .. code-block:: bash Optimization complete. Best validation score of 0.910000 % obtained at iteration 15500,with test performance 0.930000 % The code for file convolutional_mlp.py ran for 46.76m And similarly on a GeForce GTX 480: .. code-block:: bash Optimization complete. 
Best validation score of 0.910000 % obtained at iteration 16400,with test performance 0.930000 % The code for file convolutional_mlp.py ran for 32.52m Note that the discrepancies in validation and test error (as well as iteration count) are due to different implementations of the rounding mechanism in hardware. They can be safely ignored. Tips and Tricks +++++++++++++++ Choosing Hyperparameters ------------------------ CNNs are especially tricky to train, as they add even more hyper-parameters than a standard MLP. While the usual rules of thumb for learning rates and regularization constants still apply, the following should be kept in mind when optimizing CNNs. Number of filters ***************** When choosing the number of filters per layer, keep in mind that computing the activations of a single convolutional filter is much more expensive than with traditional MLPs ! Assume layer :math:`(l-1)` contains :math:`K^{l-1}` feature maps and :math:`M \times N` pixel positions (i.e., number of positions times number of feature maps), and there are :math:`K^l` filters at layer :math:`l` of shape :math:`m \times n`. Then computing a feature map (applying an :math:`m \times n` filter at all :math:`(M-m) \times (N-n)` pixel positions where the filter can be applied) costs :math:`(M-m) \times (N-n) \times m \times n \times K^{l-1}`. The total cost is :math:`K^l` times that. Things may be more complicated if not all features at one level are connected to all features at the previous one. For a standard MLP, the cost would only be :math:`K^l \times K^{l-1}` where there are :math:`K^l` different neurons at level :math:`l`. As such, the number of filters used in CNNs is typically much smaller than the number of hidden units in MLPs and depends on the size of the feature maps (itself a function of input image size and filter shapes). Since feature map size decreases with depth, layers near the input layer will tend to have fewer filters while layers higher up can have much more. 
In fact, to equalize computation at each layer, the product of the number of features and the number of pixel positions is typically picked to be roughly constant across layers. To preserve the information about the input would require keeping the total number of activations (number of feature maps times number of pixel positions) to be non-decreasing from one layer to the next (of course we could hope to get away with less when we are doing supervised learning). The number of feature maps directly controls capacity and so that depends on the number of available examples and the complexity of the task. Filter Shape ************ Common filter shapes found in the literature vary greatly, usually based on the dataset. Best results on MNIST-sized images (28x28) are usually in the 5x5 range on the first layer, while natural image datasets (often with hundreds of pixels in each dimension) tend to use larger first-layer filters of shape 12x12 or 15x15. The trick is thus to find the right level of "granularity" (i.e. filter shapes) in order to create abstractions at the proper scale, given a particular dataset. Max Pooling Shape ***************** Typical values are 2x2 or no max-pooling. Very large input images may warrant 4x4 pooling in the lower-layers. Keep in mind however, that this will reduce the dimension of the signal by a factor of 16, and may result in throwing away too much information. .. rubric:: Footnotes .. [#f1] For clarity, we use the word "unit" or "neuron" to refer to the artificial neuron and "cell" to refer to the biological neuron. Tips **** If you want to try this model on a new dataset, here are a few tips that can help you get better results: * Whitening the data (e.g. with PCA) * Decay the learning rate in each epoch ================================================ FILE: DeepLearningTutorials/doc/logreg.txt ================================================ .. index:: Logistic Regression .. 
_logreg : Classifying MNIST digits using Logistic Regression ================================================== .. note:: This section assumes familiarity with the following Theano concepts: `shared variables`_ , `basic arithmetic ops`_ , `T.grad`_ , `floatX`_. If you intend to run the code on GPU also read `GPU`_. .. note:: The code for this section is available for download `here`_. .. _here: http://deeplearning.net/tutorial/code/logistic_sgd.py .. _shared variables: http://deeplearning.net/software/theano/tutorial/examples.html#using-shared-variables .. _basic arithmetic ops: http://deeplearning.net/software/theano/tutorial/adding.html#adding-two-scalars .. _T.grad: http://deeplearning.net/software/theano/tutorial/examples.html#computing-gradients .. _floatX: http://deeplearning.net/software/theano/library/config.html#config.floatX .. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html In this section, we show how Theano can be used to implement the most basic classifier: the logistic regression. We start off with a quick primer of the model, which serves both as a refresher and to anchor the notation and show how mathematical expressions are mapped onto Theano graphs. In the deepest of machine learning traditions, this tutorial will tackle the exciting problem of MNIST digit classification. The Model +++++++++ Logistic regression is a probabilistic, linear classifier. It is parametrized by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is done by projecting an input vector onto a set of hyperplanes, each of which corresponds to a class. The distance from the input to a hyperplane reflects the probability that the input is a member of the corresponding class. Mathematically, the probability that an input vector :math:`x` is a member of a class :math:`i`, a value of a stochastic variable :math:`Y`, can be written as: .. 
math:: P(Y=i|x, W,b) &= softmax_i(W x + b) \\ &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}} The model's prediction :math:`y_{pred}` is the class whose probability is maximal, specifically: .. math:: y_{pred} = {\rm argmax}_i P(Y=i|x,W,b) The code to do this in Theano is the following: .. literalinclude:: ../code/logistic_sgd.py :start-after: start-snippet-1 :end-before: end-snippet-1 Since the parameters of the model must maintain a persistent state throughout training, we allocate shared variables for :math:`W,b`. This declares them both as being symbolic Theano variables, but also initializes their contents. The dot and softmax operators are then used to compute the vector :math:`P(Y|x, W,b)`. The result ``p_y_given_x`` is a symbolic variable of vector-type. To get the actual model prediction, we can use the ``T.argmax`` operator, which will return the index at which ``p_y_given_x`` is maximal (i.e. the class with maximum probability). Now of course, the model we have defined so far does not do anything useful yet, since its parameters are still in their initial state. The following section will thus cover how to learn the optimal parameters. .. note:: For a complete list of Theano ops, see: `list of ops `_ Defining a Loss Function ++++++++++++++++++++++++ Learning optimal model parameters involves minimizing a loss function. In the case of multi-class logistic regression, it is very common to use the negative log-likelihood as the loss. This is equivalent to maximizing the likelihood of the data set :math:`\cal{D}` under the model parameterized by :math:`\theta`. Let us first start by defining the likelihood :math:`\cal{L}` and loss :math:`\ell`: .. 
math:: \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ \ell (\theta=\{W,b\}, \mathcal{D}) = - \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) While entire books are dedicated to the topic of minimization, gradient descent is by far the simplest method for minimizing arbitrary non-linear functions. This tutorial will use the method of stochastic gradient descent with mini-batches (MSGD). See :ref:`opt_SGD` for more details. The following Theano code defines the (symbolic) loss for a given minibatch: .. literalinclude:: ../code/logistic_sgd.py :start-after: start-snippet-2 :end-before: end-snippet-2 .. note:: Even though the loss is formally defined as the *sum*, over the data set, of individual error terms, in practice, we use the *mean* (``T.mean``) in the code. This allows for the learning rate choice to be less dependent on the minibatch size. Creating a LogisticRegression class +++++++++++++++++++++++++++++++++++ We now have all the tools we need to define a ``LogisticRegression`` class, which encapsulates the basic behaviour of logistic regression. The code is very similar to what we have covered so far, and should be self explanatory. .. literalinclude:: ../code/logistic_sgd.py :pyobject: LogisticRegression We instantiate this class as follows: .. literalinclude:: ../code/logistic_sgd.py :start-after: index = T.lscalar() :end-before: # the cost we minimize during We start by allocating symbolic variables for the training inputs :math:`x` and their corresponding classes :math:`y`. Note that ``x`` and ``y`` are defined outside the scope of the ``LogisticRegression`` object. Since the class requires the input to build its graph, it is passed as a parameter of the ``__init__`` function. This is useful in case you want to connect instances of such classes to form a deep network. The output of one layer can be passed as the input of the layer above. 
(This tutorial does not build a multi-layer network, but this code will be reused in future tutorials that do.) Finally, we define a (symbolic) ``cost`` variable to minimize, using the instance method ``classifier.negative_log_likelihood``. .. literalinclude:: ../code/logistic_sgd.py :start-after: classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10) :end-before: # compiling a Theano function that computes the mistakes Note that ``x`` is an implicit symbolic input to the definition of ``cost``, because the symbolic variables of ``classifier`` were defined in terms of ``x`` at initialization. Learning the Model ++++++++++++++++++ To implement MSGD in most programming languages (C/C++, Matlab, Python), one would start by manually deriving the expressions for the gradient of the loss with respect to the parameters: in this case :math:`\partial{\ell}/\partial{W}`, and :math:`\partial{\ell}/\partial{b}`. This can get pretty tricky for complex models, as expressions for :math:`\partial{\ell}/\partial{\theta}` can get fairly complex, especially when taking into account problems of numerical stability. With Theano, this work is greatly simplified. It performs automatic differentiation and applies certain math transforms to improve numerical stability. To get the gradients :math:`\partial{\ell}/\partial{W}` and :math:`\partial{\ell}/\partial{b}` in Theano, simply do the following: .. literalinclude:: ../code/logistic_sgd.py :start-after: # compute the gradient of cost :end-before: # start-snippet-3 ``g_W`` and ``g_b`` are symbolic variables, which can be used as part of a computation graph. The function ``train_model``, which performs one step of gradient descent, can then be defined as follows: .. literalinclude:: ../code/logistic_sgd.py :start-after: start-snippet-3 :end-before: end-snippet-3 ``updates`` is a list of pairs. 
In each pair, the first element is the symbolic variable to be updated in the step, and the second element is the symbolic function for calculating its new value. Similarly, ``givens`` is a dictionary whose keys are symbolic variables and whose values specify their replacements during the step. The function ``train_model`` is then defined such that: * the input is the mini-batch index ``index`` that, together with the batch size (which is not an input since it is fixed) defines :math:`x` with corresponding labels :math:`y` * the return value is the cost/loss associated with the x, y defined by the ``index`` * on every function call, it will first replace ``x`` and ``y`` with the slices from the training set specified by ``index``. Then, it will evaluate the cost associated with that minibatch and apply the operations defined by the ``updates`` list. Each time ``train_model(index)`` is called, it will thus compute and return the cost of a minibatch, while also performing a step of MSGD. The entire learning algorithm thus consists in looping over all examples in the dataset, considering all the examples in one minibatch at a time, and repeatedly calling the ``train_model`` function. Testing the model +++++++++++++++++ As explained in :ref:`opt_learn_classifier`, when testing the model we are interested in the number of misclassified examples (and not only in the likelihood). The ``LogisticRegression`` class therefore has an extra instance method, which builds the symbolic graph for retrieving the number of misclassified examples in each minibatch. The code is as follows: .. literalinclude:: ../code/logistic_sgd.py :pyobject: LogisticRegression.errors We then create a function ``test_model`` and a function ``validate_model``, which we can call to retrieve this value. As you will see shortly, ``validate_model`` is key to our early-stopping implementation (see :ref:`opt_early_stopping`). 
These functions take a minibatch index and compute, for the examples in that minibatch, the number that were misclassified by the model. The only difference between them is that ``test_model`` draws its minibatches from the testing set, while ``validate_model`` draws them from the validation set. .. literalinclude:: ../code/logistic_sgd.py :start-after: cost = classifier.negative_log_likelihood(y) :end-before: # compute the gradient of cost Putting it All Together +++++++++++++++++++++++ The finished product is as follows. .. literalinclude:: ../code/logistic_sgd.py The user can learn to classify MNIST digits with SGD logistic regression, by typing, from within the DeepLearningTutorials folder: .. code-block:: bash python code/logistic_sgd.py The output one should expect is of the form : .. code-block:: bash ... epoch 72, minibatch 83/83, validation error 7.510417 % epoch 72, minibatch 83/83, test error of best model 7.510417 % epoch 73, minibatch 83/83, validation error 7.500000 % epoch 73, minibatch 83/83, test error of best model 7.489583 % Optimization complete with best validation score of 7.500000 %,with test performance 7.489583 % The code run for 74 epochs, with 1.936983 epochs/sec On an Intel(R) Core(TM)2 Duo CPU E8400 @ 3.00 GHz the code runs with approximately 1.936 epochs/sec and it took 75 epochs to reach a test error of 7.489%. On the GPU the code does almost 10.0 epochs/sec. For this instance we used a batch size of 600. .. rubric:: Footnotes .. [#f1] For smaller datasets and simpler models, more sophisticated descent algorithms can be more effective. The sample code `logistic_cg.py `_ demonstrates how to use SciPy's conjugate gradient solver with Theano on the logistic regression task. ================================================ FILE: DeepLearningTutorials/doc/lstm.txt ================================================ .. 
_lstm: LSTM Networks for Sentiment Analysis ********************************************** Summary +++++++ This tutorial aims to provide an example of how a Recurrent Neural Network (RNN) using the Long Short Term Memory (LSTM) architecture can be implemented using Theano. In this tutorial, this model is used to perform sentiment analysis on movie reviews from the `Large Movie Review Dataset `_, sometimes known as the IMDB dataset. In this task, given a movie review, the model attempts to predict whether it is positive or negative. This is a binary classification task. Data ++++ As previously mentioned, the provided scripts are used to train an LSTM recurrent neural network on the Large Movie Review Dataset. While the dataset is public, in this tutorial we provide a copy of the dataset that has previously been preprocessed according to the needs of this LSTM implementation. Running the code provided in this tutorial will automatically download the data to the local directory. Model +++++ LSTM ==== In a *traditional* recurrent neural network, during the gradient back-propagation phase, the gradient signal can end up being multiplied a large number of times (as many as the number of timesteps) by the weight matrix associated with the connections between the neurons of the recurrent hidden layer. This means that the magnitude of weights in the transition matrix can have a strong impact on the learning process. If the weights in this matrix are small (or, more formally, if the leading eigenvalue of the weight matrix is smaller than 1.0), it can lead to a situation called *vanishing gradients* where the gradient signal gets so small that learning either becomes very slow or stops working altogether. It can also make the task of learning long-term dependencies in the data more difficult. 
Conversely, if the weights in this matrix are large (or, again, more formally, if the leading eigenvalue of the weight matrix is larger than 1.0), it can lead to a situation where the gradient signal is so large that it can cause learning to diverge. This is often referred to as *exploding gradients*. These issues are the main motivation behind the LSTM model which introduces a new structure called a *memory cell* (see Figure 1 below). A memory cell is composed of four main elements: an input gate, a neuron with a self-recurrent connection (a connection to itself), a forget gate and an output gate. The self-recurrent connection has a weight of 1.0 and ensures that, barring any outside interference, the state of a memory cell can remain constant from one timestep to another. The gates serve to modulate the interactions between the memory cell itself and its environment. The input gate can allow incoming signal to alter the state of the memory cell or block it. On the other hand, the output gate can allow the state of the memory cell to have an effect on other neurons or prevent it. Finally, the forget gate can modulate the memory cell’s self-recurrent connection, allowing the cell to remember or forget its previous state, as needed. .. figure:: images/lstm_memorycell.png :align: center **Figure 1** : Illustration of an LSTM memory cell. The equations below describe how a layer of memory cells is updated at every timestep :math:`t`. In these equations : * :math:`x_t` is the input to the memory cell layer at time :math:`t` * :math:`W_i`, :math:`W_f`, :math:`W_c`, :math:`W_o`, :math:`U_i`, :math:`U_f`, :math:`U_c`, :math:`U_o` and :math:`V_o` are weight matrices * :math:`b_i`, :math:`b_f`, :math:`b_c` and :math:`b_o` are bias vectors First, we compute the values for :math:`i_t`, the input gate, and :math:`\widetilde{C_t}` the candidate value for the states of the memory cells at time :math:`t` : .. math:: :label: 1 i_t = \sigma(W_i x_t + U_i h_{t-1} + b_i) .. 
math:: :label: 2 \widetilde{C_t} = tanh(W_c x_t + U_c h_{t-1} + b_c) Second, we compute the value for :math:`f_t`, the activation of the memory cells' forget gates at time :math:`t` : .. math:: :label: 3 f_t = \sigma(W_f x_t + U_f h_{t-1} + b_f) Given the value of the input gate activation :math:`i_t`, the forget gate activation :math:`f_t` and the candidate state value :math:`\widetilde{C_t}`, we can compute :math:`C_t` the memory cells' new state at time :math:`t` : .. math:: :label: 4 C_t = i_t * \widetilde{C_t} + f_t * C_{t-1} With the new state of the memory cells, we can compute the value of their output gates and, subsequently, their outputs : .. math:: :label: 5 o_t = \sigma(W_o x_t + U_o h_{t-1} + V_o C_t + b_o) .. math:: :label: 6 h_t = o_t * tanh(C_t) Our model ========= The model we used in this tutorial is a variation of the standard LSTM model. In this variant, the activation of a cell’s output gate does not depend on the memory cell’s state :math:`C_t`. This allows us to perform part of the computation more efficiently (see the implementation note, below, for details). This means that, in the variant we have implemented, there is no matrix :math:`V_o` and equation :eq:`5` is replaced by equation :eq:`5-alt` : .. math:: :label: 5-alt o_t = \sigma(W_o x_t + U_o h_{t-1} + b_o) Our model is composed of a single LSTM layer followed by an average pooling and a logistic regression layer as illustrated in Figure 2 below. Thus, from an input sequence :math:`x_0, x_1, x_2, ..., x_n`, the memory cells in the LSTM layer will produce a representation sequence :math:`h_0, h_1, h_2, ..., h_n`. This representation sequence is then averaged over all timesteps resulting in representation h. Finally, this representation is fed to a logistic regression layer whose target is the class label associated with the input sequence. .. figure:: images/lstm.png :align: center **Figure 2** : Illustration of the model used in this tutorial. 
It is composed of a single LSTM layer followed by mean pooling over time and logistic regression. **Implementation note** : In the code included in this tutorial, the equations :eq:`1`, :eq:`2`, :eq:`3` and :eq:`5-alt` are performed in parallel to make the computation more efficient. This is possible because none of these equations rely on a result produced by the other ones. It is achieved by concatenating the four matrices :math:`W_*` into a single weight matrix :math:`W` and performing the same concatenation on the weight matrices :math:`U_*` to produce the matrix :math:`U` and the bias vectors :math:`b_*` to produce the vector :math:`b`. Then, the pre-nonlinearity activations can be computed with : .. math:: z = W x_t + U h_{t-1} + b The result is then sliced to obtain the pre-nonlinearity activations for :math:`i`, :math:`f`, :math:`\widetilde{C_t}`, and :math:`o` and the non-linearities are then applied independently for each. Code - Citations - Contact ++++++++++++++++++++++++++ Code ==== The LSTM implementation can be found in the two following files : * `lstm.py `_ : Main script. Defines and trains the model. * `imdb.py `_ : Secondary script. Handles the loading and preprocessing of the IMDB dataset. After downloading both scripts and putting both in the same folder, the user can run the code by calling: .. code-block:: bash THEANO_FLAGS="floatX=float32" python lstm.py The script will automatically download the data and decompress it. **Note** : The provided code supports the Stochastic Gradient Descent (SGD), AdaDelta and RMSProp optimization methods. You are advised to use AdaDelta or RMSProp because SGD appears to perform poorly on this task with this particular model. Papers ====== If you use this tutorial, please cite the following papers. Introduction of the LSTM model: * `[pdf] `_ Hochreiter, S., & Schmidhuber, J. (1997). Long short-term memory. Neural computation, 9(8), 1735-1780. 
Addition of the forget gate to the LSTM model: * `[pdf] `_ Gers, F. A., Schmidhuber, J., & Cummins, F. (2000). Learning to forget: Continual prediction with LSTM. Neural computation, 12(10), 2451-2471. More recent LSTM paper: * `[pdf] `_ Graves, Alex. Supervised sequence labelling with recurrent neural networks. Vol. 385. Springer, 2012. Papers related to Theano: * `[pdf] `_ Bastien, Frédéric, Lamblin, Pascal, Pascanu, Razvan, Bergstra, James, Goodfellow, Ian, Bergeron, Arnaud, Bouchard, Nicolas, and Bengio, Yoshua. Theano: new features and speed improvements. NIPS Workshop on Deep Learning and Unsupervised Feature Learning, 2012. * `[pdf] `_ Bergstra, James, Breuleux, Olivier, Bastien, Frédéric, Lamblin, Pascal, Pascanu, Razvan, Desjardins, Guillaume, Turian, Joseph, Warde-Farley, David, and Bengio, Yoshua. Theano: a CPU and GPU math expression compiler. In Proceedings of the Python for Scientific Computing Conference (SciPy), June 2010. Thank you! Contact ======= Please email `Kyunghyun Cho `_ for any problem report or feedback. We will be glad to hear from you. References ++++++++++ * Hochreiter, S., & Schmidhuber, J. (1997). Long short-term memory. Neural computation, 9(8), 1735-1780. * Gers, F. A., Schmidhuber, J., & Cummins, F. (2000). Learning to forget: Continual prediction with LSTM. Neural computation, 12(10), 2451-2471. * Graves, A. (2012). Supervised sequence labelling with recurrent neural networks (Vol. 385). Springer. * Hochreiter, S., Bengio, Y., Frasconi, P., & Schmidhuber, J. (2001). Gradient flow in recurrent nets: the difficulty of learning long-term dependencies. * Bengio, Y., Simard, P., & Frasconi, P. (1994). Learning long-term dependencies with gradient descent is difficult. Neural Networks, IEEE Transactions on, 5(2), 157-166. * Maas, A. L., Daly, R. E., Pham, P. T., Huang, D., Ng, A. Y., & Potts, C. (2011, June). Learning word vectors for sentiment analysis. 
In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies-Volume 1 (pp. 142-150). Association for Computational Linguistics. ================================================ FILE: DeepLearningTutorials/doc/mlp.txt ================================================ .. index:: Multilayer Perceptron .. _mlp: Multilayer Perceptron ===================== .. note:: This section assumes the reader has already read through :doc:`logreg`. Additionally, it uses the following new Theano functions and concepts: `T.tanh`_, `shared variables`_, `basic arithmetic ops`_, `T.grad`_, :ref:`L1_L2_regularization`, `floatX`_. If you intend to run the code on GPU also read `GPU`_. .. note:: The code for this section is available for download `here`_. .. _here: http://deeplearning.net/tutorial/code/mlp.py .. _T.tanh: http://deeplearning.net/software/theano/tutorial/examples.html?highlight=tanh .. _shared variables: http://deeplearning.net/software/theano/tutorial/examples.html#using-shared-variables .. _basic arithmetic ops: http://deeplearning.net/software/theano/tutorial/adding.html#adding-two-scalars .. _T.grad: http://deeplearning.net/software/theano/tutorial/examples.html#computing-gradients .. _floatX: http://deeplearning.net/software/theano/library/config.html#config.floatX .. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html The next architecture we are going to present using Theano is the single-hidden-layer Multi-Layer Perceptron (MLP). An MLP can be viewed as a logistic regression classifier where the input is first transformed using a learnt non-linear transformation :math:`\Phi`. This transformation projects the input data into a space where it becomes linearly separable. This intermediate layer is referred to as a **hidden layer**. A single hidden layer is sufficient to make MLPs a **universal approximator**. 
However we will see later on that there are substantial benefits to using many such hidden layers, i.e. the very premise of **deep learning**. See these course notes for an `introduction to MLPs, the back-propagation algorithm, and how to train MLPs `_. This tutorial will again tackle the problem of MNIST digit classification. The Model +++++++++ An MLP (or Artificial Neural Network - ANN) with a single hidden layer can be represented graphically as follows: .. figure:: images/mlp.png :align: center Formally, a one-hidden-layer MLP is a function :math:`f: R^D \rightarrow R^L`, where :math:`D` is the size of input vector :math:`x` and :math:`L` is the size of the output vector :math:`f(x)`, such that, in matrix notation: .. math:: f(x) = G( b^{(2)} + W^{(2)}( s( b^{(1)} + W^{(1)} x))), with bias vectors :math:`b^{(1)}`, :math:`b^{(2)}`; weight matrices :math:`W^{(1)}`, :math:`W^{(2)}` and activation functions :math:`G` and :math:`s`. The vector :math:`h(x) = \Phi(x) = s(b^{(1)} + W^{(1)} x)` constitutes the hidden layer. :math:`W^{(1)} \in R^{D \times D_h}` is the weight matrix connecting the input vector to the hidden layer. Each column :math:`W^{(1)}_{\cdot i}` represents the weights from the input units to the i-th hidden unit. Typical choices for :math:`s` include :math:`tanh`, with :math:`tanh(a)=(e^a-e^{-a})/(e^a+e^{-a})`, or the logistic :math:`sigmoid` function, with :math:`sigmoid(a)=1/(1+e^{-a})`. We will be using :math:`tanh` in this tutorial because it typically leads to faster training (and sometimes also to better local minima). Both the :math:`tanh` and :math:`sigmoid` are scalar-to-scalar functions but their natural extension to vectors and tensors consists in applying them element-wise (i.e. separately on each element of the vector, yielding a same-size vector). The output vector is then obtained as: :math:`o(x) = G(b^{(2)} + W^{(2)} h(x))`. The reader should recognize the form we already used for :doc:`logreg`. 
As before, class-membership probabilities can be obtained by choosing :math:`G` as the :math:`softmax` function (in the case of multi-class classification). To train an MLP, we learn **all** parameters of the model, and here we use :ref:`opt_SGD` with minibatches. The set of parameters to learn is the set :math:`\theta = \{W^{(2)},b^{(2)},W^{(1)},b^{(1)}\}`. Obtaining the gradients :math:`\partial{\ell}/\partial{\theta}` can be achieved through the **backpropagation algorithm** (a special case of the chain-rule of derivation). Thankfully, since Theano performs automatic differentiation, we will not need to cover this in the tutorial ! Going from logistic regression to MLP +++++++++++++++++++++++++++++++++++++ This tutorial will focus on a single-hidden-layer MLP. We start off by implementing a class that will represent a hidden layer. To construct the MLP we will then only need to throw a logistic regression layer on top. .. literalinclude:: ../code/mlp.py :start-after: start-snippet-1 :end-before: end-snippet-1 The initial values for the weights of a hidden layer :math:`i` should be uniformly sampled from a symmetric interval that depends on the activation function. For :math:`tanh` activation function results obtained in [Xavier10]_ show that the interval should be :math:`[-\sqrt{\frac{6}{fan_{in}+fan_{out}}},\sqrt{\frac{6}{fan_{in}+fan_{out}}}]`, where :math:`fan_{in}` is the number of units in the :math:`(i-1)`-th layer, and :math:`fan_{out}` is the number of units in the :math:`i`-th layer. For the sigmoid function the interval is :math:`[-4\sqrt{\frac{6}{fan_{in}+fan_{out}}},4\sqrt{\frac{6}{fan_{in}+fan_{out}}}]`. This initialization ensures that, early in training, each neuron operates in a regime of its activation function where information can easily be propagated both upward (activations flowing from inputs to outputs) and backward (gradients flowing from outputs to inputs). .. 
literalinclude:: ../code/mlp.py :start-after: end-snippet-1 :end-before: lin_output = T.dot(input, self.W) + self.b Note that we used a given non-linear function as the activation function of the hidden layer. By default this is ``tanh``, but in many cases we might want to use something else. .. literalinclude:: ../code/mlp.py :start-after: self.b = b :end-before: # parameters of the model In theoretical terms, this class implements the graph that computes the hidden layer value :math:`h(x) = \Phi(x) = s(b^{(1)} + W^{(1)} x)`. If you give this graph as input to the ``LogisticRegression`` class, implemented in the previous tutorial :doc:`logreg`, you get the output of the MLP. You can see this in the following short implementation of the ``MLP`` class. .. literalinclude:: ../code/mlp.py :start-after: start-snippet-2 :end-before: end-snippet-2 In this tutorial we will also use L1 and L2 regularization (see :ref:`L1_L2_regularization`). For this, we need to compute the L1 norm and the squared L2 norm of the weights :math:`W^{(1)}, W^{(2)}`. .. literalinclude:: ../code/mlp.py :start-after: start-snippet-3 :end-before: end-snippet-3 As before, we train this model using stochastic gradient descent with mini-batches. The difference is that we modify the cost function to include the regularization term. ``L1_reg`` and ``L2_reg`` are the hyperparameters controlling the weight of these regularization terms in the total cost function. The code that computes the new cost is: .. literalinclude:: ../code/mlp.py :start-after: start-snippet-4 :end-before: end-snippet-4 We then update the parameters of the model using the gradient. This code is almost identical to the one for logistic regression. Only the number of parameters differs. To get around this (and write code that could work for any number of parameters) we will use the list of parameters that we created with the model ``params`` and parse it, computing a gradient at each step. .. 
literalinclude:: ../code/mlp.py :start-after: start-snippet-5 :end-before: end-snippet-5 Putting it All Together +++++++++++++++++++++++ Having covered the basic concepts, writing an MLP class becomes quite easy. The code below shows how this can be done, in a way which is analogous to our previous logistic regression implementation. .. literalinclude:: ../code/mlp.py The user can then run the code by calling : .. code-block:: bash python code/mlp.py The output one should expect is of the form : .. code-block:: bash Optimization complete. Best validation score of 1.690000 % obtained at iteration 2070000, with test performance 1.650000 % The code for file mlp.py ran for 97.34m On an Intel(R) Core(TM) i7-2600K CPU @ 3.40GHz the code runs with approximately 10.3 epoch/minute and it took 828 epochs to reach a test error of 1.65%. To put this into perspective, we refer the reader to the results section of `this `_ page. Tips and Tricks for training MLPs +++++++++++++++++++++++++++++++++ There are several hyper-parameters in the above code, which are not (and, generally speaking, cannot be) optimized by gradient descent. Strictly speaking, finding an optimal set of values for these hyper-parameters is not a feasible problem. First, we can't simply optimize each of them independently. Second, we cannot readily apply gradient techniques that we described previously (partly because some parameters are discrete values and others are real-valued). Third, the optimization problem is not convex and finding a (local) minimum would involve a non-trivial amount of work. The good news is that over the last 25 years, researchers have devised various rules of thumb for choosing hyper-parameters in a neural network. A very good overview of these tricks can be found in `Efficient BackProp `_ by Yann LeCun, Leon Bottou, Genevieve Orr, and Klaus-Robert Mueller. In here, we summarize the same issues, with an emphasis on the parameters and techniques that we actually used in our code. 
Nonlinearity -------------- Two of the most common ones are the :math:`sigmoid` and the :math:`tanh` function. For reasons explained in `Section 4.4 `_, nonlinearities that are symmetric around the origin are preferred because they tend to produce zero-mean inputs to the next layer (which is a desirable property). Empirically, we have observed that the :math:`tanh` has better convergence properties. Weight initialization --------------------- At initialization we want the weights to be small enough around the origin so that the activation function operates in its linear regime, where gradients are the largest. Other desirable properties, especially for deep networks, are to conserve variance of the activation as well as variance of back-propagated gradients from layer to layer. This allows information to flow well upward and downward in the network and reduces discrepancies between layers. Under some assumptions, a compromise between these two constraints leads to the following initialization: :math:`uniform[-\frac{\sqrt{6}}{\sqrt{fan_{in}+fan_{out}}},\frac{\sqrt{6}}{\sqrt{fan_{in}+fan_{out}}}]` for tanh and :math:`uniform[-4*\frac{\sqrt{6}}{\sqrt{fan_{in}+fan_{out}}},4*\frac{\sqrt{6}}{\sqrt{fan_{in}+fan_{out}}}]` for sigmoid. Where :math:`fan_{in}` is the number of inputs and :math:`fan_{out}` the number of hidden units. For mathematical considerations please refer to [Xavier10]_. Learning rate -------------- There is a great deal of literature on choosing a good learning rate. The simplest solution is to simply have a constant rate. Rule of thumb: try several log-spaced values (:math:`10^{-1},10^{-2},\ldots`) and narrow the (logarithmic) grid search to the region where you obtain the lowest validation error. Decreasing the learning rate over time is sometimes a good idea. 
One simple rule for doing that is :math:`\frac{\mu_0}{1 + d\times t}` where :math:`\mu_0` is the initial rate (chosen, perhaps, using the grid search technique explained above), :math:`d` is a so-called "decrease constant" which controls the rate at which the learning rate decreases (typically, a small positive number, :math:`10^{-3}` and smaller) and :math:`t` is the epoch/stage. `Section 4.7 `_ details procedures for choosing a learning rate for each parameter (weight) in our network and for choosing them adaptively based on the error of the classifier. Number of hidden units ----------------------- This hyper-parameter is very much dataset-dependent. Vaguely speaking, the more complicated the input distribution is, the more capacity the network will require to model it, and so the larger the number of hidden units that will be needed (note that the number of weights in a layer, perhaps a more direct measure of capacity, is :math:`D\times D_h`; recall that :math:`D` is the number of inputs and :math:`D_h` is the number of hidden units). Unless we employ some regularization scheme (early stopping or L1/L2 penalties), a typical number of hidden units vs. generalization performance graph will be U-shaped. Regularization parameter ------------------------ Typical values to try for the L1/L2 regularization parameter :math:`\lambda` are :math:`10^{-2},10^{-3},\ldots`. In the framework that we described so far, optimizing this parameter will not lead to significantly better solutions, but is worth exploring nonetheless. ================================================ FILE: DeepLearningTutorials/doc/rbm.txt ================================================ .. _RBM: Restricted Boltzmann Machines (RBM) =================================== .. note:: This section assumes the reader has already read through :doc:`logreg` and :doc:`mlp`.
Additionally it uses the following Theano functions and concepts : `T.tanh`_, `shared variables`_, `basic arithmetic ops`_, `T.grad`_, `Random numbers`_, `floatX`_ and `scan`_. If you intend to run the code on GPU also read `GPU`_. .. _T.tanh: http://deeplearning.net/software/theano/tutorial/examples.html?highlight=tanh .. _shared variables: http://deeplearning.net/software/theano/tutorial/examples.html#using-shared-variables .. _basic arithmetic ops: http://deeplearning.net/software/theano/tutorial/adding.html#adding-two-scalars .. _T.grad: http://deeplearning.net/software/theano/tutorial/examples.html#computing-gradients .. _floatX: http://deeplearning.net/software/theano/library/config.html#config.floatX .. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html .. _Random numbers: http://deeplearning.net/software/theano/tutorial/examples.html#using-random-numbers .. _scan: http://deeplearning.net/software/theano/library/scan.html .. note:: The code for this section is available for download `here `_. Energy-Based Models (EBM) +++++++++++++++++++++++++ **Energy-based** models associate a scalar energy to each configuration of the variables of interest. Learning corresponds to modifying that energy function so that its shape has desirable properties. For example, we would like plausible or desirable configurations to have low energy. Energy-based probabilistic models define a probability distribution through an energy function, as follows: .. math:: :label: energy1 p(x) = \frac {e^{-E(x)}} {Z}. The normalizing factor :math:`Z` is called the **partition function** by analogy with physical systems. .. math:: Z = \sum_x e^{-E(x)} An energy-based model can be learnt by performing (stochastic) gradient descent on the empirical negative log-likelihood of the training data. As for the logistic regression we will first define the log-likelihood and then the loss function as being the negative log-likelihood. .. 
math:: \mathcal{L}(\theta, \mathcal{D}) = \frac{1}{N} \sum_{x^{(i)} \in \mathcal{D}} \log\ p(x^{(i)})\\ \ell (\theta, \mathcal{D}) = - \mathcal{L} (\theta, \mathcal{D}) using the stochastic gradient :math:`-\frac{\partial \log p(x^{(i)})}{\partial \theta}`, where :math:`\theta` are the parameters of the model. **EBMs with Hidden Units** In many cases of interest, we do not observe the example :math:`x` fully, or we want to introduce some non-observed variables to increase the expressive power of the model. So we consider an observed part (still denoted :math:`x` here) and a **hidden** part :math:`h`. We can then write: .. math:: :label: energy2 P(x) = \sum_h P(x,h) = \sum_h \frac{e^{-E(x,h)}}{Z}. In such cases, to map this formulation to one similar to Eq. :eq:`energy1`, we introduce the notation (inspired from physics) of **free energy**, defined as follows: .. math:: :label: free_energy \mathcal{F}(x) = - \log \sum_h e^{-E(x,h)} which allows us to write, .. math:: &P(x) = \frac{e^{-\mathcal{F}(x)}}{Z} \text{ with } Z=\sum_x e^{-\mathcal{F}(x)}. The data negative log-likelihood gradient then has a particularly interesting form. .. math:: :label: free_energy_grad - \frac{\partial \log p(x)}{\partial \theta} &= \frac{\partial \mathcal{F}(x)}{\partial \theta} - \sum_{\tilde{x}} p(\tilde{x}) \ \frac{\partial \mathcal{F}(\tilde{x})}{\partial \theta}. Notice that the above gradient contains two terms, which are referred to as the **positive** and **negative phase**. The terms positive and negative do not refer to the sign of each term in the equation, but rather reflect their effect on the probability density defined by the model. The first term increases the probability of training data (by reducing the corresponding free energy), while the second term decreases the probability of samples generated by the model. 
It is usually difficult to determine this gradient analytically, as it involves the computation of :math:`E_P [ \frac{\partial \mathcal{F}(x)} {\partial \theta} ]`. This is nothing less than an expectation over all possible configurations of the input :math:`x` (under the distribution :math:`P` formed by the model) ! The first step in making this computation tractable is to estimate the expectation using a fixed number of model samples. Samples used to estimate the negative phase gradient are referred to as **negative particles**, which are denoted as :math:`\mathcal{N}`. The gradient can then be written as: .. math:: :label: bm_grad - \frac{\partial \log p(x)}{\partial \theta} &\approx \frac{\partial \mathcal{F}(x)}{\partial \theta} - \frac{1}{|\mathcal{N}|}\sum_{\tilde{x} \in \mathcal{N}} \ \frac{\partial \mathcal{F}(\tilde{x})}{\partial \theta}. where we would ideally like elements :math:`\tilde{x}` of :math:`\mathcal{N}` to be sampled according to :math:`P` (i.e. we are doing Monte-Carlo). With the above formula, we almost have a practical, stochastic algorithm for learning an EBM. The only missing ingredient is how to extract these negative particles :math:`\mathcal{N}`. While the statistical literature abounds with sampling methods, Markov Chain Monte Carlo methods are especially well suited for models such as the Restricted Boltzmann Machines (RBM), a specific type of EBM. Restricted Boltzmann Machines (RBM) +++++++++++++++++++++++++++++++++++ Boltzmann Machines (BMs) are a particular form of log-linear Markov Random Field (MRF), i.e., for which the energy function is linear in its free parameters. To make them powerful enough to represent complicated distributions (i.e., go from the limited parametric setting to a non-parametric one), we consider that some of the variables are never observed (they are called hidden). By having more hidden variables (also called hidden units), we can increase the modeling capacity of the Boltzmann Machine (BM).
Restricted Boltzmann Machines further restrict BMs to those without visible-visible and hidden-hidden connections. A graphical depiction of an RBM is shown below. .. image:: images/rbm.png :align: center The energy function :math:`E(v,h)` of an RBM is defined as: .. math:: :label: rbm_energy E(v,h) = - b'v - c'h - h'Wv where :math:`W` represents the weights connecting hidden and visible units and :math:`b`, :math:`c` are the offsets of the visible and hidden layers respectively. This translates directly to the following free energy formula: .. math:: \mathcal{F}(v)= - b'v - \sum_i \log \sum_{h_i} e^{h_i (c_i + W_i v)}. Because of the specific structure of RBMs, visible and hidden units are conditionally independent given one-another. Using this property, we can write: .. math:: p(h|v) &= \prod_i p(h_i|v) \\ p(v|h) &= \prod_j p(v_j|h). **RBMs with binary units** In the commonly studied case of using binary units (where :math:`v_j` and :math:`h_i \in \{0,1\}`), we obtain from Eq. :eq:`rbm_energy` and :eq:`energy2`, a probabilistic version of the usual neuron activation function: .. math:: :label: rbm_propup P(h_i=1|v) = sigm(c_i + W_i v) \\ .. math:: :label: rbm_propdown P(v_j=1|h) = sigm(b_j + W'_j h) The free energy of an RBM with binary units further simplifies to: .. math:: :label: rbm_free_energy \mathcal{F}(v)= - b'v - \sum_i \log(1 + e^{(c_i + W_i v)}). **Update Equations with Binary Units** Combining Eqs. :eq:`bm_grad` with :eq:`rbm_free_energy`, we obtain the following log-likelihood gradients for an RBM with binary units: .. 
math:: :label: rbm_grad - \frac{\partial{ \log p(v)}}{\partial W_{ij}} &= E_v[p(h_i|v) \cdot v_j] - v^{(i)}_j \cdot sigm(W_i \cdot v^{(i)} + c_i) \\ -\frac{\partial{ \log p(v)}}{\partial c_i} &= E_v[p(h_i|v)] - sigm(W_i \cdot v^{(i)} + c_i) \\ -\frac{\partial{ \log p(v)}}{\partial b_j} &= E_v[p(v_j|h)] - v^{(i)}_j For a more detailed derivation of these equations, we refer the reader to the following `page `_, or to section 5 of `Learning Deep Architectures for AI `_. We will however not use these formulas, but rather get the gradient using Theano `T.grad`_ from equation :eq:`free_energy_grad`. Sampling in an RBM ++++++++++++++++++ Samples of :math:`p(x)` can be obtained by running a Markov chain to convergence, using Gibbs sampling as the transition operator. Gibbs sampling of the joint of N random variables :math:`S=(S_1, ... , S_N)` is done through a sequence of N sampling sub-steps of the form :math:`S_i \sim p(S_i | S_{-i})` where :math:`S_{-i}` contains the :math:`N-1` other random variables in :math:`S` excluding :math:`S_i`. For RBMs, :math:`S` consists of the set of visible and hidden units. However, since they are conditionally independent, one can perform block Gibbs sampling. In this setting, visible units are sampled simultaneously given fixed values of the hidden units. Similarly, hidden units are sampled simultaneously given the visibles. A step in the Markov chain is thus taken as follows: .. math:: h^{(n+1)} &\sim sigm(W'v^{(n)} + c) \\ v^{(n+1)} &\sim sigm(W h^{(n+1)} + b), where :math:`h^{(n)}` refers to the set of all hidden units at the n-th step of the Markov chain. What it means is that, for example, :math:`h^{(n+1)}_i` is randomly chosen to be 1 (versus 0) with probability :math:`sigm(W_i'v^{(n)} + c_i)`, and similarly, :math:`v^{(n+1)}_j` is randomly chosen to be 1 (versus 0) with probability :math:`sigm(W_{.j} h^{(n+1)} + b_j)`. This can be illustrated graphically: ..
image:: images/markov_chain.png :align: center As :math:`t \rightarrow \infty`, samples :math:`(v^{(t)}, h^{(t)})` are guaranteed to be accurate samples of :math:`p(v,h)`. In theory, each parameter update in the learning process would require running one such chain to convergence. It is needless to say that doing so would be prohibitively expensive. As such, several algorithms have been devised for RBMs, in order to efficiently sample from :math:`p(v,h)` during the learning process. Contrastive Divergence (CD-k) ----------------------------- Contrastive Divergence uses two tricks to speed up the sampling process: * since we eventually want :math:`p(v) \approx p_{train}(v)` (the true, underlying distribution of the data), we initialize the Markov chain with a training example (i.e., from a distribution that is expected to be close to :math:`p`, so that the chain will be already close to having converged to its final distribution :math:`p`). * CD does not wait for the chain to converge. Samples are obtained after only k-steps of Gibbs sampling. In practice, :math:`k=1` has been shown to work surprisingly well. Persistent CD ------------- Persistent CD [Tieleman08]_ uses another approximation for sampling from :math:`p(v,h)`. It relies on a single Markov chain, which has a persistent state (i.e., not restarting a chain for each observed example). For each parameter update, we extract new samples by simply running the chain for k-steps. The state of the chain is then preserved for subsequent updates. The general intuition is that if parameter updates are small enough compared to the mixing rate of the chain, the Markov chain should be able to "catch up" to changes in the model. Implementation ++++++++++++++ We construct an ``RBM`` class. The parameters of the network can either be initialized by the constructor or can be passed as arguments.
This option is useful when an RBM is used as the building block of a deep network, in which case the weight matrix and the hidden layer bias are shared with the corresponding sigmoidal layer of an MLP network. .. literalinclude:: ../code/rbm.py :start-after: start-snippet-1 :end-before: end-snippet-1 Next step is to define functions which construct the symbolic graph associated with Eqs. :eq:`rbm_propup` - :eq:`rbm_propdown`. The code is as follows: .. literalinclude:: ../code/rbm.py :pyobject: RBM.propup .. literalinclude:: ../code/rbm.py :pyobject: RBM.sample_h_given_v .. literalinclude:: ../code/rbm.py :pyobject: RBM.propdown .. literalinclude:: ../code/rbm.py :pyobject: RBM.sample_v_given_h We can then use these functions to define the symbolic graph for a Gibbs sampling step. We define two functions: * ``gibbs_vhv`` which performs a step of Gibbs sampling starting from the visible units. As we shall see, this will be useful for sampling from the RBM. * ``gibbs_hvh`` which performs a step of Gibbs sampling starting from the hidden units. This function will be useful for performing CD and PCD updates. The code is as follows: .. literalinclude:: ../code/rbm.py :pyobject: RBM.gibbs_hvh .. literalinclude:: ../code/rbm.py :pyobject: RBM.gibbs_vhv Note that we also return the pre-sigmoid activation. To understand why this is so you need to understand a bit about how Theano works. Whenever you compile a Theano function, the computational graph that you pass as input gets optimized for speed and stability. This is done by changing several parts of the subgraphs with others. One such optimization expresses terms of the form log(sigmoid(x)) in terms of softplus. We need this optimization for the cross-entropy since sigmoid of numbers larger than 30. (or even less than that) turn to 1. and numbers smaller than -30. turn to 0 which in turn will force theano to compute log(0) and therefore we will get either -inf or NaN as cost.
If the value is expressed in terms of softplus we do not get this undesirable behaviour. This optimization usually works fine, but here we have a special case. The sigmoid is applied inside the scan op, while the log is outside. Therefore Theano will only see log(scan(..)) instead of log(sigmoid(..)) and will not apply the wanted optimization. Nor can we simply replace the sigmoid inside scan with something else, because this only needs to be done on the last step. Therefore the easiest and most efficient way is to also return the pre-sigmoid activation as an output of scan, and apply both the log and sigmoid outside scan such that Theano can catch and optimize the expression. The class also has a function that computes the free energy of the model, needed for computing the gradient of the parameters (see Eq. :eq:`free_energy_grad`). .. literalinclude:: ../code/rbm.py :pyobject: RBM.free_energy We then add a ``get_cost_updates`` method, whose purpose is to generate the symbolic gradients for CD-k and PCD-k updates. .. literalinclude:: ../code/rbm.py :start-after: start-snippet-2 :end-before: end-snippet-2 Note that ``get_cost_updates`` takes as argument a variable called ``persistent``. This allows us to use the same code to implement both CD and PCD. To use PCD, ``persistent`` should refer to a shared variable which contains the state of the Gibbs chain from the previous iteration. If ``persistent`` is ``None``, we initialize the Gibbs chain with the hidden sample generated during the positive phase, therefore implementing CD. Once we have established the starting point of the chain, we can then compute the sample at the end of the Gibbs chain, sample that we need for getting the gradient (see Eq. :eq:`free_energy_grad`). To do so, we will use the ``scan`` op provided by Theano, therefore we urge the reader to look it up by following this `link `_. ..
literalinclude:: ../code/rbm.py :start-after: end-snippet-2 :end-before: start-snippet-3 Once we have generated the chain we take the sample at the end of the chain to get the free energy of the negative phase. Note that the ``chain_end`` is a symbolic Theano variable expressed in terms of the model parameters, and if we would apply ``T.grad`` naively, the function will try to go through the Gibbs chain to get the gradients. This is not what we want (it will mess up our gradients) and therefore we need to indicate to ``T.grad`` that ``chain_end`` is a constant. We do this by using the argument ``consider_constant`` of ``T.grad``. .. literalinclude:: ../code/rbm.py :start-after: start-snippet-3 :end-before: end-snippet-3 Finally, we extend the updates dictionary returned by scan (which contains update rules for the random states of ``theano_rng``) with the parameter updates. In the case of PCD, these should also update the shared variable containing the state of the Gibbs chain. .. literalinclude:: ../code/rbm.py :start-after: start-snippet-4 :end-before: end-snippet-4 Tracking Progress ----------------- RBMs are particularly tricky to train. Because of the partition function :math:`Z` of Eq. :eq:`energy1`, we cannot estimate the log-likelihood :math:`\log(P(x))` during training. We therefore have no direct useful metric for choosing the optimal hyperparameters. Several options are available to the user. **Inspection of Negative Samples** Negative samples obtained during training can be visualized. As training progresses, we know that the model defined by the RBM becomes closer to the true underlying distribution, :math:`p_{train}(x)`. Negative samples should thus look like samples from the training set. Obviously bad hyperparameters can be discarded in this fashion. **Visual Inspection of Filters** The filters learnt by the model can be visualized. This amounts to plotting the weights of each unit as a gray-scale image (after reshaping to a square matrix).
Filters should pick out strong features in the data. While it is not clear for an arbitrary dataset, what these features should look like, training on MNIST usually results in filters which act as stroke detectors, while training on natural images leads to Gabor-like filters if trained in conjunction with a sparsity criterion. **Proxies to Likelihood** Other, more tractable functions can be used as a proxy to the likelihood. When training an RBM with PCD, one can use pseudo-likelihood as the proxy. Pseudo-likelihood (PL) is much less expensive to compute, as it assumes that all bits are independent. Therefore, .. math:: PL(x) = \prod_i P(x_i | x_{-i}) \text{ and }\\ \log PL(x) = \sum_i \log P(x_i | x_{-i}) Here :math:`x_{-i}` denotes the set of all bits of :math:`x` except bit :math:`i`. The log-PL is therefore the sum of the log-probabilities of each bit :math:`x_i`, conditioned on the state of all other bits. For MNIST, this would involve summing over the 784 input dimensions, which remains rather expensive. For this reason, we use the following stochastic approximation to log-PL: .. math:: g = N \cdot \log P(x_i | x_{-i}) \text{, where } i \sim U(0,N) \text{, and}\\ E[ g ] = \log PL(x) where the expectation is taken over the uniform random choice of index :math:`i`, and :math:`N` is the number of visible units. In order to work with binary units, we further introduce the notation :math:`\tilde{x}_i` to refer to :math:`x` with bit-i being flipped (1->0, 0->1). The log-PL for an RBM with binary units is then written as: .. math:: \log PL(x) &\approx N \cdot \log \frac {e^{-FE(x)}} {e^{-FE(x)} + e^{-FE(\tilde{x}_i)}} \\ &\approx N \cdot \log[ sigm (FE(\tilde{x}_i) - FE(x)) ] We therefore return this cost as well as the RBM updates in the ``get_cost_updates`` function of the ``RBM`` class. Notice that we modify the updates dictionary to increment the index of bit :math:`i`.
This will result in bit :math:`i` cycling over all possible values :math:`\{0,1,...,N-1\}`, from one update to another. Note that for CD training the cross-entropy cost between the input and the reconstruction (the same as the one used for the de-noising autoencoder) is more reliable than the pseudo-loglikelihood. Here is the code we use to compute the pseudo-likelihood: .. literalinclude:: ../code/rbm.py :pyobject: RBM.get_pseudo_likelihood_cost Main Loop --------- We now have all the necessary ingredients to start training our network. Before going over the training loop however, the reader should familiarize themselves with the function ``tile_raster_images`` (see :ref:`how-to-plot`). Since RBMs are generative models, we are interested in sampling from them and plotting/visualizing these samples. We also want to visualize the filters (weights) learnt by the RBM, to gain insights into what the RBM is actually doing. Bear in mind however, that this does not provide the entire story, since we neglect the biases and plot the weights up to a multiplicative constant (weights are converted to values between 0 and 1). Having these utility functions, we can start training the RBM and plot/save the filters after each training epoch. We train the RBM using PCD, as it has been shown to lead to a better generative model ([Tieleman08]_). .. literalinclude:: ../code/rbm.py :start-after: start-snippet-5 :end-before: end-snippet-5 Once the RBM is trained, we can then use the ``gibbs_vhv`` function to implement the Gibbs chain required for sampling. We initialize the Gibbs chain starting from test examples (although we could as well pick it from the training set) in order to speed up convergence and avoid problems with random initialization. We again use Theano's ``scan`` op to do 1000 steps before each plotting. .. literalinclude:: ../code/rbm.py :start-after: start-snippet-6 :end-before: end-snippet-6 Next we create the 20 persistent chains in parallel to get our samples.
To do so, we compile a theano function which performs one Gibbs step and updates the state of the persistent chain with the new visible sample. We apply this function iteratively for a large number of steps, plotting the samples at every 1000 steps. .. literalinclude:: ../code/rbm.py :start-after: start-snippet-7 :end-before: end-snippet-7 Results +++++++ We ran the code with PCD-15, learning rate of 0.1 and a batch size of 20, for 15 epochs. Training the model takes 122.466 minutes on an Intel Xeon E5430 @ 2.66GHz CPU, with a single-threaded GotoBLAS. The output was the following: .. code-block:: bash ... loading data Training epoch 0, cost is -90.6507246003 Training epoch 1, cost is -81.235857373 Training epoch 2, cost is -74.9120966945 Training epoch 3, cost is -73.0213216101 Training epoch 4, cost is -68.4098570497 Training epoch 5, cost is -63.2693021647 Training epoch 6, cost is -65.99578971 Training epoch 7, cost is -68.1236650015 Training epoch 8, cost is -68.3207365087 Training epoch 9, cost is -64.2949797113 Training epoch 10, cost is -61.5194867893 Training epoch 11, cost is -61.6539369402 Training epoch 12, cost is -63.5465278086 Training epoch 13, cost is -63.3787093527 Training epoch 14, cost is -62.755739271 Training took 122.466000 minutes ... plotting sample 0 ... plotting sample 1 ... plotting sample 2 ... plotting sample 3 ... plotting sample 4 ... plotting sample 5 ... plotting sample 6 ... plotting sample 7 ... plotting sample 8 ... plotting sample 9 The pictures below show the filters after 15 epochs : .. figure:: images/filters_at_epoch_14.png :align: center Filters obtained after 15 epochs. Here are the samples generated by the RBM after training. Each row represents a mini-batch of negative particles (samples from independent Gibbs chains). 1000 steps of Gibbs sampling were taken between each of those rows. ..
figure:: images/samples.png :align: center ================================================ FILE: DeepLearningTutorials/doc/references.txt ================================================ .. _references: ========== References ========== .. [Bengio07] Y. Bengio, P. Lamblin, D. Popovici and H. Larochelle, `Greedy Layer-Wise Training of Deep Networks `_, in Advances in Neural Information Processing Systems 19 (NIPS'06), pages 153-160, MIT Press 2007. .. [Bengio09] Y. Bengio, `Learning deep architectures for AI `_, Foundations and Trends in Machine Learning 1(2) pages 1-127. .. [BengioDelalleau09] Y. Bengio, O. Delalleau, Justifying and Generalizing Contrastive Divergence (2009), Neural Computation, 21(6): 1601-1621. .. [BoulangerLewandowski12] N Boulanger-Lewandowski, Y. Bengio and P. Vincent, `Modeling Temporal Dependencies in High-Dimensional Sequences: Application to Polyphonic Music Generation and Transcription `_, in Proceedings of the 29th International Conference on Machine Learning (ICML), 2012. .. [Fukushima] Fukushima, K. (1980). Neocognitron: A self-organizing neural network model for a mechanism of pattern recognition unaffected by shift in position. Biological Cybernetics, 36, 193–202. .. [Hinton06] G.E. Hinton and R.R. Salakhutdinov, `Reducing the Dimensionality of Data with Neural Networks `_, Science, 28 July 2006, Vol. 313. no. 5786, pp. 504 - 507. .. [Hinton07] G.E. Hinton, S. Osindero, and Y. Teh, "A fast learning algorithm for deep belief nets", Neural Computation, vol 18, 2006 .. [Hubel68] Hubel, D. and Wiesel, T. (1968). Receptive fields and functional architecture of monkey striate cortex. Journal of Physiology (London), 195, 215–243. .. [LeCun98] LeCun, Y., Bottou, L., Bengio, Y., and Haffner, P. (1998d). Gradient-based learning applied to document recognition. Proceedings of the IEEE, 86(11), 2278–2324. .. [Lee08] H. Lee, C. Ekanadham, and A.Y. 
Ng., `Sparse deep belief net model for visual area V2 `_, in Advances in Neural Information Processing Systems (NIPS) 20, 2008. .. [Lee09] H. Lee, R. Grosse, R. Ranganath, and A.Y. Ng, "Convolutional deep belief networks for scalable unsupervised learning of hierarchical representations.", ICML 2009 .. [Ranzato10] M. Ranzato, A. Krizhevsky, G. Hinton, "Factored 3-Way Restricted Boltzmann Machines for Modeling Natural Images". Proc. of the 13-th International Conference on Artificial Intelligence and Statistics (AISTATS 2010), Italy, 2010 .. [Ranzato07] M.A. Ranzato, C. Poultney, S. Chopra and Y. LeCun, in J. Platt et al., `Efficient Learning of Sparse Representations with an Energy-Based Model `_, Advances in Neural Information Processing Systems (NIPS 2006), MIT Press, 2007. .. [Serre07] Serre, T., Wolf, L., Bileschi, S., and Riesenhuber, M. (2007). Robust object recognition with cortex-like mechanisms. IEEE Trans. Pattern Anal. Mach. Intell., 29(3), 411–426. Member-Poggio, Tomaso. .. [Vincent08] P. Vincent, H. Larochelle, Y. Bengio and P.A. Manzagol, `Extracting and Composing Robust Features with Denoising Autoencoders `_, Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08), pages 1096 - 1103, ACM, 2008. .. [Tieleman08] T. Tieleman, Training restricted boltzmann machines using approximations to the likelihood gradient, ICML 2008. .. [Xavier10] Y. Bengio, X. Glorot, Understanding the difficulty of training deep feedforward neural networks, AISTATS 2010 ================================================ FILE: DeepLearningTutorials/doc/rnnrbm.txt ================================================ .. _rnnrbm: Modeling and generating sequences of polyphonic music with the RNN-RBM ======================================================================== .. note:: This tutorial demonstrates a basic implementation of the RNN-RBM as described in [BoulangerLewandowski12]_ (`pdf `_).
We assume the reader is familiar with `recurrent neural networks using the scan op `_ and `restricted Boltzmann machines (RBM) `_. .. note:: The code for this section is available for download here: `rnnrbm.py `_. You will need the modified `Python MIDI package (GPL license) `_ in your ``$PYTHONPATH`` or in the working directory in order to convert MIDI files to and from piano-rolls. The script also assumes that the content of the `Nottingham Database of folk tunes `_ has been extracted in the ``../data`` directory. Alternative MIDI datasets are available `here `_. Note that both dependencies above can be setup automatically by running the ``download.sh`` script in the ``../data`` directory. .. caution:: Need Theano 0.6 or more recent. The RNN-RBM +++++++++++++++++++++++++ The RNN-RBM is an energy-based model for density estimation of temporal sequences, where the feature vector :math:`v^{(t)}` at time step :math:`t` may be high-dimensional. It allows to describe multimodal conditional distributions of :math:`v^{(t)}|\mathcal A^{(t)}`, where :math:`\mathcal A^{(t)}\equiv \{v_\tau|\tau