Repository: jiamings/fast-weights
Branch: master
Commit: 757ae149d602
Files: 5
Total size: 10.5 KB

Directory structure:
gitextract_y6qtsdrn/
├── .gitignore
├── README.md
├── associative_retrieval.py
├── fw.py
└── generator.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
*.pkl
*.pyc
.idea/

================================================
FILE: README.md
================================================
## Using Fast Weights to Attend to the Recent Past

Reproducing the associative retrieval experiment from the paper [Using Fast Weights to Attend to the Recent Past](https://arxiv.org/abs/1610.06258) by Jimmy Ba et al. (Incomplete)

### Prerequisites

TensorFlow (version >= 0.8)

### How to Run the Experiments

Generate a dataset:

```
$ python generator.py
```

This script generates a file called `associative-retrieval.pkl`, which can be used for training.

Run the model:

```
$ python fw.py
```

### Findings

The following are the accuracy and loss graphs for R=20 (20 recurrent hidden units). **The experiments are barely tuned.**

![](fig/acc.png)

![](fig/loss.png)

**Layer Normalization is crucial for the success of training.**

- Without it, training does not converge when the number of inner steps is larger than 1.
- Even with a single inner step, performance without Layer Normalization is much worse: for R=20, only 0.4 accuracy is reached, the same level as the other models in the paper.
- Even with Layer Normalization, using only slow weights (i.e. a vanilla RNN) performs much worse than using fast weights.

Further improvements:

- Complete fine-tuning
- Work on other tasks

### References

[Using Fast Weights to Attend to the Recent Past](https://arxiv.org/abs/1610.06258). Jimmy Ba, Geoffrey Hinton, Volodymyr Mnih, Joel Z. Leibo, Catalin Ionescu.

[Layer Normalization](https://arxiv.org/abs/1607.06450). Jimmy Ba, Ryan Kiros, Geoffrey Hinton.
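
### Appendix: Loading the Dataset

A minimal sketch of inspecting the generated data with the repo's own `read_data` helper; the shapes shown assume the default `generator.py` settings (60000 training examples, sequences of 11 steps over 26 + 10 + 1 = 37 one-hot symbols):

```
>>> from associative_retrieval import read_data
>>> data = read_data()                   # reads associative-retrieval.pkl
>>> data.train.x.shape                   # (examples, steps, symbols)
(60000, 11, 37)
>>> data.train.next_batch(100)[0].shape  # shuffled minibatch of inputs
(100, 11, 37)
```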
================================================
FILE: associative_retrieval.py
================================================
import numpy as np
import collections
try:
    import cPickle as pickle
except ImportError:
    import pickle

Datasets = collections.namedtuple('Datasets', ['train', 'val', 'test'])


class Dataset(object):
    def __init__(self, x, y):
        self._x = x
        self._y = y
        self._epoch_completed = 0
        self._index_in_epoch = 0
        self._num_examples = self.x.shape[0]
        self.perm = np.random.permutation(np.arange(self._num_examples))

    @property
    def x(self):
        return self._x

    @property
    def y(self):
        return self._y

    @property
    def num_examples(self):
        return self._num_examples

    def next_batch(self, batch_size):
        assert batch_size <= self._num_examples
        start = self._index_in_epoch
        self._index_in_epoch += batch_size
        if self._index_in_epoch >= self.num_examples:
            # Epoch finished: reshuffle the permutation and restart.
            self._epoch_completed += 1
            np.random.shuffle(self.perm)
            start = 0
            self._index_in_epoch = batch_size
        end = self._index_in_epoch
        return self._x[self.perm[start:end]], self._y[self.perm[start:end]]


def read_data(data_path='associative-retrieval.pkl'):
    with open(data_path, 'rb') as f:
        d = pickle.load(f)
    x_train = d['x_train']
    x_val = d['x_val']
    x_test = d['x_test']
    y_train = d['y_train']
    y_val = d['y_val']
    y_test = d['y_test']
    train = Dataset(x_train, y_train)
    test = Dataset(x_test, y_test)
    val = Dataset(x_val, y_val)
    return Datasets(train=train, val=val, test=test)

================================================
FILE: fw.py
================================================
from __future__ import print_function
import tensorflow as tf
import numpy as np
import time
from associative_retrieval import read_data
from subprocess import call

ar_data = read_data()
STEP_NUM = 11
ELEM_NUM = 26 + 10 + 1


class FastWeightsRecurrentNeuralNetworks(object):
    def __init__(self, step_num, elem_num, hidden_num):
        self.x = tf.placeholder(tf.float32, [None, step_num, elem_num])
        self.y = tf.placeholder(tf.float32, [None, elem_num])
        self.l = tf.placeholder(tf.float32, [])  # fast weight decay rate (lambda)
        self.e = tf.placeholder(tf.float32, [])  # fast learning rate (eta)

        self.w1 = tf.Variable(tf.random_uniform([elem_num, 50], -np.sqrt(0.02), np.sqrt(0.02)), dtype=tf.float32)
        self.b1 = tf.Variable(tf.zeros([1, 50]), dtype=tf.float32)
        self.w2 = tf.Variable(tf.random_uniform([50, 100], -np.sqrt(0.01), np.sqrt(0.01)), dtype=tf.float32)
        self.b2 = tf.Variable(tf.zeros([1, 100]), dtype=tf.float32)
        self.w3 = tf.Variable(tf.random_uniform([hidden_num, 100], -np.sqrt(0.01), np.sqrt(0.01)), dtype=tf.float32)
        self.b3 = tf.Variable(tf.zeros([1, 100]), dtype=tf.float32)
        self.w4 = tf.Variable(tf.random_uniform([100, elem_num], -np.sqrt(1.0 / elem_num), np.sqrt(1.0 / elem_num)), dtype=tf.float32)
        self.b4 = tf.Variable(tf.zeros([1, elem_num]), dtype=tf.float32)

        # Slow recurrent weights, initialized to a scaled identity matrix.
        self.w = tf.Variable(initial_value=0.05 * np.identity(hidden_num), dtype=tf.float32)
        self.c = tf.Variable(tf.random_uniform([100, hidden_num], -np.sqrt(hidden_num), np.sqrt(hidden_num)), dtype=tf.float32)
        # Normalization gain and bias.
        self.g = tf.Variable(tf.ones([1, hidden_num]), dtype=tf.float32)
        self.b = tf.Variable(tf.zeros([1, hidden_num]), dtype=tf.float32)

        batch_size = tf.shape(self.x)[0]
        # a: per-example fast weight matrices, h: hidden state.
        a = tf.zeros(tf.pack([batch_size, hidden_num, hidden_num]), dtype=tf.float32)
        h = tf.zeros([batch_size, hidden_num], dtype=tf.float32)
        la = []  # mean squared fast-weight magnitude per step (collected but unused)
        for t in range(0, step_num):
            # Two-layer ReLU embedding of the input at step t.
            z = tf.nn.relu(tf.matmul(
                tf.nn.relu(tf.matmul(self.x[:, t, :], self.w1) + self.b1), self.w2) + self.b2)
            # Slow-weight update of the hidden state.
            h = tf.nn.relu(tf.matmul(h, self.w) + tf.matmul(z, self.c))
            hs = tf.reshape(h, tf.pack([batch_size, 1, hidden_num]))
            hh = hs
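            # Fast weight update from Ba et al. (2016), the paper cited in the
            # README: the fast memory accumulates outer products of recent
            # hidden states,
            #     A(t) = lambda * A(t-1) + eta * h(t) h(t)^T,
            # and the inner loop below then iterates
            #     h_{s+1} = f(LN(W h(t) + C z(t) + A(t) h_s)),
            # letting the network attend to the recent past. self.l and self.e
            # feed lambda and eta (0.9 and 0.5 during training).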
            a = tf.add(tf.scalar_mul(self.l, a),
                       tf.scalar_mul(self.e, tf.batch_matmul(tf.transpose(hs, [0, 2, 1]), hs)))
            la.append(tf.reduce_mean(tf.square(a)))
            # Inner loop (a single step here) applying the fast weights.
            for s in range(0, 1):
                hs = tf.reshape(tf.matmul(h, self.w), tf.shape(hh)) + \
                     tf.reshape(tf.matmul(z, self.c), tf.shape(hh)) + \
                     tf.batch_matmul(hs, a)
                # Normalize with learned gain g and bias b (mean and variance
                # are taken over the batch axis, reduction_indices=0).
                mu = tf.reduce_mean(hs, reduction_indices=0)
                sig = tf.sqrt(tf.reduce_mean(tf.square(hs - mu), reduction_indices=0))
                hs = tf.nn.relu(tf.div(tf.mul(self.g, (hs - mu)), sig) + self.b)
            h = tf.reshape(hs, tf.pack([batch_size, hidden_num]))

        h = tf.nn.relu(tf.matmul(h, self.w3) + self.b3)
        logits = tf.matmul(h, self.w4) + self.b4
        self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, self.y))
        self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(self.loss)
        correct = tf.equal(tf.argmax(logits, dimension=1), tf.argmax(self.y, dimension=1))
        self.acc = tf.reduce_mean(tf.cast(correct, tf.float32))
        self.summary = tf.merge_summary([
            tf.scalar_summary('loss', self.loss),
            tf.scalar_summary('acc', self.acc)
        ])
        self.sess = tf.Session()

    def train(self, save=0, verbose=0):
        call('rm -rf ./summary'.split(' '))
        self.sess.run(tf.initialize_all_variables())
        writer = tf.train.SummaryWriter('./summary')
        batch_size = 100
        start_time = time.time()
        saver = tf.train.Saver(tf.all_variables())
        for epoch in range(0, 500):
            batch_idxs = 600
            for idx in range(0, batch_idxs):
                bx, by = ar_data.train.next_batch(batch_size=batch_size)
                loss, acc, summary, _ = self.sess.run(
                    [self.loss, self.acc, self.summary, self.trainer],
                    feed_dict={self.x: bx, self.y: by, self.l: 0.9, self.e: 0.5})
                writer.add_summary(summary, global_step=epoch * batch_idxs + idx)
                if verbose > 0 and idx % verbose == 0:
                    print('Epoch: [{:4d}] [{:4d}/{:4d}] time: {:.4f}, loss: {:.8f}, acc: {:.2f}'.format(
                        epoch, idx, batch_idxs, time.time() - start_time, loss, acc))
            if save > 0 and (epoch + 1) % save == 0:
                saver.save(self.sess, 'log/model', global_step=epoch)
        saver.save(self.sess, 'log/model-final')

    def test(self, val=True):
        batch_idxs = 100
        batch_size = 100
        tot = 0.0
        data = ar_data.val if val else ar_data.test
        name = 'Validation' if val else 'Test'
        for idx in range(0, batch_idxs):
            bx, by = data.next_batch(batch_size=batch_size)
            acc = self.sess.run(self.acc, feed_dict={self.x: bx, self.y: by, self.l: 0.9, self.e: 0.5})
            tot += acc / batch_idxs
        print('{}: {:.4f}'.format(name, tot))

    def load(self, save_path='log/model-final'):
        saver = tf.train.Saver(tf.all_variables())
        saver.restore(self.sess, save_path=save_path)


if __name__ == '__main__':
    c = FastWeightsRecurrentNeuralNetworks(STEP_NUM, ELEM_NUM, 20)
    c.train(verbose=10)

================================================
FILE: generator.py
================================================
import numpy as np
import random
try:
    import cPickle as pickle
except ImportError:
    import pickle

num_train = 60000
num_val = 10000
num_test = 10000
step_num = 4
elem_num = 26 + 10 + 1

x_train = np.zeros([num_train, step_num * 2 + 3, elem_num], dtype=np.float32)
x_val = np.zeros([num_val, step_num * 2 + 3, elem_num], dtype=np.float32)
x_test = np.zeros([num_test, step_num * 2 + 3, elem_num], dtype=np.float32)
y_train = np.zeros([num_train, elem_num], dtype=np.float32)
y_val = np.zeros([num_val, elem_num], dtype=np.float32)
y_test = np.zeros([num_test, elem_num], dtype=np.float32)


def get_one_hot(c):
    # One-hot encoding over 26 letters, 10 digits, and the '?' symbol.
    a = np.zeros([elem_num])
    if ord('a') <= ord(c) <= ord('z'):
        a[ord(c) - ord('a')] = 1
    elif ord('0') <= ord(c) <= ord('9'):
        a[ord(c) - ord('0') + 26] = 1
    else:
        a[-1] = 1
    return a
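
# Each generated example encodes a string like 'a1b2c3d4??c' one step at a
# time: step_num key-value pairs of a distinct letter and a digit, two '?'
# query markers, and a query key; the target is the digit stored under that
# key (here '3'). The letters and digits above are illustrative, not fixed.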
def generate_one():
    a = np.zeros([step_num * 2 + 3, elem_num])
    d = {}
    st = ''
    for i in range(0, step_num):
        # Draw a letter key not yet used in this sequence.
        c = random.randint(0, 25)
        while c in d:
            c = random.randint(0, 25)
        b = random.randint(0, 9)
        d[c] = b
        s, t = chr(c + ord('a')), chr(b + ord('0'))
        st += s + t
        a[i * 2] = get_one_hot(s)
        a[i * 2 + 1] = get_one_hot(t)
    # Pick one of the stored keys as the query; r is the expected answer.
    s = random.choice(list(d.keys()))
    t = chr(s + ord('a'))
    r = chr(d[s] + ord('0'))
    a[step_num * 2] = get_one_hot('?')
    a[step_num * 2 + 1] = get_one_hot('?')
    a[step_num * 2 + 2] = get_one_hot(t)
    st += '??' + t + r
    e = get_one_hot(r)
    return a, e


if __name__ == '__main__':
    for i in range(0, num_train):
        x_train[i], y_train[i] = generate_one()
    for i in range(0, num_test):
        x_test[i], y_test[i] = generate_one()
    for i in range(0, num_val):
        x_val[i], y_val[i] = generate_one()
    d = {
        'x_train': x_train, 'x_test': x_test, 'x_val': x_val,
        'y_train': y_train, 'y_test': y_test, 'y_val': y_val
    }
    with open('associative-retrieval.pkl', 'wb') as f:
        pickle.dump(d, f, protocol=2)
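
# Running this script writes associative-retrieval.pkl (git-ignored, see
# .gitignore) with x_* arrays of shape (num_*, step_num * 2 + 3, elem_num),
# i.e. (60000, 11, 37) for training, and y_* arrays of shape (num_*, 37).
# protocol=2 keeps the pickle readable from both Python 2 and Python 3.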