Repository: jiamings/fast-weights
Branch: master
Commit: 757ae149d602
Files: 5
Total size: 10.5 KB

Directory structure:
gitextract_y6qtsdrn/
├── .gitignore
├── README.md
├── associative_retrieval.py
├── fw.py
└── generator.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
*.pkl
*.pyc
.idea/

================================================
FILE: README.md
================================================
## Using Fast Weights to Attend to the Recent Past

Reproducing the associative retrieval experiment from the paper [Using Fast Weights to Attend to the Recent Past](https://arxiv.org/abs/1610.06258) by Jimmy Ba et al. (Incomplete)

### Prerequisites

TensorFlow (version >= 0.8)

### How to Run the Experiments

Generate a dataset:

```
$ python generator.py
```

This script generates a file called `associative-retrieval.pkl`, which can be used for training.

Run the model:

```
$ python fw.py
```

### Findings

The following are the accuracy and loss graphs for R=20 (20 recurrent hidden units). **The experiments are barely tuned.**

![](fig/acc.png)

![](fig/loss.png)

**Layer Normalization is crucial for the success of training.**

- Without it, training does not converge when the number of inner steps is larger than 1.
- Even with a single inner step, performance without Layer Normalization is much worse: for R=20, only 0.4 accuracy is reached, the same level as the other models in the paper.
- Even with Layer Normalization, using only slow weights (i.e. a vanilla RNN) performs much worse than using fast weights.

Further improvements:

- Complete fine-tuning
- Work on other tasks

### References

[Using Fast Weights to Attend to the Recent Past](https://arxiv.org/abs/1610.06258). Jimmy Ba, Geoffrey Hinton, Volodymyr Mnih, Joel Z. Leibo, Catalin Ionescu.

[Layer Normalization](https://arxiv.org/abs/1607.06450). Jimmy Ba, Ryan Kiros, Geoffrey Hinton.
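
### Appendix: Loading the Dataset

A minimal sketch of inspecting the generated data with the repo's own `read_data` helper; the shapes shown assume the default `generator.py` settings (60000 training examples, sequences of 11 steps over 26 + 10 + 1 = 37 one-hot symbols):

```
>>> from associative_retrieval import read_data
>>> data = read_data()                   # reads associative-retrieval.pkl
>>> data.train.x.shape                   # (examples, steps, symbols)
(60000, 11, 37)
>>> data.train.next_batch(100)[0].shape  # shuffled minibatch of inputs
(100, 11, 37)
```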
================================================
FILE: associative_retrieval.py
================================================
import numpy as np
import collections
try:
    import cPickle as pickle
except ImportError:
    import pickle

Datasets = collections.namedtuple('Datasets', ['train', 'val', 'test'])


class Dataset(object):
    def __init__(self, x, y):
        self._x = x
        self._y = y
        self._epoch_completed = 0
        self._index_in_epoch = 0
        self._num_examples = self.x.shape[0]
        self.perm = np.random.permutation(np.arange(self._num_examples))

    @property
    def x(self):
        return self._x

    @property
    def y(self):
        return self._y

    @property
    def num_examples(self):
        return self._num_examples

    def next_batch(self, batch_size):
        assert batch_size <= self._num_examples
        start = self._index_in_epoch
        self._index_in_epoch += batch_size
        if self._index_in_epoch >= self.num_examples:
            # Epoch finished: reshuffle the permutation and restart.
            self._epoch_completed += 1
            np.random.shuffle(self.perm)
            start = 0
            self._index_in_epoch = batch_size
        end = self._index_in_epoch
        return self._x[self.perm[start:end]], self._y[self.perm[start:end]]


def read_data(data_path='associative-retrieval.pkl'):
    with open(data_path, 'rb') as f:
        d = pickle.load(f)
    x_train = d['x_train']
    x_val = d['x_val']
    x_test = d['x_test']
    y_train = d['y_train']
    y_val = d['y_val']
    y_test = d['y_test']
    train = Dataset(x_train, y_train)
    test = Dataset(x_test, y_test)
    val = Dataset(x_val, y_val)
    return Datasets(train=train, val=val, test=test)

================================================
FILE: fw.py
================================================
from __future__ import print_function
import tensorflow as tf
import numpy as np
import time
from associative_retrieval import read_data
from subprocess import call

ar_data = read_data()
STEP_NUM = 11
ELEM_NUM = 26 + 10 + 1


class FastWeightsRecurrentNeuralNetworks(object):
    def __init__(self, step_num, elem_num, hidden_num):
        self.x = tf.placeholder(tf.float32, [None, step_num, elem_num])
        self.y = tf.placeholder(tf.float32, [None, elem_num])
        self.l = tf.placeholder(tf.float32, [])  # fast weight decay rate (lambda)
        self.e = tf.placeholder(tf.float32, [])  # fast learning rate (eta)

        self.w1 = tf.Variable(tf.random_uniform([elem_num, 50], -np.sqrt(0.02), np.sqrt(0.02)), dtype=tf.float32)
        self.b1 = tf.Variable(tf.zeros([1, 50]), dtype=tf.float32)
        self.w2 = tf.Variable(tf.random_uniform([50, 100], -np.sqrt(0.01), np.sqrt(0.01)), dtype=tf.float32)
        self.b2 = tf.Variable(tf.zeros([1, 100]), dtype=tf.float32)
        self.w3 = tf.Variable(tf.random_uniform([hidden_num, 100], -np.sqrt(0.01), np.sqrt(0.01)), dtype=tf.float32)
        self.b3 = tf.Variable(tf.zeros([1, 100]), dtype=tf.float32)
        self.w4 = tf.Variable(tf.random_uniform([100, elem_num], -np.sqrt(1.0 / elem_num), np.sqrt(1.0 / elem_num)), dtype=tf.float32)
        self.b4 = tf.Variable(tf.zeros([1, elem_num]), dtype=tf.float32)

        # Slow recurrent weights, initialized to a scaled identity matrix.
        self.w = tf.Variable(initial_value=0.05 * np.identity(hidden_num), dtype=tf.float32)
        self.c = tf.Variable(tf.random_uniform([100, hidden_num], -np.sqrt(hidden_num), np.sqrt(hidden_num)), dtype=tf.float32)
        # Normalization gain and bias.
        self.g = tf.Variable(tf.ones([1, hidden_num]), dtype=tf.float32)
        self.b = tf.Variable(tf.zeros([1, hidden_num]), dtype=tf.float32)

        batch_size = tf.shape(self.x)[0]
        # a: per-example fast weight matrices, h: hidden state.
        a = tf.zeros(tf.pack([batch_size, hidden_num, hidden_num]), dtype=tf.float32)
        h = tf.zeros([batch_size, hidden_num], dtype=tf.float32)
        la = []  # mean squared fast-weight magnitude per step (collected but unused)
        for t in range(0, step_num):
            # Two-layer ReLU embedding of the input at step t.
            z = tf.nn.relu(tf.matmul(
                tf.nn.relu(tf.matmul(self.x[:, t, :], self.w1) + self.b1), self.w2) + self.b2)
            # Slow-weight update of the hidden state.
            h = tf.nn.relu(tf.matmul(h, self.w) + tf.matmul(z, self.c))
            hs = tf.reshape(h, tf.pack([batch_size, 1, hidden_num]))
            hh = hs
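            # Fast weight update from Ba et al. (2016), the paper cited in the
            # README: the fast memory accumulates outer products of recent
            # hidden states,
            #     A(t) = lambda * A(t-1) + eta * h(t) h(t)^T,
            # and the inner loop below then iterates
            #     h_{s+1} = f(LN(W h(t) + C z(t) + A(t) h_s)),
            # letting the network attend to the recent past. self.l and self.e
            # feed lambda and eta (0.9 and 0.5 during training).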
            a = tf.add(tf.scalar_mul(self.l, a),
                       tf.scalar_mul(self.e, tf.batch_matmul(tf.transpose(hs, [0, 2, 1]), hs)))
            la.append(tf.reduce_mean(tf.square(a)))
            # Inner loop (a single step here) applying the fast weights.
            for s in range(0, 1):
                hs = tf.reshape(tf.matmul(h, self.w), tf.shape(hh)) + \
                     tf.reshape(tf.matmul(z, self.c), tf.shape(hh)) + \
                     tf.batch_matmul(hs, a)
                # Normalize with learned gain g and bias b (mean and variance
                # are taken over the batch axis, reduction_indices=0).
                mu = tf.reduce_mean(hs, reduction_indices=0)
                sig = tf.sqrt(tf.reduce_mean(tf.square(hs - mu), reduction_indices=0))
                hs = tf.nn.relu(tf.div(tf.mul(self.g, (hs - mu)), sig) + self.b)
            h = tf.reshape(hs, tf.pack([batch_size, hidden_num]))

        h = tf.nn.relu(tf.matmul(h, self.w3) + self.b3)
        logits = tf.matmul(h, self.w4) + self.b4
        self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, self.y))
        self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(self.loss)
        correct = tf.equal(tf.argmax(logits, dimension=1), tf.argmax(self.y, dimension=1))
        self.acc = tf.reduce_mean(tf.cast(correct, tf.float32))
        self.summary = tf.merge_summary([
            tf.scalar_summary('loss', self.loss),
            tf.scalar_summary('acc', self.acc)
        ])
        self.sess = tf.Session()

    def train(self, save=0, verbose=0):
        call('rm -rf ./summary'.split(' '))
        self.sess.run(tf.initialize_all_variables())
        writer = tf.train.SummaryWriter('./summary')
        batch_size = 100
        start_time = time.time()
        saver = tf.train.Saver(tf.all_variables())
        for epoch in range(0, 500):
            batch_idxs = 600
            for idx in range(0, batch_idxs):
                bx, by = ar_data.train.next_batch(batch_size=batch_size)
                loss, acc, summary, _ = self.sess.run(
                    [self.loss, self.acc, self.summary, self.trainer],
                    feed_dict={self.x: bx, self.y: by, self.l: 0.9, self.e: 0.5})
                writer.add_summary(summary, global_step=epoch * batch_idxs + idx)
                if verbose > 0 and idx % verbose == 0:
                    print('Epoch: [{:4d}] [{:4d}/{:4d}] time: {:.4f}, loss: {:.8f}, acc: {:.2f}'.format(
                        epoch, idx, batch_idxs, time.time() - start_time, loss, acc))
            if save > 0 and (epoch + 1) % save == 0:
                saver.save(self.sess, 'log/model', global_step=epoch)
        saver.save(self.sess, 'log/model-final')

    def test(self, val=True):
        batch_idxs = 100
        batch_size = 100
        tot = 0.0
        data = ar_data.val if val else ar_data.test
        name = 'Validation' if val else 'Test'
        for idx in range(0, batch_idxs):
            bx, by = data.next_batch(batch_size=batch_size)
            acc = self.sess.run(self.acc, feed_dict={self.x: bx, self.y: by, self.l: 0.9, self.e: 0.5})
            tot += acc / batch_idxs
        print('{}: {:.4f}'.format(name, tot))

    def load(self, save_path='log/model-final'):
        saver = tf.train.Saver(tf.all_variables())
        saver.restore(self.sess, save_path=save_path)


if __name__ == '__main__':
    c = FastWeightsRecurrentNeuralNetworks(STEP_NUM, ELEM_NUM, 20)
    c.train(verbose=10)

================================================
FILE: generator.py
================================================
import numpy as np
import random
try:
    import cPickle as pickle
except ImportError:
    import pickle

num_train = 60000
num_val = 10000
num_test = 10000
step_num = 4
elem_num = 26 + 10 + 1

x_train = np.zeros([num_train, step_num * 2 + 3, elem_num], dtype=np.float32)
x_val = np.zeros([num_val, step_num * 2 + 3, elem_num], dtype=np.float32)
x_test = np.zeros([num_test, step_num * 2 + 3, elem_num], dtype=np.float32)
y_train = np.zeros([num_train, elem_num], dtype=np.float32)
y_val = np.zeros([num_val, elem_num], dtype=np.float32)
y_test = np.zeros([num_test, elem_num], dtype=np.float32)


def get_one_hot(c):
    # One-hot encoding over 26 letters, 10 digits, and the '?' symbol.
    a = np.zeros([elem_num])
    if ord('a') <= ord(c) <= ord('z'):
        a[ord(c) - ord('a')] = 1
    elif ord('0') <= ord(c) <= ord('9'):
        a[ord(c) - ord('0') + 26] = 1
    else:
        a[-1] = 1
    return a
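
# Each generated example encodes a string like 'a1b2c3d4??c' one step at a
# time: step_num key-value pairs of a distinct letter and a digit, two '?'
# query markers, and a query key; the target is the digit stored under that
# key (here '3'). The letters and digits above are illustrative, not fixed.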
def generate_one():
    a = np.zeros([step_num * 2 + 3, elem_num])
    d = {}
    st = ''
    for i in range(0, step_num):
        # Draw a letter key not yet used in this sequence.
        c = random.randint(0, 25)
        while c in d:
            c = random.randint(0, 25)
        b = random.randint(0, 9)
        d[c] = b
        s, t = chr(c + ord('a')), chr(b + ord('0'))
        st += s + t
        a[i * 2] = get_one_hot(s)
        a[i * 2 + 1] = get_one_hot(t)
    # Pick one of the stored keys as the query; r is the expected answer.
    s = random.choice(list(d.keys()))
    t = chr(s + ord('a'))
    r = chr(d[s] + ord('0'))
    a[step_num * 2] = get_one_hot('?')
    a[step_num * 2 + 1] = get_one_hot('?')
    a[step_num * 2 + 2] = get_one_hot(t)
    st += '??' + t + r
    e = get_one_hot(r)
    return a, e


if __name__ == '__main__':
    for i in range(0, num_train):
        x_train[i], y_train[i] = generate_one()
    for i in range(0, num_test):
        x_test[i], y_test[i] = generate_one()
    for i in range(0, num_val):
        x_val[i], y_val[i] = generate_one()
    d = {
        'x_train': x_train, 'x_test': x_test, 'x_val': x_val,
        'y_train': y_train, 'y_test': y_test, 'y_val': y_val
    }
    with open('associative-retrieval.pkl', 'wb') as f:
        pickle.dump(d, f, protocol=2)
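
# Running this script writes associative-retrieval.pkl (git-ignored, see
# .gitignore) with x_* arrays of shape (num_*, step_num * 2 + 3, elem_num),
# i.e. (60000, 11, 37) for training, and y_* arrays of shape (num_*, 37).
# protocol=2 keeps the pickle readable from both Python 2 and Python 3.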