[
  {
    "path": ".gitignore",
    "content": "*.pkl\n*.pyc\n.idea/\n"
  },
  {
    "path": "README.md",
    "content": "## Using Fast Weights to Attend to the Recent Past\n\nReproducing the associative model experiment on the paper\n\n[Using Fast Weights to Attend to the Recent Past](https://arxiv.org/abs/1610.06258) by Jimmy Ba et al. (Incomplete)\n\n\n\n### Prerequisites\n\nTensorflow (version >= 0.8)\n\n\n\n### How to Run the Experiments\n\nGenerate a dataset\n\n```\n$ python generator.py\n```\n\nThis script generates a file called `associative-retrieval.pkl`, which can be used for training.\n\n\n\nRun the model\n\n```\n$ python fw.py\n```\n\n\n\n### Findings\n\nThe following is the accuracy and loss graph for R=20. **The experiments are barely tuned.**\n\n![](fig/acc.png)\n\n\n\n![](fig/loss.png)\n\n**Layer Normalization is extremely crucial for the success of training.** \n\n- Otherwise, training will not converge when the inner step is larger than 1. \n- Even when inner step of 1, the performance without layer normalization is much worse. For R=20, only 0.4 accuracy can be achieved (which is same as the level of other models.)\n- Even with Layer Normalization, using slow weights (ie. vanilla RNN) is much worse than using fast weights.\n\n\n\nFurther improvements:\n\n- Complete fine-tuning\n- Work on other tasks\n\n\n\n\n### References\n\n[Using Fast Weights to Attend to the Recent Past](https://arxiv.org/abs/1610.06258). Jimmy Ba,  Geoffrey Hinton, Volodymyr Mnih, Joel Z. Leibo, Catalin Ionescu.\n\n[Layer Normalization](https://arxiv.org/abs/1607.06450). Jimmy Ba, Ryan Kiros, Geoffery Hinton."
  },
  {
    "path": "associative_retrieval.py",
    "content": "import numpy as np\nimport collections\ntry:\n    import cPickle as pickle\nexcept ImportError:\n    import pickle\n\n\nDatasets = collections.namedtuple('Datasets', ['train', 'val', 'test'])\n\n\nclass Dataset(object):\n    def __init__(self, x, y):\n        self._x = x\n        self._y = y\n        self._epoch_completed = 0\n        self._index_in_epoch = 0\n        self._num_examples = self.x.shape[0]\n        self.perm = np.random.permutation(np.arange(self._num_examples))\n\n    @property\n    def x(self):\n        return self._x\n\n    @property\n    def y(self):\n        return self._y\n\n    @property\n    def num_examples(self):\n        return self._num_examples\n\n    def next_batch(self, batch_size):\n        assert batch_size <= self._num_examples\n        start = self._index_in_epoch\n        self._index_in_epoch += batch_size\n        if self._index_in_epoch >= self.num_examples:\n            self._epoch_completed += 1\n            np.random.shuffle(self.perm)\n            start = 0\n            self._index_in_epoch = batch_size\n        end = self._index_in_epoch\n        return self._x[self.perm[start:end]], self._y[self.perm[start:end]]\n\n\ndef read_data(data_path='associative-retrieval.pkl'):\n    with open(data_path, 'rb') as f:\n        d = pickle.load(f)\n    x_train = d['x_train']\n    x_val = d['x_val']\n    x_test = d['x_test']\n    y_train = d['y_train']\n    y_val = d['y_val']\n    y_test = d['y_test']\n    train = Dataset(x_train, y_train)\n    test = Dataset(x_test, y_test)\n    val = Dataset(x_val, y_val)\n    return Datasets(train=train, val=val, test=test)\n\n"
  },
  {
    "path": "fw.py",
    "content": "from __future__ import print_function\n\nimport tensorflow as tf\nimport numpy as np\nimport time\nfrom associative_retrieval import read_data\nfrom subprocess import call\n\nar_data = read_data()\n\nSTEP_NUM = 11\nELEM_NUM = 26 + 10 + 1\n\n\nclass FastWeightsRecurrentNeuralNetworks(object):\n    def __init__(self, step_num, elem_num, hidden_num):\n        self.x = tf.placeholder(tf.float32, [None, step_num, elem_num])\n        self.y = tf.placeholder(tf.float32, [None, elem_num])\n        self.l = tf.placeholder(tf.float32, [])\n        self.e = tf.placeholder(tf.float32, [])\n\n        self.w1 = tf.Variable(tf.random_uniform([elem_num, 50], -np.sqrt(0.02), np.sqrt(0.02)), dtype=tf.float32)\n        self.b1 = tf.Variable(tf.zeros([1, 50]), dtype=tf.float32)\n        self.w2 = tf.Variable(tf.random_uniform([50, 100], -np.sqrt(0.01), np.sqrt(0.01)), dtype=tf.float32)\n        self.b2 = tf.Variable(tf.zeros([1, 100]), dtype=tf.float32)\n        self.w3 = tf.Variable(tf.random_uniform([hidden_num, 100], -np.sqrt(0.01), np.sqrt(0.01)), dtype=tf.float32)\n        self.b3 = tf.Variable(tf.zeros([1, 100]), dtype=tf.float32)\n        self.w4 = tf.Variable(tf.random_uniform([100, elem_num], -np.sqrt(1.0 / elem_num), np.sqrt(1.0 / elem_num)),\n                              dtype=tf.float32)\n        self.b4 = tf.Variable(tf.zeros([1, elem_num]), dtype=tf.float32)\n\n        self.w = tf.Variable(initial_value=0.05 * np.identity(hidden_num), dtype=tf.float32)\n        self.c = tf.Variable(tf.random_uniform([100, hidden_num], -np.sqrt(hidden_num), np.sqrt(hidden_num)),\n                             dtype=tf.float32)\n        self.g = tf.Variable(tf.ones([1, hidden_num]), dtype=tf.float32)\n        self.b = tf.Variable(tf.zeros([1, hidden_num]), dtype=tf.float32)\n\n        batch_size = tf.shape(self.x)[0]\n\n        a = tf.zeros(tf.pack([batch_size, hidden_num, hidden_num]), dtype=tf.float32)\n        h = tf.zeros([batch_size, hidden_num], dtype=tf.float32)\n        la = []\n        for t in range(0, step_num):\n            z = tf.nn.relu(tf.matmul(\n                tf.nn.relu(tf.matmul(self.x[:, t, :], self.w1) + self.b1),\n                self.w2) + self.b2\n                           )\n            h = tf.nn.relu(\n                tf.matmul(h, self.w) + tf.matmul(z, self.c)\n            )\n            hs = tf.reshape(h, tf.pack([batch_size, 1, hidden_num]))\n            hh = hs\n            a = tf.add(tf.scalar_mul(self.l, a),\n                       tf.scalar_mul(self.e, tf.batch_matmul(tf.transpose(hs, [0, 2, 1]), hs)))\n            la.append(tf.reduce_mean(tf.square(a)))\n            for s in range(0, 1):\n                hs = tf.reshape(tf.matmul(h, self.w), tf.shape(hh)) + \\\n                     tf.reshape(tf.matmul(z, self.c), tf.shape(hh)) + \\\n                     tf.batch_matmul(hs, a)\n                mu = tf.reduce_mean(hs, reduction_indices=0)\n                sig = tf.sqrt(tf.reduce_mean(tf.square(hs - mu), reduction_indices=0))\n                hs = tf.nn.relu(tf.div(tf.mul(self.g, (hs - mu)), sig) + self.b)\n            h = tf.reshape(hs, tf.pack([batch_size, hidden_num]))\n        h = tf.nn.relu(tf.matmul(h, self.w3) + self.b3)\n        logits = tf.matmul(h, self.w4) + self.b4\n        self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, self.y))\n        self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(self.loss)\n        correct = tf.equal(tf.argmax(logits, dimension=1), tf.argmax(self.y, dimension=1))\n        self.acc = tf.reduce_mean(tf.cast(correct, tf.float32))\n        self.summary = tf.merge_summary([\n            tf.scalar_summary('loss', self.loss),\n            tf.scalar_summary('acc', self.acc)\n        ])\n        self.sess = tf.Session()\n\n    def train(self, save=0, verbose=0):\n        call('rm -rf ./summary'.split(' '))\n        self.sess.run(tf.initialize_all_variables())\n        writer = tf.train.SummaryWriter('./summary')\n        batch_size = 100\n        start_time = time.time()\n        saver = tf.train.Saver(tf.all_variables())\n        for epoch in range(0, 500):\n            batch_idxs = 600\n            for idx in range(0, batch_idxs):\n                bx, by = ar_data.train.next_batch(batch_size=batch_size)\n                loss, acc, summary, _ = self.sess.run([self.loss, self.acc, self.summary, self.trainer],\n                                        feed_dict={self.x: bx, self.y: by, self.l: 0.9, self.e: 0.5})\n                writer.add_summary(summary, global_step=epoch * batch_idxs + idx)\n                if verbose > 0 and idx % verbose == 0:\n                    print('Epoch: [{:4d}] [{:4d}/{:4d}] time: {:.4f}, loss: {:.8f}, acc: {:.2f}'.format(\n                        epoch, idx, batch_idxs, time.time() - start_time, loss, acc\n                    ))\n            if save > 0 and (epoch+1) % save == 0:\n                saver.save(self.sess, 'log/model', global_step=epoch)\n        saver.save(self.sess, 'log/moodel-final')\n\n    def test(self, val=True):\n        batch_idxs = 100\n        batch_size = 100\n        tot = 0.0\n        data = ar_data.val if val else ar_data.test\n        name = 'Validation' if val else 'Test'\n        for idx in range(0, batch_idxs):\n            bx, by = data.next_batch(batch_size=batch_size)\n            acc = self.sess.run(self.acc, feed_dict={self.x: bx, self.y: by, self.l: 0.9, self.e: 0.5})\n            tot += acc / batch_idxs\n        print('{}: {:.4f}'.format(name, tot))\n\n    def load(self, save_path='log/model-final'):\n        saver = tf.train.Saver(tf.all_variables())\n        saver.restore(self.sess, save_path=save_path)\n\n\nif __name__ == '__main__':\n    c = FastWeightsRecurrentNeuralNetworks(STEP_NUM, ELEM_NUM, 20)\n    c.train(verbose=10)\n\n\n\n\n\n"
  },
  {
    "path": "generator.py",
    "content": "import numpy as np\nimport random\nimport cPickle as pickle\n\nnum_train = 60000\nnum_val = 10000\nnum_test = 10000\n\nstep_num = 4\nelem_num = 26 + 10 + 1\n\nx_train = np.zeros([num_train, step_num * 2 + 3, elem_num], dtype=np.float32)\nx_val = np.zeros([num_val, step_num * 2 + 3, elem_num], dtype=np.float32)\nx_test = np.zeros([num_test, step_num * 2 + 3, elem_num], dtype=np.float32)\n\ny_train = np.zeros([num_train, elem_num], dtype=np.float32)\ny_val = np.zeros([num_val, elem_num], dtype=np.float32)\ny_test = np.zeros([num_test, elem_num], dtype=np.float32)\n\n\ndef get_one_hot(c):\n    a = np.zeros([elem_num])\n    if ord('a') <= ord(c) <= ord('z'):\n        a[ord(c) - ord('a')] = 1\n    elif ord('0') <= ord(c) <= ord('9'):\n        a[ord(c) - ord('0') + 26] = 1\n    else:\n        a[-1] = 1\n    return a\n\n\ndef generate_one():\n    a = np.zeros([step_num * 2 + 3, elem_num])\n    d = {}\n    st = ''\n\n    for i in range(0, step_num):\n        c = random.randint(0, 25)\n        while d.has_key(c):\n            c = random.randint(0, 25)\n        b = random.randint(0, 9)\n        d[c] = b\n        s, t = chr(c + ord('a')), chr(b + ord('0'))\n        st += s + t\n        a[i*2] = get_one_hot(s)\n        a[i*2+1] = get_one_hot(t)\n\n    s = random.choice(d.keys())\n    t = chr(s + ord('a'))\n    r = chr(d[s] + ord('0'))\n    a[step_num * 2] = get_one_hot('?')\n    a[step_num * 2 + 1] = get_one_hot('?')\n    a[step_num * 2 + 2] = get_one_hot(t)\n    st += '??' + t + r\n    e = get_one_hot(r)\n    return a, e\n\nif __name__ == '__main__':\n    for i in range(0, num_train):\n        x_train[i], y_train[i] = generate_one()\n\n    for i in range(0, num_test):\n        x_test[i], y_test[i] = generate_one()\n\n    for i in range(0, num_val):\n        x_val[i], y_val[i] = generate_one()\n\n    d = {\n        'x_train': x_train,\n        'x_test': x_test,\n        'x_val': x_val,\n        'y_train': y_train,\n        'y_test': y_test,\n        'y_val': y_val\n    }\n    with open('associative-retrieval.pkl', 'wb') as f:\n        pickle.dump(d, f, protocol=2)\n"
  }
]