Repository: MultiPath/CopyNet
Branch: master
Commit: dba24d58d505
Files: 56
Total size: 609.0 KB
Directory structure:
gitextract_9j7ufkjh/
├── .idea/
│ └── vcs.xml
├── LICENSE
├── README.md
├── emolga/
│ ├── __init__.py
│ ├── basic/
│ │ ├── __init__.py
│ │ ├── activations.py
│ │ ├── initializations.py
│ │ ├── objectives.py
│ │ └── optimizers.py
│ ├── config.py
│ ├── config_variant.py
│ ├── dataset/
│ │ └── build_dataset.py
│ ├── layers/
│ │ ├── __init__.py
│ │ ├── attention.py
│ │ ├── core.py
│ │ ├── embeddings.py
│ │ ├── gridlstm.py
│ │ ├── ntm_minibatch.py
│ │ └── recurrent.py
│ ├── models/
│ │ ├── __init__.py
│ │ ├── core.py
│ │ ├── covc_encdec.py
│ │ ├── encdec.py
│ │ ├── ntm_encdec.py
│ │ ├── pointers.py
│ │ └── variational.py
│ ├── run.py
│ ├── test_lm.py
│ ├── test_nvtm.py
│ ├── test_run.py
│ ├── utils/
│ │ ├── __init__.py
│ │ ├── generic_utils.py
│ │ ├── io_utils.py
│ │ ├── np_utils.py
│ │ ├── test_utils.py
│ │ └── theano_utils.py
│ └── voc.pkl
└── experiments/
├── __init__.py
├── bst_dataset.py
├── bst_vest.py
├── config.py
├── copynet.py
├── copynet_input.py
├── dataset.py
├── lcsts_dataset.py
├── lcsts_rouge.py
├── lcsts_sample.py
├── lcsts_test.py
├── lcsts_vest.py
├── lcsts_vest_new.py
├── movie_dataset.py
├── syn_vest.py
├── syntest.py
├── synthetic.py
├── weibo_dataset.py
└── weibo_vest.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .idea/vcs.xml
================================================
================================================
FILE: LICENSE
================================================
MIT License
Copyright (c) 2016 Jiatao Gu
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
# CopyNet
Incorporating Copying Mechanism in Sequence-to-Sequence Learning (Gu et al., ACL 2016).
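
The decoder mixes a *generate* mode over the vocabulary with a *copy* mode over
the source words, normalized jointly (paraphrased from the paper):

    p(y_t | .) = (1 / Z) * [ exp(psi_g(y_t)) + sum_{j : x_j = y_t} exp(psi_c(x_j)) ]

where `psi_g` scores vocabulary words, `psi_c` scores copying the source word at
position `j`, and `Z` normalizes over both modes. The model code lives under
`emolga/models/`, and the training/evaluation scripts under `experiments/`.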
================================================
FILE: emolga/__init__.py
================================================
__author__ = 'yinpengcheng'
================================================
FILE: emolga/basic/__init__.py
================================================
__author__ = 'jiataogu'
================================================
FILE: emolga/basic/activations.py
================================================
import theano.tensor as T
def softmax(x):
return T.nnet.softmax(x.reshape((-1, x.shape[-1]))).reshape(x.shape)
def vector_softmax(x):
return T.nnet.softmax(x.reshape((1, x.shape[0])))[0]
def time_distributed_softmax(x):
import warnings
warnings.warn("time_distributed_softmax is deprecated. Just use softmax!", DeprecationWarning)
return softmax(x)
def softplus(x):
return T.nnet.softplus(x)
def relu(x):
return T.nnet.relu(x)
def tanh(x):
return T.tanh(x)
def sigmoid(x):
return T.nnet.sigmoid(x)
def hard_sigmoid(x):
return T.nnet.hard_sigmoid(x)
def linear(x):
'''
The function returns the variable that is passed in, so all types work
'''
return x
def maxout2(x):
shape = x.shape
if x.ndim == 1:
shape1 = T.cast(shape[0] / 2, 'int64')
shape2 = T.cast(2, 'int64')
x = x.reshape([shape1, shape2])
x = x.max(1)
elif x.ndim == 2:
shape1 = T.cast(shape[1] / 2, 'int64')
shape2 = T.cast(2, 'int64')
x = x.reshape([shape[0], shape1, shape2])
x = x.max(2)
elif x.ndim == 3:
shape1 = T.cast(shape[2] / 2, 'int64')
shape2 = T.cast(2, 'int64')
x = x.reshape([shape[0], shape[1], shape1, shape2])
x = x.max(3)
return x
from emolga.utils.generic_utils import get_from_module
def get(identifier):
return get_from_module(identifier, globals(), 'activation function')
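# A minimal usage sketch (not part of the original module): maxout2 pairs up
# adjacent units along the last axis and keeps the larger of each pair,
# halving that dimension.
if __name__ == '__main__':
    import numpy as np
    import theano
    x = T.matrix()
    f = theano.function([x], maxout2(x))
    print f(np.arange(8, dtype=theano.config.floatX).reshape((2, 4)))
    # -> [[ 1.  3.]
    #     [ 5.  7.]]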
================================================
FILE: emolga/basic/initializations.py
================================================
import theano
import theano.tensor as T
import numpy as np
from emolga.utils.theano_utils import sharedX, shared_zeros, shared_ones
def get_fans(shape):
if isinstance(shape, int):
shape = (1, shape)
fan_in = shape[0] if len(shape) == 2 else np.prod(shape[1:])
fan_out = shape[1] if len(shape) == 2 else shape[0]
return fan_in, fan_out
def uniform(shape, scale=0.1):
return sharedX(np.random.uniform(low=-scale, high=scale, size=shape))
def normal(shape, scale=0.05):
return sharedX(np.random.randn(*shape) * scale)
def lecun_uniform(shape):
''' Reference: LeCun 98, Efficient Backprop
http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf
'''
fan_in, fan_out = get_fans(shape)
scale = np.sqrt(3. / fan_in)
return uniform(shape, scale)
def glorot_normal(shape):
''' Reference: Glorot & Bengio, AISTATS 2010
'''
fan_in, fan_out = get_fans(shape)
s = np.sqrt(2. / (fan_in + fan_out))
return normal(shape, s)
def glorot_uniform(shape):
fan_in, fan_out = get_fans(shape)
s = np.sqrt(6. / (fan_in + fan_out))
return uniform(shape, s)
def he_normal(shape):
''' Reference: He et al., http://arxiv.org/abs/1502.01852
'''
fan_in, fan_out = get_fans(shape)
s = np.sqrt(2. / fan_in)
return normal(shape, s)
def he_uniform(shape):
fan_in, fan_out = get_fans(shape)
s = np.sqrt(6. / fan_in)
return uniform(shape, s)
def orthogonal(shape, scale=1.1):
''' From Lasagne
'''
flat_shape = (shape[0], np.prod(shape[1:]))
a = np.random.normal(0.0, 1.0, flat_shape)
u, _, v = np.linalg.svd(a, full_matrices=False)
# pick the one with the correct shape
q = u if u.shape == flat_shape else v
q = q.reshape(shape)
return sharedX(scale * q[:shape[0], :shape[1]])
def identity(shape, scale=1):
if len(shape) != 2 or shape[0] != shape[1]:
raise Exception("Identity matrix initialization can only be used for 2D square matrices")
else:
return sharedX(scale * np.identity(shape[0]))
def zero(shape):
return shared_zeros(shape)
def one(shape):
return shared_ones(shape)
from emolga.utils.generic_utils import get_from_module
def get(identifier):
return get_from_module(identifier, globals(), 'initialization')
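# A minimal usage sketch (not part of the original module): each initializer
# returns a Theano shared variable; e.g. glorot_uniform draws from
# U(-sqrt(6 / (fan_in + fan_out)), +sqrt(6 / (fan_in + fan_out))).
if __name__ == '__main__':
    W = glorot_uniform((300, 500))
    print W.get_value().shape          # (300, 500)
    print np.abs(W.get_value()).max()  # < sqrt(6. / 800) ~= 0.0866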
================================================
FILE: emolga/basic/objectives.py
================================================
from __future__ import absolute_import
import theano
import theano.tensor as T
import numpy as np
from six.moves import range
if theano.config.floatX == 'float64':
epsilon = 1.0e-9
else:
epsilon = 1.0e-7
def mean_squared_error(y_true, y_pred):
return T.sqr(y_pred - y_true).mean(axis=-1)
def mean_absolute_error(y_true, y_pred):
return T.abs_(y_pred - y_true).mean(axis=-1)
def mean_absolute_percentage_error(y_true, y_pred):
return T.abs_((y_true - y_pred) / T.clip(T.abs_(y_true), epsilon, np.inf)).mean(axis=-1) * 100.
def mean_squared_logarithmic_error(y_true, y_pred):
return T.sqr(T.log(T.clip(y_pred, epsilon, np.inf) + 1.) - T.log(T.clip(y_true, epsilon, np.inf) + 1.)).mean(axis=-1)
def squared_hinge(y_true, y_pred):
return T.sqr(T.maximum(1. - y_true * y_pred, 0.)).mean(axis=-1)
def hinge(y_true, y_pred):
return T.maximum(1. - y_true * y_pred, 0.).mean(axis=-1)
def categorical_crossentropy(y_true, y_pred):
'''Expects a binary class matrix instead of a vector of scalar classes
'''
y_pred = T.clip(y_pred, epsilon, 1.0 - epsilon)
# scale preds so that the class probas of each sample sum to 1
y_pred /= y_pred.sum(axis=-1, keepdims=True)
cce = T.nnet.categorical_crossentropy(y_pred, y_true)
return cce
def binary_crossentropy(y_true, y_pred):
y_pred = T.clip(y_pred, epsilon, 1.0 - epsilon)
bce = T.nnet.binary_crossentropy(y_pred, y_true).mean(axis=-1)
return bce
def poisson_loss(y_true, y_pred):
return T.mean(y_pred - y_true * T.log(y_pred + epsilon), axis=-1)
####################################################
# Variational Auto-encoder
def gaussian_kl_divergence(mean, ln_var):
"""Computes the KL-divergence of Gaussian variables from the standard one.
Given two variable ``mean`` representing :math:`\\mu` and ``ln_var``
representing :math:`\\log(\\sigma^2)`, this function returns a variable
representing the KL-divergence between the given multi-dimensional Gaussian
:math:`N(\\mu, S)` and the standard Gaussian :math:`N(0, I)`
.. math::
D_{\\mathbf{KL}}(N(\\mu, S) \\| N(0, I)),
where :math:`S` is a diagonal matrix such that :math:`S_{ii} = \\sigma_i^2`
and :math:`I` is an identity matrix.
Args:
mean (~chainer.Variable): A variable representing mean of given
gaussian distribution, :math:`\\mu`.
ln_var (~chainer.Variable): A variable representing logarithm of
variance of given gaussian distribution, :math:`\\log(\\sigma^2)`.
Returns:
~chainer.Variable: A variable representing KL-divergence between
given gaussian distribution and the standard gaussian.
"""
var = T.exp(ln_var)
return 0.5 * T.sum(mean * mean + var - ln_var - 1, 1)
# aliases
mse = MSE = mean_squared_error
mae = MAE = mean_absolute_error
mape = MAPE = mean_absolute_percentage_error
msle = MSLE = mean_squared_logarithmic_error
gkl = GKL = gaussian_kl_divergence
from emolga.utils.generic_utils import get_from_module
def get(identifier):
return get_from_module(identifier, globals(), 'objective')
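# A minimal usage sketch (not part of the original module): the KL term is
# zero exactly at the standard Gaussian (mean = 0, ln_var = 0) and grows with
# the squared mean.
if __name__ == '__main__':
    m = T.matrix()
    lv = T.matrix()
    kl = theano.function([m, lv], gaussian_kl_divergence(m, lv))
    z = np.zeros((1, 4), dtype=theano.config.floatX)
    print kl(z, z)       # -> [ 0.]
    print kl(z + 1., z)  # -> [ 2.]  (0.5 * sum(mean^2) over 4 dims)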
================================================
FILE: emolga/basic/optimizers.py
================================================
from __future__ import absolute_import
import theano
import sys
from theano.sandbox.rng_mrg import MRG_RandomStreams
import theano.tensor as T
import logging
from emolga.utils.theano_utils import shared_zeros, shared_scalar, floatX
from emolga.utils.generic_utils import get_from_module
from six.moves import zip
from copy import copy, deepcopy
logger = logging.getLogger(__name__)
def clip_norm(g, c, n):
if c > 0:
g = T.switch(T.ge(n, c), g * c / n, g)
return g
def kl_divergence(p, p_hat):
return p_hat - p + p * T.log(p / p_hat)
class Optimizer(object):
def __init__(self, **kwargs):
self.__dict__.update(kwargs)
self.updates = []
self.save_parm = []
def add(self, v):
self.save_parm += [v]
def get_state(self):
return [u[0].get_value() for u in self.updates]
def set_state(self, value_list):
assert len(self.updates) == len(value_list)
for u, v in zip(self.updates, value_list):
u[0].set_value(floatX(v))
def get_updates(self, params, loss):
raise NotImplementedError
def get_gradients(self, loss, params):
"""
Consider the situation that gradient is weighted.
"""
if isinstance(loss, list):
grads = T.grad(loss[0], params, consider_constant=loss[1:]) # gradient of loss
else:
grads = T.grad(loss, params)
if hasattr(self, 'clipnorm') and self.clipnorm > 0:
print 'use gradient clipping!!'
norm = T.sqrt(sum([T.sum(g ** 2) for g in grads]))
grads = [clip_norm(g, self.clipnorm, norm) for g in grads]
return grads
def get_config(self):
return {"name": self.__class__.__name__}
class SGD(Optimizer):
def __init__(self, lr=0.05, momentum=0.9, decay=0.01, nesterov=True, *args, **kwargs):
super(SGD, self).__init__(**kwargs)
self.__dict__.update(locals())
self.iterations = shared_scalar(0)
self.lr = shared_scalar(lr)
self.momentum = shared_scalar(momentum)
def get_updates(self, params, loss):
grads = self.get_gradients(loss, params)
lr = self.lr * (1.0 / (1.0 + self.decay * self.iterations))
self.updates = [(self.iterations, self.iterations + 1.)]
for p, g in zip(params, grads):
m = shared_zeros(p.get_value().shape) # momentum
v = self.momentum * m - lr * g # velocity
self.updates.append((m, v))
if self.nesterov:
new_p = p + self.momentum * v - lr * g
else:
new_p = p + v
self.updates.append((p, new_p)) # apply constraints
return self.updates
def get_config(self):
return {"name": self.__class__.__name__,
"lr": float(self.lr.get_value()),
"momentum": float(self.momentum.get_value()),
"decay": float(self.decay.get_value()),
"nesterov": self.nesterov}
class RMSprop(Optimizer):
def __init__(self, lr=0.001, rho=0.9, epsilon=1e-6, *args, **kwargs):
super(RMSprop, self).__init__(**kwargs)
self.__dict__.update(locals())
self.lr = shared_scalar(lr)
self.rho = shared_scalar(rho)
self.iterations = shared_scalar(0)
def get_updates(self, params, loss):
grads = self.get_gradients(loss, params)
accumulators = [shared_zeros(p.get_value().shape) for p in params]
self.updates = [(self.iterations, self.iterations + 1.)]
for p, g, a in zip(params, grads, accumulators):
new_a = self.rho * a + (1 - self.rho) * g ** 2 # update accumulator
self.updates.append((a, new_a))
new_p = p - self.lr * g / T.sqrt(new_a + self.epsilon)
self.updates.append((p, new_p)) # apply constraints
return self.updates
def get_config(self):
return {"name": self.__class__.__name__,
"lr": float(self.lr.get_value()),
"rho": float(self.rho.get_value()),
"epsilon": self.epsilon}
class Adagrad(Optimizer):
def __init__(self, lr=0.01, epsilon=1e-6, *args, **kwargs):
super(Adagrad, self).__init__(**kwargs)
self.__dict__.update(locals())
self.lr = shared_scalar(lr)
def get_updates(self, params, loss):
    grads = self.get_gradients(loss, params)
    accumulators = [shared_zeros(p.get_value().shape) for p in params]
    self.updates = []
    for p, g, a in zip(params, grads, accumulators):
        new_a = a + g ** 2  # update accumulator
        self.updates.append((a, new_a))
        new_p = p - self.lr * g / T.sqrt(new_a + self.epsilon)
        self.updates.append((p, new_p))
    return self.updates
def get_config(self):
return {"name": self.__class__.__name__,
"lr": float(self.lr.get_value()),
"epsilon": self.epsilon}
class Adadelta(Optimizer):
'''
Reference: http://arxiv.org/abs/1212.5701
'''
def __init__(self, lr=0.1, rho=0.95, epsilon=1e-6, *args, **kwargs):
super(Adadelta, self).__init__(**kwargs)
self.__dict__.update(locals())
self.lr = shared_scalar(lr)
self.iterations = shared_scalar(0)
def get_updates(self, params, loss):
grads = self.get_gradients(loss, params)
accumulators = [shared_zeros(p.get_value().shape) for p in params]
delta_accumulators = [shared_zeros(p.get_value().shape) for p in params]
# self.updates = []
self.updates = [(self.iterations, self.iterations + 1.)]
for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
new_a = self.rho * a + (1 - self.rho) * g ** 2 # update accumulator
self.updates.append((a, new_a))
# use the new accumulator and the *old* delta_accumulator
update = g * T.sqrt(d_a + self.epsilon) / T.sqrt(new_a +
self.epsilon)
new_p = p - self.lr * update
self.updates.append((p, new_p))
# update delta_accumulator
new_d_a = self.rho * d_a + (1 - self.rho) * update ** 2
self.updates.append((d_a, new_d_a))
return self.updates
def get_config(self):
return {"name": self.__class__.__name__,
"lr": float(self.lr.get_value()),
"rho": self.rho,
"epsilon": self.epsilon}
class Adam(Optimizer): # new Adam is designed for our purpose.
'''
Reference: http://arxiv.org/abs/1412.6980v8
Default parameters follow those provided in the original paper.
We add Gaussian Noise to improve the performance.
'''
def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8, save=False, rng=None, *args, **kwargs):
super(Adam, self).__init__(**kwargs)
self.__dict__.update(locals())
print locals()
self.iterations = shared_scalar(0, name='iteration')
self.lr = shared_scalar(lr, name='lr')
self.rng = MRG_RandomStreams(use_cuda=True)
self.noise = []
self.forget = dict()
if rng is not None:
    self.rng = rng  # let an externally supplied stream override the default MRG stream
self.add(self.iterations)
self.add(self.lr)
def add_noise(self, param):
if param.name not in self.noise:
logger.info('add gradient noise to {}'.format(param))
self.noise += [param.name]
def add_forget(self, param):
if param.name not in self.forget:
logger.info('add forgetting list to {}'.format(param))
self.forget[param.name] = theano.shared(param.get_value())
def get_updates(self, params, loss):
grads = self.get_gradients(loss, params)
self.updates = [(self.iterations, self.iterations + 1.)]
self.pu = []
t = self.iterations + 1
lr_t = self.lr * T.sqrt(1 - self.beta_2**t) / (1 - self.beta_1**t)
for p, g in zip(params, grads):
m = theano.shared(p.get_value() * 0., name=p.name + '_m') # zero init of moment
v = theano.shared(p.get_value() * 0., name=p.name + '_v') # zero init of velocity
self.add(m)
self.add(v)
# g_noise = self.rng.normal(g.shape, 0, T.sqrt(0.005 * t ** (-0.55)), dtype='float32')
# if p.name in self.noise:
# g_deviated = g + g_noise
# else:
# g_deviated = g
g_deviated = g # + g_noise
m_t = (self.beta_1 * m) + (1 - self.beta_1) * g_deviated
v_t = (self.beta_2 * v) + (1 - self.beta_2) * (g_deviated**2)
u_t = -lr_t * m_t / (T.sqrt(v_t) + self.epsilon)
p_t = p + u_t
# # memory reformatting!
# if p.name in self.forget:
# p_t = (1 - p_mem) * p_t + p_mem * self.forget[p.name]
# p_s = (1 - p_fgt) * p_t + p_fgt * self.forget[p.name]
# self.updates.append((self.forget[p.name], p_s))
self.updates.append((m, m_t))
self.updates.append((v, v_t))
self.updates.append((p, p_t)) # apply constraints
self.pu.append((p, p_t - p))
if self.save:
return self.updates, self.pu
return self.updates
# aliases
sgd = SGD
rmsprop = RMSprop
adagrad = Adagrad
adadelta = Adadelta
adam = Adam
def get(identifier, kwargs=None):
return get_from_module(identifier, globals(), 'optimizer', instantiate=True,
kwargs=kwargs)
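# A minimal usage sketch (not part of the original module): wire an optimizer's
# updates into a theano.function and call it once per minibatch.
if __name__ == '__main__':
    import numpy as np
    x = theano.shared(np.asarray(5., dtype=theano.config.floatX), name='x')
    loss = (x - 2.) ** 2
    opt = Adam(lr=0.1)
    train = theano.function([], loss, updates=opt.get_updates([x], loss))
    for _ in range(100):
        train()
    print x.get_value()  # close to 2.0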
================================================
FILE: emolga/config.py
================================================
__author__ = 'jiataogu'
import os
import os.path as path
def setup_ptb2():
# pretraining setting up.
# get the lm_config.
config = dict()
config['on_unused_input'] = 'ignore'
config['seed'] = 3030029828
config['level'] = 'DEBUG'
# config['model'] = 'RNNLM'
# config['model'] = 'VAE'
# config['model'] = 'RNNLM' #'Helmholtz'
config['model'] = 'HarX'
config['highway'] = False
config['use_noise'] = False
config['optimizer'] = 'adam' #'adadelta'
# config['lr'] = 0.1
# config['optimizer'] = 'sgd'
# dataset
config['path'] = path.realpath(path.curdir) + '/' # '/home/thoma/Work/Dial-DRL/'
config['vocabulary_set'] = config['path'] + 'dataset/ptbcorpus/voc.pkl'
config['dataset'] = config['path'] + 'dataset/ptbcorpus/data_train.pkl'
config['dataset_valid'] = config['path'] + 'dataset/ptbcorpus/data_valid.pkl'
config['dataset_test'] = config['path'] + 'dataset/ptbcorpus/data_test.pkl'
# output hdf5 file place.
config['path_h5'] = config['path'] + 'H5'
if not os.path.exists(config['path_h5']):
os.mkdir(config['path_h5'])
# output log place
config['path_log'] = config['path'] + 'Logs'
if not os.path.exists(config['path_log']):
os.mkdir(config['path_log'])
# size
config['batch_size'] = 20
config['eval_batch_size'] = 20
config['mode'] = 'RNN' # NTM
config['binary'] = False
# Encoder: dimension
config['enc_embedd_dim'] = 300
config['enc_hidden_dim'] = 300
config['enc_contxt_dim'] = 350
config['encoder'] = 'RNN'
config['pooling'] = False
# Encoder: Model
config['bidirectional'] = False # True
config['decposterior'] = True
config['enc_use_contxt'] = False
# Agent: dimension
config['action_dim'] = 50
config['output_dim'] = 300
# Decoder: dimension
config['dec_embedd_dim'] = 300
config['dec_hidden_dim'] = 300
config['dec_contxt_dim'] = 300
# Decoder: Model
config['shared_embed'] = False
config['use_input'] = False
config['bias_code'] = False # True
config['dec_use_contxt'] = True
config['deep_out'] = False
config['deep_out_activ'] = 'tanh' # maxout2
config['bigram_predict'] = False
config['context_predict'] = True # False
config['leaky_predict'] = False # True
config['dropout'] = 0.3
# Decoder: sampling
config['max_len'] = 88 # 15
config['sample_beam'] = 10
config['sample_stoch'] = False
config['sample_argmax'] = False
# Auto-Encoder
config['nonlinear_A'] = True
config['nonlinear_B'] = False
# VAE/Helmholtz: Model
config['repeats'] = 10
config['eval_repeats'] = 10
config['eval_N'] = 10
config['variant_control'] = False
config['factor'] = 10.
config['mult_q'] = 10.
print 'setup ok.'
return config
================================================
FILE: emolga/config_variant.py
================================================
__author__ = 'jiataogu'
from config import setup_ptb2
setup = setup_ptb2
"""
This file is for small variant fix on original
"""
def setup_bienc(config=None):
if config is None:
config = setup()
print 'make some modification'
config['bidirectional'] = True
config['decposterior'] = False
return config
def setup_dim(config=None):
if config is None:
config = setup()
print 'make some modification'
config['enc_embedd_dim'] = 300
config['enc_hidden_dim'] = 300
config['action_dim'] = 100
config['dec_embedd_dim'] = 300
config['dec_hidden_dim'] = 300
config['dec_contxt_dim'] = 300
return config
def setup_rep(config=None):
if config is None:
config = setup()
print 'make some modification'
config['repeats'] = 5
return config
def setup_opt(config=None):
if config is None:
config = setup()
print 'make some modification'
config['optimizer'] = 'Adam'
return config
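# A minimal usage sketch (not part of the original module): the variants
# compose, so modifications can be stacked on top of the base setup.
if __name__ == '__main__':
    config = setup_rep(setup_bienc())
    print config['bidirectional'], config['repeats']  # True 5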
================================================
FILE: emolga/dataset/build_dataset.py
================================================
__author__ = 'jiataogu'
import numpy as np
import numpy.random as rng
import cPickle
import pprint
import sys
from collections import OrderedDict
from fuel import datasets
from fuel import transformers
from fuel import schemes
from fuel import streams
def serialize_to_file(obj, path, protocol=cPickle.HIGHEST_PROTOCOL):
f = open(path, 'wb')
cPickle.dump(obj, f, protocol=protocol)
f.close()
def show_txt(array, path):
f = open(path, 'w')
for line in array:
f.write(' '.join(line) + '\n')
f.close()
def divide_dataset(dataset, test_size, max_size):
train_set = dict()
test_set = dict()
for w in dataset:
train_set[w] = dataset[w][test_size:max_size].astype('int32')
test_set[w] = dataset[w][:test_size].astype('int32')
return train_set, test_set
def deserialize_from_file(path):
f = open(path, 'rb')
obj = cPickle.load(f)
f.close()
return obj
def build_fuel(data):
# create fuel dataset.
dataset = datasets.IndexableDataset(indexables=OrderedDict([('data', data)]))
dataset.example_iteration_scheme \
= schemes.ShuffledExampleScheme(dataset.num_examples)
return dataset, len(data)
def obtain_stream(dataset, batch_size, size=1):
if size == 1:
data_stream = dataset.get_example_stream()
data_stream = transformers.Batch(data_stream, iteration_scheme=schemes.ConstantScheme(batch_size))
# add padding and masks to the dataset
data_stream = transformers.Padding(data_stream, mask_sources=('data',))  # note: a 1-tuple, not a plain string
return data_stream
else:
data_streams = [dataset.get_example_stream() for _ in xrange(size)]
data_streams = [transformers.Batch(data_stream, iteration_scheme=schemes.ConstantScheme(batch_size))
for data_stream in data_streams]
data_streams = [transformers.Padding(data_stream, mask_sources=('data',)) for data_stream in data_streams]
return data_streams
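# A minimal usage sketch (hypothetical toy data): build_fuel + obtain_stream
# yield shuffled, padded minibatches together with their masks, e.g.
#   dataset, n = build_fuel([[1, 2, 3], [4, 5], [6]])
#   stream = obtain_stream(dataset, batch_size=2)
#   data, mask = next(stream.get_epoch_iterator())  # data padded with 0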
def build_ptb():
path = './ptbcorpus/'
print path
# make the dataset and vocabulary
X_train = [l.split() for l in open(path + 'ptb.train.txt').readlines()]
X_test = [l.split() for l in open(path + 'ptb.test.txt').readlines()]
X_valid = [l.split() for l in open(path + 'ptb.valid.txt').readlines()]
X = X_train + X_test + X_valid
idx2word = dict(enumerate(set([w for l in X for w in l]), 1))
idx2word[0] = ''
word2idx = {v: k for k, v in idx2word.items()}
ixwords_train = [[word2idx[w] for w in l] for l in X_train]
ixwords_test = [[word2idx[w] for w in l] for l in X_test]
ixwords_valid = [[word2idx[w] for w in l] for l in X_valid]
ixwords_tv = [[word2idx[w] for w in l] for l in (X_train + X_valid)]
max_len = max([len(w) for w in X_train])
print max_len
# serialization:
# serialize_to_file(ixwords_train, path + 'data_train.pkl')
# serialize_to_file(ixwords_test, path + 'data_test.pkl')
# serialize_to_file(ixwords_valid, path + 'data_valid.pkl')
# serialize_to_file(ixwords_tv, path + 'data_tv.pkl')
# serialize_to_file([idx2word, word2idx], path + 'voc.pkl')
# show_txt(X, 'data.txt')
print 'save done.'
def filter_unk(X, min_freq=5):
voc = dict()
for l in X:
for w in l:
if w not in voc:
voc[w] = 1
else:
voc[w] += 1
word2idx = dict()
word2idx[''] = 0
id2word = dict()
id2word[0] = ''
at = 1
for w in voc:
if voc[w] > min_freq:
word2idx[w] = at
id2word[at] = w
at += 1
word2idx[''] = at  # note: '' is reused as the UNK token here, shadowing its use as padding (id 0)
id2word[at] = ''
return word2idx, id2word
def build_msr():
# path = '/home/thoma/Work/Dial-DRL/dataset/MSRSCC/'
path = '/Users/jiataogu/Work/Dial-DRL/dataset/MSRSCC/'
print path
X = [l.split() for l in open(path + 'train.txt').readlines()]
word2idx, idx2word = filter_unk(X, min_freq=5)
print 'vocabulary size={0}. {1} samples'.format(len(word2idx), len(X))
mean_len = np.mean([len(w) for w in X])
print 'mean len = {}'.format(mean_len)
ixwords = [[word2idx[w]
if w in word2idx
else word2idx['']
for w in l] for l in X]
print ixwords[0]
# serialization:
serialize_to_file(ixwords, path + 'data_train.pkl')
if __name__ == '__main__':
build_msr()
# build_ptb()
# build_dataset()
# game = GuessOrder(size=8)
# q = 'Is there any number smaller de than 6 in the last 3 numbers ?'
# print game.easy_parse(q)
================================================
FILE: emolga/layers/__init__.py
================================================
__author__ = 'yinpengcheng'
================================================
FILE: emolga/layers/attention.py
================================================
__author__ = 'jiataogu'
from .core import *
"""
Attention Model.
<::: Two kinds of attention models ::::>
-- Linear Transformation
-- Inner Product
"""
class Attention(Layer):
def __init__(self, target_dim, source_dim, hidden_dim,
init='glorot_uniform', name='attention',
coverage=False, max_len=50,
shared=False):
super(Attention, self).__init__()
self.init = initializations.get(init)
self.softmax = activations.get('softmax')
self.tanh = activations.get('tanh')
self.target_dim = target_dim
self.source_dim = source_dim
self.hidden_dim = hidden_dim
self.max_len = max_len
self.coverage = coverage
if coverage:
print 'Use Coverage Trick!'
self.Wa = self.init((self.target_dim, self.hidden_dim))
self.Ua = self.init((self.source_dim, self.hidden_dim))
self.va = self.init((self.hidden_dim, 1))
self.Wa.name, self.Ua.name, self.va.name = \
'{}_Wa'.format(name), '{}_Ua'.format(name), '{}_va'.format(name)
self.params = [self.Wa, self.Ua, self.va]
if coverage:
self.Ca = self.init((1, self.hidden_dim))
self.Ca.name = '{}_Ca'.format(name)
self.params += [self.Ca]
def __call__(self, X, S,
Smask=None,
return_log=False,
Cov=None):
assert X.ndim + 1 == S.ndim, 'source should have one more dimension than target.'
# X is the key: (nb_samples, x_dim)
# S is the source (nb_samples, maxlen_s, ctx_dim)
# Cov is the coverage vector (nb_samples, maxlen_s)
if X.ndim == 1:
X = X[None, :]
S = S[None, :, :]
if Smask is not None:
    Smask = Smask[None, :]
Eng = dot(X[:, None, :], self.Wa) + dot(S, self.Ua) # (nb_samples, source_num, hidden_dims)
Eng = self.tanh(Eng)
# location aware:
if self.coverage:
Eng += dot(Cov[:, :, None], self.Ca) # (nb_samples, source_num, hidden_dims)
Eng = dot(Eng, self.va)
Eng = Eng[:, :, 0] # ? (nb_samples, source_num)
if Smask is not None:
# I want to use mask!
EngSum = logSumExp(Eng, axis=1, mask=Smask)
if return_log:
return (Eng - EngSum) * Smask
else:
return T.exp(Eng - EngSum) * Smask
else:
if return_log:
return T.log(self.softmax(Eng))
else:
return self.softmax(Eng)
class CosineAttention(Layer):
def __init__(self, target_dim, source_dim,
init='glorot_uniform',
use_pipe=True,
name='attention'):
super(CosineAttention, self).__init__()
self.init = initializations.get(init)
self.softmax = activations.get('softmax')
self.softplus = activations.get('softplus')
self.tanh = activations.get('tanh')
self.use_pipe = use_pipe
self.target_dim = target_dim
self.source_dim = source_dim
# pipe
if self.use_pipe:
self.W_key = Dense(self.target_dim, self.source_dim, name='W_key')
else:
assert target_dim == source_dim
self.W_key = Identity(name='W_key')
self._add(self.W_key)
# sharpen
# self.W_beta = Dense(self.target_dim, 1, name='W_beta')
# dio-sharpen
# self.W_beta = Dense(self.target_dim, self.source_dim, name='W_beta')
# self._add(self.W_beta)
# self.gamma = self.init((source_dim, ))
# self.gamma = self.init((target_dim, source_dim))
# self.gamma.name = 'o_gamma'
# self.params += [self.gamma]
def __call__(self, X, S, Smask=None, return_log=False):
assert X.ndim + 1 == S.ndim, 'source should have one more dimension than target.'
if X.ndim == 1:
X = X[None, :]
S = S[None, :, :]
if Smask is not None:
    Smask = Smask[None, :]
key = self.W_key(X) # (nb_samples, source_dim)
# beta = self.softplus(self.W_beta(X)) # (nb_samples, source_dim)
Eng = dot_2d(key, S) #, g=self.gamma)
# Eng = cosine_sim2d(key, S) # (nb_samples, source_num)
# Eng = T.repeat(beta, Eng.shape[1], axis=1) * Eng
if Smask is not None:
# I want to use mask!
EngSum = logSumExp(Eng, axis=1, mask=Smask)
if return_log:
return (Eng - EngSum) * Smask
else:
return T.exp(Eng - EngSum) * Smask
else:
if return_log:
return T.log(self.softmax(Eng))
else:
return self.softmax(Eng)
================================================
FILE: emolga/layers/core.py
================================================
# -*- coding: utf-8 -*-
from emolga.utils.theano_utils import *
import emolga.basic.initializations as initializations
import emolga.basic.activations as activations
class Layer(object):
def __init__(self):
self.params = []
self.layers = []
self.monitor = {}
self.watchlist = []
def init_updates(self):
self.updates = []
def _monitoring(self):
# add monitoring variables
for l in self.layers:
for v in l.monitor:
name = v + '@' + l.name
print name
self.monitor[name] = l.monitor[v]
def __call__(self, X, *args, **kwargs):
return X
def _add(self, layer):
if layer:
self.layers.append(layer)
self.params += layer.params
def supports_masked_input(self):
''' Whether or not this layer respects the output mask of its previous layer in its calculations. If you try
to attach a layer that does *not* support masked_input to a layer that gives a non-None output_mask() that is
an error'''
return False
def get_output_mask(self, train=None):
'''
For some models (such as RNNs) you want a way of being able to mark some output data-points as
"masked", so they are not used in future calculations. In such a model, get_output_mask() should return a mask
of one less dimension than get_output() (so if get_output is (nb_samples, nb_timesteps, nb_dimensions), then the mask
is (nb_samples, nb_timesteps)), with a one for every unmasked datapoint and a zero for every masked one.
If there is *no* masking then it shall return None. For instance if you attach an Activation layer (they support masking)
to a layer with an output_mask, then that Activation shall also have an output_mask. If you attach it to a layer with no
such mask, then the Activation's get_output_mask shall return None.
Some layers in emolga have an output_mask even if their input is unmasked, notably Embedding which can turn the entry "0" into
a mask.
'''
return None
def set_weights(self, weights):
for p, w in zip(self.params, weights):
if p.eval().shape != w.shape:
raise Exception("Layer shape %s not compatible with weight shape %s." % (p.eval().shape, w.shape))
p.set_value(floatX(w))
def get_weights(self):
weights = []
for p in self.params:
weights.append(p.get_value())
return weights
def get_params(self):
return self.params
def set_name(self, name):
for i in range(len(self.params)):
if self.params[i].name is None:
self.params[i].name = '%s_p%d' % (name, i)
else:
self.params[i].name = name + '_' + self.params[i].name
self.name = name
class MaskedLayer(Layer):
'''
If your layer trivially supports masking (by simply copying the input mask to the output), then subclass MaskedLayer
instead of Layer, and make sure that you incorporate the input mask into your calculation of get_output()
'''
def supports_masked_input(self):
return True
class Identity(Layer):
def __init__(self, name='Identity'):
super(Identity, self).__init__()
if name is not None:
self.set_name(name)
def __call__(self, X):
return X
class Dense(Layer):
def __init__(self, input_dim, output_dim, init='glorot_uniform', activation='tanh', name='Dense',
learn_bias=True, negative_bias=False):
super(Dense, self).__init__()
self.init = initializations.get(init)
self.activation = activations.get(activation)
self.input_dim = input_dim
self.output_dim = output_dim
self.linear = (activation == 'linear')
# self.input = T.matrix()
self.W = self.init((self.input_dim, self.output_dim))
if not negative_bias:
self.b = shared_zeros((self.output_dim))
else:
self.b = shared_ones((self.output_dim))
self.learn_bias = learn_bias
if self.learn_bias:
self.params = [self.W, self.b]
else:
self.params = [self.W]
if name is not None:
self.set_name(name)
def set_name(self, name):
self.W.name = '%s_W' % name
self.b.name = '%s_b' % name
def __call__(self, X):
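# note: the bias enters scaled by 4, so with negative_bias=True the initial effective bias is +4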
output = self.activation(T.dot(X, self.W) + 4. * self.b)
return output
def reverse(self, Y):
assert self.linear
output = T.dot((Y - 4. * self.b), self.W.T)  # undo the same 4-scaled bias applied in __call__
return output
class Dense2(Layer):
def __init__(self, input_dim1, input_dim2, output_dim, init='glorot_uniform', activation='tanh', name='Dense', learn_bias=True):
super(Dense2, self).__init__()
self.init = initializations.get(init)
self.activation = activations.get(activation)
self.input_dim1 = input_dim1
self.input_dim2 = input_dim2
self.output_dim = output_dim
self.linear = (activation == 'linear')
# self.input = T.matrix()
self.W1 = self.init((self.input_dim1, self.output_dim))
self.W2 = self.init((self.input_dim2, self.output_dim))
self.b = shared_zeros((self.output_dim))
self.learn_bias = learn_bias
if self.learn_bias:
self.params = [self.W1, self.W2, self.b]
else:
self.params = [self.W1, self.W2]
if name is not None:
self.set_name(name)
def set_name(self, name):
self.W1.name = '%s_W1' % name
self.W2.name = '%s_W2' % name
self.b.name = '%s_b' % name
def __call__(self, X1, X2):
output = self.activation(T.dot(X1, self.W1) + T.dot(X2, self.W2) + self.b)
return output
class Constant(Layer):
def __init__(self, input_dim, output_dim, init=None, activation='tanh', name='Bias'):
super(Constant, self).__init__()
assert input_dim == output_dim, 'Bias Layer needs to have the same input/output nodes.'
self.init = initializations.get(init)
self.activation = activations.get(activation)
self.input_dim = input_dim
self.output_dim = output_dim
self.b = shared_zeros(self.output_dim)
self.params = [self.b]
if name is not None:
self.set_name(name)
def set_name(self, name):
self.b.name = '%s_b' % name
def __call__(self, X=None):
output = self.activation(self.b)
if X is not None:
L = X.shape[0]
output = T.extra_ops.repeat(output[None, :], L, axis=0)
return output
class MemoryLinear(Layer):
def __init__(self, input_dim, input_wdth, init='glorot_uniform',
activation='tanh', name='Bias', has_input=True):
super(MemoryLinear, self).__init__()
self.init = initializations.get(init)
self.activation = activations.get(activation)
self.input_dim = input_dim
self.input_wdth = input_wdth
self.b = self.init((self.input_dim, self.input_wdth))
self.params = [self.b]
if has_input:
self.P = self.init((self.input_dim, self.input_wdth))
self.params += [self.P]
if name is not None:
self.set_name(name)
def __call__(self, X=None):
out = self.b[None, :, :]
if X is not None:
out += self.P[None, :, :] * X
return self.activation(out)
class Dropout(MaskedLayer):
"""
Hinton's dropout.
"""
def __init__(self, rng=None, p=1., name=None):
super(Dropout, self).__init__()
self.p = p
self.rng = rng
def __call__(self, X, train=True):
if self.p > 0.:
retain_prob = 1. - self.p
if train:
X *= self.rng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
else:
X *= retain_prob
return X
class Activation(MaskedLayer):
"""
Apply an activation function to an output.
"""
def __init__(self, activation):
super(Activation, self).__init__()
self.activation = activations.get(activation)
def __call__(self, X):
return self.activation(X)
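# A minimal usage sketch (not part of the original module): layers here are
# plain callables on Theano variables rather than Keras-style graph nodes.
if __name__ == '__main__':
    import numpy as np
    X = T.matrix('X')
    layer = Dense(4, 3, activation='tanh', name='demo')
    f = theano.function([X], layer(X))
    print f(np.ones((2, 4), dtype=theano.config.floatX)).shape  # (2, 3)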
================================================
FILE: emolga/layers/embeddings.py
================================================
# -*- coding: utf-8 -*-
from .core import Layer
from emolga.utils.theano_utils import *
import emolga.basic.initializations as initializations
class Embedding(Layer):
'''
Turn positive integers (indexes) into dense vectors of fixed size,
e.g. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]
@input_dim: size of vocabulary (highest input integer + 1)
@out_dim: size of dense representation
'''
def __init__(self, input_dim, output_dim, init='uniform', name=None):
super(Embedding, self).__init__()
self.init = initializations.get(init)
self.input_dim = input_dim
self.output_dim = output_dim
self.W = self.init((self.input_dim, self.output_dim))
self.params = [self.W]
if name is not None:
self.set_name(name)
def get_output_mask(self, X):
return T.ones_like(X) * (1 - T.eq(X, 0))
def __call__(self, X, mask_zero=False, context=None):
if context is None:
out = self.W[X]
else:
assert context.ndim == 3
flag = False
if X.ndim == 1:
flag = True
X = X[:, None]
b_size = context.shape[0]
EMB = T.repeat(self.W[None, :, :], b_size, axis=0)
EMB = T.concatenate([EMB, context], axis=1)
m_size = EMB.shape[1]
e_size = EMB.shape[2]
maxlen = X.shape[1]
EMB = EMB.reshape((b_size * m_size, e_size))
Z = (T.arange(b_size)[:, None] * m_size + X).reshape((b_size * maxlen,))
out = EMB[Z] # (b_size * maxlen, e_size)
if not flag:
out = out.reshape((b_size, maxlen, e_size))
else:
out = out.reshape((b_size, e_size))
if mask_zero:
return out, T.cast(self.get_output_mask(X), dtype='float32')
else:
return out
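# A minimal usage sketch (hypothetical shapes): the plain path above is a table
# lookup; when `context` is given, the table is extended per sample so that
# indices >= input_dim address rows of `context` (useful for copy-mode words).
#   emb = Embedding(5000, 300, name='emb')
#   out, mask = emb(X, mask_zero=True)  # X: int32 (nb_samples, maxlen)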
class Zero(Layer):
def __call__(self, X):
out = T.zeros(X.shape)
return out
class Bias(Layer):
def __call__(self, X):
tmp = X.flatten()
tmp = tmp.dimshuffle(0, 'x')
return tmp
================================================
FILE: emolga/layers/gridlstm.py
================================================
__author__ = 'jiataogu'
"""
This file implements Grid-LSTM.
At this stage we only support 2D LSTM with pooling.
"""
from recurrent import *
from attention import Attention
import logging
import copy
logger = logging.getLogger(__name__)
class Grid(Recurrent):
"""
Grid Cell for Grid-LSTM
===================================================
LSTM
[h', m'] = LSTM(x, h, m):
gi = sigmoid(Wi * x + Ui * h + Vi * m) # Vi is peep-hole
gf = sigmoid(Wf * x + Uf * h + Vf * m)
go = sigmoid(Wo * x + Uo * h + Vo * m)
gc = tanh(Wc * x +Uc * h)
m' = gf @ m + gi @ gc (@ represents element-wise dot.)
h' = go @ tanh(m')
===================================================
Grid
(here is an example for 2D Grid LSTM with priority dimension = 1)
-------------
| c' d' | Grid Block and Grid Updates.
| a a'|
| | [d' c'] = LSTM_d([b, d], c)
| b b'| [a' b'] = LSTM_t([b, d'], a)
| c d |
-------------
===================================================
For details, please refer to:
"Grid Long Short-Term Memory", http://arxiv.org/abs/1507.01526
"""
def __init__(self,
output_dims,
input_dims, # [0, ... 0], 0 represents no external inputs.
priority=1,
peephole=True,
init='glorot_uniform', inner_init='orthogonal',
forget_bias_init='one',
activation='tanh', inner_activation='sigmoid',
use_input=False,
name=None, weights=None,
identity_connect=None
):
super(Grid, self).__init__()
# assert len(output_dims) == 2, 'in this stage, we only support 2D Grid-LSTM'
assert len(input_dims) == len(output_dims), '# of inputs must match # of outputs.'
"""
Initialization.
"""
self.input_dims = input_dims
self.output_dims = output_dims
self.N = len(output_dims)
self.priority = priority
self.peephole = peephole
self.use_input = use_input
self.init = initializations.get(init)
self.inner_init = initializations.get(inner_init)
self.forget_bias_init = initializations.get(forget_bias_init)
self.activation = activations.get(activation)
self.inner_activation = activations.get(inner_activation)
self.identity_connect = identity_connect
self.axies = {0: 'x', 1: 'y', 2: 'z', 3: 'w'} # only support at most 4D now!
"""
Others info.
"""
if weights is not None:
self.set_weights(weights)
if name is not None:
self.set_name(name)
def build(self):
"""
Build the model weights
"""
logger.info("Building GridPool-LSTM !!")
self.W = dict()
self.U = dict()
self.V = dict()
self.b = dict()
# ******************************************************************************************
for k in xrange(self.N): # N-Grids (for 2 dimensions, 0 is for time; 1 is for depth.)
axis = self.axies[k]
# input layers:
if self.input_dims[k] > 0 and self.use_input:
# use the data information.
self.W[axis + '#i'], self.W[axis + '#f'], \
self.W[axis + '#o'], self.W[axis + '#c'] \
= [self.init((self.input_dims[k], self.output_dims[k])) for _ in xrange(4)]
# hidden layers:
for j in xrange(self.N): # every hidden states inputs.
pos = self.axies[j]
if k == j:
self.U[axis + pos + '#i'], self.U[axis + pos + '#f'], \
self.U[axis + pos + '#o'], self.U[axis + pos + '#c'] \
= [self.inner_init((self.output_dims[j], self.output_dims[k])) for _ in xrange(4)]
else:
self.U[axis + pos + '#i'], self.U[axis + pos + '#f'], \
self.U[axis + pos + '#o'], self.U[axis + pos + '#c'] \
= [self.init((self.output_dims[j], self.output_dims[k])) for _ in xrange(4)]
# bias layers:
self.b[axis + '#i'], self.b[axis + '#o'], self.b[axis + '#c'] \
= [shared_zeros(self.output_dims[k]) for _ in xrange(3)]
self.b[axis + '#f'] = self.forget_bias_init(self.output_dims[k])
# peep-hole layers:
if self.peephole:
self.V[axis + '#i'], self.V[axis + '#f'], self.V[axis + '#o'] \
= [self.init(self.output_dims[k]) for _ in xrange(3)]
# *****************************************************************************************
# set names for these weights
for A, n in zip([self.W, self.U, self.b, self.V], ['W', 'U', 'b', 'V']):
for w in A:
A[w].name = n + '_' + w
# set parameters
self.params = [self.W[s] for s in self.W] + \
[self.U[s] for s in self.U] + \
[self.b[s] for s in self.b] + \
[self.V[s] for s in self.V]
def lstm_(self, k, H, m, x, identity=False):
"""
LSTM
[h', m'] = LSTM(x, h, m):
gi = sigmoid(Wi * x + Ui * h + Vi * m) # Vi is peep-hole
gf = sigmoid(Wf * x + Uf * h + Vf * m)
go = sigmoid(Wo * x + Uo * h + Vo * m)
gc = tanh(Wc * x +Uc * h)
m' = gf @ m + gi @ gc (@ represents element-wise dot.)
h' = go @ tanh(m')
"""
assert len(H) == self.N, 'we have to use all the hidden states in Grid LSTM'
axis = self.axies[k]
# *************************************************************************
# bias energy
ei, ef, eo, ec = [self.b[axis + p] for p in ['#i', '#f', '#o', '#c']]
# hidden energy
for j in xrange(self.N):
pos = self.axies[j]
ei += T.dot(H[j], self.U[axis + pos + '#i'])
ef += T.dot(H[j], self.U[axis + pos + '#f'])
eo += T.dot(H[j], self.U[axis + pos + '#o'])
ec += T.dot(H[j], self.U[axis + pos + '#c'])
# input energy (if any)
if self.input_dims[k] > 0 and self.use_input:
ei += T.dot(x, self.W[axis + '#i'])
ef += T.dot(x, self.W[axis + '#f'])
eo += T.dot(x, self.W[axis + '#o'])
ec += T.dot(x, self.W[axis + '#c'])
# peep-hole connections
if self.peephole:
ei += m * self.V[axis + '#i'][None, :]
ef += m * self.V[axis + '#f'][None, :]
eo += m * self.V[axis + '#o'][None, :]
# *************************************************************************
# compute the gates.
i = self.inner_activation(ei)
f = self.inner_activation(ef)
o = self.inner_activation(eo)
c = self.activation(ec)
# update the memory and hidden states.
m_new = f * m + i * c
h_new = o * self.activation(m_new)
return h_new, m_new
def grid_(self,
hs_i,
ms_i,
xs_i,
priority=1,
identity=None):
"""
===================================================
Grid (2D as an example)
-------------
| c' d' | Grid Block and Grid Updates.
| a a'|
| | [d' c'] = LSTM_d([b, d], c)
| b b'| [a' b'] = LSTM_t([b, d'], a) priority
| c d |
-------------
a = my | b = hy | c = mx | d = hx
===================================================
Currently masking is not considered in GridLSTM.
"""
# compute LSTM updates for non-priority dimensions
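# note: H_new/M_new alias hs_i/ms_i, so updates made for earlier dimensions
# are visible when later dimensions are computed within this grid step.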
H_new = hs_i
M_new = ms_i
for k in xrange(self.N):
if k == priority:
continue
m = ms_i[k]
x = xs_i[k]
H_new[k], M_new[k] \
= self.lstm_(k, hs_i, m, x)
if identity is not None:
if identity[k]:
H_new[k] += hs_i[k]
# compute LSTM updates along the priority dimension
if priority >= 0:
hs_ii = H_new
H_new[priority], M_new[priority] \
= self.lstm_(priority, hs_ii, ms_i[priority], xs_i[priority])
if identity is not None:
if identity[priority]:
H_new[priority] += hs_ii[priority]
return H_new, M_new
class GridLSTM3D(Grid):
"""
Grid-LSTM 3D version,
which has one flexible dimension (time) and 2 fixed dimensions (x & y)
"""
def __init__(self,
# parameters for Grid.
output_dims,
input_dims, # [0, ... 0], 0 represents no external inputs.
priority=1,
peephole=True,
init='glorot_uniform', inner_init='orthogonal',
forget_bias_init='one',
activation='tanh', inner_activation='sigmoid',
use_input=False,
name=None, weights=None,
identity_connect=None,
# parameters for 2D-GridLSTM
depth=10, # the size of a big grid
learn_init=False,
pooling=True,
attention=False,
shared=True,
dropout=0,
rng=None,
):
super(Grid, self).__init__()
assert len(output_dims) == 3, 'in this stage, we only support 3D Grid-LSTM'
assert len(input_dims) == len(output_dims), '# of inputs must match # of outputs.'
assert input_dims[2] == 0, 'we have no z-axis inputs here.'
assert shared, 'we share the weights in this stage.'
assert not (attention and pooling), 'attention and pooling cannot be set at the same time.'
"""
Initialization.
"""
logger.info(":::: Sequential Grid-Pool LSTM ::::")
self.input_dims = input_dims
self.output_dims = output_dims
self.N = len(output_dims)
self.depth = depth
self.dropout = dropout
self.priority = priority
self.peephole = peephole
self.use_input = use_input
self.pooling = pooling
self.attention = attention
self.learn_init = learn_init
self.init = initializations.get(init)
self.inner_init = initializations.get(inner_init)
self.forget_bias_init = initializations.get(forget_bias_init)
self.activation = activations.get(activation)
self.relu = activations.get('relu')
self.inner_activation = activations.get(inner_activation)
self.identity_connect = identity_connect
self.axies = {0: 'x', 1: 'y', 2: 'z', 3: 'w'} # only support at most 4D now!
if self.identity_connect is not None:
logger.info('Identity Connection: {}'.format(self.identity_connect))
"""
Build the model weights.
"""
# build the centroid grid.
self.build()
# input projection layer (projected to time-axis) [x]
self.Ph = Dense(input_dims[0], output_dims[0], name='Ph')
self.Pm = Dense(input_dims[0], output_dims[0], name='Pm')
self._add(self.Ph)
self._add(self.Pm)
# learn init for depth-axis hidden states/memory cells. [y]
if self.learn_init:
self.M0 = self.init((depth, depth, output_dims[2]))
self.H0 = self.init((depth, depth, output_dims[2]))
self.M0.name, self.H0.name = 'M0', 'H0'
self.params += [self.M0, self.H0]
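# note: unlike SequentialGridLSTM below, this class never builds the pooling
# projection self.PP or the attender self.A, although _step references them
# when pooling/attention is enabled.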
if weights is not None:
self.set_weights(weights)
if name is not None:
self.set_name(name)
def _step(self, *args):
# since depth is not determined, we cannot decide the number of inputs
# for one time step.
# if pooling is True:
# args = [raw_input] + (sequence)
# [hy] + [my]*depth (output_info)
#
inputs = args[0] # (nb_samples, x, y)
Hy_tm1 = [args[k] for k in range(1, 1 + self.depth)]
My_tm1 = [args[k] for k in range(1 + self.depth, 1 + 2 * self.depth)]
# x_axis input projection (get hx_t, mx_t)
hx_t = self.Ph(inputs) # (nb_samples, output_dim0, output_dim1)
mx_t = self.Pm(inputs) # (nb_samples, output_dim0, output_dim1)
# build computation path from bottom to top.
Hx_t = [hx_t]
Mx_t = [mx_t]
Hy_t = []
My_t = []
for d in xrange(self.depth):
hs_i = [Hx_t[-1], Hy_tm1[d]]
ms_i = [Mx_t[-1], My_tm1[d]]
xs_i = [inputs, T.zeros_like(inputs)]
hs_o, ms_o = self.grid_(hs_i, ms_i, xs_i, priority=self.priority, identity=self.identity_connect)
Hx_t += [hs_o[0]]
Hy_t += [hs_o[1]]
Mx_t += [ms_o[0]]
My_t += [ms_o[1]]
hx_out = Hx_t[-1]
mx_out = Mx_t[-1]
# get the output (output_y, output_x)
# MAX-Pooling
if self.pooling:
# hy_t = T.max([self.PP(hy) for hy in Hy_t], axis=0)
hy_t = T.max([self.PP(T.concatenate([hy, inputs], axis=-1)) for hy in Hy_t], axis=0)
Hy_t = [hy_t] * self.depth
if self.attention:
HHy_t = T.concatenate([hy[:, None, :] for hy in Hy_t], axis=1) # (nb_samples, n_depth, out_dim1)
annotation = self.A(inputs, HHy_t) # (nb_samples, n_depth)
hy_t = T.sum(HHy_t * annotation[:, :, None], axis=1) # (nb_samples, out_dim1)
Hy_t = [hy_t] * self.depth
R = Hy_t + My_t + [hx_out, mx_out]
return tuple(R)
def __call__(self, X, init_H=None, init_M=None,
return_sequence=False, one_step=False,
return_info='hy', train=True):
# train/test switch
self.train = train
# masking is not supported yet.
if X.ndim == 2:
X = X[:, None, :]
# one step
if one_step:
assert init_H is not None, 'previous state must be provided!'
assert init_M is not None, 'previous cell must be provided!'
X = X.dimshuffle((1, 0, 2))
if init_H is None:
if self.learn_init:
init_m = T.repeat(self.M0[:, None, :], X.shape[1], axis=1)
if self.pooling:
init_h = T.repeat(self.H0[None, :], self.depth, axis=0)
else:
init_h = self.H0
init_h = T.repeat(init_h[:, None, :], X.shape[1], axis=1)
init_H = []
init_M = []
for j in xrange(self.depth):
init_H.append(init_h[j])
init_M.append(init_m[j])
else:
init_H = [T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dims[1]), 1)] * self.depth
init_M = [T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dims[1]), 1)] * self.depth
# computational graph !
if not one_step:
sequences = [X]
outputs_info = init_H + init_M + [None, None]
outputs, _ = theano.scan(
self._step,
sequences=sequences,
outputs_info=outputs_info
)
else:
outputs = self._step(*([X[0]] + init_H + init_M))
if return_info == 'hx':
if return_sequence:
return outputs[-2].dimshuffle((1, 0, 2))  # hx_out is the second-to-last scan output; outputs[0] is a y-axis state
return outputs[-2][-1]
elif return_info == 'hy':
assert self.pooling or self.attention, 'y-axis hidden states are only used in the ``Pooling Mode".'
if return_sequence:
return outputs[2].dimshuffle((1, 0, 2))
return outputs[2][-1]
elif return_info == 'hxhy':
assert self.pooling or self.attention, 'y-axis hidden states are only used in the ``Pooling Mode".'
if return_sequence:
return outputs[-2].dimshuffle((1, 0, 2)), outputs[2].dimshuffle((1, 0, 2)) # x-y
return outputs[-2][-1], outputs[2][-1]
class SequentialGridLSTM(Grid):
"""
For details, please refer to:
"Grid Long Short-Term Memory",
http://arxiv.org/abs/1507.01526
SequentialGridLSTM is a typical 2D-GridLSTM,
which has one flexible dimension (time) and one fixed dimension (depth)
Input information is added along x-axis.
"""
def __init__(self,
# parameters for Grid.
output_dims,
input_dims, # [0, ... 0], 0 represents no external inputs.
priority=1,
peephole=True,
init='glorot_uniform', inner_init='orthogonal',
forget_bias_init='one',
activation='tanh', inner_activation='sigmoid',
use_input=False,
name=None, weights=None,
identity_connect=None,
# parameters for 2D-GridLSTM
depth=5,
learn_init=False,
pooling=True,
attention=False,
shared=True,
dropout=0,
rng=None,
):
super(Grid, self).__init__()
assert len(output_dims) == 2, 'in this stage, we only support 2D Grid-LSTM'
assert len(input_dims) == len(output_dims), '# of inputs must match # of outputs.'
assert input_dims[1] == 0, 'we have no y-axis inputs here.'
assert shared, 'we share the weights in this stage.'
assert not (attention and pooling), 'attention and pooling cannot be set at the same time.'
"""
Initialization.
"""
logger.info(":::: Sequential Grid-Pool LSTM ::::")
self.input_dims = input_dims
self.output_dims = output_dims
self.N = len(output_dims)
self.depth = depth
self.dropout = dropout
self.priority = priority
self.peephole = peephole
self.use_input = use_input
self.pooling = pooling
self.attention = attention
self.learn_init = learn_init
self.init = initializations.get(init)
self.inner_init = initializations.get(inner_init)
self.forget_bias_init = initializations.get(forget_bias_init)
self.activation = activations.get(activation)
self.relu = activations.get('relu')
self.inner_activation = activations.get(inner_activation)
self.identity_connect = identity_connect
self.axies = {0: 'x', 1: 'y', 2: 'z', 3: 'w'} # only support at most 4D now!
if self.identity_connect is not None:
logger.info('Identity Connection: {}'.format(self.identity_connect))
"""
Build the model weights.
"""
# build the centroid grid.
self.build()
# input projection layer (projected to time-axis) [x]
self.Ph = Dense(input_dims[0], output_dims[0], name='Ph')
self.Pm = Dense(input_dims[0], output_dims[0], name='Pm')
self._add(self.Ph)
self._add(self.Pm)
# learn init for depth-axis hidden states/memory cells. [y]
if self.learn_init:
self.M0 = self.init((depth, output_dims[1]))
if self.pooling:
self.H0 = self.init(output_dims[1])
else:
self.H0 = self.init((depth, output_dims[1]))
self.M0.name, self.H0.name = 'M0', 'H0'
self.params += [self.M0, self.H0]
# if we use attention instead of max-pooling
if self.pooling:
self.PP = Dense(output_dims[1] + input_dims[0], output_dims[1], # init='orthogonal',
name='PP', activation='linear')
self._add(self.PP)
if self.attention:
self.A = Attention(target_dim=input_dims[0],
source_dim=output_dims[1],
hidden_dim=200, name='attender')
self._add(self.A)
# if self.dropout > 0:
# logger.info(">>>>>> USE DropOut !! <<<<<<")
# self.D = Dropout(rng=rng, p=self.dropout, name='Dropout')
"""
Others info.
"""
if weights is not None:
self.set_weights(weights)
if name is not None:
self.set_name(name)
def _step(self, *args):
# since depth is not determined, we cannot decide the number of inputs
# for one time step.
# if pooling is True:
# args = [raw_input] + (sequence)
# [hy] + [my]*depth (output_info)
#
inputs = args[0]
Hy_tm1 = [args[k] for k in range(1, 1 + self.depth)]
My_tm1 = [args[k] for k in range(1 + self.depth, 1 + 2 * self.depth)]
# x_axis input projection (get hx_t, mx_t)
hx_t = self.Ph(inputs) # (nb_samples, output_dim0)
mx_t = self.Pm(inputs) # (nb_samples, output_dim0)
# build computation path from bottom to top.
Hx_t = [hx_t]
Mx_t = [mx_t]
Hy_t = []
My_t = []
for d in xrange(self.depth):
hs_i = [Hx_t[-1], Hy_tm1[d]]
ms_i = [Mx_t[-1], My_tm1[d]]
xs_i = [inputs, T.zeros_like(inputs)]
hs_o, ms_o = self.grid_(hs_i, ms_i, xs_i, priority=self.priority, identity=self.identity_connect)
Hx_t += [hs_o[0]]
Hy_t += [hs_o[1]]
Mx_t += [ms_o[0]]
My_t += [ms_o[1]]
hx_out = Hx_t[-1]
mx_out = Mx_t[-1]
# get the output (output_y, output_x)
# MAX-Pooling
if self.pooling:
# hy_t = T.max([self.PP(hy) for hy in Hy_t], axis=0)
hy_t = T.max([self.PP(T.concatenate([hy, inputs], axis=-1)) for hy in Hy_t], axis=0)
Hy_t = [hy_t] * self.depth
if self.attention:
HHy_t = T.concatenate([hy[:, None, :] for hy in Hy_t], axis=1) # (nb_samples, n_depth, out_dim1)
annotation = self.A(inputs, HHy_t) # (nb_samples, n_depth)
hy_t = T.sum(HHy_t * annotation[:, :, None], axis=1) # (nb_samples, out_dim1)
Hy_t = [hy_t] * self.depth
R = Hy_t + My_t + [hx_out, mx_out]
return tuple(R)
def __call__(self, X, init_H=None, init_M=None,
return_sequence=False, one_step=False,
return_info='hy', train=True):
# train/test switch
self.train = train
# masking is not supported yet.
if X.ndim == 2:
X = X[:, None, :]
# one step
if one_step:
assert init_H is not None, 'previous state must be provided!'
assert init_M is not None, 'previous cell must be provided!'
X = X.dimshuffle((1, 0, 2))
if init_H is None:
if self.learn_init:
init_m = T.repeat(self.M0[:, None, :], X.shape[1], axis=1)
if self.pooling:
init_h = T.repeat(self.H0[None, :], self.depth, axis=0)
else:
init_h = self.H0
init_h = T.repeat(init_h[:, None, :], X.shape[1], axis=1)
init_H = []
init_M = []
for j in xrange(self.depth):
init_H.append(init_h[j])
init_M.append(init_m[j])
else:
init_H = [T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dims[1]), 1)] * self.depth
init_M = [T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dims[1]), 1)] * self.depth
# computational graph !
if not one_step:
sequences = [X]
outputs_info = init_H + init_M + [None, None]
outputs, _ = theano.scan(
self._step,
sequences=sequences,
outputs_info=outputs_info
)
else:
outputs = self._step(*([X[0]] + init_H + init_M))
if return_info == 'hx':
if return_sequence:
return outputs[-2].dimshuffle((1, 0, 2))  # hx_out is the second-to-last scan output; outputs[0] is a y-axis state
return outputs[-2][-1]
elif return_info == 'hy':
assert self.pooling or self.attention, 'y-axis hidden states are only used in the ``Pooling Mode".'
if return_sequence:
return outputs[2].dimshuffle((1, 0, 2))
return outputs[2][-1]
elif return_info == 'hxhy':
assert self.pooling or self.attention, 'y-axis hidden states are only used in the ``Pooling Mode".'
if return_sequence:
return outputs[-2].dimshuffle((1, 0, 2)), outputs[2].dimshuffle((1, 0, 2)) # x-y
return outputs[-2][-1], outputs[2][-1]
class PyramidGridLSTM2D(Grid):
"""
A variant of the sequential Grid-LSTM that introduces a pyramid structure.
"""
def __init__(self,
# parameters for Grid.
output_dims,
input_dims, # [0, ... 0], 0 represents no external inputs.
priority=1,
peephole=True,
init='glorot_uniform', inner_init='orthogonal',
forget_bias_init='one',
activation='tanh', inner_activation='sigmoid',
use_input=True,
name=None, weights=None,
identity_connect=None,
# parameters for 2D-GridLSTM
depth=5,
learn_init=False,
shared=True,
dropout=0
):
super(Grid, self).__init__()
assert len(output_dims) == 2, 'in this stage, we only support 2D Grid-LSTM'
assert len(input_dims) == len(output_dims), '# of inputs must match # of outputs.'
assert output_dims[0] == output_dims[1], 'Here we only support square model.'
assert shared, 'we share the weights in this stage.'
assert use_input, 'use input and add them in the middle'
"""
Initialization.
"""
logger.info(":::: Sequential Grid-Pool LSTM ::::")
self.input_dims = input_dims
self.output_dims = output_dims
self.N = len(output_dims)
self.depth = depth
self.dropout = dropout
self.priority = priority
self.peephole = peephole
self.use_input = use_input
self.learn_init = learn_init
self.init = initializations.get(init)
self.inner_init = initializations.get(inner_init)
self.forget_bias_init = initializations.get(forget_bias_init)
self.activation = activations.get(activation)
self.relu = activations.get('relu')
self.inner_activation = activations.get(inner_activation)
self.identity_connect = identity_connect
self.axies = {0: 'x', 1: 'y', 2: 'z', 3: 'w'} # only support at most 4D now!
"""
Build the model weights.
"""
# build the centroid grid.
self.build()
# # input projection layer (projected to time-axis) [x]
# self.Ph = Dense(input_dims[0], output_dims[0], name='Ph')
# self.Pm = Dense(input_dims[0], output_dims[0], name='Pm')
#
# self._add(self.Ph)
# self._add(self.Pm)
# learn init/
if self.learn_init:
self.hx0 = self.init((1, output_dims[0]))
self.hy0 = self.init((1, output_dims[1]))
self.mx0 = self.init((1, output_dims[0]))
self.my0 = self.init((1, output_dims[1]))
self.hx0.name, self.hy0.name = 'hx0', 'hy0'
self.mx0.name, self.my0.name = 'mx0', 'my0'
self.params += [self.hx0, self.hy0, self.mx0, self.my0]
"""
Others info.
"""
if weights is not None:
self.set_weights(weights)
if name is not None:
self.set_name(name)
def _step(self, *args):
inputs = args[0]
hx_tm1 = args[1]
mx_tm1 = args[2]
hy_tm1 = args[3]
my_tm1 = args[4]
# zero constant inputs.
pre_info = [[[T.zeros_like(hx_tm1)
for _ in xrange(self.depth)]
for _ in xrange(self.depth)]
for _ in xrange(4)] # hx, mx, hy, my
pre_inputs = [[T.zeros_like(inputs)
for _ in xrange(self.depth)]
for _ in xrange(self.depth)]
for kk in xrange(self.depth):
pre_inputs[kk][kk] = inputs
pre_info[0][0][0] = hx_tm1
pre_info[1][0][0] = mx_tm1
pre_info[2][0][0] = hy_tm1
pre_info[3][0][0] = my_tm1
for step_x in xrange(self.depth):
for step_y in xrange(self.depth):
# input hidden/memory/input information
# debug leftover (would print symbolic variables at graph-construction time):
# print pre_info[0][-1][-1], pre_info[2][-1][-1]
hs_i = [pre_info[0][step_x][step_y],
pre_info[2][step_x][step_y]]
ms_i = [pre_info[1][step_x][step_y],
pre_info[3][step_x][step_y]]
xs_i = [pre_inputs[step_x][step_y],
pre_inputs[step_x][step_y]]
# compute grid-lstm
hs_o, ms_o = self.grid_(hs_i, ms_i, xs_i, priority=-1)
# output hidden/memory information
if (step_x == self.depth - 1) and (step_y == self.depth - 1):
hx_t, mx_t, hy_t, my_t = hs_o[0], ms_o[0], hs_o[1], ms_o[1]
return hx_t, mx_t, hy_t, my_t
if step_x + 1 < self.depth:
pre_info[0][step_x + 1][step_y] = hs_o[0]
pre_info[1][step_x + 1][step_y] = ms_o[0]
if step_y + 1 < self.depth:
pre_info[2][step_x][step_y + 1] = hs_o[1]
pre_info[3][step_x][step_y + 1] = ms_o[1]
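# Illustrative note (not in the original source): _step sweeps a
# depth x depth lattice of grid cells. The previous states (hx, mx, hy, my)
# enter at cell (0, 0), the raw input is injected along the diagonal
# (pre_inputs[k][k]), x-outputs flow to cell (step_x + 1, step_y),
# y-outputs flow to cell (step_x, step_y + 1), and the state handed to the
# next time step is the one emitted by the far corner (depth-1, depth-1).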
def __call__(self, X, init_x=None, init_y=None,
return_sequence=False, one_step=False):
# masking is not supported yet.
if X.ndim == 2:
X = X[:, None, :]
# one step
if one_step:
assert init_x is not None, 'previous x must be provided!'
assert init_y is not None, 'previous y must be provided!'
X = X.dimshuffle((1, 0, 2))
if init_x is None:
if self.learn_init:
init_mx = T.repeat(self.mx0, X.shape[1], axis=0)
init_my = T.repeat(self.my0, X.shape[1], axis=0)
init_hx = T.repeat(self.hx0, X.shape[1], axis=0)
init_hy = T.repeat(self.hy0, X.shape[1], axis=0)
init_input = [init_hx, init_mx, init_hy, init_my]
else:
init_x = [T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dims[0]), 1)] * 2
init_y = [T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dims[1]), 1)] * 2
init_input = init_x + init_y
else:
init_input = init_x + init_y
if not one_step:
sequence = [X]
output_info = init_input
outputs, _ = theano.scan(
self._step,
sequences=sequence,
outputs_info=output_info
)
else:
outputs = self._step(*([X[0]] + init_x + init_y))
if return_sequence:
hxs = outputs[0].dimshuffle((1, 0, 2))
hys = outputs[2].dimshuffle((1, 0, 2))
hs = T.concatenate([hxs, hys], axis=-1)
return hs
else:
hx = outputs[0][-1]
hy = outputs[2][-1]
h = T.concatenate([hx, hy], axis=-1)
return h
class PyramidLSTM(Layer):
"""
A more flexible Pyramid LSTM structure!
"""
def __init__(self,
# parameters for Grid.
output_dims,
input_dims, # [0, ... 0], 0 represents no external inputs.
priority=1,
peephole=True,
init='glorot_uniform', inner_init='orthogonal',
forget_bias_init='one',
activation='tanh', inner_activation='sigmoid',
use_input=True,
name=None, weights=None,
identity_connect=None,
# parameters for 2D-GridLSTM
depth=5,
learn_init=False,
shared=True,
dropout=0
):
super(PyramidLSTM, self).__init__()
assert len(output_dims) == 2, 'in this stage, we only support 2D Grid-LSTM'
assert len(input_dims) == len(output_dims), '# of inputs must match # of outputs.'
assert output_dims[0] == output_dims[1], 'Here we only support square model.'
assert shared, 'we share the weights in this stage.'
assert use_input, 'use input and add them in the middle'
"""
Initialization.
"""
logger.info(":::: Sequential Grid-Pool LSTM ::::")
self.input_dims = input_dims
self.output_dims = output_dims
self.N = len(output_dims)
self.depth = depth
self.dropout = dropout
self.priority = priority
self.peephole = peephole
self.use_input = use_input
self.learn_init = learn_init
self.init = initializations.get(init)
self.inner_init = initializations.get(inner_init)
self.forget_bias_init = initializations.get(forget_bias_init)
self.activation = activations.get(activation)
self.relu = activations.get('relu')
self.inner_activation = activations.get(inner_activation)
self.identity_connect = identity_connect
self.axies = {0: 'x', 1: 'y', 2: 'z', 3: 'w'} # only support at most 4D now!
"""
Build the model weights.
"""
# build the centroid grid (3 grid versions)
self.grids = [Grid(output_dims,
input_dims,
-1,
peephole,
init, inner_init,
forget_bias_init,
activation, inner_activation, use_input,
name='Grid*{}'.format(k)
) for k in xrange(3)]
for k in xrange(3):
self.grids[k].build()
self._add(self.grids[k])
# # input projection layer (projected to time-axis) [x]
# self.Ph = Dense(input_dims[0], output_dims[0], name='Ph')
# self.Pm = Dense(input_dims[0], output_dims[0], name='Pm')
#
# self._add(self.Ph)
# self._add(self.Pm)
# learn init/
if self.learn_init:
self.hx0 = self.init((1, output_dims[0]))
self.hy0 = self.init((1, output_dims[1]))
self.mx0 = self.init((1, output_dims[0]))
self.my0 = self.init((1, output_dims[1]))
self.hx0.name, self.hy0.name = 'hx0', 'hy0'
self.mx0.name, self.my0.name = 'mx0', 'my0'
self.params += [self.hx0, self.hy0, self.mx0, self.my0]
"""
Others info.
"""
if weights is not None:
self.set_weights(weights)
if name is not None:
self.set_name(name)
def _step(self, *args):
inputs = args[0]
hx_tm1 = args[1]
mx_tm1 = args[2]
hy_tm1 = args[3]
my_tm1 = args[4]
# zero constant inputs.
pre_info = [[[T.zeros_like(hx_tm1)
for _ in xrange(self.depth)]
for _ in xrange(self.depth)]
for _ in xrange(4)] # hx, mx, hy, my
pre_inputs = [[T.zeros_like(inputs)
for _ in xrange(self.depth)]
for _ in xrange(self.depth)]
for kk in xrange(self.depth):
pre_inputs[kk][kk] = inputs
pre_info[0][0][0] = hx_tm1
pre_info[1][0][0] = mx_tm1
pre_info[2][0][0] = hy_tm1
pre_info[3][0][0] = my_tm1
for step_x in xrange(self.depth):
for step_y in xrange(self.depth):
# input hidden/memory/input information
# debug leftover (would print symbolic variables at graph-construction time):
# print pre_info[0][-1][-1], pre_info[2][-1][-1]
hs_i = [pre_info[0][step_x][step_y],
pre_info[2][step_x][step_y]]
ms_i = [pre_info[1][step_x][step_y],
pre_info[3][step_x][step_y]]
xs_i = [pre_inputs[step_x][step_y],
pre_inputs[step_x][step_y]]
# compute grid-lstm
if (step_x + step_y + 1) < self.depth:
hs_o, ms_o = self.grids[0].grid_(hs_i, ms_i, xs_i, priority=-1)
elif (step_x + step_y + 1) == self.depth:
hs_o, ms_o = self.grids[1].grid_(hs_i, ms_i, xs_i, priority=-1)
else:
hs_o, ms_o = self.grids[2].grid_(hs_i, ms_i, xs_i, priority=-1)
# output hidden/memory information
if (step_x == self.depth - 1) and (step_y == self.depth - 1):
hx_t, mx_t, hy_t, my_t = hs_o[0], ms_o[0], hs_o[1], ms_o[1]
return hx_t, mx_t, hy_t, my_t
if step_x + 1 < self.depth:
pre_info[0][step_x + 1][step_y] = hs_o[0]
pre_info[1][step_x + 1][step_y] = ms_o[0]
if step_y + 1 < self.depth:
pre_info[2][step_x][step_y + 1] = hs_o[1]
pre_info[3][step_x][step_y + 1] = ms_o[1]
def __call__(self, X, init_x=None, init_y=None,
return_sequence=False, one_step=False):
# masking is not supported yet.
if X.ndim == 2:
X = X[:, None, :]
# one step
if one_step:
assert init_x is not None, 'previous x must be provided!'
assert init_y is not None, 'previous y must be provided!'
X = X.dimshuffle((1, 0, 2))
if init_x is None:
if self.learn_init:
init_mx = T.repeat(self.mx0, X.shape[1], axis=0)
init_my = T.repeat(self.my0, X.shape[1], axis=0)
init_hx = T.repeat(self.hx0, X.shape[1], axis=0)
init_hy = T.repeat(self.hy0, X.shape[1], axis=0)
init_input = [init_hx, init_mx, init_hy, init_my]
else:
init_x = [T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dims[0]), 1)] * 2
init_y = [T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dims[1]), 1)] * 2
init_input = init_x + init_y
else:
init_input = init_x + init_y
if not one_step:
sequence = [X]
output_info = init_input
outputs, _ = theano.scan(
self._step,
sequences=sequence,
outputs_info=output_info
)
else:
outputs = self._step(*([X[0]] + init_x + init_y))
if return_sequence:
hxs = outputs[0].dimshuffle((1, 0, 2))
hys = outputs[2].dimshuffle((1, 0, 2))
hs = T.concatenate([hxs, hys], axis=-1)
return hs
else:
hx = outputs[0][-1]
hy = outputs[2][-1]
h = T.concatenate([hx, hy], axis=-1)
return h
================================================
FILE: emolga/layers/ntm_minibatch.py
================================================
__author__ = 'jiataogu'
import theano
import theano.tensor as T
import scipy.linalg as sl
import numpy as np
from .core import *
from .recurrent import *
import copy
"""
This implementation supports both minibatch learning and online training;
the Neural Turing Machine requires the minibatch version.
"""
class Reader(Layer):
"""
"Reader Head" of the Neural Turing Machine.
"""
def __init__(self, input_dim, memory_width, shift_width, shift_conv,
init='glorot_uniform', inner_init='orthogonal',
name=None):
super(Reader, self).__init__()
self.input_dim = input_dim
self.memory_dim = memory_width
self.init = initializations.get(init)
self.inner_init = initializations.get(inner_init)
self.tanh = activations.get('tanh')
self.sigmoid = activations.get('sigmoid')
self.softplus = activations.get('softplus')
self.vec_softmax = activations.get('vector_softmax')
self.softmax = activations.get('softmax')
"""
Reader Params.
"""
self.W_key = self.init((input_dim, memory_width))
self.W_shift = self.init((input_dim, shift_width))
self.W_beta = self.init(input_dim)
self.W_gama = self.init(input_dim)
self.W_g = self.init(input_dim)
self.b_key = shared_zeros(memory_width)
self.b_shift = shared_zeros(shift_width)
self.b_beta = theano.shared(floatX(0))
self.b_gama = theano.shared(floatX(0))
self.b_g = theano.shared(floatX(0))
self.shift_conv = shift_conv
# add params and set names.
self.params = [self.W_key, self.W_shift, self.W_beta, self.W_gama, self.W_g,
self.b_key, self.b_shift, self.b_beta, self.b_gama, self.b_g]
self.W_key.name, self.W_shift.name, self.W_beta.name, \
self.W_gama.name, self.W_g.name = 'W_key', 'W_shift', 'W_beta', \
'W_gama', 'W_g'
self.b_key.name, self.b_shift.name, self.b_beta.name, \
self.b_gama.name, self.b_g.name = 'b_key', 'b_shift', 'b_beta', \
'b_gama', 'b_g'
def __call__(self, X, w_temp, m_temp):
# input dimensions
# X: (nb_samples, input_dim)
# w_temp: (nb_samples, memory_dim)
# m_temp: (nb_samples, memory_dim, memory_width) ::tensor_memory
key = dot(X, self.W_key, self.b_key) # (nb_samples, memory_width)
shift = self.softmax(
dot(X, self.W_shift, self.b_shift)) # (nb_samples, shift_width)
beta = self.softplus(dot(X, self.W_beta, self.b_beta))[:, None] # (nb_samples, x)
gamma = self.softplus(dot(X, self.W_gama, self.b_gama)) + 1. # (nb_samples,)
gamma = gamma[:, None] # (nb_samples, x)
g = self.sigmoid(dot(X, self.W_g, self.b_g))[:, None] # (nb_samples, x)
signal = [key, shift, beta, gamma, g]
w_c = self.softmax(
beta * cosine_sim2d(key, m_temp)) # (nb_samples, memory_dim) //content-based addressing
w_g = g * w_c + (1 - g) * w_temp # (nb_samples, memory_dim) //history interpolation
w_s = shift_convolve2d(w_g, shift, self.shift_conv) # (nb_samples, memory_dim) //convolutional shift
w_p = w_s ** gamma # (nb_samples, memory_dim) //sharpening
w_t = w_p / T.sum(w_p, axis=1)[:, None] # (nb_samples, memory_dim)
return w_t
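# Illustrative sketch (not part of the original model): the four steps above
# follow the NTM addressing pipeline of content addressing, interpolation,
# convolutional shift and sharpening. A minimal single-sample NumPy rendition
# (the helper name `_demo_addressing` and the fixed 3-way shift are
# hypothetical simplifications) could look like:
def _demo_addressing(key, beta, g, shift, gamma, w_prev, memory):
    # content addressing: cosine similarity of the key against each memory row
    sim = memory.dot(key) / (np.linalg.norm(memory, axis=1) *
                             np.linalg.norm(key) + 1e-8)
    e = np.exp(beta * (sim - sim.max()))
    w_c = e / e.sum()                  # softmax over memory locations
    # interpolation with the previous weights
    w_g = g * w_c + (1 - g) * w_prev
    # convolutional shift over offsets (-1, 0, +1)
    w_s = sum(p * np.roll(w_g, off) for off, p in zip((-1, 0, 1), shift))
    # sharpening and renormalisation
    w = w_s ** gamma
    return w / w.sum()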
class Writer(Reader):
"""
"Writer head" of the Neural Turing Machine
"""
def __init__(self, input_dim, memory_width, shift_width, shift_conv,
init='glorot_uniform', inner_init='orthogonal',
name=None):
super(Writer, self).__init__(input_dim, memory_width, shift_width, shift_conv,
init, inner_init, name)
"""
Writer Params.
"""
self.W_erase = self.init((input_dim, memory_width))
self.W_add = self.init((input_dim, memory_width))
self.b_erase = shared_zeros(memory_width)
self.b_add = shared_zeros(memory_width)
# add params and set names.
self.params += [self.W_erase, self.W_add, self.b_erase, self.b_add]
self.W_erase.name, self.W_add.name = 'W_erase', 'W_add'
self.b_erase.name, self.b_add.name = 'b_erase', 'b_add'
def get_fixer(self, X):
erase = self.sigmoid(dot(X, self.W_erase, self.b_erase)) # (nb_samples, memory_width)
add = self.sigmoid(dot(X, self.W_add, self.b_add)) # (nb_samples, memory_width)
return erase, add
class Controller(Recurrent):
"""
Controller used in Neural Turing Machine.
- Core cell (Memory)
- Reader head
- Writer head
This is a simple RNN version; the original Neural Turing Machine uses an LSTM cell.
"""
def __init__(self,
input_dim,
memory_dim,
memory_width,
hidden_dim,
shift_width=3,
init='glorot_uniform',
inner_init='orthogonal',
name=None,
readonly=False,
curr_input=False,
recurrence=False,
memorybook=None
):
super(Controller, self).__init__()
# Initialization of the dimensions.
self.input_dim = input_dim
self.memory_dim = memory_dim
self.memory_width = memory_width
self.hidden_dim = hidden_dim
self.shift_width = shift_width
self.init = initializations.get(init)
self.inner_init = initializations.get(inner_init)
self.tanh = activations.get('tanh')
self.softmax = activations.get('softmax')
self.vec_softmax = activations.get('vector_softmax')
self.readonly = readonly
self.curr_input = curr_input
self.recurrence = recurrence
self.memorybook = memorybook
"""
Controller Module.
"""
# hidden projection:
self.W_in = self.init((input_dim, hidden_dim))
self.b_in = shared_zeros(hidden_dim)
self.W_rd = self.init((memory_width, hidden_dim))
self.W_in.name = 'W_in'
self.b_in.name = 'b_in'
self.W_rd.name = 'W_rd'
self.params = [self.W_in, self.b_in, self.W_rd]
# use recurrence:
if self.recurrence:
self.W_hh = self.inner_init((hidden_dim, hidden_dim))
self.W_hh.name = 'W_hh'
self.params += [self.W_hh]
# Shift convolution
shift_conv = sl.circulant(np.arange(memory_dim)).T[
np.arange(-(shift_width // 2), (shift_width // 2) + 1)][::-1]
# use the current input for weights.
if self.curr_input:
controller_size = self.input_dim + self.hidden_dim
else:
controller_size = self.hidden_dim
# write head
if not readonly:
self.writer = Writer(controller_size, memory_width, shift_width, shift_conv, name='writer')
self.writer.set_name('writer')
self._add(self.writer)
# read head
self.reader = Reader(controller_size, memory_width, shift_width, shift_conv, name='reader')
self.reader.set_name('reader')
self._add(self.reader)
# ***********************************************************
# reserved for None initialization (we don't use these often)
self.memory_init = self.init((memory_dim, memory_width))
self.w_write_init = self.softmax(np.random.rand(1, memory_dim).astype(theano.config.floatX))
self.w_read_init = self.softmax(np.random.rand(1, memory_dim).astype(theano.config.floatX))
self.contr_init = self.tanh(np.random.rand(1, hidden_dim).astype(theano.config.floatX))
if name is not None:
self.set_name(name)
def _controller(self, input_t, read_t, controller_tm1=None):
# input_t : (nb_sample, input_dim)
# read_t : (nb_sample, memory_width)
# controller_tm1: (nb_sample, hidden_dim)
if self.recurrence:
return self.tanh(dot(input_t, self.W_in) +
dot(controller_tm1, self.W_hh) +
dot(read_t, self.W_rd) +
self.b_in)
else:
return self.tanh(dot(input_t, self.W_in) +
dot(read_t, self.W_rd) +
self.b_in)
@staticmethod
def _read(w_read, memory):
# w_read : (nb_sample, memory_dim)
# memory : (nb_sample, memory_dim, memory_width)
# return dot(w_read, memory)
return T.sum(w_read[:, :, None] * memory, axis=1)
@staticmethod
def _write(w_write, memory, erase, add):
# w_write: (nb_sample, memory_dim)
# memory : (nb_sample, memory_dim, memory_width)
# erase/add: (nb_sample, memory_width)
w_write = w_write[:, :, None]
erase = erase[:, None, :]
add = add[:, None, :]
m_erased = memory * (1 - w_write * erase)
memory_t = m_erased + w_write * add # (nb_sample, memory_dim, memory_width)
return memory_t
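# Worked example (illustrative, not from the original source): per sample,
# _write realises M_t = M_{t-1} * (1 - w e^T) + w a^T. With memory_dim = 2,
# memory_width = 2, M = [[1, 1], [1, 1]], w = [1, 0], erase = [1, 0] and
# add = [0.5, 0.5]: row 0 becomes [0, 1] after erasing and [0.5, 1.5] after
# adding, while row 1 is untouched because w[1] = 0.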
def _step(self, input_t, mask_t,
memory_tm1,
w_write_tm1, w_read_tm1,
controller_tm1):
# input_t: (nb_sample, input_dim)
# memory_tm1: (nb_sample, memory_dim, memory_width)
# w_write_tm1: (nb_sample, memory_dim)
# w_read_tm1: (nb_sample, memory_dim)
# controller_tm1: (nb_sample, hidden_dim)
# read the memory
if self.curr_input:
info = T.concatenate((controller_tm1, input_t), axis=1)
w_read_t = self.reader(info, w_read_tm1, memory_tm1)
read_tm1 = self._read(w_read_t, memory_tm1)
else:
read_tm1 = self._read(w_read_tm1, memory_tm1) # (nb_sample, memory_width)
# get the new controller (hidden states.)
if self.recurrence:
controller_t = self._controller(input_t, read_tm1, controller_tm1)
else:
controller_t = self._controller(input_t, read_tm1) # (nb_sample, controller_size)
# update the memory cell (if need)
if not self.readonly:
if self.curr_input:
infow = T.concatenate((controller_t, input_t), axis=1)
w_write_t = self.writer(infow, w_write_tm1, memory_tm1) # (nb_sample, memory_dim)
erase_t, add_t = self.writer.get_fixer(infow) # (nb_sample, memory_width)
else:
w_write_t = self.writer(controller_t, w_write_tm1, memory_tm1)
erase_t, add_t = self.writer.get_fixer(controller_t)
memory_t = self._write(w_write_t, memory_tm1, erase_t, add_t) # (nb_sample, memory_dim, memory_width)
else:
w_write_t = w_write_tm1
memory_t = memory_tm1
# get the next reading weights.
if not self.curr_input:
w_read_t = self.reader(controller_t, w_read_tm1, memory_t) # (nb_sample, memory_dim)
# apply the mask: carry the previous state through masked positions
memory_t = memory_t * mask_t[:, :, None] + memory_tm1 * (1 - mask_t[:, :, None])
w_read_t = w_read_t * mask_t + w_read_tm1 * (1 - mask_t)
w_write_t = w_write_t * mask_t + w_write_tm1 * (1 - mask_t)
controller_t = controller_t * mask_t + controller_tm1 * (1 - mask_t)
return memory_t, w_write_t, w_read_t, controller_t
def __call__(self, X, mask=None, M=None, init_ww=None,
init_wr=None, init_c=None, return_sequence=False,
one_step=False, return_full=False):
# the recurrent cell only works on 3D tensors.
if X.ndim == 2:
X = X[:, None, :]
nb_samples = X.shape[0]
# mask
if mask is None:
mask = T.alloc(1., X.shape[0], 1)
padded_mask = self.get_padded_shuffled_mask(mask, pad=0)
X = X.dimshuffle((1, 0, 2))
# ***********************************************************************
# initialization states
if M is None:
memory_init = T.repeat(self.memory_init[None, :, :], nb_samples, axis=0)
else:
memory_init = M
if init_wr is None:
w_read_init = T.repeat(self.w_read_init, nb_samples, axis=0)
else:
w_read_init = init_wr
if init_ww is None:
w_write_init = T.repeat(self.w_write_init, nb_samples, axis=0)
else:
w_write_init = init_ww
if init_c is None:
contr_init = T.repeat(self.contr_init, nb_samples, axis=0)
else:
contr_init = init_c
# ************************************************************************
outputs_info = [memory_init, w_write_init, w_read_init, contr_init]
if one_step:
seq = [X[0], padded_mask[0]]
outputs = self._step(*(seq + outputs_info))
return outputs
else:
seq = [X, padded_mask]
outputs, _ = theano.scan(
self._step,
sequences=seq,
outputs_info=outputs_info,
name='controller_recurrence'
)
self.monitor['memory_info'] = outputs[0]
self.monitor['write_weights'] = outputs[1]
self.monitor['read_weights'] = outputs[2]
if not return_full:
if return_sequence:
return outputs[-1].dimshuffle((1, 0, 2))
return outputs[-1][-1]
else:
if return_sequence:
return [a.dimshuffle((1, 0, 2)) for a in outputs]
return [a[-1] for a in outputs]
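# Usage sketch (illustrative, not from the original repo; all dimensions are
# hypothetical): running the controller over a batch of embedded sequences,
# where X and x_mask are symbolic Theano variables.
#
#   ctrl = Controller(input_dim=100, memory_dim=128, memory_width=20,
#                     hidden_dim=200, name='ntm_ctrl')
#   H = ctrl(X, mask=x_mask, return_sequence=True)  # (nb_samples, time, 200)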
class AttentionReader(Layer):
"""
"Reader Head" of the Neural Turing Machine.
"""
def __init__(self, input_dim, memory_width, shift_width, shift_conv,
init='glorot_uniform', inner_init='orthogonal',
name=None):
super(AttentionReader, self).__init__()
self.input_dim = input_dim
self.memory_dim = memory_width
self.init = initializations.get(init)
self.inner_init = initializations.get(inner_init)
self.tanh = activations.get('tanh')
self.sigmoid = activations.get('sigmoid')
self.softplus = activations.get('softplus')
self.vec_softmax = activations.get('vector_softmax')
self.softmax = activations.get('softmax')
"""
Reader Params.
"""
self.W_key = self.init((input_dim, memory_width))
self.W_lock = self.inner_init((memory_width, memory_width))
self.W_shift = self.init((input_dim, shift_width))
self.W_beta = self.init(input_dim)
self.W_gama = self.init(input_dim)
self.W_g = self.init(input_dim)
# self.v = self.init(memory_width)
self.b_key = shared_zeros(memory_width)
self.b_shift = shared_zeros(shift_width)
self.b_beta = theano.shared(floatX(0))
self.b_gama = theano.shared(floatX(0))
self.b_g = theano.shared(floatX(0))
self.shift_conv = shift_conv
# add params and set names.
self.params = [self.W_key, self.W_shift, self.W_beta, self.W_gama, self.W_g,
self.b_key, self.b_shift, self.b_beta, self.b_gama, self.b_g,
self.W_lock]
self.W_key.name, self.W_shift.name, self.W_beta.name, \
self.W_gama.name, self.W_g.name = 'W_key', 'W_shift', 'W_beta', \
'W_gama', 'W_g'
self.W_lock.name = 'W_lock'
self.b_key.name, self.b_shift.name, self.b_beta.name, \
self.b_gama.name, self.b_g.name = 'b_key', 'b_shift', 'b_beta', \
'b_gama', 'b_g'
def __call__(self, X, w_temp, m_temp):
# input dimensions
# X: (nb_samples, input_dim)
# w_temp: (nb_samples, memory_dim)
# m_temp: (nb_samples, memory_dim, memory_width) ::tensor_memory
key = dot(X, self.W_key, self.b_key) # (nb_samples, memory_width)
lock = dot(m_temp, self.W_lock) # (nb_samples, memory_dim, memory_width)
shift = self.softmax(
dot(X, self.W_shift, self.b_shift)) # (nb_samples, shift_width)
beta = self.softplus(dot(X, self.W_beta, self.b_beta))[:, None] # (nb_samples, x)
gamma = self.softplus(dot(X, self.W_gama, self.b_gama)) + 1. # (nb_samples,)
gamma = gamma[:, None] # (nb_samples, x)
g = self.sigmoid(dot(X, self.W_g, self.b_g))[:, None] # (nb_samples, x)
signal = [key, shift, beta, gamma, g]
energy = T.sum(key[:, None, :] * lock, axis=2)
# energy = T.tensordot(key[:, None, :] + lock, self.v, [2, 0])
w_c = self.softmax(beta * energy)
# w_c = self.softmax(
# beta * cosine_sim2d(key, m_temp)) # (nb_samples, memory_dim) //content-based addressing
w_g = g * w_c + (1 - g) * w_temp # (nb_samples, memory_dim) //history interpolation
w_s = shift_convolve2d(w_g, shift, self.shift_conv) # (nb_samples, memory_dim) //convolutional shift
w_p = w_s ** gamma # (nb_samples, memory_dim) //sharpening
w_t = w_p / T.sum(w_p, axis=1)[:, None] # (nb_samples, memory_dim)
return w_t
class AttentionWriter(AttentionReader):
"""
"Writer head" of the Neural Turing Machine
"""
def __init__(self, input_dim, memory_width, shift_width, shift_conv,
init='glorot_uniform', inner_init='orthogonal',
name=None):
super(AttentionWriter, self).__init__(input_dim, memory_width, shift_width, shift_conv,
init, inner_init, name)
"""
Writer Params.
"""
self.W_erase = self.init((input_dim, memory_width))
self.W_add = self.init((input_dim, memory_width))
self.b_erase = shared_zeros(memory_width)
self.b_add = shared_zeros(memory_width)
# add params and set names.
self.params += [self.W_erase, self.W_add, self.b_erase, self.b_add]
self.W_erase.name, self.W_add.name = 'W_erase', 'W_add'
self.b_erase.name, self.b_add.name = 'b_erase', 'b_add'
def get_fixer(self, X):
erase = self.sigmoid(dot(X, self.W_erase, self.b_erase)) # (nb_samples, memory_width)
add = self.sigmoid(dot(X, self.W_add, self.b_add)) # (nb_samples, memory_width)
return erase, add
class BernoulliController(Recurrent):
"""
Controller used in Neural Turing Machine.
- Core cell (Memory): binary memory
- Reader head
- Writer head
This is a simple RNN version; the original Neural Turing Machine uses an LSTM cell.
"""
def __init__(self,
input_dim,
memory_dim,
memory_width,
hidden_dim,
shift_width=3,
init='glorot_uniform',
inner_init='orthogonal',
name=None,
readonly=False,
curr_input=False,
recurrence=False,
memorybook=None
):
super(BernoulliController, self).__init__()
# Initialization of the dimensions.
self.input_dim = input_dim
self.memory_dim = memory_dim
self.memory_width = memory_width
self.hidden_dim = hidden_dim
self.shift_width = shift_width
self.init = initializations.get(init)
self.inner_init = initializations.get(inner_init)
self.tanh = activations.get('tanh')
self.softmax = activations.get('softmax')
self.vec_softmax = activations.get('vector_softmax')
self.sigmoid = activations.get('sigmoid')
self.readonly = readonly
self.curr_input = curr_input
self.recurrence = recurrence
self.memorybook = memorybook
"""
Controller Module.
"""
# hidden projection:
self.W_in = self.init((input_dim, hidden_dim))
self.b_in = shared_zeros(hidden_dim)
self.W_rd = self.init((memory_width, hidden_dim))
self.W_in.name = 'W_in'
self.b_in.name = 'b_in'
self.W_rd.name = 'W_rd'
self.params = [self.W_in, self.b_in, self.W_rd]
# use recurrence:
if self.recurrence:
self.W_hh = self.inner_init((hidden_dim, hidden_dim))
self.W_hh.name = 'W_hh'
self.params += [self.W_hh]
# Shift convolution
shift_conv = sl.circulant(np.arange(memory_dim)).T[
np.arange(-(shift_width // 2), (shift_width // 2) + 1)][::-1]
# use the current input for weights.
if self.curr_input:
controller_size = self.input_dim + self.hidden_dim
else:
controller_size = self.hidden_dim
# write head
if not readonly:
self.writer = AttentionWriter(controller_size, memory_width, shift_width, shift_conv, name='writer')
self.writer.set_name('writer')
self._add(self.writer)
# read head
self.reader = AttentionReader(controller_size, memory_width, shift_width, shift_conv, name='reader')
self.reader.set_name('reader')
self._add(self.reader)
# ***********************************************************
# reserved for None initialization (we don't use these often)
self.memory_init = self.sigmoid(self.init((memory_dim, memory_width)))
self.w_write_init = self.softmax(np.random.rand(1, memory_dim).astype(theano.config.floatX))
self.w_read_init = self.softmax(np.random.rand(1, memory_dim).astype(theano.config.floatX))
self.contr_init = self.tanh(np.random.rand(1, hidden_dim).astype(theano.config.floatX))
if name is not None:
self.set_name(name)
def _controller(self, input_t, read_t, controller_tm1=None):
# input_t : (nb_sample, input_dim)
# read_t : (nb_sample, memory_width)
# controller_tm1: (nb_sample, hidden_dim)
if self.recurrence:
return self.tanh(dot(input_t, self.W_in) +
dot(controller_tm1, self.W_hh) +
dot(read_t, self.W_rd) +
self.b_in)
else:
return self.tanh(dot(input_t, self.W_in) +
dot(read_t, self.W_rd) +
self.b_in)
@staticmethod
def _read(w_read, memory):
# w_read : (nb_sample, memory_dim)
# memory : (nb_sample, memory_dim, memory_width)
# return dot(w_read, memory)
return T.sum(w_read[:, :, None] * memory, axis=1)
@staticmethod
def _write(w_write, memory, erase, add):
# w_write: (nb_sample, memory_dim)
# memory : (nb_sample, memory_dim, memory_width)
# erase/add: (nb_sample, memory_width)
w_write = w_write[:, :, None]
erase = erase[:, None, :] # erase is a gate.
add = add[:, None, :] # add is a bias
# m_erased = memory * (1 - w_write * erase)
# memory_t = m_erased + w_write * add # (nb_sample, memory_dim, memory_width)
memory_t = memory * (1 - w_write * erase) + \
add * w_write * (1 - erase)
return memory_t
def _step(self, input_t, mask_t,
memory_tm1,
w_write_tm1, w_read_tm1,
controller_tm1):
# input_t: (nb_sample, input_dim)
# memory_tm1: (nb_sample, memory_dim, memory_width)
# w_write_tm1: (nb_sample, memory_dim)
# w_read_tm1: (nb_sample, memory_dim)
# controller_tm1: (nb_sample, hidden_dim)
# read the memory
if self.curr_input:
info = T.concatenate((controller_tm1, input_t), axis=1)
w_read_t = self.reader(info, w_read_tm1, memory_tm1)
read_tm1 = self._read(w_read_t, memory_tm1)
else:
read_tm1 = self._read(w_read_tm1, memory_tm1) # (nb_sample, memory_width)
# get the new controller (hidden states.)
if self.recurrence:
controller_t = self._controller(input_t, read_tm1, controller_tm1)
else:
controller_t = self._controller(input_t, read_tm1) # (nb_sample, controller_size)
# update the memory cell (if need)
if not self.readonly:
if self.curr_input:
infow = T.concatenate((controller_t, input_t), axis=1)
w_write_t = self.writer(infow, w_write_tm1, memory_tm1) # (nb_sample, memory_dim)
erase_t, add_t = self.writer.get_fixer(infow) # (nb_sample, memory_width)
else:
w_write_t = self.writer(controller_t, w_write_tm1, memory_tm1)
erase_t, add_t = self.writer.get_fixer(controller_t)
memory_t = self._write(w_write_t, memory_tm1, erase_t, add_t) # (nb_sample, memory_dim, memory_width)
else:
w_write_t = w_write_tm1
memory_t = memory_tm1
# get the next reading weights.
if not self.curr_input:
w_read_t = self.reader(controller_t, w_read_tm1, memory_t) # (nb_sample, memory_dim)
# apply the mask: carry the previous state through masked positions
memory_t = memory_t * mask_t[:, :, None] + memory_tm1 * (1 - mask_t[:, :, None])
w_read_t = w_read_t * mask_t + w_read_tm1 * (1 - mask_t)
w_write_t = w_write_t * mask_t + w_write_tm1 * (1 - mask_t)
controller_t = controller_t * mask_t + controller_tm1 * (1 - mask_t)
return memory_t, w_write_t, w_read_t, controller_t
def __call__(self, X, mask=None, M=None, init_ww=None,
init_wr=None, init_c=None, return_sequence=False,
one_step=False, return_full=False):
# the recurrent cell only works on 3D tensors.
if X.ndim == 2:
X = X[:, None, :]
nb_samples = X.shape[0]
# mask
if mask is None:
mask = T.alloc(1., X.shape[0], 1)
padded_mask = self.get_padded_shuffled_mask(mask, pad=0)
X = X.dimshuffle((1, 0, 2))
# ***********************************************************************
# initialization states
if M is None:
memory_init = T.repeat(self.memory_init[None, :, :], nb_samples, axis=0)
else:
memory_init = M
if init_wr is None:
w_read_init = T.repeat(self.w_read_init, nb_samples, axis=0)
else:
w_read_init = init_wr
if init_ww is None:
w_write_init = T.repeat(self.w_write_init, nb_samples, axis=0)
else:
w_write_init = init_ww
if init_c is None:
contr_init = T.repeat(self.contr_init, nb_samples, axis=0)
else:
contr_init = init_c
# ************************************************************************
outputs_info = [memory_init, w_write_init, w_read_init, contr_init]
if one_step:
seq = [X[0], padded_mask[0]]
outputs = self._step(*(seq + outputs_info))
return outputs
else:
seq = [X, padded_mask]
outputs, _ = theano.scan(
self._step,
sequences=seq,
outputs_info=outputs_info,
name='controller_recurrence'
)
self.monitor['memory_info'] = outputs
if not return_full:
if return_sequence:
return outputs[-1].dimshuffle((1, 0, 2))
return outputs[-1][-1]
else:
if return_sequence:
return [a.dimshuffle((1, 0, 2)) for a in outputs]
return [a[-1] for a in outputs]
================================================
FILE: emolga/layers/recurrent.py
================================================
# -*- coding: utf-8 -*-
from abc import abstractmethod
from .core import *
class Recurrent(MaskedLayer):
"""
Recurrent Neural Network
"""
@staticmethod
def get_padded_shuffled_mask(mask, pad=0):
"""
What's going on here?
[1] pad the 2D mask (nb_samples, time) into a 3D tensor (nb_samples, time, 1);
[2] shuffle it to time-major order (time, nb_samples, 1) and, if pad > 0,
left-pad it along the time axis with zeros.
"""
assert mask is not None, 'mask cannot be None'
# mask is (nb_samples, time)
mask = T.shape_padright(mask) # (nb_samples, time, 1)
mask = T.addbroadcast(mask, -1)
mask = mask.dimshuffle(1, 0, 2) # (time, nb_samples, 1)
if pad > 0:
# left-pad in time with 0
padding = alloc_zeros_matrix(pad, mask.shape[1], 1)
mask = T.concatenate([padding, mask], axis=0)
return mask.astype('int8')
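# Shape walkthrough (illustrative): a mask of shape (nb_samples=2, time=3)
# becomes (2, 3, 1) after shape_padright, then (3, 2, 1) after the
# dimshuffle; with pad=1 one zero row is prepended along time, giving
# a (4, 2, 1) int8 tensor.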
class GRU(Recurrent):
"""
Gated Recurrent Unit - Cho et al. 2014
Acts as a spatio-temporal projection,
turning a sequence of vectors into a single vector.
Takes inputs with shape:
(nb_samples, max_sample_length (samples shorter than this are padded with zeros at the end), input_dim)
and returns outputs with shape:
if not return_sequences:
(nb_samples, output_dim)
if return_sequences:
(nb_samples, max_sample_length, output_dim)
References:
On the Properties of Neural Machine Translation: Encoder–Decoder Approaches
http://www.aclweb.org/anthology/W14-4012
Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling
http://arxiv.org/pdf/1412.3555v1.pdf
"""
def __init__(self,
input_dim,
output_dim=128,
context_dim=None,
init='glorot_uniform', inner_init='orthogonal',
activation='tanh', inner_activation='sigmoid',
name=None, weights=None):
super(GRU, self).__init__()
"""
Standard GRU.
"""
self.input_dim = input_dim
self.output_dim = output_dim
self.init = initializations.get(init)
self.inner_init = initializations.get(inner_init)
self.activation = activations.get(activation)
self.inner_activation = activations.get(inner_activation)
self.W_z = self.init((self.input_dim, self.output_dim))
self.W_r = self.init((self.input_dim, self.output_dim))
self.W_h = self.init((self.input_dim, self.output_dim))
self.U_z = self.inner_init((self.output_dim, self.output_dim))
self.U_r = self.inner_init((self.output_dim, self.output_dim))
self.U_h = self.inner_init((self.output_dim, self.output_dim))
self.b_z = shared_zeros(self.output_dim)
self.b_r = shared_zeros(self.output_dim)
self.b_h = shared_zeros(self.output_dim)
# set names
self.W_z.name, self.U_z.name, self.b_z.name = 'Wz', 'Uz', 'bz'
self.W_r.name, self.U_r.name, self.b_r.name = 'Wr', 'Ur', 'br'
self.W_h.name, self.U_h.name, self.b_h.name = 'Wh', 'Uh', 'bh'
self.params = [
self.W_z, self.U_z, self.b_z,
self.W_r, self.U_r, self.b_r,
self.W_h, self.U_h, self.b_h,
]
"""
GRU with context inputs.
"""
if context_dim is not None:
self.context_dim = context_dim
self.C_z = self.init((self.context_dim, self.output_dim))
self.C_r = self.init((self.context_dim, self.output_dim))
self.C_h = self.init((self.context_dim, self.output_dim))
self.C_z.name, self.C_r.name, self.C_h.name = 'Cz', 'Cr', 'Ch'
self.params += [self.C_z, self.C_r, self.C_h]
if weights is not None:
self.set_weights(weights)
if name is not None:
self.set_name(name)
def _step(self,
xz_t, xr_t, xh_t, mask_t,
h_tm1,
u_z, u_r, u_h):
# h_mask_tm1 = mask_tm1 * h_tm1
# Here we use a GroundHog-like style, where the mask carries the previous state through padded steps.
z = self.inner_activation(xz_t + T.dot(h_tm1, u_z))
r = self.inner_activation(xr_t + T.dot(h_tm1, u_r))
hh_t = self.activation(xh_t + T.dot(r * h_tm1, u_h))
h_t = z * h_tm1 + (1 - z) * hh_t
h_t = mask_t * h_t + (1 - mask_t) * h_tm1
return h_t
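# For reference (not in the original source), _step implements the standard
# GRU update, with the input projections precomputed outside the scan:
#   z_t = sigmoid(W_z x_t + U_z h_{t-1} + b_z)
#   r_t = sigmoid(W_r x_t + U_r h_{t-1} + b_r)
#   h~_t = tanh(W_h x_t + U_h (r_t * h_{t-1}) + b_h)
#   h_t = z_t * h_{t-1} + (1 - z_t) * h~_t
# Note the gate convention: here z_t keeps the old state, the mirror image
# of the more common z_t * h~_t + (1 - z_t) * h_{t-1}.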
def _step_gate(self,
xz_t, xr_t, xh_t, mask_t,
h_tm1,
u_z, u_r, u_h):
# h_mask_tm1 = mask_tm1 * h_tm1
# Here we use a GroundHog-like style, where the mask carries the previous state through padded steps.
z = self.inner_activation(xz_t + T.dot(h_tm1, u_z))
r = self.inner_activation(xr_t + T.dot(h_tm1, u_r))
hh_t = self.activation(xh_t + T.dot(r * h_tm1, u_h))
h_t = z * h_tm1 + (1 - z) * hh_t
h_t = mask_t * h_t + (1 - mask_t) * h_tm1
return h_t, z, r
def __call__(self, X, mask=None, C=None, init_h=None,
return_sequence=False, one_step=False,
return_gates=False):
"""
:param X: input sequence
:param mask: input mask
:param C: context constant
:return:
"""
# the recurrent cell only works on 3D tensors
if X.ndim == 2:
X = X[:, None, :]
if mask is not None:
mask = mask[:, None]
# mask
if mask is None: # sampling or beam-search
mask = T.alloc(1., X.shape[0], 1)
# one step
if one_step:
assert init_h, 'previous state must be provided!'
padded_mask = self.get_padded_shuffled_mask(mask, pad=0)
X = X.dimshuffle((1, 0, 2)) # X: (max_len, nb_samples, input_dim)
x_z = dot(X, self.W_z, self.b_z) # x_z: (max_len, nb_samples, output_dim)
x_r = dot(X, self.W_r, self.b_r) # x_r: (max_len, nb_samples, output_dim)
x_h = dot(X, self.W_h, self.b_h) # x_h: (max_len, nb_samples, output_dim)
"""
GRU with constant context. (not attention here.)
"""
if C is not None:
assert C.ndim == 2
ctx_step = C.dimshuffle('x', 0, 1) # C: (nb_samples, context_dim)
x_z += dot(ctx_step, self.C_z)
x_r += dot(ctx_step, self.C_r)
x_h += dot(ctx_step, self.C_h)
"""
GRU with additional initial/previous state.
"""
if init_h is None:
init_h = T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1)
if not return_gates:
if one_step:
seq = [x_z, x_r, x_h, padded_mask]  # NOTE: the original author flagged a possible hidden bug here
outputs_info = [init_h]
non_seq = [self.U_z, self.U_r, self.U_h]
outputs = self._step(*(seq + outputs_info + non_seq))
else:
outputs, updates = theano.scan(
self._step,
sequences=[x_z, x_r, x_h, padded_mask],
outputs_info=init_h,
non_sequences=[self.U_z, self.U_r, self.U_h]
)
if return_sequence:
return outputs.dimshuffle((1, 0, 2))
return outputs[-1]
else:
if one_step:
seq = [x_z, x_r, x_h, padded_mask]  # NOTE: the original author flagged a possible hidden bug here
outputs_info = [init_h]
non_seq = [self.U_z, self.U_r, self.U_h]
outputs, zz, rr = self._step_gate(*(seq + outputs_info + non_seq))
else:
outputx, updates = theano.scan(
self._step_gate,
sequences=[x_z, x_r, x_h, padded_mask],
outputs_info=[init_h, None, None],
non_sequences=[self.U_z, self.U_r, self.U_h]
)
outputs, zz, rr = outputx
if return_sequence:
return outputs.dimshuffle((1, 0, 2)), zz.dimshuffle((1, 0, 2)), rr.dimshuffle((1, 0, 2))
return outputs[-1], zz[-1], rr[-1]
class JZS3(Recurrent):
"""
Evolved recurrent neural network architectures from the evaluation of thousands
of models, serving as alternatives to LSTMs and GRUs. See Jozefowicz et al. 2015.
This corresponds to the `MUT3` architecture described in the paper.
Takes inputs with shape:
(nb_samples, max_sample_length (samples shorter than this are padded with zeros at the end), input_dim)
and returns outputs with shape:
if not return_sequences:
(nb_samples, output_dim)
if return_sequences:
(nb_samples, max_sample_length, output_dim)
References:
An Empirical Exploration of Recurrent Network Architectures
http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf
"""
def __init__(self,
input_dim,
output_dim=128,
context_dim=None,
init='glorot_uniform', inner_init='orthogonal',
activation='tanh', inner_activation='sigmoid',
name=None, weights=None):
super(JZS3, self).__init__()
"""
Standard model
"""
self.input_dim = input_dim
self.output_dim = output_dim
self.init = initializations.get(init)
self.inner_init = initializations.get(inner_init)
self.activation = activations.get(activation)
self.inner_activation = activations.get(inner_activation)
self.W_z = self.init((self.input_dim, self.output_dim))
self.U_z = self.inner_init((self.output_dim, self.output_dim))
self.b_z = shared_zeros(self.output_dim)
self.W_r = self.init((self.input_dim, self.output_dim))
self.U_r = self.inner_init((self.output_dim, self.output_dim))
self.b_r = shared_zeros(self.output_dim)
self.W_h = self.init((self.input_dim, self.output_dim))
self.U_h = self.inner_init((self.output_dim, self.output_dim))
self.b_h = shared_zeros(self.output_dim)
# set names
self.W_z.name, self.U_z.name, self.b_z.name = 'Wz', 'Uz', 'bz'
self.W_r.name, self.U_r.name, self.b_r.name = 'Wr', 'Ur', 'br'
self.W_h.name, self.U_h.name, self.b_h.name = 'Wh', 'Uh', 'bh'
self.params = [
self.W_z, self.U_z, self.b_z,
self.W_r, self.U_r, self.b_r,
self.W_h, self.U_h, self.b_h,
]
"""
context inputs.
"""
if context_dim is not None:
self.context_dim = context_dim
self.C_z = self.init((self.context_dim, self.output_dim))
self.C_r = self.init((self.context_dim, self.output_dim))
self.C_h = self.init((self.context_dim, self.output_dim))
self.C_z.name, self.C_r.name, self.C_h.name = 'Cz', 'Cr', 'Ch'
self.params += [self.C_z, self.C_r, self.C_h]
if weights is not None:
self.set_weights(weights)
if name is not None:
self.set_name(name)
def _step(self,
xz_t, xr_t, xh_t, mask_t,
h_tm1,
u_z, u_r, u_h):
# h_mask_tm1 = mask_tm1 * h_tm1
z = self.inner_activation(xz_t + T.dot(T.tanh(h_tm1), u_z))
r = self.inner_activation(xr_t + T.dot(h_tm1, u_r))
hh_t = self.activation(xh_t + T.dot(r * h_tm1, u_h))
h_t = (hh_t * z + h_tm1 * (1 - z)) * mask_t + (1 - mask_t) * h_tm1
return h_t
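# For reference (not in the original source): relative to the GRU step above,
# MUT3 feeds tanh(h_{t-1}) rather than h_{t-1} into the update gate and flips
# the mixing convention so that z_t weights the candidate state:
#   z_t = sigmoid(W_z x_t + U_z tanh(h_{t-1}) + b_z)
#   h_t = z_t * h~_t + (1 - z_t) * h_{t-1}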
def __call__(self, X, mask=None, C=None, init_h=None, return_sequence=False, one_step=False):
# the recurrent cell only works on 3D tensors
if X.ndim == 2:
X = X[:, None, :]
# mask
if mask is None: # sampling or beam-search
mask = T.alloc(1., X.shape[0], X.shape[1])
# one step
if one_step:
assert init_h, 'previous state must be provided!'
padded_mask = self.get_padded_shuffled_mask(mask, pad=0)
X = X.dimshuffle((1, 0, 2))
x_z = dot(X, self.W_z, self.b_z)
x_r = dot(X, self.W_r, self.b_r)
x_h = dot(X, self.W_h, self.b_h)
"""
JZS3 with constant context. (not attention here.)
"""
if C is not None:
assert C.ndim == 2
ctx_step = C.dimshuffle('x', 0, 1) # C: (nb_samples, context_dim)
x_z += dot(ctx_step, self.C_z)
x_r += dot(ctx_step, self.C_r)
x_h += dot(ctx_step, self.C_h)
"""
JZS3 with additional initial/previous state.
"""
if init_h is None:
init_h = T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1)
if one_step:
seq = [x_z, x_r, x_h, padded_mask]
outputs_info = [init_h]
non_seq = [self.U_z, self.U_r, self.U_h]
outputs = self._step(*(seq + outputs_info + non_seq))
else:
outputs, updates = theano.scan(
self._step,
sequences=[x_z, x_r, x_h, padded_mask],
outputs_info=init_h,
non_sequences=[self.U_z, self.U_r, self.U_h],
)
if return_sequence:
return outputs.dimshuffle((1, 0, 2))
return outputs[-1]
class LSTM(Recurrent):
def __init__(self,
input_dim=0,
output_dim=128,
context_dim=None,
init='glorot_uniform', inner_init='orthogonal',
forget_bias_init='one',
activation='tanh', inner_activation='sigmoid',
name=None, weights=None):
super(LSTM, self).__init__()
"""
Standard model
"""
self.input_dim = input_dim
self.output_dim = output_dim
self.init = initializations.get(init)
self.inner_init = initializations.get(inner_init)
self.forget_bias_init = initializations.get(forget_bias_init)
self.activation = activations.get(activation)
self.inner_activation = activations.get(inner_activation)
# input gate param.
self.W_i = self.init((self.input_dim, self.output_dim))
self.U_i = self.inner_init((self.output_dim, self.output_dim))
self.b_i = shared_zeros(self.output_dim)
# forget gate param.
self.W_f = self.init((self.input_dim, self.output_dim))
self.U_f = self.inner_init((self.output_dim, self.output_dim))
self.b_f = self.forget_bias_init(self.output_dim) # forget gate needs one bias.
# output gate param.
self.W_o = self.init((self.input_dim, self.output_dim))
self.U_o = self.inner_init((self.output_dim, self.output_dim))
self.b_o = shared_zeros(self.output_dim)
# memory param.
self.W_c = self.init((self.input_dim, self.output_dim))
self.U_c = self.inner_init((self.output_dim, self.output_dim))
self.b_c = shared_zeros(self.output_dim)
# set names
self.W_i.name, self.U_i.name, self.b_i.name = 'Wi', 'Ui', 'bi'
self.W_f.name, self.U_f.name, self.b_f.name = 'Wf', 'Uf', 'bf'
self.W_o.name, self.U_o.name, self.b_o.name = 'Wo', 'Uo', 'bo'
self.W_c.name, self.U_c.name, self.b_c.name = 'Wc', 'Uc', 'bc'
self.params = [
self.W_i, self.U_i, self.b_i,
self.W_f, self.U_f, self.b_f,
self.W_o, self.U_o, self.b_o,
self.W_c, self.U_c, self.b_c,
]
"""
context inputs.
"""
if context_dim is not None:
self.context_dim = context_dim
self.C_i = self.init((self.context_dim, self.output_dim))
self.C_f = self.init((self.context_dim, self.output_dim))
self.C_o = self.init((self.context_dim, self.output_dim))
self.C_c = self.init((self.context_dim, self.output_dim))
self.C_i.name, self.C_f.name, self.C_o.name, self.C_c.name = 'Ci', 'Cf', 'Co', 'Cc'
self.params += [self.C_i, self.C_f, self.C_o, self.C_c]
if weights is not None:
self.set_weights(weights)
if name is not None:
self.set_name(name)
def _step(self,
xi_t, xf_t, xo_t, xc_t, mask_t,
h_tm1, c_tm1,
u_i, u_f, u_o, u_c):
# h_mask_tm1 = mask_tm1 * h_tm1
i = self.inner_activation(xi_t + T.dot(h_tm1, u_i)) # input gate
f = self.inner_activation(xf_t + T.dot(h_tm1, u_f)) # forget gate
o = self.inner_activation(xo_t + T.dot(h_tm1, u_o)) # output gate
c = self.activation(xc_t + T.dot(h_tm1, u_c)) # memory updates
# update the memory cell.
c_t = f * c_tm1 + i * c
h_t = o * self.activation(c_t)
# masking
c_t = c_t * mask_t + (1 - mask_t) * c_tm1
h_t = h_t * mask_t + (1 - mask_t) * h_tm1
return h_t, c_t
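# For reference (not in the original source), _step is the standard
# peephole-free LSTM update, with the input projections precomputed:
#   i_t = sigmoid(W_i x_t + U_i h_{t-1} + b_i)   (input gate)
#   f_t = sigmoid(W_f x_t + U_f h_{t-1} + b_f)   (forget gate)
#   o_t = sigmoid(W_o x_t + U_o h_{t-1} + b_o)   (output gate)
#   c_t = f_t * c_{t-1} + i_t * tanh(W_c x_t + U_c h_{t-1} + b_c)
#   h_t = o_t * tanh(c_t)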
def input_embed(self, X, C=None):
x_i = dot(X, self.W_i, self.b_i)
x_f = dot(X, self.W_f, self.b_f)
x_o = dot(X, self.W_o, self.b_o)
x_c = dot(X, self.W_c, self.b_c)
"""
LSTM with constant context. (not attention here.)
"""
if C is not None:
assert C.ndim == 2
ctx_step = C.dimshuffle('x', 0, 1) # C: (nb_samples, context_dim)
x_i += dot(ctx_step, self.C_i)
x_f += dot(ctx_step, self.C_f)
x_o += dot(ctx_step, self.C_o)
x_c += dot(ctx_step, self.C_c)
return x_i, x_f, x_o, x_c
def __call__(self, X, mask=None, C=None, init_h=None, init_c=None, return_sequence=False, one_step=False):
# the recurrent cell only works on 3D tensors
if X.ndim == 2:
X = X[:, None, :]
# mask
if mask is None: # sampling or beam-search
mask = T.alloc(1., X.shape[0], X.shape[1])
# one step
if one_step:
assert init_h, 'previous state must be provided!'
padded_mask = self.get_padded_shuffled_mask(mask, pad=0)
X = X.dimshuffle((1, 0, 2))
x_i, x_f, x_o, x_c = self.input_embed(X, C)
"""
LSTM with additional initial/previous state.
"""
if init_h is None:
init_h = T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1)
if init_c is None:
init_c = init_h
if one_step:
seq = [x_i, x_f, x_o, x_c, padded_mask]
outputs_info = [init_h, init_c]
non_seq = [self.U_i, self.U_f, self.U_o, self.U_c]
outputs = self._step(*(seq + outputs_info + non_seq))
else:
outputs, updates = theano.scan(
self._step,
sequences=[x_i, x_f, x_o, x_c, padded_mask],
outputs_info=[init_h, init_c],
non_sequences=[self.U_i, self.U_f, self.U_o, self.U_c],
)
if return_sequence:
return outputs[0].dimshuffle((1, 0, 2)), outputs[1].dimshuffle((1, 0, 2)) # H, C
return outputs[0][-1], outputs[1][-1]
================================================
FILE: emolga/models/__init__.py
================================================
__author__ = 'jiataogu'
================================================
FILE: emolga/models/core.py
================================================
__author__ = 'jiataogu'
import theano
import logging
import deepdish as dd
from emolga.dataset.build_dataset import serialize_to_file, deserialize_from_file
from emolga.utils.theano_utils import floatX
logger = logging.getLogger(__name__)
class Model(object):
def __init__(self):
self.layers = []
self.params = []
self.monitor = {}
self.watchlist = []
def _add(self, layer):
if layer:
self.layers.append(layer)
self.params += layer.params
def _monitoring(self):
# add monitoring variables
for l in self.layers:
for v in l.monitor:
name = v + '@' + l.name
print name
self.monitor[name] = l.monitor[v]
def compile_monitoring(self, inputs, updates=None):
logger.info('compile monitoring')
for i, v in enumerate(self.monitor):
self.watchlist.append(v)
logger.info('monitoring [{0}]: {1}'.format(i, v))
self.watch = theano.function(inputs,
[self.monitor[v] for v in self.watchlist],
updates=updates
)
logger.info('done.')
def set_weights(self, weights):
if hasattr(self, 'save_parm'):
params = self.params + self.save_parm
else:
params = self.params
for p, w in zip(params, weights):
print p.name
if p.eval().shape != w.shape:
raise Exception("Layer shape %s not compatible with weight shape %s." % (p.eval().shape, w.shape))
p.set_value(floatX(w))
def get_weights(self):
weights = []
for p in self.params:
weights.append(p.get_value())
if hasattr(self, 'save_parm'):
for v in self.save_parm:
weights.append(v.get_value())
return weights
def set_name(self, name):
for i in range(len(self.params)):
if self.params[i].name is None:
self.params[i].name = '%s_p%d' % (name, i)
else:
self.params[i].name = name + '@' + self.params[i].name
self.name = name
def save(self, filename):
if hasattr(self, 'save_parm'):
params = self.params + self.save_parm
else:
params = self.params
ps = 'save: <\n'
for p in params:
ps += '{0}: {1}\n'.format(p.name, p.eval().shape)
ps += '> to ... {}'.format(filename)
logger.info(ps)
# the hdf5 module seems to work abnormally!!
# dd.io.save(filename, self.get_weights())
serialize_to_file(self.get_weights(), filename)
def load(self, filename):
logger.info('load the weights.')
# the hdf5 module seems to work abnormally!!
# weights = dd.io.load(filename)
weights = deserialize_from_file(filename)
print len(weights)
self.set_weights(weights)
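# Usage sketch (illustrative, not from the original repo; the filename is
# hypothetical): persisting and restoring model parameters through the
# pickle-based helpers imported above.
#
#   model.save('weights.pkl')  # serializes get_weights() via serialize_to_file
#   model.load('weights.pkl')  # restores them through set_weights()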
================================================
FILE: emolga/models/covc_encdec.py
================================================
__author__ = 'jiataogu'
import theano
import logging
import copy
import emolga.basic.objectives as objectives
import emolga.basic.optimizers as optimizers
from theano.compile.nanguardmode import NanGuardMode
from emolga.utils.generic_utils import visualize_
from emolga.layers.core import Dropout, Dense, Dense2, Identity
from emolga.layers.recurrent import *
from emolga.layers.ntm_minibatch import Controller
from emolga.layers.embeddings import *
from emolga.layers.attention import *
from core import Model
logger = logging.getLogger(__name__)
RNN = GRU # change it here for other RNN models.
err = 1e-9
class Encoder(Model):
"""
Recurrent Neural Network-based Encoder
It is used to compute the context vector.
"""
def __init__(self,
config, rng, prefix='enc',
mode='Evaluation', embed=None, use_context=False):
super(Encoder, self).__init__()
self.config = config
self.rng = rng
self.prefix = prefix
self.mode = mode
self.name = prefix
self.use_context = use_context
self.return_embed = False
self.return_sequence = False
"""
Create all elements of the Encoder's Computational graph
"""
# create Embedding layers
logger.info("{}_create embedding layers.".format(self.prefix))
if embed:
self.Embed = embed
else:
self.Embed = Embedding(
self.config['enc_voc_size'],
self.config['enc_embedd_dim'],
name="{}_embed".format(self.prefix))
self._add(self.Embed)
if self.use_context:
self.Initializer = Dense(
config['enc_contxt_dim'],
config['enc_hidden_dim'],
activation='tanh',
name="{}_init".format(self.prefix)
)
self._add(self.Initializer)
"""
Encoder Core
"""
# create RNN cells
if not self.config['bidirectional']:
logger.info("{}_create RNN cells.".format(self.prefix))
self.RNN = RNN(
self.config['enc_embedd_dim'],
self.config['enc_hidden_dim'],
None if not use_context
else self.config['enc_contxt_dim'],
name="{}_cell".format(self.prefix)
)
self._add(self.RNN)
else:
logger.info("{}_create forward RNN cells.".format(self.prefix))
self.forwardRNN = RNN(
self.config['enc_embedd_dim'],
self.config['enc_hidden_dim'],
None if not use_context
else self.config['enc_contxt_dim'],
name="{}_fw_cell".format(self.prefix)
)
self._add(self.forwardRNN)
logger.info("{}_create backward RNN cells.".format(self.prefix))
self.backwardRNN = RNN(
self.config['enc_embedd_dim'],
self.config['enc_hidden_dim'],
None if not use_context
else self.config['enc_contxt_dim'],
name="{}_bw_cell".format(self.prefix)
)
self._add(self.backwardRNN)
logger.info("create encoder ok.")
def build_encoder(self, source, context=None, return_embed=False,
return_sequence=False,
return_gates=False,
clean_mask=False):
"""
Build the Encoder Computational Graph
"""
# clean_mask means the hidden states at masked positions are set to 0.
# sometimes this helps downstream computation.
# note that this option only takes effect when return_sequence is True.
# we recommend leaving at least one masked position at the end of the encoded sequence.
# Initial state
Init_h = None
if self.use_context:
Init_h = self.Initializer(context)
# word embedding
if not self.config['bidirectional']:
X, X_mask = self.Embed(source, True)
if return_gates:
X_out, Z, R = self.RNN(X, X_mask, C=context, init_h=Init_h,
return_sequence=return_sequence,
return_gates=True)
else:
X_out = self.RNN(X, X_mask, C=context, init_h=Init_h,
return_sequence=return_sequence,
return_gates=False)
if return_sequence:
X_tail = X_out[:, -1]
if clean_mask:
X_out = X_out * X_mask[:, :, None]
else:
X_tail = X_out
else:
source2 = source[:, ::-1]
X, X_mask = self.Embed(source, True)
X2, X2_mask = self.Embed(source2, True)
if not return_gates:
X_out1 = self.backwardRNN(X, X_mask, C=context, init_h=Init_h, return_sequence=return_sequence)
X_out2 = self.forwardRNN(X2, X2_mask, C=context, init_h=Init_h, return_sequence=return_sequence)
else:
X_out1, Z1, R1 = self.backwardRNN(X, X_mask, C=context, init_h=Init_h,
return_sequence=return_sequence,
return_gates=True)
X_out2, Z2, R2 = self.forwardRNN(X2, X2_mask, C=context, init_h=Init_h,
return_sequence=return_sequence,
return_gates=True)
Z = T.concatenate([Z1, Z2[:, ::-1, :]], axis=2)
R = T.concatenate([R1, R2[:, ::-1, :]], axis=2)
if not return_sequence:
X_out = T.concatenate([X_out1, X_out2], axis=1)
X_tail = X_out
else:
X_out = T.concatenate([X_out1, X_out2[:, ::-1, :]], axis=2)
X_tail = T.concatenate([X_out1[:, -1], X_out2[:, -1]], axis=1)
if clean_mask:
X_out = X_out * X_mask[:, :, None]
X_mask = T.cast(X_mask, dtype='float32')
if not return_gates:
if return_embed:
return X_out, X, X_mask, X_tail
return X_out
else:
if return_embed:
return X_out, X, X_mask, X_tail, Z, R
return X_out, Z, R
def compile_encoder(self, with_context=False, return_embed=False, return_sequence=False):
source = T.imatrix()
self.return_embed = return_embed
self.return_sequence = return_sequence
if with_context:
context = T.matrix()
self.encode = theano.function([source, context],
self.build_encoder(source, context,
return_embed=return_embed,
return_sequence=return_sequence))
self.gtenc = theano.function([source, context],
self.build_encoder(source, context,
return_embed=return_embed,
return_sequence=return_sequence,
return_gates=True))
else:
self.encode = theano.function([source],
self.build_encoder(source, None,
return_embed=return_embed,
return_sequence=return_sequence))
self.gtenc = theano.function([source],
self.build_encoder(source, None,
return_embed=return_embed,
return_sequence=return_sequence,
return_gates=True))
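# Usage sketch (illustrative, not from the original repo; values are
# hypothetical): after compile_encoder(return_sequence=True) the compiled
# functions take an int32 source batch of shape (nb_samples, max_len).
#
#   src = np.asarray([[3, 5, 2, 0]], dtype='int32')
#   ctx = encoder.encode(src)        # encoder states per position
#   ctx, z, r = encoder.gtenc(src)   # plus the GRU update/reset gates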
class Decoder(Model):
"""
Recurrent Neural Network-based Decoder.
It is used for:
(1) Evaluation: compute the probability P(Y|X)
(2) Prediction: sample the best result based on P(Y|X)
(3) Beam-search
(4) Scheduled Sampling (how to implement it?)
"""
def __init__(self,
config, rng, prefix='dec',
mode='RNN', embed=None,
highway=False):
"""
mode = RNN: use a RNN Decoder
"""
super(Decoder, self).__init__()
self.config = config
self.rng = rng
self.prefix = prefix
self.name = prefix
self.mode = mode
self.highway = highway
self.init = initializations.get('glorot_uniform')
self.sigmoid = activations.get('sigmoid')
# use standard drop-out for input & output.
# I believe it should not be used for the context vector.
self.dropout = config['dropout']
if self.dropout > 0:
logger.info('Use standard-dropout!!!!')
self.D = Dropout(rng=self.rng, p=self.dropout, name='{}_Dropout'.format(prefix))
"""
Create all elements of the Decoder's computational graph.
"""
# create Embedding layers
logger.info("{}_create embedding layers.".format(self.prefix))
if embed:
self.Embed = embed
else:
self.Embed = Embedding(
self.config['dec_voc_size'],
self.config['dec_embedd_dim'],
name="{}_embed".format(self.prefix))
self._add(self.Embed)
# create Initialization Layers
logger.info("{}_create initialization layers.".format(self.prefix))
if not config['bias_code']:
self.Initializer = Zero()
else:
self.Initializer = Dense(
config['dec_contxt_dim'],
config['dec_hidden_dim'],
activation='tanh',
name="{}_init".format(self.prefix)
)
# create RNN cells
logger.info("{}_create RNN cells.".format(self.prefix))
if 'location_embed' in self.config:
if config['location_embed']:
dec_embedd_dim = 2 * self.config['dec_embedd_dim']
else:
dec_embedd_dim = self.config['dec_embedd_dim']
else:
dec_embedd_dim = self.config['dec_embedd_dim']
self.RNN = RNN(
dec_embedd_dim,
self.config['dec_hidden_dim'],
self.config['dec_contxt_dim'],
name="{}_cell".format(self.prefix)
)
self._add(self.Initializer)
self._add(self.RNN)
# HighWay Gating
if highway:
logger.info("HIGHWAY CONNECTION~~~!!!")
assert self.config['context_predict']
assert self.config['dec_contxt_dim'] == self.config['dec_hidden_dim']
self.C_x = self.init((self.config['dec_contxt_dim'],
self.config['dec_hidden_dim']))
self.H_x = self.init((self.config['dec_hidden_dim'],
self.config['dec_hidden_dim']))
self.b_x = initializations.get('zero')(self.config['dec_hidden_dim'])
self.C_x.name = '{}_Cx'.format(self.prefix)
self.H_x.name = '{}_Hx'.format(self.prefix)
self.b_x.name = '{}_bx'.format(self.prefix)
self.params += [self.C_x, self.H_x, self.b_x]
# create readout layers
logger.info("_create Readout layers")
# 1. hidden layers readout.
self.hidden_readout = Dense(
self.config['dec_hidden_dim'],
self.config['output_dim']
if self.config['deep_out']
else self.config['dec_voc_size'],
activation='linear',
name="{}_hidden_readout".format(self.prefix)
)
# 2. previous word readout
self.prev_word_readout = None
if self.config['bigram_predict']:
self.prev_word_readout = Dense(
dec_embedd_dim,
self.config['output_dim']
if self.config['deep_out']
else self.config['dec_voc_size'],
activation='linear',
name="{}_prev_word_readout".format(self.prefix),
learn_bias=False
)
# 3. context readout
self.context_readout = None
if self.config['context_predict']:
if not self.config['leaky_predict']:
self.context_readout = Dense(
self.config['dec_contxt_dim'],
self.config['output_dim']
if self.config['deep_out']
else self.config['dec_voc_size'],
activation='linear',
name="{}_context_readout".format(self.prefix),
learn_bias=False
)
else:
assert self.config['dec_contxt_dim'] == self.config['dec_hidden_dim']
self.context_readout = self.hidden_readout
# option: deep output (maxout)
if self.config['deep_out']:
self.activ = Activation(config['deep_out_activ'])
# self.dropout = Dropout(rng=self.rng, p=config['dropout'])
self.output_nonlinear = [self.activ] # , self.dropout]
self.output = Dense(
self.config['output_dim'] / 2
if config['deep_out_activ'] == 'maxout2'
else self.config['output_dim'],
self.config['dec_voc_size'],
activation='softmax',
name="{}_output".format(self.prefix),
learn_bias=False
)
else:
self.output_nonlinear = []
self.output = Activation('softmax')
# registration:
self._add(self.hidden_readout)
if not self.config['leaky_predict']:
self._add(self.context_readout)
self._add(self.prev_word_readout)
self._add(self.output)
if self.config['deep_out']:
self._add(self.activ)
# self._add(self.dropout)
logger.info("create decoder ok.")
@staticmethod
def _grab_prob(probs, X, block_unk=False):
assert probs.ndim == 3
batch_size = probs.shape[0]
max_len = probs.shape[1]
vocab_size = probs.shape[2]
probs = probs.reshape((batch_size * max_len, vocab_size))
return probs[T.arange(batch_size * max_len), X.flatten(1)].reshape(X.shape) # advanced indexing
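# A minimal NumPy sketch of the advanced indexing above (illustrative only):
#   import numpy as np
#   probs = np.random.dirichlet(np.ones(5), size=(2, 3))   # (2, 3, 5)
#   X = np.array([[1, 4, 0], [2, 2, 3]])                   # gold indices
#   flat = probs.reshape(2 * 3, 5)
#   picked = flat[np.arange(2 * 3), X.flatten()].reshape(X.shape)
# picked[i, t] == probs[i, t, X[i, t]], i.e. the probability the model
# assigned to the reference word at every time step.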
"""
Build the decoder for evaluation
"""
def prepare_xy(self, target):
# Word embedding
Y, Y_mask = self.Embed(target, True) # (nb_samples, max_len, embedding_dim)
if self.config['use_input']:
X = T.concatenate([alloc_zeros_matrix(Y.shape[0], 1, Y.shape[2]), Y[:, :-1, :]], axis=1)
else:
X = 0 * Y
# option ## drop words.
X_mask = T.concatenate([T.ones((Y.shape[0], 1)), Y_mask[:, :-1]], axis=1)
Count = T.cast(T.sum(X_mask, axis=1), dtype=theano.config.floatX)
return X, X_mask, Y, Y_mask, Count
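# Teacher-forcing sketch: for a target "A B C </s>" the decoder input X is
# "<zero> A B C", i.e. the target embeddings shifted right by one with an
# all-zero vector acting as the <bos> embedding, so step t predicts y_t from
# y_{<t}. X_mask gains a leading column of ones to match the shift.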
def build_decoder(self, target, context=None,
return_count=False,
train=True):
"""
Build the Decoder Computational Graph
For training/testing
"""
X, X_mask, Y, Y_mask, Count = self.prepare_xy(target)
# input drop-out if any.
if self.dropout > 0:
X = self.D(X, train=train)
# Initial state of RNN
Init_h = self.Initializer(context)
if not self.highway:
X_out = self.RNN(X, X_mask, C=context, init_h=Init_h, return_sequence=True)
# Readout
readout = self.hidden_readout(X_out)
if self.dropout > 0:
readout = self.D(readout, train=train)
if self.config['context_predict']:
readout += self.context_readout(context).dimshuffle(0, 'x', 1)
else:
X = X.dimshuffle((1, 0, 2))
X_mask = X_mask.dimshuffle((1, 0))
def _recurrence(x, x_mask, prev_h, c):
# compute the highway gate for context vector.
xx = dot(c, self.C_x, self.b_x) + dot(prev_h, self.H_x) # highway gate.
xx = self.sigmoid(xx)
cy = xx * c # the path without using RNN
x_out = self.RNN(x, mask=x_mask, C=c, init_h=prev_h, one_step=True)
hx = (1 - xx) * x_out
return x_out, hx, cy
outputs, _ = theano.scan(
_recurrence,
sequences=[X, X_mask],
outputs_info=[Init_h, None, None],
non_sequences=[context]
)
# hidden readout + context readout
readout = self.hidden_readout( outputs[1].dimshuffle((1, 0, 2)))
if self.dropout > 0:
readout = self.D(readout, train=train)
readout += self.context_readout(outputs[2].dimshuffle((1, 0, 2)))
# return to normal size.
X = X.dimshuffle((1, 0, 2))
X_mask = X_mask.dimshuffle((1, 0))
if self.config['bigram_predict']:
readout += self.prev_word_readout(X)
for l in self.output_nonlinear:
readout = l(readout)
prob_dist = self.output(readout) # (nb_samples, max_len, vocab_size)
# log_old = T.sum(T.log(self._grab_prob(prob_dist, target)), axis=1)
log_prob = T.sum(T.log(self._grab_prob(prob_dist, target) + err) * X_mask, axis=1)
log_ppl = log_prob / Count
if return_count:
return log_prob, Count
else:
return log_prob, log_ppl
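# Note: log_prob is the summed log-likelihood log P(Y|X) over unmasked steps
# and log_ppl its per-word average; callers typically turn these into
# loss_rec = mean(-log_prob) and loss_ppl = exp(mean(-log_ppl)), the latter
# being the usual corpus perplexity (see RNNLM.compile_train below).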
"""
Sample one step
"""
def _step_sample(self, prev_word, prev_stat, context):
# word embedding (note that for the first word, embedding should be all zero)
if self.config['use_input']:
X = T.switch(
prev_word[:, None] < 0,
alloc_zeros_matrix(prev_word.shape[0], self.config['dec_embedd_dim']),
self.Embed(prev_word)
)
else:
X = alloc_zeros_matrix(prev_word.shape[0], self.config['dec_embedd_dim'])
if self.dropout > 0:
X = self.D(X, train=False)
# apply one step of RNN
if not self.highway:
X_proj = self.RNN(X, C=context, init_h=prev_stat, one_step=True)
next_stat = X_proj
# compute the readout probability distribution and sample it
# here the readout is a 2-D matrix (single step), unlike the 3-D readout used in training.
readout = self.hidden_readout(next_stat)
if self.dropout > 0:
readout = self.D(readout, train=False)
if self.config['context_predict']:
readout += self.context_readout(context)
else:
xx = dot(context, self.C_x, self.b_x) + dot(prev_stat, self.H_x) # highway gate.
xx = self.sigmoid(xx)
X_proj = self.RNN(X, C=context, init_h=prev_stat, one_step=True)
next_stat = X_proj
readout = self.hidden_readout((1 - xx) * X_proj)
if self.dropout > 0:
readout = self.D(readout, train=False)
readout += self.context_readout(xx * context)
if self.config['bigram_predict']:
readout += self.prev_word_readout(X)
for l in self.output_nonlinear:
readout = l(readout)
next_prob = self.output(readout)
next_sample = self.rng.multinomial(pvals=next_prob).argmax(1)
return next_prob, next_sample, next_stat
"""
Build the sampler for sampling/greedy search/beam search
"""
def build_sampler(self):
"""
Build a sampler which only steps once.
It only steps one word at a time.
"""
logger.info("build sampler ...")
if self.config['sample_stoch'] and self.config['sample_argmax']:
logger.info("use argmax search!")
elif self.config['sample_stoch'] and (not self.config['sample_argmax']):
logger.info("use stochastic sampling!")
elif self.config['sample_beam'] > 1:
logger.info("use beam search! (beam_size={})".format(self.config['sample_beam']))
# initial state of our Decoder.
context = T.matrix() # theano variable.
init_h = self.Initializer(context)
logger.info('compile the function: get_init_state')
self.get_init_state \
= theano.function([context], init_h, name='get_init_state')
logger.info('done.')
# word sampler: 1 x 1
prev_word = T.vector('prev_word', dtype='int64')
prev_stat = T.matrix('prev_state', dtype='float32')
next_prob, next_sample, next_stat \
= self._step_sample(prev_word, prev_stat, context)
# next word probability
logger.info('compile the function: sample_next')
inputs = [prev_word, prev_stat, context]
outputs = [next_prob, next_sample, next_stat]
self.sample_next = theano.function(inputs, outputs, name='sample_next')
logger.info('done')
pass
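# Usage sketch (illustrative, not part of the model): the two compiled
# functions are meant to be driven from NumPy in a loop, e.g.
#   state = decoder.get_init_state(context)         # (1, dec_hidden_dim)
#   word = -1 * np.ones((1,), dtype='int64')        # -1 marks <bos>
#   for _ in xrange(maxlen):
#       prob, word, state = decoder.sample_next(word, state, context)
#       if word[0] == 0:                            # 0 is the <eos> index
#           break
# get_sample below wraps exactly this loop with stochastic/greedy/beam logic.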
"""
Build a Stochastic Sampler which can use SCAN to work on GPU.
However it cannot be used in Beam-search.
"""
def build_stochastic_sampler(self):
context = T.matrix()
init_h = self.Initializer(context)
logger.info('compile the function: sample')
pass
"""
Generate samples, either with stochastic sampling or beam-search!
"""
def get_sample(self, context, k=1, maxlen=30, stochastic=True, argmax=False, fixlen=False):
# beam size
if k > 1:
assert not stochastic, 'Beam search does not support stochastic sampling!!'
# fix length cannot use beam search
# if fixlen:
# assert k == 1
# prepare for searching
sample = []
score = []
if stochastic:
score = 0
live_k = 1
dead_k = 0
hyp_samples = [[]] * live_k
hyp_scores = np.zeros(live_k).astype(theano.config.floatX)
hyp_states = []
# get initial state of decoder RNN with context
next_state = self.get_init_state(context)
next_word = -1 * np.ones((1,)).astype('int64') # indicator for the first target word (bos target)
# Start searching!
for ii in xrange(maxlen):
# print next_word
ctx = np.tile(context, [live_k, 1])
next_prob, next_word, next_state \
= self.sample_next(next_word, next_state, ctx)  # one decoding step
if stochastic:
# using stochastic sampling (or greedy sampling.)
if argmax:
nw = next_prob[0].argmax()
next_word[0] = nw
else:
nw = next_word[0]
sample.append(nw)
score += next_prob[0, nw]
if (not fixlen) and (nw == 0): # sample reached the end
break
else:
# using beam-search
# we can only compute this in a flattened way!
cand_scores = hyp_scores[:, None] - np.log(next_prob)
cand_flat = cand_scores.flatten()
ranks_flat = cand_flat.argsort()[:(k - dead_k)]
# fetch the best results.
voc_size = next_prob.shape[1]
trans_index = ranks_flat / voc_size
word_index = ranks_flat % voc_size
costs = cand_flat[ranks_flat]
# get the new hyp samples
new_hyp_samples = []
new_hyp_scores = np.zeros(k - dead_k).astype(theano.config.floatX)
new_hyp_states = []
for idx, [ti, wi] in enumerate(zip(trans_index, word_index)):
new_hyp_samples.append(hyp_samples[ti] + [wi])
new_hyp_scores[idx] = copy.copy(costs[idx])
new_hyp_states.append(copy.copy(next_state[ti]))
# check the finished samples
new_live_k = 0
hyp_samples = []
hyp_scores = []
hyp_states = []
for idx in xrange(len(new_hyp_samples)):
if (new_hyp_samples[idx][-1] == 0) and (not fixlen):  # last generated word is <eos>
sample.append(new_hyp_samples[idx])
score.append(new_hyp_scores[idx])
dead_k += 1
else:
new_live_k += 1
hyp_samples.append(new_hyp_samples[idx])
hyp_scores.append(new_hyp_scores[idx])
hyp_states.append(new_hyp_states[idx])
hyp_scores = np.array(hyp_scores)
live_k = new_live_k
if new_live_k < 1:
break
if dead_k >= k:
break
next_word = np.array([w[-1] for w in hyp_samples])
next_state = np.array(hyp_states)
pass
pass
# end.
if not stochastic:
# dump every remaining one
if live_k > 0:
for idx in xrange(live_k):
sample.append(hyp_samples[idx])
score.append(hyp_scores[idx])
return sample, score
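# Beam bookkeeping (sketch): cand_scores = hyp_scores[:, None] - log(p) are
# cumulative negative log-probabilities; each round keeps the k - dead_k
# cheapest expansions, retires hypotheses ending in <eos> (dead_k += 1), and
# stops once k hypotheses are finished or none remain alive.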
class DecoderAtt(Decoder):
"""
Recurrent Neural Network-based Decoder [for CopyNet-b Only]
with Attention Mechanism
"""
def __init__(self,
config, rng, prefix='dec',
mode='RNN', embed=None,
copynet=False, identity=False):
super(DecoderAtt, self).__init__(
config, rng, prefix,
mode, embed, False)
self.init = initializations.get('glorot_uniform')
self.copynet = copynet
self.identity = identity
# attention reader
self.attention_reader = Attention(
self.config['dec_hidden_dim'],
self.config['dec_contxt_dim'],
1000,
name='source_attention',
coverage=self.config['coverage']
)
self._add(self.attention_reader)
# if use copynet
if self.copynet:
if not self.identity:
self.Is = Dense(
self.config['dec_contxt_dim'],
self.config['dec_embedd_dim'],
name='in-trans'
)
else:
assert self.config['dec_contxt_dim'] == self.config['dec_embedd_dim']
self.Is = Identity(name='ini')
self.Os = Dense(
self.config['dec_readout_dim']
if not self.config['location_embed']
else self.config['dec_readout_dim'] + self.config['dec_embedd_dim'],
self.config['dec_contxt_dim'],
name='out-trans'
)
if self.config['copygate']:
self.Gs = Dense(
self.config['dec_readout_dim'] + self.config['dec_embedd_dim'],
1,
name='copy-gate',
activation='linear',
learn_bias=True,
negative_bias=True
)
self._add(self.Gs)
if self.config['location_embed']:
self._add(self.Is)
self._add(self.Os)
logger.info('adjust decoder ok.')
"""
Build the decoder for evaluation
"""
def prepare_xy(self, target, cc_matrix):
# target: (nb_samples, index_seq)
# cc_matrix: (nb_samples, maxlen_t, maxlen_s)
# context: (nb_samples)
Y, Y_mask = self.Embed(target, True) # (nb_samples, maxlen_t, embedding_dim)
X = T.concatenate([alloc_zeros_matrix(Y.shape[0], 1, Y.shape[2]), Y[:, :-1, :]], axis=1)
# LL = T.concatenate([alloc_zeros_matrix(Y.shape[0], 1, cc_matrix.shape[2]),
# cc_matrix[:, :-1, :]], axis=1)
LL = cc_matrix
XL_mask = T.cast(T.gt(T.sum(LL, axis=2), 0), dtype='float32')
if not self.config['use_input']:
X *= 0
X_mask = T.concatenate([T.ones((Y.shape[0], 1)), Y_mask[:, :-1]], axis=1)
Count = T.cast(T.sum(X_mask, axis=1), dtype=theano.config.floatX)
return X, X_mask, LL, XL_mask, Y_mask, Count
"""
The most different part. Be cautious!!
This is very different from a traditional RNN search.
"""
def build_decoder(self,
target,
cc_matrix,
context,
c_mask,
return_count=False,
train=True):
"""
Build the Computational Graph ::> Context is essential
"""
assert c_mask is not None, 'context must be supplied for this decoder.'
assert context.ndim == 3, 'context must have 3 dimensions.'
# context: (nb_samples, max_len, contxt_dim)
context_A = self.Is(context) # (nb_samples, max_len, embed_dim)
X, X_mask, LL, XL_mask, Y_mask, Count = self.prepare_xy(target, cc_matrix)
# input drop-out if any.
if self.dropout > 0:
X = self.D(X, train=train)
# Initial state of RNN
Init_h = self.Initializer(context[:, 0, :]) # default order ->
Init_a = T.zeros((context.shape[0], context.shape[1]), dtype='float32')
coverage = T.zeros((context.shape[0], context.shape[1]), dtype='float32')
X = X.dimshuffle((1, 0, 2))
X_mask = X_mask.dimshuffle((1, 0))
LL = LL.dimshuffle((1, 0, 2)) # (maxlen_t, nb_samples, maxlen_s)
XL_mask = XL_mask.dimshuffle((1, 0)) # (maxlen_t, nb_samples)
def _recurrence(x, x_mask, ll, xl_mask, prev_h, prev_a, cov, cc, cm, ca):
"""
x: (nb_samples, embed_dims)
x_mask: (nb_samples, )
ll: (nb_samples, maxlen_s)
xl_mask:(nb_samples, )
-----------------------------------------
prev_h: (nb_samples, hidden_dims)
prev_a: (nb_samples, maxlen_s)
cov: (nb_samples, maxlen_s) *** coverage ***
-----------------------------------------
cc: (nb_samples, maxlen_s, cxt_dim)
cm: (nb_samples, maxlen_s)
ca: (nb_samples, maxlen_s, ebd_dim)
"""
# compute the attention and get the context vector
prob = self.attention_reader(prev_h, cc, Smask=cm, Cov=cov)
ncov = cov + prob
cxt = T.sum(cc * prob[:, :, None], axis=1)
# compute input word embedding (mixed)
x_in = T.concatenate([x, T.sum(ca * prev_a[:, :, None], axis=1)], axis=-1)
# compute the current hidden states of the RNN.
x_out = self.RNN(x_in, mask=x_mask, C=cxt, init_h=prev_h, one_step=True)
# compute the current readout vector.
r_in = [x_out]
if self.config['context_predict']:
r_in += [cxt]
if self.config['bigram_predict']:
r_in += [x_in]
# copynet decoding
r_in = T.concatenate(r_in, axis=-1)
r_out = self.hidden_readout(x_out) # (nb_samples, voc_size)
if self.config['context_predict']:
r_out += self.context_readout(cxt)
if self.config['bigram_predict']:
r_out += self.prev_word_readout(x_in)
for l in self.output_nonlinear:
r_out = l(r_out)
key = self.Os(r_in) # (nb_samples, cxt_dim) :: key
Eng = T.sum(key[:, None, :] * cc, axis=-1)
# # gating
if self.config['copygate']:
gt = self.sigmoid(self.Gs(r_in)) # (nb_samples, 1)
r_out += T.log(gt.flatten()[:, None])
Eng += T.log(1 - gt.flatten()[:, None])
# r_out *= gt.flatten()[:, None]
# Eng *= 1 - gt.flatten()[:, None]
EngSum = logSumExp(Eng, axis=-1, mask=cm, c=r_out)
next_p = T.concatenate([T.exp(r_out - EngSum), T.exp(Eng - EngSum) * cm], axis=-1)
next_c = next_p[:, self.config['dec_voc_size']:] * ll # (nb_samples, maxlen_s)
next_b = next_p[:, :self.config['dec_voc_size']]
sum_a = T.sum(next_c, axis=1, keepdims=True) # (nb_samples,)
next_a = (next_c / (sum_a + err)) * xl_mask[:, None] # numerical-stability consideration
return x_out, next_a, ncov, sum_a, next_b
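# What the step above computes, written out (sketch): let r_out[w] be the
# generate-mode score of vocabulary word w and Eng[j] the copy-mode score of
# source position j (dot product of key with annotation cc[:, j]). Both modes
# share a single softmax:
#   Z          = sum_w exp(r_out[w]) + sum_{j: cm[j]=1} exp(Eng[j])
#   p(gen  w)  = exp(r_out[w]) / Z
#   p(copy j)  = exp(Eng[j])   / Z
# logSumExp(Eng, mask=cm, c=r_out) returns log Z in a numerically stable way;
# next_p concatenates the two parts into one (voc_size + maxlen_s) vector, and
# next_a keeps only the copy probabilities at positions matching the current
# target word (ll), re-normalized as the next step's selective-read weights.
# With copygate on, log(gt) / log(1 - gt) shift the two modes' scores so the
# gate interpolates between generating and copying.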
outputs, _ = theano.scan(
_recurrence,
sequences=[X, X_mask, LL, XL_mask],
outputs_info=[Init_h, Init_a, coverage, None, None],
non_sequences=[context, c_mask, context_A]
)
X_out, source_prob, coverages, source_sum, prob_dist = [z.dimshuffle((1, 0, 2)) for z in outputs]
X = X.dimshuffle((1, 0, 2))
X_mask = X_mask.dimshuffle((1, 0))
XL_mask = XL_mask.dimshuffle((1, 0))
# unk masking
U_mask = T.ones_like(target) * (1 - T.eq(target, 1))
U_mask += (1 - U_mask) * (1 - XL_mask)
# The most different part is here !!
log_prob = T.sum(T.log(
self._grab_prob(prob_dist, target) * U_mask +
source_sum.sum(axis=-1) + err
) * X_mask, axis=1)
log_ppl = log_prob / (Count + err)
if return_count:
return log_prob, Count
else:
return log_prob, log_ppl
"""
Sample one step
"""
def _step_sample(self,
prev_word,
prev_stat,
prev_loc,
prev_cov,
context,
c_mask,
context_A):
assert c_mask is not None, 'we need the source mask.'
# word embedding (note that for the first word, embedding should be all zero)
X = T.switch(
prev_word[:, None] < 0,
alloc_zeros_matrix(prev_word.shape[0], 2 * self.config['dec_embedd_dim']),
T.concatenate([self.Embed(prev_word),
T.sum(context_A * prev_loc[:, :, None], axis=1)
], axis=-1)
)
if self.dropout > 0:
X = self.D(X, train=False)
# apply one step of RNN
Probs = self.attention_reader(prev_stat, context, c_mask, Cov=prev_cov)
ncov = prev_cov + Probs
cxt = T.sum(context * Probs[:, :, None], axis=1)
X_proj, zz, rr = self.RNN(X, C=cxt,
init_h=prev_stat,
one_step=True,
return_gates=True)
next_stat = X_proj
# compute the readout probability distribution and sample it
# here the readout is a 2-D matrix (single step), unlike the 3-D readout used in training.
readin = [next_stat]
if self.config['context_predict']:
readin += [cxt]
if self.config['bigram_predict']:
readin += [X]
readin = T.concatenate(readin, axis=-1)
# if gating
# if self.config['copygate']:
# gt = self.sigmoid(self.Gs(readin)) # (nb_samples, dim)
# readin *= 1 - gt
# readout = self.hidden_readout(next_stat * gt[:, :self.config['dec_hidden_dim']])
# if self.config['context_predict']:
# readout += self.context_readout(
# cxt * gt[:, self.config['dec_hidden_dim']:
# self.config['dec_hidden_dim'] + self.config['dec_contxt_dim']])
# if self.config['bigram_predict']:
# readout += self.prev_word_readout(
# X * gt[:, -2 * self.config['dec_embedd_dim']:])
# else:
readout = self.hidden_readout(next_stat)
if self.config['context_predict']:
readout += self.context_readout(cxt)
if self.config['bigram_predict']:
readout += self.prev_word_readout(X)
for l in self.output_nonlinear:
readout = l(readout)
key = self.Os(readin)
Eng = T.sum(key[:, None, :] * context, axis=-1)
# # gating
if self.config['copygate']:
gt = self.sigmoid(self.Gs(readin)) # (nb_samples, 1)
readout += T.log(gt.flatten()[:, None])
Eng += T.log(1 - gt.flatten()[:, None])
EngSum = logSumExp(Eng, axis=-1, mask=c_mask, c=readout)
next_prob = T.concatenate([T.exp(readout - EngSum), T.exp(Eng - EngSum) * c_mask], axis=-1)
next_sample = self.rng.multinomial(pvals=next_prob).argmax(1)
return next_prob, next_sample, next_stat, ncov, Probs  # Probs are the attention weights (alpha)
def build_sampler(self):
"""
Build a sampler which only steps once.
It only steps one word at a time.
"""
logger.info("build sampler ...")
if self.config['sample_stoch'] and self.config['sample_argmax']:
logger.info("use argmax search!")
elif self.config['sample_stoch'] and (not self.config['sample_argmax']):
logger.info("use stochastic sampling!")
elif self.config['sample_beam'] > 1:
logger.info("use beam search! (beam_size={})".format(self.config['sample_beam']))
# initial state of our Decoder.
context = T.tensor3() # theano variable.
c_mask = T.matrix() # mask of the input sentence.
context_A = self.Is(context)
init_h = self.Initializer(context[:, 0, :])
init_a = T.zeros((context.shape[0], context.shape[1]))
cov = T.zeros((context.shape[0], context.shape[1]))
logger.info('compile the function: get_init_state')
self.get_init_state \
= theano.function([context], [init_h, init_a, cov], name='get_init_state')
logger.info('done.')
# word sampler: 1 x 1
prev_word = T.vector('prev_word', dtype='int64')
prev_stat = T.matrix('prev_state', dtype='float32')
prev_a = T.matrix('prev_a', dtype='float32')
prev_cov = T.matrix('prev_cov', dtype='float32')
next_prob, next_sample, next_stat, ncov, alpha \
= self._step_sample(prev_word,
prev_stat,
prev_a,
prev_cov,
context,
c_mask,
context_A)
# next word probability
logger.info('compile the function: sample_next')
inputs = [prev_word, prev_stat, prev_a, prev_cov, context, c_mask]
outputs = [next_prob, next_sample, next_stat, ncov, alpha]
self.sample_next = theano.function(inputs, outputs, name='sample_next')
logger.info('done')
"""
Generate samples, either with stochastic sampling or beam-search!
[:-:] I have to think over how to modify the BEAM-Search!!
"""
def get_sample(self,
context,
c_mask,
source,
k=1, maxlen=30, stochastic=True,
argmax=False, fixlen=False,
return_attend=False
):
# beam size
if k > 1:
assert not stochastic, 'Beam search does not support stochastic sampling!!'
# fix length cannot use beam search
# if fixlen:
# assert k == 1
# prepare for searching
Lmax = self.config['dec_voc_size']
sample = []
ppp = []
attend = []
score = []
if stochastic:
score = 0
live_k = 1
dead_k = 0
hyp_samples = [[]] * live_k
hyp_scores = np.zeros(live_k).astype(theano.config.floatX)
hyp_ppps = [[]] * live_k
hyp_attends = [[]] * live_k
# get initial state of decoder RNN with context
next_state, ss_prob, coverage = self.get_init_state(context)
next_word = -1 * np.ones((1,)).astype('int64') # indicator for the first target word (bos target)
# Start searching!
for ii in xrange(maxlen):
# print next_word
ctx = np.tile(context, [live_k, 1, 1])
cmk = np.tile(c_mask, [live_k, 1])
sss = np.tile(source, [live_k, 1])
# # process word
def process_():
# caution: indices >= Lmax denote copied source positions
ll = np.zeros((sss.shape[0], sss.shape[1]), dtype='float32')
for i in xrange(next_word.shape[0]):
if next_word[i] >= Lmax:
ll[i][next_word[i] - Lmax] = 1.
next_word[i] = sss[i][next_word[i] - Lmax]
else:
ll[i] = (sss[i] == next_word[i, None])
# for k in xrange(sss.shape[1]):
# ll[i][k] = (sss[i][k] == next_word[i])
return ll, next_word
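# Index convention (sketch, hypothetical numbers): the output vocabulary is
# extended by the source positions, so an index w < Lmax is an ordinary word
# while w >= Lmax means "copy source position w - Lmax". With Lmax = 5000 and
# source = [42, 7, 42]:
#   next_word = 5002 -> ll = [0, 0, 1] and next_word becomes source[2] = 42
#   next_word = 42   -> ll = [1, 0, 1] (every position holding word 42)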
# print next_word
ll, next_word = process_()
ll_mask = (np.sum(ll, axis=1, keepdims=True) > 0)
next_a = ss_prob * ll
next_a = next_a / (err + np.sum(next_a, axis=1, keepdims=True)) * ll_mask
next_prob0, next_word, next_state, coverage, alpha \
= self.sample_next(next_word, next_state, next_a, coverage, ctx, cmk)
# print next_prob0.shape[1]
if not self.config['decode_unk']:
next_prob0[:, 1] = 0.
next_prob0 /= np.sum(next_prob0, axis=1, keepdims=True)
def merge_():
# merge the probabilities
temple_prob = copy.copy(next_prob0)
source_prob = copy.copy(next_prob0[:, Lmax:])
for i in xrange(next_prob0.shape[0]):
for j in xrange(sss.shape[1]):
if (sss[i, j] < Lmax) and (sss[i, j] != 1):
temple_prob[i, sss[i, j]] += source_prob[i, j]
temple_prob[i, Lmax + j] = 0.
return temple_prob, source_prob
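# merge_ folds copy mass back onto in-vocabulary words: if source position j
# holds a known non-UNK word, its copy probability next_prob0[:, Lmax + j] is
# added to that word's generate slot and the positional slot is zeroed, so a
# word repeated in the source accumulates all of its copy mass on one index
# before ranking.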
next_prob, ss_prob = merge_()
next_prob0[:, Lmax:] = 0.
# print '0', next_prob0[:, 3165]
# print '01', next_prob[:, 3165]
# # print next_prob[0, Lmax:]
# print ss_prob[0, :]
if stochastic:
# using stochastic sampling (or greedy sampling.)
if argmax:
nw = next_prob[0].argmax()
next_word[0] = nw
else:
# NumPy-side sampling: self.rng is a symbolic Theano stream and cannot
# sample from the merged NumPy distribution here.
pv = next_prob[0].astype('float64')
nw = np.argmax(np.random.multinomial(1, pv / pv.sum()))
next_word[0] = nw
sample.append(nw)
score += next_prob[0, nw]
if (not fixlen) and (nw == 0): # sample reached the end
break
else:
# using beam-search
# we can only compute this in a flattened way!
cand_scores = hyp_scores[:, None] - np.log(next_prob)
cand_flat = cand_scores.flatten()
ranks_flat = cand_flat.argsort()[:(k - dead_k)]
# fetch the best results.
voc_size = next_prob.shape[1]
trans_index = ranks_flat / voc_size
word_index = ranks_flat % voc_size
costs = cand_flat[ranks_flat]
# get the new hyp samples
new_hyp_samples = []
new_hyp_ppps = []
new_hyp_attends = []
new_hyp_scores = np.zeros(k - dead_k).astype(theano.config.floatX)
new_hyp_states = []
new_hyp_coverage = []
new_hyp_ss = []
for idx, [ti, wi] in enumerate(zip(trans_index, word_index)):
new_hyp_samples.append(hyp_samples[ti] + [wi])
new_hyp_scores[idx] = copy.copy(costs[idx])
new_hyp_states.append(copy.copy(next_state[ti]))
new_hyp_coverage.append(copy.copy(coverage[ti]))
new_hyp_ss.append(copy.copy(ss_prob[ti]))
if not return_attend:
new_hyp_ppps.append(hyp_ppps[ti] + [[next_prob0[ti][wi], next_prob[ti][wi]]])
else:
new_hyp_ppps.append(hyp_ppps[ti] + [(ss_prob[ti], alpha[ti])])
# check the finished samples
new_live_k = 0
hyp_samples = []
hyp_scores = []
hyp_states = []
hyp_coverage = []
hyp_ppps = []
hyp_ss = []
for idx in xrange(len(new_hyp_samples)):
if (new_hyp_samples[idx][-1] == 0) and (not fixlen):  # last generated word is <eos>
sample.append(new_hyp_samples[idx])
ppp.append(new_hyp_ppps[idx])
score.append(new_hyp_scores[idx])
dead_k += 1
else:
new_live_k += 1
hyp_samples.append(new_hyp_samples[idx])
hyp_ppps.append(new_hyp_ppps[idx])
hyp_scores.append(new_hyp_scores[idx])
hyp_states.append(new_hyp_states[idx])
hyp_coverage.append(new_hyp_coverage[idx])
hyp_ss.append(new_hyp_ss[idx])
hyp_scores = np.array(hyp_scores)
live_k = new_live_k
if new_live_k < 1:
break
if dead_k >= k:
break
next_word = np.array([w[-1] for w in hyp_samples])
next_state = np.array(hyp_states)
coverage = np.array(hyp_coverage)
ss_prob = np.array(hyp_ss)
pass
# end.
if not stochastic:
# dump every remaining one
if live_k > 0:
for idx in xrange(live_k):
sample.append(hyp_samples[idx])
ppp.append(hyp_ppps[idx])
score.append(hyp_scores[idx])
return sample, score, ppp
class FnnDecoder(Model):
def __init__(self, config, rng, prefix='fnndec'):
"""
Feed-forward decoder: predicts the target directly from the context (no RNN).
"""
super(FnnDecoder, self).__init__()
self.config = config
self.rng = rng
self.prefix = prefix
self.name = prefix
"""
Create Dense Predictor.
"""
self.Tr = Dense(self.config['dec_contxt_dim'],
self.config['dec_hidden_dim'],
activation='maxout2',
name='{}_Tr'.format(prefix))
self._add(self.Tr)
self.Pr = Dense(self.config['dec_hidden_dim'] / 2,
self.config['dec_voc_size'],
activation='softmax',
name='{}_Pr'.format(prefix))
self._add(self.Pr)
logger.info("FF decoder ok.")
@staticmethod
def _grab_prob(probs, X):
assert probs.ndim == 3
batch_size = probs.shape[0]
max_len = probs.shape[1]
vocab_size = probs.shape[2]
probs = probs.reshape((batch_size * max_len, vocab_size))
return probs[T.arange(batch_size * max_len), X.flatten(1)].reshape(X.shape) # advanced indexing
def build_decoder(self, target, context):
"""
Build the Decoder Computational Graph
"""
prob_dist = self.Pr(self.Tr(context[:, None, :]))
log_prob = T.sum(T.log(self._grab_prob(prob_dist, target) + err), axis=1)
return log_prob
def build_sampler(self):
context = T.matrix()
prob_dist = self.Pr(self.Tr(context))
next_sample = self.rng.multinomial(pvals=prob_dist).argmax(1)
self.sample_next = theano.function([context], [prob_dist, next_sample], name='sample_next_{}'.format(self.prefix))
logger.info('done')
def get_sample(self, context, argmax=True):
prob, sample = self.sample_next(context)
if argmax:
return prob[0].argmax()
else:
return sample[0]
########################################################################################################################
# Encoder-Decoder Models ::::
#
class RNNLM(Model):
"""
RNN-LM, with context vector = 0.
It is very similar to the implementation of the VAE.
"""
def __init__(self,
config, n_rng, rng,
mode='Evaluation'):
super(RNNLM, self).__init__()
self.config = config
self.n_rng = n_rng # numpy random stream
self.rng = rng # Theano random stream
self.mode = mode
self.name = 'rnnlm'
def build_(self):
logger.info("build the RNN-decoder")
self.decoder = Decoder(self.config, self.rng, prefix='dec', mode=self.mode)
# registration:
self._add(self.decoder)
# objectives and optimizers
self.optimizer = optimizers.get('adadelta')
# saved the initial memories
if self.config['mode'] == 'NTM':
self.memory = initializations.get('glorot_uniform')(
(self.config['dec_memory_dim'], self.config['dec_memory_wdth']))
logger.info("create the RECURRENT language model. ok")
def compile_(self, mode='train', contrastive=False):
# compile the computational graph.
# INFO: the parameters.
# mode: 'train'/ 'display'/ 'policy' / 'all'
ps = 'params: {\n'
for p in self.params:
ps += '{0}: {1}\n'.format(p.name, p.eval().shape)
ps += '}.'
logger.info(ps)
param_num = np.sum([np.prod(p.shape.eval()) for p in self.params])
logger.info("total number of the parameters of the model: {}".format(param_num))
if mode == 'train' or mode == 'all':
if not contrastive:
self.compile_train()
else:
self.compile_train_CE()
if mode == 'display' or mode == 'all':
self.compile_sample()
if mode == 'inference' or mode == 'all':
self.compile_inference()
def compile_train(self):
# questions (theano variables)
inputs = T.imatrix() # padded input word sequence (for training)
if self.config['mode'] == 'RNN':
context = alloc_zeros_matrix(inputs.shape[0], self.config['dec_contxt_dim'])
elif self.config['mode'] == 'NTM':
context = T.repeat(self.memory[None, :, :], inputs.shape[0], axis=0)
else:
raise NotImplementedError
# decoding.
target = inputs
logPxz, logPPL = self.decoder.build_decoder(target, context)
# reconstruction loss
loss_rec = T.mean(-logPxz)
loss_ppl = T.exp(T.mean(-logPPL))
L1 = T.sum([T.sum(abs(w)) for w in self.params])
loss = loss_rec
updates = self.optimizer.get_updates(self.params, loss)
logger.info("compiling the compuational graph ::training function::")
train_inputs = [inputs]
self.train_ = theano.function(train_inputs,
[loss_rec, loss_ppl],
updates=updates,
name='train_fun')
logger.info("pre-training functions compile done.")
# add monitoring:
self.monitor['context'] = context
self._monitoring()
# compiling monitoring
self.compile_monitoring(train_inputs)
@abstractmethod
def compile_train_CE(self):
pass
def compile_sample(self):
# context vectors (as)
self.decoder.build_sampler()
logger.info("display functions compile done.")
@abstractmethod
def compile_inference(self):
pass
def default_context(self):
if self.config['mode'] == 'RNN':
return np.zeros(shape=(1, self.config['dec_contxt_dim']), dtype=theano.config.floatX)
elif self.config['mode'] == 'NTM':
memory = self.memory.get_value()
memory = memory.reshape((1, memory.shape[0], memory.shape[1]))
return memory
def generate_(self, context=None, max_len=None, mode='display'):
"""
:param action: action vector to guide the question.
If None, use a Gaussian to simulate the action.
:return: question sentence in natural language.
"""
# assert self.config['sample_stoch'], 'RNNLM sampling must be stochastic'
# assert not self.config['sample_argmax'], 'RNNLM sampling cannot use argmax'
if context is None:
context = self.default_context()
args = dict(k=self.config['sample_beam'],
maxlen=self.config['max_len'] if not max_len else max_len,
stochastic=self.config['sample_stoch'] if mode == 'display' else None,
argmax=self.config['sample_argmax'] if mode == 'display' else None)
sample, score = self.decoder.get_sample(context, **args)
if not args['stochastic']:
score = score / np.array([len(s) for s in sample])
sample = sample[score.argmin()]
score = score.min()
else:
score /= float(len(sample))
return sample, np.exp(score)
class AutoEncoder(RNNLM):
"""
Regular Auto-Encoder: RNN Encoder/Decoder
"""
def __init__(self,
config, n_rng, rng,
mode='Evaluation'):
super(RNNLM, self).__init__()  # skip RNNLM.__init__ and initialize Model directly
self.config = config
self.n_rng = n_rng # numpy random stream
self.rng = rng # Theano random stream
self.mode = mode
self.name = 'vae'
def build_(self):
logger.info("build the RNN auto-encoder")
self.encoder = Encoder(self.config, self.rng, prefix='enc')
if self.config['shared_embed']:
self.decoder = Decoder(self.config, self.rng, prefix='dec', embed=self.encoder.Embed)
else:
self.decoder = Decoder(self.config, self.rng, prefix='dec')
"""
Build the Transformation
"""
if self.config['nonlinear_A']:
self.action_trans = Dense(
self.config['enc_hidden_dim'],
self.config['action_dim'],
activation='tanh',
name='action_transform'
)
else:
assert self.config['enc_hidden_dim'] == self.config['action_dim'], \
'hidden dimension must match action dimension'
self.action_trans = Identity(name='action_transform')
if self.config['nonlinear_B']:
self.context_trans = Dense(
self.config['action_dim'],
self.config['dec_contxt_dim'],
activation='tanh',
name='context_transform'
)
else:
assert self.config['dec_contxt_dim'] == self.config['action_dim'], \
'action dimension must match context dimension'
self.context_trans = Identity(name='context_transform')
# registration
self._add(self.action_trans)
self._add(self.context_trans)
self._add(self.encoder)
self._add(self.decoder)
# objectives and optimizers
self.optimizer = optimizers.get(self.config['optimizer'], kwargs={'lr': self.config['lr']})
logger.info("create Helmholtz RECURRENT neural network. ok")
def compile_train(self, mode='train'):
# questions (theano variables)
inputs = T.imatrix() # padded input word sequence (for training)
context = alloc_zeros_matrix(inputs.shape[0], self.config['dec_contxt_dim'])
assert context.ndim == 2
# decoding.
target = inputs
logPxz, logPPL = self.decoder.build_decoder(target, context)
# reconstruction loss
loss_rec = T.mean(-logPxz)
loss_ppl = T.exp(T.mean(-logPPL))
L1 = T.sum([T.sum(abs(w)) for w in self.params])
loss = loss_rec
updates = self.optimizer.get_updates(self.params, loss)
logger.info("compiling the compuational graph ::training function::")
train_inputs = [inputs]
self.train_ = theano.function(train_inputs,
[loss_rec, loss_ppl],
updates=updates,
name='train_fun')
logger.info("pre-training functions compile done.")
if mode == 'display' or mode == 'all':
"""
build the sampler function here <:::>
"""
# context vectors (as)
self.decoder.build_sampler()
logger.info("display functions compile done.")
# add monitoring:
self._monitoring()
# compiling monitoring
self.compile_monitoring(train_inputs)
class NRM(Model):
"""
Neural Responding Machine
An Encoder-Decoder-based responding model.
"""
def __init__(self,
config, n_rng, rng,
mode='Evaluation',
use_attention=False,
copynet=False,
identity=False):
super(NRM, self).__init__()
self.config = config
self.n_rng = n_rng # numpy random stream
self.rng = rng # Theano random stream
self.mode = mode
self.name = 'nrm'
self.attend = use_attention
self.copynet = copynet
self.identity = identity
def build_(self, lr=None, iterations=None):
logger.info("build the Neural Responding Machine")
# encoder-decoder:: <<==>>
self.encoder = Encoder(self.config, self.rng, prefix='enc', mode=self.mode)
if not self.attend:
self.decoder = Decoder(self.config, self.rng, prefix='dec', mode=self.mode)
else:
self.decoder = DecoderAtt(self.config, self.rng, prefix='dec', mode=self.mode,
copynet=self.copynet, identity=self.identity)
self._add(self.encoder)
self._add(self.decoder)
# objectives and optimizers
if self.config['optimizer'] == 'adam':
self.optimizer = optimizers.get(self.config['optimizer'],
kwargs=dict(rng=self.rng,
save=False))
else:
self.optimizer = optimizers.get(self.config['optimizer'])
if lr is not None:
self.optimizer.lr.set_value(floatX(lr))
if iterations is not None:
self.optimizer.iterations.set_value(floatX(iterations))
logger.info("build ok.")
def compile_(self, mode='all', contrastive=False):
# compile the computational graph.
# INFO: the parameters.
# mode: 'train'/ 'display'/ 'policy' / 'all'
ps = 'params: {\n'
for p in self.params:
ps += '{0}: {1}\n'.format(p.name, p.eval().shape)
ps += '}.'
logger.info(ps)
param_num = np.sum([np.prod(p.shape.eval()) for p in self.params])
logger.info("total number of the parameters of the model: {}".format(param_num))
if mode == 'train' or mode == 'all':
self.compile_train()
if mode == 'display' or mode == 'all':
self.compile_sample()
if mode == 'inference' or mode == 'all':
self.compile_inference()
def compile_train(self):
# questions (theano variables)
inputs = T.imatrix() # padded input word sequence (for training)
target = T.imatrix() # padded target word sequence (for training)
cc_matrix = T.tensor3()
# encoding & decoding
code, _, c_mask, _ = self.encoder.build_encoder(inputs, None, return_sequence=True, return_embed=True)
# code: (nb_samples, max_len, contxt_dim)
if 'explicit_loc' in self.config:
if self.config['explicit_loc']:
print 'use explicit location!!'
max_len = code.shape[1]
expLoc = T.eye(max_len, self.config['encode_max_len'], dtype='float32')[None, :, :]
expLoc = T.repeat(expLoc, code.shape[0], axis=0)
code = T.concatenate([code, expLoc], axis=2)
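# explicit_loc sketch: expLoc[:, i] is row i of an
# (max_len x encode_max_len) identity matrix, so each encoder annotation is
# tagged with a one-hot encoding of its absolute source position before
# decoding.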
logPxz, logPPL = self.decoder.build_decoder(target, cc_matrix,
code, c_mask)
# responding loss
loss_rec = T.mean(-logPxz)
loss_ppl = T.exp(T.mean(-logPPL))
loss = loss_rec
updates = self.optimizer.get_updates(self.params, loss)
logger.info("compiling the compuational graph ::training function::")
train_inputs = [inputs, target, cc_matrix]
self.train_ = theano.function(train_inputs,
[loss_rec, loss_ppl],
updates=updates,
name='train_fun')
self.train_guard = theano.function(train_inputs,
[loss_rec, loss_ppl],
updates=updates,
name='train_fun',
mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True))
logger.info("training functions compile done.")
# # add monitoring:
# self.monitor['context'] = context
# self._monitoring()
#
# # compiling monitoring
# self.compile_monitoring(train_inputs)
def compile_sample(self):
if not self.attend:
self.encoder.compile_encoder(with_context=False)
else:
self.encoder.compile_encoder(with_context=False, return_sequence=True, return_embed=True)
self.decoder.build_sampler()
logger.info("sampling functions compile done.")
def compile_inference(self):
pass
def generate_(self, inputs, mode='display', return_attend=False, return_all=False):
# assert self.config['sample_stoch'], 'RNNLM sampling must be stochastic'
# assert not self.config['sample_argmax'], 'RNNLM sampling cannot use argmax'
args = dict(k=self.config['sample_beam'],
maxlen=self.config['max_len'],
stochastic=self.config['sample_stoch'] if mode == 'display' else None,
argmax=self.config['sample_argmax'] if mode == 'display' else None,
return_attend=return_attend)
context, _, c_mask, _, Z, R = self.encoder.gtenc(inputs)
# c_mask[0, 3] = c_mask[0, 3] * 0
# L = context.shape[1]
# izz = np.concatenate([np.arange(3), np.asarray([1,2]), np.arange(3, L)])
# context = context[:, izz, :]
# c_mask = c_mask[:, izz]
# inputs = inputs[:, izz]
# context, _, c_mask, _ = self.encoder.encode(inputs)
# import pylab as plt
# # visualize_(plt.subplots(), Z[0][:, 300:], normal=False)
# visualize_(plt.subplots(), context[0], normal=False)
if 'explicit_loc' in self.config:
if self.config['explicit_loc']:
max_len = context.shape[1]
expLoc = np.eye(max_len, self.config['encode_max_len'], dtype='float32')[None, :, :]
expLoc = np.repeat(expLoc, context.shape[0], axis=0)
context = np.concatenate([context, expLoc], axis=2)
sample, score, ppp = self.decoder.get_sample(context, c_mask, inputs, **args)
if return_all:
return sample, score, ppp
if not args['stochastic']:
score = score / np.array([len(s) for s in sample])
idz = score.argmin()
sample = sample[idz]
score = score.min()
ppp = ppp[idz]
else:
score /= float(len(sample))
return sample, np.exp(score), ppp
def evaluate_(self, inputs, outputs, idx2word, inputs_unk=None, encode=True):
def cut_zero_yes(sample, idx2word, ppp=None, Lmax=None):
if Lmax is None:
Lmax = self.config['dec_voc_size']
if ppp is None:
if 0 not in sample:
return ['{}'.format(idx2word[w].encode('utf-8'))
if w < Lmax else '{}'.format(idx2word[inputs[w - Lmax]].encode('utf-8'))
for w in sample]
return ['{}'.format(idx2word[w].encode('utf-8'))
if w < Lmax else '{}'.format(idx2word[inputs[w - Lmax]].encode('utf-8'))
for w in sample[:sample.index(0)]]
else:
if 0 not in sample:
return ['{0} ({1:1.1f})'.format(
idx2word[w].encode('utf-8'), p)
if w < Lmax
else '{0} ({1:1.1f})'.format(
idx2word[inputs[w - Lmax]].encode('utf-8'), p)
for w, p in zip(sample, ppp)]
idz = sample.index(0)
return ['{0} ({1:1.1f})'.format(
idx2word[w].encode('utf-8'), p)
if w < Lmax
else '{0} ({1:1.1f})'.format(
idx2word[inputs[w - Lmax]].encode('utf-8'), p)
for w, p in zip(sample[:idz], ppp[:idz])]
def cut_zero_no(sample, idx2word, ppp=None, Lmax=None):
if Lmax is None:
Lmax = self.config['dec_voc_size']
if ppp is None:
if 0 not in sample:
return ['{}'.format(idx2word[w])
if w < Lmax else '{}'.format(idx2word[inputs[w - Lmax]].encode('utf-8'))
for w in sample]
return ['{}'.format(idx2word[w])
if w < Lmax else '{}'.format(idx2word[inputs[w - Lmax]].encode('utf-8'))
for w in sample[:sample.index(0)]]
else:
if 0 not in sample:
return ['{0} ({1:1.1f})'.format(
idx2word[w], p)
if w < Lmax
else '{0} ({1:1.1f})'.format(
idx2word[inputs[w - Lmax]], p)
for w, p in zip(sample, ppp)]
idz = sample.index(0)
return ['{0} ({1:1.1f})'.format(
idx2word[w].encode('utf-8'), p)
if w < Lmax
else '{0} ({1:1.1f})'.format(
idx2word[inputs[w - Lmax]], p)
for w, p in zip(sample[:idz], ppp[:idz])]
if inputs_unk is None:
result, _, ppp = self.generate_(inputs[None, :])
else:
result, _, ppp = self.generate_(inputs_unk[None, :])
if encode:
cut_zero = cut_zero_yes
else:
cut_zero = cut_zero_no
pp0, pp1 = [np.asarray(p) for p in zip(*ppp)]
pp = (pp1 - pp0) / pp1
print len(ppp)
print ' [lr={0}][iter={1}]'.format(self.optimizer.lr.get_value(),
self.optimizer.iterations.get_value())
a = '[SOURCE]: {}\n'.format(' '.join(cut_zero(inputs.tolist(), idx2word, Lmax=len(idx2word))))
b = '[TARGET]: {}\n'.format(' '.join(cut_zero(outputs.tolist(), idx2word, Lmax=len(idx2word))))
c = '[DECODE]: {}\n'.format(' '.join(cut_zero(result, idx2word)))
d = '[CpRate]: {}\n'.format(' '.join(cut_zero(result, idx2word, pp.tolist())))
e = '[CpRate]: {}\n'.format(' '.join(cut_zero(result, idx2word, result)))
print a
print '{0} -> {1}'.format(len(a.split()), len(b.split()))
if inputs_unk is not None:
k = '[_INPUT]: {}\n'.format(' '.join(cut_zero(inputs_unk.tolist(), idx2word, Lmax=len(idx2word))))
print k
a += k
print b
print c
print d
# print e
a += b + c + d
return a
def analyse_(self, inputs, outputs, idx2word, inputs_unk=None, return_attend=False, name=None, display=False):
def cut_zero(sample, idx2word, ppp=None, Lmax=None):
if Lmax is None:
Lmax = self.config['dec_voc_size']
if ppp is None:
if 0 not in sample:
return ['{}'.format(idx2word[w].encode('utf-8'))
if w < Lmax else '{}'.format(idx2word[inputs[w - Lmax]].encode('utf-8'))
for w in sample]
return ['{}'.format(idx2word[w].encode('utf-8'))
if w < Lmax else '{}'.format(idx2word[inputs[w - Lmax]].encode('utf-8'))
for w in sample[:sample.index(0)]]
else:
if 0 not in sample:
return ['{0} ({1:1.1f})'.format(
idx2word[w].encode('utf-8'), p)
if w < Lmax
else '{0} ({1:1.1f})'.format(
idx2word[inputs[w - Lmax]].encode('utf-8'), p)
for w, p in zip(sample, ppp)]
idz = sample.index(0)
return ['{0} ({1:1.1f})'.format(
idx2word[w].encode('utf-8'), p)
if w < Lmax
else '{0} ({1:1.1f})'.format(
idx2word[inputs[w - Lmax]].encode('utf-8'), p)
for w, p in zip(sample[:idz], ppp[:idz])]
if inputs_unk is None:
result, _, ppp = self.generate_(inputs[None, :],
return_attend=return_attend)
else:
result, _, ppp = self.generate_(inputs_unk[None, :],
return_attend=return_attend)
source = '{}'.format(' '.join(cut_zero(inputs.tolist(), idx2word, Lmax=len(idx2word))))
target = '{}'.format(' '.join(cut_zero(outputs.tolist(), idx2word, Lmax=len(idx2word))))
decode = '{}'.format(' '.join(cut_zero(result, idx2word)))
if display:
print source
print target
print decode
idz = result.index(0)
p1, p2 = [np.asarray(p) for p in zip(*ppp)]
print p1.shape
import pylab as plt
# plt.rc('text', usetex=True)
# plt.rc('font', family='serif')
visualize_(plt.subplots(), 1 - p1[:idz, :].T, grid=True, name=name)
visualize_(plt.subplots(), 1 - p2[:idz, :].T, name=name)
# visualize_(plt.subplots(), 1 - np.mean(p2[:idz, :], axis=1, keepdims=True).T)
return target == decode
def analyse_cover(self, inputs, outputs, idx2word, inputs_unk=None, return_attend=False, name=None, display=False):
def cut_zero(sample, idx2word, ppp=None, Lmax=None):
if Lmax is None:
Lmax = self.config['dec_voc_size']
if ppp is None:
if 0 not in sample:
return ['{}'.format(idx2word[w].encode('utf-8'))
if w < Lmax else '{}'.format(idx2word[inputs[w - Lmax]].encode('utf-8'))
for w in sample]
return ['{}'.format(idx2word[w].encode('utf-8'))
if w < Lmax else '{}'.format(idx2word[inputs[w - Lmax]].encode('utf-8'))
for w in sample[:sample.index(0)]]
else:
if 0 not in sample:
return ['{0} ({1:1.1f})'.format(
idx2word[w].encode('utf-8'), p)
if w < Lmax
else '{0} ({1:1.1f})'.format(
idx2word[inputs[w - Lmax]].encode('utf-8'), p)
for w, p in zip(sample, ppp)]
idz = sample.index(0)
return ['{0} ({1:1.1f})'.format(
idx2word[w].encode('utf-8'), p)
if w < Lmax
else '{0} ({1:1.1f})'.format(
idx2word[inputs[w - Lmax]].encode('utf-8'), p)
for w, p in zip(sample[:idz], ppp[:idz])]
if inputs_unk is None:
results, _, ppp = self.generate_(inputs[None, :],
return_attend=return_attend,
return_all=True)
else:
results, _, ppp = self.generate_(inputs_unk[None, :],
return_attend=return_attend,
return_all=True)
source = '{}'.format(' '.join(cut_zero(inputs.tolist(), idx2word, Lmax=len(idx2word))))
target = '{}'.format(' '.join(cut_zero(outputs.tolist(), idx2word, Lmax=len(idx2word))))
# decode = '{}'.format(' '.join(cut_zero(result, idx2word)))
score = [target == '{}'.format(' '.join(cut_zero(result, idx2word))) for result in results]
return max(score)
================================================
FILE: emolga/models/encdec.py
================================================
__author__ = 'jiataogu'
import theano
import logging
import copy
import emolga.basic.objectives as objectives
import emolga.basic.optimizers as optimizers
from theano.compile.nanguardmode import NanGuardMode
from emolga.layers.core import Dropout, Dense, Dense2, Identity
from emolga.layers.recurrent import *
from emolga.layers.ntm_minibatch import Controller
from emolga.layers.embeddings import *
from emolga.layers.attention import *
from core import Model
logger = logging.getLogger(__name__)
RNN = GRU # change it here for other RNN models.
########################################################################################################################
# Encoder/Decoder Blocks ::::
#
# Encoder Back-up
# class Encoder(Model):
# """
# Recurrent Neural Network-based Encoder
# It is used to compute the context vector.
# """
#
# def __init__(self,
# config, rng, prefix='enc',
# mode='Evaluation', embed=None, use_context=False):
# super(Encoder, self).__init__()
# self.config = config
# self.rng = rng
# self.prefix = prefix
# self.mode = mode
# self.name = prefix
# self.use_context = use_context
#
# """
# Create all elements of the Encoder's Computational graph
# """
# # create Embedding layers
# logger.info("{}_create embedding layers.".format(self.prefix))
# if embed:
# self.Embed = embed
# else:
# self.Embed = Embedding(
# self.config['enc_voc_size'],
# self.config['enc_embedd_dim'],
# name="{}_embed".format(self.prefix))
# self._add(self.Embed)
#
# if self.use_context:
# self.Initializer = Dense(
# config['enc_contxt_dim'],
# config['enc_hidden_dim'],
# activation='tanh',
# name="{}_init".format(self.prefix)
# )
# self._add(self.Initializer)
#
# """
# Encoder Core
# """
# if self.config['encoder'] == 'RNN':
# # create RNN cells
# if not self.config['bidirectional']:
# logger.info("{}_create RNN cells.".format(self.prefix))
# self.RNN = RNN(
# self.config['enc_embedd_dim'],
# self.config['enc_hidden_dim'],
# None if not use_context
# else self.config['enc_contxt_dim'],
# name="{}_cell".format(self.prefix)
# )
# self._add(self.RNN)
# else:
# logger.info("{}_create forward RNN cells.".format(self.prefix))
# self.forwardRNN = RNN(
# self.config['enc_embedd_dim'],
# self.config['enc_hidden_dim'],
# None if not use_context
# else self.config['enc_contxt_dim'],
# name="{}_fw_cell".format(self.prefix)
# )
# self._add(self.forwardRNN)
#
# logger.info("{}_create backward RNN cells.".format(self.prefix))
# self.backwardRNN = RNN(
# self.config['enc_embedd_dim'],
# self.config['enc_hidden_dim'],
# None if not use_context
# else self.config['enc_contxt_dim'],
# name="{}_bw_cell".format(self.prefix)
# )
# self._add(self.backwardRNN)
#
# logger.info("create encoder ok.")
#
# elif self.config['encoder'] == 'WS':
# # create weighted sum layers.
# if self.config['ws_weight']:
# self.WS = Dense(self.config['enc_embedd_dim'],
# self.config['enc_hidden_dim'], name='{}_ws'.format(self.prefix))
# self._add(self.WS)
#
# logger.info("create encoder ok.")
#
# def build_encoder(self, source, context=None, return_embed=False):
# """
# Build the Encoder Computational Graph
# """
# # Initial state
# Init_h = None
# if self.use_context:
# Init_h = self.Initializer(context)
#
# # word embedding
# if self.config['encoder'] == 'RNN':
# if not self.config['bidirectional']:
# X, X_mask = self.Embed(source, True)
# if not self.config['pooling']:
# X_out = self.RNN(X, X_mask, C=context, init_h=Init_h, return_sequence=False)
# else:
# X_out = self.RNN(X, X_mask, C=context, init_h=Init_h, return_sequence=True)
# else:
# source2 = source[:, ::-1]
# X, X_mask = self.Embed(source, True)
# X2, X2_mask = self.Embed(source2, True)
#
# if not self.config['pooling']:
# X_out1 = self.backwardRNN(X, X_mask, C=context, init_h=Init_h, return_sequence=False)
# X_out2 = self.forwardRNN( X2, X2_mask, C=context, init_h=Init_h, return_sequence=False)
# X_out = T.concatenate([X_out1, X_out2], axis=1)
# else:
# X_out1 = self.backwardRNN(X, X_mask, C=context, init_h=Init_h, return_sequence=True)
# X_out2 = self.forwardRNN( X2, X2_mask, C=context, init_h=Init_h, return_sequence=True)
# X_out = T.concatenate([X_out1, X_out2], axis=2)
#
# if self.config['pooling'] == 'max':
# X_out = T.max(X_out, axis=1)
# elif self.config['pooling'] == 'mean':
# X_out = T.mean(X_out, axis=1)
#
# elif self.config['encoder'] == 'WS':
# X, X_mask = self.Embed(source, True)
# if self.config['ws_weight']:
# X_out = T.sum(self.WS(X) * X_mask[:, :, None], axis=1) / T.sum(X_mask, axis=1, keepdims=True)
# else:
# assert self.config['enc_embedd_dim'] == self.config['enc_hidden_dim'], \
# 'directly sum should match the dimension'
# X_out = T.sum(X * X_mask[:, :, None], axis=1) / T.sum(X_mask, axis=1, keepdims=True)
# else:
# raise NotImplementedError
#
# if return_embed:
# return X_out, X, X_mask
# return X_out
#
# def compile_encoder(self, with_context=False):
# source = T.imatrix()
# if with_context:
# context = T.matrix()
# self.encode = theano.function([source, context],
# self.build_encoder(source, context))
# else:
# self.encode = theano.function([source],
# self.build_encoder(source, None))
class Encoder(Model):
"""
Recurrent Neural Network-based Encoder
It is used to compute the context vector.
"""
def __init__(self,
config, rng, prefix='enc',
mode='Evaluation', embed=None, use_context=False):
super(Encoder, self).__init__()
self.config = config
self.rng = rng
self.prefix = prefix
self.mode = mode
self.name = prefix
self.use_context = use_context
self.return_embed = False
self.return_sequence = False
"""
Create all elements of the Encoder's Computational graph
"""
# create Embedding layers
logger.info("{}_create embedding layers.".format(self.prefix))
if embed:
self.Embed = embed
else:
self.Embed = Embedding(
self.config['enc_voc_size'],
self.config['enc_embedd_dim'],
name="{}_embed".format(self.prefix))
self._add(self.Embed)
if self.use_context:
self.Initializer = Dense(
config['enc_contxt_dim'],
config['enc_hidden_dim'],
activation='tanh',
name="{}_init".format(self.prefix)
)
self._add(self.Initializer)
"""
Encoder Core
"""
# create RNN cells
if not self.config['bidirectional']:
logger.info("{}_create RNN cells.".format(self.prefix))
self.RNN = RNN(
self.config['enc_embedd_dim'],
self.config['enc_hidden_dim'],
None if not use_context
else self.config['enc_contxt_dim'],
name="{}_cell".format(self.prefix)
)
self._add(self.RNN)
else:
logger.info("{}_create forward RNN cells.".format(self.prefix))
self.forwardRNN = RNN(
self.config['enc_embedd_dim'],
self.config['enc_hidden_dim'],
None if not use_context
else self.config['enc_contxt_dim'],
name="{}_fw_cell".format(self.prefix)
)
self._add(self.forwardRNN)
logger.info("{}_create backward RNN cells.".format(self.prefix))
self.backwardRNN = RNN(
self.config['enc_embedd_dim'],
self.config['enc_hidden_dim'],
None if not use_context
else self.config['enc_contxt_dim'],
name="{}_bw_cell".format(self.prefix)
)
self._add(self.backwardRNN)
logger.info("create encoder ok.")
def build_encoder(self, source, context=None, return_embed=False, return_sequence=False):
"""
Build the Encoder Computational Graph
"""
# Initial state
Init_h = None
if self.use_context:
Init_h = self.Initializer(context)
# word embedding
if not self.config['bidirectional']:
X, X_mask = self.Embed(source, True)
X_out = self.RNN(X, X_mask, C=context, init_h=Init_h, return_sequence=return_sequence)
if return_sequence:
X_tail = X_out[:, -1]
else:
X_tail = X_out
else:
source2 = source[:, ::-1]
X, X_mask = self.Embed(source, True)
X2, X2_mask = self.Embed(source2, True)
X_out1 = self.backwardRNN(X, X_mask, C=context, init_h=Init_h, return_sequence=return_sequence)
X_out2 = self.forwardRNN(X2, X2_mask, C=context, init_h=Init_h, return_sequence=return_sequence)
if not return_sequence:
X_out = T.concatenate([X_out1, X_out2], axis=1)
X_tail = X_out
else:
X_out = T.concatenate([X_out1, X_out2[:, ::-1, :]], axis=2)
X_tail = T.concatenate([X_out1[:, -1], X_out2[:, -1]], axis=1)
X_mask = T.cast(X_mask, dtype='float32')
if return_embed:
return X_out, X, X_mask, X_tail
return X_out
def compile_encoder(self, with_context=False, return_embed=False, return_sequence=False):
source = T.imatrix()
self.return_embed = return_embed
self.return_sequence = return_sequence
if with_context:
context = T.matrix()
self.encode = theano.function([source, context],
self.build_encoder(source, context,
return_embed=return_embed,
return_sequence=return_sequence))
else:
self.encode = theano.function([source],
self.build_encoder(source, None,
return_embed=return_embed,
return_sequence=return_sequence))
class Decoder(Model):
"""
Recurrent Neural Network-based Decoder.
It is used for:
(1) Evaluation: compute the probability P(Y|X)
(2) Prediction: sample the best result based on P(Y|X)
(3) Beam-search
(4) Scheduled Sampling (how to implement it?)
"""
def __init__(self,
config, rng, prefix='dec',
mode='RNN', embed=None,
highway=False):
"""
mode = RNN: use an RNN Decoder
"""
super(Decoder, self).__init__()
self.config = config
self.rng = rng
self.prefix = prefix
self.name = prefix
self.mode = mode
self.highway = highway
self.init = initializations.get('glorot_uniform')
self.sigmoid = activations.get('sigmoid')
# use standard drop-out for input & output.
# I believe it should not be used for the context vector.
self.dropout = config['dropout']
if self.dropout > 0:
logger.info('Use standard-dropout!!!!')
self.D = Dropout(rng=self.rng, p=self.dropout, name='{}_Dropout'.format(prefix))
"""
Create all elements of the Decoder's computational graph.
"""
# create Embedding layers
logger.info("{}_create embedding layers.".format(self.prefix))
if embed:
self.Embed = embed
else:
self.Embed = Embedding(
self.config['dec_voc_size'],
self.config['dec_embedd_dim'],
name="{}_embed".format(self.prefix))
self._add(self.Embed)
# create Initialization Layers
logger.info("{}_create initialization layers.".format(self.prefix))
if not config['bias_code']:
self.Initializer = Zero()
else:
self.Initializer = Dense(
config['dec_contxt_dim'],
config['dec_hidden_dim'],
activation='tanh',
name="{}_init".format(self.prefix)
)
# create RNN cells
logger.info("{}_create RNN cells.".format(self.prefix))
self.RNN = RNN(
self.config['dec_embedd_dim'],
self.config['dec_hidden_dim'],
self.config['dec_contxt_dim'],
name="{}_cell".format(self.prefix)
)
self._add(self.Initializer)
self._add(self.RNN)
# HighWay Gating
if highway:
logger.info("HIGHWAY CONNECTION~~~!!!")
assert self.config['context_predict']
assert self.config['dec_contxt_dim'] == self.config['dec_hidden_dim']
self.C_x = self.init((self.config['dec_contxt_dim'],
self.config['dec_hidden_dim']))
self.H_x = self.init((self.config['dec_hidden_dim'],
self.config['dec_hidden_dim']))
self.b_x = initializations.get('zero')(self.config['dec_hidden_dim'])
self.C_x.name = '{}_Cx'.format(self.prefix)
self.H_x.name = '{}_Hx'.format(self.prefix)
self.b_x.name = '{}_bx'.format(self.prefix)
self.params += [self.C_x, self.H_x, self.b_x]
# create readout layers
logger.info("_create Readout layers")
# 1. hidden layers readout.
self.hidden_readout = Dense(
self.config['dec_hidden_dim'],
self.config['output_dim']
if self.config['deep_out']
else self.config['dec_voc_size'],
activation='linear',
name="{}_hidden_readout".format(self.prefix)
)
# 2. previous word readout
self.prev_word_readout = None
if self.config['bigram_predict']:
self.prev_word_readout = Dense(
self.config['dec_embedd_dim'],
self.config['output_dim']
if self.config['deep_out']
else self.config['dec_voc_size'],
activation='linear',
name="{}_prev_word_readout".format(self.prefix),
learn_bias=False
)
# 3. context readout
self.context_readout = None
if self.config['context_predict']:
if not self.config['leaky_predict']:
self.context_readout = Dense(
self.config['dec_contxt_dim'],
self.config['output_dim']
if self.config['deep_out']
else self.config['dec_voc_size'],
activation='linear',
name="{}_context_readout".format(self.prefix),
learn_bias=False
)
else:
assert self.config['dec_contxt_dim'] == self.config['dec_hidden_dim']
self.context_readout = self.hidden_readout
# option: deep output (maxout)
if self.config['deep_out']:
self.activ = Activation(config['deep_out_activ'])
# self.dropout = Dropout(rng=self.rng, p=config['dropout'])
self.output_nonlinear = [self.activ] # , self.dropout]
self.output = Dense(
self.config['output_dim'] / 2
if config['deep_out_activ'] == 'maxout2'
else self.config['output_dim'],
self.config['dec_voc_size'],
activation='softmax',
name="{}_output".format(self.prefix),
learn_bias=False
)
else:
self.output_nonlinear = []
self.output = Activation('softmax')
# registration:
self._add(self.hidden_readout)
if not self.config['leaky_predict']:
self._add(self.context_readout)
self._add(self.prev_word_readout)
self._add(self.output)
if self.config['deep_out']:
self._add(self.activ)
# self._add(self.dropout)
logger.info("create decoder ok.")
@staticmethod
def _grab_prob(probs, X):
assert probs.ndim == 3
batch_size = probs.shape[0]
max_len = probs.shape[1]
vocab_size = probs.shape[2]
probs = probs.reshape((batch_size * max_len, vocab_size))
return probs[T.arange(batch_size * max_len), X.flatten(1)].reshape(X.shape) # advanced indexing
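# Illustrative sketch (not part of the original model): the advanced
# indexing above is easier to see in NumPy. `_grab_prob_np` is a
# hypothetical helper; for every (sample, step) pair it picks the
# probability assigned to the word that was actually observed.
def _grab_prob_np(probs, X):
    import numpy as np
    # probs: (batch, max_len, vocab); X: (batch, max_len) integer word ids
    batch, max_len, vocab = probs.shape
    flat = probs.reshape(batch * max_len, vocab)
    return flat[np.arange(batch * max_len), X.flatten()].reshape(X.shape)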
"""
Build the decoder for evaluation
"""
def prepare_xy(self, target):
# Word embedding
Y, Y_mask = self.Embed(target, True) # (nb_samples, max_len, embedding_dim)
if self.config['use_input']:
X = T.concatenate([alloc_zeros_matrix(Y.shape[0], 1, Y.shape[2]), Y[:, :-1, :]], axis=1)
else:
X = 0 * Y
# option ## drop words.
X_mask = T.concatenate([T.ones((Y.shape[0], 1)), Y_mask[:, :-1]], axis=1)
Count = T.cast(T.sum(X_mask, axis=1), dtype=theano.config.floatX)
return X, X_mask, Y, Y_mask, Count
def build_decoder(self, target, context=None,
return_count=False,
train=True):
"""
Build the Decoder Computational Graph
For training/testing
"""
X, X_mask, Y, Y_mask, Count = self.prepare_xy(target)
# input drop-out if any.
if self.dropout > 0:
X = self.D(X, train=train)
# Initial state of RNN
Init_h = self.Initializer(context)
if not self.highway:
X_out = self.RNN(X, X_mask, C=context, init_h=Init_h, return_sequence=True)
# Readout
readout = self.hidden_readout(X_out)
if self.dropout > 0:
readout = self.D(readout, train=train)
if self.config['context_predict']:
readout += self.context_readout(context).dimshuffle(0, 'x', 1)
else:
X = X.dimshuffle((1, 0, 2))
X_mask = X_mask.dimshuffle((1, 0))
def _recurrence(x, x_mask, prev_h, c):
# compute the highway gate for context vector.
xx = dot(c, self.C_x, self.b_x) + dot(prev_h, self.H_x) # highway gate.
xx = self.sigmoid(xx)
cy = xx * c # the path without using RNN
x_out = self.RNN(x, mask=x_mask, C=c, init_h=prev_h, one_step=True)
hx = (1 - xx) * x_out
return x_out, hx, cy
outputs, _ = theano.scan(
_recurrence,
sequences=[X, X_mask],
outputs_info=[Init_h, None, None],
non_sequences=[context]
)
# hidden readout + context readout
readout = self.hidden_readout( outputs[1].dimshuffle((1, 0, 2)))
if self.dropout > 0:
readout = self.D(readout, train=train)
readout += self.context_readout(outputs[2].dimshuffle((1, 0, 2)))
# return to normal size.
X = X.dimshuffle((1, 0, 2))
X_mask = X_mask.dimshuffle((1, 0))
if self.config['bigram_predict']:
readout += self.prev_word_readout(X)
for l in self.output_nonlinear:
readout = l(readout)
prob_dist = self.output(readout) # (nb_samples, max_len, vocab_size)
# log_old = T.sum(T.log(self._grab_prob(prob_dist, target)), axis=1)
log_prob = T.sum(T.log(self._grab_prob(prob_dist, target)) * X_mask, axis=1)
log_ppl = log_prob / Count
if return_count:
return log_prob, Count
else:
return log_prob, log_ppl
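# Illustrative sketch (not part of the original model): a NumPy version of
# the masked log-likelihood just computed. `_masked_loglik_np` is a
# hypothetical helper; padding steps are zeroed out by the mask, and
# dividing by the token count gives the per-word log-probability used for
# perplexity. Assumes word_probs > 0 (softmax outputs are strictly positive).
def _masked_loglik_np(word_probs, mask):
    import numpy as np
    # word_probs: (batch, max_len) probability of each observed word
    # mask:       (batch, max_len) 1 for real tokens, 0 for padding
    log_prob = (np.log(word_probs) * mask).sum(axis=1)
    count = mask.sum(axis=1)
    return log_prob, log_prob / count  # log P(Y|X), per-word log-prob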
"""
Sample one step
"""
def _step_sample(self, prev_word, prev_stat, context):
# word embedding (note that for the first word, embedding should be all zero)
if self.config['use_input']:
X = T.switch(
prev_word[:, None] < 0,
alloc_zeros_matrix(prev_word.shape[0], self.config['dec_embedd_dim']),
self.Embed(prev_word)
)
else:
X = alloc_zeros_matrix(prev_word.shape[0], self.config['dec_embedd_dim'])
if self.dropout > 0:
X = self.D(X, train=False)
# apply one step of RNN
if not self.highway:
X_proj = self.RNN(X, C=context, init_h=prev_stat, one_step=True)
next_stat = X_proj
# compute the readout probability distribution and sample it
# here the readout is a matrix, different from the learner.
readout = self.hidden_readout(next_stat)
if self.dropout > 0:
readout = self.D(readout, train=False)
if self.config['context_predict']:
readout += self.context_readout(context)
else:
xx = dot(context, self.C_x, self.b_x) + dot(prev_stat, self.H_x) # highway gate.
xx = self.sigmoid(xx)
X_proj = self.RNN(X, C=context, init_h=prev_stat, one_step=True)
next_stat = X_proj
readout = self.hidden_readout((1 - xx) * X_proj)
if self.dropout > 0:
readout = self.D(readout, train=False)
readout += self.context_readout(xx * context)
if self.config['bigram_predict']:
readout += self.prev_word_readout(X)
for l in self.output_nonlinear:
readout = l(readout)
next_prob = self.output(readout)
next_sample = self.rng.multinomial(pvals=next_prob).argmax(1)
return next_prob, next_sample, next_stat
"""
Build the sampler for sampling/greedy search/beam search
"""
def build_sampler(self):
"""
Build a sampler which only steps once.
Typically it only works one word at a time.
"""
logger.info("build sampler ...")
if self.config['sample_stoch'] and self.config['sample_argmax']:
logger.info("use argmax search!")
elif self.config['sample_stoch'] and (not self.config['sample_argmax']):
logger.info("use stochastic sampling!")
elif self.config['sample_beam'] > 1:
logger.info("use beam search! (beam_size={})".format(self.config['sample_beam']))
# initial state of our Decoder.
context = T.matrix() # theano variable.
init_h = self.Initializer(context)
logger.info('compile the function: get_init_state')
self.get_init_state \
= theano.function([context], init_h, name='get_init_state')
logger.info('done.')
# word sampler: 1 x 1
prev_word = T.vector('prev_word', dtype='int64')
prev_stat = T.matrix('prev_state', dtype='float32')
next_prob, next_sample, next_stat \
= self._step_sample(prev_word, prev_stat, context)
# next word probability
logger.info('compile the function: sample_next')
inputs = [prev_word, prev_stat, context]
outputs = [next_prob, next_sample, next_stat]
self.sample_next = theano.function(inputs, outputs, name='sample_next')
logger.info('done')
pass
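# Illustrative sketch (not part of the original model): how the two
# compiled functions above are meant to be driven. `_greedy_decode` is a
# hypothetical driver mirroring the argmax branch of get_sample below.
def _greedy_decode(get_init_state, sample_next, context, maxlen=30):
    import numpy as np
    state = get_init_state(context)
    word = -1 * np.ones((1,), dtype='int64')  # -1 marks the first (BOS) step
    out = []
    for _ in range(maxlen):
        prob, word, state = sample_next(word, state, context)
        word[0] = prob[0].argmax()            # greedy choice
        out.append(word[0])
        if word[0] == 0:                      # id 0 is end-of-sequence
            break
    return out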
"""
Build a Stochastic Sampler which can use SCAN to work on GPU.
However it cannot be used in Beam-search.
"""
def build_stochastic_sampler(self):
context = T.matrix()
init_h = self.Initializer(context)
logger.info('compile the function: sample')
pass
"""
Generate samples, either with stochastic sampling or beam-search!
"""
def get_sample(self, context, k=1, maxlen=30, stochastic=True, argmax=False, fixlen=False):
# beam size
if k > 1:
assert not stochastic, 'Beam search does not support stochastic sampling!!'
# fix length cannot use beam search
# if fixlen:
# assert k == 1
# prepare for searching
sample = []
score = []
if stochastic:
score = 0
live_k = 1
dead_k = 0
hyp_samples = [[]] * live_k
hyp_scores = np.zeros(live_k).astype(theano.config.floatX)
hyp_states = []
# get initial state of decoder RNN with context
next_state = self.get_init_state(context)
next_word = -1 * np.ones((1,)).astype('int64') # indicator for the first target word (bos target)
# Start searching!
for ii in xrange(maxlen):
# print next_word
ctx = np.tile(context, [live_k, 1])
next_prob, next_word, next_state \
= self.sample_next(next_word, next_state, ctx) # wtf.
if stochastic:
# using stochastic sampling (or greedy sampling.)
if argmax:
nw = next_prob[0].argmax()
next_word[0] = nw
else:
nw = next_word[0]
sample.append(nw)
score += next_prob[0, nw]
if (not fixlen) and (nw == 0): # sample reached the end
break
else:
# using beam-search
# we can only compute it in a flattened way!
cand_scores = hyp_scores[:, None] - np.log(next_prob)
cand_flat = cand_scores.flatten()
ranks_flat = cand_flat.argsort()[:(k - dead_k)]
# fetch the best results.
voc_size = next_prob.shape[1]
trans_index = ranks_flat // voc_size
word_index = ranks_flat % voc_size
costs = cand_flat[ranks_flat]
# get the new hyp samples
new_hyp_samples = []
new_hyp_scores = np.zeros(k - dead_k).astype(theano.config.floatX)
new_hyp_states = []
for idx, [ti, wi] in enumerate(zip(trans_index, word_index)):
new_hyp_samples.append(hyp_samples[ti] + [wi])
new_hyp_scores[idx] = copy.copy(costs[idx])
new_hyp_states.append(copy.copy(next_state[ti]))
# check the finished samples
new_live_k = 0
hyp_samples = []
hyp_scores = []
hyp_states = []
for idx in xrange(len(new_hyp_samples)):
if (new_hyp_samples[idx][-1] == 0) and (not fixlen):  # last word is EOS
sample.append(new_hyp_samples[idx])
score.append(new_hyp_scores[idx])
dead_k += 1
else:
new_live_k += 1
hyp_samples.append(new_hyp_samples[idx])
hyp_scores.append(new_hyp_scores[idx])
hyp_states.append(new_hyp_states[idx])
hyp_scores = np.array(hyp_scores)
live_k = new_live_k
if new_live_k < 1:
break
if dead_k >= k:
break
next_word = np.array([w[-1] for w in hyp_samples])
next_state = np.array(hyp_states)
pass
pass
# end.
if not stochastic:
# dump every remaining one
if live_k > 0:
for idx in xrange(live_k):
sample.append(hyp_samples[idx])
score.append(hyp_scores[idx])
return sample, score
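# Illustrative sketch (not part of the original model): one beam-search
# expansion step, mirroring the bookkeeping above. `_beam_step_np` is a
# hypothetical helper; costs are accumulated negative log-probabilities,
# flattened over (hypothesis, word) pairs so the k cheapest candidates can
# be picked in a single argsort.
def _beam_step_np(hyp_scores, next_prob, k):
    import numpy as np
    # hyp_scores: (live_k,) accumulated costs; next_prob: (live_k, vocab)
    cand = hyp_scores[:, None] - np.log(next_prob)
    flat = cand.flatten()
    best = flat.argsort()[:k]
    trans_index = best // next_prob.shape[1]  # which hypothesis to extend
    word_index = best % next_prob.shape[1]    # which word to append
    return trans_index, word_index, flat[best]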
class DecoderAtt(Decoder):
"""
Recurrent Neural Network-based Decoder
with Attention Mechanism
"""
def __init__(self,
config, rng, prefix='dec',
mode='RNN', embed=None,
copynet=False, identity=False):
super(DecoderAtt, self).__init__(
config, rng, prefix,
mode, embed, False)
self.copynet = copynet
self.identity = identity
# attention reader
self.attention_reader = Attention(
self.config['dec_hidden_dim'],
self.config['dec_contxt_dim'],
1000,
name='source_attention'
)
self._add(self.attention_reader)
# if use copynet
if self.copynet:
if not self.identity:
self.Is = Dense(
self.config['dec_contxt_dim'],
self.config['dec_embedd_dim'],
name='in-trans'
)
else:
assert self.config['dec_contxt_dim'] == self.config['dec_embedd_dim']
self.Is = Identity(name='ini')
self.Os = Dense(
self.config['dec_readout_dim'],
self.config['dec_contxt_dim'],
name='out-trans'
)
self._add(self.Is)
self._add(self.Os)
logger.info('adjust decoder ok.')
"""
Build the decoder for evaluation
"""
def prepare_xy(self, target, context=None):
if not self.copynet:
# Word embedding
Y, Y_mask = self.Embed(target, True) # (nb_samples, max_len, embedding_dim)
else:
Y, Y_mask = self.Embed(target, True, context=self.Is(context))
if self.config['use_input']:
X = T.concatenate([alloc_zeros_matrix(Y.shape[0], 1, Y.shape[2]), Y[:, :-1, :]], axis=1)
else:
X = 0 * Y
X_mask = T.concatenate([T.ones((Y.shape[0], 1)), Y_mask[:, :-1]], axis=1)
Count = T.cast(T.sum(X_mask, axis=1), dtype=theano.config.floatX)
return X, X_mask, Y, Y_mask, Count
def build_decoder(self,
target,
context, c_mask,
return_count=False,
train=True):
"""
Build the Computational Graph ::> Context is essential
"""
assert c_mask is not None, 'context must be supplied for this decoder.'
assert context.ndim == 3, 'context must have 3 dimensions.'
# context: (nb_samples, max_len, contxt_dim)
X, X_mask, Y, Y_mask, Count = self.prepare_xy(target, context)
# input drop-out if any.
if self.dropout > 0:
X = self.D(X, train=train)
# Initial state of RNN
Init_h = self.Initializer(context[:, 0, :]) # default order ->
X = X.dimshuffle((1, 0, 2))
X_mask = X_mask.dimshuffle((1, 0))
def _recurrence(x, x_mask, prev_h, cc, cm):
# compute the attention and get the context vector
prob = self.attention_reader(prev_h, cc, Smask=cm)
c = T.sum(cc * prob[:, :, None], axis=1)
x_out = self.RNN(x, mask=x_mask, C=c, init_h=prev_h, one_step=True)
return x_out, prob, c
outputs, _ = theano.scan(
_recurrence,
sequences=[X, X_mask],
outputs_info=[Init_h, None, None],
non_sequences=[context, c_mask]
)
X_out, Probs, Ctx = [z.dimshuffle((1, 0, 2)) for z in outputs]
# return to normal size.
X = X.dimshuffle((1, 0, 2))
X_mask = X_mask.dimshuffle((1, 0))
# Readout
readin = [X_out]
readout = self.hidden_readout(X_out)
if self.dropout > 0:
readout = self.D(readout, train=train)
if self.config['context_predict']:
readin += [Ctx]
readout += self.context_readout(Ctx)
if self.config['bigram_predict']:
readin += [X]
readout += self.prev_word_readout(X)
for l in self.output_nonlinear:
readout = l(readout)
if self.copynet:
readin = T.concatenate(readin, axis=-1)
key = self.Os(readin)
# (nb_samples, max_len_T, embed_size) :: key
# (nb_samples, max_len_S, embed_size) :: context
Eng = T.sum(key[:, :, None, :] * context[:, None, :, :], axis=-1)
# (nb_samples, max_len_T, max_len_S) :: Eng
EngSum = logSumExp(Eng, axis=2, mask=c_mask[:, None, :], c=readout)
prob_dist = T.concatenate([T.exp(readout - EngSum), T.exp(Eng - EngSum) * c_mask[:, None, :]], axis=-1)
else:
prob_dist = self.output(readout) # (nb_samples, max_len, vocab_size)
log_prob = T.sum(T.log(self._grab_prob(prob_dist, target)) * X_mask, axis=1)
log_ppl = log_prob / Count
if return_count:
return log_prob, Count
else:
return log_prob, log_ppl
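# Illustrative sketch (not part of the original model): the CopyNet output
# layer above in NumPy, one time step at a time. `_copy_softmax_np` is a
# hypothetical helper; generate-mode scores over the vocabulary and
# copy-mode energies over source positions are normalized by a single
# shared softmax (the logSumExp above), and masked source positions get
# zero probability.
def _copy_softmax_np(gen_scores, copy_eng, c_mask):
    import numpy as np
    # gen_scores: (batch, vocab); copy_eng, c_mask: (batch, src_len)
    masked = np.where(c_mask > 0, copy_eng, -1e30)
    scores = np.concatenate([gen_scores, masked], axis=-1)
    m = scores.max(axis=-1, keepdims=True)   # stable log-sum-exp
    probs = np.exp(scores - m)
    return probs / probs.sum(axis=-1, keepdims=True)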
"""
Sample one step
"""
def _step_sample(self, prev_word, prev_stat, context, c_mask):
assert c_mask is not None, 'we need the source mask.'
# word embedding (note that for the first word, embedding should be all zero)
if self.config['use_input']:
if not self.copynet:
X = T.switch(
prev_word[:, None] < 0,
alloc_zeros_matrix(prev_word.shape[0], self.config['dec_embedd_dim']),
self.Embed(prev_word)
)
else:
X = T.switch(
prev_word[:, None] < 0,
alloc_zeros_matrix(prev_word.shape[0], self.config['dec_embedd_dim']),
self.Embed(prev_word, context=self.Is(context))
)
else:
X = alloc_zeros_matrix(prev_word.shape[0], self.config['dec_embedd_dim'])
if self.dropout > 0:
X = self.D(X, train=False)
# apply one step of RNN
Probs = self.attention_reader(prev_stat, context, c_mask)
cxt = T.sum(context * Probs[:, :, None], axis=1)
X_proj = self.RNN(X, C=cxt, init_h=prev_stat, one_step=True)
next_stat = X_proj
# compute the readout probability distribution and sample it
# here the readout is a matrix, different from the learner.
readout = self.hidden_readout(next_stat)
readin = [next_stat]
if self.dropout > 0:
readout = self.D(readout, train=False)
if self.config['context_predict']:
readout += self.context_readout(cxt)
readin += [cxt]
if self.config['bigram_predict']:
readout += self.prev_word_readout(X)
readin += [X]
for l in self.output_nonlinear:
readout = l(readout)
if self.copynet:
readin = T.concatenate(readin, axis=-1)
key = self.Os(readin)
# (nb_samples, embed_size) :: key
# (nb_samples, max_len_S, embed_size) :: context
Eng = T.sum(key[:, None, :] * context[:, :, :], axis=-1)
# (nb_samples, max_len_S) :: Eng
EngSum = logSumExp(Eng, axis=-1, mask=c_mask, c=readout)
next_prob = T.concatenate([T.exp(readout - EngSum), T.exp(Eng - EngSum) * c_mask], axis=-1)
else:
next_prob = self.output(readout) # (nb_samples, max_len, vocab_size)
next_sample = self.rng.multinomial(pvals=next_prob).argmax(1)
return next_prob, next_sample, next_stat
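# Illustrative sketch (not part of the original model): the attentive read
# used in the step above. `_attend_np` is a hypothetical helper; given
# softmax attention weights over the source states, a weighted sum yields
# the context vector fed to the RNN.
def _attend_np(weights, context):
    # weights: (batch, src_len); context: (batch, src_len, dim)
    return (context * weights[:, :, None]).sum(axis=1)  # (batch, dim)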
def build_sampler(self):
"""
Build a sampler which only steps once.
Typically it only works one word at a time.
"""
logger.info("build sampler ...")
if self.config['sample_stoch'] and self.config['sample_argmax']:
logger.info("use argmax search!")
elif self.config['sample_stoch'] and (not self.config['sample_argmax']):
logger.info("use stochastic sampling!")
elif self.config['sample_beam'] > 1:
logger.info("use beam search! (beam_size={})".format(self.config['sample_beam']))
# initial state of our Decoder.
context = T.tensor3() # theano variable.
c_mask = T.matrix() # mask of the input sentence.
init_h = self.Initializer(context[:, 0, :])
logger.info('compile the function: get_init_state')
self.get_init_state \
= theano.function([context], init_h, name='get_init_state')
logger.info('done.')
# word sampler: 1 x 1
prev_word = T.vector('prev_word', dtype='int64')
prev_stat = T.matrix('prev_state', dtype='float32')
next_prob, next_sample, next_stat \
= self._step_sample(prev_word, prev_stat, context, c_mask)
# next word probability
logger.info('compile the function: sample_next')
inputs = [prev_word, prev_stat, context, c_mask]
outputs = [next_prob, next_sample, next_stat]
self.sample_next = theano.function(inputs, outputs, name='sample_next')
logger.info('done')
pass
"""
Generate samples, either with stochastic sampling or beam-search!
"""
def get_sample(self, context, c_mask, k=1, maxlen=30, stochastic=True, argmax=False, fixlen=False):
# beam size
if k > 1:
assert not stochastic, 'Beam search does not support stochastic sampling!!'
# fix length cannot use beam search
# if fixlen:
# assert k == 1
# prepare for searching
sample = []
score = []
if stochastic:
score = 0
live_k = 1
dead_k = 0
hyp_samples = [[]] * live_k
hyp_scores = np.zeros(live_k).astype(theano.config.floatX)
hyp_states = []
# get initial state of decoder RNN with context
next_state = self.get_init_state(context)
next_word = -1 * np.ones((1,)).astype('int64') # indicator for the first target word (bos target)
# Start searching!
for ii in xrange(maxlen):
# print next_word
ctx = np.tile(context, [live_k, 1, 1])
cmk = np.tile(c_mask, [live_k, 1])
next_prob, next_word, next_state \
= self.sample_next(next_word, next_state, ctx, cmk)
if stochastic:
# using stochastic sampling (or greedy sampling.)
if argmax:
nw = next_prob[0].argmax()
next_word[0] = nw
else:
nw = next_word[0]
sample.append(nw)
score += next_prob[0, nw]
if (not fixlen) and (nw == 0): # sample reached the end
break
else:
# using beam-search
# we can only compute it in a flattened way!
cand_scores = hyp_scores[:, None] - np.log(next_prob)
cand_flat = cand_scores.flatten()
ranks_flat = cand_flat.argsort()[:(k - dead_k)]
# fetch the best results.
voc_size = next_prob.shape[1]
trans_index = ranks_flat // voc_size
word_index = ranks_flat % voc_size
costs = cand_flat[ranks_flat]
# get the new hyp samples
new_hyp_samples = []
new_hyp_scores = np.zeros(k - dead_k).astype(theano.config.floatX)
new_hyp_states = []
for idx, [ti, wi] in enumerate(zip(trans_index, word_index)):
new_hyp_samples.append(hyp_samples[ti] + [wi])
new_hyp_scores[idx] = copy.copy(costs[idx])
new_hyp_states.append(copy.copy(next_state[ti]))
# check the finished samples
new_live_k = 0
hyp_samples = []
hyp_scores = []
hyp_states = []
for idx in xrange(len(new_hyp_samples)):
if (new_hyp_samples[idx][-1] == 0) and (not fixlen):  # last word is EOS
sample.append(new_hyp_samples[idx])
score.append(new_hyp_scores[idx])
dead_k += 1
else:
new_live_k += 1
hyp_samples.append(new_hyp_samples[idx])
hyp_scores.append(new_hyp_scores[idx])
hyp_states.append(new_hyp_states[idx])
hyp_scores = np.array(hyp_scores)
live_k = new_live_k
if new_live_k < 1:
break
if dead_k >= k:
break
next_word = np.array([w[-1] for w in hyp_samples])
next_state = np.array(hyp_states)
pass
pass
# end.
if not stochastic:
# dump every remaining one
if live_k > 0:
for idx in xrange(live_k):
sample.append(hyp_samples[idx])
score.append(hyp_scores[idx])
return sample, score
class FnnDecoder(Model):
def __init__(self, config, rng, prefix='fnndec'):
"""
mode = RNN: use a RNN Decoder
"""
super(FnnDecoder, self).__init__()
self.config = config
self.rng = rng
self.prefix = prefix
self.name = prefix
"""
Create Dense Predictor.
"""
self.Tr = Dense(self.config['dec_contxt_dim'],
self.config['dec_hidden_dim'],
activation='maxout2',
name='{}_Tr'.format(prefix))
self._add(self.Tr)
self.Pr = Dense(self.config['dec_hidden_dim'] / 2,
self.config['dec_voc_size'],
activation='softmax',
name='{}_Pr'.format(prefix))
self._add(self.Pr)
logger.info("FF decoder ok.")
@staticmethod
def _grab_prob(probs, X):
assert probs.ndim == 3
batch_size = probs.shape[0]
max_len = probs.shape[1]
vocab_size = probs.shape[2]
probs = probs.reshape((batch_size * max_len, vocab_size))
return probs[T.arange(batch_size * max_len), X.flatten(1)].reshape(X.shape) # advanced indexing
def build_decoder(self, target, context):
"""
Build the Decoder Computational Graph
"""
prob_dist = self.Pr(self.Tr(context[:, None, :]))
log_prob = T.sum(T.log(self._grab_prob(prob_dist, target)), axis=1)
return log_prob
def build_sampler(self):
context = T.matrix()
prob_dist = self.Pr(self.Tr(context))
next_sample = self.rng.multinomial(pvals=prob_dist).argmax(1)
self.sample_next = theano.function([context], [prob_dist, next_sample], name='sample_next_{}'.format(self.prefix))
logger.info('done')
def get_sample(self, context, argmax=True):
prob, sample = self.sample_next(context)
if argmax:
return prob[0].argmax()
else:
return sample[0]
########################################################################################################################
# Encoder-Decoder Models ::::
#
class RNNLM(Model):
"""
RNN-LM, with context vector = 0.
It is very similar to the implementation of the VAE.
"""
def __init__(self,
config, n_rng, rng,
mode='Evaluation'):
super(RNNLM, self).__init__()
self.config = config
self.n_rng = n_rng # numpy random stream
self.rng = rng # Theano random stream
self.mode = mode
self.name = 'rnnlm'
def build_(self):
logger.info("build the RNN-decoder")
self.decoder = Decoder(self.config, self.rng, prefix='dec', mode=self.mode)
# registration:
self._add(self.decoder)
# objectives and optimizers
self.optimizer = optimizers.get('adadelta')
# save the initial memories
if self.config['mode'] == 'NTM':
self.memory = initializations.get('glorot_uniform')(
(self.config['dec_memory_dim'], self.config['dec_memory_wdth']))
logger.info("create the RECURRENT language model. ok")
def compile_(self, mode='train', contrastive=False):
# compile the computational graph.
# INFO: the parameters.
# mode: 'train'/ 'display'/ 'policy' / 'all'
ps = 'params: {\n'
for p in self.params:
ps += '{0}: {1}\n'.format(p.name, p.eval().shape)
ps += '}.'
logger.info(ps)
param_num = np.sum([np.prod(p.shape.eval()) for p in self.params])
logger.info("total number of the parameters of the model: {}".format(param_num))
if mode == 'train' or mode == 'all':
if not contrastive:
self.compile_train()
else:
self.compile_train_CE()
if mode == 'display' or mode == 'all':
self.compile_sample()
if mode == 'inference' or mode == 'all':
self.compile_inference()
def compile_train(self):
# questions (theano variables)
inputs = T.imatrix() # padded input word sequence (for training)
if self.config['mode'] == 'RNN':
context = alloc_zeros_matrix(inputs.shape[0], self.config['dec_contxt_dim'])
elif self.config['mode'] == 'NTM':
context = T.repeat(self.memory[None, :, :], inputs.shape[0], axis=0)
else:
raise NotImplementedError
# decoding.
target = inputs
logPxz, logPPL = self.decoder.build_decoder(target, context)
# reconstruction loss
loss_rec = T.mean(-logPxz)
loss_ppl = T.exp(T.mean(-logPPL))
L1 = T.sum([T.sum(abs(w)) for w in self.params])
loss = loss_rec
updates = self.optimizer.get_updates(self.params, loss)
logger.info("compiling the compuational graph ::training function::")
train_inputs = [inputs]
self.train_ = theano.function(train_inputs,
[loss_rec, loss_ppl],
updates=updates,
name='train_fun')
logger.info("pre-training functions compile done.")
# add monitoring:
self.monitor['context'] = context
self._monitoring()
# compiling monitoring
self.compile_monitoring(train_inputs)
@abstractmethod
def compile_train_CE(self):
pass
def compile_sample(self):
# context vectors (as)
self.decoder.build_sampler()
logger.info("display functions compile done.")
@abstractmethod
def compile_inference(self):
pass
def default_context(self):
if self.config['mode'] == 'RNN':
return np.zeros(shape=(1, self.config['dec_contxt_dim']), dtype=theano.config.floatX)
elif self.config['mode'] == 'NTM':
memory = self.memory.get_value()
memory = memory.reshape((1, memory.shape[0], memory.shape[1]))
return memory
def generate_(self, context=None, max_len=None, mode='display'):
"""
:param action: action vector to guide the question.
If None, use a Gaussian to simulate the action.
:return: question sentence in natural language.
"""
# assert self.config['sample_stoch'], 'RNNLM sampling must be stochastic'
# assert not self.config['sample_argmax'], 'RNNLM sampling cannot use argmax'
if context is None:
context = self.default_context()
args = dict(k=self.config['sample_beam'],
maxlen=self.config['max_len'] if not max_len else max_len,
stochastic=self.config['sample_stoch'] if mode == 'display' else None,
argmax=self.config['sample_argmax'] if mode == 'display' else None)
sample, score = self.decoder.get_sample(context, **args)
if not args['stochastic']:
score = score / np.array([len(s) for s in sample])
sample = sample[score.argmin()]
score = score.min()
else:
score /= float(len(sample))
return sample, np.exp(score)
class AutoEncoder(RNNLM):
"""
Regular Auto-Encoder: RNN Encoder/Decoder
"""
def __init__(self,
config, n_rng, rng,
mode='Evaluation'):
super(RNNLM, self).__init__()
self.config = config
self.n_rng = n_rng # numpy random stream
self.rng = rng # Theano random stream
self.mode = mode
self.name = 'vae'
def build_(self):
logger.info("build the RNN auto-encoder")
self.encoder = Encoder(self.config, self.rng, prefix='enc')
if self.config['shared_embed']:
self.decoder = Decoder(self.config, self.rng, prefix='dec', embed=self.encoder.Embed)
else:
self.decoder = Decoder(self.config, self.rng, prefix='dec')
"""
Build the Transformation
"""
if self.config['nonlinear_A']:
self.action_trans = Dense(
self.config['enc_hidden_dim'],
self.config['action_dim'],
activation='tanh',
name='action_transform'
)
else:
assert self.config['enc_hidden_dim'] == self.config['action_dim'], \
'hidden dimension must match action dimension'
self.action_trans = Identity(name='action_transform')
if self.config['nonlinear_B']:
self.context_trans = Dense(
self.config['action_dim'],
self.config['dec_contxt_dim'],
activation='tanh',
name='context_transform'
)
else:
assert self.config['dec_contxt_dim'] == self.config['action_dim'], \
'action dimension must match context dimension'
self.context_trans = Identity(name='context_transform')
# registration
self._add(self.action_trans)
self._add(self.context_trans)
self._add(self.encoder)
self._add(self.decoder)
# objectives and optimizers
self.optimizer = optimizers.get(self.config['optimizer'], kwargs={'lr': self.config['lr']})
logger.info("create Helmholtz RECURRENT neural network. ok")
def compile_train(self, mode='train'):
# questions (theano variables)
inputs = T.imatrix() # padded input word sequence (for training)
context = alloc_zeros_matrix(inputs.shape[0], self.config['dec_contxt_dim'])
assert context.ndim == 2
# decoding.
target = inputs
logPxz, logPPL = self.decoder.build_decoder(target, context)
# reconstruction loss
loss_rec = T.mean(-logPxz)
loss_ppl = T.exp(T.mean(-logPPL))
L1 = T.sum([T.sum(abs(w)) for w in self.params])
loss = loss_rec
updates = self.optimizer.get_updates(self.params, loss)
logger.info("compiling the compuational graph ::training function::")
train_inputs = [inputs]
self.train_ = theano.function(train_inputs,
[loss_rec, loss_ppl],
updates=updates,
name='train_fun')
logger.info("pre-training functions compile done.")
if mode == 'display' or mode == 'all':
"""
build the sampler function here <:::>
"""
# context vectors (as)
self.decoder.build_sampler()
logger.info("display functions compile done.")
# add monitoring:
self._monitoring()
# compiling monitoring
self.compile_monitoring(train_inputs)
class NRM(Model):
"""
Neural Responding Machine
A Encoder-Decoder based responding model.
"""
def __init__(self,
config, n_rng, rng,
mode='Evaluation',
use_attention=False,
copynet=False,
identity=False):
super(NRM, self).__init__()
self.config = config
self.n_rng = n_rng # numpy random stream
self.rng = rng # Theano random stream
self.mode = mode
self.name = 'nrm'
self.attend = use_attention
self.copynet = copynet
self.identity = identity
def build_(self):
logger.info("build the Neural Responding Machine")
# encoder-decoder:: <<==>>
self.encoder = Encoder(self.config, self.rng, prefix='enc', mode=self.mode)
if not self.attend:
self.decoder = Decoder(self.config, self.rng, prefix='dec', mode=self.mode)
else:
self.decoder = DecoderAtt(self.config, self.rng, prefix='dec', mode=self.mode,
copynet=self.copynet, identity=self.identity)
self._add(self.encoder)
self._add(self.decoder)
# objectives and optimizers
# self.optimizer = optimizers.get(self.config['optimizer'])
assert self.config['optimizer'] == 'adam'
self.optimizer = optimizers.get(self.config['optimizer'],
kwargs=dict(rng=self.rng,
save=False))
logger.info("build ok.")
def compile_(self, mode='all', contrastive=False):
# compile the computational graph.
# INFO: the parameters.
# mode: 'train'/ 'display'/ 'policy' / 'all'
ps = 'params: {\n'
for p in self.params:
ps += '{0}: {1}\n'.format(p.name, p.eval().shape)
ps += '}.'
logger.info(ps)
param_num = np.sum([np.prod(p.shape.eval()) for p in self.params])
logger.info("total number of the parameters of the model: {}".format(param_num))
if mode == 'train' or mode == 'all':
self.compile_train()
if mode == 'display' or mode == 'all':
self.compile_sample()
if mode == 'inference' or mode == 'all':
self.compile_inference()
def compile_train(self):
# questions (theano variables)
inputs = T.imatrix() # padded input word sequence (for training)
target = T.imatrix() # padded target word sequence (for training)
# encoding & decoding
if not self.attend:
code = self.encoder.build_encoder(inputs, None)
logPxz, logPPL = self.decoder.build_decoder(target, code)
else:
code, _, c_mask, _ = self.encoder.build_encoder(inputs, None, return_sequence=True, return_embed=True)
logPxz, logPPL = self.decoder.build_decoder(target, code, c_mask)
# responding loss
loss_rec = T.mean(-logPxz)
loss_ppl = T.exp(T.mean(-logPPL))
loss = loss_rec
updates = self.optimizer.get_updates(self.params, loss)
logger.info("compiling the compuational graph ::training function::")
train_inputs = [inputs, target]
self.train_ = theano.function(train_inputs,
[loss_rec, loss_ppl],
updates=updates,
name='train_fun')
# mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True))
logger.info("training functions compile done.")
# # add monitoring:
# self.monitor['context'] = context
# self._monitoring()
#
# # compiling monitoring
# self.compile_monitoring(train_inputs)
def compile_sample(self):
if not self.attend:
self.encoder.compile_encoder(with_context=False)
else:
self.encoder.compile_encoder(with_context=False, return_sequence=True, return_embed=True)
self.decoder.build_sampler()
logger.info("sampling functions compile done.")
def compile_inference(self):
pass
def generate_(self, inputs, mode='display', return_all=False):
# assert self.config['sample_stoch'], 'RNNLM sampling must be stochastic'
# assert not self.config['sample_argmax'], 'RNNLM sampling cannot use argmax'
args = dict(k=self.config['sample_beam'],
maxlen=self.config['max_len'],
stochastic=self.config['sample_stoch'] if mode == 'display' else None,
argmax=self.config['sample_argmax'] if mode == 'display' else None)
if not self.attend:
context = self.encoder.encode(inputs)
sample, score = self.decoder.get_sample(context, **args)
else:
context, _, c_mask, _ = self.encoder.encode(inputs)
sample, score = self.decoder.get_sample(context, c_mask, **args)
if return_all:
return sample, score
if not args['stochastic']:
score = score / np.array([len(s) for s in sample])
sample = sample[score.argmin()]
score = score.min()
else:
score /= float(len(sample))
return sample, np.exp(score)
# def evaluate_(self, inputs, outputs, idx2word,
# origin=None, idx2word_o=None):
#
# def cut_zero(sample, idx2word, idx2word_o):
# Lmax = len(idx2word)
# if not self.copynet:
# if 0 not in sample:
# return [idx2word[w] for w in sample]
# return [idx2word[w] for w in sample[:sample.index(0)]]
# else:
# if 0 not in sample:
# if origin is None:
# return [idx2word[w] if w < Lmax else idx2word[inputs[w - Lmax]]
# for w in sample]
# else:
# return [idx2word[w] if w < Lmax else idx2word_o[origin[w - Lmax]]
# for w in sample]
# if origin is None:
# return [idx2word[w] if w < Lmax else idx2word[inputs[w - Lmax]]
# for w in sample[:sample.index(0)]]
# else:
# return [idx2word[w] if w < Lmax else idx2word_o[origin[w - Lmax]]
# for w in sample[:sample.index(0)]]
#
# result, _ = self.generate_(inputs[None, :])
#
# if origin is not None:
# print '[ORIGIN]: {}'.format(' '.join(cut_zero(origin.tolist(), idx2word_o, idx2word_o)))
# print '[DECODE]: {}'.format(' '.join(cut_zero(result, idx2word, idx2word_o)))
# print '[SOURCE]: {}'.format(' '.join(cut_zero(inputs.tolist(), idx2word, idx2word_o)))
# print '[TARGET]: {}'.format(' '.join(cut_zero(outputs.tolist(), idx2word, idx2word_o)))
#
# return True
def evaluate_(self, inputs, outputs, idx2word, inputs_unk=None):
def cut_zero(sample, idx2word, Lmax=None):
if Lmax is None:
Lmax = self.config['dec_voc_size']
if 0 not in sample:
return ['{}'.format(idx2word[w].encode('utf-8')) for w in sample]
return ['{}'.format(idx2word[w].encode('utf-8')) for w in sample[:sample.index(0)]]
if inputs_unk is None:
result, _ = self.generate_(inputs[None, :])
else:
result, _ = self.generate_(inputs_unk[None, :])
a = '[SOURCE]: {}'.format(' '.join(cut_zero(inputs.tolist(), idx2word)))
b = '[TARGET]: {}'.format(' '.join(cut_zero(outputs.tolist(), idx2word)))
c = '[DECODE]: {}'.format(' '.join(cut_zero(result, idx2word)))
print a
if inputs_unk is not None:
k = '[_INPUT]: {}\n'.format(' '.join(cut_zero(inputs_unk.tolist(), idx2word, Lmax=len(idx2word))))
print k
a += k
print b
print c
a += b + c
return a
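# Illustrative sketch (not part of the original model): how extended ids
# decode back to words under CopyNet, mirroring the commented evaluate_
# above. `_decode_copy_ids` is a hypothetical helper; ids below the
# vocabulary size Lmax are generate-mode words, ids >= Lmax point back into
# the source sequence. (A caller would usually truncate at the first 0 first.)
def _decode_copy_ids(sample, inputs, idx2word, Lmax):
    # sample: list of predicted ids; inputs: list of source word ids
    return [idx2word[w] if w < Lmax else idx2word[inputs[w - Lmax]]
            for w in sample]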
def analyse_(self, inputs, outputs, idx2word):
Lmax = len(idx2word)
def cut_zero(sample, idx2word):
if 0 not in sample:
return ['{}'.format(idx2word[w].encode('utf-8')) for w in sample]
return ['{}'.format(idx2word[w].encode('utf-8')) for w in sample[:sample.index(0)]]
result, _ = self.generate_(inputs[None, :])
flag = 0
source = '{}'.format(' '.join(cut_zero(inputs.tolist(), idx2word)))
target = '{}'.format(' '.join(cut_zero(outputs.tolist(), idx2word)))
result = '{}'.format(' '.join(cut_zero(result, idx2word)))
return target == result
def analyse_cover(self, inputs, outputs, idx2word):
Lmax = len(idx2word)
def cut_zero(sample, idx2word):
if 0 not in sample:
return ['{}'.format(idx2word[w].encode('utf-8')) for w in sample]
return ['{}'.format(idx2word[w].encode('utf-8')) for w in sample[:sample.index(0)]]
results, _ = self.generate_(inputs[None, :], return_all=True)
flag = 0
source = '{}'.format(' '.join(cut_zero(inputs.tolist(), idx2word)))
target = '{}'.format(' '.join(cut_zero(outputs.tolist(), idx2word)))
score = [target == '{}'.format(' '.join(cut_zero(result, idx2word))) for result in results]
return max(score)
================================================
FILE: emolga/models/ntm_encdec.py
================================================
__author__ = 'jiataogu'
import theano
theano.config.exception_verbosity = 'high'
import logging
import copy
import emolga.basic.objectives as objectives
import emolga.basic.optimizers as optimizers
from emolga.layers.recurrent import *
from emolga.layers.ntm_minibatch import Controller, BernoulliController
from emolga.layers.embeddings import *
from core import Model
logger = logging.getLogger(__name__)
RNN = JZS3 # change it here for other RNN models.
class RecurrentBase(Model):
"""
The recurrent base for SimpleRNN, GRU, JZS3, LSTM and Neural Turing Machines
"""
def __init__(self, config, model='RNN', prefix='enc', use_contxt=True, name=None):
super(RecurrentBase, self).__init__()
self.config = config
self.model = model
self.prefix = prefix
self.use_contxt = use_contxt
if not name:
self.name = self.prefix
else:
self.name = name
if self.config['binary']:
NTM = BernoulliController
else:
NTM = Controller
def _build_RNN():
logger.info('BUILD::>>>>>>>> Gated Recurrent Units.')
core = RNN(
self.config['{}_embedd_dim'.format(self.prefix)],
self.config['{}_hidden_dim'.format(self.prefix)],
self.config['{}_contxt_dim'.format(self.prefix)] if use_contxt else None,
name='{}_rnn'.format(self.prefix)
)
if self.config['bias_code']:
init = Dense(
self.config['{}_contxt_dim'.format(self.prefix)],
self.config['{}_hidden_dim'.format(self.prefix)],
activation='tanh',
name='{}_init'.format(self.prefix)
)
else:
init = Zero()
return core, [init]
def _build_NTM():
"""
Build a simple Neural Turing Machine.
We use a feedforward controller here.
"""
logger.info('BUILD::>>>>>>>> Controller Units.')
core = NTM(
self.config['{}_embedd_dim'.format(self.prefix)],
self.config['{}_memory_dim'.format(self.prefix)],
self.config['{}_memory_wdth'.format(self.prefix)],
self.config['{}_hidden_dim'.format(self.prefix)],
self.config['{}_shift_width'.format(self.prefix)],
name="{}_ntm".format(self.prefix),
readonly=self.config['{}_read-only'.format(self.prefix)],
curr_input=self.config['{}_curr_input'.format(self.prefix)],
recurrence=self.config['{}_recurrence'.format(self.prefix)]
)
if self.config['bias_code']:
raise NotImplementedError
else:
init_w = T.nnet.softmax(initializations.get('glorot_uniform')((1, self.config['{}_memory_dim'.format(self.prefix)])))
init_r = T.nnet.softmax(initializations.get('glorot_uniform')((1, self.config['{}_memory_dim'.format(self.prefix)])))
init_c = initializations.get('glorot_uniform')((1, self.config['{}_hidden_dim'.format(self.prefix)]))
return core, [init_w, init_r, init_c]
if model == 'RNN':
self.core, self.init = _build_RNN()
elif model == 'NTM':
self.core, self.init = _build_NTM()
else:
raise NotImplementedError
self._add(self.core)
if model == 'RNN':
for init in self.init:
self._add(init)
self.set_name(name)
# *****************************************************************
# For Theano inputs.
def get_context(self, context):
# get context if "use_context" is True
info = dict()
# if self.use_contxt:
if self.model == 'RNN':
# context is a matrix (nb_samples, context_dim)
info['C'] = context
info['init_h'] = self.init[0](context)
elif self.model == 'NTM':
# context is a tensor (nb_samples, memory_dim, memory_width)
info['M'] = context
if self.config['bias_code']:
raise NotImplementedError
else:
info['init_ww'] = T.repeat(self.init[0], context.shape[0], axis=0)
info['init_wr'] = T.repeat(self.init[1], context.shape[0], axis=0)
info['init_c'] = T.repeat(self.init[2], context.shape[0], axis=0)
else:
raise NotImplementedError
return info
def loop(self, X, X_mask, info=None, return_sequence=False, return_full=False):
if self.model == 'NTM':
info['return_full'] = return_full
Z = self.core(X, X_mask, return_sequence=return_sequence, **info)
self._monitoring()
return Z
def step(self, X, prev_info):
# run one step of the Recurrence
if self.model == 'RNN':
out = self.core(X, one_step=True, **prev_info)
next_state = out
next_info = {'init_h': out, 'C': prev_info['C']}
elif self.model == 'NTM':
out = self.core(X, one_step=True, **prev_info)
next_state = out[3]
next_info = dict()
next_info['M'] = out[0]
next_info['init_ww'] = out[1]
next_info['init_wr'] = out[2]
next_info['init_c'] = out[3]
else:
raise NotImplementedError
return next_state, next_info
def build_(self):
# build a sampler in theano function for sampling.
if self.model == 'RNN':
context = T.matrix() # theano variable.
logger.info('compile the function: get_init_state')
info = self.get_context(context)
self.get_init_state \
= theano.function([context], info['init_h'],
name='get_init_state')
# **************************************************** #
context = T.matrix() # theano variable.
prev_X = T.matrix('prev_X', dtype='float32')
prev_stat = T.matrix('prev_state', dtype='float32')
prev_info = dict()
prev_info['C'] = context
prev_info['init_h'] = prev_stat
next_stat, next_info \
= self.step(prev_X, prev_info)
logger.info('compile the function: sample_next_state')
inputs = [prev_X, prev_stat, context]
outputs = next_stat
self.sample_next_state = theano.function(inputs, outputs, name='sample_next_state')
elif self.model == 'NTM':
memory = T.tensor3() # theano variable
logger.info('compile the function: get_init_state')
info = self.get_context(memory)
self.get_init_wr = theano.function([memory], info['init_wr'], name='get_init_wr')
self.get_init_ww = theano.function([memory], info['init_ww'], name='get_init_ww')
self.get_init_c = theano.function([memory], info['init_c'], name='get_init_c')
# **************************************************** #
memory = T.tensor3() # theano variable
prev_X = T.matrix('prev_X', dtype='float32')
prev_ww = T.matrix('prev_ww', dtype='float32')
prev_wr = T.matrix('prev_wr', dtype='float32')
prev_stat = T.matrix('prev_stat', dtype='float32')
prev_info = {'M': memory, 'init_ww': prev_ww, 'init_wr': prev_wr, 'init_c': prev_stat}
logger.info('compile the function: sample_next_0123')
next_stat, next_info = self.step(prev_X, prev_info)
inputs = [prev_X, prev_ww, prev_wr, memory, prev_stat]
outputs = [next_info['M'], next_info['init_ww'], next_info['init_wr'], next_stat]
self.sample_next_state = theano.function(inputs, outputs, name='sample_next_state')
else:
raise NotImplementedError
logger.info('done.')
# *****************************************************************
# For Numpy inputs.
def get_init(self, context):
info = dict()
if self.model == 'RNN':
info['init_h'] = self.get_init_state(context)
info['C'] = context
elif self.model == 'NTM':
if hasattr(self, 'get_init_ww'):
info['init_ww'] = self.get_init_ww(context)
if hasattr(self, 'get_init_wr'):
info['init_wr'] = self.get_init_wr(context)
if hasattr(self, 'get_init_c'):
info['init_c'] = self.get_init_c(context)
info['M'] = context
else:
raise NotImplementedError
return info
def get_next_state(self, prev_X, prev_info):
if self.model == 'RNN':
next_state = self.sample_next_state(
prev_X, prev_info['init_h'], prev_info['C'])
next_info = dict()
next_info['C'] = prev_info['C']
next_info['init_h'] = next_state
elif self.model == 'NTM':
next_info = dict()
assert 'init_ww' in prev_info
assert 'init_wr' in prev_info
assert 'init_c' in prev_info
assert 'M' in prev_info
next_info['M'], next_info['init_ww'], \
next_info['init_wr'], next_info['init_c'] = self.sample_next_state(
prev_X, prev_info['init_ww'], prev_info['init_wr'],
prev_info['M'], prev_info['init_c'])
next_state = next_info['init_c']
else:
raise NotImplementedError
return next_state, next_info
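# Illustrative sketch (not part of the original model): threading the
# (state, info) pair above through a rollout. `_rollout` is a hypothetical
# driver, assuming `base` is a RecurrentBase whose build_() has been
# called and `embed` maps a word-id array to embedding vectors (e.g. the
# decoder's prev_embed function).
def _rollout(base, embed, context, words):
    info = base.get_init(context)
    states = []
    for w in words:
        state, info = base.get_next_state(embed(w), info)
        states.append(state)
    return states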
class Encoder(Model):
"""
Recurrent Neural Network/Neural Turing Machine-based Encoder
It is used to compute the context vector.
"""
def __init__(self,
config, rng, prefix='enc',
mode='RNN', embed=None):
"""
mode = RNN: use a RNN Encoder
mode = NTM: use a NTM Encoder
"""
super(Encoder, self).__init__()
self.config = config
self.rng = rng
self.prefix = prefix
self.mode = mode
self.name = prefix
"""
Create all elements of the Encoder's Computational graph
"""
# create Embedding layers
logger.info("{}_create embedding layers.".format(self.prefix))
if embed:
self.Embed = embed
else:
self.Embed = Embedding(
self.config['enc_voc_size'],
self.config['enc_embedd_dim'],
name="{}_embed".format(self.prefix))
self._add(self.Embed)
# create Recurrent Base
logger.info("{}_create Recurrent layers.".format(self.prefix))
if self.mode == 'RNN' and self.config['bidirectional']:
self.Forward = RecurrentBase(self.config, model=self.mode, name='forward',
prefix='enc', use_contxt=self.config['enc_use_contxt'])
self.Bakward = RecurrentBase(self.config, model=self.mode, name='backward',
prefix='enc', use_contxt=self.config['enc_use_contxt'])
self._add(self.Forward)
self._add(self.Bakward)
else:
self.Recurrence = RecurrentBase(self.config, model=self.mode, name='encoder',
prefix='enc', use_contxt=self.config['enc_use_contxt'])
self._add(self.Recurrence)
# there is no readout layers for encoder.
def build_encoder(self, source, context=None):
"""
Build the Encoder Computational Graph
"""
if self.mode == 'RNN':
# we use a Recurrent Neural Network Encoder (GRU)
if not self.config['bidirectional']:
X, X_mask = self.Embed(source, True)
info = self.Recurrence.get_context(context)
X_out = self.Recurrence.loop(X, X_mask, info, return_sequence=False)
else:
source_back = source[:, ::-1]
X1, X1_mask = self.Embed(source, True)
X2, X2_mask = self.Embed(source_back, True)
info = self.Forward.get_context(context)
X_out1 = self.Forward.loop(X1, X1_mask, info, return_sequence=False)
info = self.Bakward.get_context(context)
X_out2 = self.Bakward.loop(X2, X2_mask, info, return_sequence=False)
# X_out = T.concatenate([X_out1, X_out2], axis=1)
X_out = 0.5 * X_out1 + 0.5 * X_out2
elif self.mode == 'NTM':
if not self.config['bidirectional']:
X, X_mask = self.Embed(source, True)
else:
source_back = source[:, ::-1]
X1, X1_mask = self.Embed(source, True)
X2, X2_mask = self.Embed(source_back, True)
X = T.concatenate([X1, X2], axis=1)
X_mask = T.concatenate([X1_mask, X2_mask], axis=1)
info = self.Recurrence.get_context(context)
# X_out here is the extracted memory book, which can be used as the initial memory of the NTM Decoder.
X_out = self.Recurrence.loop(X, X_mask, info, return_sequence=False, return_full=True)[0]
else:
raise NotImplementedError
self._monitoring()
return X_out
class Decoder(Model):
"""
Recurrent Neural Network-based Decoder.
It is used for:
(1) Evaluation: compute the probability P(Y|X)
(2) Prediction: sample the best result based on P(Y|X)
(3) Beam-search
(4) Scheduled Sampling (how to implement it?)
"""
def __init__(self,
config, rng, prefix='dec',
mode='RNN', embed=None):
"""
mode = RNN: use a RNN Decoder
mode = NTM: use a NTM Decoder (Neural Turing Machine)
"""
super(Decoder, self).__init__()
self.config = config
self.rng = rng
self.prefix = prefix
self.name = prefix
self.mode = mode
"""
Create all elements of the Decoder's computational graph.
"""
# create Embedding layers
logger.info("{}_create embedding layers.".format(self.prefix))
if embed:
self.Embed = embed
else:
self.Embed = Embedding(
self.config['dec_voc_size'],
self.config['dec_embedd_dim'],
name="{}_embed".format(self.prefix))
self._add(self.Embed)
# create Recurrent Base.
logger.info("{}_create Recurrent layers.".format(self.prefix))
self.Recurrence = RecurrentBase(self.config, model=self.mode, name='decoder',
prefix='dec', use_contxt=self.config['dec_use_contxt'])
# create readout layers
logger.info("_create Readout layers")
# 1. hidden layers readout.
self.hidden_readout = Dense(
self.config['dec_hidden_dim'],
self.config['output_dim']
if self.config['deep_out']
else self.config['dec_voc_size'],
activation='linear',
name="{}_hidden_readout".format(self.prefix)
)
# 2. previous word readout
self.prev_word_readout = None
if self.config['bigram_predict']:
self.prev_word_readout = Dense(
self.config['dec_embedd_dim'],
self.config['output_dim']
if self.config['deep_out']
else self.config['dec_voc_size'],
activation='linear',
name="{}_prev_word_readout".format(self.prefix),
learn_bias=False
)
# 3. context readout
self.context_readout = None
if self.config['context_predict']:
self.context_readout = Dense(
self.config['dec_contxt_dim'],
self.config['output_dim']
if self.config['deep_out']
else self.config['dec_voc_size'],
activation='linear',
name="{}_context_readout".format(self.prefix),
learn_bias=False
)
# option: deep output (maxout)
if self.config['deep_out']:
self.activ = Activation(config['deep_out_activ'])
# self.dropout = Dropout(rng=self.rng, p=config['dropout'])
self.output_nonlinear = [self.activ] # , self.dropout]
self.output = Dense(
self.config['output_dim'] / 2
if config['deep_out_activ'] == 'maxout2'
else self.config['output_dim'],
self.config['dec_voc_size'],
activation='softmax',
name="{}_output".format(self.prefix),
learn_bias=False
)
else:
self.output_nonlinear = []
self.output = Activation('softmax')
# registration:
self._add(self.Recurrence)
self._add(self.hidden_readout)
self._add(self.context_readout)
self._add(self.prev_word_readout)
self._add(self.output)
if self.config['deep_out']:
self._add(self.activ)
# self._add(self.dropout)
logger.info("create decoder ok.")
@staticmethod
def _grab_prob(probs, X):
assert probs.ndim == 3
batch_size = probs.shape[0]
max_len = probs.shape[1]
vocab_size = probs.shape[2]
probs = probs.reshape((batch_size * max_len, vocab_size))
return probs[T.arange(batch_size * max_len), X.flatten(1)].reshape(X.shape) # advanced indexing
"""
Build the decoder for evaluation
"""
def prepare_xy(self, target):
# Word embedding
Y, Y_mask = self.Embed(target, True) # (nb_samples, max_len, embedding_dim)
if self.config['use_input']:
X = T.concatenate([alloc_zeros_matrix(Y.shape[0], 1, Y.shape[2]), Y[:, :-1, :]], axis=1)
else:
X = 0 * Y
# option ## drop words.
X_mask = T.concatenate([T.ones((Y.shape[0], 1)), Y_mask[:, :-1]], axis=1)
Count = T.cast(T.sum(X_mask, axis=1), dtype=theano.config.floatX)
return X, X_mask, Y, Y_mask, Count
def build_decoder(self, target, context=None, return_count=False):
"""
Build the Decoder Computational Graph
"""
X, X_mask, Y, Y_mask, Count = self.prepare_xy(target)
info = self.Recurrence.get_context(context)
X_out = self.Recurrence.loop(X, X_mask, info=info, return_sequence=True)
# Readout
readout = self.hidden_readout(X_out)
if self.config['context_predict']:
# warning: only supports RNN, cannot support Memory
readout += self.context_readout(context).dimshuffle(0, 'x', 1)
if self.config['bigram_predict']:
readout += self.prev_word_readout(X)
for l in self.output_nonlinear:
readout = l(readout)
prob_dist = self.output(readout) # (nb_samples, max_len, vocab_size)
# log_old = T.sum(T.log(self._grab_prob(prob_dist, target)), axis=1)
log_prob = T.sum(T.log(self._grab_prob(prob_dist, target)) * X_mask, axis=1)
log_ppl = log_prob / Count
self._monitoring()
if return_count:
return log_prob, Count
else:
return log_prob, log_ppl
"""
Sampling Functions.
"""
def _step_embed(self, prev_word):
# word embedding (note that for the first word, embedding should be all zero)
if self.config['use_input']:
X = T.switch(
prev_word[:, None] < 0,
alloc_zeros_matrix(prev_word.shape[0], self.config['dec_embedd_dim']),
self.Embed(prev_word)
)
else:
X = alloc_zeros_matrix(prev_word.shape[0], self.config['dec_embedd_dim'])
return X
def _step_sample(self, X, next_stat, context):
# compute the readout probability distribution and sample it
# here the readout is a matrix, different from the learner.
readout = self.hidden_readout(next_stat)
if context.ndim == 2 and self.config['context_predict']:
# warning: only supports RNN, cannot support Memory
readout += self.context_readout(context)
if self.config['bigram_predict']:
readout += self.prev_word_readout(X)
for l in self.output_nonlinear:
readout = l(readout)
next_prob = self.output(readout)
next_sample = self.rng.multinomial(pvals=next_prob).argmax(1)
return next_prob, next_sample
"""
Build the sampler for sampling/greedy search/beam search
"""
def build_sampler(self):
"""
Build a sampler which only steps once.
Typically it only works one word at a time.
"""
prev_word = T.vector('prev_word', dtype='int64')
prev_X = self._step_embed(prev_word)
self.prev_embed = theano.function([prev_word], prev_X)
self.Recurrence.build_()
prev_X = T.matrix('prev_X', dtype='float32')
next_stat = T.matrix('next_state', dtype='float32')
logger.info('compile the function: sample_next')
if self.config['mode'] == 'RNN':
context = T.matrix('context')
else:
context = T.tensor3('memory')
next_prob, next_sample = self._step_sample(prev_X, next_stat, context)
self.sample_next = theano.function([prev_X, next_stat, context],
[next_prob, next_sample],
name='sample_next',
on_unused_input='warn')
logger.info('done')
"""
Generate samples, either with stochastic sampling or beam-search!
"""
def get_sample(self, context, k=1, maxlen=30, stochastic=True, argmax=False):
# beam size
if k > 1:
assert not stochastic, 'Beam search does not support stochastic sampling!!'
# prepare for searching
sample = []
score = []
if stochastic:
score = 0
live_k = 1
dead_k = 0
hyp_samples = [[]] * live_k
hyp_scores = np.zeros(live_k).astype(theano.config.floatX)
hyp_states = []
hyp_infos = []
# get initial state of decoder Recurrence
next_info = self.Recurrence.get_init(context)
# print 'sample with memory:\t', next_info['M'][0]
# next_state = next_info['init_h']
next_word = -1 * np.ones((1,)).astype('int64') # indicator for the first target word (bos target)
print '<0e~k>'
# Start searching!
for ii in xrange(maxlen):
# print next_word
ctx = np.tile(context, [live_k, 1])
next_embedding = self.prev_embed(next_word)
next_state, next_info = self.Recurrence.get_next_state(next_embedding, next_info)
next_prob, next_word = self.sample_next(next_embedding, next_state, ctx) # wtf.
if stochastic:
# using stochastic sampling (or greedy sampling.)
if argmax:
nw = next_prob[0].argmax()
next_word[0] = nw
else:
nw = next_word[0]
sample.append(nw)
score += next_prob[0, nw]
if nw == 0: # sample reached the end
break
else:
# using beam-search
# we can only compute it in a flattened way!
# Currently beam-search does not support NTM!!
cand_scores = hyp_scores[:, None] - np.log(next_prob)
cand_flat = cand_scores.flatten()
ranks_flat = cand_flat.argsort()[:(k - dead_k)]
# fetch the best results.
voc_size = next_prob.shape[1]
trans_index = ranks_flat // voc_size
word_index = ranks_flat % voc_size
costs = cand_flat[ranks_flat]
# get the new hyp samples
new_hyp_samples = []
new_hyp_scores = np.zeros(k - dead_k).astype(theano.config.floatX)
new_hyp_states = []
new_hyp_infos = {w: [] for w in next_info}
for idx, [ti, wi] in enumerate(zip(trans_index, word_index)):
new_hyp_samples.append(hyp_samples[ti] + [wi])
new_hyp_scores[idx] = copy.copy(costs[idx])
new_hyp_states.append(copy.copy(next_state[ti]))
for w in next_info:
new_hyp_infos[w].append(copy.copy(next_info[w][ti]))
# check the finished samples
new_live_k = 0
hyp_samples = []
hyp_scores = []
hyp_states = []
hyp_infos = {w: [] for w in next_info}
for idx in xrange(len(new_hyp_samples)):
if new_hyp_samples[idx][-1] == 0:  # last word is EOS
sample.append(new_hyp_samples[idx])
score.append(new_hyp_scores[idx])
dead_k += 1
else:
new_live_k += 1
hyp_samples.append(new_hyp_samples[idx])
hyp_scores.append(new_hyp_scores[idx])
hyp_states.append(new_hyp_states[idx])
for w in next_info:
hyp_infos[w].append(copy.copy(new_hyp_infos[w][idx]))
hyp_scores = np.array(hyp_scores)
live_k = new_live_k
if new_live_k < 1:
break
if dead_k >= k:
break
next_word = np.array([w[-1] for w in hyp_samples])
next_state = np.array(hyp_states)
for w in hyp_infos:
next_info[w] = np.array(hyp_infos[w])
pass
pass
# end.
if not stochastic:
# dump every remaining one
if live_k > 0:
for idx in xrange(live_k):
sample.append(hyp_samples[idx])
score.append(hyp_scores[idx])
return sample, score
class RNNLM(Model):
"""
RNN-LM, with context vector = 0.
It is very similar to the implementation of the VAE.
"""
def __init__(self,
config, n_rng, rng,
mode='Evaluation'):
super(RNNLM, self).__init__()
self.config = config
self.n_rng = n_rng # numpy random stream
self.rng = rng # Theano random stream
self.mode = mode
self.name = 'rnnlm'
def build_(self):
logger.info("build the RNN/NTM-decoder")
self.decoder = Decoder(self.config, self.rng, prefix='dec', mode=self.mode)
# registration:
self._add(self.decoder)
# objectives and optimizers
self.optimizer = optimizers.get('adadelta')
# save the initial memories
self.memory = initializations.get('glorot_uniform')(
(self.config['dec_memory_dim'], self.config['dec_memory_wdth']))
logger.info("create the RECURRENT language model. ok")
def compile_(self, mode='train', contrastive=False):
# compile the computational graph.
# INFO: the parameters.
# mode: 'train'/ 'display'/ 'policy' / 'all'
ps = 'params: {\n'
for p in self.params:
ps += '{0}: {1}\n'.format(p.name, p.eval().shape)
ps += '}.'
logger.info(ps)
param_num = np.sum([np.prod(p.shape.eval()) for p in self.params])
logger.info("total number of the parameters of the model: {}".format(param_num))
if mode == 'train' or mode == 'all':
if not contrastive:
self.compile_train()
else:
self.compile_train_CE()
if mode == 'display' or mode == 'all':
self.compile_sample()
if mode == 'inference' or mode == 'all':
self.compile_inference()
def compile_train(self):
# questions (theano variables)
inputs = T.imatrix() # padded input word sequence (for training)
if self.config['mode'] == 'RNN':
context = alloc_zeros_matrix(inputs.shape[0], self.config['dec_contxt_dim'])
elif self.config['mode'] == 'NTM':
context = T.repeat(self.memory[None, :, :], inputs.shape[0], axis=0)
else:
raise NotImplementedError
# decoding.
target = inputs
logPxz, logPPL = self.decoder.build_decoder(target, context)
# reconstruction loss
loss_rec = T.mean(-logPxz)
loss_ppl = T.exp(T.mean(-logPPL))
L1 = T.sum([T.sum(abs(w)) for w in self.params])
loss = loss_rec
updates = self.optimizer.get_updates(self.params, loss)
logger.info("compiling the compuational graph ::training function::")
train_inputs = [inputs]
self.train_ = theano.function(train_inputs,
[loss_rec, loss_ppl],
updates=updates,
name='train_fun')
logger.info("pre-training functions compile done.")
# add monitoring:
self.monitor['context'] = context
self._monitoring()
# compiling monitoring
self.compile_monitoring(train_inputs)
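# NOTE: a minimal usage sketch (hypothetical driver code, following the API
# defined in this file; `padded_batch` is an assumed int32 matrix where
# index 0 marks end-of-sentence):
#   model = RNNLM(config, n_rng, rng)
#   model.build_()
#   model.compile_(mode='all')
#   loss_rec, loss_ppl = model.train_(padded_batch)
#   sample, prob = model.generate_()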
def compile_train_CE(self):
pass
def compile_sample(self):
# context vectors (actions)
self.decoder.build_sampler()
logger.info("display functions compile done.")
def compile_inference(self):
pass
def default_context(self):
if self.config['mode'] == 'RNN':
return np.zeros(shape=(1, self.config['dec_contxt_dim']), dtype=theano.config.floatX)
elif self.config['mode'] == 'NTM':
memory = self.memory.get_value()
memory = memory.reshape((1, memory.shape[0], memory.shape[1]))
return memory
def generate_(self, context=None, mode='display', max_len=None):
"""
:param context: context vector used to condition generation.
If None, use the model's default context.
:return: the generated sentence and its (exponentiated) score.
"""
# assert self.config['sample_stoch'], 'RNNLM sampling must be stochastic'
# assert not self.config['sample_argmax'], 'RNNLM sampling cannot use argmax'
if context is None:
context = self.default_context()
args = dict(k=self.config['sample_beam'],
maxlen=self.config['max_len'] if not max_len else max_len,
stochastic=self.config['sample_stoch'] if mode == 'display' else None,
argmax=self.config['sample_argmax'] if mode == 'display' else None)
sample, score = self.decoder.get_sample(context, **args)
if not args['stochastic']:
score = score / np.array([len(s) for s in sample])
sample = sample[score.argmin()]
score = score.min()
else:
score /= float(len(sample))
return sample, np.exp(score)
class Helmholtz(RNNLM):
"""
Helmholtz Machine as a probabilistic version of the AutoEncoder.
It is very similar to the Variational Auto-Encoder.
We implement the Helmholtz RNN as well as the Helmholtz Turing Machine here.
Reference:
Reweighted Wake-Sleep
http://arxiv.org/abs/1406.2751
"""
def __init__(self,
config, n_rng, rng,
mode='RNN'):
super(RNNLM, self).__init__()
self.config = config
self.n_rng = n_rng # numpy random stream
self.rng = rng # Theano random stream
self.mode = mode
self.name = 'helmholtz'
def build_(self):
logger.info("build the Helmholtz auto-encoder")
if self.mode == 'NTM':
assert self.config['enc_memory_dim'] == self.config['dec_memory_dim']
assert self.config['enc_memory_wdth'] == self.config['dec_memory_wdth']
self.encoder = Encoder(self.config, self.rng, prefix='enc', mode=self.mode)
if self.config['shared_embed']:
self.decoder = Decoder(self.config, self.rng, prefix='dec',
embed=self.encoder.Embed, mode=self.mode)
else:
self.decoder = Decoder(self.config, self.rng, prefix='dec', mode=self.mode)
# registration
self._add(self.encoder)
self._add(self.decoder)
# The main difference between VAE and HM is that we can use
# a more flexible prior instead of Gaussian here.
# for example, we use a sigmoid prior here.
# prior distribution is a bias layer
if self.mode == 'RNN':
# here we first focus on the Helmholtz Turing Machine.
# Thus the RNN version will be copied from the Dial-DRL projects.
raise NotImplementedError
elif self.mode == 'NTM':
self.Prior = MemoryLinear(
self.config['enc_memory_dim'],
self.config['enc_memory_wdth'],
activation='sigmoid',
name='prior_proj',
has_input=False
)
self.Post = MemoryLinear(
self.config['enc_memory_dim'],
self.config['enc_memory_wdth'],
activation='sigmoid',
name='post_proj',
has_input=True
)
self.Trans = MemoryLinear(
self.config['enc_memory_dim'],
self.config['enc_memory_wdth'],
activation='linear',
name='trans_proj',
has_input=True
)
# registration
self._add(self.Prior)
self._add(self.Post)
self._add(self.Trans)
else:
raise NotImplementedError
# objectives and optimizers
self.optimizer = optimizers.get(self.config['optimizer'])
# save the initial memories
self.memory = initializations.get('glorot_uniform')(
(self.config['dec_memory_dim'], self.config['dec_memory_wdth']))
logger.info("create Helmholtz Machine. ok")
def compile_train(self):
# questions (theano variables)
inputs = T.imatrix() # padded input word sequence (for training)
batch_size = inputs.shape[0]
if self.config['mode'] == 'RNN':
context = alloc_zeros_matrix(inputs.shape[0], self.config['enc_contxt_dim'])
elif self.config['mode'] == 'NTM':
context = T.repeat(self.memory[None, :, :], inputs.shape[0], axis=0)
else:
raise NotImplementedError
# encoding
memorybook = self.encoder.build_encoder(inputs, context)
# get Q(a|y) = sigmoid
q_dis = self.Post(memorybook)
# repeats
L = self.config['repeats']
target = T.repeat(inputs[:, None, :],
L,
axis=1).reshape((inputs.shape[0] * L, inputs.shape[1]))
q_dis = T.repeat(q_dis[:, None, :, :],
L,
axis=1).reshape((q_dis.shape[0] * L, q_dis.shape[1], q_dis.shape[2]))
# sample actions
u = self.rng.uniform(q_dis.shape)
action = T.cast(u <= q_dis, dtype=theano.config.floatX)
# compute the exact probability for actions
logQax = action * T.log(q_dis) + (1 - action) * T.log(1 - q_dis)
logQax = logQax.sum(axis=-1).sum(axis=-1)
# decoding.
memorybook2 = self.Trans(action)
logPxa, count = self.decoder.build_decoder(target, memorybook2, return_count=True)
# prior.
p_dis = self.Prior()
logPa = action * T.log(p_dis) + (1 - action) * T.log(1 - p_dis)
logPa = logPa.sum(axis=-1).sum(axis=-1)
"""
Compute the weights
"""
# reshape
logQax = logQax.reshape((batch_size, L))
logPa = logPa.reshape((batch_size, L))
logPxa = logPxa.reshape((batch_size, L))
logPx_a = logPa + logPxa
# normalizing the weights
log_wk = logPx_a - logQax
log_bpk = logPa - logQax
log_w_sum = logSumExp(log_wk, axis=1)
log_bp_sum = logSumExp(log_bpk, axis=1)
log_wnk = log_wk - log_w_sum
log_bpnk = log_bpk - log_bp_sum
# importance-weighted (IWAE-style) estimate of the log-likelihood
logPx = T.mean(log_w_sum - T.log(L))
perplexity = T.exp(-T.mean((log_w_sum - T.log(L)) / count))
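# NOTE (sketch of the math above): with L samples a_k ~ q(a|x), the
# importance weights are
#   log w_k = log p(a_k) + log p(x|a_k) - log q(a_k|x)
# and logSumExp over k gives
#   log p(x) ~= logSumExp_k(log w_k) - log(L),
# the standard importance-sampling estimate used by Reweighted Wake-Sleep.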
"""
Compute the Loss function
"""
# loss = weights * log [p(a)p(x|a)/q(a|x)]
weights = T.exp(log_wnk)
bp = T.exp(log_bpnk)
bq = 1. / L
ess = T.mean(1 / T.sum(weights ** 2, axis=1))
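# NOTE: with normalized weights w_k, 1 / sum_k(w_k ** 2) is the effective
# sample size, ranging from 1 (one weight dominates) up to L (uniform
# weights); a low value signals a poor proposal q(a|x).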
factor = self.config['factor']
if self.config['variant_control']:
lossQ = -T.mean(T.sum(logQax * (weights - bq), axis=1)) # log q(a|x)
lossPa = -T.mean(T.sum(logPa * (weights - bp), axis=1)) # log p(a)
lossPxa = -T.mean(T.sum(logPxa * weights, axis=1)) # log p(x|a)
lossP = lossPxa + lossPa
updates = self.optimizer.get_updates(self.params, [lossP + factor * lossQ, weights, bp])
else:
lossQ = -T.mean(T.sum(logQax * weights, axis=1)) # log q(a|x)
lossPa = -T.mean(T.sum(logPa * weights, axis=1)) # log p(a)
lossPxa = -T.mean(T.sum(logPxa * weights, axis=1)) # log p(x|a)
lossP = lossPxa + lossPa
updates = self.optimizer.get_updates(self.params, [lossP + factor * lossQ, weights])
logger.info("compiling the compuational graph ::training function::")
train_inputs = [inputs]
self.train_ = theano.function(train_inputs,
[lossPa, lossPxa, lossQ, perplexity, ess],
updates=updates,
name='train_fun')
logger.info("pre-training functions compile done.")
def compile_sample(self):
# # for Typical Auto-encoder, only conditional generation is useful.
# inputs = T.imatrix() # padded input word sequence (for training)
# if self.config['mode'] == 'RNN':
# context = alloc_zeros_matrix(inputs.shape[0], self.config['enc_contxt_dim'])
# elif self.config['mode'] == 'NTM':
# context = T.repeat(self.memory[None, :, :], inputs.shape[0], axis=0)
# else:
# raise NotImplementedError
# pass
# sample the memorybook
p_dis = self.Prior()
l = T.iscalar()
u = self.rng.uniform((l, p_dis.shape[-2], p_dis.shape[-1]))
binarybook = T.cast(u <= p_dis, dtype=theano.config.floatX)
memorybook = self.Trans(binarybook)
self.take = theano.function([l], [binarybook, memorybook], name='take_action')
# compile the sampler.
self.decoder.build_sampler()
logger.info('sampler function compile done.')
def compile_inference(self):
"""
build the hidden action prediction.
"""
inputs = T.imatrix() # padded input word sequence (for training)
if self.config['mode'] == 'RNN':
context = alloc_zeros_matrix(inputs.shape[0], self.config['enc_contxt_dim'])
elif self.config['mode'] == 'NTM':
context = T.repeat(self.memory[None, :, :], inputs.shape[0], axis=0)
else:
raise NotImplementedError
# encoding
memorybook = self.encoder.build_encoder(inputs, context)
# get Q(a|y) = sigmoid(.|Posterior * encoded)
q_dis = self.Post(memorybook)
p_dis = self.Prior()
self.inference_ = theano.function([inputs], [memorybook, q_dis, p_dis])
logger.info("inference function compile done.")
def default_context(self):
return self.take(1)[-1]
class BinaryHelmholtz(RNNLM):
"""
Helmholtz Machine as a probabilistic version of the AutoEncoder.
It is very similar to the Variational Auto-Encoder.
We implement the Helmholtz RNN as well as the Helmholtz Turing Machine here.
Reference:
Reweighted Wake-Sleep
http://arxiv.org/abs/1406.2751
"""
def __init__(self,
config, n_rng, rng,
mode='RNN'):
super(RNNLM, self).__init__()
self.config = config
self.n_rng = n_rng # numpy random stream
self.rng = rng # Theano random stream
self.mode = mode
self.name = 'helmholtz'
def build_(self):
logger.info("build the Binary-Helmholtz auto-encoder")
if self.mode == 'NTM':
assert self.config['enc_memory_dim'] == self.config['dec_memory_dim']
assert self.config['enc_memory_wdth'] == self.config['dec_memory_wdth']
self.encoder = Encoder(self.config, self.rng, prefix='enc', mode=self.mode)
if self.config['shared_embed']:
self.decoder = Decoder(self.config, self.rng, prefix='dec',
embed=self.encoder.Embed, mode=self.mode)
else:
self.decoder = Decoder(self.config, self.rng, prefix='dec', mode=self.mode)
# registration
self._add(self.encoder)
self._add(self.decoder)
# The main difference between VAE and HM is that we can use
# a more flexible prior instead of Gaussian here.
# for example, we use a sigmoid prior here.
# prior distribution is a bias layer
if self.mode == 'RNN':
# here we first focus on the Helmholtz Turing Machine.
# Thus the RNN version will be copied from the Dial-DRL projects.
raise NotImplementedError
elif self.mode == 'NTM':
self.Prior = MemoryLinear(
self.config['enc_memory_dim'],
self.config['enc_memory_wdth'],
activation='sigmoid',
name='prior_proj',
has_input=False
)
# registration
self._add(self.Prior)
else:
raise NotImplementedError
# objectives and optimizers
self.optimizer = optimizers.get(self.config['optimizer'])
# save the initial memories
self.memory = T.nnet.sigmoid(initializations.get('glorot_uniform')(
(self.config['dec_memory_dim'], self.config['dec_memory_wdth'])))
logger.info("create Helmholtz Machine. ok")
def compile_train(self):
# questions (theano variables)
inputs = T.imatrix() # padded input word sequence (for training)
batch_size = inputs.shape[0]
if self.config['mode'] == 'RNN':
context = alloc_zeros_matrix(inputs.shape[0], self.config['enc_contxt_dim'])
elif self.config['mode'] == 'NTM':
context = T.repeat(self.memory[None, :, :], inputs.shape[0], axis=0)
else:
raise NotImplementedError
# encoding
memorybook = self.encoder.build_encoder(inputs, context)
# get Q(a|y) = sigmoid
q_dis = memorybook
# repeats
L = self.config['repeats']
target = T.repeat(inputs[:, None, :],
L,
axis=1).reshape((inputs.shape[0] * L, inputs.shape[1]))
q_dis = T.repeat(q_dis[:, None, :, :],
L,
axis=1).reshape((q_dis.shape[0] * L, q_dis.shape[1], q_dis.shape[2]))
# sample actions
u = self.rng.uniform(q_dis.shape)
action = T.cast(u <= q_dis, dtype=theano.config.floatX)
# compute the exact probability for actions
logQax = action * T.log(q_dis) + (1 - action) * T.log(1 - q_dis)
logQax = logQax.sum(axis=-1).sum(axis=-1)
# decoding.
memorybook2 = action
logPxa, count = self.decoder.build_decoder(target, memorybook2, return_count=True)
# prior.
p_dis = self.Prior()
logPa = action * T.log(p_dis) + (1 - action) * T.log(1 - p_dis)
logPa = logPa.sum(axis=-1).sum(axis=-1)
"""
Compute the weights
"""
# reshape
logQax = logQax.reshape((batch_size, L))
logPa = logPa.reshape((batch_size, L))
logPxa = logPxa.reshape((batch_size, L))
logPx_a = logPa + logPxa
# normalizing the weights
log_wk = logPx_a - logQax
log_bpk = logPa - logQax
log_w_sum = logSumExp(log_wk, axis=1)
log_bp_sum = logSumExp(log_bpk, axis=1)
log_wnk = log_wk - log_w_sum
log_bpnk = log_bpk - log_bp_sum
# importance-weighted (IWAE-style) estimate of the log-likelihood
logPx = T.mean(log_w_sum - T.log(L))
perplexity = T.exp(-T.mean((log_w_sum - T.log(L)) / count))
"""
Compute the Loss function
"""
# loss = weights * log [p(a)p(x|a)/q(a|x)]
weights = T.exp(log_wnk)
bp = T.exp(log_bpnk)
bq = 1. / L
ess = T.mean(1 / T.sum(weights ** 2, axis=1))
factor = self.config['factor']
if self.config['variant_control']:
lossQ = -T.mean(T.sum(logQax * (weights - bq), axis=1)) # log q(a|x)
lossPa = -T.mean(T.sum(logPa * (weights - bp), axis=1)) # log p(a)
lossPxa = -T.mean(T.sum(logPxa * weights, axis=1)) # log p(x|a)
lossP = lossPxa + lossPa
updates = self.optimizer.get_updates(self.params, [lossP + factor * lossQ, weights, bp])
else:
lossQ = -T.mean(T.sum(logQax * weights, axis=1)) # log q(a|x)
lossPa = -T.mean(T.sum(logPa * weights, axis=1)) # log p(a)
lossPxa = -T.mean(T.sum(logPxa * weights, axis=1)) # log p(x|a)
lossP = lossPxa + lossPa
updates = self.optimizer.get_updates(self.params, [lossP + factor * lossQ, weights])
logger.info("compiling the compuational graph ::training function::")
train_inputs = [inputs]
self.train_ = theano.function(train_inputs,
[lossPa, lossPxa, lossQ, perplexity, ess],
updates=updates,
name='train_fun')
logger.info("pre-training functions compile done.")
def compile_sample(self):
# # for Typical Auto-encoder, only conditional generation is useful.
# inputs = T.imatrix() # padded input word sequence (for training)
# if self.config['mode'] == 'RNN':
# context = alloc_zeros_matrix(inputs.shape[0], self.config['enc_contxt_dim'])
# elif self.config['mode'] == 'NTM':
# context = T.repeat(self.memory[None, :, :], inputs.shape[0], axis=0)
# else:
# raise NotImplementedError
# pass
# sample the memorybook
p_dis = self.Prior()
l = T.iscalar()
u = self.rng.uniform((l, p_dis.shape[-2], p_dis.shape[-1]))
binarybook = T.cast(u <= p_dis, dtype=theano.config.floatX)
self.take = theano.function([l], binarybook, name='take_action')
# compile the sampler.
self.decoder.build_sampler()
logger.info('sampler function compile done.')
def compile_inference(self):
"""
build the hidden action prediction.
"""
inputs = T.imatrix() # padded input word sequence (for training)
if self.config['mode'] == 'RNN':
context = alloc_zeros_matrix(inputs.shape[0], self.config['enc_contxt_dim'])
elif self.config['mode'] == 'NTM':
context = T.repeat(self.memory[None, :, :], inputs.shape[0], axis=0)
else:
raise NotImplementedError
# encoding
memorybook = self.encoder.build_encoder(inputs, context)
# get Q(a|y) = sigmoid(.|Posterior * encoded)
q_dis = memorybook
p_dis = self.Prior()
self.inference_ = theano.function([inputs], [memorybook, q_dis, p_dis])
logger.info("inference function compile done.")
def default_context(self):
return self.take(1)
class AutoEncoder(RNNLM):
"""
Regular Auto-Encoder: RNN Encoder/Decoder
Regular Neural Turing Machine
"""
def __init__(self,
config, n_rng, rng,
mode='Evaluation'):
super(RNNLM, self).__init__()
self.config = config
self.n_rng = n_rng # numpy random stream
self.rng = rng # Theano random stream
self.mode = mode
self.name = 'autoencoder'
def build_(self):
logger.info("build the RNN/NTM auto-encoder")
self.encoder = Encoder(self.config, self.rng, prefix='enc', mode=self.mode)
if self.config['shared_embed']:
self.decoder = Decoder(self.config, self.rng, prefix='dec',
embed=self.encoder.Embed, mode=self.mode)
else:
self.decoder = Decoder(self.config, self.rng, prefix='dec', mode=self.mode)
# registration
self._add(self.encoder)
self._add(self.decoder)
# objectives and optimizers
self.optimizer = optimizers.get(self.config['optimizer'])
# save the initial memories
self.memory = initializations.get('glorot_uniform')(
(self.config['dec_memory_dim'], self.config['dec_memory_wdth']))
logger.info("create Autoencoder Network. ok")
def compile_train(self, mode='train'):
# questions (theano variables)
inputs = T.imatrix() # padded input word sequence (for training)
if self.config['mode'] == 'RNN':
context = alloc_zeros_matrix(inputs.shape[0], self.config['enc_contxt_dim'])
elif self.config['mode'] == 'NTM':
context = T.repeat(self.memory[None, :, :], inputs.shape[0], axis=0)
else:
raise NotImplementedError
# encoding
memorybook = self.encoder.build_encoder(inputs, context)
# decoding.
target = inputs
logPxz, logPPL = self.decoder.build_decoder(target, memorybook)
# reconstruction loss
loss_rec = T.mean(-logPxz)
loss_ppl = T.exp(T.mean(-logPPL))
loss = loss_rec
updates = self.optimizer.get_updates(self.params, loss)
logger.info("compiling the compuational graph ::training function::")
train_inputs = [inputs]
self.train_ = theano.function(train_inputs,
[loss_rec, loss_ppl],
updates=updates,
name='train_fun')
self.test = theano.function(train_inputs,
[loss_rec, loss_ppl],
name='test_fun')
logger.info("pre-training functions compile done.")
def compile_sample(self):
# for Typical Auto-encoder, only conditional generation is useful.
inputs = T.imatrix() # padded input word sequence (for training)
if self.config['mode'] == 'RNN':
context = alloc_zeros_matrix(inputs.shape[0], self.config['enc_contxt_dim'])
elif self.config['mode'] == 'NTM':
context = T.repeat(self.memory[None, :, :], inputs.shape[0], axis=0)
else:
raise NotImplementedError
pass
# encoding
memorybook = self.encoder.build_encoder(inputs, context)
self.memorize = theano.function([inputs], memorybook, name='memorize')
# compile the sampler.
self.decoder.build_sampler()
logger.info('sampler function compile done.')
================================================
FILE: emolga/models/pointers.py
================================================
__author__ = 'jiataogu'
import theano
import logging
import copy
from emolga.layers.recurrent import *
from emolga.layers.ntm_minibatch import Controller
from emolga.layers.embeddings import *
from emolga.layers.attention import *
from emolga.layers.highwayNet import *
from emolga.models.encdec import *
from core import Model
# theano.config.exception_verbosity = 'high'
logger = logging #.getLogger(__name__)
RNN = GRU # change it here for other RNN models.
class PtrDecoder(Model):
"""
RNN-Decoder for Pointer Networks
"""
def __init__(self,
config, rng, prefix='ptrdec'):
super(PtrDecoder, self).__init__()
self.config = config
self.rng = rng
self.prefix = prefix
"""
Create all elements of the Decoder's computational graph.
"""
# create Initialization Layers
logger.info("{}_create initialization layers.".format(self.prefix))
self.Initializer = Dense(
config['ptr_contxt_dim'],
config['ptr_hidden_dim'],
activation='tanh',
name="{}_init".format(self.prefix)
)
# create RNN cells
logger.info("{}_create RNN cells.".format(self.prefix))
self.RNN = RNN(
self.config['ptr_embedd_dim'],
self.config['ptr_hidden_dim'],
self.config['ptr_contxt_dim'],
name="{}_cell".format(self.prefix)
)
self._add(self.Initializer)
self._add(self.RNN)
# create readout layers
logger.info("_create Attention-Readout layers")
self.attender = Attention(
self.config['ptr_hidden_dim'],
self.config['ptr_source_dim'],
self.config['ptr_middle_dim'],
name='{}_attender'.format(self.prefix)
)
self._add(self.attender)
@staticmethod
def grab_prob(probs, X):
assert probs.ndim == 3
batch_size = probs.shape[0]
max_len = probs.shape[1]
vocab_size = probs.shape[2]
probs = probs.reshape((batch_size * max_len, vocab_size))
return probs[T.arange(batch_size * max_len), X.flatten(1)].reshape(X.shape) # advanced indexing
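# NOTE (worked illustration, hypothetical numbers): for batch_size=1,
# max_len=2, vocab_size=3 with
#   probs = [[[0.1, 0.7, 0.2], [0.6, 0.3, 0.1]]] and X = [[1, 0]],
# reshaping to (2, 3) and indexing rows [0, 1] at columns [1, 0] returns
# [[0.7, 0.6]]: the probability each step assigns to its target symbol.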
@staticmethod
def grab_source(source, target):
# source : (nb_samples, source_num, source_dim)
# target : (nb_samples, target_num)
assert source.ndim == 3
batch_size = source.shape[0]
source_num = source.shape[1]
source_dim = source.shape[2]
target_num = target.shape[1]
source_flt = source.reshape((batch_size * source_num, source_dim))
target_idx = (target + (T.arange(batch_size) * source_num)[:, None]).reshape((batch_size * target_num,))
value = source_flt[target_idx].reshape((batch_size, target_num, source_dim))
return value
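# NOTE (worked illustration, hypothetical numbers): for batch_size=2,
# source_num=3 and target = [[2], [0]], the row offsets
# T.arange(2) * 3 = [0, 3] convert per-sample indices into flat indices
# [2, 3], selecting source[0, 2] and source[1, 0] from the flattened
# (6, source_dim) source matrix.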
def build_decoder(self,
inputs,
source, target,
smask=None, tmask=None, context=None):
"""
Build the Pointer Network Decoder Computational Graph
"""
# inputs : (nb_samples, source_num, ptr_embedd_dim)
# source : (nb_samples, source_num, source_dim)
# smask : (nb_samples, source_num)
# target : (nb_samples, target_num)
# tmask : (nb_samples, target_num)
# context: (nb_sample, context_dim)
# initialized hidden state.
assert context is not None
Init_h = self.Initializer(context)
# target is the source inputs.
X = self.grab_source(inputs, target) # (nb_samples, target_num, source_dim)
X = T.concatenate([alloc_zeros_matrix(X.shape[0], 1, X.shape[2]),
X[:, :-1, :]], axis=1)
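# NOTE: prepending a zero vector and dropping the last target embedding
# shifts the decoder inputs right by one step, so step t is conditioned on
# the element pointed to at step t-1 (teacher forcing with an all-zero
# <bos> input).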
X = X.dimshuffle((1, 0, 2))
# tmask = tmask.dimshuffle((1, 0))
# eat by recurrent net
def _recurrence(x, prev_h, c, s, s_mask):
# RNN read-out
x_out = self.RNN(x, mask=None, C=c, init_h=prev_h, one_step=True)
s_out = self.attender(x_out, s, s_mask, return_log=True)
return x_out, s_out
outputs, _ = theano.scan(
_recurrence,
sequences=[X],
outputs_info=[Init_h, None],
non_sequences=[context, source, smask]
)
log_prob_dist = outputs[-1].dimshuffle((1, 0, 2))
# tmask = tmask.dimshuffle((1, 0))
log_prob = T.sum(self.grab_prob(log_prob_dist, target) * tmask, axis=1)
return log_prob
"""
Sample one step
"""
def _step_sample(self, prev_idx, prev_stat,
context, inputs, source, smask):
X = T.switch(
prev_idx[:, None] < 0,
alloc_zeros_matrix(prev_idx.shape[0], self.config['ptr_embedd_dim']),
self.grab_source(inputs, prev_idx[:, None])
)
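# NOTE: prev_idx < 0 marks the very first decoding step; the switch feeds
# an all-zero <bos> embedding instead of grabbing a source element,
# matching the zero-padded first input used in build_decoder.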
# one step RNN
X_out = self.RNN(X, C=context, init_h=prev_stat, one_step=True)
next_stat = X_out
# compute the attention read-out
next_prob = self.attender(X_out, source, smask)
next_sample = self.rng.multinomial(pvals=next_prob).argmax(1)
return next_prob, next_sample, next_stat
def build_sampler(self):
"""
Build a sampler which only steps once.
"""
logger.info("build sampler ...")
if self.config['sample_stoch'] and self.config['sample_argmax']:
logger.info("use argmax search!")
elif self.config['sample_stoch'] and (not self.config['sample_argmax']):
logger.info("use stochastic sampling!")
elif self.config['sample_beam'] > 1:
logger.info("use beam search! (beam_size={})".format(self.config['sample_beam']))
# initial state of our Decoder.
context = T.matrix() # theano variable.
init_h = self.Initializer(context)
logger.info('compile the function: get_init_state')
self.get_init_state \
= theano.function([context], init_h, name='get_init_state')
logger.info('done.')
# sampler: 1 x 1
prev_idx = T.vector('prev_idx', dtype='int64')
prev_stat = T.matrix('prev_state', dtype='float32')
inputs = T.tensor3()
source = T.tensor3()
smask = T.imatrix()
next_prob, next_sample, next_stat \
= self._step_sample(prev_idx, prev_stat, context,
inputs, source, smask)
# next word probability
logger.info('compile the function: sample_next')
inputs = [prev_idx, prev_stat, context, inputs, source, smask]
outputs = [next_prob, next_sample, next_stat]
self.sample_next = theano.function(inputs, outputs, name='sample_next')
logger.info('done')
pass
"""
Generate samples, either with stochastic sampling or beam-search!
"""
def get_sample(self, context, inputs, source, smask,
k=1, maxlen=30, stochastic=True, argmax=False, fixlen=False):
# beam size
if k > 1:
assert not stochastic, 'Beam search does not support stochastic sampling!!'
# fix length cannot use beam search
# if fixlen:
# assert k == 1
# prepare for searching
sample = []
score = []
if stochastic:
score = 0
live_k = 1
dead_k = 0
hyp_samples = [[]] * live_k
hyp_scores = np.zeros(live_k).astype(theano.config.floatX)
hyp_states = []
# get initial state of decoder RNN with context
next_state = self.get_init_state(context)
next_word = -1 * np.ones((1,)).astype('int64') # indicator for the first target word (bos target)
# Start searching!
for ii in xrange(maxlen):
# print next_word
ctx = np.tile(context, [live_k, 1])
ipt = np.tile(inputs, [live_k, 1, 1])
sor = np.tile(source, [live_k, 1, 1])
smk = np.tile(smask, [live_k, 1])
next_prob, next_word, next_state \
= self.sample_next(next_word, next_state,
ctx, ipt, sor, smk) # wtf.
if stochastic:
# using stochastic sampling (or greedy sampling.)
if argmax:
nw = next_prob[0].argmax()
next_word[0] = nw
else:
nw = next_word[0]
sample.append(nw)
score += next_prob[0, nw]
if (not fixlen) and (nw == 0): # sample reached the end
break
else:
# using beam-search
# we can only compute scores in a flattened way!
cand_scores = hyp_scores[:, None] - np.log(next_prob)
cand_flat = cand_scores.flatten()
ranks_flat = cand_flat.argsort()[:(k - dead_k)]
# fetch the best results.
voc_size = next_prob.shape[1]
trans_index = ranks_flat // voc_size
word_index = ranks_flat % voc_size
costs = cand_flat[ranks_flat]
# get the new hyp samples
new_hyp_samples = []
new_hyp_scores = np.zeros(k - dead_k).astype(theano.config.floatX)
new_hyp_states = []
for idx, [ti, wi] in enumerate(zip(trans_index, word_index)):
new_hyp_samples.append(hyp_samples[ti] + [wi])
new_hyp_scores[idx] = copy.copy(costs[idx])
new_hyp_states.append(copy.copy(next_state[ti]))
# check the finished samples
new_live_k = 0
hyp_samples = []
hyp_scores = []
hyp_states = []
for idx in xrange(len(new_hyp_samples)):
if (new_hyp_samples[idx][-1] == 0) and (not fixlen):
sample.append(new_hyp_samples[idx])
score.append(new_hyp_scores[idx])
dead_k += 1
else:
new_live_k += 1
hyp_samples.append(new_hyp_samples[idx])
hyp_scores.append(new_hyp_scores[idx])
hyp_states.append(new_hyp_states[idx])
hyp_scores = np.array(hyp_scores)
live_k = new_live_k
if new_live_k < 1:
break
if dead_k >= k:
break
next_word = np.array([w[-1] for w in hyp_samples])
next_state = np.array(hyp_states)
pass
pass
# end.
if not stochastic:
# dump every remaining one
if live_k > 0:
for idx in xrange(live_k):
sample.append(hyp_samples[idx])
score.append(hyp_scores[idx])
return sample, score
class PointerDecoder(Model):
"""
RNN-Decoder for Pointer Networks [version 2]
Points to 2 places at a time.
"""
def __init__(self,
config, rng, prefix='ptrdec'):
super(PointerDecoder, self).__init__()
self.config = config
self.rng = rng
self.prefix = prefix
"""
Create all elements of the Decoder's computational graph.
"""
# create Initialization Layers
logger.info("{}_create initialization layers.".format(self.prefix))
self.Initializer = Dense(
config['ptr_contxt_dim'],
config['ptr_hidden_dim'],
activation='tanh',
name="{}_init".format(self.prefix)
)
# create RNN cells
logger.info("{}_create RNN cells.".format(self.prefix))
self.RNN = RNN(
self.config['ptr_embedd_dim'],
self.config['ptr_hidden_dim'],
self.config['ptr_contxt_dim'],
name="{}_cell".format(self.prefix)
)
self._add(self.Initializer)
self._add(self.RNN)
# create 2 attention heads
logger.info("_create Attention-Readout layers")
self.att_head = Attention(
self.config['ptr_hidden_dim'],
self.config['ptr_source_dim'],
self.config['ptr_middle_dim'],
name='{}_head_attender'.format(self.prefix)
)
self.att_tail = Attention(
self.config['ptr_hidden_dim'],
self.config['ptr_source_dim'],
self.config['ptr_middle_dim'],
name='{}_tail_attender'.format(self.prefix)
)
self._add(self.att_head)
self._add(self.att_tail)
@staticmethod
def grab_prob(probs, X):
assert probs.ndim == 3
batch_size = probs.shape[0]
max_len = probs.shape[1]
vocab_size = probs.shape[2]
probs = probs.reshape((batch_size * max_len, vocab_size))
return probs[T.arange(batch_size * max_len), X.flatten(1)].reshape(X.shape) # advanced indexing
@staticmethod
def grab_source(source, target):
# source : (nb_samples, source_num, source_dim)
# target : (nb_samples, target_num)
assert source.ndim == 3
batch_size = source.shape[0]
source_num = source.shape[1]
source_dim = source.shape[2]
target_num = target.shape[1]
source_flt = source.reshape((batch_size * source_num, source_dim))
target_idx = (target + (T.arange(batch_size) * source_num)[:, None]).reshape((batch_size * target_num,))
value = source_flt[target_idx].reshape((batch_size, target_num, source_dim))
return value
def build_decoder(self,
inputs,
source, target,
smask=None, tmask=None, context=None):
"""
Build the Pointer Network Decoder Computational Graph
"""
# inputs : (nb_samples, source_num, ptr_embedd_dim)
# source : (nb_samples, source_num, source_dim)
# smask : (nb_samples, source_num)
# target : (nb_samples, target_num)
# tmask : (nb_samples, target_num)
# context: (nb_sample, context_dim)
# initialized hidden state.
assert context is not None
Init_h = self.Initializer(context)
# target is the source inputs.
X = self.grab_source(inputs, target) # (nb_samples, target_num, source_dim)
nb_dim = X.shape[0]
tg_num = X.shape[1]
sc_dim = X.shape[2]
# since it emits two pointers at a time:
# concatenate + reshape
def _get_ht(A, mask=False):
if A.ndim == 2:
B = A[:, -1:]
if mask:
B *= 0.
A = T.concatenate([A, B], axis=1)
return A[:, ::2], A[:, 1::2]
else:
B = A[:, -1:, :]
if mask:
B *= 0.
A = T.concatenate([A, B], axis=1)
return A[:, ::2, :], A[:, 1::2, :]
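# NOTE (worked illustration, hypothetical numbers): a row [3, 5, 2, 7, 4]
# is padded with a copy of its last element (zeroed when mask=True) so an
# odd-length target becomes even: [3, 5, 2, 7, 4, 4]. The strided slices
# then give heads [3, 2, 4] and tails [5, 7, 4], i.e. consecutive
# (head, tail) pointer pairs consumed two at a time.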
Xh, Xt = _get_ht(X)
Th, Tt = _get_ht(target)
Mh, Mt = _get_ht(tmask, mask=True)
Xa = Xh + Xt
Xa = T.concatenate([alloc_zeros_matrix(nb_dim, 1, sc_dim),
Xa[:, :-1, :]], axis=1)
Xa = Xa.dimshuffle((1, 0, 2))
# eat by recurrent net
def _recurrence(x, prev_h, c, s, s_mask):
# RNN read-out
x_out = self.RNN(x, mask=None, C=c, init_h=prev_h, one_step=True)
h_out = self.att_head(x_out, s, s_mask, return_log=True)
t_out = self.att_tail(x_out, s, s_mask, return_log=True)
return x_out, h_out, t_out
outputs, _ = theano.scan(
_recurrence,
sequences=[Xa],
outputs_info=[Init_h, None, None],
non_sequences=[context, source, smask]
)
log_prob_head = outputs[1].dimshuffle((1, 0, 2))
log_prob_tail = outputs[2].dimshuffle((1, 0, 2))
log_prob = T.sum(self.grab_prob(log_prob_head, Th) * Mh, axis=1) \
+ T.sum(self.grab_prob(log_prob_tail, Tt) * Mt, axis=1)
return log_prob
"""
Sample one step
"""
def _step_sample(self,
prev_idx_h, prev_idx_t,
prev_stat,
context, inputs, source, smask):
X = T.switch(
prev_idx_h[:, None] < 0,
alloc_zeros_matrix(prev_idx_h.shape[0], self.config['ptr_embedd_dim']),
self.grab_source(inputs, prev_idx_h[:, None]) + self.grab_source(inputs, prev_idx_t[:, None])
)
# one step RNN
X_out = self.RNN(X, C=context, init_h=prev_stat, one_step=True)
next_stat = X_out
# compute the attention read-out
next_prob_h = self.att_head(X_out, source, smask)
next_sample_h = self.rng.multinomial(pvals=next_prob_h).argmax(1)
next_prob_t = self.att_tail(X_out, source, smask)
next_sample_t = self.rng.multinomial(pvals=next_prob_t).argmax(1)
return next_prob_h, next_sample_h, next_prob_t, next_sample_t, next_stat
def build_sampler(self):
"""
Build a sampler which only steps once.
"""
logger.info("build sampler ...")
if self.config['sample_stoch'] and self.config['sample_argmax']:
logger.info("use argmax search!")
elif self.config['sample_stoch'] and (not self.config['sample_argmax']):
logger.info("use stochastic sampling!")
elif self.config['sample_beam'] > 1:
logger.info("use beam search! (beam_size={})".format(self.config['sample_beam']))
# initial state of our Decoder.
context = T.matrix() # theano variable.
init_h = self.Initializer(context)
logger.info('compile the function: get_init_state')
self.get_init_state \
= theano.function([context], init_h, name='get_init_state')
logger.info('done.')
# sampler: 1 x 1
prev_idxh = T.vector('prev_idxh', dtype='int64')
prev_idxt = T.vector('prev_idxt', dtype='int64')
prev_stat = T.matrix('prev_state', dtype='float32')
inputs = T.tensor3()
source = T.tensor3()
smask = T.imatrix()
next_prob_h, next_sample_h, next_prob_t, next_sample_t, next_stat \
= self._step_sample(prev_idxh, prev_idxt, prev_stat, context,
inputs, source, smask)
# next word probability
logger.info('compile the function: sample_next')
inputs = [prev_idxh, prev_idxt, prev_stat, context, inputs, source, smask]
outputs = [next_prob_h, next_sample_h, next_prob_t, next_sample_t, next_stat]
self.sample_next = theano.function(inputs, outputs, name='sample_next')
logger.info('done')
pass
"""
Generate samples, either with stochastic sampling or beam-search!
"""
def get_sample(self, context, inputs, source, smask,
k=1, maxlen=30, stochastic=True, argmax=False, fixlen=False):
# beam size
if k > 1:
assert not stochastic, 'Beam search does not support stochastic sampling!!'
# fix length cannot use beam search
# if fixlen:
# assert k == 1
# prepare for searching
sample = []
score = []
if stochastic:
score = 0
live_k = 1
dead_k = 0
hyp_samples = [[]] * live_k
hyp_scores = np.zeros(live_k).astype(theano.config.floatX)
hyp_states = []
# get initial state of decoder RNN with context
next_state = self.get_init_state(context)
next_wordh = -1 * np.ones((1,)).astype('int64') # indicator for the first target word (bos target)
next_wordt = -1 * np.ones((1,)).astype('int64')
# Start searching!
for ii in xrange(maxlen):
# print next_word
ctx = np.tile(context, [live_k, 1])
ipt = np.tile(inputs, [live_k, 1, 1])
sor = np.tile(source, [live_k, 1, 1])
smk = np.tile(smask, [live_k, 1])
next_probh, next_wordh, next_probt, next_wordt, next_state \
= self.sample_next(next_wordh, next_wordt, next_state,
ctx, ipt, sor, smk) # wtf.
if stochastic:
# using stochastic sampling (or greedy sampling.)
if argmax:
nw = next_probh[0].argmax()
next_wordh[0] = nw
else:
nw = next_wordh[0]
sample.append(nw)
score += next_probh[0, nw]
if (not fixlen) and (nw == 0): # sample reached the end
break
if argmax:
nw = next_probt[0].argmax()
next_wordt[0] = nw
else:
nw = next_wordt[0]
sample.append(nw)
score += next_probt[0, nw]
if (not fixlen) and (nw == 0): # sample reached the end
break
else:
# using beam-search
# it is unclear how to apply two-pointer beam-search here,
# so beam-search is not supported for this decoder.
raise NotImplementedError('Beam-search is not supported for two-pointer decoding yet.')
return sample, score
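# NOTE: in stochastic mode each step appends a head index and then a tail
# index to `sample`, so the returned sequence interleaves head and tail
# pointers; emitting index 0 for either one terminates decoding (unless
# fixlen is set).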
class MemNet(Model):
"""
Memory Networks:
==> Assign a Matrix to store rules
"""
def __init__(self,
config, rng, learn_memory=False,
prefix='mem'):
super(MemNet, self).__init__()
self.config = config
self.rng = rng # Theano random stream
self.prefix = prefix
self.init = initializations.get('glorot_uniform')
if learn_memory:
self.memory = self.init((self.config['mem_size'], self.config['mem_source_dim']))
self.memory.name = '{}_inner_memory'.format(self.prefix)
self.params += [self.memory]
"""
Create the read-head of the MemoryNets
"""
if self.config['mem_type'] == 'dnn':
self.attender = Attention(
config['mem_hidden_dim'],
config['mem_source_dim'],
config['mem_middle_dim'],
name='{}_attender'.format(self.prefix)
)
else:
self.attender = CosineAttention(
config['mem_hidden_dim'],
config['mem_source_dim'],
use_pipe=config['mem_use_pipe'],
name='{}_attender'.format(self.prefix)
)
self._add(self.attender)
def __call__(self, key, memory=None, mem_mask=None, out_memory=None):
# key: (nb_samples, mem_hidden_dim)
# memory: (nb_samples, mem_size, mem_source_dim)
nb_samples = key.shape[0]
if memory is None:
memory = T.repeat(self.memory[None, :, :], nb_samples, axis=0)
mem_mask = None
if memory.ndim == 2:
memory = T.repeat(memory[None, :, :], nb_samples, axis=0)
probout = self.attender(key, memory, mem_mask) # (nb_samples, mem_size)
if self.config['mem_att_drop'] > 0:
probout = T.clip(probout - self.config['mem_att_drop'], 0, 1)
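# NOTE: subtracting mem_att_drop and clipping to [0, 1] zeroes out weak
# attention weights; the surviving weights are not renormalized, so a
# diffuse attention yields a smaller-magnitude readout.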
if out_memory is None:
readout = T.sum(memory * probout[:, :, None], axis=1)
else:
readout = T.sum(out_memory * probout[:, :, None], axis=1)
return readout, probout
class PtrNet(Model):
"""
Pointer Networks [with/without] External Rule Memory
"""
def __init__(self, config, n_rng, rng,
name='PtrNet', w_mem=True):
super(PtrNet, self).__init__()
self.config = config
self.n_rng = n_rng # numpy random stream
self.rng = rng # Theano random stream
self.name = name
self.w_mem = w_mem
def build_(self, encoder=None):
logger.info("build the Pointer Networks")
# encoder
if not encoder:
self.encoder = Encoder(self.config, self.rng, prefix='enc1')
self._add(self.encoder)
else:
self.encoder = encoder
if self.config['mem_output_mem']:
self.encoder_out = Encoder(self.config, self.rng, prefix='enc_out')
self._add(self.encoder_out)
# twice encoding
if self.config['ptr_twice_enc']:
self.encoder2 = Encoder(self.config, self.rng, prefix='enc2', use_context=True)
self._add(self.encoder2)
# pointer decoder
self.ptrdec = PtrDecoder(self.config, self.rng) # PtrDecoder(self.config, self.rng)
self._add(self.ptrdec)
# memory grabber
self.grabber = MemNet(self.config, self.rng)
self._add(self.grabber)
# memory predictor :: alternative ::
if self.config['use_predict']:
logger.info('create a predictor as Long-Term Memory.')
if self.config['pred_type'] == 'highway':
self.predictor = HighwayNet(self.config['mem_hidden_dim'],
self.config['pred_depth'],
activation='relu',
name='phw')
elif self.config['pred_type'] == 'dense':
self.predictor = Dense(self.config['mem_hidden_dim'],
self.config['mem_hidden_dim'],
name='pdnn')
elif self.config['pred_type'] == 'encoder':
config = self.config
# config['enc_embedd_dim'] = 300
# config['enc_hidden_dim'] = 300
self.predictor = Encoder(config, self.rng, prefix='enc3', use_context=False)
else:
raise NotImplementedError
self._add(self.predictor)
# objectives and optimizers
assert self.config['optimizer'] == 'adam'
self.optimizer = optimizers.get(self.config['optimizer'],
kwargs=dict(rng=self.rng,
save=self.config['save_updates']))
def build_train(self, memory=None, out_memory=None, compile_train=False, guide=None):
# training function for Pointer Networks
indices = T.imatrix() # padded word indices (for training)
target = T.imatrix() # target indices (leading to relative locations)
tmask = T.imatrix() # target masks
pmask = T.cast(1 - T.eq(target[:, 0], 0), dtype='float32')
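# NOTE: pmask presumably flags samples whose first target index is nonzero,
# i.e. samples that actually contain a pointer target; the prediction
# losses below are averaged over these samples only.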
assert memory is not None, 'we must have an input memory'
if self.config['mem_output_mem']:
assert out_memory is not None, 'we must have an output memory'
# L1 of memory
loss_mem = T.sum(abs(T.mean(memory, axis=0)))
# encoding
if not self.config['ptr_twice_enc']:
source, inputs, smask, tail = self.encoder.build_encoder(indices, None, return_embed=True, return_sequence=True)
# grab memory
readout, probout = self.grabber(tail, memory)
if not self.config['use_tail']:
tailx = tail * 0.0
else:
tailx = tail
if not self.config['use_memory']:
readout *= 0.0
# concatenate
context = T.concatenate([tailx, readout], axis=1)
# if predict ?
# predictor: minimize || readout - predict ||^2
if self.config['use_predict']:
if self.config['pred_type'] == 'encoder':
predict = self.predictor.build_encoder(indices, None, return_sequence=False)
else:
predict = self.predictor(tail)
# reconstruction loss [note that we only compute loss for correct memory read.]
loss_r = 0.5 * T.sum(pmask * T.sum(T.sqr(predict - readout), axis=-1).reshape(pmask.shape)) / T.sum(pmask)
dist = T.sum(T.sum(T.sqr(tail - readout), axis=-1).reshape(pmask.shape) * pmask) / T.sum(pmask)
# use predicted readout to compute loss
contextz = T.concatenate([tailx, predict], axis=1)
sourcez, inputsz, smaskz = source, inputs, smask
else:
tail = self.encoder.build_encoder(indices, None, return_sequence=False)
# grab memory
readout, probout = self.grabber(tail, memory, out_memory=out_memory)
# get PrtNet input
if not self.config['use_tail']:
tailx = tail * 0.0
else:
tailx = tail
if not self.config['use_memory']:
readout *= 0.0
# concatenate
context0 = T.concatenate([tailx, readout], axis=1)
# twice encoding ?
source, inputs, smask, context = self.encoder2.build_encoder(
indices, context=context0, return_embed=True, return_sequence=True)
# if predict ?
# predictor: minimize || readout - predict ||^2
if self.config['use_predict']:
if self.config['pred_type'] == 'encoder':
predict = self.predictor.build_encoder(indices, None, return_sequence=False)
else:
predict = self.predictor(tail)
# reconstruction loss [note that we only compute loss for correct memory read.]
loss_r = 0.5 * T.sum(pmask * T.sum(T.sqr(predict - readout), axis=-1).reshape(pmask.shape)) / T.sum(pmask)
dist = T.sum(T.sum(T.sqr(tail - readout), axis=-1).reshape(pmask.shape) * pmask) / T.sum(pmask)
# use predicted readout to compute loss
context1 = T.concatenate([tailx, predict], axis=1)
# twice encoding..
sourcez, inputsz, smaskz, contextz = self.encoder2.build_encoder(
indices, context=context1, return_embed=True, return_sequence=True)
# pointer decoder & loss
logProb = self.ptrdec.build_decoder(inputs, source, target,
smask, tmask, context)
loss = T.mean(-logProb)
# if predict?
if self.config['use_predict']:
logProbz = self.ptrdec.build_decoder(
inputsz, sourcez, target, smaskz, tmask, contextz)
loss_z = -T.sum(pmask * logProbz.reshape(pmask.shape)) / T.sum(pmask)
# if guidance ?
if guide is not None:
# attention loss
# >>>>>>> BE CAREFUL !!! <<<<<<
# the guide vector may contain '-1', which needs a mask.
mask = T.ones_like(guide) * (1 - T.eq(guide, -1))
loss_g = T.mean(
-T.sum(
T.log(PtrDecoder.grab_prob(probout[:, None, :], guide)),
axis=1).reshape(mask.shape) * mask
)
# attention accuracy
attend = probout.argmax(axis=1, keepdims=True)
maxp = T.sum(probout.max(axis=1).reshape(mask.shape) * mask) / T.cast(T.sum(mask), 'float32')
error = T.sum((abs(attend - guide) * mask) > 0) / T.cast(T.sum(mask), 'float32')
if self.config['mem_learn_guide']:
loss += loss_g
# loss += 0.1 * loss_mem
if compile_train:
train_inputs = [indices, target, tmask, memory]
if guide is not None:
train_inputs += [guide]
logger.info("compiling the compuational graph ::training function::")
updates = self.optimizer.get_updates(self.params, loss)
self.train_ = theano.function(train_inputs, loss, updates=updates, name='train_sub')
logger.info("training functions compile done.")
# output the building results for Training
outputs = [loss]
if guide is not None:
outputs += [maxp, error]
outputs += [indices, target, tmask]
if self.config['use_predict']:
outputs += [loss_r, loss_z, dist, readout]
return outputs
def build_sampler(self, memory=None, out_mem=None):
# training function for Pointer Networks
indices = T.imatrix() # padded word indices (for training)
# encoding
if not self.config['ptr_twice_enc']:
# encoding
source, inputs, smask, tail = self.encoder.build_encoder(indices, None, return_embed=True, return_sequence=True)
# grab memory
readout, probout = self.grabber(tail, memory, out_memory=out_mem)
if not self.config['use_tail']:
tail *= 0.0
if not self.config['use_memory']:
readout *= 0.0
# concatenate
context = T.concatenate([tail, readout], axis=1)
else:
tail = self.encoder.build_encoder(indices, None, return_sequence=False)
# grab memory
readout, probout = self.grabber(tail, memory, out_memory=out_mem)
if not self.config['use_tail']:
tail *= 0.0
if not self.config['use_memory']:
readout *= 0.0
# concatenate
context0 = T.concatenate([tail, readout], axis=1)
# twice encoding ?
source, inputs, smask, context = self.encoder2.build_encoder(
indices, context=context0, return_embed=True, return_sequence=True)
# monitoring
self.monitor['attention_prob'] = probout
self._monitoring()
return context, source, smask, inputs, indices
def build_predict_sampler(self):
# training function for Pointer Networks
indices = T.imatrix() # padded word indices (for training)
# encoding
if not self.config['ptr_twice_enc']:
# encoding
source, inputs, smask, tail = self.encoder.build_encoder(indices, None, return_embed=True, return_sequence=True)
# predict memory
if self.config['pred_type'] == 'encoder':
readout = self.predictor.build_encoder(indices, None, return_sequence=False)
else:
readout = self.predictor(tail)
if not self.config['use_tail']:
tail *= 0.0
if not self.config['use_memory']:
readout *= 0.0
# concatenate
context = T.concatenate([tail, readout], axis=1)
else:
tail = self.encoder.build_encoder(indices, None, return_sequence=False)
# predict memory
if self.config['pred_type'] == 'encoder':
readout = self.predictor.build_encoder(indices, None, return_sequence=False)
else:
readout = self.predictor(tail)
if not self.config['use_tail']:
tail *= 0.0
if not self.config['use_memory']:
readout *= 0.0
# concatenate
context0 = T.concatenate([tail, readout], axis=1)
# twice encoding ?
source, inputs, smask, context = self.encoder2.build_encoder(
indices, context=context0, return_embed=True, return_sequence=True)
return context, source, smask, inputs, indices
def generate_(self, inputs, context, source, smask):
args = dict(k=4, maxlen=5, stochastic=False, argmax=False)
sample, score = self.ptrdec.get_sample(context, inputs, source, smask,
**args)
if not args['stochastic']:
score = score / np.array([len(s) for s in sample])
sample = sample[score.argmin()]
score = score.min()
else:
score /= float(len(sample))
return sample, np.exp(score)
================================================
FILE: emolga/models/variational.py
================================================
__author__ = 'jiataogu'
import theano
# theano.config.exception_verbosity = 'high'
import logging
import emolga.basic.objectives as objectives
import emolga.basic.optimizers as optimizers
from emolga.layers.recurrent import *
from emolga.layers.embeddings import *
from emolga.models.encdec import RNNLM, Encoder, Decoder
# from emolga.models.sandbox import SkipDecoder  # sandbox module is not included in this repository
logger = logging
RNN = JZS3 # change it here for other RNN models.
# Decoder = SkipDecoder
class VAE(RNNLM):
"""
Variational Auto-Encoder: RNN-Variational Encoder/Decoder,
in order to model sentence generation.
We implement the original VAE and a better version, IWAE.
References:
Auto-Encoding Variational Bayes
http://arxiv.org/abs/1312.6114
Importance Weighted Autoencoders
http://arxiv.org/abs/1509.00519
"""
def __init__(self,
config, n_rng, rng,
mode='Evaluation'):
super(RNNLM, self).__init__()
self.config = config
self.n_rng = n_rng # numpy random stream
self.rng = rng # Theano random stream
self.mode = mode
self.name = 'vae'
self.tparams= dict()
def _add_tag(self, layer, tag):
if tag not in self.tparams:
self.tparams[tag] = []
if layer:
self.tparams[tag] += layer.params
def build_(self):
logger.info("build the variational auto-encoder")
self.encoder = Encoder(self.config, self.rng, prefix='enc')
if self.config['shared_embed']:
self.decoder = Decoder(self.config, self.rng, prefix='dec', embed=self.encoder.Embed)
else:
self.decoder = Decoder(self.config, self.rng, prefix='dec')
# additional parameters for building Gaussian:
logger.info("create Gaussian layers.")
"""
Build the Gaussian distribution.
"""
self.action_activ = activations.get('tanh')
self.context_mean = Dense(
self.config['enc_hidden_dim'] * 2
if self.config['bidirectional']
else self.config['enc_hidden_dim'],
self.config['action_dim'],
activation='linear',
name="weight_mean"
)
self.context_std = Dense(
self.config['enc_hidden_dim'] * 2
if self.config['bidirectional']
else self.config['enc_hidden_dim'],
self.config['action_dim'],
activation='linear',
name="weight_std"
)
self.context_trans = Dense(
self.config['action_dim'],
self.config['dec_contxt_dim'],
activation='tanh',
name="transform"
)
# registration:
self._add(self.context_mean)
self._add(self.context_std)
self._add(self.context_trans)
self._add(self.encoder)
self._add(self.decoder)
# Q-layers:
self._add_tag(self.encoder, 'q')
self._add_tag(self.context_mean, 'q')
self._add_tag(self.context_std, 'q')
# P-layers:
self._add_tag(self.decoder, 'p')
self._add_tag(self.context_trans, 'p')
# objectives and optimizers
self.optimizer = optimizers.get(self.config['optimizer'])
logger.info("create variational RECURRENT auto-encoder. ok")
def compile_train(self):
"""
build the training function here <:::>
"""
# questions (theano variables)
inputs = T.imatrix() # padded input word sequence (for training)
# encoding. (use backward encoding.)
encoded = self.encoder.build_encoder(inputs[:, ::-1])
# gaussian distribution
mean = self.context_mean(encoded)
ln_var = self.context_std(encoded)
# [important] use multiple samples.
if self.config['repeats'] > 1:
L = self.config['repeats']
# repeat mean, ln_var and targets.
func_r = lambda x: T.extra_ops.repeat(
x[:, None, :], L,
axis=1).reshape((x.shape[0] * L, x.shape[1]))
mean, ln_var, target \
= [func_r(x) for x in [mean, ln_var, inputs]]
else:
target = inputs
action = mean + T.exp(ln_var / 2.) * self.rng.normal(mean.shape)
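# NOTE: this is the Gaussian reparameterization trick,
#   a = mu + sigma * eps,  eps ~ N(0, I),  sigma = exp(ln_var / 2),
# which keeps the sampled action differentiable w.r.t. mean and ln_var.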
context = self.context_trans(action)
# decoding.
logPxz, logPPL = self.decoder.build_decoder(target, context)
# loss function for variational auto-encoding
# regulation loss + reconstruction loss
loss_reg = T.mean(objectives.get('GKL')(mean, ln_var))
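# NOTE: 'GKL' is presumably the analytic KL divergence between
# N(mean, exp(ln_var)) and the standard normal prior, i.e.
#   0.5 * sum(exp(ln_var) + mean^2 - 1 - ln_var)
# per sample; see emolga/basic/objectives.py for the exact definition.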
loss_rec = T.mean(-logPxz)
loss_ppl = T.exp(T.mean(-logPPL))
m_mean = T.mean(abs(mean))
m_ln_var = T.mean(abs(ln_var))
L1 = T.sum([T.sum(abs(w)) for w in self.params])
loss = loss_reg + loss_rec
updates = self.optimizer.get_updates(self.params, loss)
logger.info("compiling the compuational graph ::training function::")
train_inputs = [inputs]
self.train_ = theano.function(train_inputs,
[loss_reg, loss_rec, L1, m_ln_var],
updates=updates,
name='train_fun')
# add monitoring:
self.monitor['action'] = action
self._monitoring()
# compiling monitoring
self.compile_monitoring(train_inputs)
logger.info("pre-training functions compile done.")
def compile_sample(self):
"""
build the sampler function here <:::>
"""
# context vectors (actions)
self.decoder.build_sampler()
l = T.iscalar()
logger.info("compiling the computational graph :: action sampler")
self.action_sampler = theano.function([l], self.rng.normal((l, self.config['action_dim'])))
action = T.matrix()
logger.info("compiling the compuational graph ::transform function::")
self.transform = theano.function([action], self.context_trans(action))
logger.info("display functions compile done.")
def compile_inference(self):
"""
build the hidden action prediction.
"""
inputs = T.imatrix() # padded input word sequence (for training)
# encoding. (use backward encoding.)
encoded = self.encoder.build_encoder(inputs[:, ::-1])
# gaussian distribution
mean = self.context_mean(encoded)
ln_var = self.context_std(encoded)
self.inference_ = theano.function([inputs], [encoded, mean, T.sqrt(T.exp(ln_var))])
logger.info("inference function compile done.")
def default_context(self):
return self.transform(self.action_sampler(1))
class Helmholtz(VAE):
"""
Another alternative is the Helmholtz Machine.
It is trained using the Reweighted Wake-Sleep algorithm.
Reference:
Reweighted Wake-Sleep
http://arxiv.org/abs/1406.2751
"""
def __init__(self,
config, n_rng, rng,
mode = 'Evaluation',
dynamic_prior=False,
):
super(VAE, self).__init__(config, n_rng, rng)
# self.config = config
# self.n_rng = n_rng # numpy random stream
# self.rng = rng # Theano random stream
self.mode = mode
self.name = 'multitask_helmholtz'
self.tparams = dict()
self.dynamic_prior = dynamic_prior
def build_(self):
logger.info('Build Helmholtz Recurrent Neural Networks')
self.encoder = Encoder(self.config, self.rng, prefix='enc')
if self.config['shared_embed']:
self.decoder = Decoder(self.config, self.rng, prefix='dec', embed=self.encoder.Embed,
highway=self.config['highway'])
else:
self.decoder = Decoder(self.config, self.rng, prefix='dec',
highway=self.config['highway'])
# The main difference between VAE and HM is that we can use
# a more flexible prior instead of Gaussian here.
# for example, we use a sigmoid prior here.
"""
Build the Sigmoid Layers
"""
# prior distribution (bias layer)
self.Prior = Constant(
self.config['action_dim'],
self.config['action_dim'],
activation='sigmoid',
name='prior_proj'
)
# Fake Posterior (Q-function)
self.Posterior = Dense(
self.config['enc_hidden_dim'] * 2
if self.config['bidirectional']
else self.config['enc_hidden_dim'],
self.config['action_dim'],
activation='sigmoid',
name = 'posterior_proj'
)
# Action transform to context
self.context_trans = Dense(
self.config['action_dim'],
self.config['dec_contxt_dim'],
activation='linear',
name="transform"
)
# registration:
self._add(self.Posterior)
self._add(self.Prior)
self._add(self.context_trans)
self._add(self.encoder)
self._add(self.decoder)
# Q-layers:
self._add_tag(self.encoder, 'q')
self._add_tag(self.Posterior, 'q')
# P-layers:
self._add_tag(self.Prior, 'p')
self._add_tag(self.decoder, 'p')
self._add_tag(self.context_trans, 'p')
# objectives and optimizers
self.optimizer_p = optimizers.get(self.config['optimizer'], kwargs={'clipnorm': 5})
self.optimizer_q = optimizers.get(self.config['optimizer'], kwargs={'clipnorm': 5})
logger.info("create Helmholtz RECURRENT neural network. ok")
def dynamic(self):
self.Prior = Dense(
self.config['state_dim'],
self.config['action_dim'],
activation='sigmoid',
name='prior_proj'
)
self.params = []
self.layers = []
self.tparams= dict()
# add layers again!
# registration:
self._add(self.Posterior)
self._add(self.Prior)
self._add(self.context_trans)
self._add(self.encoder)
self._add(self.decoder)
# Q-layers:
self._add_tag(self.encoder, 'q')
self._add_tag(self.Posterior, 'q')
# P-layers:
self._add_tag(self.Prior, 'p')
self._add_tag(self.decoder, 'p')
self._add_tag(self.context_trans, 'p')
def compile_(self, mode='train', contrastive=False):
# compile the computational graph.
# INFO: the parameters.
# mode: 'train'/ 'display'/ 'policy' / 'all'
ps = 'params: {\n'
for p in self.params:
ps += '{0}: {1}\n'.format(p.name, p.eval().shape)
ps += '}.'
logger.info(ps)
param_num = np.sum([np.prod(p.shape.eval()) for p in self.params])
logger.info("total number of the parameters of the model: {}".format(param_num))
if mode == 'train' or mode == 'all':
if not contrastive:
self.compile_train()
else:
self.compile_train_CE()
if mode == 'display' or mode == 'all':
self.compile_sample()
if mode == 'inference' or mode == 'all':
self.compile_inference()
def compile_train(self):
"""
build the training function here <:::>
"""
# get input sentence (x)
inputs = T.imatrix() # padded input word sequence (for training)
batch_size = inputs.shape[0]
"""
The Computational Flow.
"""
# encoding. (use backward encoding.)
encoded = self.encoder.build_encoder(inputs[:, ::-1])
# get Q(a|y) = sigmoid(.|Posterior * encoded)
q_dis = self.Posterior(encoded)
# use multiple samples
L = T.iscalar('repeats') #self.config['repeats']
def func_r(x):
return T.extra_ops.repeat(x[:, None, :], L, axis=1).reshape((-1, x.shape[1])) # ?
q_dis, target = [func_r(x) for x in [q_dis, inputs]]
# sample actions
u = self.rng.uniform(q_dis.shape)
action = T.cast(u <= q_dis, dtype=theano.config.floatX)
# compute the exact probability for actions
logQax = T.sum(action * T.log(q_dis) + (1 - action) * T.log(1 - q_dis), axis=1)
# decoding.
context = self.context_trans(action)
logPxa, count = self.decoder.build_decoder(target, context, return_count=True)
logPPL = logPxa / count
# logPxa, logPPL = self.decoder.build_decoder(target, context)
# prior.
p_dis = self.Prior(action)
logPa = T.sum(action * T.log(p_dis) + (1 - action) * T.log(1 - p_dis), axis=1)
"""
Compute the weights
"""
# reshape
logQax = logQax.reshape((batch_size, L))
logPa = logPa.reshape((batch_size, L))
logPxa = logPxa.reshape((batch_size, L))
count = count.reshape((batch_size, L))[:, :1]
# P(x, a) = P(a) * P(x|a)
logPx_a = logPa + logPxa
log_wk = logPx_a - logQax
log_bpk = logPa - logQax
log_w_sum = logSumExp(log_wk, axis=1)
log_bp_sum = logSumExp(log_bpk, axis=1)
log_wnk = log_wk - log_w_sum
log_bpnk = log_bpk - log_bp_sum
# importance-weighted (IWAE-style) estimate of the log-likelihood
# nll = -T.mean(log_w_sum - T.log(L))
nll = T.mean(-(log_w_sum - T.log(L)))
perplexity = T.exp(T.mean(-(log_w_sum - T.log(L)) / count))
# perplexity = T.exp(-T.mean((log_w_sum - T.log(L)) / count))
"""
Compute the Loss function
"""
# loss = weights * log [p(a)p(x|a)/q(a|x)]
weights = T.exp(log_wnk)
bp = T.exp(log_bpnk)
bq = 1. / L
ess = T.mean(1 / T.sum(weights ** 2, axis=1))
# monitoring
# self.monitor['action'] = action
if self.config['variant_control']:
lossQ = -T.mean(T.sum(logQax * (weights - bq), axis=1)) # log q(a|x)
lossPa = -T.mean(T.sum(logPa * (weights - bp), axis=1)) # log p(a)
lossPxa = -T.mean(T.sum(logPxa * weights, axis=1)) # log p(x|a)
lossP = lossPxa + lossPa
updates_p = self.optimizer_p.get_updates(self.tparams['p'], [lossP, weights, bp])
updates_q = self.optimizer_q.get_updates(self.tparams['q'], [lossQ, weights])
else:
lossQ = -T.mean(T.sum(logQax * weights, axis=1)) # log q(a|x)
lossPa = -T.mean(T.sum(logPa * weights, axis=1)) # log p(a)
lossPxa = -T.mean(T.sum(logPxa * weights, axis=1)) # log p(x|a)
lossP = lossPxa + lossPa
# lossRes = -T.mean(T.nnet.relu(T.sum((logPa + logPxa - logPx0) * weights, axis=1)))
# lossP = 0.1 * lossRes + lossP
updates_p = self.optimizer_p.get_updates(self.tparams['p'], [lossP, weights])
updates_q = self.optimizer_q.get_updates(self.tparams['q'], [lossQ, weights])
updates = updates_p + updates_q
logger.info("compiling the compuational graph ::training function::")
train_inputs = [inputs] + [theano.Param(L, default=10)]
self.train_ = theano.function(train_inputs,
[lossPa, lossPxa, lossQ, perplexity, nll],
updates=updates,
name='train_fun')
logger.info("compile the computational graph:: >__< :: explore function")
self.explore_ = theano.function(train_inputs,
[log_wk, count],
name='explore_fun')
# add monitoring:
# self._monitoring()
# compiling monitoring
# self.compile_monitoring(train_inputs)
logger.info("pre-training functions compile done.")
def build_dynamics(self, states, action, Y):
# this function computes the probabilities needed for language generation.
# compute the probability of action
assert self.dynamic_prior, 'only supports dynamic prior'
p_dis = self.Prior(states)
logPa = T.sum(action * T.log(p_dis) + (1 - action) * T.log(1 - p_dis), axis=1)
context = self.context_trans(action)
logPxa, count = self.decoder.build_decoder(Y, context, return_count=True)
return logPa, logPxa, count
def compile_sample(self):
"""
build the sampler function here <:::>
"""
# context vectors (as)
self.decoder.build_sampler()
logger.info("compiling the computational graph :: action sampler")
if self.dynamic_prior:
states = T.matrix()
p_dis = self.Prior(states)
u = self.rng.uniform(p_dis.shape)
else:
p_dis = self.Prior()
l = T.iscalar()
u = self.rng.uniform((l, p_dis.shape[-1]))
action = T.cast(u <= p_dis, dtype=theano.config.floatX)
if self.dynamic_prior:
self.action_sampler = theano.function([states], action)
else:
self.action_sampler = theano.function([l], action)
# compute the action probability
logPa = T.sum(action * T.log(p_dis) + (1 - action) * T.log(1 - p_dis), axis=1)
if self.dynamic_prior:
self.action_prob = theano.function([states, action], logPa)
else:
self.action_prob = theano.function([action], logPa)
action = T.matrix()
logger.info("compiling the computational graph ::transform function::")
self.transform = theano.function([action], self.context_trans(action))
logger.info("display functions compile done.")
def compile_inference(self):
"""
build the hidden action prediction.
"""
inputs = T.imatrix() # padded input word sequence (for training)
# encoding. (use backward encoding.)
encoded = self.encoder.build_encoder(inputs[:, ::-1])
# get Q(a|y) = sigmoid(.|Posterior * encoded)
q_dis = self.Posterior(encoded)
p_dis = self.Prior(inputs)
self.inference_ = theano.function([inputs], [encoded, q_dis, p_dis])
logger.info("inference function compile done.")
def evaluate_(self, inputs):
"""
build the evaluation function for valid/testing
Note that we need multiple sampling for this!
"""
log_wks = []
count = None
N = self.config['eval_N']
L = self.config['eval_repeats']
for _ in xrange(N):
log_wk, count = self.explore_(inputs, L)
log_wks.append(log_wk)
log_wk = np.concatenate(log_wks, axis=1)
log_wk_sum = logSumExp(log_wk, axis=1, status='numpy')
nll = np.mean(-(log_wk_sum - np.log(N * L)))
perplexity = np.exp(np.mean(-(log_wk_sum - np.log(N * L)) / count))
return nll, perplexity
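# evaluate_ draws N * L importance samples per input by calling explore_
# N times with L samples each and concatenating the log weights along
# axis 1, so the estimate matches the training-time one with an effective
# sample count of N * L: log p(x) ~= logSumExp_k(log w_k) - log(N * L).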
"""
OLD CODE:: >>> It doesn't work!
"""
def compile_train_CE(self):
# compile the computation graph (use contrastive noise, for 1 sample here. )
"""
build the training function here <:::>
"""
# get input sentence (x)
inputs = T.imatrix() # padded input word sequence x (for training)
noises = T.imatrix() # padded noise word sequence y (it stands for another question.)
batch_size = inputs.shape[0]
"""
The Computational Flow.
"""
# encoding. (use backward encoding.)
encodex = self.encoder.build_encoder(inputs[:, ::-1])
encodey = self.encoder.build_encoder(noises[:, ::-1])
# get Q(a|y) = sigmoid(.|Posterior * encoded)
q_dis_x = self.Posterior(encodex)
q_dis_y = self.Posterior(encodey)
# use multiple samples
if self.config['repeats'] > 1:
L = self.config['repeats']
# repeat mean, ln_var and targets.
func_r = lambda x: T.extra_ops.repeat(
x[:, None, :], L,
axis=1).reshape((x.shape[0] * L, x.shape[1]))
q_dis_x, q_dis_y, target \
= [func_r(x) for x in [q_dis_x, q_dis_y, inputs]]
else:
target = inputs
L = 1
# sample actions
u = self.rng.uniform(q_dis_x.shape)
action = T.cast(u <= q_dis_x, dtype=theano.config.floatX)
# compute the exact probability for actions (for data distribution)
logQax = T.sum(action * T.log(q_dis_x) + (1 - action) * T.log(1 - q_dis_x), axis=1)
# compute the exact probability for actions (for noise distribution)
logQay = T.sum(action * T.log(q_dis_y) + (1 - action) * T.log(1 - q_dis_y), axis=1)
# decoding.
context = self.context_trans(action)
logPxa, count = self.decoder.build_decoder(target, context, return_count=True)
# prior.
p_dis = self.Prior(target)
logPa = T.sum(action * T.log(p_dis) + (1 - action) * T.log(1 - p_dis), axis=1)
"""
Compute the weights
"""
# reshape
logQax = logQax.reshape((batch_size, L))
logQay = logQay.reshape((batch_size, L))
logPa = logPa.reshape((batch_size, L))
logPxa = logPxa.reshape((batch_size, L))
# P(x, a) = P(a) * P(x|a)
# logPx_a = logPa + logPxa
logPx_a = logPa + logPxa
# normalizing the weights
log_wk = logPx_a - logQax
log_bpk = logPa - logQax
log_w_sum = logSumExp(log_wk, axis=1)
log_bp_sum = logSumExp(log_bpk, axis=1)
log_wnk = log_wk - log_w_sum
log_bpnk = log_bpk - log_bp_sum
# unbiased log-likelihood estimator
logPx = T.mean(log_w_sum - T.log(L))
perplexity = T.exp(-T.mean((log_w_sum - T.log(L)) / count))
"""
Compute the Loss function
"""
# loss = weights * log [p(a)p(x|a)/q(a|x)]
weights = T.exp(log_wnk)
bp = T.exp(log_bpnk)
bq = 1. / L
ess = T.mean(1 / T.sum(weights ** 2, axis=1))
"""
Contrastive Estimation
"""
# lossQ = -T.mean(T.sum(logQax * (weights - bq), axis=1)) # log q(a|x)
logC = logQax - logQay
weightC = weights * (1 - T.nnet.sigmoid(logC))
lossQ = -T.mean(T.sum(logC * weightC, axis=1))
# lossQT = -T.mean(T.sum(T.log(T.nnet.sigmoid(logC)) * weights, axis=1))
# monitoring
self.monitor['action'] = logC
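# logC = log q(a|x) - log q(a|y) is a contrastive score of the sampled
# action under the data input x versus the noise input y; the factor
# (1 - sigmoid(logC)) down-weights samples the posterior already separates
# well, so the gradient concentrates on confusable (x, y) pairs.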
"""
Maximum-likelihood Estimation
"""
lossPa = -T.mean(T.sum(logPa * (weights - bp), axis=1)) # log p(a)
lossPxa = -T.mean(T.sum(logPxa * weights, axis=1)) # log p(x|a)
lossP = lossPxa + lossPa
# loss = lossQT + lossPa + lossPxa
updates_p = self.optimizer_p.get_updates(self.tparams['p'], [lossP, weights, bp])
updates_q = self.optimizer_q.get_updates(self.tparams['q'], [lossQ, weightC])
updates = updates_p + updates_q
logger.info("compiling the compuational graph ::training function::")
train_inputs = [inputs, noises]
self.train_ce_ = theano.function(train_inputs,
[lossPa, lossPxa, lossQ, perplexity, ess],
updates=updates,
name='train_fun')
# add monitoring:
self._monitoring()
# compiling monitoring
self.compile_monitoring(train_inputs)
logger.info("pre-training functions compile done.")
class HarX(Helmholtz):
"""
Another alternative is the Helmholtz Machine,
trained with the Reweighted Wake-Sleep algorithm.
Reference:
Reweighted Wake-Sleep
http://arxiv.org/abs/1406.2751
We extend the original Helmholtz Machine in a recurrent fashion.
"""
def __init__(self,
config, n_rng, rng,
mode = 'Evaluation',
dynamic_prior=False,
):
super(HarX, self).__init__(config, n_rng, rng)
# self.config = config
# self.n_rng = n_rng # numpy random stream
# self.rng = rng # Theano random stream
self.mode = mode
self.name = 'multitask_helmholtz'
self.tparams = dict()
self.dynamic_prior = dynamic_prior
def build_(self):
logger.info('Build Helmholtz Recurrent Neural Networks')
# backward encoder
self.encoder = Encoder(self.config, self.rng, prefix='enc')
# feedforward + hidden content decoder
self.decoder = Decoder(self.config, self.rng, prefix='dec',
embed=self.encoder.Embed
if self.config['shared_embed']
else None)
# The main difference between VAE and HM is that we can use
# a more flexible prior instead of a Gaussian here;
# for example, we use a sigmoid prior.
"""
Build the Sigmoid Layers
"""
# prior distribution (conditional distribution)
self.Prior = Dense(
self.config['dec_hidden_dim'],
self.config['action_dim'],
activation='sigmoid',
name='prior_proj'
)
# Fake Posterior (Q-function)
if self.config['decposterior']:
self.Posterior = Dense2(
self.config['enc_hidden_dim']
if not self.config['bidirectional']
else 2 * self.config['enc_hidden_dim'],
self.config['dec_hidden_dim'],
self.config['action_dim'],
activation='sigmoid',
name='posterior_proj'
)
else:
self.Posterior = Dense(
self.config['enc_hidden_dim']
if not self.config['bidirectional']
else 2 * self.config['enc_hidden_dim'],
self.config['action_dim'],
activation='sigmoid',
name='posterior_proj'
)
# Action transform to context
self.context_trans = Dense(
self.config['action_dim'],
self.config['dec_contxt_dim'],
activation='linear',
name="transform"
)
# registration:
self._add(self.Posterior)
self._add(self.Prior)
self._add(self.context_trans)
self._add(self.encoder)
self._add(self.decoder)
# Q-layers:
self._add_tag(self.encoder, 'q')
self._add_tag(self.Posterior, 'q')
# P-layers:
self._add_tag(self.Prior, 'p')
self._add_tag(self.decoder, 'p')
self._add_tag(self.context_trans, 'p')
# objectives and optimizers
self.optimizer_p = optimizers.get(self.config['optimizer'], kwargs={'clipnorm': 5})
self.optimizer_q = optimizers.get(self.config['optimizer'], kwargs={'clipnorm': 5})
logger.info("create Helmholtz RECURRENT neural network. ok")
def compile_(self, mode='train', contrastive=False):
# compile the computational graph.
# INFO: the parameters.
# mode: 'train'/ 'display'/ 'policy' / 'all'
ps = 'params: {\n'
for p in self.params:
ps += '{0}: {1}\n'.format(p.name, p.eval().shape)
ps += '}.'
logger.info(ps)
param_num = np.sum([np.prod(p.shape.eval()) for p in self.params])
logger.info("total number of the parameters of the model: {}".format(param_num))
if mode == 'train' or mode == 'all':
self.compile_train()
if mode == 'display' or mode == 'all':
self.compile_sample()
if mode == 'inference' or mode == 'all':
self.compile_inference()
"""
Training
"""
def compile_train(self):
"""
build the training function here <:::>
"""
# get input sentence (x)
inputs = T.imatrix() # padded input word sequence (for training)
batch_size = inputs.shape[0]
logger.info(
"""
The Computational Flow. ---> In a recurrent fashion
[= v =] <:::
Inference-Generation in one scan
>>>> Encoding without hidden variable. (use backward encoding.)
"""
)
embeded, mask \
= self.decoder.Embed(inputs, True) # (nb_samples, max_len, embedding_dim)
encoded = self.encoder.build_encoder(inputs[:, ::-1], return_sequence=True)[:, ::-1, :]
count = T.cast(T.sum(mask, axis=1), dtype=theano.config.floatX)[:, None] # (nb_samples, 1)
logger.info(
"""
>>>> Repeat
"""
)
L = T.iscalar('repeats') # self.config['repeats']
def _repeat(x, dimshuffle=True):
if x.ndim == 3:
y = T.extra_ops.repeat(x[:, None, :, :], L, axis=1).reshape((-1, x.shape[1], x.shape[2]))
if dimshuffle:
y = y.dimshuffle(1, 0, 2)
else:
y = T.extra_ops.repeat(x[:, None, :], L, axis=1).reshape((-1, x.shape[1]))
if dimshuffle:
y = y.dimshuffle(1, 0)
return y
embeded = _repeat(embeded) # (max_len, nb_samples * L, embedding_dim)
encoded = _repeat(encoded) # (max_len, nb_samples * L, enc_hidden_dim)
target = _repeat(inputs, False) # (nb_samples * L, max_len)
mask = _repeat(mask, False) # (nb_samples * L, max_len)
init_dec = T.zeros((encoded.shape[1],
self.config['dec_hidden_dim']),
dtype='float32') # zero initialization
uniform = self.rng.uniform((embeded.shape[0],
embeded.shape[1],
self.config['action_dim'])) # uniform distribution, pre-sampled.
logger.info(
"""
>>>> Recurrence
"""
)
def _recurrence(embed_t, enc_t, u_t, dec_tm1):
"""
x_t: (nb_samples, dec_embedd_dim)
enc_t: (nb_samples, enc_hidden_dim)
dec_t: (nb_samples, dec_hidden_dim)
"""
# get q(z_t|dec_t, enc_t); sample z_t; compute the Posterior (inference) prob.
if self.config['decposterior']:
q_dis_t = self.Posterior(enc_t, dec_tm1)
else:
q_dis_t = self.Posterior(enc_t)
z_t = T.cast(u_t <= q_dis_t, dtype='float32')
log_qzx_t = T.sum(z_t * T.log(q_dis_t) + (1 - z_t) * T.log(1 - q_dis_t), axis=1) # (nb_samples * L, )
# compute the prior probability
p_dis_t = self.Prior(dec_tm1)
log_pz0_t = T.sum(z_t * T.log(p_dis_t) + (1 - z_t) * T.log(1 - p_dis_t), axis=1)
# compute the decoding probability
context_t = self.context_trans(z_t)
readout_t = self.decoder.hidden_readout(dec_tm1) + self.decoder.context_readout(context_t)
for l in self.decoder.output_nonlinear:
readout_t = l(readout_t)
pxz_dis_t = self.decoder.output(readout_t)
# compute recurrence
dec_t = self.decoder.RNN(embed_t, C=context_t, init_h=dec_tm1, one_step=True)
return dec_t, z_t, log_qzx_t, log_pz0_t, pxz_dis_t
# (max_len, nb_samples, ?)
outputs, _ = theano.scan(
_recurrence,
sequences=[embeded, encoded, uniform],
outputs_info=[init_dec, None, None, None, None])
_, z, log_qzx, log_pz0, pxz_dis = outputs
# summary of scan/ dimshuffle/ reshape
def _grab_prob(probs, x):
assert probs.ndim == 3
b_size = probs.shape[0]
max_len = probs.shape[1]
vocab_size = probs.shape[2]
probs = probs.reshape((b_size * max_len, vocab_size))
return probs[T.arange(b_size * max_len), x.flatten(1)].reshape(x.shape) # advanced indexing
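# _grab_prob selects, for each position, the probability the softmax
# assigned to the observed token, via advanced indexing on the flattened
# (batch * time, vocab) matrix. A minimal NumPy sketch (illustrative
# shapes and values only):
#
#     >>> import numpy as np
#     >>> probs = np.random.dirichlet(np.ones(5), size=(2, 3))  # (b, t, V)
#     >>> x = np.array([[0, 2, 4], [1, 1, 3]])                  # (b, t)
#     >>> flat = probs.reshape(-1, 5)
#     >>> picked = flat[np.arange(6), x.flatten()].reshape(x.shape)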
log_qzx = T.sum(log_qzx.dimshuffle(1, 0) * mask, axis=-1).reshape((batch_size, L))
log_pz0 = T.sum(log_pz0.dimshuffle(1, 0) * mask, axis=-1).reshape((batch_size, L))
log_pxz = T.sum(T.log(_grab_prob(pxz_dis.dimshuffle(1, 0, 2), target)) * mask, axis=-1).reshape((batch_size, L))
logger.info(
"""
>>>> Compute the weights [+ _ =]
"""
)
log_pxnz = log_pz0 + log_pxz # log p(X, Z)
log_wk = log_pxnz - log_qzx # log[p(X, Z)/q(Z|X)]
log_bpk = log_pz0 - log_qzx # log[p(Z)/q(Z|X)]
log_w_sum = logSumExp(log_wk, axis=1)
log_bp_sum = logSumExp(log_bpk, axis=1)
log_wnk = log_wk - log_w_sum
log_bpnk = log_bpk - log_bp_sum
# unbiased log-likelihood estimator [+ _ =]
# Finally come to this place
nll = T.mean(-(log_w_sum - T.log(L)))
perplexity = T.exp(T.mean(-(log_w_sum - T.log(L)) / count))
# perplexity = T.exp(-T.mean((log_w_sum - T.log(L)) / count))
logger.info(
"""
>>>> Compute the gradients [+ _ =]
"""
)
# loss = weights * log [p(a)p(x|a)/q(a|x)]
weights = T.exp(log_wnk)
bp = T.exp(log_bpnk)
bq = 1. / L
ess = T.mean(1 / T.sum(weights ** 2, axis=1))
# monitoring
self.monitor['hidden state'] = z
if self.config['variant_control']:
lossQ = -T.mean(T.sum(log_qzx * (weights - bq), axis=1)) # log q(z|x)
lossPa = -T.mean(T.sum(log_pz0 * (weights - bp), axis=1)) # log p(z)
lossPxa = -T.mean(T.sum(log_pxz * weights, axis=1)) # log p(x|z)
lossP = lossPxa + lossPa
# L2 regu
lossP += 0.0001 * T.sum([T.sum(p**2) for p in self.tparams['p']])
lossQ += 0.0001 * T.sum([T.sum(p**2) for p in self.tparams['q']])
updates_p = self.optimizer_p.get_updates(self.tparams['p'], [lossP, weights, bp])
updates_q = self.optimizer_q.get_updates(self.tparams['q'], [lossQ, weights])
else:
lossQ = -T.mean(T.sum(log_qzx * weights, axis=1)) # log q(a|x)
lossPa = -T.mean(T.sum(log_pz0 * weights, axis=1)) # log p(a)
lossPxa = -T.mean(T.sum(log_pxz * weights, axis=1)) # log p(x|a)
lossP = lossPxa + lossPa
# L2 regu
print 'L2 ?'
lossP += 0.0001 * T.sum([T.sum(p**2) for p in self.tparams['p']])
lossQ += 0.0001 * T.sum([T.sum(p**2) for p in self.tparams['q']])
updates_p = self.optimizer_p.get_updates(self.tparams['p'], [lossP, weights])
updates_q = self.optimizer_q.get_updates(self.tparams['q'], [lossQ, weights])
updates = updates_p + updates_q
logger.info("compiling the compuational graph:: >__< ::training function::")
train_inputs = [inputs] + [theano.Param(L, default=10)]
self.train_ = theano.function(train_inputs,
[lossPa, lossPxa, lossQ, perplexity, nll],
updates=updates,
name='train_fun')
logger.info("compile the computational graph:: >__< :: explore function")
self.explore_ = theano.function(train_inputs,
[log_wk, count],
name='explore_fun')
# add monitoring:
self._monitoring()
# compiling monitoring
self.compile_monitoring(train_inputs)
logger.info("pre-training functions compile done.")
def generate_(self, context=None, max_len=None, mode='display'):
# overwrite the RNNLM generator as there are hidden variables every time step
args = dict(k=self.config['sample_beam'],
maxlen=self.config['max_len'] if not max_len else max_len,
stochastic=self.config['sample_stoch'] if mode == 'display' else None,
argmax=self.config['sample_argmax'] if mode == 'display' else None)
class THarX(Helmholtz):
"""
Another alternative is the Helmholtz Machine,
trained with the Reweighted Wake-Sleep algorithm.
Reference:
Reweighted Wake-Sleep
http://arxiv.org/abs/1406.2751
We extend the original Helmholtz Machine in a recurrent fashion.
"""
def __init__(self,
config, n_rng, rng,
mode = 'Evaluation',
dynamic_prior=False,
):
super(THarX, self).__init__(config, n_rng, rng)
# self.config = config
# self.n_rng = n_rng # numpy random stream
# self.rng = rng # Theano random stream
self.mode = mode
self.name = 'multitask_helmholtz'
self.tparams = dict()
self.dynamic_prior = dynamic_prior
def build_(self):
logger.info('Build Helmholtz Recurrent Neural Networks')
# backward encoder
self.encoder = Encoder(self.config, self.rng, prefix='enc')
# feedforward + hidden content decoder
self.decoder = Decoder(self.config, self.rng, prefix='dec',
embed=self.encoder.Embed
if self.config['shared_embed']
else None)
# The main difference between VAE and HM is that we can use
# a more flexible prior instead of a Gaussian here;
# for example, we use a softmax prior.
"""
Build the Sigmoid Layers
"""
# prior distribution (conditional distribution)
self.Prior = Dense(
self.config['dec_hidden_dim'],
self.config['action_dim'],
activation='softmax',
name='prior_proj'
)
# Fake Posterior (Q-function)
if self.config['decposterior']:
self.Posterior = Dense2(
self.config['enc_hidden_dim']
if not self.config['bidirectional']
else 2 * self.config['enc_hidden_dim'],
self.config['dec_hidden_dim'],
self.config['action_dim'],
activation='softmax',
name='posterior_proj'
)
else:
self.Posterior = Dense(
self.config['enc_hidden_dim']
if not self.config['bidirectional']
else 2 * self.config['enc_hidden_dim'],
self.config['action_dim'],
activation='softmax',
name='posterior_proj'
)
# Action transform to context
self.context_trans = Dense(
self.config['action_dim'],
self.config['dec_contxt_dim'],
activation='linear',
name="transform"
)
# registration:
self._add(self.Posterior)
self._add(self.Prior)
self._add(self.context_trans)
self._add(self.encoder)
self._add(self.decoder)
# Q-layers:
self._add_tag(self.encoder, 'q')
self._add_tag(self.Posterior, 'q')
# P-layers:
self._add_tag(self.Prior, 'p')
self._add_tag(self.decoder, 'p')
self._add_tag(self.context_trans, 'p')
# objectives and optimizers
self.optimizer_p = optimizers.get(self.config['optimizer'], kwargs={'clipnorm': 5})
self.optimizer_q = optimizers.get(self.config['optimizer'], kwargs={'clipnorm': 5})
logger.info("create Helmholtz RECURRENT neural network. ok")
def compile_(self, mode='train', contrastive=False):
# compile the computational graph.
# INFO: the parameters.
# mode: 'train'/ 'display'/ 'policy' / 'all'
ps = 'params: {\n'
for p in self.params:
ps += '{0}: {1}\n'.format(p.name, p.eval().shape)
ps += '}.'
logger.info(ps)
param_num = np.sum([np.prod(p.shape.eval()) for p in self.params])
logger.info("total number of the parameters of the model: {}".format(param_num))
if mode == 'train' or mode == 'all':
self.compile_train()
if mode == 'display' or mode == 'all':
self.compile_sample()
if mode == 'inference' or mode == 'all':
self.compile_inference()
"""
Training
"""
def compile_train(self):
"""
build the training function here <:::>
"""
# get input sentence (x)
inputs = T.imatrix('inputs') # padded input word sequence (for training)
batch_size = inputs.shape[0]
logger.info(
"""
The Computational Flow. ---> In a recurrent fashion
[= v =] <:::
Inference-Generation in one scan
>>>> Encoding without hidden variable. (use backward encoding.)
"""
)
embeded, mask \
= self.decoder.Embed(inputs, True) # (nb_samples, max_len, embedding_dim)
encoded = self.encoder.build_encoder(inputs[:, ::-1], return_sequence=True)[:, ::-1, :]
count = T.cast(T.sum(mask, axis=1), dtype=theano.config.floatX)[:, None] # (nb_samples, 1)
logger.info(
"""
>>>> Repeat
"""
)
L = T.iscalar('repeats') # self.config['repeats']
def _repeat(x, dimshuffle=True):
if x.ndim == 3:
y = T.extra_ops.repeat(x[:, None, :, :], L, axis=1).reshape((-1, x.shape[1], x.shape[2]))
if dimshuffle:
y = y.dimshuffle(1, 0, 2)
else:
y = T.extra_ops.repeat(x[:, None, :], L, axis=1).reshape((-1, x.shape[1]))
if dimshuffle:
y = y.dimshuffle(1, 0)
return y
embeded = _repeat(embeded) # (max_len, nb_samples * L, embedding_dim)
encoded = _repeat(encoded) # (max_len, nb_samples * L, enc_hidden_dim)
target = _repeat(inputs, False) # (nb_samples * L, max_len)
mask = _repeat(mask, False) # (nb_samples * L, max_len)
init_dec = T.zeros((encoded.shape[1],
self.config['dec_hidden_dim']),
dtype='float32') # zero initialization
# uniform = self.rng.uniform((embeded.shape[0],
# embeded.shape[1],
# self.config['action_dim'])) # uniform distribution, pre-sampled.
logger.info(
"""
>>>> Recurrence
"""
)
def _recurrence(embed_t, enc_t, dec_tm1):
"""
x_t: (nb_samples, dec_embedd_dim)
enc_t: (nb_samples, enc_hidden_dim)
dec_t: (nb_samples, dec_hidden_dim)
"""
# get q(z_t|dec_t, enc_t); sample z_t; compute the Posterior (inference) prob.
if self.config['decposterior']:
q_dis_t = self.Posterior(enc_t, dec_tm1)
else:
q_dis_t = self.Posterior(enc_t)
z_t = self.rng.multinomial(pvals=q_dis_t, dtype='float32')
log_qzx_t = T.sum(T.log(q_dis_t) * z_t, axis=1)
# log_qzx_t = T.log(q_dis_t[T.arange(q_dis_t.shape[0]), z_t])
# z_t = T.cast(u_t <= q_dis_t, dtype='float32')
# log_qzx_t = T.sum(z_t * T.log(q_dis_t) + (1 - z_t) * T.log(1 - q_dis_t), axis=1) # (nb_samples * L, )
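# Unlike HarX, z_t here is a one-hot categorical sample (a single
# topic/action per step) drawn via rng.multinomial; since z_t is one-hot,
# T.sum(T.log(q_dis_t) * z_t, axis=1) simply selects the log probability
# of the sampled category.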
# compute the prior probability
p_dis_t = self.Prior(dec_tm1)
log_pz0_t = T.sum(T.log(p_dis_t) * z_t, axis=1)
# log_pz0_t = T.log(p_dis_t[T.arange(p_dis_t.shape[0]), z_t])
# log_pz0_t = T.sum(z_t * T.log(p_dis_t) + (1 - z_t) * T.log(1 - p_dis_t), axis=1)
# compute the decoding probability
context_t = self.context_trans(z_t)
readout_t = self.decoder.hidden_readout(dec_tm1) + self.decoder.context_readout(context_t)
for l in self.decoder.output_nonlinear:
readout_t = l(readout_t)
pxz_dis_t = self.decoder.output(readout_t)
# compute recurrence
dec_t = self.decoder.RNN(embed_t, C=context_t, init_h=dec_tm1, one_step=True)
return dec_t, z_t, log_qzx_t, log_pz0_t, pxz_dis_t
# (max_len, nb_samples, ?)
outputs, scan_update = theano.scan(
_recurrence,
sequences=[embeded, encoded],
outputs_info=[init_dec, None, None, None, None])
_, z, log_qzx, log_pz0, pxz_dis = outputs
# summary of scan/ dimshuffle/ reshape
def _grab_prob(probs, x):
assert probs.ndim == 3
b_size = probs.shape[0]
max_len = probs.shape[1]
vocab_size = probs.shape[2]
probs = probs.reshape((b_size * max_len, vocab_size))
return probs[T.arange(b_size * max_len), x.flatten(1)].reshape(x.shape) # advanced indexing
log_qzx = T.sum(log_qzx.dimshuffle(1, 0) * mask, axis=-1).reshape((batch_size, L))
log_pz0 = T.sum(log_pz0.dimshuffle(1, 0) * mask, axis=-1).reshape((batch_size, L))
log_pxz = T.sum(T.log(_grab_prob(pxz_dis.dimshuffle(1, 0, 2), target)) * mask, axis=-1).reshape((batch_size, L))
logger.info(
"""
>>>> Compute the weights [+ _ =]
"""
)
log_pxnz = log_pz0 + log_pxz # log p(X, Z)
log_wk = log_pxnz - log_qzx # log[p(X, Z)/q(Z|X)]
log_bpk = log_pz0 - log_qzx # log[p(Z)/q(Z|X)]
log_w_sum = logSumExp(log_wk, axis=1)
log_bp_sum = logSumExp(log_bpk, axis=1)
log_wnk = log_wk - log_w_sum
log_bpnk = log_bpk - log_bp_sum
# unbiased log-likelihood estimator [+ _ =]
# Finally come to this place
nll = T.mean(-(log_w_sum - T.log(L)))
perplexity = T.exp(T.mean(-(log_w_sum - T.log(L)) / count))
# perplexity = T.exp(-T.mean((log_w_sum - T.log(L)) / count))
logger.info(
"""
>>>> Compute the gradients [+ _ =]
"""
)
# loss = weights * log [p(a)p(x|a)/q(a|x)]
weights = T.exp(log_wnk)
bp = T.exp(log_bpnk)
bq = 1. / L
ess = T.mean(1 / T.sum(weights ** 2, axis=1))
# monitoring
self.monitor['hidden state'] = z
if self.config['variant_control']:
lossQ = -T.mean(T.sum(log_qzx * (weights - bq), axis=1)) # log q(z|x)
lossPa = -T.mean(T.sum(log_pz0 * (weights - bp), axis=1)) # log p(z)
lossPxa = -T.mean(T.sum(log_pxz * weights, axis=1)) # log p(x|z)
lossP = lossPxa + lossPa
# L2 regu
lossP += 0.0001 * T.sum([T.sum(p**2) for p in self.tparams['p']])
lossQ += 0.0001 * T.sum([T.sum(p**2) for p in self.tparams['q']])
updates_p = self.optimizer_p.get_updates(self.tparams['p'], [lossP, weights, bp])
updates_q = self.optimizer_q.get_updates(self.tparams['q'], [lossQ, weights])
else:
lossQ = -T.mean(T.sum(log_qzx * weights, axis=1)) # log q(a|x)
lossPa = -T.mean(T.sum(log_pz0 * weights, axis=1)) # log p(a)
lossPxa = -T.mean(T.sum(log_pxz * weights, axis=1)) # log p(x|a)
lossP = lossPxa + lossPa
# L2 regu
print 'L2 ?'
lossP += 0.0001 * T.sum([T.sum(p**2) for p in self.tparams['p']])
lossQ += 0.0001 * T.sum([T.sum(p**2) for p in self.tparams['q']])
updates_p = self.optimizer_p.get_updates(self.tparams['p'], [lossP, weights])
updates_q = self.optimizer_q.get_updates(self.tparams['q'], [lossQ, weights])
updates = updates_p + updates_q + scan_update
logger.info("compiling the compuational graph:: >__< ::training function::")
train_inputs = [inputs] + [theano.Param(L, default=10)]
self.train_ = theano.function(train_inputs,
[lossPa, lossPxa, lossQ, perplexity, nll],
updates=updates,
name='train_fun')
logger.info("compile the computational graph:: >__< :: explore function")
self.explore_ = theano.function(train_inputs,
[log_wk, count],
updates=scan_update,
name='explore_fun')
# add monitoring:
self._monitoring()
# compiling monitoring
self.compile_monitoring(train_inputs, updates=scan_update)
logger.info("pre-training functions compile done.")
def generate_(self, context=None, max_len=None, mode='display'):
# overwrite the RNNLM generator as there are hidden variables every time step
args = dict(k=self.config['sample_beam'],
maxlen=self.config['max_len'] if not max_len else max_len,
stochastic=self.config['sample_stoch'] if mode == 'display' else None,
argmax=self.config['sample_argmax'] if mode == 'display' else None)
class NVTM(Helmholtz):
"""
Neural Variational Topic Models
We use Neural Variational Inference and Learning (NVIL) for training,
instead of the Helmholtz Machine (Reweighted Wake-Sleep).
"""
def __init__(self,
config, n_rng, rng,
mode = 'Evaluation',
dynamic_prior=False,
):
super(NVTM, self).__init__(config, n_rng, rng)
self.mode = mode
self.name = 'neural_variational'
self.tparams = dict()
self.dynamic_prior = dynamic_prior
def build_(self):
logger.info('Build Neural Variational Topic Model')
# backward encoder
self.encoder = Encoder(self.config, self.rng, prefix='enc')
# feedforward + hidden content decoder
self.decoder = Decoder(self.config, self.rng, prefix='dec',
embed=self.encoder.Embed
if self.config['shared_embed']
else None)
# The main difference between VAE and NVIL is that we can use
# a more flexible prior instead of a Gaussian here;
# for example, we use a softmax prior.
"""
Build the Prior Layer (Conditional Prior)
"""
# prior distribution (conditional distribution)
self.Prior = Dense(
self.config['dec_hidden_dim'],
self.config['action_dim'],
activation='softmax',
name='prior_proj'
)
if self.config['decposterior']: # we use both enc/dec net as input.
# Variational Posterior (Q-function)
self.Posterior = Dense2(
self.config['enc_hidden_dim']
if not self.config['bidirectional']
else 2 * self.config['enc_hidden_dim'],
self.config['dec_hidden_dim'],
self.config['action_dim'],
activation='softmax',
name='posterior_proj'
)
# Baseline Estimator
self.C_lambda1 = Dense2(
self.config['enc_hidden_dim']
if not self.config['bidirectional']
else 2 * self.config['enc_hidden_dim'],
self.config['dec_hidden_dim'],
100,
activation='tanh',
name='baseline-1')
self.C_lambda2 = Dense(100, 1, activation='linear',
name='baseline-2')
else:
# Variational Posterior
self.Posterior = Dense(
self.config['enc_hidden_dim']
if not self.config['bidirectional']
else 2 * self.config['enc_hidden_dim'],
self.config['action_dim'],
activation='softmax',
name='posterior_proj'
)
# Baseline Estimator
self.C_lambda1 = Dense(
self.config['enc_hidden_dim']
if not self.config['bidirectional']
else 2 * self.config['enc_hidden_dim'],
100,
activation='tanh',
name='baseline-1')
self.C_lambda2 = Dense(100, 1, activation='linear',
name='baseline-2')
# Action transform to context
self.context_trans = Dense(
self.config['action_dim'],
self.config['dec_contxt_dim'],
activation='linear',
name="transform"
)
# registration:
self._add(self.Posterior)
self._add(self.Prior)
self._add(self.context_trans)
self._add(self.C_lambda1)
self._add(self.C_lambda2)
self._add(self.encoder)
self._add(self.decoder)
# Q-layers:
self._add_tag(self.encoder, 'q')
self._add_tag(self.Posterior, 'q')
# P-layers:
self._add_tag(self.Prior, 'p')
self._add_tag(self.decoder, 'p')
self._add_tag(self.context_trans, 'p')
# Lambda-layers
self._add_tag(self.C_lambda1, 'l')
self._add_tag(self.C_lambda2, 'l')
# c/v
self.c = shared_scalar(0., dtype='float32')
self.v = shared_scalar(1., dtype='float32')
# objectives and optimizers
self.optimizer_p = optimizers.get(self.config['optimizer'], kwargs={'clipnorm': 5})
self.optimizer_q = optimizers.get(self.config['optimizer'], kwargs={'clipnorm': 5})
self.optimizer_l = optimizers.get(self.config['optimizer'], kwargs={'clipnorm': 5})
logger.info("create Neural Variational Topic Network. ok")
def compile_(self, mode='train', contrastive=False):
# compile the computational graph.
# INFO: the parameters.
# mode: 'train'/ 'display'/ 'policy' / 'all'
ps = 'params: {\n'
for p in self.params:
ps += '{0}: {1}\n'.format(p.name, p.eval().shape)
ps += '}.'
logger.info(ps)
param_num = np.sum([np.prod(p.shape.eval()) for p in self.params])
logger.info("total number of the parameters of the model: {}".format(param_num))
if mode == 'train' or mode == 'all':
self.compile_train()
if mode == 'display' or mode == 'all':
self.compile_sample()
if mode == 'inference' or mode == 'all':
self.compile_inference()
"""
Training
"""
def compile_train(self):
"""
build the training function here <:::>
"""
# get input sentence (x)
inputs = T.imatrix('inputs') # padded input word sequence (for training)
batch_size = inputs.shape[0]
logger.info(
"""
The Computational Flow. ---> In a recurrent fashion
[= v =] <:::
Inference-Generation in one scan
>>>> Encoding without hidden variable. (use backward encoding.)
"""
)
embeded, mask \
= self.decoder.Embed(inputs, True) # (nb_samples, max_len, embedding_dim)
mask = T.cast(mask, dtype='float32')
encoded = self.encoder.build_encoder(inputs[:, ::-1], return_sequence=True)[:, ::-1, :]
L = T.iscalar('repeats') # self.config['repeats']
def _repeat(x, dimshuffle=True):
if x.ndim == 3:
y = T.extra_ops.repeat(x[:, None, :, :], L, axis=1).reshape((-1, x.shape[1], x.shape[2]))
if dimshuffle:
y = y.dimshuffle(1, 0, 2)
else:
y = T.extra_ops.repeat(x[:, None, :], L, axis=1).reshape((-1, x.shape[1]))
if dimshuffle:
y = y.dimshuffle(1, 0)
return y
embeded = _repeat(embeded) # (max_len, nb_samples * L, embedding_dim)
encoded = _repeat(encoded) # (max_len, nb_samples * L, enc_hidden_dim)
target = _repeat(inputs, False) # (nb_samples * L, max_len)
mask = _repeat(mask, False)
count = T.cast(T.sum(mask, axis=1), dtype=theano.config.floatX)[:, None] # (nb_samples * L, 1)
init_dec = T.zeros((encoded.shape[1],
self.config['dec_hidden_dim']),
dtype='float32') # zero initialization
logger.info(
"""
>>>> Recurrence
"""
)
def _recurrence(embed_t, enc_t, dec_tm1):
"""
x_t: (nb_samples, dec_embedd_dim)
enc_t: (nb_samples, enc_hidden_dim)
dec_t: (nb_samples, dec_hidden_dim)
"""
# get q(z_t|dec_t, enc_t); sample z_t;
# compute the Posterior (inference) prob.
# compute the baseline estimator
if self.config['decposterior']:
q_dis_t = self.Posterior(enc_t, dec_tm1)
c_lmd_t = self.C_lambda2(self.C_lambda1(enc_t, dec_tm1)).flatten(1)
else:
q_dis_t = self.Posterior(enc_t)
c_lmd_t = self.C_lambda2(self.C_lambda1(enc_t)).flatten(1)
# sampling
z_t = self.rng.multinomial(pvals=q_dis_t, dtype='float32')
log_qzx_t = T.sum(T.log(q_dis_t) * z_t, axis=1)
# compute the prior probability
p_dis_t = self.Prior(dec_tm1)
log_pz0_t = T.sum(T.log(p_dis_t) * z_t, axis=1)
# compute the decoding probability
context_t = self.context_trans(z_t)
readout_t = self.decoder.hidden_readout(dec_tm1) + self.decoder.context_readout(context_t)
for l in self.decoder.output_nonlinear:
readout_t = l(readout_t)
pxz_dis_t = self.decoder.output(readout_t)
# compute recurrence
dec_t = self.decoder.RNN(embed_t, C=context_t, init_h=dec_tm1, one_step=True)
return dec_t, z_t, log_qzx_t, log_pz0_t, pxz_dis_t, c_lmd_t
# (max_len, nb_samples, ?)
outputs, scan_update = theano.scan(
_recurrence,
sequences=[embeded, encoded],
outputs_info=[init_dec, None, None, None, None, None])
_, z, log_qzx, log_pz0, pxz_dis, c_lmd = outputs
# summary of scan/ dimshuffle/ reshape
def _grab_prob(probs, x):
assert probs.ndim == 3
b_size = probs.shape[0]
max_len = probs.shape[1]
vocab_size = probs.shape[2]
probs = probs.reshape((b_size * max_len, vocab_size))
return probs[T.arange(b_size * max_len), x.flatten(1)].reshape(x.shape) # advanced indexing
logger.info(
"""
>>>> Compute the weights [+ _ =]
"""
)
# log Q/P and C
log_qzx = log_qzx.dimshuffle(1, 0) * mask
log_pz0 = log_pz0.dimshuffle(1, 0) * mask
log_pxz = T.log(_grab_prob(pxz_dis.dimshuffle(1, 0, 2), target)) * mask
c_lambda = c_lmd.dimshuffle(1, 0) * mask
Lb = T.sum(log_pz0 + log_pxz - log_qzx, axis=-1) # lower bound
l_lambda = log_pz0 + log_pxz - log_qzx - c_lambda
alpha = T.cast(0.0, dtype='float32')
numel = T.sum(mask)
cb = T.sum(l_lambda) / numel
vb = T.sum(l_lambda ** 2) / T.sum(mask) - cb ** 2
c = self.c * alpha + (1 - alpha) * cb # T.cast(cb, dtype='float32')
v = self.v * alpha + (1 - alpha) * vb # T.cast(vb, dtype='float32')
l_normal = (l_lambda - c) / T.maximum(1., T.sqrt(v)) * mask
l_base = T.mean(T.sum(l_normal, axis=1))
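# This is the NVIL variance reduction of Mnih & Gregor ("Neural Variational
# Inference and Learning", 2014): the learning signal
# l = log p(x, z) - log q(z|x) is reduced by the input-dependent baseline
# C_lambda, then centered and scaled by estimates c and v of its mean and
# variance, l_normal = (l - c) / max(1, sqrt(v)), before multiplying the
# score function log q(z|x) in lossQ below. With alpha = 0 the "running"
# averages reduce to the current batch statistics.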
nll = T.mean(-Lb) # variational lower-bound
perplexity = T.exp(T.mean(-Lb[:, None] / count)) # perplexity of lower-bound
logger.info(
"""
>>>> Compute the gradients [+ _ =]
"""
)
# monitoring
self.monitor['hidden state'] = z
lossP = -T.mean(T.sum(log_pxz + log_pz0, axis=1))
lossQ = -T.mean(T.sum(log_qzx * l_normal, axis=1))
lossL = -T.mean(T.sum(c_lambda * l_normal, axis=1)) # drive C_lambda toward the centered signal: ||l - c - C_lambda||^2 -> 0
# lossP = -T.sum(log_pxz + log_pz0) / numel
# lossQ = -T.sum(log_qzx * l_normal) / numel
# lossL = -T.sum(c_lambda * l_normal) / numel # ||L - c - c_lambda||2-> 0
#
# # L2 regu
# print 'L2 ?'
# lossP += 0.0001 * T.sum([T.sum(p**2) for p in self.tparams['p']])
# lossQ += 0.0001 * T.sum([T.sum(p**2) for p in self.tparams['q']])
updates_p = self.optimizer_p.get_updates(self.tparams['p'], lossP)
updates_q = self.optimizer_q.get_updates(self.tparams['q'], [lossQ, l_normal])
updates_l = self.optimizer_l.get_updates(self.tparams['l'], [lossL, l_normal])
updates = updates_p + updates_q + updates_l + scan_update
updates += [(self.c, c), (self.v, v)]
logger.info("compiling the compuational graph:: >__< ::training function::")
train_inputs = [inputs] + [theano.Param(L, default=1)]
self.train_ = theano.function(train_inputs,
[lossL, lossP, lossQ, perplexity, nll, l_base],
updates=updates,
name='train_fun')
logger.info("compile the computational graph:: >__< :: explore function")
self.explore_ = theano.function(train_inputs,
[lossL, lossP, lossQ, perplexity, nll, l_base],
updates=scan_update,
name='explore_fun')
# add monitoring:
self._monitoring()
# compiling monitoring
self.compile_monitoring(train_inputs, updates=scan_update)
logger.info("pre-training functions compile done.")
def generate_(self, context=None, max_len=None, mode='display'):
# overwrite the RNNLM generator as there are hidden variables every time step
args = dict(k=self.config['sample_beam'],
maxlen=self.config['max_len'] if not max_len else max_len,
stochastic=self.config['sample_stoch'] if mode == 'display' else None,
argmax=self.config['sample_argmax'] if mode == 'display' else None)
================================================
FILE: emolga/run.py
================================================
# coding=utf-8
__author__ = 'jiataogu'
import logging
from matplotlib import pyplot
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from config import setup, setup_main
from dataset import deserialize_from_file, divide_dataset, build_fuel, GuessOrder
from game.asker import Asker
from game.responder import Responder
from models.variational import Helmholtz
from utils.generic_utils import *
logger = logging.getLogger(__name__)
lm_config = setup()
main_config = setup_main()
# logging.basicConfig(level= main_config['level'], format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
np.random.seed(main_config['seed'])
n_rng = np.random.RandomState(main_config['seed'])
rng = RandomStreams(n_rng.randint(2 ** 30), use_cuda=True)
"""
Main Loop.
"""
print 'start.'
# load the dataset and build a fuel-dataset.
idx2word, word2idx = deserialize_from_file(lm_config['vocabulary_set'])
# load the fake_dialogue dataset.
fake_data = deserialize_from_file(main_config['fake_diag'])
train_set, test_set = divide_dataset(fake_data, main_config['test_size'], 200000)
lm_config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1
lm_config['dec_voc_size'] = lm_config['enc_voc_size']
lm_config['state_dim'] = main_config['core_hidden_dim']
main_config['enc_voc_size'] = lm_config['enc_voc_size']
database = deserialize_from_file(lm_config['dataset'])
dataset = build_fuel(database)
weights_file = lm_config['weights_file']
answer_templates = {0: 'I cannot understand.', 1: 'Congrats!', 2: 'Pity.'}
logger.info('build dataset done. vocabulary size = {0}'.format(lm_config['dec_voc_size']))
start_time = time.time()
# build the environment
game = GuessOrder(rng=n_rng, size=8)
environment = Responder(game=game)
# load the pretrained generator
generator = Helmholtz(lm_config, n_rng, rng, dynamic_prior=True)
generator.build_()
generator.load(weights_file)
generator.dynamic()
# build the agent.
agent = Asker(main_config, lm_config, n_rng, rng, generator)
agent.build_()
agent.compile_asker()
logger.info('compile the asker sampler ok.')
# build the scheduled trainer if any.
agent.compile_scheduled_trainer()
logger.info('compile the asker ss-learner ok.')
# build the trainer
agent.compile_trainer()
logger.info('compile the asker learner ok.')
end_time = time.time()
logger.info('compiling done. It costs {} seconds'.format(end_time - start_time))
def simulator(M=25, display=False):
"""
Dialogue Simulation
"""
start_time = time.time()
progbar = Progbar(M)
logger.info('Start simulation.')
train_data = {'X': [], 'Y': [], 'A': [], 'R': [], 'G': [], 'T': [], 'text': [], 'acc': []}
for ep in xrange(M):
environment.reset()
episode = {'x': [], 'y': [], 'a': [], 'r': []}
conversation = ''
conversation += '\n\n\n' + '***' * 30
conversation += '\nGame start.'
turn = 0
maxturn = 16
kwargs = {'turn': turn, 'maxturn': maxturn}
for k in xrange(maxturn + 1):
if kwargs['turn'] == maxturn:
guess, score = agent.act(kwargs)
conversation += '\n' + '_' * 93 + '[{}]'.format(kwargs['turn'])
conversation += '\n(´✪ ‿ ✪`)ノ : {}'.format('My answer = ' + ' '.join([str(w) for w in guess]))
corrects = environment.get_answer()
conversation += '\n{:>78} : ლ(´∀`ლ)'.format(' '.join([str(w) for w in corrects]))
Accuracy = sum([g == c for g, c in zip(guess, corrects)]) / float(len(guess))
conversation += '\n{:>78} : ლ(´∀`ლ)'.format('Accuracy = {}%'.format(Accuracy * 100))
episode['g'] = np.asarray(guess)
episode['t'] = np.asarray(corrects)
episode['r'].append(Accuracy)
episode['c'] = Accuracy
break
next_action, next_sent, kwargs = agent.act(kwargs)
question = ' '.join(print_sample(idx2word, next_sent)[:-1])
conversation += '\n' + '_' * 93 + '[{}]'.format(kwargs['turn'])
conversation += '\n(´◉ ω ◉`)? : {}'.format(question)
got = environment.parse(question)
reward = 0 if got > 0 else -1
kwargs['prev_asw'] = np.asarray([got], dtype='int32')
conversation += '\n{:>78} : (●´ε`●)'.format(answer_templates[got])
# registration
episode['a'].append(next_action)
episode['y'].append(next_sent[None, :])
episode['x'].append(got)
episode['r'].append(reward)
conversation += '\nGame End\n' + '***' * 30
if display:
logger.info(conversation)
# concatenate
train_data['A'].append(np.concatenate(episode['a'], axis=0)[None, :, :])
train_data['Y'].append(np.concatenate(episode['y'], axis=0)[None, :, :])
train_data['X'].append(np.asarray(episode['x'], dtype='int32')[None, :])
train_data['R'].append(np.asarray(episode['r'], dtype='float32')[::-1].cumsum()[::-1][None, :])
train_data['G'].append(episode['g'][None, :])
train_data['T'].append(episode['t'][None, :])
train_data['text'].append(conversation)
train_data['acc'].append(episode['c'])
progbar.update(ep + 1, [('accuracy', episode['c'])])
train_data['A'] = np.concatenate(train_data['A'], axis=0).astype('float32')
train_data['X'] = np.concatenate(train_data['X'], axis=0).astype('int32')
train_data['Y'] = np.concatenate(train_data['Y'], axis=0).astype('int32')
train_data['R'] = np.concatenate(train_data['R'], axis=0).astype('float32')
train_data['G'] = np.concatenate(train_data['G'], axis=0).astype('int32')
train_data['T'] = np.concatenate(train_data['T'], axis=0).astype('int32')
end_time = time.time()
print ''
logger.info('Simulated {0} episodes in {1} seconds.'.format(M, end_time - start_time))
return train_data
def learner(data, fr=1., fs=1., fb=1.):
"""
Training.
"""
start_time = time.time()
X = data['X'] # answers obtained from the environment;
Y = data['Y'] # questions generated based on policy;
A = data['A'] # actions performed in Helmholtz questions generator;
R = data['R'] # cumulative reward obtained through conversation;
guess = data['G'] # final guess order given by the agent
truth = data['T'] # real order in the environment
loss = agent.train(X, Y, A, R, guess, truth, fr, fs, fb)
end_time = time.time()
logger.info('Trained this batch in {0} seconds.'.format(end_time - start_time))
logger.info('REINFORCE Loss = {0}, Supervised loss = {1}, Baseline loss = {2}'.format(
loss[0], loss[1], loss[2]))
return loss
def SL_learner(data, batch_size=25):
"""
Supervised Learning with fake-optimal logs.
One epoch for all data.
"""
start_time = time.time()
X = data['X'].astype('int32') # answers obtained from the environment;
Y = data['Y'].astype('int32') # questions generated based on policy;
T = data['T'].astype('int32') # real order in the environment
# index shuffle
idx = np.arange(X.shape[0]).tolist()
np.random.shuffle(idx)
num_batch = X.shape[0] / batch_size
progbar = Progbar(num_batch)
batch_from = 0
loss = []
for batch in xrange(num_batch):
batch_to = batch_from + batch_size
if batch_to > X.shape[0]:
batch_to = X.shape[0]
batch_X = X[idx[batch_from: batch_to]]
batch_Y = Y[idx[batch_from: batch_to]]
batch_T = T[idx[batch_from: batch_to]]
if not main_config['multi_task']:
if not main_config['ssl']:
loss.append(agent.train_sl(batch_X, batch_Y, batch_T))
else:
loss.append(agent.train_ssl(batch_X, batch_Y, batch_T, 3, 10.))
progbar.update(batch + 1, [('loss', loss[-1])])
else:
loss.append(agent.train_mul(batch_X, batch_Y, batch_T, 3, 10.))
progbar.update(batch + 1, [('loss', loss[-1][0]), ('loss_ssl', loss[-1][1]), ('ppl', loss[-1][2])])
batch_from = batch_to
end_time = time.time()
logger.info('Trained this epoch in {0} seconds.'.format(end_time - start_time))
logger.info('Supervised loss = {}'.format(np.mean(loss)))
return loss
def main():
losses = []
accuracy = []
for echo in xrange(4000):
logger.info('Iteration = {}'.format(echo))
train_data = simulator(M=20)
print train_data['text'][-1]
loss = learner(train_data, fr=0.)
losses.append(loss)
accuracy += train_data['acc']
if echo % 100 == 99:
pyplot.plot(accuracy)
pyplot.show()
# pkl.dump(losses, open('losses.temp.pkl'))
def check_answer(x, y, g):
g = np.asarray(g)
environment.game.set_answer(g)
s = 0
for k in xrange(x.shape[1]):
question = ' '.join(print_sample(idx2word, y[0][k].tolist())[:-1])
got = environment.parse(question)
if got == 2 - x[0][k]:
s += 1.
return s / x.shape[1]
def display_session(x, y, g, t, acc, cov):
"""
display a dialogue session
"""
conversation = ''
conversation += '\n\n\n' + '***' * 30
conversation += '\nGame start.'
for k in xrange(x.shape[1]):
question = ' '.join(print_sample(idx2word, y[0][k].tolist())[:-1])
conversation += '\n' + '_' * 93 + '[{}]'.format(k + 1)
conversation += '\n(´◉ ω ◉`)? : {}'.format(question)
got = x[0][k]
conversation += '\n{:>78} : (●´ε`●)'.format(answer_templates[got])
conversation += '\n' + '_' * 93 + '[{}]'.format(k + 1)
conversation += '\n(´✪ ‿ ✪`)ノ : {}'.format('My answer = ' + ' '.join([str(w) for w in g]))
conversation += '\n{:>78} : ლ(´∀`ლ)'.format(' '.join([str(w) for w in t[0]]))
conversation += '\n{:>78} : ლ(´∀`ლ)'.format('Accuracy = {}%'.format(acc * 100))
conversation += '\n{:>78} : ლ(´∀`ლ)'.format('Understand = {}%'.format(cov * 100))
conversation += '\nGame End\n' + '***' * 30
return conversation
def main_sl():
# get the evaluation set.
evaluation_set = n_rng.randint(0, train_set['X'].shape[0], main_config['test_size']).tolist()
acc_s, acc_t = [], []
los_s, los_t = [], []
und_s, und_t = [], []
for echo in xrange(500):
logger.info('Epoch = {}'.format(echo))
loss = SL_learner(train_set, batch_size=50)
los_s.append(loss)
# sampling on training set.
logger.info('testing on sampled training set.')
progbar = Progbar(main_config['test_size'])
accuracy = []
understand = []
untruth = []
at = 0
for k in evaluation_set:
at += 1
x = train_set['X'][None, k]
y = train_set['Y'][None, k]
t = train_set['T'][None, k]
g, _, acc = agent.evaluate(x, y, t)
cov = check_answer(x, y, g)
cov_t = check_answer(x, y, t[0].tolist())
progbar.update(at, [('acc', acc), ('und', cov)])
untruth.append(cov_t)
accuracy.append(acc)
understand.append(cov)
# if at == 1:
# x_ = 2 - x
# logger.info(display_session(x_, y, g, t, acc, cov))
print '\ntraining set test.. average accuracy = {0}% / understand {1}% of questions'.format(
100 * np.mean(accuracy), 100 * np.mean(understand))
print 'check truth {}%'.format(100 * np.mean(untruth))
acc_s.append(np.mean(accuracy))
und_s.append(np.mean(understand))
# sampling on testing set.
logger.info('testing on testing set.')
progbar2 = Progbar(main_config['test_size'])
accuracy = []
understand = []
at = 0
for k in xrange(main_config['test_size']):
at += 1
x = test_set['X'][None, k]
y = test_set['Y'][None, k]
t = test_set['T'][None, k]
g, _, acc = agent.evaluate(x, y, t)
cov = check_answer(x, y, g)
progbar2.update(at, [('acc', acc), ('und', cov)])
accuracy.append(acc)
understand.append(cov)
# if at == 1:
# x_ = 2 - x
# logger.info(display_session(x_, y, g, t, acc, cov))
print '\ntesting set test.. average accuracy = {0}% / understand {1}% of questions'.format(
100 * np.mean(accuracy), 100 * np.mean(understand))
acc_t.append(np.mean(accuracy))
und_t.append(np.mean(understand))
if echo % 20 == 19:
pyplot.figure(1)
pyplot.plot(acc_s, 'r')
pyplot.plot(acc_t, 'g')
pyplot.figure(2)
pyplot.plot(und_s, 'r')
pyplot.plot(und_t, 'g')
pyplot.show()
# agent.main_config['sample_beam'] = 1
# agent.main_config['sample_argmax'] = True
main_sl()
================================================
FILE: emolga/test_lm.py
================================================
__author__ = 'jiataogu'
import logging
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from emolga.models.encdec import RNNLM, AutoEncoder
from emolga.models.variational import Helmholtz, VAE, HarX, THarX, NVTM
# from models.ntm_encdec import RNNLM, AutoEncoder, Helmholtz, BinaryHelmholtz
from emolga.utils.generic_utils import *
from emolga.dataset.build_dataset import deserialize_from_file, build_fuel, obtain_stream
from emolga.config import setup_ptbz, setup_ptb2
from emolga.config_variant import *
setup = setup_bienc
def init_logging(logfile):
formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
datefmt='%m/%d/%Y %H:%M:%S' )
fh = logging.FileHandler(logfile)
# ch = logging.StreamHandler()
fh.setFormatter(formatter)
# ch.setFormatter(formatter)
# fh.setLevel(logging.INFO)
# ch.setLevel(logging.INFO)
# logging.getLogger().addHandler(ch)
logging.getLogger().addHandler(fh)
logging.getLogger().setLevel(logging.INFO)
return logging
# prepare logging.
tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
config = setup() # load settings.
for w in config:
print '{0}={1}'.format(w, config[w])
logger = init_logging(config['path_log'] + '/emolga.RHM.id={}.log'.format(tmark))
n_rng = np.random.RandomState(config['seed'])
np.random.seed(config['seed'])
rng = RandomStreams(n_rng.randint(2 ** 30))
logger.info('Start!')
# load the dataset and build a fuel-dataset.
idx2word, word2idx = deserialize_from_file(config['vocabulary_set'])
config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1
config['dec_voc_size'] = config['enc_voc_size']
logger.info('build dataset done. vocabulary size = {0}/ batch size = {1}'.format(
config['dec_voc_size'], config['batch_size']))
# training & validation & testing sets.
train_set, train_size = build_fuel(deserialize_from_file(config['dataset']))
valid_set, valid_size = build_fuel(deserialize_from_file(config['dataset_test'])) # use test set for a try
# weight save file.
savefile = config['path_h5'] + '/emolga.RHM.id={}.h5'.format(tmark)
# build the agent
if config['model'] == 'RNNLM':
agent = RNNLM(config, n_rng, rng, mode=config['mode'])
elif config['model'] == 'HarX':
agent = THarX(config, n_rng, rng, mode=config['mode'])
elif config['model'] == 'Helmholtz':
agent = Helmholtz(config, n_rng, rng, mode=config['mode'])
else:
raise NotImplementedError
agent.build_()
agent.compile_('train')
print 'compile ok'
# learning to speak language.
count = 1000
echo = 0
epochs = 50
while echo < epochs:
echo += 1
loss = []
correct = 0
scans = 0
# visualize the embedding weights.
# if echo > 1:
# plt.figure(3)
# visualize_(plt.subplot(111), agent.decoder.Embed.get_params()[0].get_value(), name='encoder embedding',
# text=idx2word)
# plt.show()
# if not config['use_noise']:
# training
train_batches = obtain_stream(train_set, config['batch_size']).get_epoch_iterator(as_dict=True)
valid_batches = obtain_stream(valid_set, config['eval_batch_size']).get_epoch_iterator(as_dict=True)
def prepare_batch(batch):
data = batch['data'].astype('int32')
data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1)
def cut_zeros(data):
for k in range(data.shape[1] - 1, 0, -1):
data_col = data[:, k].sum()
if data_col > 0:
return data[:, : k + 2]
return data
data = cut_zeros(data)
return data
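# cut_zeros trims trailing all-zero (padding) columns, keeping one extra
# zero column as the end-of-sequence marker. A small sketch of the effect
# (illustrative values only):
#
#     >>> import numpy as np
#     >>> d = np.array([[5, 3, 0, 0, 0], [7, 0, 0, 0, 0]], dtype='int32')
#     >>> # last nonzero column is k = 1, so cut_zeros returns d[:, :3]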
# training
logger.info('Epoch = {} -> Training Set Learning...'.format(echo))
progbar = Progbar(train_size / config['batch_size'])
for it, batch in enumerate(train_batches):
# get data
data = prepare_batch(batch)
if config['model'] == 'RNNLM' or config['model'] == 'AutoEncoder':
loss.append(agent.train_(data, config['repeats']))
progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])])
elif config['model'] in ('Helmholtz', 'HarX'):
loss.append(agent.train_(data, config['repeats']))
weightss = np.sum([np.sum(abs(w)) for w in agent.get_weights()])
progbar.update(it, [('lossPa', loss[-1][0]), ('lossPxa', loss[-1][1]), ('lossQ', loss[-1][2]),
('perplexity', np.log(loss[-1][3])), ('NLL', loss[-1][4]), ('L1', weightss)])
"""
watch = agent.watch(data)
print '.'
pprint(watch[0][0])
pprint(watch[2][0])
# pprint(watch[2][0])
sys.exit(111)
"""
# if it % 100 == 50:
# sys.exit(-1)
# # print '.'
# # print 'encoded = {}'.format(encoded[11])
# # print 'mean = {}'.format(mean[11])
# # print 'std = {}'.format(std[11])
#
# # watch = agent.watch(data)
# # print '.'
# # print 'train memory {}'.format(watch[0][0])
#
# for kk in xrange(5):
# # sample a sentence.
# # action = agent.action_sampler()
# # context = agent.context_trans(action)
# if config['model'] is 'AutoEncoder':
# source = data[kk][None, :]
# truth = ' '.join(print_sample(idx2word, source[0].tolist())[:-1])
# print '\ntruth: {}'.format(truth)
# context = agent.memorize(source)
# sample, score = agent.generate_(context, max_len=data.shape[1])
# else:
# sample, score = agent.generate_(max_len=data.shape[1])
#
# if sample[-1] is not 0:
# sample += [0] # fix the end.
# question = ' '.join(print_sample(idx2word, sample)[:-1])
# print '\nsample: {}'.format(question)
# print 'PPL: {}'.format(score)
# scans += 1.0
print ' .'
logger.info('Epoch = {0} finished.'.format(echo))
# validation
logger.info('Epoch = {} -> Validation Set Evaluation...'.format(echo))
progbar = Progbar(valid_size / config['batch_size'])
for it, batch in enumerate(valid_batches):
# get data
data = prepare_batch(batch)
if config['model'] in ('Helmholtz', 'HarX'):
loss.append(agent.evaluate_(data))
progbar.update(it, [('NLL', loss[-1][0]), ('perplexity', np.log(loss[-1][1]))])
else:
raise NotImplementedError
print ' .'
# save the weights.
agent.save(config['path_h5'] + '/emolga.RHM.id={0}.epoch={1}.pkl'.format(tmark, echo))
# logger.info('Learning percentage: {}'.format(correct / scans))
# inference test
# batches = data_stream.get_epoch_iterator(as_dict=True)
# for it, batch in enumerate(batches):
# data = batch['data'].astype('int32')
# data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1)
# mean, std = agent.inference_(data)
# print mean
# break
# print count
================================================
FILE: emolga/test_nvtm.py
================================================
__author__ = 'jiataogu'
import logging
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from emolga.models.encdec import RNNLM, AutoEncoder
from emolga.models.variational import Helmholtz, VAE, HarX, THarX, NVTM
# from models.ntm_encdec import RNNLM, AutoEncoder, Helmholtz, BinaryHelmholtz
from emolga.utils.generic_utils import *
from emolga.dataset.build_dataset import deserialize_from_file, build_fuel, obtain_stream
from emolga.config import setup_ptbz, setup_ptb2
from emolga.config_variant import *
setup = setup_bienc
def init_logging(logfile):
formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
datefmt='%m/%d/%Y %H:%M:%S' )
fh = logging.FileHandler(logfile)
# ch = logging.StreamHandler()
fh.setFormatter(formatter)
# ch.setFormatter(formatter)
# fh.setLevel(logging.INFO)
# ch.setLevel(logging.INFO)
# logging.getLogger().addHandler(ch)
logging.getLogger().addHandler(fh)
logging.getLogger().setLevel(logging.INFO)
return logging
# prepare logging.
tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
config = setup() # load settings.
for w in config:
print '{0}={1}'.format(w, config[w])
logger = init_logging(config['path_log'] + '/emolga.RHM.id={}.log'.format(tmark))
n_rng = np.random.RandomState(config['seed'])
np.random.seed(config['seed'])
rng = RandomStreams(n_rng.randint(2 ** 30))
logger.info('Start!')
# load the dataset and build a fuel-dataset.
idx2word, word2idx = deserialize_from_file(config['vocabulary_set'])
config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1
config['dec_voc_size'] = config['enc_voc_size']
logger.info('build dataset done. vocabulary size = {0}/ batch size = {1}'.format(
config['dec_voc_size'], config['batch_size']))
# training & validation & testing sets.
train_set, train_size = build_fuel(deserialize_from_file(config['dataset']))
valid_set, valid_size = build_fuel(deserialize_from_file(config['dataset_test'])) # use test set for a try
# weight save file.
savefile = config['path_h5'] + '/emolga.RHM.id={}.h5'.format(tmark)
# build the agent
if config['model'] == 'RNNLM':
agent = RNNLM(config, n_rng, rng, mode=config['mode'])
elif config['model'] == 'HarX':
agent = NVTM(config, n_rng, rng, mode=config['mode'])  # note: reuses the 'HarX' config key to run NVTM
elif config['model'] == 'Helmholtz':
agent = Helmholtz(config, n_rng, rng, mode=config['mode'])
else:
raise NotImplementedError
agent.build_()
agent.compile_('train')
print 'compile ok'
# learning to speak language.
count = 1000
echo = 0
epochs = 50
while echo < epochs:
echo += 1
loss = []
correct = 0
scans = 0
# visualize the embedding weights.
# if echo > 1:
# plt.figure(3)
# visualize_(plt.subplot(111), agent.decoder.Embed.get_params()[0].get_value(), name='encoder embedding',
# text=idx2word)
# plt.show()
# if not config['use_noise']:
# training
train_batches = obtain_stream(train_set, config['batch_size']).get_epoch_iterator(as_dict=True)
valid_batches = obtain_stream(valid_set, config['eval_batch_size']).get_epoch_iterator(as_dict=True)
def prepare_batch(batch):
data = batch['data'].astype('int32')
data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1)
def cut_zeros(data):
for k in range(data.shape[1] - 1, 0, -1):
data_col = data[:, k].sum()
if data_col > 0:
return data[:, : k + 2]
return data
data = cut_zeros(data)
return data
# training
logger.info('Epoch = {} -> Training Set Learning...'.format(echo))
progbar = Progbar(train_size / config['batch_size'])
for it, batch in enumerate(train_batches):
# get data
data = prepare_batch(batch)
if config['model'] == 'RNNLM' or config['model'] == 'AutoEncoder':
loss.append(agent.train_(data, config['repeats']))
progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])])
elif config['model'] in ('Helmholtz', 'HarX'):
loss.append(agent.train_(data, 1))
weightss = np.sum([np.sum(abs(w)) for w in agent.get_weights()])
progbar.update(it, [('lossL', loss[-1][0]), ('lossP', loss[-1][1]), ('lossQ', loss[-1][2]),
('perplexity', np.log(loss[-1][3])), ('NLL', loss[-1][4]), ('Baseline', loss[-1][5])])
"""
watch = agent.watch(data)
print '.'
pprint(watch[0][0])
pprint(watch[2][0])
# pprint(watch[2][0])
sys.exit(111)
"""
# if it % 100 == 50:
# sys.exit(-1)
# # print '.'
# # print 'encoded = {}'.format(encoded[11])
# # print 'mean = {}'.format(mean[11])
# # print 'std = {}'.format(std[11])
#
# # watch = agent.watch(data)
# # print '.'
# # print 'train memory {}'.format(watch[0][0])
#
# for kk in xrange(5):
# # sample a sentence.
# # action = agent.action_sampler()
# # context = agent.context_trans(action)
# if config['model'] is 'AutoEncoder':
# source = data[kk][None, :]
# truth = ' '.join(print_sample(idx2word, source[0].tolist())[:-1])
# print '\ntruth: {}'.format(truth)
# context = agent.memorize(source)
# sample, score = agent.generate_(context, max_len=data.shape[1])
# else:
# sample, score = agent.generate_(max_len=data.shape[1])
#
# if sample[-1] is not 0:
# sample += [0] # fix the end.
# question = ' '.join(print_sample(idx2word, sample)[:-1])
# print '\nsample: {}'.format(question)
# print 'PPL: {}'.format(score)
# scans += 1.0
print ' .'
logger.info('Epoch = {0} finished.'.format(echo))
loss = zip(*loss)
logger.info('LossL: {0}, LossP: {1}, LossQ: {2}, PPL: {3}, NLL: {4}, Baseline: {5}'.format(
np.mean(loss[0]), np.mean(loss[1]), np.mean(loss[2]), np.mean(loss[3]), np.mean(loss[4]), np.mean(loss[5])
))
# validation
loss = []
logger.info('Epoch = {} -> Validation Set Evaluation...'.format(echo))
progbar = Progbar(valid_size / config['eval_batch_size'])
for it, batch in enumerate(valid_batches):
# get data
data = prepare_batch(batch)
if config['model'] in ('Helmholtz', 'HarX'):
# loss.append(agent.evaluate_(data))
loss.append(agent.explore_(data, 10))
weightss = np.sum([np.sum(abs(w)) for w in agent.get_weights()])
progbar.update(it, [('lossL', loss[-1][0]), ('lossP', loss[-1][1]), ('lossQ', loss[-1][2]),
('perplexity', np.log(loss[-1][3])), ('NLL', loss[-1][4]), ('Baseline', loss[-1][5])])
else:
raise NotImplementedError
print ' .'
loss = zip(*loss)
logger.info('LossL: {0}, LossP: {1}, LossQ: {2}, PPL: {3}, NLL: {4}, Baseline: {5}'.format(
np.mean(loss[0]), np.mean(loss[1]), np.mean(loss[2]), np.mean(loss[3]), np.mean(loss[4]), np.mean(loss[5])
))
# save the weights.
agent.save(config['path_h5'] + '/emolga.RHM.id={0}.epoch={1}.pkl'.format(tmark, echo))
# logger.info('Learning percentage: {}'.format(correct / scans))
# inference test
# batches = data_stream.get_epoch_iterator(as_dict=True)
# for it, batch in enumerate(batches):
# data = batch['data'].astype('int32')
# data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1)
# mean, std = agent.inference_(data)
# print mean
# break
# print count
================================================
FILE: emolga/test_run.py
================================================
# coding=utf-8
__author__ = 'jiataogu'
import logging
import theano
from matplotlib import pyplot
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from config import setup, setup_main
from dataset import deserialize_from_file, divide_dataset, build_fuel, GuessOrder
from game.asker import Asker
from game.responder import Responder
from models.variational import Helmholtz
from utils.generic_utils import *
theano.config.optimizer = 'fast_compile'
Asker = Asker # GridAsker # PyramidAsker
logger = logging.getLogger(__name__)
lm_config = setup()
main_config = setup_main() # setup_grid6() # setup_pyramid() # setup_grid() # setup_main()
# logging.basicConfig(level= main_config['level'], format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
np.random.seed(main_config['seed'])
n_rng = np.random.RandomState(main_config['seed'])
rng = RandomStreams(n_rng.randint(2 ** 30), use_cuda=True)
"""
Main Loop.
"""
print 'start.'
# load the dataset and build a fuel-dataset.
idx2word, word2idx = deserialize_from_file(lm_config['vocabulary_set'])
# load the fake_dialogue dataset.
print 'Dataset: {}'.format(main_config['fake_diag'])
fake_data = deserialize_from_file(main_config['fake_diag'])
train_set, test_set = divide_dataset(fake_data, main_config['test_size'], 200000)
lm_config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1
lm_config['dec_voc_size'] = lm_config['enc_voc_size']
lm_config['state_dim'] = main_config['core_hidden_dim']
main_config['enc_voc_size'] = lm_config['enc_voc_size']
database = deserialize_from_file(lm_config['dataset'])
dataset = build_fuel(database)
weights_file = lm_config['weights_file']
answer_templates = {0: 'I cannot understand.', 1: 'Congrats!', 2: 'Pity.'}
logger.info('build dataset done. vocabulary size = {0}'.format(lm_config['dec_voc_size']))
start_time = time.time()
# build the environment
game = GuessOrder(rng=n_rng, size=main_config['game_length'])
environment = Responder(game=game)
# load the pretrained generator
generator = Helmholtz(lm_config, n_rng, rng, dynamic_prior=True)
generator.build_()
generator.load(weights_file)
generator.dynamic()
# build the agent.
agent = Asker(main_config, lm_config, n_rng, rng, generator)
agent.build_()
agent.compile_asker()
logger.info('compile the asker sampler ok.')
# # build the scheduled trainer if any.
# agent.compile_scheduled_trainer()
# logger.info('compile the asker ss-learner ok.')
# build the trainer
agent.compile_trainer()
logger.info('compile the asker learner ok.')
end_time = time.time()
logger.info('compiling done. It costs {} seconds'.format(end_time - start_time))
def simulator(M=25, display=False):
"""
Dialogue Simulation
"""
start_time = time.time()
progbar = Progbar(M)
logger.info('Start simulation.')
train_data = {'X': [], 'Y': [], 'A': [], 'R': [], 'G': [], 'T': [], 'text': [], 'acc': []}
for ep in xrange(M):
environment.reset()
episode = {'x': [], 'y': [], 'a': [], 'r': []}
conversation = ''
conversation += '\n\n\n' + '***' * 30
conversation += '\nGame start.'
turn = 0
maxturn = 16
kwargs = {'turn': turn, 'maxturn': maxturn}
for k in xrange(maxturn + 1):
if kwargs['turn'] == maxturn:
guess, score = agent.act(kwargs)
conversation += '\n' + '_' * 93 + '[{}]'.format(kwargs['turn'])
conversation += '\n(´✪ ‿ ✪`)ノ : {}'.format('My answer = ' + ' '.join([str(w) for w in guess]))
corrects = environment.get_answer()
conversation += '\n{:>78} : ლ(´∀`ლ)'.format(' '.join([str(w) for w in corrects]))
Accuracy = sum([g == c for g, c in zip(guess, corrects)]) / float(len(guess))
conversation += '\n{:>78} : ლ(´∀`ლ)'.format('Accuracy = {}%'.format(Accuracy * 100))
episode['g'] = np.asarray(guess)
episode['t'] = np.asarray(corrects)
episode['r'].append(Accuracy)
episode['c'] = Accuracy
break
next_action, next_sent, kwargs = agent.act(kwargs)
question = ' '.join(print_sample(idx2word, next_sent)[:-1])
conversation += '\n' + '_' * 93 + '[{}]'.format(kwargs['turn'])
conversation += '\n(´◉ ω ◉`)? : {}'.format(question)
got = environment.parse(question)
reward = 0 if got > 0 else -1
kwargs['prev_asw'] = np.asarray([got], dtype='int32')
conversation += '\n{:>78} : (●´ε`●)'.format(answer_templates[got])
# registration
episode['a'].append(next_action)
episode['y'].append(next_sent[None, :])
episode['x'].append(got)
episode['r'].append(reward)
conversation += '\nGame End\n' + '***' * 30
if display:
logger.info(conversation)
# concatenate
train_data['A'].append(np.concatenate(episode['a'], axis=0)[None, :, :])
train_data['Y'].append(np.concatenate(episode['y'], axis=0)[None, :, :])
train_data['X'].append(np.asarray(episode['x'], dtype='int32')[None, :])
train_data['R'].append(np.asarray(episode['r'], dtype='float32')[::-1].cumsum()[::-1][None, :])
train_data['G'].append(episode['g'][None, :])
train_data['T'].append(episode['t'][None, :])
train_data['text'].append(conversation)
train_data['acc'].append(episode['c'])
progbar.update(ep + 1, [('accuracy', episode['c'])])
train_data['A'] = np.concatenate(train_data['A'], axis=0).astype('float32')
train_data['X'] = np.concatenate(train_data['X'], axis=0).astype('int32')
train_data['Y'] = np.concatenate(train_data['Y'], axis=0).astype('int32')
train_data['R'] = np.concatenate(train_data['R'], axis=0).astype('float32')
train_data['G'] = np.concatenate(train_data['G'], axis=0).astype('int32')
train_data['T'] = np.concatenate(train_data['T'], axis=0).astype('int32')
end_time = time.time()
print ''
logger.info('Simulated {0} episodes in {1} seconds.'.format(M, end_time - start_time))
return train_data
def learner(data, fr=1., fs=1., fb=1.):
"""
Training.
"""
start_time = time.time()
X = data['X'] # answers obtained from the environment;
Y = data['Y'] # questions generated based on policy;
A = data['A'] # actions performed in Helmholtz questions generator;
R = data['R'] # cumulative reward obtained through conversation;
guess = data['G'] # final guess order given by the agent
truth = data['T'] # real order in the environment
loss = agent.train(X, Y, A, R, guess, truth, fr, fs, fb)
end_time = time.time()
logger.info('Training this batch took {0} seconds.'.format(end_time - start_time))
logger.info('REINFORCE Loss = {0}, Supervised loss = {1}, Baseline loss = {2}'.format(
loss[0], loss[1], loss[2]))
return loss
def SL_learner(data, batch_size=25, eval_freq=0, eval_train=None, eval_test=None):
"""
Supervised Learning with fake-optimal logs.
One epoch for all data.
"""
start_time = time.time()
X = data['X'].astype('int32') # answers obtained from the environment;
Y = data['Y'].astype('int32') # questions generated based on policy;
T = data['T'].astype('int32') # real order in the environment
# index shuffle
idx = np.arange(X.shape[0]).tolist()
np.random.shuffle(idx)
num_batch = X.shape[0] / batch_size
progbar = Progbar(num_batch)
batch_from = 0
loss = []
if eval_freq > 0:
eval_batch = num_batch / eval_freq
eval_start = 0
batches = []
accs, unds = [], []
acct, undt = [], []
for batch in xrange(num_batch):
batch_to = batch_from + batch_size
if batch_to > X.shape[0]:
batch_to = X.shape[0]
batch_X = X[idx[batch_from: batch_to]]
batch_Y = Y[idx[batch_from: batch_to]]
batch_T = T[idx[batch_from: batch_to]]
if not main_config['multi_task']:
loss.append(agent.train_sl(batch_X, batch_Y, batch_T))
# if not main_config['ssl']:
# loss.append(agent.train_sl(batch_X, batch_Y, batch_T))
# else:
# loss.append(agent.train_ssl(batch_X, batch_Y, batch_T, 3, 10.))
progbar.update(batch + 1, [('loss', loss[-1])])
else:
loss.append(agent.train_sl(batch_X, batch_Y, batch_T))
# loss.append(agent.train_mul(batch_X, batch_Y, batch_T, 3, 10.))
progbar.update(batch + 1, [('loss', loss[-1][0]), ('ppl', loss[-1][1]), ('asw loss', loss[-1][2])])
batch_from = batch_to
if eval_freq > 0:
eval_start += 1
if eval_start == eval_batch or batch == num_batch - 1:
batches.append(batch_to)
if eval_train:
logger.info('\ntesting on sampled training set.')
acc, und = SL_test(eval_train)
accs.append(acc)
unds.append(und)
if eval_test:
logger.info('testing on sampled testing set.')
acc, und = SL_test(eval_test)
acct.append(acc)
undt.append(und)
eval_start = 0
end_time = time.time()
logger.info('Training this epoch took {0} seconds.'.format(end_time - start_time))
logger.info('Supervised loss = {}'.format(np.mean(loss)))
if eval_freq > 0:
eval_details = {'batch_id': batches, 'acc_train': accs, 'acc_test': acct, 'und_train': unds, 'und_test': undt}
return loss, eval_details
return loss
def main():
losses = []
accuracy = []
for echo in xrange(4000):
logger.info('Iteration = {}'.format(echo))
train_data = simulator(M=20)
print train_data['text'][-1]
loss = learner(train_data, fr=0.)
losses.append(loss)
accuracy += train_data['acc']
if echo % 100 == 99:
pyplot.plot(accuracy)
pyplot.show()
# pkl.dump(losses, open('losses.temp.pkl'))
def check_answer(x, y, g):
g = np.asarray(g)
environment.game.set_answer(g)
s = 0
for k in xrange(x.shape[1]):
question = ' '.join(print_sample(idx2word, y[0][k].tolist())[:-1])
got = environment.parse(question)
if got == 2 - x[0][k]:
s += 1.
return s / x.shape[1]
def display_session(x, y, g, t, acc, cov):
"""
display a dialogue session
"""
conversation = ''
conversation += '\n\n\n' + '***' * 30
conversation += '\nGame start.'
for k in xrange(x.shape[1]):
question = ' '.join(print_sample(idx2word, y[0][k].tolist())[:-1])
conversation += '\n' + '_' * 93 + '[{}]'.format(k + 1)
conversation += '\n(´◉ ω ◉`)? : {}'.format(question)
got = x[0][k]
conversation += '\n{:>78} : (●´ε`●)'.format(answer_templates[got])
conversation += '\n' + '_' * 93 + '[{}]'.format(k + 1)
conversation += '\n(´✪ ‿ ✪`)ノ : {}'.format('My answer = ' + ' '.join([str(w) for w in g]))
conversation += '\n{:>78} : ლ(´∀`ლ)'.format(' '.join([str(w) for w in t[0]]))
conversation += '\n{:>78} : ლ(´∀`ლ)'.format('Accuracy = {}%'.format(acc * 100))
conversation += '\n{:>78} : ლ(´∀`ლ)'.format('Understand = {}%'.format(cov * 100))
conversation += '\nGame End\n' + '***' * 30
return conversation
def SL_test(test_set):
print '...'
progbar = Progbar(main_config['test_size'])
accuracy = []
understand = []
# untruth = []
at = 0
for k in xrange(main_config['test_size']):
at += 1
x = test_set['X'][None, k]
y = test_set['Y'][None, k]
t = test_set['T'][None, k]
g, _, acc = agent.evaluate(x, y, t)
cov = check_answer(x, y, g)
# cov_t = check_answer(x, y, t[0].tolist())
progbar.update(at, [('acc', acc), ('und', cov)])
# untruth.append(cov_t)
accuracy.append(acc)
understand.append(cov)
acc = np.mean(accuracy)
und = np.mean(understand)
print '\nevaluation: average accuracy = {0}% / understood {1}% of questions'.format(
100 * acc, 100 * und)
# print 'check truth {}%'.format(100 * np.mean(untruth))
return acc, und
def main_sl():
# get the evaluation set.
evaluation_set = n_rng.randint(0, train_set['X'].shape[0], main_config['test_size']).tolist()
eval_train = dict()
eval_train['X'] = train_set['X'][evaluation_set]
eval_train['Y'] = train_set['Y'][evaluation_set]
eval_train['T'] = train_set['T'][evaluation_set]
eval_test = test_set
eval_details = {'batch_id': [], 'acc_train': [],
'acc_test': [], 'und_train': [], 'und_test': []}
for echo in xrange(500):
logger.info('Epoch = {}'.format(echo))
loss, ed = SL_learner(train_set, batch_size=50, eval_freq=10,
eval_train=eval_train, eval_test=eval_test)
eval_details['acc_train'] += ed['acc_train']
eval_details['acc_test'] += ed['acc_test']
eval_details['und_train'] += ed['und_train']
eval_details['und_test'] += ed['und_test']
eval_details['batch_id'] += [(t + 200000 * echo) / 1000.0 for t in ed['batch_id']]
pyplot.figure(1)
pyplot.plot(eval_details['batch_id'], eval_details['acc_train'], 'b')
pyplot.plot(eval_details['batch_id'], eval_details['acc_test'], 'r')
pyplot.xlabel('iterations (x 1000)')
pyplot.ylabel('accuracy')
pyplot.savefig('./acc-gru.png')
pyplot.figure(2)
pyplot.plot(eval_details['batch_id'], eval_details['und_train'], 'b')
pyplot.plot(eval_details['batch_id'], eval_details['und_test'], 'r')
pyplot.xlabel('iterations (x 1000)')
pyplot.ylabel('understand rate')
pyplot.savefig('./und-gru.png')
logger.info("saving ok!")
# if echo % 20 == 19:
# pyplot.figure(1)
# pyplot.plot(acc_s, 'r')
# pyplot.plot(acc_t, 'g')
# pyplot.figure(2)
# pyplot.plot(und_s, 'r')
# pyplot.plot(und_t, 'g')
# pyplot.show()
# agent.main_config['sample_beam'] = 1
# agent.main_config['sample_argmax'] = True
main_sl()
================================================
FILE: emolga/utils/__init__.py
================================================
__author__ = 'yinpengcheng'
================================================
FILE: emolga/utils/generic_utils.py
================================================
from __future__ import absolute_import
from matplotlib.ticker import FuncFormatter
import numpy as np
import time
import sys
import six
import matplotlib.pyplot as plt
import matplotlib
def get_from_module(identifier, module_params, module_name, instantiate=False, kwargs=None):
if isinstance(identifier, six.string_types):
res = module_params.get(identifier)
if not res:
raise Exception('Invalid ' + str(module_name) + ': ' + str(identifier))
if instantiate and not kwargs:
return res()
elif instantiate and kwargs:
return res(**kwargs)
else:
return res
return identifier
def make_tuple(*args):
return args
def printv(v, prefix=''):
if type(v) == dict:
if 'name' in v:
print(prefix + '#' + v['name'])
del v['name']
prefix += '...'
for nk, nv in v.items():
if type(nv) in [dict, list]:
print(prefix + nk + ':')
printv(nv, prefix)
else:
print(prefix + nk + ':' + str(nv))
elif type(v) == list:
prefix += '...'
for i, nv in enumerate(v):
print(prefix + '#' + str(i))
printv(nv, prefix)
else:
prefix += '...'
print(prefix + str(v))
def make_batches(size, batch_size):
nb_batch = int(np.ceil(size/float(batch_size)))
return [(i*batch_size, min(size, (i+1)*batch_size)) for i in range(0, nb_batch)]
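# Example: make_batches(10, 4) -> [(0, 4), (4, 8), (8, 10)]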
def slice_X(X, start=None, stop=None):
if type(X) == list:
if hasattr(start, '__len__'):
return [x[start] for x in X]
else:
return [x[start:stop] for x in X]
else:
if hasattr(start, '__len__'):
return X[start]
else:
return X[start:stop]
class Progbar(object):
def __init__(self, target, width=30, verbose=1):
'''
@param target: total number of steps expected
'''
self.width = width
self.target = target
self.sum_values = {}
self.unique_values = []
self.start = time.time()
self.total_width = 0
self.seen_so_far = 0
self.verbose = verbose
def update(self, current, values=[]):
'''
@param current: index of current step
@param values: list of tuples (name, value_for_last_step).
The progress bar will display averages for these values.
'''
for k, v in values:
if k not in self.sum_values:
self.sum_values[k] = [v * (current - self.seen_so_far), current - self.seen_so_far]
self.unique_values.append(k)
else:
self.sum_values[k][0] += v * (current - self.seen_so_far)
self.sum_values[k][1] += (current - self.seen_so_far)
self.seen_so_far = current
now = time.time()
if self.verbose == 1:
prev_total_width = self.total_width
sys.stdout.write("\b" * prev_total_width)
sys.stdout.write("\r")
numdigits = int(np.floor(np.log10(self.target))) + 1
barstr = '%%%dd/%%%dd [' % (numdigits, numdigits)
bar = barstr % (current, self.target)
prog = float(current)/self.target
prog_width = int(self.width*prog)
if prog_width > 0:
bar += ('.'*(prog_width-1))
if current < self.target:
bar += '(-w-)'
else:
bar += '(-v-)!!'
bar += ('~' * (self.width-prog_width))
bar += ']'
sys.stdout.write(bar)
self.total_width = len(bar)
if current:
time_per_unit = (now - self.start) / current
else:
time_per_unit = 0
eta = time_per_unit*(self.target - current)
info = ''
if current < self.target:
info += ' - ETA: %ds' % eta
else:
info += ' - %ds' % (now - self.start)
for k in self.unique_values:
if k == 'perplexity' or k == 'PPL':
info += ' - %s: %.4f' % (k, np.exp(self.sum_values[k][0] / max(1, self.sum_values[k][1])))
else:
info += ' - %s: %.4f' % (k, self.sum_values[k][0] / max(1, self.sum_values[k][1]))
self.total_width += len(info)
if prev_total_width > self.total_width:
info += ((prev_total_width-self.total_width) * " ")
sys.stdout.write(info)
sys.stdout.flush()
if current >= self.target:
sys.stdout.write("\n")
if self.verbose == 2:
if current >= self.target:
info = '%ds' % (now - self.start)
for k in self.unique_values:
info += ' - %s: %.4f' % (k, self.sum_values[k][0] / max(1, self.sum_values[k][1]))
sys.stdout.write(info + "\n")
def add(self, n, values=[]):
self.update(self.seen_so_far + n, values)
def clear(self):
self.sum_values = {}
self.unique_values = []
self.total_width = 0
self.seen_so_far = 0
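# Minimal usage sketch for Progbar (assumed loop and metric, not executed here):
#   bar = Progbar(target=100)
#   for step in xrange(100):
#       bar.update(step + 1, [('loss', 0.5)])
# update() keeps running averages of the named values; keys called
# 'perplexity' or 'PPL' are reported as np.exp(average) instead.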
def print_sample(idx2word, idx):
def cut_eol(words):
for i, word in enumerate(words):
if words[i] == '<eol>':
return words[:i + 1]
raise Exception("No end-of-line found")
return cut_eol(map(lambda w_idx : idx2word[w_idx], idx))
def visualize_(subplots, data, w=None, h=None, name=None,
display='on', size=10, text=None, normal=True,
grid=False):
fig, ax = subplots
if data.ndim == 1:
if w and h:
# vector visualization
assert w * h == np.prod(data.shape)
data = data.reshape((w, h))
else:
L = data.shape[0]
w = int(np.sqrt(L))
while L % w > 0:
w -= 1
h = L / w
assert w * h == np.prod(data.shape)
data = data.reshape((w, h))
else:
w = data.shape[0]
h = data.shape[1]
if not size:
size = 30 / np.sqrt(w * h)
print data.shape
major_ticks = np.arange(0, h, 1)
ax.set_xticks(major_ticks)
ax.set_xlim(0, h)
major_ticks = np.arange(0, w, 1)
ax.set_ylim(w, -1)
ax.set_yticks(major_ticks)
ax.set_aspect('equal')
if grid:
pass
ax.grid(which='both')
# ax.axis('equal')
if normal:
cax = ax.imshow(data, cmap=plt.cm.pink, interpolation='nearest',
vmax=1.0, vmin=0.0, aspect='auto')
else:
cax = ax.imshow(data, cmap=plt.cm.bone, interpolation='nearest', aspect='auto')
if name:
ax.set_title(name)
else:
ax.set_title('sample.')
import matplotlib.ticker as ticker
# ax.xaxis.set_ticks(np.arange(0, h, 1.))
# ax.xaxis.set_major_formatter(ticker.FormatStrFormatter('%0.1f'))
# ax.yaxis.set_ticks(np.arange(0, w, 1.))
# ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%0.1f'))
# ax.set_xticks(np.linspace(0, 1, h))
# ax.set_yticks(np.linspace(0, 1, w))
# Move left and bottom spines outward by 10 points
# ax.spines['left'].set_position(('outward', size))
# ax.spines['bottom'].set_position(('outward', size))
# # Hide the right and top spines
# ax.spines['right'].set_visible(False)
# ax.spines['top'].set_visible(False)
# # Only show ticks on the left and bottom spines
# ax.yaxis.set_ticks_position('left')
# ax.xaxis.set_ticks_position('bottom')
if text:
ax.set_yticks(np.linspace(0, 1, 33) * size * 3.2)
ax.set_yticklabels([text[s] for s in xrange(33)])
# cbar = fig.colorbar(cax)
if display == 'on':
plt.show()
else:
return ax
def vis_Gaussian(subplot, mean, std, name=None, display='off', size=10):
ax = subplot
data = np.random.normal(size=(2, 10000))
data[0] = data[0] * std[0] + mean[0]
data[1] = data[1] * std[1] + mean[1]
ax.scatter(data[0].tolist(), data[1].tolist(), c='r', marker='.')
if display == 'on':
plt.show()
else:
return ax
================================================
FILE: emolga/utils/io_utils.py
================================================
from __future__ import absolute_import
import h5py
import numpy as np
import cPickle
from collections import defaultdict
class HDF5Matrix():
refs = defaultdict(int)
def __init__(self, datapath, dataset, start, end, normalizer=None):
if datapath not in list(self.refs.keys()):
f = h5py.File(datapath)
self.refs[datapath] = f
else:
f = self.refs[datapath]
self.start = start
self.end = end
self.data = f[dataset]
self.normalizer = normalizer
def __len__(self):
return self.end - self.start
def __getitem__(self, key):
if isinstance(key, slice):
if key.stop + self.start <= self.end:
idx = slice(key.start+self.start, key.stop + self.start)
else:
raise IndexError
elif isinstance(key, int):
if key + self.start < self.end:
idx = key+self.start
else:
raise IndexError
elif isinstance(key, np.ndarray):
if np.max(key) + self.start < self.end:
idx = (self.start + key).tolist()
else:
raise IndexError
elif isinstance(key, list):
if max(key) + self.start < self.end:
idx = [x + self.start for x in key]
else:
raise IndexError
if self.normalizer is not None:
return self.normalizer(self.data[idx])
else:
return self.data[idx]
@property
def shape(self):
return tuple([self.end - self.start, self.data.shape[1]])
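# Usage sketch (hypothetical file/dataset names, not executed here):
#   X = HDF5Matrix('features.h5', 'data', start=0, end=1000)
#   len(X)     # -> 1000
#   X[0:32]    # reads rows 0..31 from disk on demand
# so a dataset larger than memory can be indexed like a numpy array.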
def save_array(array, name):
import tables
f = tables.open_file(name, 'w')
atom = tables.Atom.from_dtype(array.dtype)
ds = f.create_carray(f.root, 'data', atom, array.shape)
ds[:] = array
f.close()
def load_array(name):
import tables
f = tables.open_file(name)
array = f.root.data
a = np.empty(shape=array.shape, dtype=array.dtype)
a[:] = array[:]
f.close()
return a
def save_config():
pass
def load_config():
pass
================================================
FILE: emolga/utils/np_utils.py
================================================
from __future__ import absolute_import
import numpy as np
import scipy as sp
from six.moves import range
from six.moves import zip
def to_categorical(y, nb_classes=None):
'''Convert class vector (integers from 0 to nb_classes)
to binary class matrix, for use with categorical_crossentropy
'''
y = np.asarray(y, dtype='int32')
if not nb_classes:
nb_classes = np.max(y)+1
Y = np.zeros((len(y), nb_classes))
for i in range(len(y)):
Y[i, y[i]] = 1.
return Y
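# Example: to_categorical([0, 2, 1], nb_classes=3) ->
#   [[1., 0., 0.],
#    [0., 0., 1.],
#    [0., 1., 0.]]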
def normalize(a, axis=-1, order=2):
l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
l2[l2 == 0] = 1
return a / np.expand_dims(l2, axis)
def binary_logloss(p, y):
epsilon = 1e-15
p = sp.maximum(epsilon, p)
p = sp.minimum(1-epsilon, p)
res = sum(y * sp.log(p) + sp.subtract(1, y) * sp.log(sp.subtract(1, p)))
res *= -1.0/len(y)
return res
def multiclass_logloss(P, Y):
score = 0.
npreds = [P[i][Y[i]-1] for i in range(len(Y))]
score = -(1. / len(Y)) * np.sum(np.log(npreds))
return score
def accuracy(p, y):
return np.mean([a == b for a, b in zip(p, y)])
def probas_to_classes(y_pred):
if len(y_pred.shape) > 1 and y_pred.shape[1] > 1:
return categorical_probas_to_classes(y_pred)
return np.array([1 if p > 0.5 else 0 for p in y_pred])
def categorical_probas_to_classes(p):
return np.argmax(p, axis=1)
================================================
FILE: emolga/utils/test_utils.py
================================================
import numpy as np
def get_test_data(nb_train=1000, nb_test=500, input_shape=(10,), output_shape=(2,),
classification=True, nb_class=2):
'''
classification=True overrides output_shape
(i.e. output_shape is set to (1,)) and the output
consists in integers in [0, nb_class-1].
Otherwise: float output with shape output_shape.
'''
nb_sample = nb_train + nb_test
if classification:
y = np.random.randint(0, nb_class, size=(nb_sample, 1))
X = np.zeros((nb_sample,) + input_shape)
for i in range(nb_sample):
X[i] = np.random.normal(loc=y[i], scale=1.0, size=input_shape)
else:
y_loc = np.random.random((nb_sample,))
X = np.zeros((nb_sample,) + input_shape)
y = np.zeros((nb_sample,) + output_shape)
for i in range(nb_sample):
X[i] = np.random.normal(loc=y_loc[i], scale=1.0, size=input_shape)
y[i] = np.random.normal(loc=y_loc[i], scale=1.0, size=output_shape)
return (X[:nb_train], y[:nb_train]), (X[nb_train:], y[nb_train:])
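# Usage sketch (assumed sizes, not executed here):
#   (X_train, y_train), (X_test, y_test) = get_test_data(
#       nb_train=100, nb_test=50, input_shape=(10,), nb_class=3)
#   # X_train.shape == (100, 10); y_train holds integers in [0, 2]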
================================================
FILE: emolga/utils/theano_utils.py
================================================
from __future__ import absolute_import
from theano import gof
from theano.tensor import basic as tensor
import numpy as np
import theano
import theano.tensor as T
def floatX(X):
return np.asarray(X, dtype=theano.config.floatX)
def sharedX(X, dtype=theano.config.floatX, name=None):
return theano.shared(np.asarray(X, dtype=dtype), name=name)
def shared_zeros(shape, dtype=theano.config.floatX, name=None):
return sharedX(np.zeros(shape), dtype=dtype, name=name)
def shared_scalar(val=0., dtype=theano.config.floatX, name=None):
return theano.shared(np.cast[dtype](val), name=name)
def shared_ones(shape, dtype=theano.config.floatX, name=None):
return sharedX(np.ones(shape), dtype=dtype, name=name)
def alloc_zeros_matrix(*dims):
return T.alloc(np.cast[theano.config.floatX](0.), *dims)
def alloc_ones_matrix(*dims):
return T.alloc(np.cast[theano.config.floatX](1.), *dims)
def ndim_tensor(ndim):
if ndim == 1:
return T.vector()
elif ndim == 2:
return T.matrix()
elif ndim == 3:
return T.tensor3()
elif ndim == 4:
return T.tensor4()
return T.matrix()
# get int32 tensor
def ndim_itensor(ndim, name=None):
if ndim == 2:
return T.imatrix(name)
elif ndim == 3:
return T.itensor3(name)
elif ndim == 4:
return T.itensor4(name)
return T.imatrix(name)
# dot-product
def dot(inp, matrix, bias=None):
"""
Decide the right type of dot product depending on the input
arguments
"""
if 'int' in inp.dtype and inp.ndim == 2:
return matrix[inp.flatten()]
elif 'int' in inp.dtype:
return matrix[inp]
elif 'float' in inp.dtype and inp.ndim == 3:
shape0 = inp.shape[0]
shape1 = inp.shape[1]
shape2 = inp.shape[2]
if bias is not None:
return (T.dot(inp.reshape((shape0 * shape1, shape2)), matrix) + bias).reshape((shape0, shape1, matrix.shape[1]))
else:
return T.dot(inp.reshape((shape0 * shape1, shape2)), matrix).reshape((shape0, shape1, matrix.shape[1]))
else:
if bias is not None:
return T.dot(inp, matrix) + bias
else:
return T.dot(inp, matrix)
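# Usage sketch (symbolic, assumed shapes; not executed here):
#   W = theano.shared(np.zeros((5000, 128), dtype=theano.config.floatX))
#   idx = T.imatrix('idx')   # (nb_samples, seq_len) word ids
#   emb = dot(idx, W)        # int input -> row lookup, shape (nb_samples * seq_len, 128)
# a float 3D input instead takes the reshape-to-2D affine branch above.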
# Numerically stable log(sum(exp(A))). Can also be used in softmax function.
def logSumExp(x, axis=None, mask=None, status='theano', c=None, err=1e-7):
"""
Numerically stable log(sum(exp(A))). Can also be used in softmax function.
c is the additional input when it doesn't require masking but x need.
"""
if status == 'theano':
J = T
else:
J = np
if c is None:
x_max = J.max(x, axis=axis, keepdims=True)
else:
x_max = J.max(J.concatenate([c, x], axis=-1), axis=axis, keepdims=True)
if c is None:
if mask is None:
l_t = J.sum(J.exp(x - x_max), axis=axis, keepdims=True)
else:
l_t = J.sum(J.exp(x - x_max) * mask, axis=axis, keepdims=True)
else:
if mask is None:
l_t = J.sum(J.exp(x - x_max), axis=axis, keepdims=True) + \
J.sum(J.exp(c - x_max), axis=axis, keepdims=True)
else:
l_t = J.sum(J.exp(x - x_max) * mask, axis=axis, keepdims=True) + \
J.sum(J.exp(c - x_max), axis=axis, keepdims=True)
x_t = J.log(J.maximum(l_t, err)) + x_max
return x_t
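# Numpy sanity check (status='numpy'; illustration only, not executed here):
#   x = np.array([[1., 2., 3.]])
#   logSumExp(x, axis=-1, status='numpy')   # -> [[3.4076...]] == log(e^1 + e^2 + e^3)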
def softmax(x):
return T.nnet.softmax(x.reshape((-1, x.shape[-1]))).reshape(x.shape)
def masked_softmax(x, mask, err=1e-9):
assert x.ndim == 2, 'support two-dimension'
weights = softmax(x)
weights *= mask
weights = weights / (T.sum(weights, axis=-1)[:, None] + err) * mask
return weights
def cosine_sim(k, M):
k_unit = k / (T.sqrt(T.sum(k**2)) + 1e-5)
# T.patternbroadcast(k_unit.reshape((1,k_unit.shape[0])),(True,False))
k_unit = k_unit.dimshuffle(('x', 0))
k_unit.name = "k_unit"
M_lengths = T.sqrt(T.sum(M**2, axis=1)).dimshuffle((0, 'x'))
M_unit = M / (M_lengths + 1e-5)
M_unit.name = "M_unit"
return T.sum(k_unit * M_unit, axis=1)
def cosine_sim2d(k, M):
# k: (nb_samples, memory_width)
# M: (nb_samples, memory_dim, memory_width)
# norms of keys and memories
k_norm = T.sqrt(T.sum(T.sqr(k), 1)) + 1e-5 # (nb_samples,)
M_norm = T.sqrt(T.sum(T.sqr(M), 2)) + 1e-5 # (nb_samples, memory_dim,)
k = k[:, None, :] # (nb_samples, 1, memory_width)
k_norm = k_norm[:, None] # (nb_samples, 1)
sim = T.sum(k * M, axis=2) # (nb_samples, memory_dim,)
sim /= k_norm * M_norm # (nb_samples, memory_dim,)
return sim
def dot_2d(k, M, b=None, g=None):
# k: (nb_samples, memory_width)
# M: (nb_samples, memory_dim, memory_width)
# norms of keys and memories
# k_norm = T.sqrt(T.sum(T.sqr(k), 1)) + 1e-5 # (nb_samples,)
# M_norm = T.sqrt(T.sum(T.sqr(M), 2)) + 1e-5 # (nb_samples, memory_dim,)
k = k[:, None, :] # (nb_samples, 1, memory_width)
value = k * M
if b is not None:
b = b[:, None, :]
value *= b # (nb_samples, memory_dim,)
if g is not None:
g = g[None, None, :]
value *= g
sim = T.sum(value, axis=2)
return sim
def shift_convolve(weight, shift, shift_conv):
shift = shift.dimshuffle((0, 'x'))
return T.sum(shift * weight[shift_conv], axis=0)
def shift_convolve2d(weight, shift, shift_conv):
return T.sum(shift[:, :, None] * weight[:, shift_conv], axis=1)
================================================
FILE: experiments/__init__.py
================================================
__author__ = 'jiataogu'
================================================
FILE: experiments/bst_dataset.py
================================================
# coding=utf-8
__author__ = 'jiataogu'
from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file
import numpy.random as n_rng
class BSTnode(object):
"""
Representation of a node in a binary search tree.
Has a left child, right child, and key value, and stores its subtree size.
"""
def __init__(self, parent, t):
"""Create a new leaf with key t."""
self.key = t
self.parent = parent
self.left = None
self.right = None
self.size = 1
def update_stats(self):
"""Updates this node's size based on its children's sizes."""
self.size = (0 if self.left is None else self.left.size) + (0 if self.right is None else self.right.size) + 1
def insert(self, t, NodeType):
"""Insert key t into the subtree rooted at this node (updating subtree size)."""
self.size += 1
if t < self.key:
if self.left is None:
self.left = NodeType(self, t)
return self.left
else:
return self.left.insert(t, NodeType)
elif t > self.key:
if self.right is None:
self.right = NodeType(self, t)
return self.right
else:
return self.right.insert(t, NodeType)
else:
return self
def find(self, t):
"""Return the node for key t if it is in this tree, or None otherwise."""
if t == self.key:
return self
elif t < self.key:
if self.left is None:
return None
else:
return self.left.find(t)
else:
if self.right is None:
return None
else:
return self.right.find(t)
def rank(self, t):
"""Return the number of keys <= t in the subtree rooted at this node."""
left_size = 0 if self.left is None else self.left.size
if t == self.key:
return left_size + 1
elif t < self.key:
if self.left is None:
return 0
else:
return self.left.rank(t)
else:
if self.right is None:
return left_size + 1
else:
return self.right.rank(t) + left_size + 1
def minimum(self):
"""Returns the node with the smallest key in the subtree rooted by this node."""
current = self
while current.left is not None:
current = current.left
return current
def successor(self):
"""Returns the node with the smallest key larger than this node's key, or None if this has the largest key in the tree."""
if self.right is not None:
return self.right.minimum()
current = self
while current.parent is not None and current.parent.right is current:
current = current.parent
return current.parent
def delete(self):
""""Delete this node from the tree."""
if self.left is None or self.right is None:
if self is self.parent.left:
self.parent.left = self.left or self.right
if self.parent.left is not None:
self.parent.left.parent = self.parent
else:
self.parent.right = self.left or self.right
if self.parent.right is not None:
self.parent.right.parent = self.parent
current = self.parent
while current.key is not None:
current.update_stats()
current = current.parent
return self
else:
s = self.successor()
self.key, s.key = s.key, self.key
return s.delete()
def check(self, lokey, hikey):
"""Checks that the subtree rooted at t is a valid BST and all keys are between (lokey, hikey)."""
if lokey is not None and self.key <= lokey:
raise AssertionError("BST RI violation")
if hikey is not None and self.key >= hikey:
raise AssertionError("BST RI violation")
if self.left is not None:
if self.left.parent is not self:
raise AssertionError("BST RI violation")
self.left.check(lokey, self.key)
if self.right is not None:
if self.right.parent is not self:
raise AssertionError("BST RI violation")
self.right.check(self.key, hikey)
if self.size != 1 + (0 if self.left is None else self.left.size) + (0 if self.right is None else self.right.size):
raise AssertionError("BST RI violation")
def __repr__(self):
return ""
class BST(object):
"""
Simple binary search tree implementation, augmented with subtree sizes.
This BST supports insert, find, and delete-min operations.
Each tree contains some (possibly 0) BSTnode objects, representing nodes,
and a pointer to the root.
"""
def __init__(self, NodeType=BSTnode):
self.root = None
self.NodeType = NodeType
self.psroot = self.NodeType(None, None)
def reroot(self):
self.root = self.psroot.left
def insert(self, t):
"""Insert key t into this BST, modifying it in-place."""
if self.root is None:
self.psroot.left = self.NodeType(self.psroot, t)
self.reroot()
return self.root
else:
return self.root.insert(t, self.NodeType)
def find(self, t):
"""Return the node for key t if is in the tree, or None otherwise."""
if self.root is None:
return None
else:
return self.root.find(t)
def rank(self, t):
"""The number of keys <= t in the tree."""
if self.root is None:
return 0
else:
return self.root.rank(t)
def delete(self, t):
"""Delete the node for key t if it is in the tree."""
node = self.find(t)
deleted = node.delete()
self.reroot()
return deleted
def check(self):
if self.root is not None:
self.root.check(None, None)
def __str__(self):
if self.root is None:
return ''
def nested(node):
if node is None:
return '0'
head = str(node.key)
left = nested(node.left)
right = nested(node.right)
if left == '0' and right == '0':
return head
else:
return ' '.join(['(', head, left, right, ')'])
return nested(self.root)
# def recurse(node):
# if node is None:
# return [], 0, 0
# label = str(node.key)
# left_lines, left_pos, left_width = recurse(node.left)
# right_lines, right_pos, right_width = recurse(node.right)
# middle = max(right_pos + left_width - left_pos + 1, len(label), 2)
# pos = left_pos + middle // 2
# width = left_pos + middle + right_width - right_pos
# while len(left_lines) < len(right_lines):
# left_lines.append(' ' * left_width)
# while len(right_lines) < len(left_lines):
# right_lines.append(' ' * right_width)
# if (middle - len(label)) % 2 == 1 and node.parent is not None and \
# node is node.parent.left and len(label) < middle:
# label += '.'
# label = label.center(middle, '.')
# if label[0] == '.': label = ' ' + label[1:]
# if label[-1] == '.': label = label[:-1] + ' '
# lines = [' ' * left_pos + label + ' ' * (right_width - right_pos),
# ' ' * left_pos + '/' + ' ' * (middle-2) +
# '\\' + ' ' * (right_width - right_pos)] + \
# [left_line + ' ' * (width - left_width - right_width) +
# right_line
# for left_line, right_line in zip(left_lines, right_lines)]
# return lines, pos, width
# return '\n'.join(recurse(self.root) [0])
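# Serialization sketch: str(tree) yields the nested form used as the copy
# target in generate() below (illustration only, not executed here):
#   tree = BST()
#   for key in ['D', 'B', 'F']:
#       tree.insert(key)
#   str(tree)   # -> '( D B F )'
# a leaf prints as its bare key and a missing child prints as '0', so
# inserting only 'D' and 'B' gives '( D B 0 )'.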
test1 = range(0, 100, 10)
test2 = [31, 41, 59, 26, 53, 58, 97, 93, 23]
test3 = "algorithms"
def printsizes(node):
if node is None:
print "node is nil"
else:
print "node", node.key, "has a subtree of size", node.size
def test(args=None, BSTtype=BST):
import random, sys
random.seed(19920206)
if not args:
args = sys.argv[1:]
if not args:
print 'usage: %s <number-of-random-items | item item item ...>' % sys.argv[0]
sys.exit()
elif len(args) == 1:
items = (random.randrange(100) for i in xrange(int(args[0])))
else:
items = [int(i) for i in args]
tree = BSTtype()
source = []
for item in items:
tree.insert(item)
source += [str(item)]
print ' '.join(source)
print tree
def generate():
import random, sys
random.seed(19920206)
Lmin = 2 ** 2 - 1
Lmax = 2 ** 4 - 1
Xnum = 1000000
voc = 26
wfile = open('/home/thoma/Work/Dial-DRL/dataset/BST_1M.txt', 'w')
for id in xrange(Xnum):
tree = BST()
items = (random.randrange(voc) for i in
xrange(random.randint(Lmin, Lmax)))
source = []
for item in items:
item = chr(item + 65)
tree.insert(item)
source += [str(item)]
source = ' '.join(source)
target = str(tree)
line = '{0} -> {1}'.format(source, target)
wfile.write(line + '\n')
if id % 10000 == 0:
print id
def obtain_dataset():
rfile = open('/home/thoma/Work/Dial-DRL/dataset/BST_1M.txt', 'r')
line = rfile.readline()
word2idx = dict()
word2idx['<eol>'] = 0
word2idx['<unk>'] = 1
pairs = []
at = 2
lines = 0
while line:
lines += 1
line = line.strip()
source, target = line.split('->')
source = source.split()
target = target.split()
for w in source:
if w not in word2idx:
word2idx[w] = at
at += 1
for w in target:
if w not in word2idx:
word2idx[w] = at
at += 1
pairs.append((source, target))
if lines % 20000 == 0:
print lines
line = rfile.readline()
idx2word = dict()
for v, k in word2idx.items():
idx2word[k] = v
Lmax = len(idx2word)
print 'read dataset ok.'
print Lmax
for i in xrange(Lmax):
print idx2word[i]
def build_data(data):
instance = dict(text=[], summary=[], source=[], target=[], target_c=[])
for pair in data:
source, target = pair
A = [word2idx[w] for w in source]
B = [word2idx[w] for w in target]
# C = np.asarray([[w == l for w in source] for l in target], dtype='float32')
C = [0 if w not in source else source.index(w) + Lmax for w in target]
instance['text'] += [source]
instance['summary'] += [target]
instance['source'] += [A]
instance['target'] += [B]
# instance['cc_matrix'] += [C]
instance['target_c'] += [C]
print instance['target'][5000]
print instance['target_c'][5000]
return instance
train_set = build_data(pairs[100000:])
test_set = build_data(pairs[:100000])
serialize_to_file([train_set, test_set, idx2word, word2idx],
'/home/thoma/Work/Dial-DRL/dataset/BST_1M.data.pkl')
if __name__ == '__main__':
generate()
obtain_dataset()
================================================
FILE: experiments/bst_vest.py
================================================
# coding=utf-8
"""
This is the implementation of Copy-NET
We start from the basic Seq2seq framework for an auto-encoder.
"""
import logging
import time
import numpy as np
import sys
import copy
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from experiments.config import setup_lcsts, setup_weibo, setup_syn, setup_bst
from emolga.utils.generic_utils import *
from emolga.models.covc_encdec import NRM
from emolga.models.encdec import NRM as NRM0
from emolga.dataset.build_dataset import deserialize_from_file
from collections import OrderedDict
from fuel import datasets
from fuel import transformers
from fuel import schemes
# setup = setup_lcsts
# setup = setup_syn
setup = setup_bst
def init_logging(logfile):
formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
datefmt='%m/%d/%Y %H:%M:%S' )
fh = logging.FileHandler(logfile)
# ch = logging.StreamHandler()
fh.setFormatter(formatter)
# ch.setFormatter(formatter)
# fh.setLevel(logging.INFO)
# ch.setLevel(logging.INFO)
# logging.getLogger().addHandler(ch)
logging.getLogger().addHandler(fh)
logging.getLogger().setLevel(logging.INFO)
return logging
# prepare logging.
tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
config = setup() # load settings.
for w in config:
print '{0}={1}'.format(w, config[w])
logger = init_logging(config['path_log'] + '/experiments.CopyLCSTS.id={}.log'.format(tmark))
n_rng = np.random.RandomState(config['seed'])
np.random.seed(config['seed'])
rng = RandomStreams(n_rng.randint(2 ** 30))
logger.info('Start!')
train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset'])
if config['voc_size'] == -1: # do not use UNK
config['enc_voc_size'] = len(word2idx)
config['dec_voc_size'] = config['enc_voc_size']
else:
config['enc_voc_size'] = config['voc_size']
config['dec_voc_size'] = config['enc_voc_size']
samples = len(train_set['source'])
logger.info('build dataset done. dataset size: {0} || vocabulary size = {1} / batch size = {2}'.format(
samples, config['dec_voc_size'], config['batch_size']))
def build_data(data):
# create fuel dataset.
dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']),
('target', data['target']),
('target_c', data['target_c']),
]))
dataset.example_iteration_scheme \
= schemes.ShuffledExampleScheme(dataset.num_examples)
return dataset
train_data = build_data(train_set)
train_data_plain = zip(*(train_set['source'], train_set['target']))
test_data_plain = zip(*(test_set['source'], test_set['target']))
# train_data_plain = zip(*(train_set['source'], train_set['target']))
# test_data_plain = zip(*(test_set['source'], test_set['target']))
train_size = len(train_data_plain)
test_size = len(test_data_plain)
tr_idx = n_rng.permutation(train_size)[:2000].tolist()
ts_idx = n_rng.permutation(test_size )[:2000].tolist()
logger.info('load the data ok.')
notrain = False
# build the agent
if config['copynet']:
agent = NRM(config, n_rng, rng, mode=config['mode'],
use_attention=True, copynet=config['copynet'], identity=config['identity'])
else:
agent = NRM0(config, n_rng, rng, mode=config['mode'],
use_attention=True, copynet=config['copynet'], identity=config['identity'])
agent.build_()
if notrain:
agent.compile_('display')
else:
agent.compile_('all')
print 'compile ok.'
# load the model
# agent.load(config['path_h5'] +
# '/experiments.Copy{2}.id={0}.epoch={1}.pkl'.format('20160229-105153', 1, config['modelname']))
echo = 0
epochs = 10
skip = -1 # 25000
if echo > 0:
tmark = '20160229-105153' # '20160227-013418' # copynet multi-source model
agent.load(config['path_h5'] + '/experiments.Copy{2}.id={0}.epoch={1}.pkl'.format(tmark, echo, config['modelname']))
while echo < epochs:
echo += 1
loss = []
def output_stream(dataset, batch_size, size=1):
data_stream = dataset.get_example_stream()
data_stream = transformers.Batch(data_stream,
iteration_scheme=schemes.ConstantScheme(batch_size))
# add padding and masks to the dataset
data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target'))
return data_stream
def prepare_batch(batch, mask, fix_len=None):
data = batch[mask].astype('int32')
data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1)
def cut_zeros(data, fix_len=None):
if fix_len is not None:
return data[:, : fix_len]
for k in range(data.shape[1] - 1, 0, -1):
data_col = data[:, k].sum()
if data_col > 0:
return data[:, : k + 2]
return data
data = cut_zeros(data, fix_len)
return data
def cc_matrix(source, target):
cc = np.zeros((source.shape[0], target.shape[1], source.shape[1]), dtype='float32')
for k in xrange(source.shape[0]):
for j in xrange(target.shape[1]):
for i in xrange(source.shape[1]):
if (source[k, i] == target[k, j]) and (source[k, i] > 0):
cc[k][j][i] = 1.
return cc
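# Worked example for cc_matrix (illustration only, not executed here): with
#   source = [[4, 7, 5]] and target = [[7, 9]]
# the result has shape (1, 2, 3) and its only 1.0 is at cc[0][0][1], since
# target position 0 (token 7) matches source position 1; token 9 never
# appears in the source, so its row stays all zeros.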
def unk_filter(data):
if config['voc_size'] == -1:
return copy.copy(data)
else:
mask = (np.less(data, config['voc_size'])).astype(dtype='int32')
data = copy.copy(data * mask + (1 - mask))
return data
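# Example (assuming config['voc_size'] == 5): unk_filter maps
#   [[2, 7, 4]] -> [[2, 1, 4]]
# every id >= voc_size is replaced by 1, the UNK token id.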
# training
if not notrain:
train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True)
logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo))
progbar = Progbar(train_size / config['batch_size'])
for it, batch in enumerate(train_batches):
# skip some iterations
if echo == 1 and it < skip:
continue
# obtain data
data_s = prepare_batch(batch, 'source')
data_t = prepare_batch(batch, 'target')
if config['copynet']:
data_c = cc_matrix(data_s, data_t)
# data_c = prepare_batch(batch, 'target_c', data_t.shape[1])
loss += [agent.train_(unk_filter(data_s), unk_filter(data_t), data_c)]
else:
loss += [agent.train_(unk_filter(data_s), unk_filter(data_t))]
progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])])
if it % 200 == 0:
logger.info('Echo={} Evaluation Sampling.'.format(it))
logger.info('generating [training set] samples')
for _ in xrange(5):
idx = int(np.floor(n_rng.rand() * train_size))
train_s, train_t = train_data_plain[idx]
v = agent.evaluate_(np.asarray(train_s, dtype='int32'),
np.asarray(train_t, dtype='int32'),
idx2word,
np.asarray(unk_filter(train_s), dtype='int32'))
print '*' * 50
logger.info('generating [testing set] samples')
for _ in xrange(5):
idx = int(np.floor(n_rng.rand() * test_size))
test_s, test_t = test_data_plain[idx]
v = agent.evaluate_(np.asarray(test_s, dtype='int32'),
np.asarray(test_t, dtype='int32'),
idx2word,
np.asarray(unk_filter(test_s), dtype='int32'))
print '*' * 50
# save the weights.
if it % 5000 == 0:
agent.save(config['path_h5'] + '/experiments.Copy{2}.id={0}.epoch={1}.pkl'.format(tmark, echo, config['modelname']))
if (it % 5000 == 0) and it > 0:
print 'testing accuracy !!'
def analysis_(data_plain, t_idx, mode='Training'):
progbar_tr = Progbar(2000)
print '\n' + '__' * 50
cpy, cpy_pos = 0, 0
for it, idx in enumerate(t_idx):
train_s, train_t = data_plain[idx]
c = float(agent.analyse_(np.asarray(train_s, dtype='int32'),
np.asarray(train_t, dtype='int32'),
idx2word))
# copy mode
cpy += 1
cpy_pos += c
progbar_tr.update(it + 1, [('Copy', cpy_pos)])
logger.info('\n{0} Accuracy:\t{1}/{2} = {3}%'.format(mode, cpy_pos, cpy, 100 * cpy_pos / float(cpy)))
print '==' * 50
# analysis_(train_data_plain, tr_idx, 'Training')
analysis_(test_data_plain, ts_idx, 'Testing')
================================================
FILE: experiments/config.py
================================================
__author__ = 'jiataogu'
import os
import os.path as path
def setup():
config = dict()
# config['seed'] = 3030029828
config['seed'] = 19920206
config['use_noise'] = False
config['optimizer'] = 'adam'
config['save_updates'] = True
config['get_instance'] = True
config['path'] = '/home/thoma/Work/Dial-DRL' # path.realpath(path.curdir) + '/'
config['dataset'] = config['path'] + '/dataset/bAbI/dataset-b.pkl'
config['voc'] = config['path'] + '/dataset/bAbI/voc-b.pkl'
# output log place
config['path_log'] = config['path'] + '/Logs'
if not os.path.exists(config['path_log']):
os.mkdir(config['path_log'])
# # output hdf5 file.
# config['weights_file'] = config['path'] + '/froslass/model-pool/'
# if not os.path.exists(config['weights_file']):
# os.mkdir(config['weights_file'])
# size
config['batch_size'] = 20
config['mode'] = 'RNN' # NTM
config['binary'] = False
# Encoder: Model
config['bidirectional'] = True
config['enc_use_contxt'] = False
config['enc_learn_nrm'] = True
config['enc_embedd_dim'] = 100 # 100
config['enc_hidden_dim'] = 150 # 180
config['enc_contxt_dim'] = 0
config['encoder'] = 'RNN'
config['pooling'] = False
# Decoder: dimension
config['dec_embedd_dim'] = 100 # 100
config['dec_hidden_dim'] = 150 # 180
config['dec_contxt_dim'] = config['enc_hidden_dim'] \
if not config['bidirectional'] \
else 2 * config['enc_hidden_dim']
# Decoder: CopyNet
config['copynet'] = True
config['identity'] = False
# Decoder: Model
config['shared_embed'] = False
config['use_input'] = True
config['bias_code'] = True
config['dec_use_contxt'] = True
config['deep_out'] = False
config['deep_out_activ'] = 'tanh' # maxout2
config['bigram_predict'] = True
config['context_predict'] = True
config['dropout'] = 0.0 # 5
config['leaky_predict'] = False
config['dec_readout_dim'] = config['dec_hidden_dim']
if config['dec_use_contxt']:
config['dec_readout_dim'] += config['dec_contxt_dim']
if config['bigram_predict']:
config['dec_readout_dim'] += config['dec_embedd_dim']
# Decoder: sampling
config['max_len'] = 27
config['sample_beam'] = 8
config['sample_stoch'] = False
config['sample_argmax'] = False
# Gradient Tracking !!!
config['gradient_check'] = True
config['gradient_noise'] = True
config['skip_size'] = 15
for w in config:
print '{0} => {1}'.format(w, config[w])
print 'setup ok.'
return config
def setup_syn():
config = dict()
config['seed'] = 3030029828
# config['seed'] = 19920206
# model ids
# voc_size 10000: 20160224-021106
# voc_size 5000 : 20160224-144747 / 20160224-162424 (discard UNK)
config['use_noise'] = False
config['optimizer'] = 'adam'
config['save_updates'] = True
config['get_instance'] = True
config['path'] = path.realpath(path.curdir)
config['path_h5'] = config['path'] + '/H5'
# config['dataset'] = config['path'] + '/dataset/lcsts_data-word-full.pkl'
config['dataset'] = config['path'] + '/dataset/synthetic_data_c.pkl'
config['modelname'] = 'syn'
# output log place
config['path_log'] = config['path'] + '/Logs'
config['path_logX'] = config['path'] + '/LogX'
if not os.path.exists(config['path_log']):
os.mkdir(config['path_log'])
if not os.path.exists(config['path_logX']):
os.mkdir(config['path_logX'])
# # output hdf5 file.
# config['weights_file'] = config['path'] + '/froslass/model-pool/'
# if not os.path.exists(config['weights_file']):
# os.mkdir(config['weights_file'])
# size
config['batch_size'] = 20
config['mode'] = 'RNN' # NTM
config['binary'] = False
config['voc_size'] = -1 # 20000
# Encoder: Model
config['bidirectional'] = True
config['enc_use_contxt'] = False
config['enc_learn_nrm'] = True
config['enc_embedd_dim'] = 150 # 100
config['enc_hidden_dim'] = 300 # 180
config['enc_contxt_dim'] = 0
config['encoder'] = 'RNN'
config['pooling'] = False
config['encode_max_len'] = 57
config['decode_unk'] = False
config['explicit_loc'] = True
# Decoder: dimension
config['dec_embedd_dim'] = 150 # 100
config['dec_hidden_dim'] = 300 # 180
config['dec_contxt_dim'] = config['enc_hidden_dim'] \
if not config['bidirectional'] \
else 2 * config['enc_hidden_dim']
if config['explicit_loc']:
config['dec_contxt_dim'] += config['encode_max_len']
# Decoder: CopyNet
config['copynet'] = True # False
config['identity'] = False
config['location_embed'] = True
config['coverage'] = True
config['copygate'] = False
# Decoder: Model
config['shared_embed'] = False
config['use_input'] = True
config['bias_code'] = True
config['dec_use_contxt'] = True
config['deep_out'] = False
config['deep_out_activ'] = 'tanh' # maxout2
config['bigram_predict'] = True
config['context_predict'] = True
config['dropout'] = 0.0 # 5
config['leaky_predict'] = False
config['dec_readout_dim'] = config['dec_hidden_dim']
if config['dec_use_contxt']:
config['dec_readout_dim'] += config['dec_contxt_dim']
if config['bigram_predict']:
config['dec_readout_dim'] += config['dec_embedd_dim']
# Decoder: sampling
config['max_len'] = 57
config['sample_beam'] = 10
config['sample_stoch'] = False
config['sample_argmax'] = False
# Gradient Tracking !!!
config['gradient_check'] = True
config['gradient_noise'] = True
config['skip_size'] = 15
for w in config:
print '{0} => {1}'.format(w, config[w])
print 'setup ok.'
return config
# config = dict()
# # config['seed'] = 3030029828
# config['seed'] = 19920206
#
# config['use_noise'] = False
# config['optimizer'] = 'adam'
# config['save_updates'] = True
# config['get_instance'] = True
# config['path'] = '/home/thoma/Work/Dial-DRL' # path.realpath(path.curdir) + '/'
# config['path_h5'] = config['path'] + '/H5'
# config['dataset'] = config['path'] + '/dataset/synthetic_data_b.pkl'
#
# # output log place
# config['path_log'] = config['path'] + 'Logs'
# if not os.path.exists(config['path_log']):
# os.mkdir(config['path_log'])
#
# # # output hdf5 file.
# # config['weights_file'] = config['path'] + '/froslass/model-pool/'
# # if not os.path.exists(config['weights_file']):
# # os.mkdir(config['weights_file'])
#
# # size
# config['batch_size'] = 20
# config['mode'] = 'RNN' # NTM
# config['binary'] = False
#
# # Encoder: Model
# config['bidirectional'] = True
# config['enc_use_contxt'] = False
# config['enc_learn_nrm'] = True
# config['enc_embedd_dim'] = 150 # 100
# config['enc_hidden_dim'] = 500 # 180
# config['enc_contxt_dim'] = 0
# config['encoder'] = 'RNN'
# config['pooling'] = False
#
# # Decoder: dimension
# config['dec_embedd_dim'] = 150 # 100
# config['dec_hidden_dim'] = 500 # 180
# config['dec_contxt_dim'] = config['enc_hidden_dim'] \
# if not config['bidirectional'] \
# else 2 * config['enc_hidden_dim']
#
# # Decoder: CopyNet
# config['copynet'] = True # False
# config['identity'] = False
# config['location_embed'] = True
#
# # Decoder: Model
# config['shared_embed'] = False
# config['use_input'] = True
# config['bias_code'] = True
# config['dec_use_contxt'] = True
# config['deep_out'] = False
# config['deep_out_activ'] = 'tanh' # maxout2
# config['bigram_predict'] = True
# config['context_predict'] = True
# config['dropout'] = 0.0 # 5
# config['leaky_predict'] = False
#
# config['dec_readout_dim'] = config['dec_hidden_dim']
# if config['dec_use_contxt']:
# config['dec_readout_dim'] += config['dec_contxt_dim']
# if config['bigram_predict']:
# config['dec_readout_dim'] += config['dec_embedd_dim']
#
# # Decoder: sampling
# config['max_len'] = 57
# config['sample_beam'] = 8
# config['sample_stoch'] = False
# config['sample_argmax'] = False
#
# # Gradient Tracking !!!
# config['gradient_check'] = True
# config['gradient_noise'] = True
#
# config['skip_size'] = 15
#
# for w in config:
# print '{0} => {1}'.format(w, config[w])
# print 'setup ok.'
# return config
def setup_bst():
config = dict()
config['seed'] = 3030029828
# config['seed'] = 19920206
# model ids
# voc_size 10000: 20160224-021106
# voc_size 5000 : 20160224-144747 / 20160224-162424 (discard UNK)
config['use_noise'] = False
config['optimizer'] = 'adam'
config['save_updates'] = True
config['get_instance'] = True
config['path'] = path.realpath(path.curdir)
config['path_h5'] = config['path'] + '/H5'
# config['dataset'] = config['path'] + '/dataset/lcsts_data-word-full.pkl'
config['dataset'] = config['path'] + '/dataset/BST_1M.data.pkl'
config['modelname'] = 'bst'
# output log place
config['path_log'] = config['path'] + '/Logs'
config['path_logX'] = config['path'] + '/LogX'
if not os.path.exists(config['path_log']):
os.mkdir(config['path_log'])
if not os.path.exists(config['path_logX']):
os.mkdir(config['path_logX'])
# # output hdf5 file.
# config['weights_file'] = config['path'] + '/froslass/model-pool/'
# if not os.path.exists(config['weights_file']):
# os.mkdir(config['weights_file'])
# size
config['batch_size'] = 20
config['mode'] = 'RNN' # NTM
config['binary'] = False
config['voc_size'] = -1 # 20000
# Encoder: Model
config['bidirectional'] = True
config['enc_use_contxt'] = False
config['enc_learn_nrm'] = True
config['enc_embedd_dim'] = 150 # 100
config['enc_hidden_dim'] = 300 # 180
config['enc_contxt_dim'] = 0
config['encoder'] = 'RNN'
config['pooling'] = False
config['decode_unk'] = False
# Decoder: dimension
config['dec_embedd_dim'] = 150 # 100
config['dec_hidden_dim'] = 300 # 180
config['dec_contxt_dim'] = config['enc_hidden_dim'] \
if not config['bidirectional'] \
else 2 * config['enc_hidden_dim']
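# a bidirectional encoder concatenates forward and backward states,
# so the decoder sees a context of twice the encoder hidden size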
# Decoder: CopyNet
config['copynet'] = False # True # False
config['identity'] = False
config['location_embed'] = True
config['coverage'] = True
config['copygate'] = False
config['encourage_gen'] = 0.1 # lambda if 0 no encourage
# Decoder: Model
config['shared_embed'] = False
config['use_input'] = True
config['bias_code'] = True
config['dec_use_contxt'] = True
config['deep_out'] = False
config['deep_out_activ'] = 'tanh' # maxout2
config['bigram_predict'] = True
config['context_predict'] = True
config['dropout'] = 0.0 # 5
config['leaky_predict'] = False
config['dec_readout_dim'] = config['dec_hidden_dim']
if config['dec_use_contxt']:
config['dec_readout_dim'] += config['dec_contxt_dim']
if config['bigram_predict']:
config['dec_readout_dim'] += config['dec_embedd_dim']
# Decoder: sampling
config['max_len'] = 100
config['sample_beam'] = 10
config['sample_stoch'] = False
config['sample_argmax'] = False
# Gradient Tracking !!!
config['gradient_check'] = True
config['gradient_noise'] = True
config['skip_size'] = 15
for w in config:
print '{0} => {1}'.format(w, config[w])
print 'setup ok.'
return config
def setup_lcsts():
config = dict()
config['seed'] = 3030029828
# config['seed'] = 19920206
# model ids
# voc_size 10000: 20160224-021106
# voc_size 5000 : 20160224-144747 / 20160224-162424 (discard UNK)
config['use_noise'] = False
config['optimizer'] = 'adam'
config['save_updates'] = True
config['get_instance'] = True
config['path'] = path.realpath(path.curdir)
config['path_h5'] = config['path'] + '/H5'
config['dataset'] = config['path'] + '/dataset/lcsts_data-word-full.pkl'
# config['dataset'] = config['path'] + '/dataset/lcsts_data-word.pkl'
config['modelname'] = 'LCSTS'
config['segment'] = True
# output log place
config['path_log'] = config['path'] + '/Logs'
config['path_logX'] = config['path'] + '/LogX'
config['path_model'] = config['path'] + '/H5'
if not os.path.exists(config['path_log']):
os.mkdir(config['path_log'])
if not os.path.exists(config['path_logX']):
os.mkdir(config['path_logX'])
# # output hdf5 file.
# config['weights_file'] = config['path'] + '/froslass/model-pool/'
# if not os.path.exists(config['weights_file']):
# os.mkdir(config['weights_file'])
# size
config['batch_size'] = 20
config['mode'] = 'RNN' # NTM
config['binary'] = False
config['voc_size'] = 20000 # 20000
# # based on characters (modified)
# config['segment'] = False
# config['dataset'] = config['path'] + '/dataset/lcsts_data-char-full.pkl'
# config['modelname'] = 'LCSTS-CCC'
# config['voc_size'] = 3000
# trained_model
# config['trained_model'] = config['path_model'] + '/experiments.CopyLCSTSXXX.id=20160305-004957.epoch=1.iter=20000.pkl'
# config['trained_model'] = config['path_model'] + '/experiments.CopyLCSTSXXX.id=20160301-105813.epoch=2.iter=80000.pkl'
config['trained_model'] = config['path_model'] + '/experiments.CopyLCSTSXXX.id=20160301-114653.epoch=2.iter=100000.pkl'
# Encoder: Model
config['bidirectional'] = True
config['enc_use_contxt'] = False
config['enc_learn_nrm'] = True
config['enc_embedd_dim'] = 500 # 100
config['enc_hidden_dim'] = 750 # 180
config['enc_contxt_dim'] = 0
config['encoder'] = 'RNN'
config['pooling'] = False
config['encode_max_len'] = 140
config['decode_unk'] = False
# Decoder: sample
config['max_len'] = 33
config['sample_beam'] = 30 # 10
config['sample_stoch'] = False
config['sample_argmax'] = False
# Decoder: train
config['dec_embedd_dim'] = 500 # 100
config['dec_hidden_dim'] = 750 # 180
config['dec_contxt_dim'] = config['enc_hidden_dim'] \
if not config['bidirectional'] \
else 2 * config['enc_hidden_dim']
config['explicit_loc'] = False
if config['explicit_loc']:
config['dec_contxt_dim'] += config['encode_max_len']
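# when enabled, explicit location features widen the decoder context by
# one dimension per source position (encode_max_len in total)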
# Decoder: CopyNet
config['copynet'] = True # False
config['identity'] = False
config['location_embed'] = True
config['coverage'] = True
config['copygate'] = False
# Decoder: Model
config['shared_embed'] = False
config['use_input'] = True
config['bias_code'] = True
config['dec_use_contxt'] = True
config['deep_out'] = False
config['deep_out_activ'] = 'tanh' # maxout2
config['bigram_predict'] = True
config['context_predict'] = True
config['dropout'] = 0.0 # 5
config['leaky_predict'] = False
config['dec_readout_dim'] = config['dec_hidden_dim']
if config['dec_use_contxt']:
config['dec_readout_dim'] += config['dec_contxt_dim']
if config['bigram_predict']:
config['dec_readout_dim'] += config['dec_embedd_dim']
# Gradient Tracking !!!
config['gradient_check'] = True
config['gradient_noise'] = True
config['skip_size'] = 15
for w in config:
print '{0} => {1}'.format(w, config[w])
print 'setup ok.'
return config
def setup_weibo():
config = dict()
config['seed'] = 3030029828
# config['seed'] = 19920206
# model ids
config['use_noise'] = False
config['optimizer'] = 'adam'
config['save_updates'] = True
config['get_instance'] = True
config['path'] = path.realpath(path.curdir)
config['path_h5'] = config['path'] + '/H5'
# config['dataset'] = config['path'] + '/dataset/lcsts_data-word-full.pkl'
# config['dataset'] = config['path'] + '/dataset/weibo_data-word-cooc.pkl'
config['dataset'] = config['path'] + '/dataset/movie_dialogue_data.pkl'
# output log place
config['path_log'] = config['path'] + '/Logs'
config['path_logX'] = config['path'] + '/LogX'
if not os.path.exists(config['path_log']):
os.mkdir(config['path_log'])
if not os.path.exists(config['path_logX']):
os.mkdir(config['path_logX'])
# # output hdf5 file.
# config['weights_file'] = config['path'] + '/froslass/model-pool/'
# if not os.path.exists(config['weights_file']):
# os.mkdir(config['weights_file'])
# size
config['batch_size'] = 20
config['mode'] = 'RNN' # NTM
config['binary'] = False
config['voc_size'] = 10000 # 30000
# Encoder: Model
config['bidirectional'] = True
config['enc_use_contxt'] = False
config['enc_learn_nrm'] = True
config['enc_embedd_dim'] = 350 # 100
config['enc_hidden_dim'] = 500 # 180
config['enc_contxt_dim'] = 0
config['encoder'] = 'RNN'
config['pooling'] = False
config['decode_unk'] = False
config['utf-8'] = False
# Decoder: dimension
config['dec_embedd_dim'] = 350 # 100
config['dec_hidden_dim'] = 500 # 180
config['dec_contxt_dim'] = config['enc_hidden_dim'] \
if not config['bidirectional'] \
else 2 * config['enc_hidden_dim']
# Decoder: CopyNet
config['copynet'] = True # False # False
config['identity'] = False
config['location_embed'] = True
config['coverage'] = True
config['copygate'] = True
config['killcopy'] = False
# Decoder: Model
config['shared_embed'] = False
config['use_input'] = True
config['bias_code'] = True
config['dec_use_contxt'] = True
config['deep_out'] = False
config['deep_out_activ'] = 'tanh' # maxout2
config['bigram_predict'] = True
config['context_predict'] = True
config['dropout'] = 0.0 # 5
config['leaky_predict'] = False
config['dec_readout_dim'] = config['dec_hidden_dim']
if config['dec_use_contxt']:
config['dec_readout_dim'] += config['dec_contxt_dim']
if config['bigram_predict']:
config['dec_readout_dim'] += config['dec_embedd_dim']
# Decoder: sampling
config['max_len'] = 50
config['sample_beam'] = 10
config['sample_stoch'] = False
config['sample_argmax'] = False
# Gradient Tracking !!!
config['gradient_check'] = True
config['gradient_noise'] = True
config['skip_size'] = 15
conc = sorted(config.items(), key=lambda c: c[0])
for c, v in conc:
print '{0} => {1}'.format(c, v)
print 'setup ok.'
return config
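# Note: each experiment script binds one of the setup_* functions above,
# e.g. `setup = setup_lcsts` followed by `config = setup()`
# (see experiments/copynet_input.py and experiments/lcsts_vest.py).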
================================================
FILE: experiments/copynet.py
================================================
"""
This is the implementation of Copy-NET
We start from the basic Seq2seq framework for an auto-encoder.
"""
import logging
import time
import numpy as np
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from experiments.config import setup
from emolga.utils.generic_utils import *
from emolga.models.encdec import *
from emolga.dataset.build_dataset import deserialize_from_file
from collections import OrderedDict
from fuel import datasets
from fuel import transformers
from fuel import schemes
def init_logging(logfile):
formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
datefmt='%m/%d/%Y %H:%M:%S' )
fh = logging.FileHandler(logfile)
# ch = logging.StreamHandler()
fh.setFormatter(formatter)
# ch.setFormatter(formatter)
# fh.setLevel(logging.INFO)
# ch.setLevel(logging.INFO)
# logging.getLogger().addHandler(ch)
logging.getLogger().addHandler(fh)
logging.getLogger().setLevel(logging.INFO)
return logging
# prepare logging.
tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
config = setup() # load settings.
for w in config:
print '{0}={1}'.format(w, config[w])
logger = init_logging(config['path_log'] + '/experiments.Copy.id={}.log'.format(tmark))
n_rng = np.random.RandomState(config['seed'])
np.random.seed(config['seed'])
rng = RandomStreams(n_rng.randint(2 ** 30))
logger.info('Start!')
idx2word, word2idx, idx2word_o, word2idx_o \
= deserialize_from_file(config['voc'])
idx2word_o[0] = ''
word2idx_o[''] = 0
source, target, origin = deserialize_from_file(config['dataset'])
samples = len(source)
config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1
config['dec_voc_size'] = config['enc_voc_size']
logger.info('build dataset done. ' +
'dataset size: {} ||'.format(samples) +
'vocabulary size = {0}/ batch size = {1}'.format(
config['dec_voc_size'], config['batch_size']))
def build_data(source, target):
# create fuel dataset.
dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', source), ('target', target)]))
dataset.example_iteration_scheme \
= schemes.ShuffledExampleScheme(dataset.num_examples)
return dataset, len(source)
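# hold out the first 20% of the corpus for testing; train on the rest.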
train_data, train_size = build_data(source[int(0.2 * samples):], target[int(0.2 * samples):])
train_data_plain = zip(*(source[int(0.2 * samples):], target[int(0.2 * samples):], origin[int(0.2 * samples):]))
test_data_plain = zip(*(source[:int(0.2 * samples)], target[:int(0.2 * samples)], origin[:int(0.2 * samples)]))
test_size = len(test_data_plain)
logger.info('load the data ok.')
# build the agent
agent = NRM(config, n_rng, rng, mode=config['mode'],
use_attention=True, copynet=config['copynet'], identity=config['identity'])
agent.build_()
agent.compile_('all')
print 'compile ok.'
echo = 0
epochs = 10
while echo < epochs:
echo += 1
loss = []
def output_stream(dataset, batch_size, size=1):
data_stream = dataset.get_example_stream()
data_stream = transformers.Batch(data_stream,
iteration_scheme=schemes.ConstantScheme(batch_size))
# add padding and masks to the dataset
data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target'))
return data_stream
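# pad each row with one extra zero column as an end-of-sequence marker,
# then trim trailing all-zero columns (keeping one) so batches stay narrow.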
def prepare_batch(batch, mask):
data = batch[mask].astype('int32')
data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1)
def cut_zeros(data):
for k in range(data.shape[1] - 1, 0, -1):
data_col = data[:, k].sum()
if data_col > 0:
return data[:, : k + 2]
return data
data = cut_zeros(data)
return data
# training
train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True)
logger.info('Epoch = {} -> Training Set Learning...'.format(echo))
progbar = Progbar(train_size / config['batch_size'])
for it, batch in enumerate(train_batches):
# obtain data
data_s, data_t = prepare_batch(batch, 'source'), prepare_batch(batch, 'target')
loss += [agent.train_(data_s, data_t)]
progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])])
if it % 500 == 0:
logger.info('generating [training set] samples')
for _ in xrange(5):
idx = int(np.floor(n_rng.rand() * train_size))
train_s, train_t, train_o = train_data_plain[idx]
v = agent.evaluate_(np.asarray(train_s, dtype='int32'),
np.asarray(train_t, dtype='int32'),
idx2word, np.asarray(train_o, dtype='int32'), idx2word_o)
print '*' * 50
logger.info('generating [testing set] samples')
for _ in xrange(5):
idx = int(np.floor(n_rng.rand() * test_size))
test_s, test_t, test_o = test_data_plain[idx]
v = agent.evaluate_(np.asarray(test_s, dtype='int32'),
np.asarray(test_t, dtype='int32'),
idx2word, np.asarray(test_o, dtype='int32'), idx2word_o)
print '*' * 50
================================================
FILE: experiments/copynet_input.py
================================================
# coding=utf-8
import logging
import time
import numpy as np
import sys
import copy
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from experiments.config import setup_lcsts
from emolga.utils.generic_utils import *
from emolga.models.covc_encdec import NRM
from emolga.models.encdec import NRM as NRM0
from emolga.dataset.build_dataset import deserialize_from_file
from collections import OrderedDict
from fuel import datasets
from fuel import transformers
from fuel import schemes
setup = setup_lcsts
def init_logging(logfile):
formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
datefmt='%m/%d/%Y %H:%M:%S' )
fh = logging.FileHandler(logfile)
# ch = logging.StreamHandler()
fh.setFormatter(formatter)
# ch.setFormatter(formatter)
# fh.setLevel(logging.INFO)
# ch.setLevel(logging.INFO)
# logging.getLogger().addHandler(ch)
logging.getLogger().addHandler(fh)
logging.getLogger().setLevel(logging.INFO)
return logging
# prepare logging.
tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
config = setup() # load settings.
for w in config:
print '{0}={1}'.format(w, config[w])
logger = init_logging(config['path_log'] + '/experiments.CopyLCSTS.id={}.log'.format(tmark))
n_rng = np.random.RandomState(config['seed'])
np.random.seed(config['seed'])
rng = RandomStreams(n_rng.randint(2 ** 30))
logger.info('Start!')
train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset'])
if config['voc_size'] == -1: # -1: no UNK, use the full vocabulary
config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1
config['dec_voc_size'] = config['enc_voc_size']
else:
config['enc_voc_size'] = config['voc_size']
config['dec_voc_size'] = config['enc_voc_size']
samples = len(train_set['source'])
logger.info('build dataset done. ' +
'dataset size: {} ||'.format(samples) +
'vocabulary size = {0}/ batch size = {1}'.format(
config['dec_voc_size'], config['batch_size']))
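# map word ids at or beyond voc_size to 1 (the <unk> id); identity when
# voc_size == -1. e.g. voc_size=5 turns [2, 7, 4] into [2, 1, 4].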
def unk_filter(data):
if config['voc_size'] == -1:
return copy.copy(data)
else:
mask = (np.less(data, config['voc_size'])).astype(dtype='int32')
data = copy.copy(data * mask + (1 - mask))
return data
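# source (gloss): "As year-end approaches, new-fund issuance enters its peak
# season; weekly launches have stayed above 35 since November, and on Nov. 25
# alone 12 funds went on sale. The first domestic public-offering hedged hybrid
# fund, the Harvest Absolute Return Strategy fund, has been sought after since
# its first day, with daily subscriptions reaching over 100 million."
# target (gloss): "First public hedged fund attracts over 100 million a day."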
source = '临近 岁末 , 新 基金 发行 步入 旺季 , 11 月份 以来 单周 新基 ' + \
'发行 数 始终保持 35 只 以上 的 高位 , 仅 11 月 25 日 一天 , ' + \
'就 有 12 只 基金 同时 发售 。 国内 首只 公募 对冲 混合型 基金 — 嘉实 绝对 收益 策略 ' + \
'定期 混合 基金 自 发行 首日 便 备受 各界 青睐 , 每日 认购 均 能 达到 上 亿'
target = '首只 公募 对冲 基金 每日 吸金 上 亿'
test_s = [word2idx[w.decode('utf-8')] for w in source.split()]
test_t = [word2idx[w.decode('utf-8')] for w in target.split()]
logger.info('load the data ok.')
logger.info('Evaluate CopyNet')
echo = 9
tmark = '20160226-164053' # '20160221-025049' # copy-net model [no unk]
config['copynet'] = True
agent = NRM(config, n_rng, rng, mode=config['mode'],
use_attention=True, copynet=config['copynet'], identity=config['identity'])
agent.build_()
agent.compile_('display')
agent.load(config['path_h5'] + '/experiments.CopyLCSTS.id={0}.epoch={1}.pkl'.format(tmark, echo))
logger.info('generating [testing set] samples')
v = agent.evaluate_(np.asarray(test_s, dtype='int32'),
np.asarray(test_t, dtype='int32'),
idx2word, np.asarray(unk_filter(test_s), dtype='int32'))
logger.info('Complete!')
================================================
FILE: experiments/dataset.py
================================================
"""
Preprocess the bAbI dataset.
"""
import logging
import os
import sys
import numpy.random as n_rng
from emolga.dataset.build_dataset import serialize_to_file
data_path = './dataset/bAbI/en-10k/'
data = []
n_rng.seed(19920206)
for p, folders, docs in os.walk(data_path):
for doc in docs:
with open(os.path.join(p, doc)) as f:
l = f.readline()
while l:
l = l.strip().lower()
l = l[l.find(' ') + 1:]
if len(l.split('\t')) == 1:
data += [l[:-1].split()]
l = f.readline()
idx2word = dict(enumerate(set([w for l in data for w in l]), 1))
word2idx = {v: k for k, v in idx2word.items()}
persons = [1, 6, 24, 37, 38, 47, 60, 61, 73, 74, 90, 94, 107, 110, 114]
colors = [3, 20, 34, 48, 99, 121]
shapes = [11, 15, 27, 99]
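# vocabulary ids of entity words (persons, colors, shapes); these are
# randomly repeated below and later replaced by placeholder tokens that
# the model must learn to copy.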
def repeat_name(l):
ll = []
for word in l:
if word2idx[word] in persons:
k = n_rng.randint(5) + 1
ll += [idx2word[persons[i]] for i in n_rng.randint(len(persons), size=k).tolist()]
elif word2idx[word] in colors:
k = n_rng.randint(5) + 1
ll += [idx2word[colors[i]] for i in n_rng.randint(len(colors), size=k).tolist()]
elif word2idx[word] in shapes:
k = n_rng.randint(5) + 1
ll += [idx2word[shapes[i]] for i in n_rng.randint(len(shapes), size=k).tolist()]
else:
ll += [word]
return ll
data_rep = [repeat_name(l) for l in data]
origin = [[word2idx[w] for w in l] for l in data_rep]
def replace(word):
# mask entity words with typed placeholder tokens
if word2idx[word] in persons:
return '<person>'
elif word2idx[word] in colors:
return '<color>'
elif word2idx[word] in shapes:
return '<shape>'
else:
return word
# prepare the vocabulary
data_clean = [[replace(w) for w in l] for l in data_rep]
idx2word2 = dict(enumerate(set([w for l in data_clean for w in l]), 1))
idx2word2[0] = ''
word2idx2 = {v: k for k, v in idx2word2.items()}
Lmax = len(idx2word2)
for k in xrange(len(idx2word2)):
print k, '\t', idx2word2[k]
print 'Max: {}'.format(Lmax)
serialize_to_file([idx2word2, word2idx2, idx2word, word2idx], './dataset/bAbI/voc-b.pkl')
# get ready for the dataset.
source = [[word2idx2[w] for w in l] for l in data_clean]
target = [[word2idx2[w] if w not in ['<person>', '<color>', '<shape>']
else it + Lmax
for it, w in enumerate(l)] for l in data_clean]
def print_str(data):
for d in data:
print ' '.join(str(w) for w in d)
print_str(data[10000: 10005])
print_str(data_rep[10000: 10005])
print_str(data_clean[10000: 10005])
print_str(source[10000: 10005])
print_str(target[10000: 10005])
serialize_to_file([source, target, origin], './dataset/bAbI/dataset-b.pkl')
================================================
FILE: experiments/lcsts_dataset.py
================================================
# coding=utf-8
import chardet
import sys
import numpy as np
import jieba as jb
from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file
word2idx = dict()
wordfreq = dict()
word2idx['<eol>'] = 0 # padding / end-of-sequence id
word2idx['<unk>'] = 1 # OOV id (the experiment scripts map out-of-vocabulary words here)
segment = False # True
# training set
pairs = []
f = open('./dataset/LCSTS/PART_I/PART_full.txt', 'r')
line = f.readline().strip()
at = 2
lines = 0
while line:
if line == '<summary>':
summary = f.readline().strip().decode('utf-8')
if segment:
summary = [w for w in jb.cut(summary)]
for w in summary:
if w not in wordfreq:
wordfreq[w] = 1
else:
wordfreq[w] += 1
# if w not in word2idx:
# word2idx[w] = at
# at += 1
f.readline() # skip '</summary>'
f.readline() # skip '<short_text>'
text = f.readline().strip().decode('utf-8')
if segment:
text = [w for w in jb.cut(text)]
for w in text:
if w not in wordfreq:
wordfreq[w] = 1
else:
wordfreq[w] += 1
# if w not in word2idx:
# word2idx[w] = at
# at += 1
pair = (text, summary)
pairs.append(pair)
lines += 1
if lines % 20000 == 0:
print lines
line = f.readline().strip()
# testing set
tests = []
f = open('./dataset/LCSTS/PART_II/PART_II.txt', 'r')
line = f.readline().strip()
lines = 0
while line:
if line == '<summary>':
summary = f.readline().strip().decode('utf-8')
if segment:
summary = [w for w in jb.cut(summary)]
for w in summary:
if w not in wordfreq:
wordfreq[w] = 1
else:
wordfreq[w] += 1
# if w not in word2idx:
# word2idx[w] = at
# at += 1
f.readline() # skip '</summary>'
f.readline() # skip '<short_text>'
text = f.readline().strip().decode('utf-8')
if segment:
text = [w for w in jb.cut(text)]
for w in text:
if w not in wordfreq:
wordfreq[w] = 1
else:
wordfreq[w] += 1
# if w not in word2idx:
# word2idx[w] = at
# at += 1
pair = (text, summary)
tests.append(pair)
lines += 1
if lines % 20000 == 0:
print lines
line = f.readline().strip()
print len(pairs), len(tests)
# sort the vocabulary
wordfreq = sorted(wordfreq.items(), key=lambda a: a[1], reverse=True)
for w in wordfreq:
word2idx[w[0]] = at
at += 1
idx2word = {k: v for v, k in word2idx.items()}
Lmax = len(idx2word)
print 'read dataset ok.'
print Lmax
for i in xrange(Lmax):
print idx2word[i].encode('utf-8')
# segment = False -> character-based model; segment = True -> word-based model.
def build_data(data):
instance = dict(text=[], summary=[], source=[], target=[], target_c=[])
for pair in data:
source, target = pair
A = [word2idx[w] for w in source]
B = [word2idx[w] for w in target]
# C = np.asarray([[w == l for w in source] for l in target], dtype='float32')
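# copy target: 0 if the word cannot be copied from the source, otherwise
# its source position offset by Lmax so copy indices never collide with
# ordinary vocabulary ids.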
C = [0 if w not in source else source.index(w) + Lmax for w in target]
instance['text'] += [source]
instance['summary'] += [target]
instance['source'] += [A]
instance['target'] += [B]
# instance['cc_matrix'] += [C]
instance['target_c'] += [C]
print instance['target'][5000]
print instance['target_c'][5000]
return instance
train_set = build_data(pairs)
test_set = build_data(tests)
serialize_to_file([train_set, test_set, idx2word, word2idx], './dataset/lcsts_data-char-full.pkl')
================================================
FILE: experiments/lcsts_rouge.py
================================================
"""
Evaluation using ROUGE for LCSTS dataset.
"""
# load the testing set.
from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file
import jieba as jb
import logging
import copy
from pyrouge import Rouge155
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from experiments.config import setup_lcsts, setup_weibo, setup_syn
from emolga.utils.generic_utils import *
from emolga.models.covc_encdec import NRM
from emolga.models.encdec import NRM as NRM0
from emolga.dataset.build_dataset import deserialize_from_file
from collections import OrderedDict
from fuel import datasets
from fuel import transformers
from fuel import schemes
from pprint import pprint
setup = setup_lcsts
def build_evaluation(train_set, segment):
_, _, idx2word, word2idx = deserialize_from_file(train_set)
pairs = []
f = open('./dataset/LCSTS/PART_III/PART_III.txt', 'r')
line = f.readline().strip()
lines = 0
while line:
if '<human_label>' in line:
score = int(line[13]) # the score digit right after '<human_label>'
if score >= 3:
f.readline() # skip '<summary>'
summary = f.readline().strip().decode('utf-8')
if segment:
summary = [w for w in jb.cut(summary)]
target = []
for w in summary:
if w not in word2idx:
word2idx[w] = len(word2idx)
idx2word[len(idx2word)] = w
target += [word2idx[w]]
f.readline() # skip '</summary>'
f.readline() # skip '<short_text>'
text = f.readline().strip().decode('utf-8')
if segment:
text = [w for w in jb.cut(text)]
source = []
for w in text:
if w not in word2idx:
word2idx[w] = len(word2idx)
idx2word[len(idx2word)] = w
source += [word2idx[w]]
pair = (text, summary, score, source, target)
pairs.append(pair)
lines += 1
if lines % 1000 == 0:
print lines
line = f.readline().strip()
print 'lines={}'.format(len(pairs))
return pairs, word2idx, idx2word
# words, wwi, wiw = build_evaluation('./dataset/lcsts_data-word-full.pkl', True)
# chars, cwi, ciw = build_evaluation('./dataset/lcsts_data-char-full.pkl', False)
#
# serialize_to_file([words, chars, [wwi, wiw], [cwi, ciw]], './dataset/lcsts_evaluate_data.pkl')
def init_logging(logfile):
formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
datefmt='%m/%d/%Y %H:%M:%S' )
fh = logging.FileHandler(logfile)
# ch = logging.StreamHandler()
fh.setFormatter(formatter)
# ch.setFormatter(formatter)
# fh.setLevel(logging.INFO)
# ch.setLevel(logging.INFO)
# logging.getLogger().addHandler(ch)
logging.getLogger().addHandler(fh)
logging.getLogger().setLevel(logging.INFO)
return logging
# prepare logging.
config = setup() # load settings.
for w in config:
print '{0}={1}'.format(w, config[w])
tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
logger = init_logging(config['path_log'] + '/experiments.LCSTS.Eval.id={}.log'.format(tmark))
n_rng = np.random.RandomState(config['seed'])
np.random.seed(config['seed'])
rng = RandomStreams(n_rng.randint(2 ** 30))
logger.info('Start!')
segment = config['segment']
word_set, char_set, word_voc, char_voc = deserialize_from_file('./dataset/lcsts_evaluate_data.pkl')
if segment:
eval_set = word_set
word2idx, idx2word = word_voc
else:
eval_set = char_set
word2idx, idx2word = char_voc
if config['voc_size'] == -1: # -1: no UNK, use the full vocabulary
config['enc_voc_size'] = len(word2idx)
config['dec_voc_size'] = config['enc_voc_size']
else:
config['enc_voc_size'] = config['voc_size']
config['dec_voc_size'] = config['enc_voc_size']
samples = len(eval_set)
logger.info('build dataset done. ' +
'dataset size: {} ||'.format(samples) +
'vocabulary size = {0}/ batch size = {1}'.format(
config['dec_voc_size'], config['batch_size']))
logger.info('load the data ok.')
# build the agent
if config['copynet']:
agent = NRM(config, n_rng, rng, mode=config['mode'],
use_attention=True, copynet=config['copynet'], identity=config['identity'])
else:
agent = NRM0(config, n_rng, rng, mode=config['mode'],
use_attention=True, copynet=config['copynet'], identity=config['identity'])
agent.build_()
agent.compile_('display')
print 'compile ok.'
# load the model
agent.load(config['trained_model'])
def unk_filter(data):
if config['voc_size'] == -1:
return copy.copy(data)
else:
mask = (np.less(data, config['voc_size'])).astype(dtype='int32')
data = copy.copy(data * mask + (1 - mask))
return data
rouge = Rouge155(n_words=40)
evalsets = {'rouge_1_f_score': 'R1',
'rouge_2_f_score': 'R2',
'rouge_3_f_score': 'R3',
'rouge_4_f_score': 'R4',
'rouge_l_f_score': 'RL',
'rouge_su4_f_score': 'RSU4'}
scores = dict()
for id, sample in enumerate(eval_set):
text, summary, score, source, target = sample
v = agent.evaluate_(np.asarray(source, dtype='int32'),
np.asarray(target, dtype='int32'),
idx2word,
np.asarray(unk_filter(source), dtype='int32')).decode('utf-8').split('\n')
print 'ID = {} ||'.format(id) + '*' * 50
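# map reference and system summaries to character-id tokens ('t<id>') so
# ROUGE is computed over character ids rather than raw text.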
ref = ' '.join(['t{}'.format(char_voc[0][u]) for u in ''.join([w for w in v[2][9:].split()])])
sym = ' '.join(['t{}'.format(char_voc[0][u]) for u in ''.join([w for w in v[3][9:].split()])])
result = rouge.score_summary(sym, {'A': ref})
for si in result:
if si not in scores:
scores[si] = result[si]
else:
scores[si] += result[si]
for e in evalsets:
print '{0}: {1}'.format(evalsets[e], scores[e] / (id + 1)),
print './.'
# average
for si in scores:
scores[si] /= float(len(eval_set))
================================================
FILE: experiments/lcsts_sample.py
================================================
"""
This is the implementation of Copy-NET
We start from the basic Seq2seq framework for an auto-encoder.
"""
import logging
import time
import numpy as np
import sys
import copy
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from experiments.config import setup_lcsts
from emolga.utils.generic_utils import *
from emolga.models.covc_encdec import NRM
from emolga.models.encdec import NRM as NRM0
from emolga.dataset.build_dataset import deserialize_from_file
from collections import OrderedDict
from fuel import datasets
from fuel import transformers
from fuel import schemes
setup = setup_lcsts
def init_logging(logfile):
formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
datefmt='%m/%d/%Y %H:%M:%S' )
fh = logging.FileHandler(logfile)
# ch = logging.StreamHandler()
fh.setFormatter(formatter)
# ch.setFormatter(formatter)
# fh.setLevel(logging.INFO)
# ch.setLevel(logging.INFO)
# logging.getLogger().addHandler(ch)
logging.getLogger().addHandler(fh)
logging.getLogger().setLevel(logging.INFO)
return logging
# prepare logging.
tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
config = setup() # load settings.
for w in config:
print '{0}={1}'.format(w, config[w])
logger = init_logging(config['path_log'] + '/experiments.CopyLCSTS.id={}.log'.format(tmark))
n_rng = np.random.RandomState(config['seed'])
np.random.seed(config['seed'])
rng = RandomStreams(n_rng.randint(2 ** 30))
logger.info('Start!')
train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset'])
if config['voc_size'] == -1: # -1: no UNK, use the full vocabulary
config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1
config['dec_voc_size'] = config['enc_voc_size']
else:
config['enc_voc_size'] = config['voc_size']
config['dec_voc_size'] = config['enc_voc_size']
samples = len(train_set['source'])
logger.info('build dataset done. ' +
'dataset size: {} ||'.format(samples) +
'vocabulary size = {0}/ batch size = {1}'.format(
config['dec_voc_size'], config['batch_size']))
def build_data(data):
# create fuel dataset.
dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']),
('target', data['target']),
('target_c', data['target_c']),
]))
dataset.example_iteration_scheme \
= schemes.ShuffledExampleScheme(dataset.num_examples)
return dataset
def unk_filter(data):
if config['voc_size'] == -1:
return copy.copy(data)
else:
mask = (np.less(data, config['voc_size'])).astype(dtype='int32')
data = copy.copy(data * mask + (1 - mask))
return data
train_data_plain = zip(*(train_set['source'], train_set['target']))
test_data_plain = zip(*(test_set['source'], test_set['target']))
train_size = len(train_data_plain)
test_size = len(test_data_plain)
tr_idx = n_rng.permutation(train_size)[:2000].tolist()
ts_idx = n_rng.permutation(test_size)[:100].tolist()
logger.info('load the data ok.')
# logger.info('Evaluate Enc-Dec')
# log_gen = open(config['path_log'] + '/experiments.CopyLCSTS.generate_{}.log'.format(0), 'w')
# config['copynet'] = True
# echo = 10
# tmark = '20160224-185023' # '20160221-171853' # enc-dec model [no unk]
# agent = NRM(config, n_rng, rng, mode=config['mode'],
# use_attention=True, copynet=config['copynet'], identity=config['identity'])
# agent.build_()
# agent.compile_('display')
# agent.load(config['path_h5'] + '/experiments.CopyLCSTS.id={0}.epoch={1}.pkl'.format(tmark, echo))
# logger.info('generating [testing set] samples')
# for idx in ts_idx:
# # idx = int(np.floor(n_rng.rand() * test_size))
# test_s, test_t = test_data_plain[idx]
# v = agent.evaluate_(np.asarray(test_s, dtype='int32'),
# np.asarray(test_t, dtype='int32'),
# idx2word)
# log_gen.write(v)
# log_gen.write('*' * 50 + '\n')
# log_gen.close()
logger.info('Evaluate CopyNet')
echo = 6
tmark = '20160224-185023' # '20160221-025049' # copy-net model [no unk]
log_cp = open(config['path_logX'] + '/experiments.copy_{0}_{1}.log'.format(tmark, echo), 'w')
config['copynet'] = True
agent = NRM(config, n_rng, rng, mode=config['mode'],
use_attention=True, copynet=config['copynet'], identity=config['identity'])
agent.build_()
agent.compile_('display')
agent.load(config['path_h5'] + '/experiments.CopyLCSTS.id={0}.epoch={1}.pkl'.format(tmark, echo))
logger.info('generating [testing set] samples')
for idx in ts_idx:
# idx = int(np.floor(n_rng.rand() * test_size))
test_s, test_t = test_data_plain[idx]
v = agent.evaluate_(np.asarray(test_s, dtype='int32'),
np.asarray(test_t, dtype='int32'),
idx2word, np.asarray(unk_filter(test_s), dtype='int32'))
log_cp.write(v)
log_cp.write('*' * 50 + '\n')
log_cp.close()
logger.info('Complete!')
================================================
FILE: experiments/lcsts_test.py
================================================
"""
This is the implementation of Copy-NET
We start from the basic Seq2seq framework for an auto-encoder.
"""
import logging
import time
import numpy as np
import sys
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from experiments.config import setup_lcsts
from emolga.utils.generic_utils import *
from emolga.models.covc_encdec import NRM
from emolga.models.encdec import NRM as NRM0
from emolga.dataset.build_dataset import deserialize_from_file
from collections import OrderedDict
from fuel import datasets
from fuel import transformers
from fuel import schemes
setup = setup_lcsts
def init_logging(logfile):
formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
datefmt='%m/%d/%Y %H:%M:%S' )
fh = logging.FileHandler(logfile)
# ch = logging.StreamHandler()
fh.setFormatter(formatter)
# ch.setFormatter(formatter)
# fh.setLevel(logging.INFO)
# ch.setLevel(logging.INFO)
# logging.getLogger().addHandler(ch)
logging.getLogger().addHandler(fh)
logging.getLogger().setLevel(logging.INFO)
return logging
# prepare logging.
tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
config = setup() # load settings.
for w in config:
print '{0}={1}'.format(w, config[w])
logger = init_logging(config['path_log'] + '/experiments.CopyLCSTS.id={}.log'.format(tmark))
n_rng = np.random.RandomState(config['seed'])
np.random.seed(config['seed'])
rng = RandomStreams(n_rng.randint(2 ** 30))
logger.info('Start!')
train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset'])
config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1
config['dec_voc_size'] = config['enc_voc_size']
samples = len(train_set['source'])
logger.info('build dataset done. ' +
'dataset size: {} ||'.format(samples) +
'vocabulary size = {0}/ batch size = {1}'.format(
config['dec_voc_size'], config['batch_size']))
def build_data(data):
# create fuel dataset.
dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']),
('target', data['target']),
('target_c', data['target_c']),
]))
dataset.example_iteration_scheme \
= schemes.ShuffledExampleScheme(dataset.num_examples)
return dataset
train_data = build_data(train_set)
train_data_plain = zip(*(train_set['source'], train_set['target']))
test_data_plain = zip(*(test_set['source'], test_set['target']))
train_size = len(train_data_plain)
test_size = len(test_data_plain)
tr_idx = n_rng.permutation(train_size)[:2000].tolist()
ts_idx = n_rng.permutation(test_size)[:2000].tolist()
logger.info('load the data ok.')
# build the agent
if config['copynet']:
agent = NRM(config, n_rng, rng, mode=config['mode'],
use_attention=True, copynet=config['copynet'], identity=config['identity'])
else:
agent = NRM0(config, n_rng, rng, mode=config['mode'],
use_attention=True, copynet=config['copynet'], identity=config['identity'])
agent.build_()
agent.compile_('all')
print 'compile ok.'
echo = 2
epochs = 10
if echo > 0:
tmark = '20160217-232113'
agent.load(config['path_h5'] + '/experiments.CopyLCSTS.id={0}.epoch={1}.pkl'.format(tmark, echo))
while echo < epochs:
echo += 1
loss = []
def output_stream(dataset, batch_size, size=1):
data_stream = dataset.get_example_stream()
data_stream = transformers.Batch(data_stream,
iteration_scheme=schemes.ConstantScheme(batch_size))
# add padding and masks to the dataset
data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target', 'target_c'))
return data_stream
def prepare_batch(batch, mask, fix_len=None):
data = batch[mask].astype('int32')
data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1)
def cut_zeros(data, fix_len=None):
if fix_len is not None:
return data[:, : fix_len]
for k in range(data.shape[1] - 1, 0, -1):
data_col = data[:, k].sum()
if data_col > 0:
return data[:, : k + 2]
return data
data = cut_zeros(data, fix_len)
return data
# training
notrain = False
if not notrain:
train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True)
logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo))
progbar = Progbar(train_size / config['batch_size'])
for it, batch in enumerate(train_batches):
# obtain data
data_s = prepare_batch(batch, 'source')
data_t = prepare_batch(batch, 'target')
data_c = prepare_batch(batch, 'target_c', data_t.shape[1])
if config['copynet']:
loss += [agent.train_(data_s, data_t, data_c)]
else:
loss += [agent.train_(data_s, data_t)]
progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])])
if it % 200 == 0:
logger.info('Iter={} Evaluation Sampling.'.format(it))
logger.info('generating [training set] samples')
for _ in xrange(5):
idx = int(np.floor(n_rng.rand() * train_size))
train_s, train_t = train_data_plain[idx]
v = agent.evaluate_(np.asarray(train_s, dtype='int32'),
np.asarray(train_t, dtype='int32'),
idx2word)
print '*' * 50
logger.info('generating [testing set] samples')
for _ in xrange(5):
idx = int(np.floor(n_rng.rand() * test_size))
test_s, test_t = test_data_plain[idx]
v = agent.evaluate_(np.asarray(test_s, dtype='int32'),
np.asarray(test_t, dtype='int32'),
idx2word)
print '*' * 50
# save the weights.
agent.save(config['path_h5'] + '/experiments.CopyLCSTS.id={0}.epoch={1}.pkl'.format(tmark, echo))
# # test accuracy
# progbar_tr = Progbar(2000)
#
# print '\n' + '__' * 50
# gen, gen_pos = 0, 0
# cpy, cpy_pos = 0, 0
# for it, idx in enumerate(tr_idx):
# train_s, train_t = train_data_plain[idx]
#
# c = agent.analyse_(np.asarray(train_s, dtype='int32'),
# np.asarray(train_t, dtype='int32'),
# idx2word)
# if c[1] == 0:
# # generation mode
# gen += 1
# gen_pos += c[0]
# else:
# # copy mode
# cpy += 1
# cpy_pos += c[0]
#
# progbar_tr.update(it + 1, [('Gen', gen_pos), ('Copy', cpy_pos)])
#
# logger.info('\nTraining Accuracy:' +
# '\tGene-Mode: {0}/{1} = {2}%'.format(gen_pos, gen, 100 * gen_pos/float(gen)) +
# '\tCopy-Mode: {0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos/float(cpy)))
#
# progbar_ts = Progbar(2000)
# print '\n' + '__' * 50
# gen, gen_pos = 0, 0
# cpy, cpy_pos = 0, 0
# for it, idx in enumerate(ts_idx):
# test_s, test_t = test_data_plain[idx]
# c = agent.analyse_(np.asarray(test_s, dtype='int32'),
# np.asarray(test_t, dtype='int32'),
# idx2word)
# if c[1] == 0:
# # generation mode
# gen += 1
# gen_pos += c[0]
# else:
# # copy mode
# cpy += 1
# cpy_pos += c[0]
#
# progbar_ts.update(it + 1, [('Gen', gen_pos), ('Copy', cpy_pos)])
#
# logger.info('\nTesting Accuracy:' +
# '\tGene-Mode: {0}/{1} = {2}%'.format(gen_pos, gen, 100 * gen_pos/float(gen)) +
# '\tCopy-Mode: {0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos/float(cpy)))
================================================
FILE: experiments/lcsts_vest.py
================================================
"""
This is the implementation of Copy-NET
We start from the basic Seq2seq framework for an auto-encoder.
"""
import logging
import time
import numpy as np
import sys
import copy
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from experiments.config import setup_lcsts, setup_weibo, setup_syn
from emolga.utils.generic_utils import *
from emolga.models.covc_encdec import NRM
from emolga.models.encdec import NRM as NRM0
from emolga.dataset.build_dataset import deserialize_from_file
from collections import OrderedDict
from fuel import datasets
from fuel import transformers
from fuel import schemes
setup = setup_lcsts
# setup = setup_syn
def init_logging(logfile):
formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
datefmt='%m/%d/%Y %H:%M:%S' )
fh = logging.FileHandler(logfile)
# ch = logging.StreamHandler()
fh.setFormatter(formatter)
# ch.setFormatter(formatter)
# fh.setLevel(logging.INFO)
# ch.setLevel(logging.INFO)
# logging.getLogger().addHandler(ch)
logging.getLogger().addHandler(fh)
logging.getLogger().setLevel(logging.INFO)
return logging
# prepare logging.
tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
config = setup() # load settings.
for w in config:
print '{0}={1}'.format(w, config[w])
logger = init_logging(config['path_log'] + '/experiments.CopyLCSTSXXX.id={}.log'.format(tmark))
n_rng = np.random.RandomState(config['seed'])
np.random.seed(config['seed'])
rng = RandomStreams(n_rng.randint(2 ** 30))
logger.info('Start!')
train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset'])
if config['voc_size'] == -1: # -1: no UNK, use the full vocabulary
config['enc_voc_size'] = len(word2idx)
config['dec_voc_size'] = config['enc_voc_size']
else:
config['enc_voc_size'] = config['voc_size']
config['dec_voc_size'] = config['enc_voc_size']
samples = len(train_set['source'])
logger.info('build dataset done. ' +
'dataset size: {} ||'.format(samples) +
'vocabulary size = {0}/ batch size = {1}'.format(
config['dec_voc_size'], config['batch_size']))
def build_data(data):
# create fuel dataset.
dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']),
('target', data['target']),
('target_c', data['target_c']),
]))
dataset.example_iteration_scheme \
= schemes.ShuffledExampleScheme(dataset.num_examples)
return dataset
train_data = build_data(train_set)
train_data_plain = zip(*(train_set['source'], train_set['target']))
test_data_plain = zip(*(test_set['source'], test_set['target']))
# train_data_plain = zip(*(train_set['source'], train_set['target']))
# test_data_plain = zip(*(test_set['source'], test_set['target']))
train_size = len(train_data_plain)
test_size = len(test_data_plain)
tr_idx = n_rng.permutation(train_size)[:2000].tolist()
ts_idx = n_rng.permutation(test_size)[:2000].tolist()
logger.info('load the data ok.')
# build the agent
if config['copynet']:
agent = NRM(config, n_rng, rng, mode=config['mode'],
use_attention=True, copynet=config['copynet'], identity=config['identity'])
else:
agent = NRM0(config, n_rng, rng, mode=config['mode'],
use_attention=True, copynet=config['copynet'], identity=config['identity'])
echo = 0
epochs = 10
if echo > 0:
tmark = '20160221-025049' # copynet multi-source model
agent.load(config['path_h5'] + '/experiments.CopyLCSTSXXX.id={0}.epoch={1}.pkl'.format(tmark, echo))
# recover from Nan
tmark = '20160307-135907' # '20160301-105813'
skip = 26000
sep = 1
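# resume point: epochs before sep are skipped entirely; within epoch sep,
# the first skip batches are skipped before training resumes.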
# tmark = '20160301-114653'
# skip = 14000
agent.build_(0.001, 26000)
agent.compile_('all')
print 'compile ok.'
agent.load(config['path_h5'] + '/experiments.CopyLCSTSXXX.id={0}.epoch={1}.iter={2}.pkl'.format(tmark, sep, skip))
while echo < epochs:
echo += 1
loss = []
def output_stream(dataset, batch_size, size=1):
data_stream = dataset.get_example_stream()
data_stream = transformers.Batch(data_stream,
iteration_scheme=schemes.ConstantScheme(batch_size))
# add padding and masks to the dataset
data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target'))
return data_stream
def prepare_batch(batch, mask, fix_len=None):
data = batch[mask].astype('int32')
data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1)
def cut_zeros(data, fix_len=None):
if fix_len is not None:
return data[:, : fix_len]
for k in range(data.shape[1] - 1, 0, -1):
data_col = data[:, k].sum()
if data_col > 0:
return data[:, : k + 2]
return data
data = cut_zeros(data, fix_len)
return data
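# copy-correspondence tensor: cc[k, j, i] = 1 when target word j of sample k
# equals (non-padding) source word i; it supervises the copy attention.
# e.g. (hypothetical ids) source [[3, 5, 5, 0]], target [[5, 2, 0]] gives
# cc[0] = [[0, 1, 1, 0], [0, 0, 0, 0], [0, 0, 0, 0]].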
def cc_matrix(source, target):
cc = np.zeros((source.shape[0], target.shape[1], source.shape[1]), dtype='float32')
for k in xrange(source.shape[0]):
for j in xrange(target.shape[1]):
for i in xrange(source.shape[1]):
if (source[k, i] == target[k, j]) and (source[k, i] > 0):
cc[k][j][i] = 1.
return cc
def unk_filter(data):
if config['voc_size'] == -1:
return copy.copy(data)
else:
mask = (np.less(data, config['voc_size'])).astype(dtype='int32')
data = copy.copy(data * mask + (1 - mask))
return data
# training
notrain = False
if not notrain:
train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True)
logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo))
progbar = Progbar(train_size / config['batch_size'])
for it, batch in enumerate(train_batches):
if (echo < sep):
continue
if (echo == sep) and (skip > it):
if it % 200 == 0:
print it
continue
# obtain data
data_s = prepare_batch(batch, 'source')
data_t = prepare_batch(batch, 'target')
if config['copynet']:
data_c = cc_matrix(data_s, data_t)
# data_c = prepare_batch(batch, 'target_c', data_t.shape[1])
loss += [agent.train_(unk_filter(data_s), unk_filter(data_t), data_c)]
else:
loss += [agent.train_(unk_filter(data_s), unk_filter(data_t))]
progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])])
if it % 200 == 0:
logger.info('Iter={} Evaluation Sampling.'.format(it))
logger.info('generating [training set] samples')
for _ in xrange(5):
idx = int(np.floor(n_rng.rand() * train_size))
train_s, train_t = train_data_plain[idx]
v = agent.evaluate_(np.asarray(train_s, dtype='int32'),
np.asarray(train_t, dtype='int32'),
idx2word,
np.asarray(unk_filter(train_s), dtype='int32'))
print '*' * 50
logger.info('generating [testing set] samples')
for _ in xrange(5):
idx = int(np.floor(n_rng.rand() * test_size))
test_s, test_t = test_data_plain[idx]
v = agent.evaluate_(np.asarray(test_s, dtype='int32'),
np.asarray(test_t, dtype='int32'),
idx2word,
np.asarray(unk_filter(test_s), dtype='int32'))
print '*' * 50
# save the weights.
if it % 2000 == 0:
agent.save(config['path_h5'] +
'/experiments.CopyLCSTSXXX.id={0}.epoch={1}.iter={2}.pkl'.format(
tmark, echo, it))
# # test accuracy
test = False
if test:
progbar_tr = Progbar(2000)
print '\n' + '__' * 50
cpy, cpy_pos = 0, 0
for it, idx in enumerate(tr_idx):
train_s, train_t = train_data_plain[idx]
c = agent.analyse_(np.asarray(train_s, dtype='int32'),
np.asarray(train_t, dtype='int32'),
idx2word)
# copy mode
cpy += 1
cpy_pos += c
progbar_tr.update(it + 1, [('Copy', cpy_pos)])
logger.info('\nTraining Accuracy:' +
'\t{0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos / float(cpy)))
progbar_ts = Progbar(2000)
print '\n' + '__' * 50
cpy, cpy_pos = 0, 0
for it, idx in enumerate(ts_idx):
test_s, test_t = test_data_plain[idx]
c = agent.analyse_(np.asarray(test_s, dtype='int32'),
np.asarray(test_t, dtype='int32'),
idx2word)
cpy += 1
cpy_pos += c
progbar_ts.update(it + 1, [('Copy', cpy_pos)])
logger.info('\nTesting Accuracy:' +
'\t{0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos / float(cpy)))
================================================
FILE: experiments/lcsts_vest_new.py
================================================
"""
This is the implementation of Copy-NET
We start from the basic Seq2seq framework for an auto-encoder.
"""
import logging
import time
import numpy as np
import sys
import copy
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from experiments.config import setup_lcsts, setup_weibo, setup_syn
from emolga.utils.generic_utils import *
from emolga.models.covc_encdec import NRM
from emolga.models.encdec import NRM as NRM0
from emolga.dataset.build_dataset import deserialize_from_file
from collections import OrderedDict
from fuel import datasets
from fuel import transformers
from fuel import schemes
setup = setup_lcsts
# setup = setup_syn
def init_logging(logfile):
formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
datefmt='%m/%d/%Y %H:%M:%S' )
fh = logging.FileHandler(logfile)
# ch = logging.StreamHandler()
fh.setFormatter(formatter)
# ch.setFormatter(formatter)
# fh.setLevel(logging.INFO)
# ch.setLevel(logging.INFO)
# logging.getLogger().addHandler(ch)
logging.getLogger().addHandler(fh)
logging.getLogger().setLevel(logging.INFO)
return logging
# prepare logging.
tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
config = setup() # load settings.
for w in config:
print '{0}={1}'.format(w, config[w])
logger = init_logging(config['path_log'] + '/experiments.CopyLCSTSXXX.id={}.log'.format(tmark))
n_rng = np.random.RandomState(config['seed'])
np.random.seed(config['seed'])
rng = RandomStreams(n_rng.randint(2 ** 30))
logger.info('Start!')
train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset'])
if config['voc_size'] == -1: # -1: no UNK, use the full vocabulary
config['enc_voc_size'] = len(word2idx)
config['dec_voc_size'] = config['enc_voc_size']
else:
config['enc_voc_size'] = config['voc_size']
config['dec_voc_size'] = config['enc_voc_size']
samples = len(train_set['source'])
logger.info('build dataset done. ' +
'dataset size: {} ||'.format(samples) +
'vocabulary size = {0}/ batch size = {1}'.format(
config['dec_voc_size'], config['batch_size']))
def build_data(data):
# create fuel dataset.
dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']),
('target', data['target']),
('target_c', data['target_c']),
]))
dataset.example_iteration_scheme \
= schemes.ShuffledExampleScheme(dataset.num_examples)
return dataset
train_data = build_data(train_set)
train_data_plain = zip(*(train_set['source'], train_set['target']))
test_data_plain = zip(*(test_set['source'], test_set['target']))
# train_data_plain = zip(*(train_set['source'], train_set['target']))
# test_data_plain = zip(*(test_set['source'], test_set['target']))
train_size = len(train_data_plain)
test_size = len(test_data_plain)
tr_idx = n_rng.permutation(train_size)[:2000].tolist()
ts_idx = n_rng.permutation(test_size)[:2000].tolist()
logger.info('load the data ok.')
# build the agent
if config['copynet']:
agent = NRM(config, n_rng, rng, mode=config['mode'],
use_attention=True, copynet=config['copynet'], identity=config['identity'])
else:
agent = NRM0(config, n_rng, rng, mode=config['mode'],
use_attention=True, copynet=config['copynet'], identity=config['identity'])
agent.build_()
agent.compile_('all')
print 'compile ok.'
echo = 0
epochs = 10
if echo > 0:
tmark = '20160221-025049' # copynet multi-source model
agent.load(config['path_h5'] + '/experiments.CopyLCSTSXXX.id={0}.epoch={1}.pkl'.format(tmark, echo))
# recover from Nan
# tmark = '20160301-105813'
skip = -1 #100000
sep = -1 #2
# tmark = '20160301-114653'
# skip = 14000
# agent.load(config['path_h5'] + '/experiments.CopyLCSTSXXX.id={0}.epoch={1}.iter={2}.pkl'.format(tmark, sep, skip))
while echo < epochs:
echo += 1
loss = []
def output_stream(dataset, batch_size, size=1):
data_stream = dataset.get_example_stream()
data_stream = transformers.Batch(data_stream,
iteration_scheme=schemes.ConstantScheme(batch_size))
# add padding and masks to the dataset
data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target'))
return data_stream
def prepare_batch(batch, mask, fix_len=None):
data = batch[mask].astype('int32')
data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1)
def cut_zeros(data, fix_len=None):
if fix_len is not None:
return data[:, : fix_len]
for k in range(data.shape[1] - 1, 0, -1):
data_col = data[:, k].sum()
if data_col > 0:
return data[:, : k + 2]
return data
data = cut_zeros(data, fix_len)
return data
def cc_matrix(source, target):
cc = np.zeros((source.shape[0], target.shape[1], source.shape[1]), dtype='float32')
for k in xrange(source.shape[0]):
for j in xrange(target.shape[1]):
for i in xrange(source.shape[1]):
if (source[k, i] == target[k, j]) and (source[k, i] > 0):
cc[k][j][i] = 1.
return cc
def unk_filter(data):
if config['voc_size'] == -1:
return copy.copy(data)
else:
mask = (np.less(data, config['voc_size'])).astype(dtype='int32')
data = copy.copy(data * mask + (1 - mask))
return data
# training
notrain = False
if not notrain:
train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True)
logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo))
progbar = Progbar(train_size / config['batch_size'])
for it, batch in enumerate(train_batches):
if (echo < sep):
continue
if (echo == sep) and (skip > it):
if it % 200 == 0:
print it
continue
# obtain data
data_s = prepare_batch(batch, 'source')
data_t = prepare_batch(batch, 'target')
if config['copynet']:
data_c = cc_matrix(data_s, data_t)
# data_c = prepare_batch(batch, 'target_c', data_t.shape[1])
loss += [agent.train_(unk_filter(data_s), unk_filter(data_t), data_c)]
else:
loss += [agent.train_(unk_filter(data_s), unk_filter(data_t))]
progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])])
if it % 200 == 0:
logger.info('Iter={} Evaluation Sampling.'.format(it))
logger.info('generating [training set] samples')
for _ in xrange(5):
idx = int(np.floor(n_rng.rand() * train_size))
train_s, train_t = train_data_plain[idx]
v = agent.evaluate_(np.asarray(train_s, dtype='int32'),
np.asarray(train_t, dtype='int32'),
idx2word,
np.asarray(unk_filter(train_s), dtype='int32'))
print '*' * 50
logger.info('generating [testing set] samples')
for _ in xrange(5):
idx = int(np.floor(n_rng.rand() * test_size))
test_s, test_t = test_data_plain[idx]
v = agent.evaluate_(np.asarray(test_s, dtype='int32'),
np.asarray(test_t, dtype='int32'),
idx2word,
np.asarray(unk_filter(test_s), dtype='int32'))
print '*' * 50
# save the weights.
if it % 2000 == 0:
agent.save(config['path_h5'] +
'/experiments.CopyLCSTSXXX.id={0}.epoch={1}.iter={2}.pkl'.format(
tmark, echo, it))
# # test accuracy
test = False
if test:
progbar_tr = Progbar(2000)
print '\n' + '__' * 50
cpy, cpy_pos = 0, 0
for it, idx in enumerate(tr_idx):
train_s, train_t = train_data_plain[idx]
c = agent.analyse_(np.asarray(train_s, dtype='int32'),
np.asarray(train_t, dtype='int32'),
idx2word)
# copy mode
cpy += 1
cpy_pos += c
progbar_tr.update(it + 1, [('Copy', cpy_pos)])
logger.info('\nTraining Accuracy:' +
'\t{0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos / float(cpy)))
progbar_ts = Progbar(2000)
print '\n' + '__' * 50
cpy, cpy_pos = 0, 0
for it, idx in enumerate(ts_idx):
test_s, test_t = test_data_plain[idx]
c = agent.analyse_(np.asarray(test_s, dtype='int32'),
np.asarray(test_t, dtype='int32'),
idx2word)
cpy += 1
cpy_pos += c
progbar_ts.update(it + 1, [('Copy', cpy_pos)])
logger.info('\nTesting Accuracy:' +
'\t{0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos / float(cpy)))
================================================
FILE: experiments/movie_dataset.py
================================================
# coding=utf-8
from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file
import string
import random
import sys
random.seed(19920206)
word2idx = dict()
wordfreq = dict()
word2idx['<eol>'] = 0 # padding / end-of-sequence id
word2idx['<unk>'] = 1 # OOV id (the experiment scripts map out-of-vocabulary words here)
word2freq = dict()
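# tokenizer: split punctuation into separate tokens (an apostrophe starts a
# new token with its suffix, e.g. "don't" -> "don 't"), lowercase the line,
# and update the global word2freq counts.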
def mark(line):
tmp_line = ''
for c in line:
if c in string.punctuation:
if c != "'":
tmp_line += ' ' + c + ' '
else:
tmp_line += ' ' + c
else:
tmp_line += c
tmp_line = tmp_line.lower()
words = [w for w in tmp_line.split() if len(w) > 0]
for w in words:
if w not in word2freq:
word2freq[w] = 1
else:
word2freq[w] += 1
return words
fline = open('./dataset/cornell_movie/movie_lines.txt', 'r')
sets = [w.split('+++$+++') for w in fline.read().split('\n')]
lines = {w[0].strip(): mark(w[-1].strip()) for w in sets}
#
# for w in lines:
# if len(lines[w]) == 0:
# print w
fline.close()
print 'read lines ok'
fconv = open('./dataset/cornell_movie/movie_conversations.txt', 'r')
turns = []
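# unroll each conversation into adjacent (utterance, reply) pairs.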
convs = fconv.readline()
while convs:
turn = eval(convs.split('+++$+++')[-1].strip())
turns += zip(turn[:-1], turn[1:])
convs = fconv.readline()
pairs = [(lines[a], lines[b]) for a, b in turns
if len(lines[a]) > 0 and len(lines[b]) > 0]
# shuffle!
random.shuffle(pairs)
word2freq = sorted(word2freq.items(), key=lambda a: a[1], reverse=True)
for at, w in enumerate(word2freq):
word2idx[w[0]] = at + 2
idx2word = {k: v for v, k in word2idx.items()}
print idx2word[1], idx2word[2]
Lmax = len(idx2word)
# for i in xrange(Lmax):
# print idx2word[i]
print 'read dataset ok.'
print Lmax
print pairs[0]
def build_data(data):
instance = dict(text=[], summary=[], source=[], target=[], target_c=[])
print len(data)
for pair in data:
source, target = pair
A = [word2idx[w] for w in source]
B = [word2idx[w] for w in target]
# C = np.asarray([[w == l for w in source] for l in target], dtype='float32')
C = [0 if w not in source else source.index(w) + Lmax for w in target]
instance['text'] += [source]
instance['summary'] += [target]
instance['source'] += [A]
instance['target'] += [B]
# instance['cc_matrix'] += [C]
instance['target_c'] += [C]
print instance['source'][4000]
print instance['target'][4000]
print instance['target_c'][4000]
return instance
train_set = build_data(pairs[10000:])
test_set = build_data(pairs[:10000])
serialize_to_file([train_set, test_set, idx2word, word2idx], './dataset/movie_dialogue_data.pkl')
================================================
FILE: experiments/syn_vest.py
================================================
"""
This is the implementation of Copy-NET
We start from the basic Seq2seq framework for an auto-encoder.
"""
import logging
import time
import numpy as np
import sys
import copy
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from experiments.config import setup_lcsts, setup_weibo, setup_syn, setup_bst
from emolga.utils.generic_utils import *
from emolga.models.covc_encdec import NRM
from emolga.models.encdec import NRM as NRM0
from emolga.dataset.build_dataset import deserialize_from_file
from collections import OrderedDict
from fuel import datasets
from fuel import transformers
from fuel import schemes
# setup = setup_lcsts
setup = setup_syn
# setup = setup_bst
def init_logging(logfile):
formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
datefmt='%m/%d/%Y %H:%M:%S' )
fh = logging.FileHandler(logfile)
# ch = logging.StreamHandler()
fh.setFormatter(formatter)
# ch.setFormatter(formatter)
# fh.setLevel(logging.INFO)
# ch.setLevel(logging.INFO)
# logging.getLogger().addHandler(ch)
logging.getLogger().addHandler(fh)
logging.getLogger().setLevel(logging.INFO)
return logging
# prepare logging.
tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
config = setup() # load settings.
for w in config:
print '{0}={1}'.format(w, config[w])
logger = init_logging(config['path_log'] + '/experiments.CopyLCSTS.id={}.log'.format(tmark))
n_rng = np.random.RandomState(config['seed'])
np.random.seed(config['seed'])
rng = RandomStreams(n_rng.randint(2 ** 30))
logger.info('Start!')
train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset'])
if config['voc_size'] == -1: # do not use <unk>
config['enc_voc_size'] = len(word2idx)
config['dec_voc_size'] = config['enc_voc_size']
else:
config['enc_voc_size'] = config['voc_size']
config['dec_voc_size'] = config['enc_voc_size']
samples = len(train_set['source'])
logger.info('build dataset done. ' +
'dataset size: {} ||'.format(samples) +
'vocabulary size = {0}/ batch size = {1}'.format(
config['dec_voc_size'], config['batch_size']))
def build_data(data):
# create fuel dataset.
dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']),
('target', data['target']),
('target_c', data['target_c']),
]))
dataset.example_iteration_scheme \
= schemes.ShuffledExampleScheme(dataset.num_examples)
return dataset
train_data = build_data(train_set)
train_data_plain = zip(*(train_set['source'], train_set['target'], train_set['rule']))
test_data_plain = zip(*(test_set['source'], test_set['target'], test_set['rule']))
# train_data_plain = zip(*(train_set['source'], train_set['target']))
# test_data_plain = zip(*(test_set['source'], test_set['target']))
train_size = len(train_data_plain)
test_size = len(test_data_plain)
tr_idx = n_rng.permutation(train_size)[:2000].tolist()
ts_idx = n_rng.permutation(test_size )[:2000].tolist()
logger.info('load the data ok.')
config['copynet'] = True # False
notrain = True
# build the agent
if config['copynet']:
agent = NRM(config, n_rng, rng, mode=config['mode'],
use_attention=True, copynet=config['copynet'], identity=config['identity'])
else:
agent = NRM0(config, n_rng, rng, mode=config['mode'],
use_attention=True, copynet=config['copynet'], identity=config['identity'])
agent.build_()
if notrain:
agent.compile_('display')
else:
agent.compile_('all')
print 'compile ok.'
echo = 6
epochs = 10
if echo > 0:
tmark = '20160227-013418' # copynet multi-source model
agent.load(config['path_h5'] + '/experiments.Copy{2}.id={0}.epoch={1}.pkl'.format(tmark, echo, config['modelname']))
while echo < epochs:
echo += 1
loss = []
def output_stream(dataset, batch_size, size=1):
data_stream = dataset.get_example_stream()
data_stream = transformers.Batch(data_stream,
iteration_scheme=schemes.ConstantScheme(batch_size))
# add padding and masks to the dataset
data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target'))
return data_stream
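# With the transformers above, each batch arrives as a dict of equal-length,
# zero-padded integer arrays; fuel's Padding also adds 0/1 masks for the
# sources named in mask_sources. Roughly (shapes illustrative, not from a
# real run):
# {'source': (B, Ls), 'source_mask': (B, Ls),
#  'target': (B, Lt), 'target_mask': (B, Lt), ...}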
def prepare_batch(batch, mask, fix_len=None):
data = batch[mask].astype('int32')
data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1)
def cut_zeros(data, fix_len=None):
if fix_len is not None:
return data[:, : fix_len]
for k in range(data.shape[1] - 1, 0, -1):
data_col = data[:, k].sum()
if data_col > 0:
return data[:, : k + 2]
return data
data = cut_zeros(data, fix_len)
return data
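# cut_zeros trims all-zero padding columns from the right, keeping one
# trailing zero column as an end marker. A toy example (hypothetical batch):
# >>> prepare_batch({'x': np.array([[3, 5, 0, 0], [2, 0, 0, 0]])}, 'x')
# array([[3, 5, 0],
#        [2, 0, 0]], dtype=int32)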
def cc_matrix(source, target):
cc = np.zeros((source.shape[0], target.shape[1], source.shape[1]), dtype='float32')
for k in xrange(source.shape[0]):
for j in xrange(target.shape[1]):
for i in xrange(source.shape[1]):
if (source[k, i] == target[k, j]) and (source[k, i] > 0):
cc[k][j][i] = 1.
return cc
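# The triple loop above builds, for every batch entry k, a (target_len x
# source_len) 0/1 matrix marking where target word j can be copied from
# source position i. A vectorized NumPy equivalent (a sketch with the same
# semantics, not what the original runs):
# cc = ((source[:, None, :] == target[:, :, None]) &
#       (source[:, None, :] > 0)).astype('float32')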
def unk_filter(data):
if config['voc_size'] == -1:
return copy.copy(data)
else:
mask = (np.less(data, config['voc_size'])).astype(dtype='int32')
data = copy.copy(data * mask + (1 - mask))
return data
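# unk_filter maps every word index >= voc_size to 1 (the <unk> token) so the
# model only ever sees in-vocabulary ids. E.g. if config['voc_size'] were 5
# (a made-up value):
# >>> unk_filter(np.asarray([[2, 7, 4]], dtype='int32'))
# array([[2, 1, 4]], dtype=int32)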
# training
if not notrain:
train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True)
logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo))
progbar = Progbar(train_size / config['batch_size'])
for it, batch in enumerate(train_batches):
# obtain data
data_s = prepare_batch(batch, 'source')
data_t = prepare_batch(batch, 'target')
if config['copynet']:
data_c = cc_matrix(data_s, data_t)
# data_c = prepare_batch(batch, 'target_c', data_t.shape[1])
loss += [agent.train_(unk_filter(data_s), unk_filter(data_t), data_c)]
else:
loss += [agent.train_(unk_filter(data_s), unk_filter(data_t))]
progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])])
if it % 200 == 0:
logger.info('Iter={} Evaluation Sampling.'.format(it))
logger.info('generating [training set] samples')
for _ in xrange(5):
idx = int(np.floor(n_rng.rand() * train_size))
train_s, train_t = train_data_plain[idx]
v = agent.evaluate_(np.asarray(train_s, dtype='int32'),
np.asarray(train_t, dtype='int32'),
idx2word,
np.asarray(unk_filter(train_s), dtype='int32'))
print '*' * 50
logger.info('generating [testing set] samples')
for _ in xrange(5):
idx = int(np.floor(n_rng.rand() * test_size))
test_s, test_t = test_data_plain[idx]
v = agent.evaluate_(np.asarray(test_s, dtype='int32'),
np.asarray(test_t, dtype='int32'),
idx2word,
np.asarray(unk_filter(test_s), dtype='int32'))
print '*' * 50
# save the weights.
agent.save(config['path_h5'] + '/experiments.Copy{2}.id={0}.epoch={1}.pkl'.format(tmark, echo, config['modelname']))
# # test accuracy
def judge_rule(rule):
rule = rule.split()
fine = ''
for w in rule:
if w not in word2idx:
fine += w
return fine
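# judge_rule reduces a rule string to its out-of-vocabulary tokens, i.e. its
# placeholder signature. On a hypothetical rule (the vocabulary holds only
# the 3-digit number words):
# >>> judge_rule('123 X 456 -> X')
# 'X->X'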
test = True
if test:
def analysis_(data_plain, idx_set, mode='Training'):
progbar_tr = Progbar(len(idx_set))
print '\n' + '__' * 50
cpy, cpy_pos = 0, 0
types = dict()
for it, idx in enumerate(idx_set):
train_s, train_t, rule = data_plain[idx]
t = judge_rule(rule)
c = float(agent.analyse_(np.asarray(train_s, dtype='int32'),
np.asarray(train_t, dtype='int32'),
idx2word))
# copy mode
cpy += 1
cpy_pos += c
if t not in types:
types[t] = {}
types[t][0] = c
types[t][1] = 1
else:
types[t][0] += c
types[t][1] += 1
progbar_tr.update(it + 1, [('Copy', cpy_pos)])
logger.info(('\n{0} Accuracy:' +
'\t{1}/{2} = {3}%').format(mode, cpy_pos, cpy, 100 * cpy_pos / float(cpy)))
print '==' * 50
for t in types:
print 'Type: {0}: {1}/{2}={3}%'.format(t, int(types[t][0]), types[t][1],
100 * types[t][0] / float(types[t][1]))
# analysis_(train_data_plain, tr_idx, 'Training')
analysis_(test_data_plain, ts_idx, 'Testing')
================================================
FILE: experiments/syntest.py
================================================
"""
This is the implementation of CopyNet.
We start from the basic Seq2seq framework for an auto-encoder.
"""
import logging
import time
import numpy as np
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from experiments.config import setup_syn
from emolga.utils.generic_utils import *
from emolga.models.covc_encdec import NRM
from emolga.models.encdec import NRM as NRM0
from emolga.dataset.build_dataset import deserialize_from_file
from collections import OrderedDict
from fuel import datasets
from fuel import transformers
from fuel import schemes
setup = setup_syn
def init_logging(logfile):
formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
datefmt='%m/%d/%Y %H:%M:%S' )
fh = logging.FileHandler(logfile)
# ch = logging.StreamHandler()
fh.setFormatter(formatter)
# ch.setFormatter(formatter)
# fh.setLevel(logging.INFO)
# ch.setLevel(logging.INFO)
# logging.getLogger().addHandler(ch)
logging.getLogger().addHandler(fh)
logging.getLogger().setLevel(logging.INFO)
return logging
# prepare logging.
tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
config = setup() # load settings.
for w in config:
print '{0}={1}'.format(w, config[w])
logger = init_logging(config['path_log'] + '/experiments.Copy.id={}.log'.format(tmark))
n_rng = np.random.RandomState(config['seed'])
np.random.seed(config['seed'])
rng = RandomStreams(n_rng.randint(2 ** 30))
logger.info('Start!')
# the vocabulary
tmp = [chr(x) for x in range(48, 58)] # '0', '1', ..., '9'
voc = [tmp[a] + tmp[b] + tmp[c]
for c in xrange(10)
for b in xrange(10)
for a in xrange(10)]
word2idx = {voc[k]: k + 1 for k in xrange(len(voc))}
word2idx['<eol>'] = 0
idx2word = {word2idx[w]: w for w in word2idx}
voc = ['<eol>'] + voc
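# Sanity check on the index layout (the first `for` in the comprehension
# above is the outermost loop, so the hundreds digit varies fastest):
# >>> word2idx['000'], word2idx['100'], word2idx['999']
# (1, 2, 1000)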
train_set, test_set = deserialize_from_file(config['dataset'])
config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1
config['dec_voc_size'] = config['enc_voc_size']
samples = len(train_set['source'])
logger.info('build dataset done. ' +
'dataset size: {} ||'.format(samples) +
'vocabulary size = {0}/ batch size = {1}'.format(
config['dec_voc_size'], config['batch_size']))
def build_data(data):
# create fuel dataset.
dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']),
('target', data['target']),
('target_c', data['target_c']),
('rule_id', data['rule_id']),
('rule', data['rule'])]))
dataset.example_iteration_scheme \
= schemes.ShuffledExampleScheme(dataset.num_examples)
return dataset
train_data = build_data(train_set)
train_data_plain = zip(*(train_set['source'], train_set['target'], train_set['rule_id'], train_set['rule']))
test_data_plain = zip(*(test_set['source'], test_set['target'], test_set['rule_id'], test_set['rule']))
train_size = len(train_data_plain)
test_size = len(test_data_plain)
tr_idx = n_rng.permutation(train_size)[:2000].tolist()
ts_idx = n_rng.permutation(test_size )[:2000].tolist()
logger.info('load the data ok.')
# build the agent
if config['copynet']:
agent = NRM(config, n_rng, rng, mode=config['mode'],
use_attention=True, copynet=config['copynet'], identity=config['identity'])
else:
agent = NRM0(config, n_rng, rng, mode=config['mode'],
use_attention=True, copynet=config['copynet'], identity=config['identity'])
agent.build_()
agent.compile_('all')
print 'compile ok.'
echo = 3
epochs = 4
if echo > 0:
tmark = '20160216-152155'
agent.load(config['path_h5'] + '/experiments.Copy.id={0}.epoch={1}.pkl'.format(tmark, echo))
while echo < epochs:
echo += 1
loss = []
def output_stream(dataset, batch_size, size=1):
data_stream = dataset.get_example_stream()
data_stream = transformers.Batch(data_stream,
iteration_scheme=schemes.ConstantScheme(batch_size))
# add padding and masks to the dataset
data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target', 'target_c'))
return data_stream
def prepare_batch(batch, mask, fix_len=None):
data = batch[mask].astype('int32')
data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1)
def cut_zeros(data, fix_len=None):
if fix_len is not None:
return data[:, : fix_len]
for k in range(data.shape[1] - 1, 0, -1):
data_col = data[:, k].sum()
if data_col > 0:
return data[:, : k + 2]
return data
data = cut_zeros(data, fix_len)
return data
# training
notrain = True
if not notrain:
train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True)
logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo))
progbar = Progbar(train_size / config['batch_size'])
for it, batch in enumerate(train_batches):
# obtain data
data_s = prepare_batch(batch, 'source')
data_t = prepare_batch(batch, 'target')
data_c = prepare_batch(batch, 'target_c', data_t.shape[1])
if config['copynet']:
loss += [agent.train_(data_s, data_t, data_c)]
else:
loss += [agent.train_(data_s, data_t)]
progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])])
if it % 200 == 0:
logger.info('Iter={} Evaluation Sampling.'.format(it))
logger.info('generating [training set] samples')
for _ in xrange(5):
idx = int(np.floor(n_rng.rand() * train_size))
train_s, train_t, _, _ = train_data_plain[idx]
v = agent.evaluate_(np.asarray(train_s, dtype='int32'),
np.asarray(train_t, dtype='int32'),
idx2word)
print '*' * 50
logger.info('generating [testing set] samples')
for _ in xrange(5):
idx = int(np.floor(n_rng.rand() * test_size))
test_s, test_t, _, _ = test_data_plain[idx]
v = agent.evaluate_(np.asarray(test_s, dtype='int32'),
np.asarray(test_t, dtype='int32'),
idx2word)
print '*' * 50
# save the weights.
agent.save(config['path_h5'] + '/experiments.Copy.id={0}.epoch={1}.pkl'.format(tmark, echo))
# test accuracy
progbar_tr = Progbar(2000)
print '\n' + '__' * 50
gen, gen_pos = 0, 0
cpy, cpy_pos = 0, 0
grs, crs = [], []
for it, idx in enumerate(tr_idx):
train_s, train_t, rid, rule = train_data_plain[idx]
c = agent.analyse_(np.asarray(train_s, dtype='int32'),
np.asarray(train_t, dtype='int32'),
idx2word)
if c[1] == 0:
# generation mode
gen += 1
gen_pos += c[0]
if c[0] == 1:
grs += [rule]
else:
# copy mode
cpy += 1
cpy_pos += c[0]
if c[0] == 1:
crs += [rule]
progbar_tr.update(it + 1, [('Gen', gen_pos), ('Copy', cpy_pos)])
grs = set(grs)
crs = set(crs)
irs = set.intersection(grs, crs)
logger.info('\nTraining Accuracy:' +
'\tGene-Mode: {0}/{1} = {2}%'.format(gen_pos, gen, 100 * gen_pos/float(gen)) +
'\tCopy-Mode: {0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos/float(cpy)) +
'\tGene-Rule: {0}, Copy-Rule: {1}, Intersection: {2}'.format(len(grs), len(crs), len(irs)))
print 'Generate Mode:'
for r in grs:
print r
print 'Copy Mode:'
for r in crs:
print r
print 'Intersection:'
for r in irs:
print r
progbar_ts = Progbar(2000)
print '\n' + '__' * 50
gen, gen_pos = 0, 0
cpy, cpy_pos = 0, 0
grs, crs = [], []
for it, idx in enumerate(ts_idx):
test_s, test_t, rid, rule = test_data_plain[idx]
c = agent.analyse_(np.asarray(test_s, dtype='int32'),
np.asarray(test_t, dtype='int32'),
idx2word)
if c[1] == 0:
# generation mode
gen += 1
gen_pos += c[0]
grs += [rule]
else:
# copy mode
cpy += 1
cpy_pos += c[0]
crs += [rule]
progbar_ts.update(it + 1, [('Gen', gen_pos), ('Copy', cpy_pos)])
grs = set(grs)
crs = set(crs)
irs = set.intersection(grs, crs)
logger.info('\nTesting Accuracy:' +
'\tGene-Mode: {0}/{1} = {2}%'.format(gen_pos, gen, 100 * gen_pos/float(gen)) +
'\tCopy-Mode: {0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos/float(cpy)) +
'\tGene-Rule: {0}, Copy-Rule: {1}, Intersection: {2}'.format(len(grs), len(crs), len(irs)))
print 'Generate Mode:'
for r in grs:
print r
print 'Copy Mode:'
for r in crs:
print r
print 'Intersection:'
for r in irs:
print r
================================================
FILE: experiments/synthetic.py
================================================
__author__ = 'jiataogu'
from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file
import numpy.random as n_rng
n_rng.seed(19920206)
# the vocabulary
tmp = [chr(x) for x in range(48, 58)] # '0', '1', ..., '9'
voc = [tmp[a] + tmp[b] + tmp[c]
for c in xrange(10)
for b in xrange(10)
for a in xrange(10)]
word2idx = {voc[k]: k + 2 for k in xrange(len(voc))}
word2idx['<eol>'] = 0
word2idx['<unk>'] = 1
idx2word = {word2idx[w]: w for w in word2idx}
voc = ['<eol>', '<unk>'] + voc
# word2idx['X'] = len(voc)
# idx2word[len(voc)] = 'X'
# voc += ['X']
#
# word2idx['Y'] = len(voc)
# idx2word[len(voc)] = 'Y'
# voc += ['Y']
# print word2idx['X'], word2idx['Y']
# load the dataset
Rules, _ = deserialize_from_file('/home/thoma/Work/Dial-DRL/dataset/rules.rnd.n10k.pkl')
num = 200
repeats = 100
maxlen = 15
Lmax = len(idx2word)
rules = dict(source=Rules['source'][:num],
target=Rules['target'][:num])
def ftr(v):
if v < 10:
return '00' + str(v)
elif v < 100:
return '0' + str(v)
else:
return str(v)
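# ftr zero-pads a value to three digits; equivalent one-liners are
# '%03d' % v and str(v).zfill(3):
# >>> ftr(7) == '%03d' % 7 == '007'
# True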
def build_instance():
instance = dict(x=[], y=[], source=[], target=[], target_c=[], rule_id=[], rule=[])
for k in xrange(num):
source = rules['source'][k]
target = rules['target'][k]
for j in xrange(repeats):
X = n_rng.randint(1000, size=n_rng.randint(maxlen) + 1)
Y = n_rng.randint(1000, size=n_rng.randint(maxlen) + 1)
S = []
T = []
for w in source:
if w == 'X':
S += [ftr(v) for v in X]
elif w == 'Y':
S += [ftr(v) for v in Y]
else:
S += [w]
for w in target:
if w == 'X':
T += [ftr(v) for v in X]
elif w == 'Y':
T += [ftr(v) for v in Y]
else:
T += [w]
A = [word2idx[w] for w in S]
B = [word2idx[w] for w in T]
C = [0 if w not in S else S.index(w) + Lmax for w in T]
instance['x'] += [S]
instance['y'] += [T]
instance['source'] += [A]
instance['target'] += [B]
instance['target_c'] += [C]
instance['rule_id'] += [k]
instance['rule'] += [' '.join(source) + ' -> ' + ' '.join(target)]
return instance
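# A hypothetical expansion: for a rule 'X -> X 123' with the random draw
# X = [7, 42], one instance is
# S = ['007', '042'], T = ['007', '042', '123']
# target_c = [0 + Lmax, 1 + Lmax, 0]
# ('007' and '042' can be copied from source positions 0 and 1; '123' must
# be generated from the vocabulary).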
train_set = build_instance()
print 'build ok.'
test_set = build_instance()
print 'build ok.'
serialize_to_file([train_set, test_set, idx2word, word2idx], '/home/thoma/Work/Dial-DRL/dataset/synthetic_data_c.pkl')
# serialize_to_file([train_set, test_set], '/home/thoma/Work/Dial-DRL/dataset/synthetic_data.pkl')
================================================
FILE: experiments/weibo_dataset.py
================================================
# coding=utf-8
from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file
word2idx = dict()
wordfreq = dict()
word2idx['<eol>'] = 0
word2idx['<unk>'] = 1
# segment = False # True
# training set
pairs = []
f = open('./dataset/weibo/co-occur.txt', 'r')
line = f.readline().strip().decode('utf-8')
at = 2
lines = 0
while line:
post = line
post = [w.strip() for w in post.split() if len(w.strip()) > 0]
# if segment:
# summary = [w for w in jb.cut(summary)]
for w in post:
if w not in wordfreq:
wordfreq[w] = 1
else:
wordfreq[w] += 1
# if w not in word2idx:
# word2idx[w] = at
# at += 1
text = f.readline().strip().decode('utf-8')
text = [w.strip() for w in text.split() if len(w.strip()) > 0]
# if segment:
# text = [w for w in jb.cut(text)]
for w in text:
if w not in wordfreq:
wordfreq[w] = 1
else:
wordfreq[w] += 1
# if w not in word2idx:
# word2idx[w] = at
# at += 1
pair = (post, text)
pairs.append(pair)
lines += 1
if lines % 20000 == 0:
print lines
f.readline()
line = f.readline().strip().decode('utf-8')
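# (Format inferred from the reads above: co-occur.txt stores each pair as a
# post line and a text line, followed by one separator line that is read and
# discarded.)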
# sort the vocabulary
wordfreq = sorted(wordfreq.items(), key=lambda a: a[1], reverse=True)
for w in wordfreq:
word2idx[w[0]] = at
at += 1
idx2word = dict()
for v, k in word2idx.items():
idx2word[k] = v
Lmax = len(idx2word)
print 'read dataset ok.'
print Lmax
for i in xrange(Lmax):
print idx2word[i].encode('utf-8')
# use character-based model [on]
# use word-based model [off]
def build_data(data):
instance = dict(text=[], summary=[], source=[], target=[], target_c=[])
for pair in data:
source, target = pair
A = [word2idx[w] for w in source]
B = [word2idx[w] for w in target]
# C = np.asarray([[w == l for w in source] for l in target], dtype='float32')
C = [0 if w not in source else source.index(w) + Lmax for w in target]
instance['text'] += [source]
instance['summary'] += [target]
instance['source'] += [A]
instance['target'] += [B]
# instance['cc_matrix'] += [C]
instance['target_c'] += [C]
print instance['target'][5000]
print instance['target_c'][5000]
return instance
train_set = build_data(pairs[10000:])
test_set = build_data(pairs[:10000])
serialize_to_file([train_set, test_set, idx2word, word2idx], './dataset/weibo_data-word-cooc.pkl')
================================================
FILE: experiments/weibo_vest.py
================================================
"""
This is the implementation of CopyNet.
We start from the basic Seq2seq framework for an auto-encoder.
"""
import logging
import time
import numpy as np
import sys
import copy
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from experiments.config import setup_lcsts, setup_weibo
from emolga.utils.generic_utils import *
from emolga.models.covc_encdec import NRM
from emolga.models.encdec import NRM as NRM0
from emolga.dataset.build_dataset import deserialize_from_file
from collections import OrderedDict
from fuel import datasets
from fuel import transformers
from fuel import schemes
setup = setup_weibo
def init_logging(logfile):
formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
datefmt='%m/%d/%Y %H:%M:%S' )
fh = logging.FileHandler(logfile)
# ch = logging.StreamHandler()
fh.setFormatter(formatter)
# ch.setFormatter(formatter)
# fh.setLevel(logging.INFO)
# ch.setLevel(logging.INFO)
# logging.getLogger().addHandler(ch)
logging.getLogger().addHandler(fh)
logging.getLogger().setLevel(logging.INFO)
return logging
# prepare logging.
tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
config = setup() # load settings.
for w in config:
print '{0}={1}'.format(w, config[w])
logger = init_logging(config['path_log'] + '/experiments.CopyWeibo.id={}.log'.format(tmark))
n_rng = np.random.RandomState(config['seed'])
np.random.seed(config['seed'])
rng = RandomStreams(n_rng.randint(2 ** 30))
logger.info('Start!')
train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset'])
if config['voc_size'] == -1: # do not use <unk>
config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1
config['dec_voc_size'] = config['enc_voc_size']
else:
config['enc_voc_size'] = config['voc_size']
config['dec_voc_size'] = config['enc_voc_size']
samples = len(train_set['source'])
logger.info('build dataset done. ' +
'dataset size: {} ||'.format(samples) +
'vocabulary size = {0}/ batch size = {1}'.format(
config['dec_voc_size'], config['batch_size']))
def build_data(data):
# create fuel dataset.
dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']),
('target', data['target']),
('target_c', data['target_c']),
]))
dataset.example_iteration_scheme \
= schemes.ShuffledExampleScheme(dataset.num_examples)
return dataset
train_data = build_data(train_set)
train_data_plain = zip(*(train_set['source'], train_set['target']))
test_data_plain = zip(*(test_set['source'], test_set['target']))
train_size = len(train_data_plain)
test_size = len(test_data_plain)
tr_idx = n_rng.permutation(train_size)[:2000].tolist()
ts_idx = n_rng.permutation(test_size )[:2000].tolist()
logger.info('load the data ok.')
notrain = False
# build the agent
if config['copynet']:
agent = NRM(config, n_rng, rng, mode=config['mode'],
use_attention=True, copynet=config['copynet'], identity=config['identity'])
else:
agent = NRM0(config, n_rng, rng, mode=config['mode'],
use_attention=True, copynet=config['copynet'], identity=config['identity'])
agent.build_()
if notrain:
agent.compile_('display')
else:
agent.compile_('all')
print 'compile ok.'
echo = 0
epochs = 10
if echo > 0:
tmark = '20160227-164324' # copynet multi-source model
agent.load(config['path_h5'] + '/experiments.CopyWeibo.id={0}.epoch={1}.pkl'.format(tmark, echo))
while echo < epochs:
echo += 1
loss = []
def output_stream(dataset, batch_size, size=1):
data_stream = dataset.get_example_stream()
data_stream = transformers.Batch(data_stream,
iteration_scheme=schemes.ConstantScheme(batch_size))
# add padding and masks to the dataset
data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target'))
return data_stream
def prepare_batch(batch, mask, fix_len=None):
data = batch[mask].astype('int32')
data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1)
def cut_zeros(data, fix_len=None):
if fix_len is not None:
return data[:, : fix_len]
for k in range(data.shape[1] - 1, 0, -1):
data_col = data[:, k].sum()
if data_col > 0:
return data[:, : k + 2]
return data
data = cut_zeros(data, fix_len)
return data
def cc_matrix(source, target):
cc = np.zeros((source.shape[0], target.shape[1], source.shape[1]), dtype='float32')
for k in xrange(source.shape[0]):
for j in xrange(target.shape[1]):
for i in xrange(source.shape[1]):
if (source[k, i] == target[k, j]) and (source[k, i] > 0):
cc[k][j][i] = 1.
return cc
def unk_filter(data):
if config['voc_size'] == -1:
return copy.copy(data)
else:
mask = (np.less(data, config['voc_size'])).astype(dtype='int32')
data = copy.copy(data * mask + (1 - mask))
return data
# training
train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True)
logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo))
progbar = Progbar(train_size / config['batch_size'])
for it, batch in enumerate(train_batches):
# obtain data
if not notrain:
data_s = prepare_batch(batch, 'source')
data_t = prepare_batch(batch, 'target')
if config['copynet']:
data_c = cc_matrix(data_s, data_t)
# data_c = prepare_batch(batch, 'target_c', data_t.shape[1])
loss += [agent.train_(unk_filter(data_s), unk_filter(data_t), data_c)]
else:
loss += [agent.train_(unk_filter(data_s), unk_filter(data_t))]
progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])])
if it % 200 == 0:
logger.info('Iter={} Evaluation Sampling.'.format(it))
logger.info('generating [training set] samples')
for _ in xrange(5):
idx = int(np.floor(n_rng.rand() * train_size))
train_s, train_t = train_data_plain[idx]
v = agent.evaluate_(np.asarray(train_s, dtype='int32'),
np.asarray(train_t, dtype='int32'),
idx2word,
np.asarray(unk_filter(train_s), dtype='int32'),
encode=config['utf-8'])
print '*' * 50
logger.info('generating [testing set] samples')
for _ in xrange(5):
idx = int(np.floor(n_rng.rand() * test_size))
test_s, test_t = test_data_plain[idx]
v = agent.evaluate_(np.asarray(test_s, dtype='int32'),
np.asarray(test_t, dtype='int32'),
idx2word,
np.asarray(unk_filter(test_s), dtype='int32'),
encode=config['utf-8'])
print '*' * 50
if it % 10000 == 0:
# save the weights.
agent.save(config['path_h5'] + '/experiments.CopyWeibo.id={0}.epoch={1}.pkl'.format(tmark, echo))
# # test accuracy
# progbar_tr = Progbar(2000)
#
# print '\n' + '__' * 50
# gen, gen_pos = 0, 0
# cpy, cpy_pos = 0, 0
# for it, idx in enumerate(tr_idx):
# train_s, train_t = train_data_plain[idx]
#
# c = agent.analyse_(np.asarray(train_s, dtype='int32'),
# np.asarray(train_t, dtype='int32'),
# idx2word)
# if c[1] == 0:
# # generation mode
# gen += 1
# gen_pos += c[0]
# else:
# # copy mode
# cpy += 1
# cpy_pos += c[0]
#
# progbar_tr.update(it + 1, [('Gen', gen_pos), ('Copy', cpy_pos)])
#
# logger.info('\nTraining Accuracy:' +
# '\tGene-Mode: {0}/{1} = {2}%'.format(gen_pos, gen, 100 * gen_pos/float(gen)) +
# '\tCopy-Mode: {0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos/float(cpy)))
#
# progbar_ts = Progbar(2000)
# print '\n' + '__' * 50
# gen, gen_pos = 0, 0
# cpy, cpy_pos = 0, 0
# for it, idx in enumerate(ts_idx):
# test_s, test_t = test_data_plain[idx]
# c = agent.analyse_(np.asarray(test_s, dtype='int32'),
# np.asarray(test_t, dtype='int32'),
# idx2word)
# if c[1] == 0:
# # generation mode
# gen += 1
# gen_pos += c[0]
# else:
# # copy mode
# cpy += 1
# cpy_pos += c[0]
#
# progbar_ts.update(it + 1, [('Gen', gen_pos), ('Copy', cpy_pos)])
#
# logger.info('\nTesting Accuracy:' +
# '\tGene-Mode: {0}/{1} = {2}%'.format(gen_pos, gen, 100 * gen_pos/float(gen)) +
# '\tCopy-Mode: {0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos/float(cpy)))