Repository: white127/QA-deep-learning
Branch: master
Commit: 94971ec4b305
Files: 20
Total size: 111.3 KB
Directory structure:
gitextract_f17xp5p4/
├── README.md
├── cnn/
│ ├── tensorflow/
│ │ ├── README.md
│ │ ├── insqa_cnn.py
│ │ ├── insqa_cnn.py.old
│ │ ├── insqa_train.py
│ │ ├── insqa_train.py.old
│ │ ├── insurance_qa_data_helpers.py
│ │ └── test.py
│ └── theano/
│ ├── README.md
│ └── insqa_cnn.py
├── config.py
├── gen.py
├── lstm_cnn/
│ └── theano/
│ ├── README.md
│ └── insqa_lstm.py
├── rnn_attention/
│ └── tensorflow/
│ ├── insurance_qa_data_helpers.py
│ └── tf_rnn_char.py
├── swem/
│ ├── swem_hier.py
│ ├── swem_hier_margin.py
│ └── swem_max_margin.py
└── utils.py
================================================
FILE CONTENTS
================================================
================================================
FILE: README.md
================================================
Insurance-QA deeplearning model
======
This is a repo for Q&A Matching. It includes several deep learning models, such as CNN and RNN.
1. CNN. Basic CNN model from 《Applying Deep Learning To Answer Selection: A Study And An Open Task》
2. RNN. RNN seems the best model on Insurance-QA dataset.
3. SWEM. SWEM is the fastest and works well on other datasets such as WikiQA, but it does not seem to perform as well on the Insurance-QA dataset. I think SWEM is more suitable for Q&Q matching than for Q&A matching.
It's hard to say which model is the best on other datasets; you have to choose the model that is most suitable for your data.
More models are on the way, pay attention to the updates.
## Requirements
1. tensorflow 1.4.0
2. python3.5
## Performance
margin loss version
Model/Score | Ins_qa_top1_precision | quora_best_prec
------------ | ------------- | -------------
CNN | 62% | None
LSTM+CNN | 68% | None
SWEM | <55% | None
logloss version
Model/Score | Insqa_top1_precision | quora_best_prec
------------ | ------------- | -------------
CNN | None | 79.60%
LSTM+CNN | None | None
SWEM | <40% | 82.69%
## Running
Change the configuration to match your own environment, e.g. the data paths
vim config.py
Data processing
python3 gen.py
Run CNN model
cd ./cnn/tensorflow && python3 insqa_train.py
It will take a few hours (thousands of epochs) to train this model on a single GPU.
## Downloads
1. You can get Insurance-QA data from here https://github.com/shuzi/insuranceQA
2. You can get Quora data from here http://qim.ec.quoracdn.net/quora_duplicate_questions.tsv
## Links
1. CNN and RNN textual classification repo https://github.com/white127/TextClassification_CNN_RNN
2. 《Applying Deep Learning To Answer Selection: A Study And An Open Task》
================================================
FILE: cnn/tensorflow/README.md
================================================
================result==================
结果和theano版本的差不多,具体数值忘了
虽然代码里写了dropout,但是实际并没有使用,dropout对结果影响不是特别大,不用dropout的话训练速度要快一些。
================dataset================
数据格式和theano版本的是一样的
github上给出的是样本数据,如果需要全量的,也可直接联系我
dataset is large, only test1 sample is given (see ./insuranceQA/test1.sample)
I converted original idx_xx format to real-word format (see ./insuranceQA/train ./insuranceQA/test1.sample)
you can get the original dataset from https://github.com/shuzi/insuranceQA
word embedding is trained by word2vec toolkit
=================run=====================
./insqa_train.py
我使用的是python3.4,部分代码可能会和python2不兼容,如使用python2需要自己做一些小修改,核心的CNN代码应该
不用改动的
代码里的数据路径(类似'/export/...')是需要根据自己的环境修改的,指向自己的数据路径即可。核心的CNN代码无需改动
================================================
FILE: cnn/tensorflow/insqa_cnn.py
================================================
import tensorflow as tf
import numpy as np
##########################################################################
# embedding_lookup + cnn + cosine margine , batch
##########################################################################
class InsQACNN(object):
    """Siamese text CNN trained with a cosine-margin (hinge) loss.

    Three batches of token ids -- a question ``q``, a positive candidate
    ``qp`` and a negative candidate ``qn`` -- share one embedding matrix and
    one set of convolution filters.  Each input is reduced to a max-pooled
    feature vector, the cosine similarities ``cos(q, qp)`` and ``cos(q, qn)``
    are computed, and the loss is
    ``sum(max(0, margin - (cos(q, qp) - cos(q, qn))))`` over the batch.
    """

    def __init__(self, _margin, sequence_length, batch_size,
            vocab_size, embedding_size,
            filter_sizes, num_filters, l2_reg_lambda=0.0):
        """Build the whole graph (placeholders, network, loss, accuracy).

        Args:
            _margin: margin used in the hinge loss.
            sequence_length: number of token ids per input row.
            batch_size: fixed batch size baked into the placeholder shapes.
            vocab_size: number of rows of the embedding matrix.
            embedding_size: number of columns of the embedding matrix.
            filter_sizes: iterable of convolution filter heights.
            num_filters: number of filters per filter height.
            l2_reg_lambda: weight applied to ``l2_loss`` in the total loss.
        """
        # Short aliases used by conv(): Length, Batch, Vocab, Embedding,
        # FilterSizes, NumFilters.
        self.L, self.B, self.V, self.E, self.FS, self.NF = sequence_length, batch_size, \
                vocab_size, embedding_size, filter_sizes, num_filters

        # User question; token vectors are obtained via embedding_lookup
        self.q = tf.placeholder(tf.int32, [self.B, self.L], name="q")
        # Positive candidate to be matched against the question
        self.qp = tf.placeholder(tf.int32, [self.B, self.L], name="qp")
        # Negative candidate
        self.qn = tf.placeholder(tf.int32, [self.B, self.L], name="qn")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
        # NOTE(review): l2_loss is never added to anywhere in this class, so
        # the l2_reg_lambda term of the loss below is always zero.
        l2_loss = tf.constant(0.0)

        # Embedding layer
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            W = tf.get_variable(
                initializer=tf.random_uniform([self.V, self.E], -1.0, 1.0),
                name='We')
            self.qe = tf.nn.embedding_lookup(W, self.q)
            self.qpe = tf.nn.embedding_lookup(W, self.qp)
            self.qne = tf.nn.embedding_lookup(W, self.qn)
            # Add a trailing channel axis so the tensors can be fed to conv2d.
            self.qe = tf.expand_dims(self.qe, -1)
            self.qpe = tf.expand_dims(self.qpe, -1)
            self.qne = tf.expand_dims(self.qne, -1)

        # The first conv() call creates the filter variables; after
        # reuse_variables() the following calls fetch the same variables, so
        # all three inputs go through identical filters.
        with tf.variable_scope('shared-conv') as scope:
            self.qe = self.conv(self.qe)
            scope.reuse_variables()
            #tf.get_variable_scope().reuse_variables()
            self.qpe = self.conv(self.qpe)
            scope.reuse_variables()
            #tf.get_variable_scope().reuse_variables()
            self.qne = self.conv(self.qne)
        self.cos_q_qp = self.cosine(self.qe, self.qpe)
        self.cos_q_qn = self.cosine(self.qe, self.qne)
        zero = tf.constant(0, shape=[self.B], dtype=tf.float32)
        margin = tf.constant(_margin, shape=[self.B], dtype=tf.float32)
        with tf.name_scope("loss"):
            # Per-example hinge: max(0, margin - (cos(q, qp) - cos(q, qn))).
            self.losses = tf.maximum(zero, tf.subtract(margin, tf.subtract(self.cos_q_qp, self.cos_q_qn)))
            self.loss = tf.reduce_sum(self.losses) + l2_reg_lambda * l2_loss
            print('loss ', self.loss)

        # Accuracy
        with tf.name_scope("accuracy"):
            # An example counts as correct when its hinge loss is exactly zero.
            self.correct = tf.equal(zero, self.losses)
            self.accuracy = tf.reduce_mean(tf.cast(self.correct, "float"), name="accuracy")
        for v in tf.trainable_variables():
            print(v)

    def conv(self, tensor):
        """Convolve ``tensor`` with every filter size and max-pool over positions.

        For each filter height in ``self.FS`` a VALID convolution spanning the
        full embedding width is applied, followed by ReLU and a max-pool whose
        window covers all ``self.L - fs + 1`` output positions.  The pooled
        outputs are concatenated, flattened to ``[-1, NF * len(FS)]`` and
        passed through dropout.

        Returns:
            The flattened, dropout-applied feature tensor.
        """
        pooled = []
        #with tf.variable_scope(name_or_scope='my-conv', reuse=tf.AUTO_REUSE):
        with tf.variable_scope("my-conv-shared"):
            for i, fs in enumerate(self.FS):
                filter_shape = [fs, self.E, 1, self.NF]
                # Variables are named after the filter size so that repeated
                # calls under a reusing scope resolve to the same weights.
                W = tf.get_variable(initializer=tf.truncated_normal(filter_shape, stddev=0.1),
                        name="W-%s" % str(fs))
                b = tf.get_variable(initializer=tf.constant(0.1, shape=[self.NF]),
                        name="b-%s" % str(fs))
                conv = tf.nn.conv2d(
                    tensor, W, strides=[1, 1, 1, 1], padding='VALID',
                    name="conv")
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                output = tf.nn.max_pool(
                    h, ksize=[1, self.L - fs + 1, 1, 1], strides=[1, 1, 1, 1], padding='VALID',
                    name="pool")
                pooled.append(output)
            num_filters_total = self.NF * len(self.FS)
            pooled = tf.reshape(tf.concat(pooled, 3), [-1, num_filters_total])
            pooled = tf.nn.dropout(pooled, self.dropout_keep_prob)
        return pooled

    def cosine(self, v1, v2):
        """Return the row-wise cosine similarity of ``v1`` and ``v2``.

        The result is clipped to the range [1e-5, 0.99999].
        """
        # Row norms and row-wise dot product (reduction over axis 1).
        l1 = tf.sqrt(tf.reduce_sum(tf.multiply(v1, v1), 1))
        l2 = tf.sqrt(tf.reduce_sum(tf.multiply(v2, v2), 1))
        a = tf.reduce_sum(tf.multiply(v1, v2), 1)
        # NOTE(review): there is no guard against l1 * l2 == 0 here; an
        # all-zero row would divide by zero -- confirm this cannot happen.
        cos = tf.div(a, tf.multiply(l1, l2), name='score')
        return tf.clip_by_value(cos, 1e-5, 0.99999)
================================================
FILE: cnn/tensorflow/insqa_cnn.py.old
================================================
import tensorflow as tf
import numpy as np
##########################################################################
# embedding_lookup + cnn + cosine margine , batch
##########################################################################
class InsQACNN1(object):
    """Earlier version of the cosine-margin CNN with the three branches unrolled.

    A question (``input_x_1``), a positive candidate (``input_x_2``) and a
    negative candidate (``input_x_3``) share one embedding matrix and, per
    filter size, one ``W``/``b`` pair.  The loss is
    ``sum(max(0, 0.05 - (cos_12 - cos_13)))`` over the batch.
    """

    def __init__(
      self, sequence_length, batch_size,
      vocab_size, embedding_size,
      filter_sizes, num_filters, l2_reg_lambda=0.0):
        """Build the whole graph.

        Args:
            sequence_length: number of token ids per input row.
            batch_size: fixed batch size baked into the placeholder shapes.
            vocab_size: number of rows of the embedding matrix.
            embedding_size: number of columns of the embedding matrix.
            filter_sizes: iterable of convolution filter heights.
            num_filters: number of filters per filter height.
            l2_reg_lambda: weight applied to ``l2_loss`` in the total loss.
        """

        # User question; token vectors are obtained via embedding_lookup
        self.input_x_1 = tf.placeholder(tf.int32, [batch_size, sequence_length], name="input_x_1")
        # Positive candidate to be matched against the question
        self.input_x_2 = tf.placeholder(tf.int32, [batch_size, sequence_length], name="input_x_2")
        # Negative candidate
        self.input_x_3 = tf.placeholder(tf.int32, [batch_size, sequence_length], name="input_x_3")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
        # NOTE(review): l2_loss is never added to in this class, so the
        # l2_reg_lambda term of the loss below is always zero.
        l2_loss = tf.constant(0.0)
        print("input_x_1 ", self.input_x_1)

        # Embedding layer
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            W = tf.Variable(
                tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
                name="W")
            chars_1 = tf.nn.embedding_lookup(W, self.input_x_1)
            chars_2 = tf.nn.embedding_lookup(W, self.input_x_2)
            chars_3 = tf.nn.embedding_lookup(W, self.input_x_3)
            #self.embedded_chars_1 = tf.nn.dropout(chars_1, self.dropout_keep_prob)
            #self.embedded_chars_2 = tf.nn.dropout(chars_2, self.dropout_keep_prob)
            #self.embedded_chars_3 = tf.nn.dropout(chars_3, self.dropout_keep_prob)
            self.embedded_chars_1 = chars_1
            self.embedded_chars_2 = chars_2
            self.embedded_chars_3 = chars_3
            # Add a trailing channel axis so the tensors can be fed to conv2d.
            self.embedded_chars_expanded_1 = tf.expand_dims(self.embedded_chars_1, -1)
            self.embedded_chars_expanded_2 = tf.expand_dims(self.embedded_chars_2, -1)
            self.embedded_chars_expanded_3 = tf.expand_dims(self.embedded_chars_3, -1)

        pooled_outputs_1 = []
        pooled_outputs_2 = []
        pooled_outputs_3 = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                # One W/b pair per filter size, applied to all three inputs.
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                # Branch 1: question
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded_1,
                    W,
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="conv-1"
                )
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu-1")
                # The pooling window covers every position of the conv output.
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="poll-1"
                )
                pooled_outputs_1.append(pooled)

                # Branch 2: positive candidate
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded_2,
                    W,
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="conv-2"
                )
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu-2")
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="poll-2"
                )
                pooled_outputs_2.append(pooled)

                # Branch 3: negative candidate
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded_3,
                    W,
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="conv-3"
                )
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu-3")
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="poll-3"
                )
                pooled_outputs_3.append(pooled)
        num_filters_total = num_filters * len(filter_sizes)
        pooled_reshape_1 = tf.reshape(tf.concat(pooled_outputs_1, 3), [-1, num_filters_total])
        pooled_reshape_2 = tf.reshape(tf.concat(pooled_outputs_2, 3), [-1, num_filters_total])
        pooled_reshape_3 = tf.reshape(tf.concat(pooled_outputs_3, 3), [-1, num_filters_total])
        #dropout
        pooled_flat_1 = tf.nn.dropout(pooled_reshape_1, self.dropout_keep_prob)
        pooled_flat_2 = tf.nn.dropout(pooled_reshape_2, self.dropout_keep_prob)
        pooled_flat_3 = tf.nn.dropout(pooled_reshape_3, self.dropout_keep_prob)

        pooled_len_1 = tf.sqrt(tf.reduce_sum(tf.multiply(pooled_flat_1, pooled_flat_1), 1))  # vector norms, computed per batch row
        pooled_len_2 = tf.sqrt(tf.reduce_sum(tf.multiply(pooled_flat_2, pooled_flat_2), 1))
        pooled_len_3 = tf.sqrt(tf.reduce_sum(tf.multiply(pooled_flat_3, pooled_flat_3), 1))
        pooled_mul_12 = tf.reduce_sum(tf.multiply(pooled_flat_1, pooled_flat_2), 1)  # dot products, computed per batch row
        pooled_mul_13 = tf.reduce_sum(tf.multiply(pooled_flat_1, pooled_flat_3), 1)

        with tf.name_scope("output"):
            self.cos_12 = tf.div(pooled_mul_12, tf.multiply(pooled_len_1, pooled_len_2), name="scores")  # cosine of the angle, computed per batch row
            self.cos_13 = tf.div(pooled_mul_13, tf.multiply(pooled_len_1, pooled_len_3))

        zero = tf.constant(0, shape=[batch_size], dtype=tf.float32)
        # The margin is hard-coded to 0.05 in this version.
        margin = tf.constant(0.05, shape=[batch_size], dtype=tf.float32)
        with tf.name_scope("loss"):
            # Per-example hinge: max(0, margin - (cos_12 - cos_13)).
            self.losses = tf.maximum(zero, tf.subtract(margin, tf.subtract(self.cos_12, self.cos_13)))
            self.loss = tf.reduce_sum(self.losses) + l2_reg_lambda * l2_loss
            print('loss ', self.loss)

        # Accuracy
        with tf.name_scope("accuracy"):
            # An example counts as correct when its hinge loss is exactly zero.
            self.correct = tf.equal(zero, self.losses)
            self.accuracy = tf.reduce_mean(tf.cast(self.correct, "float"), name="accuracy")

        for v in tf.trainable_variables():
            print(v)
        # NOTE(review): this terminates the process with status 1 right after
        # the variables are printed, so the model can never be trained.  It
        # looks like debugging residue -- confirm before reviving this file.
        exit(1)
================================================
FILE: cnn/tensorflow/insqa_train.py
================================================
#! /usr/bin/env python3.4
import tensorflow as tf
import numpy as np
import os, time, datetime, operator, sys
from insqa_cnn import InsQACNN
sys.path.append('../../')
import config, utils
print(tf.__version__)

# Parameters
# ==================================================
# NOTE: the "(default: ...)" text in each help string below mirrors the
# actual default value passed as the second argument.

# Model Hyperparameters
tf.flags.DEFINE_float("margin", 0.05, "CNN model margin (default: 0.05)")
tf.flags.DEFINE_integer("sequence_length", 200, "Max sequence length (default: 200)")
tf.flags.DEFINE_integer("embedding_dim", 100, "Dimensionality of character embedding (default: 100)")
tf.flags.DEFINE_string("filter_sizes", "1,2,3,5", "Comma-separated filter sizes (default: '1,2,3,5')")
tf.flags.DEFINE_integer("num_filters", 256, "Number of filters per filter size (default: 256)")
tf.flags.DEFINE_float("dropout_keep_prob", 1.0, "Dropout keep probability (default: 1.0)")
tf.flags.DEFINE_float("l2_reg_lambda", 0, "L2 regularization lambda (default: 0.0)")

# Training parameters
tf.flags.DEFINE_integer("batch_size", 256, "Batch Size (default: 256)")
# The training loop runs one batch per iteration of range(num_epochs).
tf.flags.DEFINE_integer("num_epochs", 5000000, "Number of training iterations, one batch each (default: 5000000)")
tf.flags.DEFINE_integer("evaluate_every", 3000, "Evaluate model on test set after this many steps (default: 3000)")
tf.flags.DEFINE_integer("checkpoint_every", 3000, "Save model after this many steps (default: 3000)")
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
# NOTE(review): _parse_flags() and __flags are private tf.flags attributes;
# they are kept because the README pins tensorflow 1.4.0 -- verify before
# upgrading TensorFlow.
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
vocab, embeddings = utils.load_embeddings()
train_data = utils.load_train_data(vocab, FLAGS.sequence_length)
test_data = utils.load_test_data(vocab, FLAGS.sequence_length)
print("Load done...")
# Training
# ==================================================
# AUC returned by the most recent evaluation (only read by the disabled
# early-stopping check in the training loop).
prev_auc = 0
with tf.Graph().as_default():
    with tf.device("/gpu:1"):
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = InsQACNN(
                _margin=FLAGS.margin,
                sequence_length=FLAGS.sequence_length,
                batch_size=FLAGS.batch_size,
                vocab_size=len(vocab),
                embedding_size=FLAGS.embedding_dim,
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                num_filters=FLAGS.num_filters,
                l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-1)
            #optimizer = tf.train.GradientDescentOptimizer(1e-2)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            # Pass the Graph itself; handing a GraphDef to FileWriter is deprecated.
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # Dev summaries (created for parity with the train writer; nothing
            # below writes to them)
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            # tf.global_variables / tf.global_variables_initializer replace the
            # deprecated tf.all_variables / tf.initialize_all_variables.
            saver = tf.train.Saver(tf.global_variables())

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            def train_step(q, qp, qn):
                """Run one optimisation step on a (question, positive, negative) batch."""
                feed_dict = {
                    cnn.q: q, cnn.qp: qp, cnn.qn: qn,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run(
                    [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                train_summary_writer.add_summary(summaries, step)

            def test_step():
                """Score the whole test set, write the predictions and return the AUC.

                One score per test row is written to ``config.predict1_file``,
                followed by a final line holding the top-1 precision.
                """
                yp, y, group = [], [], []
                for i in range(0, len(test_data), FLAGS.batch_size):
                    f, g, q1, q2 = utils.gen_test_batch_qpn(test_data, i, i+FLAGS.batch_size)
                    # Only cos(q, qp) is read, so the candidate is fed to both
                    # the positive and the negative input.
                    feed_dict = {
                        cnn.q: q1, cnn.qp: q2, cnn.qn: q2,
                        cnn.dropout_keep_prob: 1.0
                    }
                    cos = sess.run([cnn.cos_q_qp], feed_dict)
                    yp.extend(cos[0])
                    y.extend(f)
                    group.extend(g)

                # Keep exactly one entry per test row.
                n = len(test_data)
                y, g, yp = y[:n], group[:n], yp[:n]
                auc = utils.eval_auc(y, g, yp)
                top1_prec = utils._eval_top1_prec(y, g, yp)
                # The context manager closes the file even if a write fails.
                with open(config.predict1_file, 'w') as of:
                    for p in yp:
                        of.write(str(p) + '\n')
                    of.write(str(top1_prec) + '\n')
                return auc

            # Generate batches
            # Training loop. For each batch...
            for i in range(FLAGS.num_epochs):
                try:
                    q, qp, qn = utils.gen_train_batch_qpn(train_data, FLAGS.batch_size)
                    train_step(q, qp, qn)
                    current_step = tf.train.global_step(sess, global_step)
                    if current_step % FLAGS.evaluate_every == 0:
                        auc = test_step()
                        #if auc < prev_auc: break
                        prev_auc = auc
                    if current_step % FLAGS.checkpoint_every == 0:
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        print("Saved model checkpoint to {}\n".format(path))
                except Exception as e:
                    # Best effort: report the failure and continue with the
                    # next batch instead of aborting the whole run.
                    print(e)
================================================
FILE: cnn/tensorflow/insqa_train.py.old
================================================
#! /usr/bin/env python3.4
import tensorflow as tf
import numpy as np
import os
import time
import datetime
import insurance_qa_data_helpers
from insqa_cnn import InsQACNN1
import operator
#print tf.__version__

# Parameters
# ==================================================
# NOTE: the "(default: ...)" text in each help string below mirrors the
# actual default value passed as the second argument.

# Model Hyperparameters
tf.flags.DEFINE_integer("embedding_dim", 100, "Dimensionality of character embedding (default: 100)")
tf.flags.DEFINE_string("filter_sizes", "1,2,3,5", "Comma-separated filter sizes (default: '1,2,3,5')")
tf.flags.DEFINE_integer("num_filters", 256, "Number of filters per filter size (default: 256)")
tf.flags.DEFINE_float("dropout_keep_prob", 1.0, "Dropout keep probability (default: 1.0)")
tf.flags.DEFINE_float("l2_reg_lambda", 0, "L2 regularization lambda (default: 0.0)")

# Training parameters
tf.flags.DEFINE_integer("batch_size", 100, "Batch Size (default: 100)")
# The training loop runs one batch per iteration of range(num_epochs).
tf.flags.DEFINE_integer("num_epochs", 5000000, "Number of training iterations, one batch each (default: 5000000)")
tf.flags.DEFINE_integer("evaluate_every", 5000, "Evaluate model on test set after this many steps (default: 5000)")
tf.flags.DEFINE_integer("checkpoint_every", 5000, "Save model after this many steps (default: 5000)")
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
# NOTE(review): _parse_flags() and __flags are private tf.flags attributes --
# verify they exist in the TensorFlow version in use.
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")

vocab = insurance_qa_data_helpers.build_vocab()
alist = insurance_qa_data_helpers.read_alist()
raw = insurance_qa_data_helpers.read_raw()
# This first batch is only used for its shape (it provides sequence_length
# when the model is built); the training loop draws fresh batches.
x_train_1, x_train_2, x_train_3 = insurance_qa_data_helpers.load_data_6(vocab, alist, raw, FLAGS.batch_size)
testList, vectors = insurance_qa_data_helpers.load_test_and_vectors()
# The loaded vectors are not used by this script; drop the reference.
vectors = ''
print('x_train_1', np.shape(x_train_1))
print("Load done...")

# Test file that is re-read during evaluation, and the file the evaluation
# counters are appended to.
val_file = '/export/jw/cnn/insuranceQA/test1'
precision = '/export/jw/cnn/insuranceQA/test1.acc'
#x_val, y_val = data_deepqa.load_data_val()
# Training
# ==================================================
with tf.Graph().as_default():
    with tf.device("/gpu:1"):
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = InsQACNN1(
                sequence_length=x_train_1.shape[1],
                batch_size=FLAGS.batch_size,
                vocab_size=len(vocab),
                embedding_size=FLAGS.embedding_dim,
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                num_filters=FLAGS.num_filters,
                l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-1)
            #optimizer = tf.train.GradientDescentOptimizer(1e-2)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            # Pass the Graph itself; handing a GraphDef to FileWriter is deprecated.
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # Dev summaries (created for parity with the train writer; nothing
            # below writes to them)
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            # tf.global_variables / tf.global_variables_initializer replace the
            # deprecated tf.all_variables / tf.initialize_all_variables.
            saver = tf.train.Saver(tf.global_variables())

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            def train_step(x_batch_1, x_batch_2, x_batch_3):
                """
                A single training step
                """
                feed_dict = {
                    cnn.input_x_1: x_batch_1,
                    cnn.input_x_2: x_batch_2,
                    cnn.input_x_3: x_batch_3,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run(
                    [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                train_summary_writer.add_summary(summaries, step)

            def dev_step():
                """Score the test list and count the labels of the top-ranked candidates.

                For every ``qid`` found in ``val_file`` the candidate with the
                highest score is selected; ``lev1`` counts the groups whose
                best candidate is labelled '1' and ``lev0`` those labelled
                '0'.  Both counters are appended to the ``precision`` file
                and printed.
                """
                # Score the test list batch by batch.
                scoreList = []
                i = int(0)
                while True:
                    x_test_1, x_test_2, x_test_3 = insurance_qa_data_helpers.load_data_val_6(testList, vocab, i, FLAGS.batch_size)
                    feed_dict = {
                        cnn.input_x_1: x_test_1,
                        cnn.input_x_2: x_test_2,
                        cnn.input_x_3: x_test_3,
                        cnn.dropout_keep_prob: 1.0
                    }
                    batch_scores = sess.run([cnn.cos_12], feed_dict)
                    scoreList.extend(batch_scores[0])
                    i += FLAGS.batch_size
                    if i >= len(testList):
                        break

                # Group (score, label) pairs by question id.  The context
                # manager makes sure the validation file is closed again.
                sessdict = {}
                index = int(0)
                with open(val_file) as vf:
                    for line in vf:
                        items = line.strip().split(' ')
                        qid = items[1].split(':')[1]
                        sessdict.setdefault(qid, []).append((scoreList[index], items[0]))
                        index += 1
                        if index >= len(testList):
                            break

                lev1 = float(0)
                lev0 = float(0)
                for v in sessdict.values():
                    v.sort(key=operator.itemgetter(0), reverse=True)
                    score, flag = v[0]
                    if flag == '1':
                        lev1 += 1
                    if flag == '0':
                        lev0 += 1
                # The context manager closes the file even if a write fails.
                with open(precision, 'a') as of:
                    of.write('lev1:' + str(lev1) + '\n')
                    of.write('lev0:' + str(lev0) + '\n')
                print('lev1 ' + str(lev1))
                print('lev0 ' + str(lev0))

            # Generate batches
            # Training loop. For each batch...
            for i in range(FLAGS.num_epochs):
                try:
                    x_batch_1, x_batch_2, x_batch_3 = insurance_qa_data_helpers.load_data_6(vocab, alist, raw, FLAGS.batch_size)
                    train_step(x_batch_1, x_batch_2, x_batch_3)
                    current_step = tf.train.global_step(sess, global_step)
                    if current_step % FLAGS.evaluate_every == 0:
                        print("\nEvaluation:")
                        dev_step()
                        print("")
                    if current_step % FLAGS.checkpoint_every == 0:
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        print("Saved model checkpoint to {}\n".format(path))
                except Exception as e:
                    # Best effort: report the failure and continue with the
                    # next batch instead of aborting the whole run.
                    print(e)
================================================
FILE: cnn/tensorflow/insurance_qa_data_helpers.py
================================================
import numpy as np
import random
# Constant float vectors shared by the helpers in this module.
# empty_vector (100 zeros) is what read_vector() returns for a word that has
# no vector; onevector / zerovector hold 10 ones / 10 zeros.
empty_vector = [0.0] * 100
onevector = [1.0] * 10
zerovector = [0.0] * 10
def build_vocab(paths=('/export/jw/cnn/insuranceQA/train',
                       '/export/jw/cnn/insuranceQA/test1')):
    """Build a word -> integer id mapping from the given data files.

    Every line is split on single spaces; the third and fourth fields are
    split on '_' into words.  Ids are assigned in order of first appearance,
    starting at 1; id 0 is reserved for the key 'UNKNOWN'.

    Args:
        paths: files to scan, in order.  Defaults to the train and test1
            files that were previously hard-coded.

    Returns:
        dict mapping each word (and 'UNKNOWN') to its id.

    Raises:
        IndexError: if a line has fewer than four space-separated fields.
    """
    vocab = {'UNKNOWN': 0}
    for path in paths:
        # The context manager closes each file once it has been scanned.
        with open(path) as corpus:
            for line in corpus:
                items = line.strip().split(' ')
                for field in (items[2], items[3]):
                    for word in field.split('_'):
                        if word not in vocab:
                            # Ids are contiguous, so the next id is the size.
                            vocab[word] = len(vocab)
    return vocab
def rand_qa(qalist):
    """Return one element of ``qalist`` picked at a uniformly random index."""
    last = len(qalist) - 1
    return qalist[random.randint(0, last)]
def read_alist(path='/export/jw/cnn/insuranceQA/train'):
    """Return the fourth space-separated field of every line in ``path``.

    Args:
        path: file to read; defaults to the previously hard-coded train file.

    Returns:
        list of strings, one per line, in file order.

    Raises:
        IndexError: if a line has fewer than four space-separated fields.
    """
    # The context manager closes the file once it has been read.
    with open(path) as f:
        alist = [line.strip().split(' ')[3] for line in f]
    print('read_alist done ......')
    return alist
def vocab_plus_overlap(vectors, sent, over, size):
    """Return a list with one copied word vector per leading word of ``sent``.

    The number of words taken is the smallest of ``size``, the number of
    '_'-separated words in ``over`` and the number of '_'-separated words in
    ``sent``.  Each vector comes from read_vector() and is copied, so callers
    may modify the rows without touching ``vectors``.

    The overlap marking that ``over`` was meant to drive is disabled; ``over``
    only influences the length limit.
    """
    limit = min(size, len(over.split('_')))
    tokens = sent.split('_')
    limit = min(limit, len(tokens))
    return [read_vector(vectors, tokens[pos]).copy() for pos in range(limit)]
def load_vectors(path='/export/jw/cnn/insuranceQA/vectors.nobin'):
    """Load word vectors from a text file into a dict.

    Each usable line holds a word followed by at least 100 space-separated
    numbers; the first 100 numbers become the word's vector.  Lines with
    fewer than 101 fields are skipped.

    Args:
        path: file to read; defaults to the previously hard-coded vectors file.

    Returns:
        dict mapping word -> list of 100 floats.
    """
    vectors = {}
    # The context manager closes the file once it has been read.
    with open(path) as f:
        for line in f:
            items = line.strip().split(' ')
            if len(items) < 101:
                continue
            vectors[items[0]] = [float(value) for value in items[1:101]]
    return vectors
def read_vector(vectors, word):
    """Return the vector stored for ``word``, or the module-level
    ``empty_vector`` when the word is not in ``vectors``.

    The returned list is the stored object itself, not a copy.
    """
    #return vectors['']
    return vectors[word] if word in vectors else empty_vector
def load_test_and_vectors():
    """Read the test1 file and the word vectors.

    Returns:
        (testList, vectors): the stripped lines of the test1 file in file
        order, and the dict returned by load_vectors().
    """
    # The context manager closes the file once it has been read.
    with open('/export/jw/cnn/insuranceQA/test1') as f:
        testList = [line.strip() for line in f]
    vectors = load_vectors()
    return testList, vectors
def load_train_and_vectors():
    """Read the train file and the word vectors.

    Returns:
        (trainList, vectors): the stripped lines of the train file in file
        order, and the dict returned by load_vectors().
    """
    # The context manager closes the file once it has been read.
    with open('/export/jw/cnn/insuranceQA/train') as f:
        trainList = [line.strip() for line in f]
    vectors = load_vectors()
    return trainList, vectors
def load_data_val_10(testList, vectors, index):
    """Build a one-example evaluation triple from ``testList[index]``.

    The third and fourth space-separated fields of the line are turned into
    vector matrices with vocab_plus_overlap() (limit 200).  The second and
    third returned arrays are built from the same arguments.

    Returns:
        Three numpy arrays, each wrapping a single matrix.
    """
    fields = testList[index].split(' ')
    first = [vocab_plus_overlap(vectors, fields[2], fields[3], 200)]
    second = [vocab_plus_overlap(vectors, fields[3], fields[2], 200)]
    third = [vocab_plus_overlap(vectors, fields[3], fields[2], 200)]
    return np.array(first), np.array(second), np.array(third)
def read_raw(path='/export/jw/cnn/insuranceQA/train'):
    """Return the split fields of every line whose first field is '1'.

    Args:
        path: file to read; defaults to the previously hard-coded train file.

    Returns:
        list of lists of strings (each line split on single spaces), in file
        order.
    """
    raw = []
    # The context manager closes the file once it has been read.
    with open(path) as f:
        for line in f:
            items = line.strip().split(' ')
            if items[0] == '1':
                raw.append(items)
    return raw
def encode_sent(vocab, string, size):
    """Encode a '_'-separated sentence as a list of exactly 200 word ids.

    Words missing from ``vocab`` are mapped to ``vocab['UNKNOWN']``.
    Sentences longer than 200 words are truncated; sentences shorter than
    200 words are padded with the UNKNOWN id (previously they raised
    IndexError).

    Args:
        vocab: dict mapping word -> id; must contain 'UNKNOWN' whenever an
            unknown word or padding is needed.
        string: sentence whose words are joined by '_'.
        size: unused.  The output length has always been fixed at 200 and
            callers rely on that, so the argument is kept only for interface
            compatibility.

    Returns:
        list of 200 ids.
    """
    max_len = 200
    words = string.split('_')
    x = []
    for i in range(max_len):
        if i < len(words) and words[i] in vocab:
            x.append(vocab[words[i]])
        else:
            # Unknown word, or padding past the end of a short sentence.
            x.append(vocab['UNKNOWN'])
    return x
def load_data_6(vocab, alist, raw, size):
    """Sample a training batch of ``size`` (question, positive, negative) triples.

    For each example a random entry of ``raw`` supplies the question (field 2)
    and the positive answer (field 3), and rand_qa(alist) supplies the
    negative answer.  All three are encoded with encode_sent().

    Returns:
        Three numpy arrays: questions, positive answers, negative answers.
    """
    questions, positives, negatives = [], [], []
    for _ in range(size):
        sample = raw[random.randint(0, len(raw) - 1)]
        negative = rand_qa(alist)
        questions.append(encode_sent(vocab, sample[2], 100))
        positives.append(encode_sent(vocab, sample[3], 100))
        negatives.append(encode_sent(vocab, negative, 100))
    return np.array(questions), np.array(positives), np.array(negatives)
def load_data_val_6(testList, vocab, index, batch):
    """Encode ``batch`` consecutive test lines starting at ``index``.

    Positions that would run past the end of ``testList`` reuse the last
    line, so the batch always has ``batch`` rows.  The second and third
    returned arrays both encode field 3 of each line.

    Returns:
        Three numpy arrays: encoded field 2, encoded field 3, encoded field 3.
    """
    first, second, third = [], [], []
    last = len(testList) - 1
    for offset in range(batch):
        # Clamp to the final line when the batch overruns the list.
        row = min(index + offset, last)
        fields = testList[row].split(' ')
        first.append(encode_sent(vocab, fields[2], 100))
        second.append(encode_sent(vocab, fields[3], 100))
        third.append(encode_sent(vocab, fields[3], 100))
    return np.array(first), np.array(second), np.array(third)
def load_data_9(trainList, vectors, size):
    """Sample ``size`` pairs of labelled examples for a two-class setup.

    Each iteration draws one random line whose fields 2 and 3 form an example
    labelled [1, 0], then draws a second random line whose field 3 is paired
    with the first line's field 2 and labelled [0, 1].

    Returns:
        Three numpy arrays: first-input matrices, second-input matrices and
        labels, each with ``2 * size`` entries.
    """
    left, right, labels = [], [], []
    top = len(trainList) - 1
    for _ in range(size):
        pos_fields = trainList[random.randint(0, top)].strip().split(' ')
        left.append(vocab_plus_overlap(vectors, pos_fields[2], pos_fields[3], 200))
        right.append(vocab_plus_overlap(vectors, pos_fields[3], pos_fields[2], 200))
        labels.append([1, 0])

        neg_fields = trainList[random.randint(0, top)].strip().split(' ')
        left.append(vocab_plus_overlap(vectors, pos_fields[2], neg_fields[3], 200))
        right.append(vocab_plus_overlap(vectors, neg_fields[3], pos_fields[2], 200))
        labels.append([0, 1])
    return np.array(left), np.array(right), np.array(labels)
def load_data_val_9(testList, vectors, index):
    """Build a one-example evaluation pair from ``testList[index]``.

    Fields 2 and 3 of the line are converted with vocab_plus_overlap()
    (limit 200), each using the other field as its ``over`` argument.

    Returns:
        Two numpy arrays, each wrapping a single matrix.
    """
    fields = testList[index].split(' ')
    first = [vocab_plus_overlap(vectors, fields[2], fields[3], 200)]
    second = [vocab_plus_overlap(vectors, fields[3], fields[2], 200)]
    return np.array(first), np.array(second)
def load_data_10(vectors, qalist, raw, size):
    """Sample one (question, positive, negative) triple as vector matrices.

    A random entry of ``raw`` supplies fields 2 and 3; rand_qa(qalist)
    supplies the negative.  ``size`` is accepted but not used.

    Returns:
        Three numpy arrays, each wrapping a single matrix.
    """
    sample = raw[random.randint(0, len(raw) - 1)]
    negative = rand_qa(qalist)
    question = [vocab_plus_overlap(vectors, sample[2], sample[3], 200)]
    positive = [vocab_plus_overlap(vectors, sample[3], sample[2], 200)]
    wrong = [vocab_plus_overlap(vectors, negative, sample[2], 200)]
    return np.array(question), np.array(positive), np.array(wrong)
def load_data_11(vectors, qalist, raw, size):
    """Sample one (question, positive, negative) triple as vector matrices.

    The body performs the same steps as load_data_10: a random entry of
    ``raw`` supplies fields 2 and 3, and rand_qa(qalist) supplies the
    negative.  ``size`` is accepted but not used.

    Returns:
        Three numpy arrays, each wrapping a single matrix.
    """
    picked = raw[random.randint(0, len(raw) - 1)]
    nega = rand_qa(qalist)
    q_text, a_text = picked[2], picked[3]
    q_rows = vocab_plus_overlap(vectors, q_text, a_text, 200)
    a_rows = vocab_plus_overlap(vectors, a_text, q_text, 200)
    n_rows = vocab_plus_overlap(vectors, nega, q_text, 200)
    return np.array([q_rows]), np.array([a_rows]), np.array([n_rows])
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """Yield mini-batches of ``data`` for ``num_epochs`` passes.

    Args:
        data: sequence convertible with ``np.array``.
        batch_size: maximum number of rows per batch.
        num_epochs: number of passes over the data.
        shuffle: when true, rows are re-permuted at the start of every epoch
            using ``np.random``.

    Yields:
        numpy arrays of at most ``batch_size`` rows.  Only the final batch of
        an epoch may be smaller, and no empty batch is produced.
    """
    data = np.array(data)
    data_size = len(data)
    # Ceiling division: the former int(n / batch_size) + 1 produced an extra,
    # empty batch whenever n was an exact multiple of batch_size.
    num_batches_per_epoch = (data_size + batch_size - 1) // batch_size
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]
================================================
FILE: cnn/tensorflow/test.py
================================================
import random
# Scratch script: draw two independent 2-element samples from 0..9 and print them.
_list = list(range(0, 10))
_l1 = random.sample(_list, 2)
_l2 = random.sample(_list, 2)
print(_l1)
print(_l2)
# range(2, 2) is empty, so the body below never runs.
for i in range(2, 2):
    print(i)
================================================
FILE: cnn/theano/README.md
================================================
================result==================
theano and tensorflow cnn code for insuranceQA
theano code, test1 top-1 precision : 61.5% (see ./insuranceQA/acc)
tensorflow code, test1 top-1 precision : 62.6%
the best precision in the paper is 62.8% (see "Applying Deep Learning to Answer Selection: A Study and an Open Task")
================dataset================
dataset is large, only test1 sample is given (see ./insuranceQA/test1.sample)
I converted original idx_xx format to real-word format (see ./insuranceQA/train ./insuranceQA/test1.sample)
you can get the original dataset from https://github.com/shuzi/insuranceQA
word embedding is trained by word2vec toolkit
=================run=====================
reformat the original dataset(see my train and test1.sample)
change filepath to your dataset(see TODO in insqa_cnn.py)
python insqa_cnn.py
================================================
FILE: cnn/theano/insqa_cnn.py
================================================
###########################################################
# test1 top-1 precision: 62%
###########################################################
import os, sys, timeit, random, operator
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.signal import pool
from theano.tensor.nnet import conv2d
#TODO change path to your dataset
# Training data; the loaders below split each line on single spaces and read
# columns 2 and 3 of every record.
trainfile = '/export/jw/cnn/insuranceQA/train'
# Evaluation data; validation() additionally reads column 0 as the label and
# column 1 as a "<prefix>:<question id>" group key.
test1file = '/export/jw/cnn/insuranceQA/test1'
# Text-format embeddings: a word followed by 100 space-separated floats per line.
vectorsfile = '/export/jw/cnn/insuranceQA/vectors.nobin'
###########################################################
# read qa data
###########################################################
def build_vocab():
    """Build the word -> integer id table from the training file.

    Reads ``trainfile`` line by line, splits each line on single spaces and
    registers every non-empty ``_``-separated token of column 2.  Id 0 is
    reserved for the ``'UNKNOWN'`` token; other ids are assigned in order of
    first appearance.

    NOTE(review): only column 2 is scanned (the original loop was
    ``range(2, 3)``), so tokens that appear only in column 3 are not
    registered - confirm this is intended.

    Returns:
        dict mapping token string to int id.
    """
    vocab = {'UNKNOWN': 0}
    # The context manager closes the handle even if a malformed line raises.
    with open(trainfile) as train_lines:
        for line in train_lines:
            items = line.strip().split(' ')
            for word in items[2].split('_'):
                if word and word not in vocab:
                    # Ids are dense, so the next free id equals the table size.
                    vocab[word] = len(vocab)
    return vocab
def load_vectors():
    """Load pre-trained 100-dimensional word vectors from ``vectorsfile``.

    Each usable line is ``word v1 v2 ... v100`` separated by single spaces.
    Lines with an empty first field or with fewer than 101 fields (for
    example a ``<count> <dim>`` header or a blank line) are skipped instead
    of raising ``IndexError``.

    Returns:
        dict mapping word to a list of 100 floats.
    """
    vectors = {}
    with open(vectorsfile) as vector_lines:
        for line in vector_lines:
            items = line.strip().split(' ')
            if not items[0] or len(items) < 101:
                continue
            vectors[items[0]] = [float(value) for value in items[1:101]]
    return vectors
def load_word_embeddings(vocab, dim):
    """Build the embedding matrix for ``vocab``.

    Every row starts as a ``dim``-long vector filled with 0.01; rows whose
    word is present in ``load_vectors()`` are replaced by the pre-trained
    vector.  Row order follows the ids stored in ``vocab``.

    Returns:
        float32 numpy array with one row per vocabulary entry.
    """
    pretrained = load_vectors()
    # Constant fill for every word that has no pre-trained vector.
    table = [[0.01] * dim for _ in range(len(vocab))]
    for token, row in vocab.items():
        if token in pretrained:
            table[row] = pretrained[token]
    return np.array(table, dtype='float32')
#pay attention to how UNKNOWN is initialized
def encode_sent(vocab, string, size):
    """Encode a ``_``-separated sentence as exactly ``size`` word ids.

    Args:
        vocab: dict mapping word to int id; must contain ``'UNKNOWN'`` when
            an out-of-vocabulary word or padding is needed.
        string: sentence whose tokens are joined with ``_``.
        size: length of the returned list.

    Returns:
        list of ``size`` ids.  Tokens beyond ``size`` are dropped; unknown
        tokens and missing positions (sentence shorter than ``size``) map to
        the ``'UNKNOWN'`` id instead of raising ``IndexError``.
    """
    words = string.split('_')
    ids = []
    for position in range(size):
        # Pad short sentences with the UNKNOWN token.
        word = words[position] if position < len(words) else 'UNKNOWN'
        ids.append(vocab[word] if word in vocab else vocab['UNKNOWN'])
    return ids
def load_train_list():
    """Read ``trainfile`` into a list of space-split records.

    Returns:
        list with one entry per line; each entry is the stripped line split
        on single spaces.
    """
    # ``with`` guarantees the file handle is released.
    with open(trainfile) as train_lines:
        return [line.strip().split(' ') for line in train_lines]
def load_test_list():
    """Read ``test1file`` into a list of space-split records.

    Returns:
        list with one entry per line; each entry is the stripped line split
        on single spaces.
    """
    # ``with`` guarantees the file handle is released.
    with open(test1file) as test_lines:
        return [line.strip().split(' ') for line in test_lines]
def load_data(trainList, vocab, batch_size):
    """Sample one random training batch of (column 2, column 3, negative) ids.

    For each of the ``batch_size`` rows two records are drawn independently
    from ``trainList``: the first supplies columns 2 and 3, the second
    supplies column 3 as the negative.  All sentences are encoded to length
    100 with ``encode_sent``.

    Returns:
        three float32 arrays, one per slot, each with ``batch_size`` rows.
    """
    anchors, positives, negatives = [], [], []
    for _ in range(0, batch_size):
        # Two draws per row, positive first, to keep the RNG sequence intact.
        pos = trainList[random.randint(0, len(trainList) - 1)]
        neg = trainList[random.randint(0, len(trainList) - 1)]
        anchors.append(encode_sent(vocab, pos[2], 100))
        positives.append(encode_sent(vocab, pos[3], 100))
        negatives.append(encode_sent(vocab, neg[3], 100))
    as_float = lambda rows: np.array(rows, dtype='float32')
    return as_float(anchors), as_float(positives), as_float(negatives)
def load_data_val(testList, vocab, index, batch_size):
    """Encode ``batch_size`` consecutive test records starting at ``index``.

    Positions past the end of ``testList`` reuse the last record, so the
    batch always has ``batch_size`` rows.  Column 3 is encoded twice and
    fills both the second and the third output.

    Returns:
        three float32 arrays (column 2, column 3, column 3 again).
    """
    first, second, third = [], [], []
    last = len(testList) - 1
    for offset in range(0, batch_size):
        position = index + offset
        if position > last:
            position = last
        record = testList[position]
        first.append(encode_sent(vocab, record[2], 100))
        second.append(encode_sent(vocab, record[3], 100))
        third.append(encode_sent(vocab, record[3], 100))
    return (np.array(first, dtype='float32'),
            np.array(second, dtype='float32'),
            np.array(third, dtype='float32'))
def validation(validate_model, testList, vocab, batch_size):
    """Score the whole test list and print top-1 precision.

    ``validate_model`` is called batch by batch with a keep probability of
    1.0; only its first output is used as the score.  Records are grouped by
    the question id found after the ``:`` in column 1, and a group counts as
    a hit when its best-scored record has label ``'1'`` in column 0.
    """
    scores = []
    offset = 0
    while True:
        x1, x2, x3 = load_data_val(testList, vocab, offset, batch_size)
        batch_scores, _unused = validate_model(x1, x2, x3, 1.0)
        scores.extend(batch_scores)
        offset += batch_size
        if offset >= len(testList):
            break
        print('Evaluation ' + str(offset))
    # The final batch is padded, so there are at least as many scores as
    # records and zip stops at the end of testList.
    grouped = {}
    for score, items in zip(scores, testList):
        qid = items[1].split(':')[1]
        grouped.setdefault(qid, []).append((score, items[0]))
    misses, hits = float(0), float(0)
    for cases in grouped.values():
        cases.sort(key=operator.itemgetter(0), reverse=True)
        flag = cases[0][1]
        if flag == '1':
            hits += 1
        if flag == '0':
            misses += 1
    print('top-1 precition: ' + str(hits / (misses + hits)))
class QACnn(object):
    """Siamese CNN that scores a question against a positive and a negative answer.

    The three inputs share one embedding table and one set of convolution
    filters.  Each input is convolved with filters of every size in
    ``filter_sizes``, max-pooled over the sequence, passed through tanh and
    dropout, and compared by cosine similarity.  The cost is the summed hinge
    ``max(0, 0.05 - cos(q, pos) + cos(q, neg))`` over the batch.

    Attributes set by ``__init__``: ``params`` (embedding table followed by
    W, b per filter size), ``cos12``, ``cos13``, ``cost``, ``accuracy`` and a
    few ``dbg_*`` expressions.
    """
    def __init__(self, input1, input2, input3, word_embeddings, batch_size, sequence_len, embedding_size, filter_sizes, num_filters, keep_prob):
        rng = np.random.RandomState(23455)
        self.params = []
        lookup_table = theano.shared(word_embeddings)
        self.params += [lookup_table]
        #input1: question, input2: positive answer, input3: negative answer
        #replace every token id with its embedding vector
        input_matrix1 = lookup_table[T.cast(input1.flatten(), dtype="int32")]
        input_matrix2 = lookup_table[T.cast(input2.flatten(), dtype="int32")]
        input_matrix3 = lookup_table[T.cast(input3.flatten(), dtype="int32")]
        #the CNN expects a 4-D input; this only adds the extra (channel) dimension
        input_x1 = input_matrix1.reshape((batch_size, 1, sequence_len, embedding_size))
        input_x2 = input_matrix2.reshape((batch_size, 1, sequence_len, embedding_size))
        input_x3 = input_matrix3.reshape((batch_size, 1, sequence_len, embedding_size))
        #print(input_x1.shape.eval())
        self.dbg_x1 = input_x1
        outputs_1, outputs_2, outputs_3 = [], [], []
        #use filters of several sizes
        for filter_size in filter_sizes:
            #there are num_filters filters of each size
            filter_shape = (num_filters, 1, filter_size, embedding_size)
            image_shape = (batch_size, 1, sequence_len, embedding_size)
            fan_in = np.prod(filter_shape[1:])
            fan_out = filter_shape[0] * np.prod(filter_shape[2:])
            W_bound = np.sqrt(6. / (fan_in + fan_out))
            W = theano.shared(
                np.asarray(
                    rng.uniform(low=-W_bound, high=W_bound, size=filter_shape),
                    dtype=theano.config.floatX
                ),
                borrow=True
            )
            b_values = np.zeros((filter_shape[0],), dtype=theano.config.floatX)
            b = theano.shared(value=b_values, borrow=True)
            #convolution + max pooling (the same W and b are applied to all three inputs)
            conv_out = conv2d(input=input_x1, filters=W, filter_shape=filter_shape, input_shape=image_shape)
            #the convolved sequence has length ds[0], so pooling spans all of it
            pooled_out = pool.pool_2d(input=conv_out, ds=(sequence_len - filter_size + 1, 1), ignore_border=True, mode='max')
            pooled_active = T.tanh(pooled_out + b.dimshuffle('x', 0, 'x', 'x'))
            outputs_1.append(pooled_active)
            conv_out = conv2d(input=input_x2, filters=W, filter_shape=filter_shape, input_shape=image_shape)
            pooled_out = pool.pool_2d(input=conv_out, ds=(sequence_len - filter_size + 1, 1), ignore_border=True, mode='max')
            pooled_active = T.tanh(pooled_out + b.dimshuffle('x', 0, 'x', 'x'))
            outputs_2.append(pooled_active)
            conv_out = conv2d(input=input_x3, filters=W, filter_shape=filter_shape, input_shape=image_shape)
            pooled_out = pool.pool_2d(input=conv_out, ds=(sequence_len - filter_size + 1, 1), ignore_border=True, mode='max')
            pooled_active = T.tanh(pooled_out + b.dimshuffle('x', 0, 'x', 'x'))
            outputs_3.append(pooled_active)
            self.params += [W, b]
        self.dbg_conv_out = conv_out.shape
        num_filters_total = num_filters * len(filter_sizes)
        self.dbg_outputs_1 = outputs_1[0].shape
        #each sentence is represented by a vector of length num_filters_total
        output_flat1 = T.reshape(T.concatenate(outputs_1, axis=1), [batch_size, num_filters_total])
        output_flat2 = T.reshape(T.concatenate(outputs_2, axis=1), [batch_size, num_filters_total])
        output_flat3 = T.reshape(T.concatenate(outputs_3, axis=1), [batch_size, num_filters_total])
        #dropout; keep_prob == 1 means no dropout
        output_drop1 = self._dropout(rng, output_flat1, keep_prob)
        output_drop2 = self._dropout(rng, output_flat2, keep_prob)
        output_drop3 = self._dropout(rng, output_flat3, keep_prob)
        #compute the angle between the question vector and each answer vector
        #vector norms
        len1 = T.sqrt(T.sum(output_drop1 * output_drop1, axis=1))
        len2 = T.sqrt(T.sum(output_drop2 * output_drop2, axis=1))
        len3 = T.sqrt(T.sum(output_drop3 * output_drop3, axis=1))
        #cosine similarities
        cos12 = T.sum(output_drop1 * output_drop2, axis=1) / (len1 * len2)
        self.cos12 = cos12
        cos13 = T.sum(output_drop1 * output_drop3, axis=1) / (len1 * len3)
        self.cos13 = cos13
        zero = theano.shared(np.zeros(batch_size, dtype=theano.config.floatX), borrow=True)
        margin = theano.shared(np.full(batch_size, 0.05, dtype=theano.config.floatX), borrow=True)
        #loss function (hinge with margin 0.05)
        diff = T.cast(T.maximum(zero, margin - cos12 + cos13), dtype=theano.config.floatX)
        self.cost = T.sum(diff, acc_dtype=theano.config.floatX)
        #mini-batch accuracy: a sample counts as correct when its hinge term is
        #exactly zero, i.e. the positive cosine beats the negative one by at
        #least the margin; otherwise it is wrong
        #loss and accuracy are the two key indicators of whether training converges
        self.accuracy = T.sum(T.cast(T.eq(zero, diff), dtype='int32')) / float(batch_size)
    def _dropout(self, rng, layer, keep_prob):
        """Apply inverted dropout: zero units with a binomial mask, then divide by keep_prob."""
        srng = T.shared_randomstreams.RandomStreams(rng.randint(123456))
        mask = srng.binomial(n=1, p=keep_prob, size=layer.shape)
        output = layer * T.cast(mask, theano.config.floatX)
        output = output / keep_prob
        return output
def train():
    """Build the QACnn graph, then train with plain SGD and validate periodically.

    Every iteration (called "epoch" here) trains on one freshly sampled
    random batch; every ``validation_freq`` iterations the test list is
    scored with ``validation``.
    """
    batch_size = int(256)
    filter_sizes = [2,3,5]
    num_filters = 500
    embedding_size = 100
    learning_rate = 0.001
    n_epochs = 2000000
    validation_freq = 1000
    keep_prob_value = 0.25
    vocab = build_vocab()
    word_embeddings = load_word_embeddings(vocab, embedding_size)
    trainList = load_train_list()
    testList = load_test_list()
    # One batch is loaded up front only to read the sequence length from its shape.
    train_x1, train_x2, train_x3 = load_data(trainList, vocab, batch_size)
    x1, x2, x3 = T.matrix('x1'), T.matrix('x2'), T.matrix('x3')
    keep_prob = T.fscalar('keep_prob')
    model = QACnn(
        input1=x1, input2=x2, input3=x3, keep_prob=keep_prob,
        word_embeddings=word_embeddings,
        batch_size=batch_size,
        sequence_len=train_x1.shape[1],
        embedding_size=embedding_size,
        filter_sizes=filter_sizes,
        num_filters=num_filters)
    dbg_x1 = model.dbg_x1
    dbg_outputs_1 = model.dbg_outputs_1
    cost, cos12, cos13 = model.cost, model.cos12, model.cos13
    print 'cost'
    print cost
    params, accuracy = model.params, model.accuracy
    grads = T.grad(cost, params)
    # Vanilla SGD: param <- param - learning_rate * gradient.
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]
    p1, p2, p3 = T.matrix('p1'), T.matrix('p2'), T.matrix('p3')
    prob = T.fscalar('prob')
    train_model = theano.function(
        [p1, p2, p3, prob],
        [cost, accuracy, dbg_x1, dbg_outputs_1],
        updates=updates,
        givens={
            x1: p1, x2: p2, x3: p3, keep_prob: prob
        }
    )
    v1, v2, v3 = T.matrix('v1'), T.matrix('v2'), T.matrix('v3')
    # Same graph without parameter updates; returns both cosine vectors.
    validate_model = theano.function(
        inputs=[v1, v2, v3, prob],
        outputs=[cos12, cos13],
        #updates=updates,
        givens={
            x1: v1, x2: v2, x3: v3, keep_prob: prob
        }
    )
    epoch = 0
    done_looping = False
    # done_looping is never set to True below, so the loop ends only via n_epochs.
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        train_x1, train_x2, train_x3 = load_data(trainList, vocab, batch_size)
        #print train_x3.shape
        cost_ij, acc, dbg_x1, dbg_outputs_1 = train_model(train_x1, train_x2, train_x3, keep_prob_value)
        print 'load data done ...... epoch:' + str(epoch) + ' cost:' + str(cost_ij) + ', acc:' + str(acc)
        if epoch % validation_freq == 0:
            print 'Evaluation ......'
            validation(validate_model, testList, vocab, batch_size)
        #print dbg_outputs_1
if __name__ == '__main__':
    train()
================================================
FILE: config.py
================================================
import os
# Identifiers of the supported datasets.
dataset_ins = 'insurance-qa'
dataset_qur = 'quora-qa'
##################################################################
# adjust to your running environment
# which data do you want
dataset = dataset_qur
# word2vec command path
w2v_command = '/export/jw/word2vec/word2vec'
##################################################################
# Root directory of the selected dataset; stays '' for an unrecognised name.
home = ''
if dataset == dataset_ins:
    home = os.path.expanduser('/export/jw/insuranceQA')
elif dataset == dataset_qur:
    home = os.path.expanduser('/export/jw/quora')
#Insurance-QA original data directory
qa_version = 'V1'
vocab_file = os.path.join(home, qa_version, 'vocabulary')
answers_file = os.path.join(home, qa_version, 'answers.label.token_idx')
question_train_file = os.path.join(home, qa_version, 'question.train.token_idx.label')
question_test1_file = os.path.join(home, qa_version, 'question.test1.label.token_idx.pool')
question_test2_file = os.path.join(home, qa_version, 'question.test2.label.token_idx.pool')
question_dev_file = os.path.join(home, qa_version, 'question.dev.label.token_idx.pool')
#quora original data directory
qr_file = os.path.join(home, 'quora_duplicate_questions.tsv')
# Fraction of the quora pairs intended for the training split.
qr_train_ratio = 0.8
#processed files
train_file = os.path.join(home, 'data', 'train.prepro')
test1_file = os.path.join(home, 'data', 'test1.prepro')
test2_file = os.path.join(home, 'data', 'test2.prepro')
w2v_train_file = os.path.join(home, 'data', 'w2v.train')
w2v_bin_file = os.path.join(home, 'data', 'w2v.bin')
predict1_file = os.path.join(home, 'data', 'predict1')
================================================
FILE: gen.py
================================================
import config, os, random
#####################################################################
# function: load vocab
# return: dict[word] = [word_id]
#####################################################################
def load_vocab():
    """Load the vocabulary table from ``config.vocab_file``.

    Every line must hold exactly two tab-separated fields; the first becomes
    the key and the second the value.  The other functions in this file index
    the result with token ids, so presumably the first field is the id and
    the second the real word - verify against the data file.

    Returns:
        dict mapping the first field to the second.

    Raises:
        ValueError: if a line does not split into exactly two fields.
    """
    voc = {}
    # ``with`` closes the file even when a malformed line raises.
    with open(config.vocab_file) as vocab_lines:
        for line in vocab_lines:
            key, value = line.strip().split('\t')
            voc[key] = value
    return voc
#####################################################################
# function: load answers, restore idx to real word
# return : [answer_1, answer_2, ..., answer_n]
#####################################################################
def ins_load_answers():
    """Load every answer and restore its tokens through the vocabulary.

    Reads ``config.answers_file`` (``<label>\\t<space-separated token ids>``
    per line) and joins the looked-up tokens with ``_``.

    Returns:
        list of answer strings.  Index 0 is an empty placeholder, so the
        answer on the first line is at index 1.
    """
    voc = load_vocab()
    answers = ['']
    with open(config.answers_file) as answer_lines:
        for line in answer_lines:
            _, sent = line.strip().split('\t')
            answers.append('_'.join(voc[wid] for wid in sent.split(' ')))
    return answers
#####################################################################
# function: prepare the word2vec training text and run word2vec
# return :
#####################################################################
def ins_w2v():
    """Write the word2vec training corpus and run the word2vec command.

    The corpus holds one space-separated sentence per line: field 0 of every
    line in the training-question file, followed by field 1 of every line in
    the answers, dev, test1 and test2 files, with token ids mapped back
    through ``load_vocab()``.  It is written to ``config.w2v_train_file`` and
    word2vec writes its vectors to ``config.w2v_bin_file``.
    """
    print('preparing word2vec ......')
    voc = load_vocab()
    sentences = []
    with open(config.question_train_file) as train_lines:
        for line in train_lines:
            items = line.strip().split('\t')
            sentences.append(' '.join(voc[_id] for _id in items[0].split(' ')))
    for _file in (config.answers_file, config.question_dev_file,
                  config.question_test1_file, config.question_test2_file):
        with open(_file) as lines:
            for line in lines:
                items = line.strip().split('\t')
                sentences.append(' '.join(voc[_id] for _id in items[1].split(' ')))
    with open(config.w2v_train_file, 'w') as of:
        for s in sentences:
            of.write(s + '\n')
    # config defines ``w2v_command``; the previous ``w2c_command`` spelling
    # raised AttributeError before word2vec could be started.
    os.system('time ' + config.w2v_command + ' -train ' + config.w2v_train_file + ' -output ' + config.w2v_bin_file + ' -cbow 0 -size 100 -window 5 -negative 20 -sample 1e-3 -threads 12 -binary 0 -min-count 1')
#####################################################################
# function: prepare train file
# file format: flag question answer
#####################################################################
def ins_train():
    """Write the processed training file ``config.train_file``.

    Every line of ``config.question_train_file`` is
    ``<question token ids>\\t<answer ids>``.  One output line
    ``1 <question> <answer>`` is written for each listed answer id, with
    tokens joined by ``_``.
    """
    print('preparing train ......')
    answers, voc = ins_load_answers(), load_vocab()
    rows = []
    with open(config.question_train_file) as question_lines:
        for line in question_lines:
            qsent, ids = line.strip().split('\t')
            qsent = '_'.join(voc[wid] for wid in qsent.split(' '))
            for _id in ids.split(' '):
                rows.append(' '.join(['1', qsent, answers[int(_id)]]))
    # ``with`` flushes and closes the output even if a write fails.
    with open(config.train_file, 'w') as of:
        for _s in rows:
            of.write(_s + '\n')
#####################################################################
# function: prepare test files
# file format: flag group_id question answer
#####################################################################
def ins_test():
    """Write the processed test2 and test1 files.

    Each input line is ``<positive ids>\\t<question token ids>\\t<pool ids>``.
    For every id in the pool one output line
    ``<flag> <group> <question> <answer>`` is written, where ``flag`` is
    ``1`` when the id is among the positives and ``group`` is the zero-based
    index of the question within its file.
    """
    print('preparing test ......')
    answers, voc = ins_load_answers(), load_vocab()
    jobs = [(config.question_test2_file, config.test2_file),
            (config.question_test1_file, config.test1_file)]
    for _in, _out in jobs:
        _data, group = [], 0
        with open(_in) as pool_lines:
            for line in pool_lines:
                pids, qsent, pnids = line.strip().split('\t')
                # A set expresses the membership test directly.
                positive = set(pids.split(' '))
                qsent = '_'.join(voc[wid] for wid in qsent.split(' '))
                for _id in pnids.split(' '):
                    flag = '1' if _id in positive else '0'
                    _data.append(' '.join([flag, str(group), qsent, answers[int(_id)]]))
                group += 1
        with open(_out, 'w') as of:
            for s in _data:
                of.write(s + '\n')
def ins_qa():
    """Run the full Insurance-QA preparation: word2vec, train file, test files."""
    for step in (ins_w2v, ins_train, ins_test):
        step()
def qur_prepare():
    """Prepare word2vec vectors and the train/test split for the quora data.

    Steps:
      1. Read ``config.qr_file``, keep rows with exactly six tab-separated
         fields, drop the first kept row (the header) and shuffle.
      2. Write both questions of every pair to ``config.w2v_train_file`` and
         run word2vec on it.
      3. Drop pairs where either question has at most one character, join
         the words of each question with ``_`` and write
         ``<flag> <q1> <q2>`` lines: the first ``config.qr_train_ratio``
         share to ``config.train_file``, the rest to ``config.test1_file``.
    """
    #pretrain word2vec
    rows = []
    with open(config.qr_file) as source:
        for line in source:
            items = line.strip().split('\t')
            if len(items) != 6:
                continue
            rows.append(items)
    rows = rows[1:]
    random.shuffle(rows)
    pairs = [(f, q1, q2) for _, _, _, q1, q2, f in rows]
    with open(config.w2v_train_file, 'w') as of:
        for f, q1, q2 in pairs:
            of.write(q1 + '\n')
            of.write(q2 + '\n')
    os.system('time ' + config.w2v_command + ' -train ' + config.w2v_train_file + ' -output ' + config.w2v_bin_file + ' -cbow 0 -size 100 -window 5 -negative 20 -sample 1e-3 -threads 12 -binary 0 -min-count 1')
    #train file
    cleaned = []
    for f, q1, q2 in pairs:
        if len(q1) <= 1 or len(q2) <= 1:
            continue
        cleaned.append((f, '_'.join(q1.split(' ')), '_'.join(q2.split(' '))))
    # Use the configured ratio (0.8 in config.py) rather than a second
    # hard-coded copy of the same number.
    split_at = int(len(cleaned) * config.qr_train_ratio)
    with open(config.train_file, 'w') as of:
        for f, q1, q2 in cleaned[:split_at]:
            of.write(' '.join([f, q1, q2]) + '\n')
    #test file
    with open(config.test1_file, 'w') as of:
        for f, q1, q2 in cleaned[split_at:]:
            of.write(' '.join([f, q1, q2]) + '\n')
def qur_qa():
    """Run the quora preparation pipeline (currently a single step)."""
    qur_prepare()
# Script entry point: run the preparation pipeline that matches config.dataset.
if __name__ == '__main__':
    if config.dataset == config.dataset_ins:
        ins_qa()
    elif config.dataset == config.dataset_qur:
        qur_qa()
================================================
FILE: lstm_cnn/theano/README.md
================================================
theano lstm+cnn code for insuranceQA
================result==================
theano code, test1 top-1 precision : 68.3%
lstm+cnn is better than cnn(61.5%).
================dataset================
dataset is large, only test1 sample is given (see ./insuranceQA/test1.sample)
I converted original idx_xx format to real-word format (see ./insuranceQA/train ./insuranceQA/test1.sample)
you can get the original dataset from https://github.com/shuzi/insuranceQA
word embedding is trained by word2vec toolkit
=================run=====================
reformat the original dataset(see my train and test1.sample)
change filepath to your dataset(see TODO in insqa_lstm.py)
python insqa_lstm.py
================================================
FILE: lstm_cnn/theano/insqa_lstm.py
================================================
############################################################
# if batch_size is 1, there must be a dtype error when doing
# T.grad, this is something about scan func
# see https://github.com/Theano/Theano/issues/1772
#
# LSTM + cnn
# test1 top-1 precision: 68.3%
############################################################
from collections import OrderedDict
import sys, time, random, operator
import numpy as np
import theano
from theano import config
import theano.tensor as T
from theano.tensor.signal import pool
from theano.tensor.nnet import conv2d
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
#TODO change filepath to your local environment
#include train test1 vectors.nobin
def build_vocab():
    """Build the word -> integer id table from the training file.

    Reads ``/export/jw/cnn/insuranceQA/train``, splits each line on single
    spaces and registers every non-empty ``_``-separated token of column 2.
    Id 0 is reserved for ``'UNKNOWN'``; other ids follow first appearance.

    NOTE(review): only column 2 is scanned (the original loop was
    ``range(2, 3)``) - confirm that column 3 is meant to be excluded.

    Returns:
        dict mapping token string to int id.
    """
    vocab = {'UNKNOWN': 0}
    # The context manager closes the handle even if a malformed line raises.
    with open('/export/jw/cnn/insuranceQA/train') as train_lines:
        for line in train_lines:
            items = line.strip().split(' ')
            for word in items[2].split('_'):
                if word and word not in vocab:
                    # Ids are dense, so the next free id equals the table size.
                    vocab[word] = len(vocab)
    return vocab
def load_vectors():
    """Load pre-trained 100-dimensional word vectors.

    Reads ``/export/jw/cnn/insuranceQA/vectors.nobin`` where each usable line
    is ``word v1 ... v100`` separated by single spaces.  Lines with an empty
    first field or fewer than 101 fields (for example a ``<count> <dim>``
    header) are skipped instead of raising ``IndexError``.

    Returns:
        dict mapping word to a list of 100 floats.
    """
    vectors = {}
    with open('/export/jw/cnn/insuranceQA/vectors.nobin') as vector_lines:
        for line in vector_lines:
            items = line.strip().split(' ')
            if not items[0] or len(items) < 101:
                continue
            vectors[items[0]] = [float(value) for value in items[1:101]]
    return vectors
def load_word_embeddings(vocab, dim):
    """Build the embedding matrix for ``vocab``.

    Rows default to a ``dim``-long vector of 0.01 and are overwritten with
    the pre-trained vector whenever ``load_vectors()`` knows the word.

    Returns:
        float32 numpy array with one row per vocabulary entry, ordered by id.
    """
    known = load_vectors()
    matrix = [[0.01 for _ in range(0, dim)] for _ in range(0, len(vocab))]
    for word, code in vocab.items():
        replacement = known.get(word) if word in known else None
        if replacement is not None:
            matrix[code] = replacement
    return np.array(matrix, dtype='float32')
#pay attention to how UNKNOWN is initialized
def encode_sent(vocab, string, size):
    """Encode a ``_``-separated sentence as ``size`` word ids plus a mask.

    Args:
        vocab: dict mapping word to int id; must contain ``'UNKNOWN'`` when
            an out-of-vocabulary word or padding is needed.
        string: sentence whose tokens are joined with ``_``.
        size: length of both returned lists.

    Returns:
        ``(ids, mask)``.  Unknown tokens and positions past the end of a
        short sentence map to the ``'UNKNOWN'`` id (previously a short
        sentence raised ``IndexError``).  The mask is all ones: sequences are
        treated as fixed length, as in the original where both branches of
        the mask test appended 1.
    """
    words = string.split('_')
    ids = []
    for position in range(size):
        # Pad short sentences with the UNKNOWN token.
        word = words[position] if position < len(words) else 'UNKNOWN'
        ids.append(vocab[word] if word in vocab else vocab['UNKNOWN'])
    mask = [1] * len(ids)
    return ids, mask
def load_train_list():
    """Read the positive training records.

    Reads ``/export/jw/cnn/insuranceQA/train`` and keeps only lines whose
    first space-separated field is ``'1'``.

    Returns:
        list of records, each the stripped line split on single spaces.
    """
    trainList = []
    with open('/export/jw/cnn/insuranceQA/train') as train_lines:
        for line in train_lines:
            items = line.strip().split(' ')
            if items[0] == '1':
                trainList.append(items)
    return trainList
def load_test_list():
    """Read ``/export/jw/cnn/insuranceQA/test1`` into space-split records.

    Returns:
        list with one entry per line; each entry is the stripped line split
        on single spaces.
    """
    with open('/export/jw/cnn/insuranceQA/test1') as test_lines:
        return [line.strip().split(' ') for line in test_lines]
def load_data(trainList, vocab, batch_size):
    """Sample one random training batch with masks, laid out time-major.

    Each accepted draw picks two records independently: the first supplies
    columns 2 and 3, the second supplies column 3 as the negative.  Draws
    where any of those three strings is empty are rejected and redrawn.

    NOTE(review): the previous test was ``str.startswith('')``, which is True
    for every string, so every draw was rejected and the loop never ended.
    The commented-out "empty string" print suggests empty fields were the
    target; confirm that no other marker was intended.

    Returns:
        ``(x1, x2, x3, m1, m2, m3)`` as ``config.floatX`` arrays transposed
        so that rows are sequence positions and columns are samples.  At
        least one sample is always produced, even for ``batch_size <= 0``.
    """
    train_1, train_2, train_3 = [], [], []
    mask_1, mask_2, mask_3 = [], [], []
    counter = 0
    while True:
        pos = trainList[random.randint(0, len(trainList)-1)]
        neg = trainList[random.randint(0, len(trainList)-1)]
        if not pos[2] or not pos[3] or not neg[3]:
            continue
        for sentence, ids_out, mask_out in ((pos[2], train_1, mask_1),
                                            (pos[3], train_2, mask_2),
                                            (neg[3], train_3, mask_3)):
            x, m = encode_sent(vocab, sentence, 100)
            ids_out.append(x)
            mask_out.append(m)
        counter += 1
        if counter >= batch_size:
            break
    return tuple(
        np.transpose(np.array(rows, dtype=config.floatX))
        for rows in (train_1, train_2, train_3, mask_1, mask_2, mask_3)
    )
def load_data_val(testList, vocab, index, batch_size):
    """Encode ``batch_size`` consecutive test records starting at ``index``.

    Positions past the end of ``testList`` reuse the last record.  Column 3
    is encoded twice and fills both the second and third id/mask outputs.

    Returns:
        ``(x1, x2, x3, m1, m2, m3)`` as transposed ``config.floatX`` arrays.
    """
    ids = ([], [], [])
    masks = ([], [], [])
    last = len(testList) - 1
    for offset in range(0, batch_size):
        position = index + offset
        if position > last:
            position = last
        record = testList[position]
        for slot, column in enumerate((2, 3, 3)):
            encoded, mask = encode_sent(vocab, record[column], 100)
            ids[slot].append(encoded)
            masks[slot].append(mask)

    def _as_time_major(rows):
        return np.transpose(np.array(rows, dtype=config.floatX))

    return (_as_time_major(ids[0]), _as_time_major(ids[1]), _as_time_major(ids[2]),
            _as_time_major(masks[0]), _as_time_major(masks[1]), _as_time_major(masks[2]))
def validation(validate_model, testList, vocab, batch_size):
    """Score the whole test list, then log scores and top-1 counts.

    ``validate_model`` is called batch by batch and only its first output is
    used as the score.  Records are grouped by the question id found after
    the ``:`` in column 1; for each group the label (column 0) of the
    best-scored record increments ``lev1`` (label ``'1'``) or ``lev0``
    (label ``'0'``).  All collected scores and both counters are appended to
    ``/export/jw/cnn/insuranceQA/acc.lstm`` and the counters are printed.
    """
    index, score_list = 0, []
    while True:
        x1, x2, x3, m1, m2, m3 = load_data_val(testList, vocab, index, batch_size)
        batch_scores, nouse = validate_model(x1, x2, x3, m1, m2, m3)
        score_list.extend(batch_scores)
        index += batch_size
        if index >= len(testList):
            break
        print('Evaluation ' + str(index))
    # The last batch is padded, so score_list is at least as long as testList
    # and zip pairs every record with its own score.
    sdict = {}
    for score, items in zip(score_list, testList):
        qid = items[1].split(':')[1]
        sdict.setdefault(qid, []).append((score, items[0]))
    lev0, lev1 = float(0), float(0)
    for cases in sdict.values():
        cases.sort(key=operator.itemgetter(0), reverse=True)
        flag = cases[0][1]
        if flag == '1':
            lev1 += 1
        if flag == '0':
            lev0 += 1
    # ``with`` closes the log even if a write fails part-way.
    with open('/export/jw/cnn/insuranceQA/acc.lstm', 'a') as of:
        for s in score_list:
            of.write(str(s) + '\n')
        of.write('lev1:' + str(lev1) + '\n')
        of.write('lev0:' + str(lev0) + '\n')
    print('lev1:' + str(lev1))
    print('lev0:' + str(lev0))
def ortho_weight(ndim):
    """Return a random ``ndim x ndim`` orthogonal matrix as ``config.floatX``.

    The matrix is the left singular-vector factor of a standard-normal
    random matrix.
    """
    gaussian = np.random.randn(ndim, ndim)
    left_vectors = np.linalg.svd(gaussian)[0]
    return left_vectors.astype(config.floatX)
def numpy_floatX(data):
    """Convert ``data`` to a numpy array with dtype ``config.floatX``."""
    target_dtype = config.floatX
    return np.asarray(data, dtype=target_dtype)
def param_init_cnn(filter_sizes, num_filters, proj_size, tparams, grad_params):
    """Create the shared convolution weights and biases for every filter size.

    For each size a ``(num_filters, 1, size, proj_size)`` filter bank is
    drawn uniformly from ``[-bound, bound]`` with
    ``bound = sqrt(6 / (fan_in + fan_out))`` (fixed seed 23455) and a zero
    bias vector is created.  Both are stored in ``tparams`` under
    ``cnn_W_<size>`` / ``cnn_b_<size>`` and appended in place to
    ``grad_params``.

    Returns:
        the same ``tparams`` and ``grad_params`` objects.
    """
    rng = np.random.RandomState(23455)
    float_type = theano.config.floatX
    for height in filter_sizes:
        shape = (num_filters, 1, height, proj_size)
        fan_in = np.prod(shape[1:])
        fan_out = shape[0] * np.prod(shape[2:])
        bound = np.sqrt(6. / (fan_in + fan_out))
        initial_w = np.asarray(rng.uniform(low=-bound, high=bound, size=shape),
                               dtype=float_type)
        W = theano.shared(initial_w, borrow=True)
        tparams['cnn_W_' + str(height)] = W
        initial_b = np.zeros((shape[0],), dtype=float_type)
        b = theano.shared(value=initial_b, borrow=True)
        tparams['cnn_b_' + str(height)] = b
        # In-place extension: the caller's list object must be the one updated.
        grad_params += [W, b]
    return tparams, grad_params
def param_init_lstm(proj_size, tparams, grad_params):
    """Create the shared LSTM parameters W, U and b.

    ``W`` and ``U`` are each four ``proj_size x proj_size`` orthogonal blocks
    concatenated along axis 1; ``b`` is a zero vector of length
    ``4 * proj_size``.  They are stored in ``tparams`` under ``lstm_W``,
    ``lstm_U`` and ``lstm_b`` and appended in place to ``grad_params``.

    Returns:
        the same ``tparams`` and ``grad_params`` objects.
    """
    def _four_blocks():
        # Blocks are drawn left to right, W before U, as in the original.
        return np.concatenate([ortho_weight(proj_size) for _ in range(4)], axis=1)

    W_t = theano.shared(_four_blocks(), borrow=True)
    tparams[_p('lstm', 'W')] = W_t
    U_t = theano.shared(_four_blocks(), borrow=True)
    tparams[_p('lstm', 'U')] = U_t
    bias = np.zeros((4 * proj_size,))
    b_t = theano.shared(bias.astype(config.floatX), borrow=True)
    tparams[_p('lstm', 'b')] = b_t
    grad_params += [W_t, U_t, b_t]
    return tparams, grad_params
def dropout_layer(state_before, use_noise, trng):
    """Dropout with probability 0.5.

    When ``use_noise`` is truthy the input is multiplied by a Bernoulli(0.5)
    mask drawn from ``trng``; otherwise it is scaled by 0.5.
    """
    mask = trng.binomial(state_before.shape,
                         p=0.5, n=1,
                         dtype=state_before.dtype)
    noisy = state_before * mask
    scaled = state_before * 0.5
    return T.switch(use_noise, noisy, scaled)
class LSTM(object):
    """LSTM followed by a CNN, trained with a cosine hinge loss.

    The three inputs (with their masks) share the embedding table, the LSTM
    weights and the convolution filters.  Each input is run through the LSTM,
    the per-step outputs are convolved and max-pooled, and the resulting
    vectors are compared by cosine similarity.  The cost is the summed hinge
    ``max(0, 0.05 - cos12 + cos13)`` over the batch.

    Attributes set by ``__init__``: ``params``, ``cos12``, ``cos13``,
    ``cost`` and ``accuracy``.
    """
    def __init__(self, input1, input2, input3, mask1, mask2, mask3, word_embeddings, batch_size, sequence_len, embedding_size, filter_sizes, num_filters):
        #proj_size means embedding_size
        #'lstm_W' = [embedding_size, embedding_size]
        #'lstm_U' = [embedding_size, embedding_size]
        #'lstm_b' = [embedding_size]
        # NOTE(review): param_init_lstm concatenates four blocks along axis 1,
        # so the shapes listed above look like per-gate shapes - confirm.
        proj_size = 100 #TODO, what does proj mean
        self.params, tparams = [], {}
        tparams, self.params = param_init_lstm(proj_size, tparams, self.params)
        tparams, self.params = param_init_cnn(filter_sizes, num_filters, proj_size, tparams, self.params)
        lookup_table = theano.shared(word_embeddings, borrow=True)
        tparams['lookup_table'] = lookup_table
        self.params += [lookup_table]
        # n_timesteps and n_samples are computed but not used below.
        n_timesteps = input1.shape[0]
        n_samples = input1.shape[1]
        lstm1, lstm_whole1 = self._lstm_net(tparams, input1, sequence_len, batch_size, embedding_size, mask1, proj_size)
        lstm2, lstm_whole2 = self._lstm_net(tparams, input2, sequence_len, batch_size, embedding_size, mask2, proj_size)
        lstm3, lstm_whole3 = self._lstm_net(tparams, input3, sequence_len, batch_size, embedding_size, mask3, proj_size)
        #dimshuffle [sequence_len, batch_size, proj_size] to [batch_size, sequence_len, proj_size]
        cnn_input1 = T.reshape(lstm1.dimshuffle(1, 0, 2), [batch_size, 1, sequence_len, proj_size])
        cnn_input2 = T.reshape(lstm2.dimshuffle(1, 0, 2), [batch_size, 1, sequence_len, proj_size])
        cnn_input3 = T.reshape(lstm3.dimshuffle(1, 0, 2), [batch_size, 1, sequence_len, proj_size])
        cnn1 = self._cnn_net(tparams, cnn_input1, batch_size, sequence_len, num_filters, filter_sizes, proj_size)
        cnn2 = self._cnn_net(tparams, cnn_input2, batch_size, sequence_len, num_filters, filter_sizes, proj_size)
        cnn3 = self._cnn_net(tparams, cnn_input3, batch_size, sequence_len, num_filters, filter_sizes, proj_size)
        # Vector norms, then cosine similarity of input1 with input2 / input3.
        len1 = T.sqrt(T.sum(cnn1 * cnn1, axis=1))
        len2 = T.sqrt(T.sum(cnn2 * cnn2, axis=1))
        len3 = T.sqrt(T.sum(cnn3 * cnn3, axis=1))
        self.cos12 = T.sum(cnn1 * cnn2, axis=1) / (len1 * len2)
        self.cos13 = T.sum(cnn1 * cnn3, axis=1) / (len1 * len3)
        zero = theano.shared(np.zeros(batch_size, dtype=config.floatX), borrow=True)
        margin = theano.shared(np.full(batch_size, 0.05, dtype=config.floatX), borrow=True)
        # Hinge loss with margin 0.05, summed over the batch.
        diff = T.cast(T.maximum(zero, margin - self.cos12 + self.cos13), dtype=config.floatX)
        self.cost = T.sum(diff, acc_dtype=config.floatX)
        # Fraction of samples whose hinge term is exactly zero.
        self.accuracy = T.sum(T.cast(T.eq(zero, diff), dtype='int32')) / float(batch_size)
    def _cnn_net(self, tparams, cnn_input, batch_size, sequence_len, num_filters, filter_sizes, proj_size):
        """Convolve, max-pool over the sequence and tanh for every filter size.

        Returns a ``[batch_size, num_filters * len(filter_sizes)]`` tensor.
        """
        outputs = []
        for filter_size in filter_sizes:
            filter_shape = (num_filters, 1, filter_size, proj_size)
            image_shape = (batch_size, 1, sequence_len, proj_size)
            W = tparams['cnn_W_' + str(filter_size)]
            b = tparams['cnn_b_' + str(filter_size)]
            conv_out = conv2d(input=cnn_input, filters=W, filter_shape=filter_shape, input_shape=image_shape)
            # The pooling window covers the whole convolved sequence.
            pooled_out = pool.pool_2d(input=conv_out, ds=(sequence_len - filter_size + 1, 1), ignore_border=True, mode='max')
            pooled_active = T.tanh(pooled_out + b.dimshuffle('x', 0, 'x', 'x'))
            outputs.append(pooled_active)
        num_filters_total = num_filters * len(filter_sizes)
        output_tensor = T.reshape(T.concatenate(outputs, axis=1), [batch_size, num_filters_total])
        return output_tensor
    def _lstm_net(self, tparams, _input, sequence_len, batch_size, embedding_size, mask, proj_size):
        """Look up embeddings for ``_input`` and run them through ``lstm_layer``.

        Returns the two sequences produced by ``lstm_layer``.
        """
        input_matrix = tparams['lookup_table'][T.cast(_input.flatten(), dtype="int32")]
        input_x = input_matrix.reshape((sequence_len, batch_size, embedding_size))
        proj, proj_whole = lstm_layer(tparams, input_x, proj_size, prefix='lstm', mask=mask)
        #if useMask == True:
        #proj = (proj * mask[:, :, None]).sum(axis=0)
        #proj = proj / mask.sum(axis=0)[:, None]
        #if options['use_dropout']:
        #proj = dropout_layer(proj, use_noise, trng)
        return proj, proj_whole
#state_below is word_embbeding tensor(3dim)
def lstm_layer(tparams, state_below, proj_size, prefix='lstm', mask=None):
    """Run an LSTM over ``state_below`` with ``theano.scan``.

    Args:
        tparams: dict holding the shared variables ``<prefix>_W``,
            ``<prefix>_U`` and ``<prefix>_b``.
        state_below: input tensor; axis 0 is the time step.
        proj_size: width of each gate slice and of the hidden/cell state.
        prefix: key prefix used to look up the parameters.
        mask: per-step, per-sample mask; required (asserted).

    Returns:
        ``(h, c)``: the hidden-state and cell-state sequences from the scan.
    """
    #dim-0 steps, dim-1 samples(batch_size), dim-3 word_embedding
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1
    assert mask is not None
    def _slice(_x, n, dim):
        # Select the n-th block of width dim along the last axis.
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]
    #h means hidden output? c means context? so we'll use h?
    #rval[0] = [sequence_len, batch_size, proj_size], rval[1] the same
    #so preact size must equl to x_(lstm input slice)
    #if you want change lstm h(t) size, 'lstm_U' and 'lstm_b'
    #and precat must be changed to another function, like h*U+b
    #see http://colah.github.io/posts/2015-08-Understanding-LSTMs/
    #f(t) = sigmoid(Wf * [h(t-1),x(t)] + bf)
    def _step(m_, x_, h_, c_):
        # x_ already contains W.x + b (computed once outside the scan).
        preact = T.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_
        # Slices 0..3 of the pre-activation: input, forget, output gates and candidate.
        i = T.nnet.sigmoid(_slice(preact, 0, proj_size))
        f = T.nnet.sigmoid(_slice(preact, 1, proj_size))
        o = T.nnet.sigmoid(_slice(preact, 2, proj_size))
        c = T.tanh(_slice(preact, 3, proj_size))
        c = f * c_ + i * c
        # Where the mask is 0 the previous cell state is carried over unchanged.
        c = m_[:, None] * c + (1. - m_)[:, None] * c_
        h = o * T.tanh(c)
        #if mask(t-1)==0, than make h(t) = h(t-1)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_
        return h, c
    state_below = (T.dot(state_below, tparams[_p(prefix, 'W')]) +
                   tparams[_p(prefix, 'b')])
    dim_proj = proj_size
    # Both recurrent states start as zeros of shape (n_samples, dim_proj).
    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[T.alloc(numpy_floatX(0.),
                                                      n_samples,
                                                      dim_proj),
                                              T.alloc(numpy_floatX(0.),
                                                      n_samples,
                                                      dim_proj)],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)
    return rval[0], rval[1]
def _p(pp, name):
return '%s_%s' % (pp, name)
def train():
    """Build the LSTM+CNN graph, then train with plain SGD and validate periodically.

    Every iteration (called "epoch" here) trains on one freshly sampled
    random batch; every ``validation_freq`` iterations the test list is
    scored with ``validation``.
    """
    batch_size = int(256)
    embedding_size = 100
    learning_rate = 0.05
    n_epochs = 20000000
    validation_freq = 1000
    filter_sizes = [1, 2, 3, 5]
    num_filters = 500
    vocab = build_vocab()
    word_embeddings = load_word_embeddings(vocab, embedding_size)
    trainList = load_train_list()
    testList = load_test_list()
    # One batch is loaded up front only to read the sequence length from its shape.
    train_x1, train_x2, train_x3, mask1, mask2, mask3 = load_data(trainList, vocab, batch_size)
    x1, x2, x3 = T.fmatrix('x1'), T.fmatrix('x2'), T.fmatrix('x3')
    m1, m2, m3 = T.fmatrix('m1'), T.fmatrix('m2'), T.fmatrix('m3')
    model = LSTM(
        input1=x1, input2=x2, input3=x3,
        mask1=m1, mask2=m2, mask3=m3,
        word_embeddings=word_embeddings,
        batch_size=batch_size,
        sequence_len=train_x1.shape[0], #row is sequence_len
        embedding_size=embedding_size,
        filter_sizes=filter_sizes,
        num_filters=num_filters)
    cost, cos12, cos13 = model.cost, model.cos12, model.cos13
    params, accuracy = model.params, model.accuracy
    grads = T.grad(cost, params)
    # Vanilla SGD: param <- param - learning_rate * gradient.
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]
    p1, p2, p3 = T.fmatrix('p1'), T.fmatrix('p2'), T.fmatrix('p3')
    q1, q2, q3 = T.fmatrix('q1'), T.fmatrix('q2'), T.fmatrix('q3')
    train_model = theano.function(
        [p1, p2, p3, q1, q2, q3],
        [cost, accuracy],
        updates=updates,
        givens={
            x1: p1, x2: p2, x3: p3, m1: q1, m2: q2, m3: q3
        }
    )
    v1, v2, v3 = T.matrix('v1'), T.matrix('v2'), T.matrix('v3')
    u1, u2, u3 = T.matrix('u1'), T.matrix('u2'), T.matrix('u3')
    # Same graph without parameter updates; returns both cosine vectors.
    validate_model = theano.function(
        inputs=[v1, v2, v3, u1, u2, u3],
        outputs=[cos12, cos13],
        #updates=updates,
        givens={
            x1: v1, x2: v2, x3: v3, m1: u1, m2: u2, m3: u3
        }
    )
    epoch = 0
    done_looping = False
    # done_looping is never set to True below, so the loop ends only via n_epochs.
    while (epoch < n_epochs) and (not done_looping):
        epoch += 1
        train_x1, train_x2, train_x3, mask1, mask2, mask3 = load_data(trainList, vocab, batch_size)
        #print('train_x1, train_x2, train_x3')
        #print(train_x1.shape, train_x2.shape, train_x3.shape)
        cost_ij, acc = train_model(train_x1, train_x2, train_x3, mask1, mask2, mask3)
        print 'load data done ...... epoch:' + str(epoch) + ' cost:' + str(cost_ij) + ', acc:' + str(acc)
        if epoch % validation_freq == 0:
            print 'Evaluation ......'
            validation(validate_model, testList, vocab, batch_size)
if __name__ == '__main__':
    train()
================================================
FILE: rnn_attention/tensorflow/insurance_qa_data_helpers.py
================================================
import numpy as np
import random
from operator import itemgetter
# File that evaluation() appends its lev1/lev0 counts to.
precision = '/export/jw/cnn/insuranceQA/acc.lstm'

# 100 zeros; read_vector() returns this list for words without a vector.
empty_vector = [float(0.0)] * 100

# Constant 10-element vectors of ones and of zeros.
onevector = [float(1)] * 10
zerovector = [float(0)] * 10
def build_vocab(paths=('/export/jw/cnn/insuranceQA/train',
                       '/export/jw/cnn/insuranceQA/test1')):
    """Build a word -> integer id vocabulary from the question column.

    Each line of every file in ``paths`` is split on single spaces and the
    third field (index 2) is split on '_' into words. Ids are handed out
    in first-seen order; id 0 is reserved for 'UNKNOWN'.

    Args:
        paths: files to scan, in order. Defaults to the original
            train and test1 locations.

    Returns:
        dict mapping each word to its id.
    """
    vocab = {'UNKNOWN': 0}
    for path in paths:
        # `with` closes each file; the original left the handles open.
        with open(path) as handle:
            for line in handle:
                items = line.strip().split(' ')
                for word in items[2].split('_'):
                    if word not in vocab:
                        # The next free id always equals the current size.
                        vocab[word] = len(vocab)
    return vocab
def read_alist(path='/export/jw/cnn/insuranceQA/train'):
    """Return the fourth space-separated field (index 3) of every line.

    Args:
        path: file to read; defaults to the original train location.

    Returns:
        list of strings, one per line of the file.
    """
    # `with` closes the file; the original left the handle open.
    with open(path) as handle:
        alist = [line.strip().split(' ')[3] for line in handle]
    print('read_alist done ......')
    return alist
def load_vectors(path='/export/jw/cnn/insuranceQA/vectors.nobin', dim=100):
    """Load word vectors from a text file into a dict.

    Each usable line is ``word v1 v2 ... v<dim>`` separated by single
    spaces. Lines with fewer than ``dim + 1`` fields are skipped.

    Args:
        path: vector file; defaults to the original location.
        dim: number of components read per word (originally fixed at 100).

    Returns:
        dict mapping word -> list of ``dim`` floats.
    """
    vectors = {}
    # `with` closes the file; the original left the handle open.
    with open(path) as handle:
        for line in handle:
            items = line.strip().split(' ')
            if len(items) < dim + 1:
                continue
            vectors[items[0]] = [float(value) for value in items[1:dim + 1]]
    return vectors
def read_vector(vectors, word):
    """Return the vector stored for `word`, or the module-level
    `empty_vector` when the word is not in `vectors`."""
    return vectors[word] if word in vectors else empty_vector
def load_train_list(path='/export/jw/cnn/insuranceQA/train'):
    """Return the training rows whose first field is '1'.

    Args:
        path: file to read; defaults to the original train location.

    Returns:
        list of rows, each the line's space-separated fields.
    """
    train_list = []
    # `with` closes the file; the original left the handle open.
    with open(path) as handle:
        for line in handle:
            items = line.strip().split(' ')
            if items[0] == '1':
                train_list.append(items)
    return train_list
def load_test_list(path='/export/jw/cnn/insuranceQA/test1'):
    """Return every line of the test file as a list of its fields.

    Args:
        path: file to read; defaults to the original test1 location.

    Returns:
        list of rows, each the line's space-separated fields.
    """
    # `with` closes the file; the original left the handle open.
    with open(path) as handle:
        return [line.strip().split(' ') for line in handle]
def load_train_and_vectors(path='/export/jw/cnn/insuranceQA/train'):
    """Return the stripped training lines together with the word vectors.

    Args:
        path: training file; defaults to the original train location.

    Returns:
        (trainList, vectors): the raw stripped lines (not split into
        fields) and whatever load_vectors() returns.
    """
    # `with` closes the file; the original left the handle open.
    with open(path) as handle:
        trainList = [line.strip() for line in handle]
    vectors = load_vectors()
    return trainList, vectors
def read_raw(path='/export/jw/cnn/insuranceQA/train'):
    """Return the field lists of all training lines whose first field is '1'.

    Args:
        path: file to read; defaults to the original train location.

    Returns:
        list of rows, each the line's space-separated fields.
    """
    # `with` closes the file; the original left the handle open.
    with open(path) as handle:
        rows = (line.strip().split(' ') for line in handle)
        return [items for items in rows if items[0] == '1']
def encode_sent(vocab, string, size):
    """Encode a '_'-joined sentence as exactly `size` word ids plus a mask.

    Words missing from `vocab` map to vocab['UNKNOWN']. Longer sentences
    are truncated. Shorter ones are padded with the empty token ''
    instead of raising IndexError as the original did.

    Args:
        vocab: dict word -> id; must contain 'UNKNOWN' if any word can
            be missing.
        string: sentence whose words are separated by '_'.
        size: number of positions to emit.

    Returns:
        (x, m): the list of ids and a mask list of the same length. The
        mask is all ones, because the original appended 1 in both
        branches of its padding test; that contract is kept.
    """
    length = max(size, 0)
    words = string.split('_')[:length]
    words += [''] * (length - len(words))
    x = [vocab[word] if word in vocab else vocab['UNKNOWN'] for word in words]
    m = [1] * length
    return x, m
def load_val_data(test_list, vocab, index, batch_size, max_len):
    """Encode `batch_size` consecutive test rows starting at `index`.

    Positions past the end of `test_list` repeat its last row, so the
    batch always has `batch_size` entries. Field 2 is the first input;
    field 3 is used for both the second and the third input.

    Returns:
        Six float32 arrays: three id matrices with one row per sample,
        followed by the three mask matrices transposed.
    """
    columns = (2, 3, 3)
    ids = [[] for _ in columns]
    masks = [[] for _ in columns]
    last = len(test_list) - 1

    for offset in range(0, batch_size):
        row = test_list[min(index + offset, last)]
        for slot, column in enumerate(columns):
            x, m = encode_sent(vocab, row[column], max_len)
            ids[slot].append(x)
            masks[slot].append(m)

    id_arrays = [np.array(rows, dtype='float32') for rows in ids]
    mask_arrays = [np.transpose(np.array(rows, dtype='float32')) for rows in masks]
    return tuple(id_arrays + mask_arrays)
def load_train_data(trainList, vocab, batch_size, max_len):
    """Sample one random training batch of (question, positive, negative).

    Two rows are drawn uniformly at random per sample. The first supplies
    the question (field 2) and the positive answer (field 3); the second
    supplies the negative answer (field 3). Draws with a blank field are
    rejected and redrawn.

    The original rejected a draw when a field satisfied
    ``startswith('')``. That is true for every string, so every draw was
    rejected and the loop never ended. A field now counts as blank only
    when its first '_'-separated token is empty, which also covers the
    empty string.

    Returns:
        Six float32 arrays: three id matrices with one row per sample,
        followed by the three mask matrices transposed.
    """
    def _is_blank(sentence):
        # True for '' and for text that begins with an empty token.
        return sentence.split('_', 1)[0] == ''

    train_1, train_2, train_3 = [], [], []
    mask_1, mask_2, mask_3 = [], [], []
    counter = 0
    while True:
        pos = trainList[random.randint(0, len(trainList) - 1)]
        neg = trainList[random.randint(0, len(trainList) - 1)]
        if _is_blank(pos[2]) or _is_blank(pos[3]) or _is_blank(neg[3]):
            continue
        x, m = encode_sent(vocab, pos[2], max_len)
        train_1.append(x)
        mask_1.append(m)
        x, m = encode_sent(vocab, pos[3], max_len)
        train_2.append(x)
        mask_2.append(m)
        x, m = encode_sent(vocab, neg[3], max_len)
        train_3.append(x)
        mask_3.append(m)
        counter += 1
        if counter >= batch_size:
            break
    return (np.array(train_1, dtype='float32'),
            np.array(train_2, dtype='float32'),
            np.array(train_3, dtype='float32'),
            np.transpose(np.array(mask_1, dtype='float32')),
            np.transpose(np.array(mask_2, dtype='float32')),
            np.transpose(np.array(mask_3, dtype='float32')))
def evaluation(score_list, test_list, out_path=None):
    """Compute top-1 precision per question and log the counts.

    Rows of `test_list` are grouped by the question id found after ':'
    in field 1. For each group the highest-scoring candidate is taken
    and its label (field 0) is counted as a hit ('1') or a miss ('0').

    Args:
        score_list: one score per row of `test_list`, in the same order.
            Extra trailing scores are ignored.
        test_list: rows as produced by load_test_list().
        out_path: file the counts are appended to; defaults to the
            module-level `precision` path.

    The counts are appended to the file and printed together with the
    precision. The precision is reported as 0.0 for an empty test list
    instead of raising ZeroDivisionError.
    """
    if out_path is None:
        out_path = precision

    sessdict = {}
    for index, items in enumerate(test_list):
        qid = items[1].split(':')[1]
        sessdict.setdefault(qid, []).append((score_list[index], items[0]))

    lev1, lev0 = float(0), float(0)
    for candidates in sessdict.values():
        ranked = sorted(candidates, key=itemgetter(0), reverse=True)
        score, flag = ranked[0]
        if flag == '1':
            lev1 += 1
        if flag == '0':
            lev0 += 1

    # `with` guarantees the log file is closed even if a write fails.
    with open(out_path, 'a') as of:
        of.write('lev1:' + str(lev1) + '\n')
        of.write('lev0:' + str(lev0) + '\n')
    print('lev1 ' + str(lev1))
    print('lev0 ' + str(lev0))
    total = lev0 + lev1
    print('precision:' + str(lev1 / total if total else 0.0))
================================================
FILE: rnn_attention/tensorflow/tf_rnn_char.py
================================================
# -*- coding: utf-8 -*-
####################################################################################
#test1 top-1 accuracy: 59%
####################################################################################
import tensorflow as tf
import numpy as np
from operator import itemgetter
import random, datetime, json, insurance_qa_data_helpers
class RNN_Model(object):
    """Bidirectional GRU encoder trained with a cosine margin ranking loss.

    The three inputs `qlist`, `plist` and `nlist` are encoded with the
    same cells and the same embedding matrix. The loss is
    max(0, 0.1 - cos12 + cos13), summed over the batch.

    Attributes set by __init__:
        keep_prob, qlist, plist, nlist, mask_q, mask_p, mask_n: placeholders.
        embedding: the word embedding variable.
        q, p: encoded first and second inputs.
        cos12, cos13: cosine of q with p, and of q with the encoded nlist.
        cost, accuracy: scalar loss and fraction of samples with zero loss.
        outputs: RNN output of the most recent _rnn_net call, before pooling.
    """

    def _rnn_net(self, inputs, mask, embedding, keep_prob, batch_size, embed_dim, num_step, fw_cell, bw_cell):
        """Embed `inputs`, run the bidirectional RNN and max-pool over time.

        `mask`, `keep_prob`, `embed_dim` and `num_step` are accepted but
        not read here; dropout uses self.keep_prob.
        """
        _initial_state = fw_cell.zero_state(batch_size, dtype=tf.float32)
        inputs = tf.nn.embedding_lookup(embedding, inputs)
        inputs = tf.nn.dropout(inputs, self.keep_prob)
        # [batch_size, sequence_length, embedding_size] ->
        # [sequence_length, batch_size, embedding_size]
        inputs = tf.transpose(inputs, [1, 0, 2])
        # Split into a list of sequence_length tensors, one per time step.
        inputs = tf.unstack(inputs)
        # The result is again a list with one tensor per time step.
        outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(
            fw_cell, bw_cell, inputs,
            initial_state_fw=_initial_state, initial_state_bw=_initial_state)
        outputs = tf.transpose(tf.stack(outputs), [1, 0, 2])
        self.outputs = outputs
        # Author's note: max pooling over the RNN outputs gave the best
        # results; mean pooling and taking the last step were both worse.
        outputs = self._max_pooling(outputs)
        # print() call form: the bare `print outputs` statement is a
        # syntax error on Python 3.
        print(outputs)
        return outputs

    def _max_pooling(self, lstm):
        """Max-pool a [batch, sequence_length, size] tensor over axis 1."""
        sequence_length, embedding_size = int(lstm.get_shape()[1]), int(lstm.get_shape()[2])
        lstm = tf.expand_dims(lstm, -1)
        output = tf.nn.max_pool(lstm, ksize=[1, sequence_length, 1, 1], strides=[1, 1, 1, 1], padding='VALID')
        output = tf.reshape(output, [-1, embedding_size])
        return output

    def __init__(self, config, is_training=True):
        """Build the graph.

        Args:
            config: object providing batch_size, num_step,
                hidden_neural_size, vocabulary_size, embed_dim and
                hidden_layer_num.
            is_training: accepted for interface compatibility; not read.
        """
        self.keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')
        self.batch_size = config.batch_size
        self.num_step = config.num_step

        self.qlist = tf.placeholder(tf.int32, [self.batch_size, self.num_step])
        # This version of the model does not use the masks.
        self.mask_q = tf.placeholder(tf.float32, [self.num_step, self.batch_size])
        self.plist = tf.placeholder(tf.int32, [self.batch_size, self.num_step])
        self.mask_p = tf.placeholder(tf.float32, [self.num_step, self.batch_size])
        self.nlist = tf.placeholder(tf.int32, [self.batch_size, self.num_step])
        self.mask_n = tf.placeholder(tf.float32, [self.num_step, self.batch_size])

        hidden_neural_size = config.hidden_neural_size
        vocabulary_size = config.vocabulary_size
        self.embed_dim = config.embed_dim
        hidden_layer_num = config.hidden_layer_num

        fw_cell = tf.contrib.rnn.GRUCell(num_units=hidden_neural_size, activation=tf.nn.relu)
        fw_cell = tf.contrib.rnn.DropoutWrapper(
            fw_cell, output_keep_prob=self.keep_prob
        )
        bw_cell = tf.contrib.rnn.GRUCell(num_units=hidden_neural_size, activation=tf.nn.relu)
        bw_cell = tf.contrib.rnn.DropoutWrapper(
            bw_cell, output_keep_prob=self.keep_prob
        )

        # embedding layer
        # NOTE(review): "/cpu:1" is kept from the original; it presumably
        # relies on soft placement when only one CPU device exists - confirm.
        with tf.device("/cpu:1"), tf.name_scope("embedding_layer"):
            self.embedding = tf.Variable(tf.truncated_normal([vocabulary_size, self.embed_dim], stddev=0.1), name='W')

        # The mask placeholders of this instance are passed. The original
        # passed bare names mask_q/mask_p/mask_n, which only resolved
        # because the driver script defines globals with those names.
        q = self._rnn_net(self.qlist, self.mask_q, self.embedding, self.keep_prob, self.batch_size, self.embed_dim, self.num_step, fw_cell, bw_cell)
        tf.get_variable_scope().reuse_variables()
        p = self._rnn_net(self.plist, self.mask_p, self.embedding, self.keep_prob, self.batch_size, self.embed_dim, self.num_step, fw_cell, bw_cell)
        tf.get_variable_scope().reuse_variables()
        n = self._rnn_net(self.nlist, self.mask_n, self.embedding, self.keep_prob, self.batch_size, self.embed_dim, self.num_step, fw_cell, bw_cell)

        # Vector norms; no clipping is applied, so an all-zero encoding
        # makes the cosine below divide by zero.
        len_1 = tf.sqrt(tf.reduce_sum(tf.multiply(q, q), 1))
        len_2 = tf.sqrt(tf.reduce_sum(tf.multiply(p, p), 1))
        len_3 = tf.sqrt(tf.reduce_sum(tf.multiply(n, n), 1))

        self.cos12 = tf.reduce_sum(tf.multiply(q, p), axis=1) / (len_1 * len_2)
        self.cos13 = tf.reduce_sum(tf.multiply(q, n), axis=1) / (len_1 * len_3)
        self.q = q
        self.p = p

        zero = tf.constant(np.zeros(self.batch_size, dtype='float32'))
        margin = tf.constant(np.full(self.batch_size, 0.1, dtype='float32'))
        diff = tf.cast(tf.maximum(zero, margin - self.cos12 + self.cos13), dtype='float32')
        self.cost = tf.reduce_sum(diff)
        # A sample counts as correct when its hinge loss is exactly zero.
        self.accuracy = tf.reduce_sum(tf.cast(tf.equal(zero, diff), dtype='float32')) / float(self.batch_size)
def train_step(model, qlist, plist, nlist, mask_q, mask_p, mask_n):
    """Run one optimisation step on the given batch and print loss/accuracy.

    Uses the module-level `sess`, `train_op`, `global_step` and `config`.
    """
    feed = {
        model.qlist: qlist,
        model.mask_q: mask_q,
        model.plist: plist,
        model.mask_p: mask_p,
        model.nlist: nlist,
        model.mask_n: mask_n,
        model.keep_prob: config.keep_prob,
    }
    results = sess.run(
        [model.cost, model.accuracy, global_step, train_op,
         model.cos12, model.q, model.p, model.outputs],
        feed)
    # Only the first three fetched values are reported.
    cost, accuracy, step = results[:3]
    stamp = datetime.datetime.now().isoformat()
    print("{}: step {}, loss {:g}, acc {:g}".format(stamp, step, cost, accuracy))
def dev_step(model, vocab, batch_size, max_len):
    """Score the whole test list batch by batch and evaluate the result.

    Uses the module-level `sess`, `test_list` and `FLAGS`. The batch size
    is taken from FLAGS.batch_size; the `batch_size` argument is not read.
    """
    scores = []
    start = int(0)
    while True:
        batch = insurance_qa_data_helpers.load_val_data(
            test_list, vocab, start, FLAGS.batch_size, max_len)
        qlist, plist, nlist, mask_q, mask_p, mask_n = batch
        feed = {
            model.qlist: qlist,
            model.mask_q: mask_q,
            model.plist: plist,
            model.mask_p: mask_p,
            model.nlist: nlist,
            model.mask_n: mask_n,
            # No dropout at evaluation time.
            model.keep_prob: float(1.0),
        }
        fetched = sess.run([model.cos12], feed)
        scores.extend(fetched[0])
        start += FLAGS.batch_size
        if start >= len(test_list):
            break
    insurance_qa_data_helpers.evaluation(scores, test_list)
# Command-line flags. The training loop runs dev_step() whenever the
# global step is a multiple of evaluate_every.
tf.flags.DEFINE_integer('evaluate_every',10000,'evaluate every')
tf.flags.DEFINE_integer('batch_size',64,'the batch_size of the training procedure')
# The flag name is spelled 'emdedding_dim'; Config reads it under that name.
tf.flags.DEFINE_integer('emdedding_dim',100,'embedding dim')
tf.flags.DEFINE_integer('hidden_neural_size',200,'LSTM hidden neural size')
tf.flags.DEFINE_integer('hidden_layer_num',1,'LSTM hidden layer num')
# max_len is used as the number of RNN time steps (Config.num_step).
tf.flags.DEFINE_integer('max_len',100,'max_len of training sentence')
tf.flags.DEFINE_float('init_scale',0.1,'init scale')
# keep_prob is fed to the model's dropout keep-probability placeholder.
tf.flags.DEFINE_float('keep_prob',0.5,'dropout rate')
# num_epoch is the number of training iterations (one batch each).
tf.flags.DEFINE_integer('num_epoch',1000000,'num epoch')
tf.flags.DEFINE_integer('max_grad_norm',5,'max_grad_norm')
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
FLAGS = tf.flags.FLAGS
# NOTE(review): _parse_flags is a private tf.flags method - confirm it
# exists in the TensorFlow version in use.
FLAGS._parse_flags()
# Vocabulary and data lists are loaded once at import time.
vocab = insurance_qa_data_helpers.build_vocab()
train_list = insurance_qa_data_helpers.load_train_list()
# One batch is loaded up front; the training loop loads a fresh one per step.
qlist, plist, nlist, mask_q, mask_p, mask_n = insurance_qa_data_helpers.load_train_data(train_list, vocab, FLAGS.batch_size, FLAGS.max_len)
test_list = insurance_qa_data_helpers.load_test_list()
class Config(object):
    """Model and training settings, copied from FLAGS and the vocabulary."""

    # Batch geometry: samples per batch and RNN time steps.
    batch_size = FLAGS.batch_size
    num_step = FLAGS.max_len

    # Network sizes.
    vocabulary_size = len(vocab)
    embed_dim = FLAGS.emdedding_dim
    hidden_neural_size = FLAGS.hidden_neural_size
    hidden_layer_num = FLAGS.hidden_layer_num

    # Optimisation settings.
    keep_prob = FLAGS.keep_prob
    max_grad_norm = FLAGS.max_grad_norm
    num_epoch = FLAGS.num_epoch
config = Config()
# Evaluation copy of the settings with dropout disabled.
eval_config = Config()
eval_config.keep_prob = 1.0

with tf.Graph().as_default(), tf.device('/gpu:1'):
    session_conf = tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement,
        allow_soft_placement=FLAGS.allow_soft_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        initializer = tf.random_uniform_initializer(-1 * FLAGS.init_scale, 1 * FLAGS.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            model = RNN_Model(config=config, is_training=True)

        # Training procedure: plain gradient descent on the margin loss.
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.GradientDescentOptimizer(0.2)
        grads_and_vars = optimizer.compute_gradients(model.cost)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # Initialise every variable before the first step.
        sess.run(tf.global_variables_initializer())

        for i in range(config.num_epoch):
            # A new random batch is drawn for every step.
            qlist, plist, nlist, mask_q, mask_p, mask_n = insurance_qa_data_helpers.load_train_data(
                train_list, vocab, FLAGS.batch_size, FLAGS.max_len)
            train_step(model, qlist, plist, nlist, mask_q, mask_p, mask_n)
            current_step = tf.train.global_step(sess, global_step)
            if current_step % FLAGS.evaluate_every == 0:
                dev_step(model, vocab, FLAGS.batch_size, FLAGS.max_len)
================================================
FILE: swem/swem_hier.py
================================================
import numpy as np
import tensorflow as tf
import time, os, random, datetime, sys
from sklearn import metrics
sys.path.append('../')
import config, utils
################################################################################
# Insurance-QA
# AUC 0.96, top 1 precision:31%
#
# quora-data
# best precision: 0.8369, best threshold:0.62
################################################################################
class SWEM_HIER(object):
    """Hierarchical-pooling sentence matcher trained with a log loss.

    Each sentence is embedded, averaged over sliding windows of two
    words, then max-pooled over the window positions. The clipped cosine
    of the two sentence vectors is treated as the match probability.

    Attributes:
        x1, x2: int32 placeholders of word ids.
        y: float32 placeholder of labels.
        one: float32 placeholder expected to be fed a vector of ones.
        word_mat: trainable embedding matrix initialised from `embeddings`.
        cos: clipped cosine similarity per pair.
        losses: summed log loss.
    """

    def __init__(self,
                 sequence_length,
                 vocab_size,
                 embedding_size,
                 embeddings):
        id_shape = [None, sequence_length]
        self.x1 = tf.placeholder(tf.int32, id_shape)
        self.x2 = tf.placeholder(tf.int32, id_shape)
        self.y = tf.placeholder(tf.float32, [None])
        self.one = tf.placeholder(tf.float32, [None])

        with tf.device('/cpu:0'), tf.name_scope('embedding'):
            self.word_mat = tf.Variable(embeddings, trainable=True, dtype=tf.float32)
            # A trailing channel axis is added so the pooling ops apply.
            self.x1_mat_exp = tf.expand_dims(
                tf.nn.embedding_lookup(self.word_mat, self.x1), -1)
            self.x2_mat_exp = tf.expand_dims(
                tf.nn.embedding_lookup(self.word_mat, self.x2), -1)

        def hier_pool(expanded):
            # Average over windows of 2 words, then max over the windows.
            windows = tf.nn.avg_pool(expanded, ksize=[1, 2, 1, 1],
                                     strides=[1, 1, 1, 1], padding='VALID')
            return tf.reshape(tf.reduce_max(windows, 1), [-1, embedding_size])

        # (A variant concatenating a window-3 pooling was disabled in the
        # original and is not built.)
        p1 = hier_pool(self.x1_mat_exp)
        p2 = hier_pool(self.x2_mat_exp)

        self.cos = self.cosine(p1, p2)
        self.losses = self.logloss(self.y, self.one, self.cos)

    def logloss(self, y, v_one, sim):
        """Return sum of -(y*log(sim) + (v_one - y)*log(v_one - sim))."""
        positive = tf.multiply(y, tf.log(sim))
        negative = tf.multiply(tf.subtract(v_one, y),
                               tf.log(tf.subtract(v_one, sim)))
        per_sample = -tf.add(positive, negative)
        return tf.reduce_sum(per_sample, -1)

    def cosine(self, t1, t2):
        """Row-wise cosine of t1 and t2, clipped to [1e-5, 0.99999]."""
        norm1 = tf.sqrt(tf.reduce_sum(tf.multiply(t1, t1), 1))
        norm2 = tf.sqrt(tf.reduce_sum(tf.multiply(t2, t2), 1))
        dot = tf.reduce_sum(tf.multiply(t1, t2), 1)
        raw = tf.div(dot, tf.multiply(norm1, norm2))
        return tf.clip_by_value(raw, 1e-5, 0.99999)
def get_constant(batch_size):
    """Return (ones, zeros): two float arrays of length `batch_size`."""
    ones = np.array([1.0] * batch_size)
    zeros = np.array([0.0] * batch_size)
    return ones, zeros
# Sentence length passed to the data loaders and to the model.
max_len = 100
# Number of training steps (one batch per step).
num_epoch = 200000
batch_size = 256
# The test set is evaluated whenever the global step is a multiple of this.
checkpoint_every = 10000
# Vocabulary and embedding matrix are loaded at import time.
vocab, embeddings = utils.load_embeddings()
embedding_size = len(embeddings[0])
train_data, test_data = utils.load_train_data(vocab, max_len), utils.load_test_data(vocab, max_len)
print('load data done ......')
print(embeddings.shape)
# Only referenced from commented-out code in this script.
prev_auc = 0.0
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=False)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        swem = SWEM_HIER(max_len, len(vocab), embedding_size, embeddings)
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-1)
        grads_and_vars = optimizer.compute_gradients(swem.losses)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # A per-run checkpoint directory is created under ./runs/<timestamp>.
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        # tf.global_variables / tf.global_variables_initializer replace the
        # deprecated tf.all_variables / tf.initialize_all_variables.
        saver = tf.train.Saver(tf.global_variables())
        sess.run(tf.global_variables_initializer())

        def train_step():
            """Run one optimisation step on a fresh batch and print the loss."""
            y, x1, x2 = utils.gen_train_batch_yxx(train_data, batch_size)
            one, _ = get_constant(batch_size)
            feed_dict = {swem.x1: x1, swem.x2: x2, swem.y: y, swem.one: one}
            _, step, loss, _ = sess.run(
                [train_op, global_step, swem.losses, swem.cos], feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}".format(time_str, step, loss))

        def test_step():
            """Score the test set; return (labels, groups, scores)."""
            yp, y, group = [], [], []
            for i in range(0, len(test_data), batch_size):
                f, g, x1, x2 = utils.gen_test_batch_yxx(test_data, i, i + batch_size)
                one, _ = get_constant(len(f))
                feed_dict = {swem.x1: x1, swem.x2: x2, swem.y: f, swem.one: one}
                _, cos = sess.run([swem.losses, swem.cos], feed_dict)
                yp.extend(cos)
                y.extend(f)
                group.extend(g)
            # Results are trimmed to the test-set size.
            n_test = len(test_data)
            return y[:n_test], group[:n_test], yp[:n_test]

        for i in range(num_epoch):
            train_step()
            current_step = tf.train.global_step(sess, global_step)
            if current_step % checkpoint_every == 0:
                y, g, yp = test_step()
                utils._eval(y, g, yp)
#utils.save_features(features[0] + features[1] + features[2], './data/gen_sweg_hier_train.f')
#utils.save_features(features[3], './data/gen_sweg_hier_test.f')
================================================
FILE: swem/swem_hier_margin.py
================================================
import numpy as np
import tensorflow as tf
import time, os, random, datetime, sys
from sklearn import metrics
sys.path.append('../')
import config, utils
#top 1 precision:54%
class SWEM_HIER(object):
    """Hierarchical + max pooling sentence matcher with a margin loss.

    Two separate embedding matrices are kept. The first feeds a window-2
    average pooling followed by a max over positions; the second feeds a
    window-1 average pooling (kernel size 1) followed by the same max.
    Both results are concatenated per sentence.

    Attributes:
        zero: float32 placeholder expected to be fed a vector of zeros.
        q, qp, qn: int32 placeholders of word ids (query, positive, negative).
        word_mat, word_mat1: the two trainable embedding matrices.
        cos_q_qp, cos_q_qn: clipped cosine similarities.
        losses: summed hinge loss.
        accuracy: fraction of samples whose hinge loss equals `zero`.
    """

    def __init__(self,
                 margin,
                 sequence_length,
                 vocab_size,
                 embedding_size,
                 embeddings):
        id_shape = [None, sequence_length]
        self.zero = tf.placeholder(tf.float32, [None])
        self.q = tf.placeholder(tf.int32, id_shape)
        self.qp = tf.placeholder(tf.int32, id_shape)
        self.qn = tf.placeholder(tf.int32, id_shape)

        def embed(matrix, ids):
            # Look up the ids and add a trailing channel axis.
            return tf.expand_dims(tf.nn.embedding_lookup(matrix, ids), -1)

        with tf.device('/cpu:0'), tf.name_scope('embedding'):
            self.word_mat = tf.Variable(embeddings, trainable=True, dtype=tf.float32)
            self.q_mat_exp = embed(self.word_mat, self.q)
            self.qp_mat_exp = embed(self.word_mat, self.qp)
            self.qn_mat_exp = embed(self.word_mat, self.qn)

            self.word_mat1 = tf.Variable(embeddings, trainable=True, dtype=tf.float32)
            self.q_mat_exp1 = embed(self.word_mat1, self.q)
            self.qp_mat_exp1 = embed(self.word_mat1, self.qp)
            self.qn_mat_exp1 = embed(self.word_mat1, self.qn)

        def pool(expanded, window):
            # Average over `window` consecutive words, then max over positions.
            averaged = tf.nn.avg_pool(expanded, ksize=[1, window, 1, 1],
                                      strides=[1, 1, 1, 1], padding='VALID')
            return tf.reshape(tf.reduce_max(averaged, 1), [-1, embedding_size])

        hier = [pool(t, 2) for t in
                (self.q_mat_exp, self.qp_mat_exp, self.qn_mat_exp)]
        flat = [pool(t, 1) for t in
                (self.q_mat_exp1, self.qp_mat_exp1, self.qn_mat_exp1)]
        q, qp, qn = [tf.concat([a, b], 1) for a, b in zip(hier, flat)]

        self.cos_q_qp = self.cosine(q, qp)
        self.cos_q_qn = self.cosine(q, qn)
        self.losses, loss_batch = self.margin_loss(
            self.zero, margin, self.cos_q_qp, self.cos_q_qn)
        correct = tf.equal(self.zero, loss_batch)
        self.accuracy = tf.reduce_mean(tf.cast(correct, "float"))

    def margin_loss(self, zero, margin, cos_q_qp, cos_q_qn):
        """Return (sum, per-sample) of max(zero, margin - (cos_q_qp - cos_q_qn))."""
        gap = tf.subtract(cos_q_qp, cos_q_qn)
        loss_batch = tf.maximum(zero, tf.subtract(margin, gap))
        return tf.reduce_sum(loss_batch), loss_batch

    def logloss(self, y, v_one, sim):
        """Return sum of -(y*log(sim) + (v_one - y)*log(v_one - sim))."""
        positive = tf.multiply(y, tf.log(sim))
        negative = tf.multiply(tf.subtract(v_one, y),
                               tf.log(tf.subtract(v_one, sim)))
        per_sample = -tf.add(positive, negative)
        return tf.reduce_sum(per_sample, -1)

    def cosine(self, t1, t2):
        """Row-wise cosine of t1 and t2, clipped to [1e-5, 0.99999]."""
        norm1 = tf.sqrt(tf.reduce_sum(tf.multiply(t1, t1), 1))
        norm2 = tf.sqrt(tf.reduce_sum(tf.multiply(t2, t2), 1))
        dot = tf.reduce_sum(tf.multiply(t1, t2), 1)
        raw = tf.div(dot, tf.multiply(norm1, norm2))
        return tf.clip_by_value(raw, 1e-5, 0.99999)
def get_constant(batch_size):
    """Build the constant vectors fed to the model.

    Returns a pair (one, zero) of float arrays with `batch_size` entries,
    filled with 1.0 and 0.0 respectively.
    """
    one = np.array([1.0] * batch_size)
    zero = np.array([0.0] * batch_size)
    return one, zero
# Margin of the hinge loss passed to SWEM_HIER.
margin = 0.05
# Sentence length passed to the data loaders and to the model.
max_len = 200
# Number of training steps (one batch per step).
num_epoch = 200000
batch_size = 256
# The test set is evaluated whenever the global step is a multiple of this.
checkpoint_every = 50000
# Vocabulary and embedding matrix are loaded at import time.
vocab, embeddings = utils.load_embeddings()
embedding_size = len(embeddings[0])
train_data, test_data = utils.load_train_data(vocab, max_len), utils.load_test_data(vocab, max_len)
print('load data done ......')
print(embeddings.shape)
# Only referenced from commented-out code in this script.
prev_auc = 0.0
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=False)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        swem = SWEM_HIER(margin, max_len, len(vocab), embedding_size, embeddings)
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-1)
        grads_and_vars = optimizer.compute_gradients(swem.losses)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # A per-run checkpoint directory is created under ./runs/<timestamp>.
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        # tf.global_variables / tf.global_variables_initializer replace the
        # deprecated tf.all_variables / tf.initialize_all_variables.
        saver = tf.train.Saver(tf.global_variables())
        sess.run(tf.global_variables_initializer())

        def train_step():
            """Run one optimisation step on a fresh batch; print loss and accuracy."""
            q, qp, qn = utils.gen_train_batch_qpn(train_data, batch_size)
            _, zero = get_constant(batch_size)
            feed_dict = {swem.q: q, swem.qp: qp, swem.qn: qn, swem.zero: zero}
            _, step, loss, _, acc = sess.run(
                [train_op, global_step, swem.losses, swem.cos_q_qp, swem.accuracy], feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc{:g}".format(time_str, step, loss, acc))

        def test_step():
            """Score the test set; return (labels, groups, scores)."""
            yp, y, group = [], [], []
            for i in range(0, len(test_data), batch_size):
                f, g, q1, q2 = utils.gen_test_batch_qpn(test_data, i, i + batch_size)
                _, zero = get_constant(len(f))
                # The candidate is fed as both positive and negative; only
                # cos_q_qp is read back.
                feed_dict = {swem.q: q1, swem.qp: q2, swem.qn: q2, swem.zero: zero}
                _, cos = sess.run([swem.losses, swem.cos_q_qp], feed_dict)
                yp.extend(cos)
                y.extend(f)
                group.extend(g)
            # Results are trimmed to the test-set size.
            n_test = len(test_data)
            return y[:n_test], group[:n_test], yp[:n_test]

        for i in range(num_epoch):
            train_step()
            current_step = tf.train.global_step(sess, global_step)
            if current_step % checkpoint_every == 0:
                y, g, yp = test_step()
                auc = utils.eval_auc(y, g, yp)
                top1_prec = utils._eval_top1_prec(y, g, yp)
#if auc < prev_auc:
# _flist = [(_f, [s]) for s, _f in zip(score[:len(test_data)], flags)]
# features.append(_flist)
# break
#prev_auc = auc
#utils.save_features(features[0] + features[1] + features[2], './data/gen_sweg_hier_train.f')
#utils.save_features(features[3], './data/gen_sweg_hier_test.f')
================================================
FILE: swem/swem_max_margin.py
================================================
import numpy as np
import tensorflow as tf
import time, os, random, datetime, sys
from sklearn import metrics
sys.path.append('../')
import config, utils
class SWEM_HIER(object):
    """Max-pooling sentence matcher trained with a margin loss.

    Each sentence is embedded and reduced with a max over the word axis;
    no window averaging is applied in this variant.

    Attributes:
        zero: float32 placeholder expected to be fed a vector of zeros.
        q, qp, qn: int32 placeholders of word ids (query, positive, negative).
        word_mat: trainable embedding matrix initialised from `embeddings`.
        cos_q_qp, cos_q_qn: clipped cosine similarities.
        losses: summed hinge loss.
        accuracy: fraction of samples whose hinge loss equals `zero`.
    """

    def __init__(self,
                 margin,
                 sequence_length,
                 vocab_size,
                 embedding_size,
                 embeddings):
        id_shape = [None, sequence_length]
        self.zero = tf.placeholder(tf.float32, [None])
        self.q = tf.placeholder(tf.int32, id_shape)
        self.qp = tf.placeholder(tf.int32, id_shape)
        self.qn = tf.placeholder(tf.int32, id_shape)

        with tf.device('/cpu:0'), tf.name_scope('embedding'):
            self.word_mat = tf.Variable(embeddings, trainable=True, dtype=tf.float32)
            # Look up the ids and add a trailing channel axis.
            self.q_mat_exp = tf.expand_dims(
                tf.nn.embedding_lookup(self.word_mat, self.q), -1)
            self.qp_mat_exp = tf.expand_dims(
                tf.nn.embedding_lookup(self.word_mat, self.qp), -1)
            self.qn_mat_exp = tf.expand_dims(
                tf.nn.embedding_lookup(self.word_mat, self.qn), -1)

        # (A window-2 average pooling stage was disabled in the original
        # and is not built.)
        out_shape = [-1, embedding_size]
        q = tf.reshape(tf.reduce_max(self.q_mat_exp, 1), out_shape)
        qp = tf.reshape(tf.reduce_max(self.qp_mat_exp, 1), out_shape)
        qn = tf.reshape(tf.reduce_max(self.qn_mat_exp, 1), out_shape)

        self.cos_q_qp = self.cosine(q, qp)
        self.cos_q_qn = self.cosine(q, qn)
        self.losses, loss_batch = self.margin_loss(
            self.zero, margin, self.cos_q_qp, self.cos_q_qn)
        correct = tf.equal(self.zero, loss_batch)
        self.accuracy = tf.reduce_mean(tf.cast(correct, "float"))

    def margin_loss(self, zero, margin, cos_q_qp, cos_q_qn):
        """Return (sum, per-sample) of max(zero, margin - (cos_q_qp - cos_q_qn))."""
        gap = tf.subtract(cos_q_qp, cos_q_qn)
        loss_batch = tf.maximum(zero, tf.subtract(margin, gap))
        return tf.reduce_sum(loss_batch), loss_batch

    def logloss(self, y, v_one, sim):
        """Return sum of -(y*log(sim) + (v_one - y)*log(v_one - sim))."""
        positive = tf.multiply(y, tf.log(sim))
        negative = tf.multiply(tf.subtract(v_one, y),
                               tf.log(tf.subtract(v_one, sim)))
        per_sample = -tf.add(positive, negative)
        return tf.reduce_sum(per_sample, -1)

    def cosine(self, t1, t2):
        """Row-wise cosine of t1 and t2, clipped to [1e-5, 0.99999]."""
        norm1 = tf.sqrt(tf.reduce_sum(tf.multiply(t1, t1), 1))
        norm2 = tf.sqrt(tf.reduce_sum(tf.multiply(t2, t2), 1))
        dot = tf.reduce_sum(tf.multiply(t1, t2), 1)
        raw = tf.div(dot, tf.multiply(norm1, norm2))
        return tf.clip_by_value(raw, 1e-5, 0.99999)
def get_constant(batch_size):
    """Return a (ones, zeros) pair of float arrays, each of length `batch_size`."""
    filled = [np.array([value] * batch_size) for value in (1.0, 0.0)]
    return filled[0], filled[1]
# Margin of the hinge loss passed to SWEM_HIER.
margin = 0.05
# Sentence length passed to the data loaders and to the model.
max_len = 200
# Number of training steps (one batch per step).
num_epoch = 200000
batch_size = 256
# The test set is evaluated whenever the global step is a multiple of this.
checkpoint_every = 50000
# Vocabulary and embedding matrix are loaded at import time.
vocab, embeddings = utils.load_embeddings()
embedding_size = len(embeddings[0])
train_data, test_data = utils.load_train_data(vocab, max_len), utils.load_test_data(vocab, max_len)
print('load data done ......')
print(embeddings.shape)
# Only referenced from commented-out code in this script.
prev_auc = 0.0
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=False)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        swem = SWEM_HIER(margin, max_len, len(vocab), embedding_size, embeddings)
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-1)
        grads_and_vars = optimizer.compute_gradients(swem.losses)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # A per-run checkpoint directory is created under ./runs/<timestamp>.
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        # tf.global_variables / tf.global_variables_initializer replace the
        # deprecated tf.all_variables / tf.initialize_all_variables.
        saver = tf.train.Saver(tf.global_variables())
        sess.run(tf.global_variables_initializer())

        def train_step():
            """Run one optimisation step on a fresh batch; print loss and accuracy."""
            q, qp, qn = utils.gen_train_batch_qpn(train_data, batch_size)
            _, zero = get_constant(batch_size)
            feed_dict = {swem.q: q, swem.qp: qp, swem.qn: qn, swem.zero: zero}
            _, step, loss, _, acc = sess.run(
                [train_op, global_step, swem.losses, swem.cos_q_qp, swem.accuracy], feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, acc))

        def test_step():
            """Score the test set; return (labels, groups, scores)."""
            yp, y, group = [], [], []
            for i in range(0, len(test_data), batch_size):
                f, g, q1, q2 = utils.gen_test_batch_qpn(test_data, i, i + batch_size)
                _, zero = get_constant(len(f))
                # The candidate is fed as both positive and negative; only
                # cos_q_qp is read back.
                feed_dict = {swem.q: q1, swem.qp: q2, swem.qn: q2, swem.zero: zero}
                _, cos = sess.run([swem.losses, swem.cos_q_qp], feed_dict)
                yp.extend(cos)
                y.extend(f)
                group.extend(g)
            # Results are trimmed to the test-set size.
            n_test = len(test_data)
            return y[:n_test], group[:n_test], yp[:n_test]

        for i in range(num_epoch):
            train_step()
            current_step = tf.train.global_step(sess, global_step)
            if current_step % checkpoint_every == 0:
                y, g, yp = test_step()
                auc = utils.eval_auc(y, g, yp)
                top1_prec = utils._eval_top1_prec(y, g, yp)
#if auc < prev_auc:
# _flist = [(_f, [s]) for s, _f in zip(score[:len(test_data)], flags)]
# features.append(_flist)
# break
#prev_auc = auc
#utils.save_features(features[0] + features[1] + features[2], './data/gen_sweg_hier_train.f')
#utils.save_features(features[3], './data/gen_sweg_hier_test.f')
================================================
FILE: utils.py
================================================
import numpy as np
import random, sys, config
from sklearn import metrics
from operator import itemgetter
from itertools import groupby
def load_embeddings():
    """Load the text-format word vectors named by ``config.w2v_bin_file``.

    The first line is read as a space-separated header whose second field is
    the vector dimension.  Every following line is a word followed by that
    many floats.  Two extra rows filled with 0.01 are appended after the
    file's words: one for ``'UNKNOWN'`` and one for the empty string ``''``
    (the token ``encode_sent`` pads with).

    Returns:
        (vocab, embeddings): ``vocab`` maps each word to its row index;
        ``embeddings`` is a numpy array with one row per vocab entry.

    Raises:
        ValueError: if the file has no usable ``<size> <dim>`` header line.
    """
    vocab, embeddings = {}, []
    # The with-block closes the handle (it used to be left open), and lines
    # are parsed as they are read instead of buffering the whole file first.
    with open(config.w2v_bin_file) as w2v_file:
        header = w2v_file.readline().strip().split(' ')
        if len(header) < 2:
            raise ValueError(
                'missing "<size> <dim>" header in ' + str(config.w2v_bin_file))
        dim = int(header[1])
        for line in w2v_file:
            fields = line.strip().split(' ')
            vec = [float(fields[k]) for k in range(1, dim + 1)]
            # Row index of this word is the current number of stored rows.
            vocab[fields[0]] = len(embeddings)
            embeddings.append(vec)
    # Special tokens get constant vectors placed after all file words.
    for token in ('UNKNOWN', ''):
        vocab[token] = len(embeddings)
        embeddings.append([0.01] * dim)
    return vocab, np.array(embeddings)
def encode_sent(s, vocab, max_len):
    """Convert an underscore-separated sentence into a list of vocab ids.

    The sentence is split on ``'_'``, cut to at most ``max_len`` tokens and
    right-padded with the empty-string token up to ``max_len``.  Tokens that
    are not in ``vocab`` are looked up as ``'UNKNOWN'`` instead.
    """
    tokens = s.split('_')[:max_len]
    # A non-positive multiplier yields no padding, so full-length input is
    # left as is.
    tokens = tokens + [''] * (max_len - len(tokens))
    return [vocab[tok if tok in vocab else 'UNKNOWN'] for tok in tokens]
def load_train_data(vocab, max_len):
    """Load the training set for the dataset selected in ``config.dataset``.

    Delegates to the Insurance-QA or the Quora loader; for any other dataset
    value it prints an error message and exits with status 1.
    """
    dataset = config.dataset
    if dataset == config.dataset_ins:
        return ins_load_train_data(vocab, max_len)
    elif dataset == config.dataset_qur:
        return qur_load_train_test_data(config.train_file, vocab, max_len)
    else:
        print('bad load_train_data')
        exit(1)
def qur_load_train_test_data(_file, vocab, max_len):
    """Read a ``<flag> <q1> <q2>`` file into encoded triples.

    Each line must hold exactly three space-separated fields.  The flag is
    converted to ``int`` and both sentences are encoded with ``encode_sent``.

    Returns:
        A list of ``(flag, q1_ids, q2_ids)`` tuples in file order.

    Raises:
        ValueError: if a line does not split into three fields or the flag
            is not an integer.
    """
    _data = []
    # Context manager so the file handle is closed (it used to leak).
    with open(_file) as data_file:
        for line in data_file:
            f, q1, q2 = line.strip().split(' ')
            q1 = encode_sent(q1, vocab, max_len)
            q2 = encode_sent(q2, vocab, max_len)
            _data.append((int(f), q1, q2))
    return _data
def ins_load_train_data(vocab, max_len):
    """Read ``config.train_file`` into encoded sentence pairs.

    Each line must hold exactly three space-separated fields; the first one
    is ignored and the other two are encoded with ``encode_sent``.

    Returns:
        A list of ``(q1_ids, q2_ids)`` tuples in file order.

    Raises:
        ValueError: if a line does not split into three fields.
    """
    _data = []
    # Context manager so the file handle is closed (it used to leak).
    with open(config.train_file) as train_file:
        for line in train_file:
            # The leading field is required by the format but not used here.
            _, q1, q2 = line.strip().split(' ')
            q1 = encode_sent(q1, vocab, max_len)
            q2 = encode_sent(q2, vocab, max_len)
            _data.append((q1, q2))
    return _data
def load_test_data(vocab, max_len):
    """Load the test set for the dataset selected in ``config.dataset``.

    Delegates to the Insurance-QA or the Quora loader; for any other dataset
    value it prints an error message and exits with status 1.
    """
    dataset = config.dataset
    if dataset == config.dataset_ins:
        return ins_load_test_data(vocab, max_len)
    elif dataset == config.dataset_qur:
        return qur_load_train_test_data(config.test1_file, vocab, max_len)
    else:
        print('bad load_test_data')
        exit(1)
def ins_load_test_data(vocab, max_len):
    """Read ``config.test1_file`` into encoded test rows.

    Each line must hold exactly four space-separated fields: flag, group,
    and two sentences.  The sentences are encoded with ``encode_sent``; flag
    and group are kept as the strings read from the file.

    Returns:
        A list of ``(flag, group, q1_ids, q2_ids)`` tuples in file order.

    Raises:
        ValueError: if a line does not split into four fields.
    """
    _data = []
    # Context manager so the file handle is closed (it used to leak).
    with open(config.test1_file) as test_file:
        for line in test_file:
            f, g, q1, q2 = line.strip().split(' ')
            q1 = encode_sent(q1, vocab, max_len)
            q2 = encode_sent(q2, vocab, max_len)
            _data.append((f, g, q1, q2))
    return _data
def gen_train_batch_qpn(_data, batch_size):
    """Sample a (question, positive, negative) training batch.

    Two independent samples of ``batch_size`` pairs are drawn from ``_data``.
    Questions and positives come from the first sample; negatives are the
    second elements of the second sample.

    Returns:
        Three numpy arrays ``(q, qp, qn)``.
    """
    positives = random.sample(_data, batch_size)
    negatives = random.sample(_data, batch_size)
    questions, pos_answers = [], []
    for question, answer in positives:
        questions.append(question)
        pos_answers.append(answer)
    neg_answers = []
    for _, answer in negatives:
        neg_answers.append(answer)
    return np.array(questions), np.array(pos_answers), np.array(neg_answers)
def gen_train_batch_yxx(_data, batch_size):
    """Build a (y, x1, x2) training batch for the configured dataset.

    Delegates to the Insurance-QA or the Quora batch generator depending on
    ``config.dataset``; for any other value it prints an error message and
    exits with status 1.
    """
    dataset = config.dataset
    if dataset == config.dataset_ins:
        return ins_gen_train_batch_yxx(_data, batch_size)
    elif dataset == config.dataset_qur:
        return qur_gen_train_batch_yxx(_data, batch_size)
    else:
        print('bad gen_train_batch_yxx')
        exit(1)
def qur_gen_train_batch_yxx(_data, batch_size):
    """Sample ``batch_size`` labelled pairs and return them column-wise.

    ``_data`` holds ``(label, x1, x2)`` triples.

    Returns:
        Three numpy arrays ``(y, x1, x2)``.
    """
    labels, lefts, rights = [], [], []
    for label, left, right in random.sample(_data, batch_size):
        labels.append(label)
        lefts.append(left)
        rights.append(right)
    return np.array(labels), np.array(lefts), np.array(rights)
def ins_gen_train_batch_yxx(_data, batch_size):
    """Build a labelled batch of about 3/4 positive and 1/4 negative pairs.

    Positive rows are pairs sampled from ``_data`` (label 1.0).  Negative rows
    (label 0.0) reuse the first element of the leading positive pairs and
    take the second element from a second, independent sample.

    The batch always has exactly ``batch_size`` rows.  The earlier split
    ``int(batch_size / 4 * 3) + int(batch_size / 4)`` fell short whenever
    ``batch_size`` was not a multiple of 4 (10 gave 7 + 2 = 9 rows).  For
    multiples of 4 the split is unchanged.

    Returns:
        Three numpy arrays ``(y, x1, x2)``.
    """
    n_neg = batch_size // 4
    # Positives absorb the remainder so the two parts sum to batch_size.
    n_pos = batch_size - n_neg
    psample = random.sample(_data, n_pos)
    nsample = random.sample(_data, n_neg)
    y = [1.0] * n_pos + [0.0] * n_neg
    x1 = [s1 for s1, _ in psample] + [s1 for s1, _ in psample[:n_neg]]
    x2 = [s2 for _, s2 in psample] + [s2 for _, s2 in nsample]
    return np.array(y), np.array(x1), np.array(x2)
def gen_test_batch_qpn(_data, start, end):
    """Return rows ``start:end`` of the test data as column-wise batches.

    ``_data`` holds ``(flag, group, q1, q2)`` rows.  If the slice is shorter
    than ``end - start``, its last row is repeated to fill the batch.

    Returns:
        ``(f, g, q1, q2)``: flags and groups as lists of ints, the two
        sentence columns as numpy arrays.
    """
    batch = _data[start:end]
    shortfall = (end - start) - len(batch)
    if shortfall > 0:
        # Repeat the final row so every batch has the same number of rows.
        batch.extend([batch[-1]] * shortfall)
    flags, groups, lefts, rights = [], [], [], []
    for flag, group, left, right in batch:
        flags.append(int(flag))
        groups.append(int(group))
        lefts.append(left)
        rights.append(right)
    return flags, groups, np.array(lefts), np.array(rights)
def gen_test_batch_yxx(_data, start, end):
    """Build a test batch for rows ``start:end`` of the configured dataset.

    Delegates to the Insurance-QA or the Quora batch builder depending on
    ``config.dataset``; for any other value it prints an error message and
    exits with status 1.
    """
    dataset = config.dataset
    if dataset == config.dataset_ins:
        return ins_gen_test_batch_yxx(_data, start, end)
    elif dataset == config.dataset_qur:
        return qur_gen_test_batch_yxx(_data, start, end)
    else:
        print('bad gen_test_batch_yxx')
        exit(1)
def qur_gen_test_batch_yxx(_data, start, end):
    """Return rows ``start:end`` of ``(label, x1, x2)`` data column-wise.

    No padding is applied, so the final batch may be shorter than
    ``end - start``.

    Returns:
        ``(y, y, x1, x2)``: the label list appears twice (the second copy
        fills the group slot of the four-value batch format), followed by
        the two sentence columns as numpy arrays.
    """
    labels, lefts, rights = [], [], []
    for label, left, right in _data[start:end]:
        labels.append(label)
        lefts.append(left)
        rights.append(right)
    return labels, labels, np.array(lefts), np.array(rights)
def ins_gen_test_batch_yxx(_data, start, end):
    """Return rows ``start:end`` of ``(flag, group, q1, q2)`` data column-wise.

    A slice shorter than ``end - start`` is padded by repeating its last row.

    Returns:
        ``(f, g, q1, q2)``: flags and groups as lists of ints, the two
        sentence columns as numpy arrays.
    """
    rows = _data[start:end]
    wanted = end - start
    while len(rows) < wanted:
        # Duplicate the last row until the batch is full.
        rows.append(rows[-1])
    f, g, q1, q2 = [], [], [], []
    for flag, group, first, second in rows:
        f.append(int(flag))
        g.append(int(group))
        q1.append(first)
        q2.append(second)
    return f, g, np.array(q1), np.array(q2)
def _eval(y, g, yp):
    """Print the metrics that apply to the dataset in ``config.dataset``.

    Insurance-QA gets AUC plus top-1 precision; Quora gets AUC plus the best
    thresholded precision.  Nothing is returned.
    """
    reporters = []
    if config.dataset == config.dataset_ins:
        reporters.extend([eval_auc, eval_top1_prec])
    if config.dataset == config.dataset_qur:
        reporters.extend([eval_auc, eval_best_prec])
    for report in reporters:
        report(y, g, yp)
def eval_best_prec(y, g, yp):
    """Find the score threshold in 0.50..0.99 that agrees best with ``y``.

    For every threshold (step 0.01) a score ``>= threshold`` is predicted as
    1 and anything else as 0.  The fraction of predictions that equal the
    label is computed over ``len(yp)``.  The first threshold reaching the
    highest fraction wins.  ``g`` is accepted but not used.

    Returns:
        ``(best_precision, best_threshold)``; both are also printed.
    """
    best_p, best_s = 0.0, 0.0
    for percent in range(50, 100):
        threshold = float(percent) / 100
        hits = sum(
            1 for label, score in zip(y, yp)
            if (1 if score >= threshold else 0) == label
        )
        precision = hits / len(yp)
        # Strict comparison keeps the lowest threshold among ties.
        if precision > best_p:
            best_p, best_s = precision, threshold
    print('best_prec: ' + str(best_p) + ' best_threshold:' + str(best_s))
    return best_p, best_s
def eval_auc(y, g, yp):
    """Compute, print and return the ROC AUC of scores ``yp`` against ``y``.

    ``g`` is accepted so all eval functions share one signature; it is not
    used here.
    """
    roc_auc = metrics.roc_auc_score(y, yp)
    print('auc: ' + str(roc_auc))
    return roc_auc
def eval_top1_prec(y, g, yp):
    """Compute top-1 precision per group.

    Rows are bucketed by their group id in ``g``.  A group counts as a hit
    when its highest-scoring row (by ``yp``) has label 1.  The stable sort
    means the earliest row wins among equal scores.

    Returns:
        hits / number of groups; the ratio is also printed.
    """
    buckets = {}
    for row in zip(y, g, yp):
        buckets.setdefault(row[1], []).append(row)
    hits = 0
    for rows in buckets.values():
        top_row = sorted(rows, key=itemgetter(2), reverse=True)[0]
        if top_row[0] == 1:
            hits += 1
    n_groups = len(buckets)
    prec = hits / n_groups
    print('top1 precision ' + str(hits) + '/' + str(n_groups) + ': '+ str(hits / n_groups))
    return prec