Repository: white127/QA-deep-learning
Branch: master
Commit: 94971ec4b305
Files: 20
Total size: 111.3 KB
Directory structure:
gitextract_f17xp5p4/
├── README.md
├── cnn/
│ ├── tensorflow/
│ │ ├── README.md
│ │ ├── insqa_cnn.py
│ │ ├── insqa_cnn.py.old
│ │ ├── insqa_train.py
│ │ ├── insqa_train.py.old
│ │ ├── insurance_qa_data_helpers.py
│ │ └── test.py
│ └── theano/
│ ├── README.md
│ └── insqa_cnn.py
├── config.py
├── gen.py
├── lstm_cnn/
│ └── theano/
│ ├── README.md
│ └── insqa_lstm.py
├── rnn_attention/
│ └── tensorflow/
│ ├── insurance_qa_data_helpers.py
│ └── tf_rnn_char.py
├── swem/
│ ├── swem_hier.py
│ ├── swem_hier_margin.py
│ └── swem_max_margin.py
└── utils.py
================================================
FILE CONTENTS
================================================
================================================
FILE: README.md
================================================
Insurance-QA deeplearning model
======
This is a repo for Q&A matching. It includes several deep learning models, such as CNN and RNN.<br>
1. CNN. Basic CNN model from 《Applying Deep Learning To Answer Selection: A Study And An Open Task》<br>
2. RNN. RNN seems the best model on Insurance-QA dataset.<br>
3. SWEM. SWEM is the fastest and works well on other datasets, such as WikiQA, but it does not seem to perform as well on the Insurance-QA dataset. I think SWEM is more suitable for Q&Q matching than for Q&A matching.<br>
It's hard to say which model is the best in other datasets, you have to choose the most suitable model for you.<br><br>
More models are on the way, pay attention to the updates.<br>
## Requirements
1. tensorflow 1.4.0<br>
2. python3.5<br>
## Performance
margin loss version<br>
Model/Score | Ins_qa_top1_precision | quora_best_prec
------------ | ------------- | -------------
CNN | 62% | None
LSTM+CNN | 68% | None
SWEM | <55% | None
logloss version<br>
Model/Score | Insqa_top1_precision | quora_best_prec
------------ | ------------- | -------------
CNN | None | 79.60%
LSTM+CNN | None | None
SWEM | <40% | 82.69%
## Running
Change the configuration to match your own environment, e.g. the data paths<br>
vim config.py
Data processing<br>
python3 gen.py
Run CNN model<br>
cd ./cnn/tensorflow && python3 insqa_train.py
It will take a few hours (thousands of epochs) to train this model on a single GPU.<br>
## Downloads
1. You can get Insurance-QA data from here https://github.com/shuzi/insuranceQA<br>
2. You can get Quora data from here http://qim.ec.quoracdn.net/quora_duplicate_questions.tsv<br>
## Links
1. CNN and RNN textual classification repo https://github.com/white127/TextClassification_CNN_RNN<br>
2. 《Applying Deep Learning To Answer Selection: A Study And An Open Task》<br>
================================================
FILE: cnn/tensorflow/README.md
================================================
================result==================
结果和theano版本的差不多,具体数值忘了
虽然代码里写了dropout,但是实际并没有使用,dropout对结果影响不是特别大,不用dropout的话训练速度要快一些。
================dataset================
数据格式和theano版本的是一样的
github上给出的是样本数据,如果需要全量的,也可直接联系我
dataset is large, only test1 sample is given (see ./insuranceQA/test1.sample)
I converted original idx_xx format to real-word format (see ./insuranceQA/train ./insuranceQA/test1.sample)
you can get the original dataset from https://github.com/shuzi/insuranceQA
word embedding is trained by word2vec toolkit
=================run=====================
./insqa_train.py
我使用的是python3.4,部分代码可能会和python2不兼容,如使用python2需要自己做一些小修改,核心的CNN代码应该
不用改动的
代码里的数据路径(类似'/export/...')是需要根据自己的环境修改的,指向自己的数据路径即可。核心的CNN代码无需改动
================================================
FILE: cnn/tensorflow/insqa_cnn.py
================================================
import tensorflow as tf
import numpy as np
##########################################################################
# embedding_lookup + cnn + cosine margine , batch
##########################################################################
class InsQACNN(object):
    """CNN ranking model over (query, positive, negative) token-id triples.

    All three inputs go through the same embedding table and the same
    convolution filters. The training signal is a hinge loss on the
    difference between cos(query, positive) and cos(query, negative).
    """
    def __init__(self, _margin, sequence_length, batch_size,
                 vocab_size, embedding_size,
                 filter_sizes, num_filters, l2_reg_lambda=0.0):
        """Build the whole graph: placeholders, embedding, conv towers, loss, accuracy.

        Args:
            _margin: margin of the hinge loss.
            sequence_length: fixed number of token ids per input row.
            batch_size: fixed number of rows per feed (baked into the placeholders).
            vocab_size: number of rows of the embedding table.
            embedding_size: width of each embedding vector.
            filter_sizes: iterable of convolution window heights.
            num_filters: number of filters per window height.
            l2_reg_lambda: multiplier applied to ``l2_loss`` in the total loss.
        """
        self.L, self.B, self.V, self.E, self.FS, self.NF = sequence_length, batch_size, \
            vocab_size, embedding_size, filter_sizes, num_filters
        # User question; token ids are mapped to vectors with embedding_lookup
        self.q = tf.placeholder(tf.int32, [self.B, self.L], name="q")
        # Positive candidate to be matched against the question
        self.qp = tf.placeholder(tf.int32, [self.B, self.L], name="qp")
        # Negative candidate
        self.qn = tf.placeholder(tf.int32, [self.B, self.L], name="qn")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
        # NOTE(review): nothing in this class ever adds to l2_loss, so the
        # l2_reg_lambda term of the loss below is always zero.
        l2_loss = tf.constant(0.0)
        # Embedding layer
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            # Randomly initialised table, uniform in [-1, 1)
            W = tf.get_variable(
                initializer=tf.random_uniform([self.V, self.E], -1.0, 1.0),
                name='We')
            self.qe = tf.nn.embedding_lookup(W, self.q)
            self.qpe = tf.nn.embedding_lookup(W, self.qp)
            self.qne = tf.nn.embedding_lookup(W, self.qn)
            # Append a trailing channel axis of size 1, as conv2d expects 4-D input
            self.qe = tf.expand_dims(self.qe, -1)
            self.qpe = tf.expand_dims(self.qpe, -1)
            self.qne = tf.expand_dims(self.qne, -1)
        with tf.variable_scope('shared-conv') as scope:
            # First call creates the filter variables; after reuse_variables()
            # the next two calls fetch the same variables by name.
            self.qe = self.conv(self.qe)
            scope.reuse_variables()
            #tf.get_variable_scope().reuse_variables()
            self.qpe = self.conv(self.qpe)
            scope.reuse_variables()
            #tf.get_variable_scope().reuse_variables()
            self.qne = self.conv(self.qne)
        self.cos_q_qp = self.cosine(self.qe, self.qpe)
        self.cos_q_qn = self.cosine(self.qe, self.qne)
        zero = tf.constant(0, shape=[self.B], dtype=tf.float32)
        margin = tf.constant(_margin, shape=[self.B], dtype=tf.float32)
        with tf.name_scope("loss"):
            # Per-row hinge: max(0, margin - (cos(q, qp) - cos(q, qn)))
            self.losses = tf.maximum(zero, tf.subtract(margin, tf.subtract(self.cos_q_qp, self.cos_q_qn)))
            # Summed (not averaged) over the batch
            self.loss = tf.reduce_sum(self.losses) + l2_reg_lambda * l2_loss
            print('loss ', self.loss)
        # Accuracy
        with tf.name_scope("accuracy"):
            # A row counts as correct when its hinge loss is exactly zero
            self.correct = tf.equal(zero, self.losses)
            self.accuracy = tf.reduce_mean(tf.cast(self.correct, "float"), name="accuracy")
        for v in tf.trainable_variables():
            print(v)
    def conv(self, tensor):
        """Apply every filter size to ``tensor``, max-pool, concatenate and apply dropout.

        Args:
            tensor: 4-D embedded input produced in ``__init__``.

        Returns:
            A tensor reshaped to ``[-1, num_filters * len(filter_sizes)]``.
        """
        pooled = []
        #with tf.variable_scope(name_or_scope='my-conv', reuse=tf.AUTO_REUSE):
        with tf.variable_scope("my-conv-shared"):
            for i, fs in enumerate(self.FS):
                # One filter bank per window height; the filter spans the full embedding width
                filter_shape = [fs, self.E, 1, self.NF]
                W = tf.get_variable(initializer=tf.truncated_normal(filter_shape, stddev=0.1),
                                    name="W-%s" % str(fs))
                b = tf.get_variable(initializer=tf.constant(0.1, shape=[self.NF]),
                                    name="b-%s" % str(fs))
                conv = tf.nn.conv2d(
                    tensor, W, strides=[1, 1, 1, 1], padding='VALID',
                    name="conv")
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # Pool window height is L - fs + 1, i.e. the number of window positions
                output = tf.nn.max_pool(
                    h, ksize=[1, self.L - fs + 1, 1, 1], strides=[1, 1, 1, 1], padding='VALID',
                    name="pool")
                pooled.append(output)
            num_filters_total = self.NF * len(self.FS)
            # Concatenate along the filter axis and flatten to one feature row per input
            pooled = tf.reshape(tf.concat(pooled, 3), [-1, num_filters_total])
            pooled = tf.nn.dropout(pooled, self.dropout_keep_prob)
            return pooled
    def cosine(self, v1, v2):
        """Return the row-wise cosine of ``v1`` and ``v2``, clipped to [1e-5, 0.99999].

        NOTE(review): the denominator is not guarded; a row whose norm is
        zero would divide by zero — confirm whether that can occur.
        """
        l1 = tf.sqrt(tf.reduce_sum(tf.multiply(v1, v1), 1))
        l2 = tf.sqrt(tf.reduce_sum(tf.multiply(v2, v2), 1))
        a = tf.reduce_sum(tf.multiply(v1, v2), 1)
        cos = tf.div(a, tf.multiply(l1, l2), name='score')
        return tf.clip_by_value(cos, 1e-5, 0.99999)
================================================
FILE: cnn/tensorflow/insqa_cnn.py.old
================================================
import tensorflow as tf
import numpy as np
##########################################################################
# embedding_lookup + cnn + cosine margine , batch
##########################################################################
class InsQACNN1(object):
    """Earlier version of the CNN ranking model with the three towers written out inline.

    The same filter variables ``W`` and ``b`` are applied to the question,
    the positive candidate and the negative candidate; the loss is a hinge
    on the difference of the two cosine scores with a fixed margin of 0.05.
    """
    def __init__(
            self, sequence_length, batch_size,
            vocab_size, embedding_size,
            filter_sizes, num_filters, l2_reg_lambda=0.0):
        """Build the graph.

        Args:
            sequence_length: fixed number of token ids per input row.
            batch_size: fixed number of rows per feed.
            vocab_size: number of rows of the embedding table.
            embedding_size: width of each embedding vector.
            filter_sizes: iterable of convolution window heights.
            num_filters: number of filters per window height.
            l2_reg_lambda: multiplier applied to ``l2_loss`` in the total loss.
        """
        # User question; token ids are mapped to vectors with embedding_lookup
        self.input_x_1 = tf.placeholder(tf.int32, [batch_size, sequence_length], name="input_x_1")
        # Positive candidate to be matched against the question
        self.input_x_2 = tf.placeholder(tf.int32, [batch_size, sequence_length], name="input_x_2")
        # Negative candidate
        self.input_x_3 = tf.placeholder(tf.int32, [batch_size, sequence_length], name="input_x_3")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
        # NOTE(review): l2_loss is never added to, so the regularisation term stays zero.
        l2_loss = tf.constant(0.0)
        print("input_x_1 ", self.input_x_1)
        # Embedding layer
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            W = tf.Variable(
                tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
                name="W")
            chars_1 = tf.nn.embedding_lookup(W, self.input_x_1)
            chars_2 = tf.nn.embedding_lookup(W, self.input_x_2)
            chars_3 = tf.nn.embedding_lookup(W, self.input_x_3)
            #self.embedded_chars_1 = tf.nn.dropout(chars_1, self.dropout_keep_prob)
            #self.embedded_chars_2 = tf.nn.dropout(chars_2, self.dropout_keep_prob)
            #self.embedded_chars_3 = tf.nn.dropout(chars_3, self.dropout_keep_prob)
            self.embedded_chars_1 = chars_1
            self.embedded_chars_2 = chars_2
            self.embedded_chars_3 = chars_3
            # Append a trailing channel axis of size 1 for conv2d
            self.embedded_chars_expanded_1 = tf.expand_dims(self.embedded_chars_1, -1)
            self.embedded_chars_expanded_2 = tf.expand_dims(self.embedded_chars_2, -1)
            self.embedded_chars_expanded_3 = tf.expand_dims(self.embedded_chars_3, -1)
        pooled_outputs_1 = []
        pooled_outputs_2 = []
        pooled_outputs_3 = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # One W/b pair per filter size, shared by all three inputs below
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                # Tower 1: question
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded_1,
                    W,
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="conv-1"
                )
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu-1")
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="poll-1"
                )
                pooled_outputs_1.append(pooled)
                # Tower 2: positive candidate
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded_2,
                    W,
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="conv-2"
                )
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu-2")
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="poll-2"
                )
                pooled_outputs_2.append(pooled)
                # Tower 3: negative candidate
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded_3,
                    W,
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="conv-3"
                )
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu-3")
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="poll-3"
                )
                pooled_outputs_3.append(pooled)
        num_filters_total = num_filters * len(filter_sizes)
        # Concatenate along the filter axis and flatten to one feature row per input
        pooled_reshape_1 = tf.reshape(tf.concat(pooled_outputs_1, 3), [-1, num_filters_total])
        pooled_reshape_2 = tf.reshape(tf.concat(pooled_outputs_2, 3), [-1, num_filters_total])
        pooled_reshape_3 = tf.reshape(tf.concat(pooled_outputs_3, 3), [-1, num_filters_total])
        #dropout
        pooled_flat_1 = tf.nn.dropout(pooled_reshape_1, self.dropout_keep_prob)
        pooled_flat_2 = tf.nn.dropout(pooled_reshape_2, self.dropout_keep_prob)
        pooled_flat_3 = tf.nn.dropout(pooled_reshape_3, self.dropout_keep_prob)
        pooled_len_1 = tf.sqrt(tf.reduce_sum(tf.multiply(pooled_flat_1, pooled_flat_1), 1)) # vector norms, computed batch-wise
        pooled_len_2 = tf.sqrt(tf.reduce_sum(tf.multiply(pooled_flat_2, pooled_flat_2), 1))
        pooled_len_3 = tf.sqrt(tf.reduce_sum(tf.multiply(pooled_flat_3, pooled_flat_3), 1))
        pooled_mul_12 = tf.reduce_sum(tf.multiply(pooled_flat_1, pooled_flat_2), 1) # dot products, computed batch-wise
        pooled_mul_13 = tf.reduce_sum(tf.multiply(pooled_flat_1, pooled_flat_3), 1)
        with tf.name_scope("output"):
            self.cos_12 = tf.div(pooled_mul_12, tf.multiply(pooled_len_1, pooled_len_2), name="scores") # cosine of the angle, computed batch-wise
            self.cos_13 = tf.div(pooled_mul_13, tf.multiply(pooled_len_1, pooled_len_3))
        zero = tf.constant(0, shape=[batch_size], dtype=tf.float32)
        margin = tf.constant(0.05, shape=[batch_size], dtype=tf.float32)
        with tf.name_scope("loss"):
            # Per-row hinge: max(0, margin - (cos_12 - cos_13)), summed over the batch
            self.losses = tf.maximum(zero, tf.subtract(margin, tf.subtract(self.cos_12, self.cos_13)))
            self.loss = tf.reduce_sum(self.losses) + l2_reg_lambda * l2_loss
            print('loss ', self.loss)
        # Accuracy
        with tf.name_scope("accuracy"):
            # A row counts as correct when its hinge loss is exactly zero
            self.correct = tf.equal(zero, self.losses)
            self.accuracy = tf.reduce_mean(tf.cast(self.correct, "float"), name="accuracy")
        for v in tf.trainable_variables():
            print(v)
        # NOTE(review): this ends the process right after the variables are
        # printed — presumably a leftover debugging stop; confirm before reuse.
        exit(1)
================================================
FILE: cnn/tensorflow/insqa_train.py
================================================
#! /usr/bin/env python3.4
import tensorflow as tf
import numpy as np
import os, time, datetime, operator, sys
from insqa_cnn import InsQACNN
sys.path.append('../../')
import config, utils
print(tf.__version__)
# Parameters
# ==================================================
# NOTE(review): the "(default: ...)" fragments inside several help strings
# below do not match the default values actually passed to DEFINE_*.
# Model Hyperparameters
tf.flags.DEFINE_float("margin", 0.05, "CNN model margin")
tf.flags.DEFINE_integer("sequence_length", 200, "Max sequence lehgth(default: 200)")
tf.flags.DEFINE_integer("embedding_dim", 100, "Dimensionality of character embedding (default: 128)")
tf.flags.DEFINE_string("filter_sizes", "1,2,3,5", "Comma-separated filter sizes (default: '3,4,5')")
tf.flags.DEFINE_integer("num_filters", 256, "Number of filters per filter size (default: 128)")
tf.flags.DEFINE_float("dropout_keep_prob", 1.0, "Dropout keep probability (default: 0.5)")
tf.flags.DEFINE_float("l2_reg_lambda", 0, "L2 regularizaion lambda (default: 0.0)")
# Training parameters
tf.flags.DEFINE_integer("batch_size", 256, "Batch Size (default: 64)")
tf.flags.DEFINE_integer("num_epochs", 5000000, "Number of training epochs (default: 200)")
tf.flags.DEFINE_integer("evaluate_every", 3000, "Evaluate model on dev set after this many steps (default: 100)")
tf.flags.DEFINE_integer("checkpoint_every", 3000, "Save model after this many steps (default: 100)")
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
FLAGS = tf.flags.FLAGS
# Private TensorFlow flag API; forces parsing so every flag value can be echoed below
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")
# Data Preparation
# ==================================================
# Load data
print("Loading data...")
# NOTE(review): `embeddings` is not referenced again in the visible part of this script.
vocab, embeddings = utils.load_embeddings()
train_data = utils.load_train_data(vocab, FLAGS.sequence_length)
test_data = utils.load_test_data(vocab, FLAGS.sequence_length)
print("Load done...")
# Training
# ==================================================
# Last AUC returned by test_step(); only read by the disabled early-stop check below.
prev_auc = 0
with tf.Graph().as_default():
    with tf.device("/gpu:1"):
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = InsQACNN(
                _margin=FLAGS.margin,
                sequence_length=FLAGS.sequence_length,
                batch_size=FLAGS.batch_size,
                vocab_size=len(vocab),
                embedding_size=FLAGS.embedding_dim,
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                num_filters=FLAGS.num_filters,
                l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-1)
            #optimizer = tf.train.GradientDescentOptimizer(1e-2)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train Summaries (FileWriter takes the Graph; passing a GraphDef is deprecated)
            train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables())

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            def train_step(q, qp, qn):
                """Run one optimisation step on a (question, positive, negative) batch and log it."""
                feed_dict = {
                    cnn.q: q, cnn.qp: qp, cnn.qn: qn,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run(
                    [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                train_summary_writer.add_summary(summaries, step)

            def test_step():
                """Score the whole test set, write predictions to config.predict1_file, return the AUC."""
                yp, y, group = [], [], []
                for i in range(0, len(test_data), FLAGS.batch_size):
                    f, g, q1, q2 = utils.gen_test_batch_qpn(test_data, i, i + FLAGS.batch_size)
                    # The candidate is fed as both positive and negative; only cos(q, qp) is read.
                    feed_dict = {
                        cnn.q: q1, cnn.qp: q2, cnn.qn: q2,
                        cnn.dropout_keep_prob: 1.0
                    }
                    cos = sess.run([cnn.cos_q_qp], feed_dict)
                    yp.extend(cos[0])
                    y.extend(f)
                    group.extend(g)
                # Truncate all three lists to the real test-set size.
                total = len(test_data)
                y, g, yp = y[:total], group[:total], yp[:total]
                auc = utils.eval_auc(y, g, yp)
                top1_prec = utils._eval_top1_prec(y, g, yp)
                # Open the output only once results exist, and always close it.
                with open(config.predict1_file, 'w') as of:
                    for p in yp:
                        of.write(str(p) + '\n')
                    of.write(str(top1_prec) + '\n')
                return auc

            # Generate batches
            # Training loop. For each batch...
            for i in range(FLAGS.num_epochs):
                try:
                    q, qp, qn = utils.gen_train_batch_qpn(train_data, FLAGS.batch_size)
                    train_step(q, qp, qn)
                    current_step = tf.train.global_step(sess, global_step)
                    if current_step % FLAGS.evaluate_every == 0:
                        auc = test_step()
                        #if auc < prev_auc: break
                        prev_auc = auc
                    if current_step % FLAGS.checkpoint_every == 0:
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        print("Saved model checkpoint to {}\n".format(path))
                except Exception as e:
                    # Best effort: a failing step is reported and training continues.
                    print(e)
================================================
FILE: cnn/tensorflow/insqa_train.py.old
================================================
#! /usr/bin/env python3.4
import tensorflow as tf
import numpy as np
import os
import time
import datetime
import insurance_qa_data_helpers
from insqa_cnn import InsQACNN1
import operator
#print tf.__version__
# Parameters
# ==================================================
# NOTE(review): the "(default: ...)" fragments inside several help strings
# below do not match the default values actually passed to DEFINE_*.
# Model Hyperparameters
tf.flags.DEFINE_integer("embedding_dim", 100, "Dimensionality of character embedding (default: 128)")
tf.flags.DEFINE_string("filter_sizes", "1,2,3,5", "Comma-separated filter sizes (default: '3,4,5')")
tf.flags.DEFINE_integer("num_filters", 256, "Number of filters per filter size (default: 128)")
tf.flags.DEFINE_float("dropout_keep_prob", 1.0, "Dropout keep probability (default: 0.5)")
tf.flags.DEFINE_float("l2_reg_lambda", 0, "L2 regularizaion lambda (default: 0.0)")
# Training parameters
tf.flags.DEFINE_integer("batch_size", 100, "Batch Size (default: 64)")
tf.flags.DEFINE_integer("num_epochs", 5000000, "Number of training epochs (default: 200)")
tf.flags.DEFINE_integer("evaluate_every", 5000, "Evaluate model on dev set after this many steps (default: 100)")
tf.flags.DEFINE_integer("checkpoint_every", 5000, "Save model after this many steps (default: 100)")
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
FLAGS = tf.flags.FLAGS
# Private TensorFlow flag API; forces parsing so every flag value can be echoed below
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")
# Data Preparation
# ==================================================
# Load data
print("Loading data...")
vocab = insurance_qa_data_helpers.build_vocab()
alist = insurance_qa_data_helpers.read_alist()
raw = insurance_qa_data_helpers.read_raw()
# One sampled batch, used below only to report its shape and to derive sequence_length
x_train_1, x_train_2, x_train_3 = insurance_qa_data_helpers.load_data_6(vocab, alist, raw, FLAGS.batch_size)
testList, vectors = insurance_qa_data_helpers.load_test_and_vectors()
# The loaded word vectors are discarded immediately; only testList is kept
vectors = ''
print('x_train_1', np.shape(x_train_1))
print("Load done...")
# Test file re-read by dev_step() for labels, and the file precision counts are appended to
val_file = '/export/jw/cnn/insuranceQA/test1'
precision = '/export/jw/cnn/insuranceQA/test1.acc'
#x_val, y_val = data_deepqa.load_data_val()
# Training
# ==================================================
with tf.Graph().as_default():
    with tf.device("/gpu:1"):
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Sequence length is taken from the width of the sampled training batch
            cnn = InsQACNN1(
                sequence_length=x_train_1.shape[1],
                batch_size=FLAGS.batch_size,
                vocab_size=len(vocab),
                embedding_size=FLAGS.embedding_dim,
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                num_filters=FLAGS.num_filters,
                l2_reg_lambda=FLAGS.l2_reg_lambda)
            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-1)
            #optimizer = tf.train.GradientDescentOptimizer(1e-2)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)
            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))
            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)
            # Train Summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph_def)
            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph_def)
            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.all_variables())
            # Initialize all variables
            sess.run(tf.initialize_all_variables())
            def train_step(x_batch_1, x_batch_2, x_batch_3):
                """
                A single training step
                """
                feed_dict = {
                    cnn.input_x_1: x_batch_1,
                    cnn.input_x_2: x_batch_2,
                    cnn.input_x_3: x_batch_3,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run(
                    [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                train_summary_writer.add_summary(summaries, step)
            def dev_step():
                """Score the test list, then count how often the top-scored candidate per question is labelled '1'."""
                scoreList = []
                i = int(0)
                # Score the test list one fixed-size batch at a time
                while True:
                    x_test_1, x_test_2, x_test_3 = insurance_qa_data_helpers.load_data_val_6(testList, vocab, i, FLAGS.batch_size)
                    feed_dict = {
                        cnn.input_x_1: x_test_1,
                        cnn.input_x_2: x_test_2,
                        cnn.input_x_3: x_test_3,
                        cnn.dropout_keep_prob: 1.0
                    }
                    batch_scores = sess.run([cnn.cos_12], feed_dict)
                    for score in batch_scores[0]:
                        scoreList.append(score)
                    i += FLAGS.batch_size
                    if i >= len(testList):
                        break
                # Group (score, label) pairs by question id, re-reading the test file line by line
                sessdict = {}
                index = int(0)
                for line in open(val_file):
                    items = line.strip().split(' ')
                    # Second field has the form "<key>:<qid>"; first field is the label
                    qid = items[1].split(':')[1]
                    if not qid in sessdict:
                        sessdict[qid] = []
                    sessdict[qid].append((scoreList[index], items[0]))
                    index += 1
                    if index >= len(testList):
                        break
                # lev1 / lev0: questions whose best-scored candidate is labelled '1' / '0'
                lev1 = float(0)
                lev0 = float(0)
                of = open(precision, 'a')
                for k, v in sessdict.items():
                    v.sort(key=operator.itemgetter(0), reverse=True)
                    score, flag = v[0]
                    if flag == '1':
                        lev1 += 1
                    if flag == '0':
                        lev0 += 1
                of.write('lev1:' + str(lev1) + '\n')
                of.write('lev0:' + str(lev0) + '\n')
                print('lev1 ' + str(lev1))
                print('lev0 ' + str(lev0))
                of.close()
            # Generate batches
            # Training loop. For each batch...
            for i in range(FLAGS.num_epochs):
                try:
                    x_batch_1, x_batch_2, x_batch_3 = insurance_qa_data_helpers.load_data_6(vocab, alist, raw, FLAGS.batch_size)
                    train_step(x_batch_1, x_batch_2, x_batch_3)
                    current_step = tf.train.global_step(sess, global_step)
                    if current_step % FLAGS.evaluate_every == 0:
                        print("\nEvaluation:")
                        dev_step()
                        print("")
                    if current_step % FLAGS.checkpoint_every == 0:
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        print("Saved model checkpoint to {}\n".format(path))
                except Exception as e:
                    # Any failure in a step is printed and the loop moves on
                    print(e)
================================================
FILE: cnn/tensorflow/insurance_qa_data_helpers.py
================================================
import numpy as np
import random
# Constant float vectors shared by the helpers in this module.
# empty_vector is what read_vector() returns for a word with no embedding.
empty_vector = [0.0] * 100
# 10-element all-ones / all-zeros vectors.
onevector = [1.0] * 10
zerovector = [0.0] * 10
def build_vocab(train_path='/export/jw/cnn/insuranceQA/train',
                test_path='/export/jw/cnn/insuranceQA/test1'):
    """Build a word -> integer id map from the train and test files.

    Each line is split on single spaces; the third and fourth fields are
    split on ``'_'`` and every word not seen before gets the next free id.
    Id 0 is reserved for the ``'UNKNOWN'`` token.

    Args:
        train_path: path of the training file (read first).
        test_path: path of the test file (read second).

    Returns:
        dict mapping each word to its id.

    Raises:
        IndexError: if a line has fewer than four space-separated fields.
    """
    vocab = {'UNKNOWN': 0}
    for path in (train_path, test_path):
        # `with` guarantees the handle is closed even if a line is malformed.
        with open(path) as handle:
            for line in handle:
                items = line.strip().split(' ')
                for field in (items[2], items[3]):
                    for word in field.split('_'):
                        # len(vocab) is always the next unused id.
                        vocab.setdefault(word, len(vocab))
    return vocab
def rand_qa(qalist):
    """Return one element of ``qalist`` picked uniformly at random."""
    return qalist[random.randint(0, len(qalist) - 1)]
def read_alist(path='/export/jw/cnn/insuranceQA/train'):
    """Return the fourth space-separated field of every line of the training file.

    Args:
        path: file to read; defaults to the original hard-coded location.

    Returns:
        list of strings, one per input line, in file order.

    Raises:
        IndexError: if a line has fewer than four fields.
    """
    # `with` closes the handle even when a malformed line raises.
    with open(path) as handle:
        alist = [line.strip().split(' ')[3] for line in handle]
    print('read_alist done ......')
    return alist
def vocab_plus_overlap(vectors, sent, over, size):
    """Return a list with one embedding copy per leading word of ``sent``.

    ``over`` only affects the result through its word count: the number of
    words taken from ``sent`` is capped by ``size``, by the number of words
    in ``over`` and by the number of words in ``sent``.
    """
    limit = min(size, len(over.split('_')))
    tokens = sent.split('_')
    limit = min(limit, len(tokens))
    # Copy each vector so callers cannot mutate the shared lookup entries.
    return [read_vector(vectors, tokens[pos]).copy() for pos in range(0, limit)]
def load_vectors(path='/export/jw/cnn/insuranceQA/vectors.nobin', dim=100):
    """Load space-separated text word vectors into a dict.

    Each usable line is ``word v1 v2 ... v<dim>``; lines with fewer than
    ``dim + 1`` fields are skipped, and values beyond the first ``dim``
    are ignored.

    Args:
        path: vectors file; defaults to the original hard-coded location.
        dim: number of float components read per word (originally fixed at 100).

    Returns:
        dict mapping word -> list of ``dim`` floats.
    """
    vectors = {}
    # `with` closes the handle even if a value fails to parse.
    with open(path) as handle:
        for line in handle:
            items = line.strip().split(' ')
            if len(items) < dim + 1:
                continue
            vectors[items[0]] = [float(value) for value in items[1:dim + 1]]
    return vectors
def read_vector(vectors, word):
    """Return the vector stored for ``word``, or the shared ``empty_vector`` if it is missing."""
    #return vectors['</s>']
    return vectors[word] if word in vectors else empty_vector
def load_test_and_vectors(test_path='/export/jw/cnn/insuranceQA/test1'):
    """Return ``(testList, vectors)``: the stripped lines of the test file and the word vectors.

    Args:
        test_path: test file; defaults to the original hard-coded location.
    """
    # `with` makes sure the test file is closed before the vectors are loaded.
    with open(test_path) as handle:
        testList = [line.strip() for line in handle]
    vectors = load_vectors()
    return testList, vectors
def load_train_and_vectors(train_path='/export/jw/cnn/insuranceQA/train'):
    """Return ``(trainList, vectors)``: the stripped lines of the training file and the word vectors.

    Args:
        train_path: training file; defaults to the original hard-coded location.
    """
    # `with` makes sure the training file is closed before the vectors are loaded.
    with open(train_path) as handle:
        trainList = [line.strip() for line in handle]
    vectors = load_vectors()
    return trainList, vectors
def load_data_val_10(testList, vectors, index):
    """Build three single-example arrays from test line ``index``.

    The second and third outputs are both the embedded answer (field 3);
    the first is the embedded question (field 2).
    """
    fields = testList[index].split(' ')
    question, answer = fields[2], fields[3]
    first = [vocab_plus_overlap(vectors, question, answer, 200)]
    second = [vocab_plus_overlap(vectors, answer, question, 200)]
    third = [vocab_plus_overlap(vectors, answer, question, 200)]
    return np.array(first), np.array(second), np.array(third)
def read_raw(path='/export/jw/cnn/insuranceQA/train'):
    """Return the split fields of every training line whose first field is ``'1'``.

    Args:
        path: training file; defaults to the original hard-coded location.

    Returns:
        list of field lists (each line split on single spaces), in file order.
    """
    raw = []
    # `with` closes the handle when reading finishes or fails.
    with open(path) as handle:
        for line in handle:
            items = line.strip().split(' ')
            if items[0] == '1':
                raw.append(items)
    return raw
def encode_sent(vocab, string, size):
    """Encode a ``'_'``-joined sentence as a list of exactly 200 vocabulary ids.

    Words missing from ``vocab`` map to ``vocab['UNKNOWN']``. Sentences
    longer than 200 words are truncated; shorter ones are padded with the
    UNKNOWN id instead of raising IndexError.

    Args:
        vocab: dict mapping word -> id; must contain ``'UNKNOWN'``.
        string: sentence whose words are joined by ``'_'``.
        size: accepted for compatibility with existing callers but ignored;
            the output length stays 200, as it always was.

    Returns:
        list of 200 ints.
    """
    sent_len = 200
    unknown = vocab['UNKNOWN']
    ids = [vocab.get(word, unknown) for word in string.split('_')[:sent_len]]
    # Pad short sentences so every row has the same length.
    ids.extend([unknown] * (sent_len - len(ids)))
    return ids
def load_data_6(vocab, alist, raw, size):
    """Sample ``size`` random (question, positive answer, negative answer) id rows.

    Each row draws a random entry of ``raw`` for the question and its
    answer, and a random element of ``alist`` as the negative answer.
    Returns three numpy arrays built from the encoded rows.
    """
    questions, positives, negatives = [], [], []
    for _ in range(size):
        sample = raw[random.randint(0, len(raw) - 1)]
        negative = rand_qa(alist)
        questions.append(encode_sent(vocab, sample[2], 100))
        positives.append(encode_sent(vocab, sample[3], 100))
        negatives.append(encode_sent(vocab, negative, 100))
    return np.array(questions), np.array(positives), np.array(negatives)
def load_data_val_6(testList, vocab, index, batch):
    """Encode ``batch`` consecutive test lines starting at ``index``.

    Positions past the end of ``testList`` repeat its last line, so the
    result always has ``batch`` rows. The second and third arrays both hold
    the encoded answer (field 3); the first holds the question (field 2).
    """
    last = len(testList) - 1
    questions, answers, answers_again = [], [], []
    for offset in range(batch):
        fields = testList[min(index + offset, last)].split(' ')
        questions.append(encode_sent(vocab, fields[2], 100))
        answers.append(encode_sent(vocab, fields[3], 100))
        answers_again.append(encode_sent(vocab, fields[3], 100))
    return np.array(questions), np.array(answers), np.array(answers_again)
def load_data_9(trainList, vectors, size):
    """Sample ``size`` positive / negative pairs with one-hot labels.

    Each iteration appends two examples: a random training line paired with
    its own answer (label ``[1, 0]``), then the same question paired with the
    answer of another random line (label ``[0, 1]``).
    """
    left, right, labels = [], [], []
    for _ in range(size):
        pos_fields = trainList[random.randint(0, len(trainList) - 1)].strip().split(' ')
        left.append(vocab_plus_overlap(vectors, pos_fields[2], pos_fields[3], 200))
        right.append(vocab_plus_overlap(vectors, pos_fields[3], pos_fields[2], 200))
        labels.append([1, 0])
        neg_fields = trainList[random.randint(0, len(trainList) - 1)].strip().split(' ')
        left.append(vocab_plus_overlap(vectors, pos_fields[2], neg_fields[3], 200))
        right.append(vocab_plus_overlap(vectors, neg_fields[3], pos_fields[2], 200))
        labels.append([0, 1])
    return np.array(left), np.array(right), np.array(labels)
def load_data_val_9(testList, vectors, index):
    """Return single-example arrays for the question and answer of test line ``index``."""
    fields = testList[index].split(' ')
    question, answer = fields[2], fields[3]
    left = [vocab_plus_overlap(vectors, question, answer, 200)]
    right = [vocab_plus_overlap(vectors, answer, question, 200)]
    return np.array(left), np.array(right)
def load_data_10(vectors, qalist, raw, size):
    """Sample one (question, positive, negative) triple of embedded sentences.

    ``size`` is accepted but not used; every returned array holds exactly
    one example.
    """
    sample = raw[random.randint(0, len(raw) - 1)]
    negative = rand_qa(qalist)
    question, answer = sample[2], sample[3]
    first = [vocab_plus_overlap(vectors, question, answer, 200)]
    second = [vocab_plus_overlap(vectors, answer, question, 200)]
    third = [vocab_plus_overlap(vectors, negative, question, 200)]
    return np.array(first), np.array(second), np.array(third)
def load_data_11(vectors, qalist, raw, size):
    """Sample one (question, positive, negative) triple of embedded sentences.

    The body was line-for-line the same as ``load_data_10``, so this simply
    forwards to it.
    """
    return load_data_10(vectors, qalist, raw, size)
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """Yield consecutive batches of ``data`` for ``num_epochs`` passes.

    Args:
        data: sequence convertible with ``np.array``.
        batch_size: maximum number of rows per batch.
        num_epochs: number of passes over the data.
        shuffle: when true, rows are re-permuted at the start of each epoch.

    Yields:
        numpy arrays of at most ``batch_size`` rows; only the last batch of
        an epoch may be shorter, and no empty batch is produced.
    """
    data = np.array(data)
    data_size = len(data)
    # Ceiling division: the old `int(n / batch_size) + 1` yielded an extra
    # empty batch whenever n was a multiple of batch_size.
    num_batches_per_epoch = (data_size + batch_size - 1) // batch_size
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min(start_index + batch_size, data_size)
            yield shuffled_data[start_index:end_index]
================================================
FILE: cnn/tensorflow/test.py
================================================
import random
# Scratch script: draw two independent 2-element samples from 0..9 and
# print each of them.
_list = list(range(0, 10))
_l1 = random.sample(_list, 2)
_l2 = random.sample(_list, 2)
print(_l1)
print(_l2)
# range(2, 2) is empty, so the body below never executes.
for i in range(2, 2):
    print(i)
================================================
FILE: cnn/theano/README.md
================================================
================result==================
theano and tensorflow cnn code for insuranceQA
theano code, test1 top-1 precision : 61.5% (see ./insuranceQA/acc)
tensorflow code, test1 top-1 precision : 62.6%
the best precision in the paper is 62.8% (see "Applying Deep Learning To Answer Selection: A Study And An Open Task")
================dataset================
dataset is large, only test1 sample is given (see ./insuranceQA/test1.sample)
I converted original idx_xx format to real-word format (see ./insuranceQA/train ./insuranceQA/test1.sample)
you can get the original dataset from https://github.com/shuzi/insuranceQA
word embedding is trained by word2vec toolkit
=================run=====================
reformat the original dataset(see my train and test1.sample)
change filepath to your dataset(see TODO in insqa_cnn.py)
python insqa_cnn.py
================================================
FILE: cnn/theano/insqa_cnn.py
================================================
###########################################################
# test1 top-1 precision: 62%
###########################################################
import os, sys, timeit, random, operator
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.signal import pool
from theano.tensor.nnet import conv2d
#TODO change path to your dataset
# Training data; read line by line in build_vocab() and load_train_list().
trainfile = '/export/jw/cnn/insuranceQA/train'
# Test data; read line by line in load_test_list().
test1file = '/export/jw/cnn/insuranceQA/test1'
# Text-format word vectors; parsed by load_vectors().
vectorsfile = '/export/jw/cnn/insuranceQA/vectors.nobin'
###########################################################
# read qa data
###########################################################
def build_vocab():
    """Build the word -> integer code vocabulary from the training file.

    Code 0 is reserved for the 'UNKNOWN' token. Only field 2 of each
    space-separated line is scanned; its '_'-separated, non-empty words
    receive consecutive codes in order of first appearance.

    Returns:
        dict mapping each word to its integer code.
    """
    global trainfile
    vocab = {'UNKNOWN': 0}
    # 'with' closes the file even if a malformed line raises; the handle
    # used to be left open.
    with open(trainfile) as train_f:
        for line in train_f:
            items = line.strip().split(' ')
            for word in items[2].split('_'):
                if word and word not in vocab:
                    # Codes are consecutive, so the next free code always
                    # equals the current vocabulary size.
                    vocab[word] = len(vocab)
    return vocab
def load_vectors():
    """Read pre-trained word vectors from ``vectorsfile``.

    Each usable line is ``word v1 ... v100`` separated by single spaces.
    Lines with an empty first field or fewer than 101 fields are skipped
    instead of raising IndexError. A word2vec text header would be such a
    line.

    Returns:
        dict mapping each word to a list of 100 floats.
    """
    global vectorsfile
    vectors = {}
    # 'with' closes the file even if a value fails to parse.
    with open(vectorsfile) as vec_f:
        for line in vec_f:
            items = line.strip().split(' ')
            if len(items) < 101 or len(items[0]) <= 0:
                continue
            # Only the first 100 components are used, as before.
            vectors[items[0]] = [float(value) for value in items[1:101]]
    return vectors
def load_word_embeddings(vocab, dim):
    """Create the initial embedding table for ``vocab``.

    Every row starts as a constant vector of ``dim`` values of 0.01; rows of
    words that ``load_vectors()`` knows are replaced by the loaded vector.

    Returns:
        float32 numpy array with one row per vocabulary code.
    """
    pretrained = load_vectors()
    # "brute" initialization: one constant row per vocabulary entry
    table = [[0.01] * dim for _ in range(len(vocab))]
    for token, row in vocab.items():
        if token in pretrained:
            table[row] = pretrained[token]
    return np.array(table, dtype='float32')
#NOTE: pay attention to how the UNKNOWN embedding is initialized
def encode_sent(vocab, string, size):
    """Encode a '_'-joined sentence as a list of exactly ``size`` codes.

    Args:
        vocab: dict mapping word -> code; must contain 'UNKNOWN' whenever
            an out-of-vocabulary word or padding is needed.
        string: sentence whose words are joined with '_'.
        size: length of the returned list.

    Returns:
        list of codes. Words beyond ``size`` are dropped; words missing
        from ``vocab`` map to the 'UNKNOWN' code. A sentence shorter than
        ``size`` is padded with the 'UNKNOWN' code (it used to raise
        IndexError).
    """
    words = string.split('_')[:max(size, 0)]
    x = [vocab[word] if word in vocab else vocab['UNKNOWN'] for word in words]
    missing = size - len(x)
    if missing > 0:
        x.extend([vocab['UNKNOWN']] * missing)
    return x
def load_train_list():
    """Read ``trainfile`` into a list of space-separated field lists.

    Returns:
        list with one entry per line; each entry is the stripped line split
        on single spaces.
    """
    global trainfile
    # 'with' closes the file; the handle used to be left open.
    with open(trainfile) as train_f:
        return [line.strip().split(' ') for line in train_f]
def load_test_list():
    """Read ``test1file`` into a list of space-separated field lists.

    Returns:
        list with one entry per line; each entry is the stripped line split
        on single spaces.
    """
    global test1file
    # 'with' closes the file; the handle used to be left open.
    with open(test1file) as test_f:
        return [line.strip().split(' ') for line in test_f]
def load_data(trainList, vocab, batch_size):
    """Sample one random training batch of encoded triples.

    For every slot two rows are drawn uniformly from ``trainList``: fields
    2 and 3 of the first row and field 3 of the second row are each encoded
    to 100 codes with ``encode_sent``.

    Returns:
        Three float32 numpy arrays, one per column of the triple.
    """
    last = len(trainList) - 1
    first_col, second_col, third_col = [], [], []
    for _ in range(batch_size):
        # Keep the draw order (positive row, then negative row).
        pos_row = trainList[random.randint(0, last)]
        neg_row = trainList[random.randint(0, last)]
        first_col.append(encode_sent(vocab, pos_row[2], 100))
        second_col.append(encode_sent(vocab, pos_row[3], 100))
        third_col.append(encode_sent(vocab, neg_row[3], 100))
    return (np.array(first_col, dtype='float32'),
            np.array(second_col, dtype='float32'),
            np.array(third_col, dtype='float32'))
def load_data_val(testList, vocab, index, batch_size):
    """Encode ``batch_size`` consecutive test rows starting at ``index``.

    Positions past the end of ``testList`` reuse its last row, so a batch
    always has ``batch_size`` entries. The third array repeats field 3,
    exactly like the second one.

    Returns:
        Three float32 numpy arrays (field 2, field 3, field 3 again).
    """
    last = len(testList) - 1
    col_a, col_b, col_c = [], [], []
    for offset in range(batch_size):
        # Clamp to the final row once the batch runs past the data.
        row = testList[min(index + offset, last)]
        col_a.append(encode_sent(vocab, row[2], 100))
        col_b.append(encode_sent(vocab, row[3], 100))
        col_c.append(encode_sent(vocab, row[3], 100))
    return (np.array(col_a, dtype='float32'),
            np.array(col_b, dtype='float32'),
            np.array(col_c, dtype='float32'))
def validation(validate_model, testList, vocab, batch_size):
    """Score every test row and print the top-1 precision.

    Args:
        validate_model: callable taking (x1, x2, x3, keep_prob) and
            returning a pair; only the first element (one score per row of
            the batch) is used.
        testList: rows as produced by load_test_list(); field 0 is the
            '0'/'1' flag and field 1 contains 'something:qid'.
        vocab: word -> code dictionary used for encoding.
        batch_size: number of rows scored per call of ``validate_model``.

    Rows are grouped by qid. A group counts as a hit when its
    highest-scoring row carries flag '1'.
    """
    index, score_list = int(0), []
    while True:
        x1, x2, x3 = load_data_val(testList, vocab, index, batch_size)
        # keep_prob 1.0 disables dropout during evaluation
        batch_scores, nouse = validate_model(x1, x2, x3, 1.0)
        score_list.extend(batch_scores)
        index += batch_size
        if index >= len(testList):
            break
        # Call form of print: same output on Python 2, valid on Python 3.
        print('Evaluation ' + str(index))
    sdict = {}
    # score_list may hold extra trailing scores from the clamped last batch;
    # zip pairs only the first len(testList) of them with their rows.
    for score, items in zip(score_list, testList):
        qid = items[1].split(':')[1]
        sdict.setdefault(qid, []).append((score, items[0]))
    lev0, lev1 = float(0), float(0)
    for qid, cases in sdict.items():
        cases.sort(key=operator.itemgetter(0), reverse=True)
        score, flag = cases[0]
        if flag == '1':
            lev1 += 1
        if flag == '0':
            lev0 += 1
    print('top-1 precition: ' + str(lev1 / (lev0 + lev1)))
class QACnn(object):
    """CNN that scores a question against a positive and a negative answer.

    The three inputs share one embedding table and, per filter size, one
    convolution filter bank and bias. After construction the instance
    exposes ``params`` (shared variables to train), ``cos12``/``cos13``
    (cosine of input1 with input2/input3), ``cost`` (summed hinge loss with
    margin 0.05) and ``accuracy``.
    """
    def __init__(self, input1, input2, input3, word_embeddings, batch_size, sequence_len, embedding_size, filter_sizes, num_filters, keep_prob):
        """Build the symbolic graph for the three inputs.

        input1/input2/input3 are symbolic matrices of word codes,
        ``word_embeddings`` is the initial embedding table, and
        ``keep_prob`` is the dropout keep probability.
        """
        rng = np.random.RandomState(23455)
        self.params = []
        lookup_table = theano.shared(word_embeddings)
        self.params += [lookup_table]
        #input1 = question, input2 = positive answer, input3 = negative answer
        #replace every word code with its word vector
        input_matrix1 = lookup_table[T.cast(input1.flatten(), dtype="int32")]
        input_matrix2 = lookup_table[T.cast(input2.flatten(), dtype="int32")]
        input_matrix3 = lookup_table[T.cast(input3.flatten(), dtype="int32")]
        #the CNN expects a 4-D tensor; this only adds one extra dimension
        input_x1 = input_matrix1.reshape((batch_size, 1, sequence_len, embedding_size))
        input_x2 = input_matrix2.reshape((batch_size, 1, sequence_len, embedding_size))
        input_x3 = input_matrix3.reshape((batch_size, 1, sequence_len, embedding_size))
        #print(input_x1.shape.eval())
        self.dbg_x1 = input_x1
        outputs_1, outputs_2, outputs_3 = [], [], []
        #use filters of several sizes
        for filter_size in filter_sizes:
            #there are num_filters filters of each size
            filter_shape = (num_filters, 1, filter_size, embedding_size)
            image_shape = (batch_size, 1, sequence_len, embedding_size)
            fan_in = np.prod(filter_shape[1:])
            fan_out = filter_shape[0] * np.prod(filter_shape[2:])
            # uniform initialization bound derived from fan_in and fan_out
            W_bound = np.sqrt(6. / (fan_in + fan_out))
            W = theano.shared(
                np.asarray(
                    rng.uniform(low=-W_bound, high=W_bound, size=filter_shape),
                    dtype=theano.config.floatX
                ),
                borrow=True
            )
            b_values = np.zeros((filter_shape[0],), dtype=theano.config.floatX)
            b = theano.shared(value=b_values, borrow=True)
            #convolution + max pooling
            conv_out = conv2d(input=input_x1, filters=W, filter_shape=filter_shape, input_shape=image_shape)
            #the pooling window ds spans sequence_len - filter_size + 1 rows,
            #the length given for the convolution output
            pooled_out = pool.pool_2d(input=conv_out, ds=(sequence_len - filter_size + 1, 1), ignore_border=True, mode='max')
            pooled_active = T.tanh(pooled_out + b.dimshuffle('x', 0, 'x', 'x'))
            outputs_1.append(pooled_active)
            # same W and b are reused for the positive answer ...
            conv_out = conv2d(input=input_x2, filters=W, filter_shape=filter_shape, input_shape=image_shape)
            pooled_out = pool.pool_2d(input=conv_out, ds=(sequence_len - filter_size + 1, 1), ignore_border=True, mode='max')
            pooled_active = T.tanh(pooled_out + b.dimshuffle('x', 0, 'x', 'x'))
            outputs_2.append(pooled_active)
            # ... and for the negative answer
            conv_out = conv2d(input=input_x3, filters=W, filter_shape=filter_shape, input_shape=image_shape)
            pooled_out = pool.pool_2d(input=conv_out, ds=(sequence_len - filter_size + 1, 1), ignore_border=True, mode='max')
            pooled_active = T.tanh(pooled_out + b.dimshuffle('x', 0, 'x', 'x'))
            outputs_3.append(pooled_active)
            self.params += [W, b]
            self.dbg_conv_out = conv_out.shape
        num_filters_total = num_filters * len(filter_sizes)
        self.dbg_outputs_1 = outputs_1[0].shape
        #each sentence is represented by a vector of length num_filters_total
        output_flat1 = T.reshape(T.concatenate(outputs_1, axis=1), [batch_size, num_filters_total])
        output_flat2 = T.reshape(T.concatenate(outputs_2, axis=1), [batch_size, num_filters_total])
        output_flat3 = T.reshape(T.concatenate(outputs_3, axis=1), [batch_size, num_filters_total])
        #dropout; keep_prob == 1 means no dropout
        output_drop1 = self._dropout(rng, output_flat1, keep_prob)
        output_drop2 = self._dropout(rng, output_flat2, keep_prob)
        output_drop3 = self._dropout(rng, output_flat3, keep_prob)
        #compute the angle between the question and answer vectors
        #vector lengths (L2 norms)
        len1 = T.sqrt(T.sum(output_drop1 * output_drop1, axis=1))
        len2 = T.sqrt(T.sum(output_drop2 * output_drop2, axis=1))
        len3 = T.sqrt(T.sum(output_drop3 * output_drop3, axis=1))
        #cosine between the vectors
        cos12 = T.sum(output_drop1 * output_drop2, axis=1) / (len1 * len2)
        self.cos12 = cos12
        cos13 = T.sum(output_drop1 * output_drop3, axis=1) / (len1 * len3)
        self.cos13 = cos13
        zero = theano.shared(np.zeros(batch_size, dtype=theano.config.floatX), borrow=True)
        margin = theano.shared(np.full(batch_size, 0.05, dtype=theano.config.floatX), borrow=True)
        #loss function: hinge on margin - cos12 + cos13, summed over the batch
        diff = T.cast(T.maximum(zero, margin - cos12 + cos13), dtype=theano.config.floatX)
        self.cost = T.sum(diff, acc_dtype=theano.config.floatX)
        #mini-batch accuracy: an example counts as correct when its hinge
        #term is exactly zero, i.e. the positive cosine beats the negative
        #cosine by at least the margin; otherwise it counts as wrong
        #loss and accuracy are the two key indicators for judging whether
        #the model is converging during training
        self.accuracy = T.sum(T.cast(T.eq(zero, diff), dtype='int32')) / float(batch_size)
    def _dropout(self, rng, layer, keep_prob):
        """Apply a random binary mask to ``layer`` and divide by ``keep_prob``.

        The mask stream is seeded from ``rng``; each mask entry is 1 with
        probability ``keep_prob``.
        """
        srng = T.shared_randomstreams.RandomStreams(rng.randint(123456))
        mask = srng.binomial(n=1, p=keep_prob, size=layer.shape)
        output = layer * T.cast(mask, theano.config.floatX)
        # rescale the surviving units by 1 / keep_prob
        output = output / keep_prob
        return output
def train():
    """Build the QACnn model and run the training loop.

    Each iteration (called "epoch" here) trains on one freshly sampled
    batch with plain SGD. Every ``validation_freq`` iterations the test
    list is scored with ``validation``. Progress is printed to stdout.
    """
    batch_size = int(256)
    filter_sizes = [2,3,5]
    num_filters = 500
    embedding_size = 100
    learning_rate = 0.001
    n_epochs = 2000000
    validation_freq = 1000
    keep_prob_value = 0.25
    vocab = build_vocab()
    word_embeddings = load_word_embeddings(vocab, embedding_size)
    trainList = load_train_list()
    testList = load_test_list()
    # One batch is loaded up front only to read the sequence length from it.
    train_x1, train_x2, train_x3 = load_data(trainList, vocab, batch_size)
    x1, x2, x3 = T.matrix('x1'), T.matrix('x2'), T.matrix('x3')
    keep_prob = T.fscalar('keep_prob')
    model = QACnn(
        input1=x1, input2=x2, input3=x3, keep_prob=keep_prob,
        word_embeddings=word_embeddings,
        batch_size=batch_size,
        sequence_len=train_x1.shape[1],
        embedding_size=embedding_size,
        filter_sizes=filter_sizes,
        num_filters=num_filters)
    dbg_x1 = model.dbg_x1
    dbg_outputs_1 = model.dbg_outputs_1
    cost, cos12, cos13 = model.cost, model.cos12, model.cos13
    # Call form of print: same output on Python 2, valid on Python 3.
    print('cost')
    print(cost)
    params, accuracy = model.params, model.accuracy
    grads = T.grad(cost, params)
    # plain SGD step for every trainable parameter
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]
    p1, p2, p3 = T.matrix('p1'), T.matrix('p2'), T.matrix('p3')
    prob = T.fscalar('prob')
    train_model = theano.function(
        [p1, p2, p3, prob],
        [cost, accuracy, dbg_x1, dbg_outputs_1],
        updates=updates,
        givens={
            x1: p1, x2: p2, x3: p3, keep_prob: prob
        }
    )
    v1, v2, v3 = T.matrix('v1'), T.matrix('v2'), T.matrix('v3')
    # No updates here: validation must not change the parameters.
    validate_model = theano.function(
        inputs=[v1, v2, v3, prob],
        outputs=[cos12, cos13],
        #updates=updates,
        givens={
            x1: v1, x2: v2, x3: v3, keep_prob: prob
        }
    )
    epoch = 0
    done_looping = False
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        train_x1, train_x2, train_x3 = load_data(trainList, vocab, batch_size)
        #print train_x3.shape
        cost_ij, acc, dbg_x1, dbg_outputs_1 = train_model(train_x1, train_x2, train_x3, keep_prob_value)
        print('load data done ...... epoch:' + str(epoch) + ' cost:' + str(cost_ij) + ', acc:' + str(acc))
        if epoch % validation_freq == 0:
            print('Evaluation ......')
            validation(validate_model, testList, vocab, batch_size)
        #print dbg_outputs_1
if __name__ == '__main__':
    train()
================================================
FILE: config.py
================================================
import os
dataset_ins = 'insurance-qa'
dataset_qur = 'quora-qa'

##################################################################
# adjust to your running environment
# which dataset to use
dataset = dataset_qur
# path of the word2vec executable
w2v_command = '/export/jw/word2vec/word2vec'
##################################################################

# root directory of the selected dataset ('' if the dataset is unknown)
home = {
    dataset_ins: os.path.expanduser('/export/jw/insuranceQA'),
    dataset_qur: os.path.expanduser('/export/jw/quora'),
}.get(dataset, '')

# Insurance-QA original data directory
qa_version = 'V1'
_original_dir = os.path.join(home, qa_version)
vocab_file = os.path.join(_original_dir, 'vocabulary')
answers_file = os.path.join(_original_dir, 'answers.label.token_idx')
question_train_file = os.path.join(_original_dir, 'question.train.token_idx.label')
question_test1_file = os.path.join(_original_dir, 'question.test1.label.token_idx.pool')
question_test2_file = os.path.join(_original_dir, 'question.test2.label.token_idx.pool')
question_dev_file = os.path.join(_original_dir, 'question.dev.label.token_idx.pool')

# quora original data
qr_file = os.path.join(home, 'quora_duplicate_questions.tsv')
qr_train_ratio = 0.8

# processed files
_processed_dir = os.path.join(home, 'data')
train_file = os.path.join(_processed_dir, 'train.prepro')
test1_file = os.path.join(_processed_dir, 'test1.prepro')
test2_file = os.path.join(_processed_dir, 'test2.prepro')
w2v_train_file = os.path.join(_processed_dir, 'w2v.train')
w2v_bin_file = os.path.join(_processed_dir, 'w2v.bin')
predict1_file = os.path.join(_processed_dir, 'predict1')
================================================
FILE: gen.py
================================================
import config, os, random
#####################################################################
# function: load vocab
# return: dict[word_id] = word
#####################################################################
def load_vocab():
    """Load ``config.vocab_file`` into a dictionary.

    Every line holds two tab-separated fields; the first becomes the key
    and the second the value. Callers in this file look entries up by the
    token ids found in the ``*.token_idx`` files (``voc[wid]``), so the
    mapping is id -> word.

    Returns:
        dict mapping the first field of each line to the second.
    """
    voc = {}
    # 'with' closes the file; the handle used to be left open.
    with open(config.vocab_file) as vocab_f:
        for line in vocab_f:
            word_id, word = line.strip().split('\t')
            voc[word_id] = word
    return voc
#####################################################################
# function: load answers, restore idx to real word
# return : [answer_1, answer_2, ..., answer_n]
#####################################################################
def ins_load_answers():
    """Load all answers with their token ids restored to words.

    Each line of ``config.answers_file`` holds two tab-separated fields;
    the second is a space-separated list of token ids. Every id is looked
    up in the vocabulary and the words are joined with '_'.

    Returns:
        list of answer strings. Index 0 is the placeholder '<None>', so
        the answer from the n-th line sits at index n (callers index the
        list with the integer answer id).
    """
    answers, voc = ['<None>'], load_vocab()
    # 'with' closes the file even if an unknown id raises KeyError.
    with open(config.answers_file) as answers_f:
        for line in answers_f:
            _, sent = line.strip().split('\t')
            answers.append('_'.join([voc[wid] for wid in sent.split(' ')]))
    return answers
#####################################################################
# function: preprea word2vec binary file
# return :
#####################################################################
def ins_w2v():
    """Write the word2vec training corpus and run the word2vec command.

    The corpus holds one sentence per line, with ids restored to words:
    field 0 of every line of the question training file, and field 1 of
    every line of the answers, dev, test1 and test2 files. It is written to
    ``config.w2v_train_file`` and the vectors go to ``config.w2v_bin_file``.
    """
    print('preparing word2vec ......')
    _data, voc = [], load_vocab()
    with open(config.question_train_file) as train_f:
        for line in train_f:
            items = line.strip().split('\t')
            _data.append(' '.join([voc[_id] for _id in items[0].split(' ')]))
    for _file in [config.answers_file, config.question_dev_file,
                  config.question_test1_file, config.question_test2_file]:
        with open(_file) as in_f:
            for line in in_f:
                items = line.strip().split('\t')
                _data.append(' '.join([voc[_id] for _id in items[1].split(' ')]))
    # The corpus must be closed (flushed) before word2vec reads it.
    with open(config.w2v_train_file, 'w') as of:
        for s in _data:
            of.write(s + '\n')
    # config defines w2v_command; the previous attribute name "w2c_command"
    # does not exist and raised AttributeError here.
    os.system('time ' + config.w2v_command + ' -train ' + config.w2v_train_file + ' -output ' + config.w2v_bin_file + ' -cbow 0 -size 100 -window 5 -negative 20 -sample 1e-3 -threads 12 -binary 0 -min-count 1')
#####################################################################
# function: preprea train file
# file format: flag question answer
#####################################################################
def ins_train():
    """Write the preprocessed training file.

    Each line of ``config.question_train_file`` holds a question (token
    ids) and a space-separated list of answer ids, separated by a tab. One
    output line ``1 <question> <answer>`` is written to
    ``config.train_file`` for every listed answer.
    """
    print('preparing train ......')
    answers, voc, _data = ins_load_answers(), load_vocab(), []
    with open(config.question_train_file) as train_f:
        for line in train_f:
            qsent, ids = line.strip().split('\t')
            qsent = '_'.join([voc[wid] for wid in qsent.split(' ')])
            for _id in ids.split(' '):
                _data.append(' '.join(['1', qsent, answers[int(_id)]]))
    # 'with' closes the output even if a write fails.
    with open(config.train_file, 'w') as of:
        for _s in _data:
            of.write(_s + '\n')
#####################################################################
# function: preprea test file
# file format: flag group_id question answer
#####################################################################
def ins_test():
    """Write the preprocessed test files for test2 and test1.

    Each input line holds three tab-separated fields: positive answer ids,
    the question (token ids) and the pool of candidate answer ids. For
    every candidate one line ``<flag> <group> <question> <answer>`` is
    written, where flag is '1' if the candidate is among the positive ids
    and '0' otherwise, and group is the 0-based input line number.
    """
    print('preparing test ......')
    answers, voc = ins_load_answers(), load_vocab()
    for _in, _out in ((config.question_test2_file, config.test2_file),
                      (config.question_test1_file, config.test1_file)):
        _data = []
        with open(_in) as in_f:
            for group, line in enumerate(in_f):
                pids, qsent, pnids = line.strip().split('\t')
                # Only membership is needed, so a set replaces the dict.
                positive = set(pids.split(' '))
                qsent = '_'.join([voc[wid] for wid in qsent.split(' ')])
                for _id in pnids.split(' '):
                    flag = '1' if _id in positive else '0'
                    _data.append(' '.join([flag, str(group), qsent, answers[int(_id)]]))
        with open(_out, 'w') as of:
            for s in _data:
                of.write(s + '\n')
def ins_qa():
    """Run the Insurance-QA preparation steps in order: word2vec, train, test."""
    for step in (ins_w2v, ins_train, ins_test):
        step()
def qur_prepare():
    """Prepare the word2vec corpus and the train/test files for Quora.

    Rows of ``config.qr_file`` that do not have exactly six tab-separated
    fields are dropped, the first remaining row (the header) is discarded,
    and the rest is shuffled. Both questions of every row go into the
    word2vec corpus. Rows where either question has at most one character
    are then removed, and the remainder is split into train and test files
    at ``config.qr_train_ratio``. Output lines are ``<flag> <q1> <q2>``
    with the words of each question joined by '_'.
    """
    #pretrain word2vec
    _list = []
    with open(config.qr_file) as qr_f:
        for line in qr_f:
            items = line.strip().split('\t')
            if len(items) != 6:
                continue
            _list.append(items)
    _list = _list[1:]
    random.shuffle(_list)
    _list = [(f, q1, q2) for _, _, _, q1, q2, f in _list]
    # The corpus must be closed (flushed) before word2vec reads it.
    with open(config.w2v_train_file, 'w') as of:
        for f, q1, q2 in _list:
            of.write(q1 + '\n')
            of.write(q2 + '\n')
    os.system('time ' + config.w2v_command + ' -train ' + config.w2v_train_file + ' -output ' + config.w2v_bin_file + ' -cbow 0 -size 100 -window 5 -negative 20 -sample 1e-3 -threads 12 -binary 0 -min-count 1')
    #train file
    _list = [(f, '_'.join(q1.split(' ')), '_'.join(q2.split(' ')))
             for f, q1, q2 in _list
             if len(q1) > 1 and len(q2) > 1]
    # Use the configured ratio (0.8 in config.py) instead of repeating the
    # literal, so train and test always split at the same index.
    split_at = int(len(_list) * config.qr_train_ratio)
    with open(config.train_file, 'w') as of:
        for f, q1, q2 in _list[:split_at]:
            of.write(' '.join([f, q1, q2]) + '\n')
    #test file
    with open(config.test1_file, 'w') as of:
        for f, q1, q2 in _list[split_at:]:
            of.write(' '.join([f, q1, q2]) + '\n')
def qur_qa():
    """Entry point for the Quora dataset; delegates to qur_prepare()."""
    qur_prepare()
# Script entry point: run the preparation that matches config.dataset.
# Nothing runs when config.dataset equals neither known dataset name.
if __name__ == '__main__':
    if config.dataset == config.dataset_ins:
        ins_qa()
    elif config.dataset == config.dataset_qur:
        qur_qa()
================================================
FILE: lstm_cnn/theano/README.md
================================================
theano lstm+cnn code for insuranceQA
================result==================
theano code, test1 top-1 precision : 68.3%
lstm+cnn is better than cnn(61.5%).
================dataset================
dataset is large, only test1 sample is given (see ./insuranceQA/test1.sample)
I converted original idx_xx format to real-word format (see ./insuranceQA/train ./insuranceQA/test1.sample)
you can get the original dataset from https://github.com/shuzi/insuranceQA
word embedding is trained by word2vec toolkit
=================run=====================
reformat the original dataset(see my train and test1.sample)
change filepath to your dataset(see TODO in insqa_lstm.py)
python insqa_lstm.py
================================================
FILE: lstm_cnn/theano/insqa_lstm.py
================================================
############################################################
# if batch_size is 1, there must be a dtype error when doing
# T.grad, this is something about scan func
# see https://github.com/Theano/Theano/issues/1772
#
# LSTM + cnn
# test1 top-1 precision: 68.3%
############################################################
from collections import OrderedDict
import sys, time, random, operator
import numpy as np
import theano
from theano import config
import theano.tensor as T
from theano.tensor.signal import pool
from theano.tensor.nnet import conv2d
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
#TODO change filepath to your local environment
#include train test1 vectors.nobin
def build_vocab():
    """Build the word -> integer code vocabulary from the training file.

    Code 0 is reserved for the 'UNKNOWN' token. Only field 2 of each
    space-separated line is scanned; its '_'-separated, non-empty words
    receive consecutive codes in order of first appearance.

    Returns:
        dict mapping each word to its integer code.
    """
    vocab = {'UNKNOWN': 0}
    # 'with' closes the file even if a malformed line raises; the handle
    # used to be left open.
    with open('/export/jw/cnn/insuranceQA/train') as train_f:
        for line in train_f:
            items = line.strip().split(' ')
            for word in items[2].split('_'):
                if word and word not in vocab:
                    # Codes are consecutive, so the next free code always
                    # equals the current vocabulary size.
                    vocab[word] = len(vocab)
    return vocab
def load_vectors():
    """Read pre-trained word vectors from the text-format vectors file.

    Each usable line is ``word v1 ... v100`` separated by single spaces.
    Lines with an empty first field or fewer than 101 fields are skipped
    instead of raising IndexError. A word2vec text header would be such a
    line.

    Returns:
        dict mapping each word to a list of 100 floats.
    """
    vectors = {}
    # 'with' closes the file even if a value fails to parse.
    with open('/export/jw/cnn/insuranceQA/vectors.nobin') as vec_f:
        for line in vec_f:
            items = line.strip().split(' ')
            if len(items) < 101 or len(items[0]) <= 0:
                continue
            # Only the first 100 components are used, as before.
            vectors[items[0]] = [float(value) for value in items[1:101]]
    return vectors
def load_word_embeddings(vocab, dim):
    """Create the initial embedding matrix for ``vocab``.

    Rows default to ``dim`` copies of 0.01 and are overwritten with the
    vector from ``load_vectors()`` when the word has one.

    Returns:
        float32 numpy array indexed by vocabulary code.
    """
    known = load_vectors()
    # "brute" initialization of every row
    rows = [[0.01 for _ in range(dim)] for _ in range(len(vocab))]
    for word in vocab:
        if word in known:
            rows[vocab[word]] = known[word]
    return np.array(rows, dtype='float32')
#NOTE: pay attention to how the UNKNOWN embedding is initialized
def encode_sent(vocab, string, size):
    """Encode a '_'-joined sentence as ``size`` codes plus a mask.

    Args:
        vocab: dict mapping word -> code; must contain 'UNKNOWN' whenever
            an out-of-vocabulary word or padding is needed.
        string: sentence whose words are joined with '_'.
        size: length of both returned lists.

    Returns:
        (x, m): ``x`` holds the codes, with unknown words mapped to the
        'UNKNOWN' code. ``m`` is the mask and is 1 at every position. A
        sentence shorter than ``size`` is padded with the 'UNKNOWN' code
        (it used to raise IndexError).
    """
    words = string.split('_')[:max(size, 0)]
    x = [vocab[word] if word in vocab else vocab['UNKNOWN'] for word in words]
    missing = size - len(x)
    if missing > 0:
        x.extend([vocab['UNKNOWN']] * missing)
    # TODO: the mask is 1 everywhere, including on '<a>' tokens, because the
    # sequence length is fixed; use 0 there for variable-length masking.
    m = [1] * len(x)
    return x, m
def load_train_list():
    """Read the training file and keep only rows whose field 0 is '1'.

    Returns:
        list of rows; each row is the stripped line split on single spaces.
    """
    trainList = []
    # 'with' closes the file; the handle used to be left open.
    with open('/export/jw/cnn/insuranceQA/train') as train_f:
        for line in train_f:
            items = line.strip().split(' ')
            if items[0] == '1':
                # Reuse the split row instead of splitting the line again.
                trainList.append(items)
    return trainList
def load_test_list():
    """Read the test1 file into a list of space-separated field lists.

    Returns:
        list with one entry per line; each entry is the stripped line split
        on single spaces.
    """
    # 'with' closes the file; the handle used to be left open.
    with open('/export/jw/cnn/insuranceQA/test1') as test_f:
        return [line.strip().split(' ') for line in test_f]
def load_data(trainList, vocab, batch_size):
    """Sample one random training batch with masks.

    Two rows are drawn per attempt. An attempt is discarded when field 2 or
    field 3 of the first row, or field 3 of the second row, starts with
    '<a>'. Accepted sentences are encoded to 100 codes each. Sampling stops
    once ``batch_size`` attempts have been accepted (at least one is always
    taken).

    Returns:
        Six arrays of dtype config.floatX, transposed so that rows are
        positions and columns are samples: the three code arrays followed
        by the three mask arrays.
    """
    codes = ([], [], [])
    masks = ([], [], [])
    last = len(trainList) - 1
    accepted = 0
    while True:
        pos = trainList[random.randint(0, last)]
        neg = trainList[random.randint(0, last)]
        if pos[2].startswith('<a>') or pos[3].startswith('<a>') or neg[3].startswith('<a>'):
            #print 'empty string ......'
            continue
        for slot, sentence in enumerate((pos[2], pos[3], neg[3])):
            x, m = encode_sent(vocab, sentence, 100)
            codes[slot].append(x)
            masks[slot].append(m)
        accepted += 1
        if accepted >= batch_size:
            break
    return tuple(np.transpose(np.array(rows, dtype=config.floatX))
                 for rows in codes + masks)
def load_data_val(testList, vocab, index, batch_size):
    """Encode ``batch_size`` consecutive test rows starting at ``index``.

    Positions past the end of ``testList`` reuse its last row. Field 2
    feeds the first code/mask pair; field 3 feeds both the second and the
    third pair.

    Returns:
        Six arrays of dtype config.floatX, transposed so that rows are
        positions and columns are samples: three code arrays followed by
        three mask arrays.
    """
    codes = ([], [], [])
    masks = ([], [], [])
    last = len(testList) - 1
    for offset in range(batch_size):
        # Clamp to the final row once the batch runs past the data.
        row = testList[min(index + offset, last)]
        for slot, field in enumerate((2, 3, 3)):
            x, m = encode_sent(vocab, row[field], 100)
            codes[slot].append(x)
            masks[slot].append(m)
    return tuple(np.transpose(np.array(rows, dtype=config.floatX))
                 for rows in codes + masks)
def validation(validate_model, testList, vocab, batch_size):
    """Score every test row, then log and print top-1 hit counts.

    Args:
        validate_model: callable taking (x1, x2, x3, m1, m2, m3) and
            returning a pair; only the first element (one score per sample
            of the batch) is used.
        testList: rows as produced by load_test_list(); field 0 is the
            '0'/'1' flag and field 1 contains 'something:qid'.
        vocab: word -> code dictionary used for encoding.
        batch_size: number of rows scored per call of ``validate_model``.

    Rows are grouped by qid; ``lev1``/``lev0`` count the groups whose
    highest-scoring row has flag '1'/'0'. All scores and both counters are
    appended to the acc.lstm file, and the counters are printed.
    """
    index, score_list = int(0), []
    while True:
        x1, x2, x3, m1, m2, m3 = load_data_val(testList, vocab, index, batch_size)
        batch_scores, nouse = validate_model(x1, x2, x3, m1, m2, m3)
        score_list.extend(batch_scores)
        index += batch_size
        if index >= len(testList):
            break
        # Call form of print: same output on Python 2, valid on Python 3.
        print('Evaluation ' + str(index))
    sdict = {}
    # score_list may hold extra trailing scores from the clamped last batch;
    # zip pairs only the first len(testList) of them with their rows.
    for score, items in zip(score_list, testList):
        qid = items[1].split(':')[1]
        sdict.setdefault(qid, []).append((score, items[0]))
    lev0, lev1 = float(0), float(0)
    # 'with' closes the log file even if a write raises.
    with open('/export/jw/cnn/insuranceQA/acc.lstm', 'a') as of:
        for qid, cases in sdict.items():
            cases.sort(key=operator.itemgetter(0), reverse=True)
            score, flag = cases[0]
            if flag == '1':
                lev1 += 1
            if flag == '0':
                lev0 += 1
        for s in score_list:
            of.write(str(s) + '\n')
        of.write('lev1:' + str(lev1) + '\n')
        of.write('lev0:' + str(lev0) + '\n')
        print('lev1:' + str(lev1))
        print('lev0:' + str(lev0))
def ortho_weight(ndim):
    """Return the left singular vectors of a random ndim x ndim Gaussian matrix.

    The result is cast to config.floatX.
    """
    gaussian = np.random.randn(ndim, ndim)
    left_vectors = np.linalg.svd(gaussian)[0]
    return left_vectors.astype(config.floatX)
def numpy_floatX(data):
    """Convert ``data`` to a numpy array of dtype config.floatX."""
    return np.asarray(data, dtype=config.floatX)
def param_init_cnn(filter_sizes, num_filters, proj_size, tparams, grad_params):
    """Create the shared convolution weights and biases for every filter size.

    For each size a filter bank of shape (num_filters, 1, size, proj_size)
    is drawn uniformly within a fan-in/fan-out based bound, and a zero bias
    of length num_filters is created. Both are stored in ``tparams`` under
    'cnn_W_<size>' / 'cnn_b_<size>' and appended to ``grad_params``.

    Returns:
        The updated (tparams, grad_params) pair.
    """
    rng = np.random.RandomState(23455)
    float_type = theano.config.floatX
    for height in filter_sizes:
        shape = (num_filters, 1, height, proj_size)
        fan_in = np.prod(shape[1:])
        fan_out = shape[0] * np.prod(shape[2:])
        bound = np.sqrt(6. / (fan_in + fan_out))
        initial_w = np.asarray(
            rng.uniform(low=-bound, high=bound, size=shape),
            dtype=float_type
        )
        W = theano.shared(initial_w, borrow=True)
        initial_b = np.zeros((shape[0],), dtype=float_type)
        b = theano.shared(value=initial_b, borrow=True)
        suffix = str(height)
        tparams['cnn_W_' + suffix] = W
        tparams['cnn_b_' + suffix] = b
        grad_params += [W, b]
    return tparams, grad_params
def param_init_lstm(proj_size, tparams, grad_params):
    """Create the shared LSTM parameters W, U and b.

    W and U are each four ``ortho_weight(proj_size)`` matrices concatenated
    along axis 1; b is a zero vector of length 4 * proj_size. They are
    stored in ``tparams`` under 'lstm_W', 'lstm_U', 'lstm_b' and appended
    to ``grad_params``.

    Returns:
        The updated (tparams, grad_params) pair.
    """
    def _four_blocks():
        # four orthogonal blocks side by side
        return np.concatenate(
            [ortho_weight(proj_size) for _ in range(4)], axis=1)

    # W is drawn before U so random numbers are consumed in the same order.
    W_t = theano.shared(_four_blocks(), borrow=True)
    tparams[_p('lstm', 'W')] = W_t
    U_t = theano.shared(_four_blocks(), borrow=True)
    tparams[_p('lstm', 'U')] = U_t
    zeros = np.zeros((4 * proj_size,))
    b_t = theano.shared(zeros.astype(config.floatX), borrow=True)
    tparams[_p('lstm', 'b')] = b_t
    grad_params += [W_t, U_t, b_t]
    return tparams, grad_params
def dropout_layer(state_before, use_noise, trng):
    """Return a dropout expression for ``state_before``.

    When ``use_noise`` is true the state is multiplied by a binomial mask
    (p=0.5) drawn from ``trng``; otherwise it is multiplied by 0.5.
    """
    mask = trng.binomial(state_before.shape,
                         p=0.5, n=1,
                         dtype=state_before.dtype)
    with_noise = state_before * mask
    without_noise = state_before * 0.5
    return T.switch(use_noise, with_noise, without_noise)
class LSTM(object):
    """LSTM followed by a CNN, scoring a question against two answers.

    The three inputs share one embedding table, one set of LSTM parameters
    and one set of CNN filters. After construction the instance exposes
    ``params`` (shared variables to train), ``cos12``/``cos13`` (cosine of
    input1 with input2/input3), ``cost`` (summed hinge loss with margin
    0.05) and ``accuracy``.
    """
    def __init__(self, input1, input2, input3, mask1, mask2, mask3, word_embeddings, batch_size, sequence_len, embedding_size, filter_sizes, num_filters):
        """Build the symbolic graph for the three inputs and their masks."""
        #proj_size means embedding_size
        #'lstm_W' = [embedding_size, embedding_size]
        #'lstm_U' = [embedding_size, embedding_size]
        #'lstm_b' = [embedding_size]
        proj_size = 100 #TODO, what does proj mean
        self.params, tparams = [], {}
        tparams, self.params = param_init_lstm(proj_size, tparams, self.params)
        tparams, self.params = param_init_cnn(filter_sizes, num_filters, proj_size, tparams, self.params)
        lookup_table = theano.shared(word_embeddings, borrow=True)
        tparams['lookup_table'] = lookup_table
        self.params += [lookup_table]
        # NOTE(review): n_timesteps and n_samples are not used below.
        n_timesteps = input1.shape[0]
        n_samples = input1.shape[1]
        # only the first value returned by _lstm_net is used below
        lstm1, lstm_whole1 = self._lstm_net(tparams, input1, sequence_len, batch_size, embedding_size, mask1, proj_size)
        lstm2, lstm_whole2 = self._lstm_net(tparams, input2, sequence_len, batch_size, embedding_size, mask2, proj_size)
        lstm3, lstm_whole3 = self._lstm_net(tparams, input3, sequence_len, batch_size, embedding_size, mask3, proj_size)
        #dimshuffle [sequence_len, batch_size, proj_size] to [batch_size, sequence_len, proj_size]
        cnn_input1 = T.reshape(lstm1.dimshuffle(1, 0, 2), [batch_size, 1, sequence_len, proj_size])
        cnn_input2 = T.reshape(lstm2.dimshuffle(1, 0, 2), [batch_size, 1, sequence_len, proj_size])
        cnn_input3 = T.reshape(lstm3.dimshuffle(1, 0, 2), [batch_size, 1, sequence_len, proj_size])
        cnn1 = self._cnn_net(tparams, cnn_input1, batch_size, sequence_len, num_filters, filter_sizes, proj_size)
        cnn2 = self._cnn_net(tparams, cnn_input2, batch_size, sequence_len, num_filters, filter_sizes, proj_size)
        cnn3 = self._cnn_net(tparams, cnn_input3, batch_size, sequence_len, num_filters, filter_sizes, proj_size)
        # L2 norms of the three sentence vectors
        len1 = T.sqrt(T.sum(cnn1 * cnn1, axis=1))
        len2 = T.sqrt(T.sum(cnn2 * cnn2, axis=1))
        len3 = T.sqrt(T.sum(cnn3 * cnn3, axis=1))
        self.cos12 = T.sum(cnn1 * cnn2, axis=1) / (len1 * len2)
        self.cos13 = T.sum(cnn1 * cnn3, axis=1) / (len1 * len3)
        zero = theano.shared(np.zeros(batch_size, dtype=config.floatX), borrow=True)
        margin = theano.shared(np.full(batch_size, 0.05, dtype=config.floatX), borrow=True)
        # hinge loss on margin - cos12 + cos13, summed over the batch
        diff = T.cast(T.maximum(zero, margin - self.cos12 + self.cos13), dtype=config.floatX)
        self.cost = T.sum(diff, acc_dtype=config.floatX)
        # fraction of examples whose hinge term is exactly zero
        self.accuracy = T.sum(T.cast(T.eq(zero, diff), dtype='int32')) / float(batch_size)
    def _cnn_net(self, tparams, cnn_input, batch_size, sequence_len, num_filters, filter_sizes, proj_size):
        """Apply convolution, max pooling and tanh for every filter size.

        Returns the concatenated result reshaped to
        [batch_size, num_filters * len(filter_sizes)].
        """
        outputs = []
        for filter_size in filter_sizes:
            filter_shape = (num_filters, 1, filter_size, proj_size)
            image_shape = (batch_size, 1, sequence_len, proj_size)
            W = tparams['cnn_W_' + str(filter_size)]
            b = tparams['cnn_b_' + str(filter_size)]
            conv_out = conv2d(input=cnn_input, filters=W, filter_shape=filter_shape, input_shape=image_shape)
            # the pooling window ds spans sequence_len - filter_size + 1 rows
            pooled_out = pool.pool_2d(input=conv_out, ds=(sequence_len - filter_size + 1, 1), ignore_border=True, mode='max')
            pooled_active = T.tanh(pooled_out + b.dimshuffle('x', 0, 'x', 'x'))
            outputs.append(pooled_active)
        num_filters_total = num_filters * len(filter_sizes)
        output_tensor = T.reshape(T.concatenate(outputs, axis=1), [batch_size, num_filters_total])
        return output_tensor
    def _lstm_net(self, tparams, _input, sequence_len, batch_size, embedding_size, mask, proj_size):
        """Look up embeddings for ``_input`` and run them through lstm_layer.

        Returns the two values produced by ``lstm_layer``.
        """
        input_matrix = tparams['lookup_table'][T.cast(_input.flatten(), dtype="int32")]
        input_x = input_matrix.reshape((sequence_len, batch_size, embedding_size))
        proj, proj_whole = lstm_layer(tparams, input_x, proj_size, prefix='lstm', mask=mask)
        #if useMask == True:
        #proj = (proj * mask[:, :, None]).sum(axis=0)
        #proj = proj / mask.sum(axis=0)[:, None]
        #if options['use_dropout']:
        #proj = dropout_layer(proj, use_noise, trng)
        return proj, proj_whole
#state_below is the word-embedding tensor (3 dims)
def lstm_layer(tparams, state_below, proj_size, prefix='lstm', mask=None):
    """Run an LSTM over ``state_below`` with ``theano.scan``.

    Args:
        tparams: dict holding the shared variables '<prefix>_W',
            '<prefix>_U' and '<prefix>_b'.
        state_below: input tensor; dim 0 is the step axis.
        proj_size: width of each gate slice and of the h/c states.
        prefix: key prefix of the parameters in ``tparams``.
        mask: per-step mask; required (asserted not None).

    Returns:
        (h, c): the hidden and cell sequences collected by scan.
    """
    #dim-0 steps, dim-1 samples(batch_size), dim-2 word_embedding
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1
    assert mask is not None
    def _slice(_x, n, dim):
        # take the n-th block of width dim along the last axis
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]
    #h means hidden output? c means context? so we'll use h?
    #rval[0] = [sequence_len, batch_size, proj_size], rval[1] the same
    #so the preact size must equal that of x_ (the lstm input slice)
    #if you want to change the lstm h(t) size, 'lstm_U', 'lstm_b'
    #and preact must be changed to another function, like h*U+b
    #see http://colah.github.io/posts/2015-08-Understanding-LSTMs/
    #f(t) = sigmoid(Wf * [h(t-1),x(t)] + bf)
    def _step(m_, x_, h_, c_):
        # x_ already holds the input projection (state_below . W + b)
        preact = T.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_
        # blocks 0..3 of preact: input gate, forget gate, output gate, candidate
        i = T.nnet.sigmoid(_slice(preact, 0, proj_size))
        f = T.nnet.sigmoid(_slice(preact, 1, proj_size))
        o = T.nnet.sigmoid(_slice(preact, 2, proj_size))
        c = T.tanh(_slice(preact, 3, proj_size))
        c = f * c_ + i * c
        # where the mask is 0, keep the previous cell state
        c = m_[:, None] * c + (1. - m_)[:, None] * c_
        h = o * T.tanh(c)
        #if the mask at this step is 0, keep h(t) = h(t-1)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_
        return h, c
    # project the whole input once, outside the scan loop
    state_below = (T.dot(state_below, tparams[_p(prefix, 'W')]) +
                   tparams[_p(prefix, 'b')])
    dim_proj = proj_size
    # h and c both start as zeros of shape (n_samples, dim_proj)
    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[T.alloc(numpy_floatX(0.),
                                                      n_samples,
                                                      dim_proj),
                                              T.alloc(numpy_floatX(0.),
                                                      n_samples,
                                                      dim_proj)],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)
    return rval[0], rval[1]
def _p(pp, name):
    """Join a prefix and a name with '_' (e.g. 'lstm', 'W' -> 'lstm_W')."""
    return '%s_%s' % (pp, name)
def train():
    """Train the ``LSTM`` answer-selection model with plain SGD.

    Builds the vocabulary, word embeddings and train/test lists, compiles one
    Theano function for training and one for validation, then repeatedly
    draws a batch with ``load_data`` and applies a gradient step. Every
    ``validation_freq`` iterations ``validation`` is run on the test list.
    """
    batch_size = int(256)
    embedding_size = 100
    learning_rate = 0.05
    # NOTE: despite its name this bounds the number of training batches.
    n_epochs = 20000000
    validation_freq = 1000
    filter_sizes = [1, 2, 3, 5]
    num_filters = 500

    vocab = build_vocab()
    word_embeddings = load_word_embeddings(vocab, embedding_size)
    trainList = load_train_list()
    testList = load_test_list()
    # A first batch is drawn here so its row count can serve as sequence_len.
    train_x1, train_x2, train_x3, mask1, mask2, mask3 = load_data(trainList, vocab, batch_size)

    x1, x2, x3 = T.fmatrix('x1'), T.fmatrix('x2'), T.fmatrix('x3')
    m1, m2, m3 = T.fmatrix('m1'), T.fmatrix('m2'), T.fmatrix('m3')
    model = LSTM(
        input1=x1, input2=x2, input3=x3,
        mask1=m1, mask2=m2, mask3=m3,
        word_embeddings=word_embeddings,
        batch_size=batch_size,
        sequence_len=train_x1.shape[0],  # row is sequence_len
        embedding_size=embedding_size,
        filter_sizes=filter_sizes,
        num_filters=num_filters)
    cost, cos12, cos13 = model.cost, model.cos12, model.cos13
    params, accuracy = model.params, model.accuracy

    # Vanilla SGD: param <- param - learning_rate * grad.
    grads = T.grad(cost, params)
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    p1, p2, p3 = T.fmatrix('p1'), T.fmatrix('p2'), T.fmatrix('p3')
    q1, q2, q3 = T.fmatrix('q1'), T.fmatrix('q2'), T.fmatrix('q3')
    train_model = theano.function(
        [p1, p2, p3, q1, q2, q3],
        [cost, accuracy],
        updates=updates,
        givens={
            x1: p1, x2: p2, x3: p3, m1: q1, m2: q2, m3: q3
        }
    )

    # The validation inputs replace float32 (fmatrix) model inputs through
    # `givens`, so they are declared as fmatrix too; T.matrix follows floatX
    # and would have a different type whenever floatX is not float32.
    v1, v2, v3 = T.fmatrix('v1'), T.fmatrix('v2'), T.fmatrix('v3')
    u1, u2, u3 = T.fmatrix('u1'), T.fmatrix('u2'), T.fmatrix('u3')
    validate_model = theano.function(
        inputs=[v1, v2, v3, u1, u2, u3],
        outputs=[cos12, cos13],
        # no `updates`: validation must not change the parameters
        givens={
            x1: v1, x2: v2, x3: v3, m1: u1, m2: u2, m3: u3
        }
    )

    epoch = 0
    while epoch < n_epochs:
        epoch += 1
        train_x1, train_x2, train_x3, mask1, mask2, mask3 = load_data(trainList, vocab, batch_size)
        cost_ij, acc = train_model(train_x1, train_x2, train_x3, mask1, mask2, mask3)
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print('load data done ...... epoch:' + str(epoch) + ' cost:' + str(cost_ij) + ', acc:' + str(acc))
        if epoch % validation_freq == 0:
            print('Evaluation ......')
            validation(validate_model, testList, vocab, batch_size)
# Start training only when this file is executed as a script.
if __name__ == '__main__':
    train()
================================================
FILE: rnn_attention/tensorflow/insurance_qa_data_helpers.py
================================================
import numpy as np
import random
from operator import itemgetter
# File that evaluation() appends its lev1/lev0 counts to.
precision = '/export/jw/cnn/insuranceQA/acc.lstm'

# 100 zeros; read_vector() returns this list for words without a vector.
empty_vector = [float(0.0) for _ in range(0, 100)]

# Constant 10-element lists of ones and of zeros.
onevector = [float(1) for _ in range(0, 10)]
zerovector = [float(0) for _ in range(0, 10)]
def build_vocab(paths=('/export/jw/cnn/insuranceQA/train',
                       '/export/jw/cnn/insuranceQA/test1')):
    """Build a word -> integer id mapping from the given corpus files.

    Each line is split on single spaces and only the third field (index 2),
    itself a ``_``-joined token sequence, contributes words. Ids are assigned
    in order of first appearance; ``'UNKNOWN'`` always gets id 0.

    Args:
        paths: iterable of file paths to scan, in order. Defaults to the
            original hard-coded train and test1 files.

    Returns:
        dict mapping each word to its id.
    """
    vocab = {'UNKNOWN': 0}
    for path in paths:
        # `with` guarantees the handle is closed even if a line is malformed.
        with open(path) as corpus:
            for line in corpus:
                items = line.strip().split(' ')
                for word in items[2].split('_'):
                    if word not in vocab:
                        # len(vocab) is exactly the next unused id.
                        vocab[word] = len(vocab)
    return vocab
def read_alist(path='/export/jw/cnn/insuranceQA/train'):
    """Return the fourth space-separated field (index 3) of every line.

    Args:
        path: file to read; defaults to the original hard-coded train file.

    Returns:
        list of strings, one per line of the file.

    Raises:
        IndexError: if a line has fewer than four fields.
    """
    with open(path) as src:  # closed deterministically, unlike a bare open()
        alist = [line.strip().split(' ')[3] for line in src]
    print('read_alist done ......')
    return alist
def load_vectors(path='/export/jw/cnn/insuranceQA/vectors.nobin'):
    """Load 100-dimensional word vectors from a space-separated text file.

    Lines with fewer than 101 fields (word + 100 values) are skipped; for
    the rest, fields 1..100 are parsed as floats.

    Args:
        path: vectors file; defaults to the original hard-coded location.

    Returns:
        dict mapping word -> list of 100 floats.
    """
    vectors = {}
    with open(path) as src:  # ensure the handle is released
        for line in src:
            items = line.strip().split(' ')
            if len(items) < 101:
                continue
            vectors[items[0]] = [float(value) for value in items[1:101]]
    return vectors
def read_vector(vectors, word):
    """Look up *word* in *vectors*, falling back to the module's ``empty_vector``."""
    global empty_vector
    if word not in vectors:
        # Unknown word: hand back the shared fallback list.
        return empty_vector
    return vectors[word]
def load_train_list(path='/export/jw/cnn/insuranceQA/train'):
    """Return the space-split fields of every line whose first field is ``'1'``.

    Args:
        path: file to read; defaults to the original hard-coded train file.

    Returns:
        list of lists of strings (one inner list per kept line).
    """
    train_list = []
    with open(path) as src:  # close the file instead of leaking the handle
        for line in src:
            items = line.strip().split(' ')
            if items[0] == '1':
                train_list.append(items)
    return train_list
def load_test_list(path='/export/jw/cnn/insuranceQA/test1'):
    """Return the space-split fields of every line of the test file.

    Args:
        path: file to read; defaults to the original hard-coded test1 file.

    Returns:
        list of lists of strings, one inner list per line.
    """
    with open(path) as src:  # close the file instead of leaking the handle
        return [line.strip().split(' ') for line in src]
def load_train_and_vectors(path='/export/jw/cnn/insuranceQA/train'):
    """Return the stripped lines of the train file together with the word vectors.

    Args:
        path: train file to read; defaults to the original hard-coded path.

    Returns:
        ``(trainList, vectors)`` where ``trainList`` is a list of stripped
        lines and ``vectors`` is whatever ``load_vectors()`` returns.
    """
    with open(path) as src:  # close the file instead of leaking the handle
        trainList = [line.strip() for line in src]
    vectors = load_vectors()
    return trainList, vectors
def read_raw(path='/export/jw/cnn/insuranceQA/train'):
    """Return the space-split fields of each line whose first field is ``'1'``.

    Args:
        path: file to read; defaults to the original hard-coded train file.

    Returns:
        list of lists of strings.
    """
    with open(path) as src:  # close the file instead of leaking the handle
        rows = (line.strip().split(' ') for line in src)
        return [items for items in rows if items[0] == '1']
def encode_sent(vocab, string, size):
    """Encode the first *size* ``_``-separated tokens of *string* as vocab ids.

    Tokens missing from *vocab* map to ``vocab['UNKNOWN']``. The returned
    mask holds a 1 for every position (padding tokens included).

    Returns:
        ``(ids, mask)``, two lists of length *size*.

    Raises:
        IndexError: if *string* has fewer than *size* tokens.
    """
    tokens = string.split('_')
    ids, mask = [], []
    for position in range(0, size):
        token = tokens[position]
        ids.append(vocab[token] if token in vocab else vocab['UNKNOWN'])
        # Both the '<a>' padding token and real words are marked with 1.
        mask.append(1)
    return ids, mask
def load_val_data(test_list, vocab, index, batch_size, max_len):
    """Encode ``batch_size`` consecutive test rows starting at *index*.

    Row indices past the end of *test_list* are clamped to the last row, so
    the batch always has ``batch_size`` entries. Field 2 of a row feeds the
    first output; field 3 feeds both the second and the third.

    Returns:
        Six float32 arrays ``(x1, x2, x3, m1, m2, m3)``; the three mask
        arrays are transposed relative to the id arrays.
    """
    last_row = len(test_list) - 1
    id_rows = ([], [], [])
    mask_rows = ([], [], [])
    for offset in range(0, batch_size):
        items = test_list[min(index + offset, last_row)]
        for slot, text in enumerate((items[2], items[3], items[3])):
            ids, mask = encode_sent(vocab, text, max_len)
            id_rows[slot].append(ids)
            mask_rows[slot].append(mask)

    encoded = [np.array(rows, dtype='float32') for rows in id_rows]
    masks = [np.transpose(np.array(rows, dtype='float32')) for rows in mask_rows]
    return encoded[0], encoded[1], encoded[2], masks[0], masks[1], masks[2]
def load_train_data(trainList, vocab, batch_size, max_len):
    """Sample a training batch of (field 2, field 3, negative field 3) triples.

    Two rows are drawn uniformly at random per attempt: field 2 and field 3
    of the first row and field 3 of the second are encoded. An attempt is
    discarded when any of those three strings starts with ``'<a>'``. At
    least one triple is always collected, even for ``batch_size <= 0``.

    Returns:
        Six float32 arrays ``(x1, x2, x3, m1, m2, m3)``; the three mask
        arrays are transposed relative to the id arrays.
    """
    id_rows = ([], [], [])
    mask_rows = ([], [], [])
    collected = 0
    while True:
        # Draw order (positive first, then negative) is kept so a seeded
        # `random` yields the same batches.
        pos = trainList[random.randint(0, len(trainList) - 1)]
        neg = trainList[random.randint(0, len(trainList) - 1)]
        texts = (pos[2], pos[3], neg[3])
        if texts[0].startswith('<a>') or texts[1].startswith('<a>') or texts[2].startswith('<a>'):
            continue
        for slot, text in enumerate(texts):
            ids, mask = encode_sent(vocab, text, max_len)
            id_rows[slot].append(ids)
            mask_rows[slot].append(mask)
        collected += 1
        if collected >= batch_size:
            break

    encoded = [np.array(rows, dtype='float32') for rows in id_rows]
    masks = [np.transpose(np.array(rows, dtype='float32')) for rows in mask_rows]
    return encoded[0], encoded[1], encoded[2], masks[0], masks[1], masks[2]
def evaluation(score_list, test_list, out_path=None):
    """Report top-1 precision of *score_list* over the groups in *test_list*.

    Rows are grouped by the text after ``':'`` in field 1. For every group
    the row with the highest score is taken and its field 0 is inspected:
    ``'1'`` counts towards ``lev1``, ``'0'`` towards ``lev0``. Both counts
    are appended to a log file and printed with their ratio.

    Args:
        score_list: scores aligned with *test_list*; extra trailing scores
            are ignored.
        test_list: list of field lists as produced by ``load_test_list``.
        out_path: file to append the counts to; defaults to the module-level
            ``precision`` path.

    Raises:
        IndexError: if *score_list* is shorter than *test_list*.
    """
    global precision
    if out_path is None:
        out_path = precision

    sessdict = {}
    for index, items in enumerate(test_list):
        qid = items[1].split(':')[1]
        sessdict.setdefault(qid, []).append((score_list[index], items[0]))

    lev1, lev0 = float(0), float(0)
    for candidates in sessdict.values():
        # max() returns the first of equal maxima, matching a stable
        # descending sort followed by taking element 0.
        _, flag = max(candidates, key=itemgetter(0))
        if flag == '1':
            lev1 += 1
        if flag == '0':
            lev0 += 1

    # `with` closes the log even if a write fails.
    with open(out_path, 'a') as of:
        of.write('lev1:' + str(lev1) + '\n')
        of.write('lev0:' + str(lev0) + '\n')
    print('lev1 ' + str(lev1))
    print('lev0 ' + str(lev0))
    judged = lev0 + lev1
    # Avoid ZeroDivisionError when no group had a '0'/'1' top candidate.
    print('precision:' + str(lev1 / judged if judged else 0.0))
================================================
FILE: rnn_attention/tensorflow/tf_rnn_char.py
================================================
# -*- coding: utf-8 -*-
####################################################################################
#test1 top-1 accuracy: 59%
####################################################################################
import tensorflow as tf
import numpy as np
from operator import itemgetter
import random, datetime, json, insurance_qa_data_helpers
class RNN_Model(object):
    """Shared bidirectional-GRU encoder trained with a cosine margin loss.

    Three id sequences (``qlist``, ``plist``, ``nlist``) are encoded by the
    same cells and embedding. ``cos12`` / ``cos13`` are the cosine
    similarities of the first encoding with the second / third, and the cost
    is ``sum(max(0, 0.1 - cos12 + cos13))`` over the batch.
    """

    def _rnn_net(self, inputs, mask, embedding, keep_prob, batch_size, embed_dim, num_step, fw_cell, bw_cell):
        """Encode a batch of id sequences into one max-pooled vector each.

        ``mask``, ``keep_prob``, ``embed_dim`` and ``num_step`` are accepted
        for call compatibility but are not read here; dropout uses
        ``self.keep_prob``. As a side effect ``self.outputs`` is set to the
        un-pooled RNN outputs of the most recent call.
        """
        _initial_state = fw_cell.zero_state(batch_size, dtype=tf.float32)
        inputs = tf.nn.embedding_lookup(embedding, inputs)
        inputs = tf.nn.dropout(inputs, self.keep_prob)
        # [batch_size, sequence_length, embedding_size] ->
        # [sequence_length, batch_size, embedding_size]
        inputs = tf.transpose(inputs, [1, 0, 2])
        # Split into a list of sequence_length tensors, each
        # [batch_size, embedding_size].
        inputs = tf.unstack(inputs)
        # Per the original author's note, the output is a list of
        # sequence_length tensors of [batch_size, embedding_size * 2].
        outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(
            fw_cell, bw_cell, inputs,
            initial_state_fw=_initial_state, initial_state_bw=_initial_state)
        outputs = tf.transpose(tf.stack(outputs), [1, 0, 2])
        self.outputs = outputs
        # Author's note: max pooling over the time axis currently works
        # best; mean pooling and taking the last step both performed poorly.
        outputs = self._max_pooling(outputs)
        # print() as a function: the former `print outputs` statement is a
        # syntax error on Python 3.
        print(outputs)
        return outputs

    def _max_pooling(self, lstm):
        """Max-pool a rank-3 tensor over axis 1, returning ``[-1, size of axis 2]``."""
        sequence_length, embedding_size = int(lstm.get_shape()[1]), int(lstm.get_shape()[2])
        lstm = tf.expand_dims(lstm, -1)
        output = tf.nn.max_pool(lstm, ksize=[1, sequence_length, 1, 1], strides=[1, 1, 1, 1], padding='VALID')
        output = tf.reshape(output, [-1, embedding_size])
        return output

    def __init__(self, config, is_training=True):
        """Build the graph.

        Args:
            config: object exposing ``batch_size``, ``num_step``,
                ``hidden_neural_size``, ``vocabulary_size``, ``embed_dim``
                and ``hidden_layer_num``.
            is_training: accepted but not used by this implementation.
        """
        self.keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')
        self.batch_size = config.batch_size
        self.num_step = config.num_step

        self.qlist = tf.placeholder(tf.int32, [self.batch_size, self.num_step])
        # This version does not use the masks; the placeholders are kept so
        # existing feed dicts keep working.
        self.mask_q = tf.placeholder(tf.float32, [self.num_step, self.batch_size])
        self.plist = tf.placeholder(tf.int32, [self.batch_size, self.num_step])
        self.mask_p = tf.placeholder(tf.float32, [self.num_step, self.batch_size])
        self.nlist = tf.placeholder(tf.int32, [self.batch_size, self.num_step])
        self.mask_n = tf.placeholder(tf.float32, [self.num_step, self.batch_size])

        hidden_neural_size = config.hidden_neural_size
        vocabulary_size = config.vocabulary_size
        self.embed_dim = config.embed_dim
        hidden_layer_num = config.hidden_layer_num

        fw_cell = tf.contrib.rnn.GRUCell(num_units=hidden_neural_size, activation=tf.nn.relu)
        fw_cell = tf.contrib.rnn.DropoutWrapper(
            fw_cell, output_keep_prob=self.keep_prob
        )
        bw_cell = tf.contrib.rnn.GRUCell(num_units=hidden_neural_size, activation=tf.nn.relu)
        bw_cell = tf.contrib.rnn.DropoutWrapper(
            bw_cell, output_keep_prob=self.keep_prob
        )

        # embedding layer
        # NOTE(review): extent of this `with` block was reconstructed from a
        # source dump without indentation - confirm against the repository.
        with tf.device("/cpu:1"), tf.name_scope("embedding_layer"):
            self.embedding = tf.Variable(tf.truncated_normal([vocabulary_size, self.embed_dim], stddev=0.1), name='W')

        # Pass the instance's own mask placeholders. The bare names
        # mask_q/mask_p/mask_n used before only resolved through unrelated
        # module-level globals and raise NameError without them.
        q = self._rnn_net(self.qlist, self.mask_q, self.embedding, self.keep_prob, self.batch_size, self.embed_dim, self.num_step, fw_cell, bw_cell)
        tf.get_variable_scope().reuse_variables()
        p = self._rnn_net(self.plist, self.mask_p, self.embedding, self.keep_prob, self.batch_size, self.embed_dim, self.num_step, fw_cell, bw_cell)
        tf.get_variable_scope().reuse_variables()
        n = self._rnn_net(self.nlist, self.mask_n, self.embedding, self.keep_prob, self.batch_size, self.embed_dim, self.num_step, fw_cell, bw_cell)

        # Vector norms (no clipping: an all-zero encoding divides by zero).
        len_1 = tf.sqrt(tf.reduce_sum(tf.multiply(q, q), 1))
        len_2 = tf.sqrt(tf.reduce_sum(tf.multiply(p, p), 1))
        len_3 = tf.sqrt(tf.reduce_sum(tf.multiply(n, n), 1))

        self.cos12 = tf.reduce_sum(tf.multiply(q, p), axis=1) / (len_1 * len_2)
        self.cos13 = tf.reduce_sum(tf.multiply(q, n), axis=1) / (len_1 * len_3)
        self.q = q
        self.p = p

        zero = tf.constant(np.zeros(self.batch_size, dtype='float32'))
        margin = tf.constant(np.full(self.batch_size, 0.1, dtype='float32'))
        diff = tf.cast(tf.maximum(zero, margin - self.cos12 + self.cos13), dtype='float32')
        self.cost = tf.reduce_sum(diff)
        # A triple counts as correct when its hinge loss is exactly zero.
        self.accuracy = tf.reduce_sum(tf.cast(tf.equal(zero, diff), dtype='float32')) / float(self.batch_size)
def train_step(model, qlist, plist, nlist, mask_q, mask_p, mask_n):
    """Run one optimisation step on the given batch and print loss/accuracy.

    Relies on the module-level ``sess``, ``train_op``, ``global_step`` and
    ``config``. Dropout keep probability comes from ``config.keep_prob``.
    """
    fetches = [
        model.cost, model.accuracy, global_step, train_op,
        model.cos12, model.q, model.p, model.outputs,
    ]
    feed_dict = dict([
        (model.qlist, qlist),
        (model.plist, plist),
        (model.nlist, nlist),
        (model.mask_q, mask_q),
        (model.mask_p, mask_p),
        (model.mask_n, mask_n),
        (model.keep_prob, config.keep_prob),
    ])
    results = sess.run(fetches, feed_dict)
    # Only the first three fetched values are reported.
    cost, accuracy, step = results[0], results[1], results[2]
    time_str = datetime.datetime.now().isoformat()
    print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, cost, accuracy))
def dev_step(model, vocab, batch_size, max_len):
    """Score the whole module-level ``test_list`` and hand the scores to ``evaluation``.

    Batches are built with ``FLAGS.batch_size`` (the ``batch_size`` argument
    is not read) and dropout is disabled by feeding a keep probability of 1.0.
    At least one batch is always evaluated.
    """
    score_list = []
    start = int(0)
    while True:
        batch = insurance_qa_data_helpers.load_val_data(test_list, vocab, start, FLAGS.batch_size, max_len)
        qlist, plist, nlist, mask_q, mask_p, mask_n = batch
        feed_dict = dict([
            (model.qlist, qlist),
            (model.plist, plist),
            (model.nlist, nlist),
            (model.mask_q, mask_q),
            (model.mask_p, mask_p),
            (model.mask_n, mask_n),
            (model.keep_prob, float(1.0)),
        ])
        batch_scores = sess.run([model.cos12], feed_dict)
        score_list.extend(batch_scores[0])
        start += FLAGS.batch_size
        if start >= len(test_list):
            break
    insurance_qa_data_helpers.evaluation(score_list, test_list)
# Command-line flags. dev_step is run whenever the global step is a multiple
# of `evaluate_every`.
tf.flags.DEFINE_integer('evaluate_every',10000,'evaluate every')
tf.flags.DEFINE_integer('batch_size',64,'the batch_size of the training procedure')
# NOTE: the flag name is spelled 'emdedding_dim'; Config reads it under that
# exact name, so renaming it would change the command-line interface.
tf.flags.DEFINE_integer('emdedding_dim',100,'embedding dim')
tf.flags.DEFINE_integer('hidden_neural_size',200,'LSTM hidden neural size')
tf.flags.DEFINE_integer('hidden_layer_num',1,'LSTM hidden layer num')
tf.flags.DEFINE_integer('max_len',100,'max_len of training sentence')
tf.flags.DEFINE_float('init_scale',0.1,'init scale')
tf.flags.DEFINE_float('keep_prob',0.5,'dropout rate')
tf.flags.DEFINE_integer('num_epoch',1000000,'num epoch')
tf.flags.DEFINE_integer('max_grad_norm',5,'max_grad_norm')
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
FLAGS = tf.flags.FLAGS
# NOTE(review): _parse_flags is a private tf.flags method - confirm it exists
# in the TensorFlow version in use.
FLAGS._parse_flags()
# Data is loaded at import time; these names are module-level globals that
# train_step / dev_step and the training script below rely on.
vocab = insurance_qa_data_helpers.build_vocab()
train_list = insurance_qa_data_helpers.load_train_list()
# One training batch is drawn here, defining module-level qlist ... mask_n.
qlist, plist, nlist, mask_q, mask_p, mask_n = insurance_qa_data_helpers.load_train_data(train_list, vocab, FLAGS.batch_size, FLAGS.max_len)
test_list = insurance_qa_data_helpers.load_test_list()
class Config(object):
    # Hyper-parameters copied from the parsed command-line FLAGS; consumed
    # by RNN_Model and the training script.
    hidden_neural_size=FLAGS.hidden_neural_size
    # One embedding row per vocabulary entry.
    vocabulary_size=len(vocab)
    embed_dim=FLAGS.emdedding_dim
    hidden_layer_num=FLAGS.hidden_layer_num
    keep_prob=FLAGS.keep_prob
    batch_size = FLAGS.batch_size
    # The sequence length (number of RNN steps) is the max_len flag.
    num_step = FLAGS.max_len
    max_grad_norm=FLAGS.max_grad_norm
    num_epoch = FLAGS.num_epoch
# Training configuration; eval_config differs only in keep_prob (no dropout).
config = Config()
eval_config=Config()
eval_config.keep_prob=1.0
# NOTE(review): the nesting below was reconstructed from a source dump
# without indentation - confirm the extent of the tf.device and
# tf.variable_scope blocks against the repository.
with tf.Graph().as_default():
    with tf.device('/gpu:1'):
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Model variables are initialised uniformly in [-init_scale, init_scale].
            initializer = tf.random_uniform_initializer(-1*FLAGS.init_scale,1*FLAGS.init_scale)
            with tf.variable_scope("model",reuse=None,initializer=initializer):
                model = RNN_Model(config=config, is_training=True)
            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            #optimizer = tf.train.RMSPropOptimizer(0.01)
            #optimizer = tf.train.AdamOptimizer(0.1)
            optimizer = tf.train.GradientDescentOptimizer(0.2)
            grads_and_vars = optimizer.compute_gradients(model.cost)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
            # Initialize all variables
            sess.run(tf.global_variables_initializer())
            # Each iteration samples one fresh batch and performs one update.
            for i in range(config.num_epoch):
                qlist, plist, nlist, mask_q, mask_p, mask_n = insurance_qa_data_helpers.load_train_data(train_list, vocab, FLAGS.batch_size, FLAGS.max_len)
                train_step(model, qlist, plist, nlist, mask_q, mask_p, mask_n)
                current_step = tf.train.global_step(sess, global_step)
                # Periodically score the test list.
                if current_step % FLAGS.evaluate_every == 0:
                    dev_step(model, vocab, FLAGS.batch_size, FLAGS.max_len)
================================================
FILE: swem/swem_hier.py
================================================
import numpy as np
import tensorflow as tf
import time, os, random, datetime, sys
from sklearn import metrics
sys.path.append('../')
import config, utils
################################################################################
# Insurance-QA
# AUC 0.96, top 1 precision:31%
#
# quora-data
# best precision: 0.8369, best threshold:0.62
################################################################################
class SWEM_HIER(object):
    """Pairwise sentence scorer: window-2 average pooling, then max over time.

    Both inputs share one trainable embedding matrix. ``cos`` is the clipped
    cosine similarity of the two pooled vectors and ``losses`` the summed
    log loss of ``cos`` against the labels ``y``.
    """

    def __init__(self,
                 sequence_length,
                 vocab_size,
                 embedding_size,
                 embeddings):
        """Build the graph; ``vocab_size`` is accepted but not used."""
        self.x1 = tf.placeholder(tf.int32, [None, sequence_length])
        self.x2 = tf.placeholder(tf.int32, [None, sequence_length])
        self.y = tf.placeholder(tf.float32, [None])
        # Vector of ones fed by the caller, used to form 1 - y and 1 - p.
        self.one = tf.placeholder(tf.float32, [None])

        # NOTE(review): extent of this `with` block was reconstructed from a
        # source dump without indentation - confirm against the repository.
        with tf.device('/cpu:0'), tf.name_scope('embedding'):
            self.word_mat = tf.Variable(embeddings, trainable=True, dtype=tf.float32)
            self.x1_mat_exp = tf.expand_dims(
                tf.nn.embedding_lookup(self.word_mat, self.x1), -1)
            self.x2_mat_exp = tf.expand_dims(
                tf.nn.embedding_lookup(self.word_mat, self.x2), -1)

        # Average every window of 2 consecutive positions, then take the
        # maximum over axis 1. (The author also left a disabled variant
        # that concatenated window-3 features.)
        pooled = []
        for expanded in (self.x1_mat_exp, self.x2_mat_exp):
            averaged = tf.nn.avg_pool(expanded, ksize=[1, 2, 1, 1],
                                      strides=[1, 1, 1, 1], padding='VALID')
            pooled.append(tf.reshape(tf.reduce_max(averaged, 1), [-1, embedding_size]))
        p1, p2 = pooled

        self.cos = self.cosine(p1, p2)
        self.losses = self.logloss(self.y, self.one, self.cos)

    def logloss(self, y, v_one, sim):
        """Return ``-sum(y*log(sim) + (1-y)*log(1-sim))`` over the last axis."""
        positive_term = tf.multiply(y, tf.log(sim))      # y*log(p)
        negative_weight = tf.subtract(v_one, y)          # 1-y
        negative_log = tf.log(tf.subtract(v_one, sim))   # log(1-p)
        per_example = -tf.add(positive_term, tf.multiply(negative_weight, negative_log))
        return tf.reduce_sum(per_example, -1)

    def cosine(self, t1, t2):
        """Return row-wise cosine similarity of t1 and t2, clipped to [1e-5, 0.99999]."""
        norm1 = tf.sqrt(tf.reduce_sum(tf.multiply(t1, t1), 1))
        norm2 = tf.sqrt(tf.reduce_sum(tf.multiply(t2, t2), 1))
        dot = tf.reduce_sum(tf.multiply(t1, t2), 1)
        similarity = tf.div(dot, tf.multiply(norm1, norm2))
        return tf.clip_by_value(similarity, 1e-5, 0.99999)
def get_constant(batch_size):
    """Return ``(ones, zeros)``: two float arrays of length *batch_size*."""
    ones = np.array((1.0,) * batch_size)
    zeros = np.array((0.0,) * batch_size)
    return ones, zeros
# Sequence length passed to the data loaders and to SWEM_HIER.
max_len = 100
# Number of train_step() calls made by the training loop.
num_epoch = 200000
batch_size = 256
# The test set is evaluated whenever the global step is a multiple of this.
checkpoint_every = 10000
vocab, embeddings = utils.load_embeddings()
# Embedding width is taken from the first loaded vector.
embedding_size = len(embeddings[0])
train_data, test_data = utils.load_train_data(vocab, max_len), utils.load_test_data(vocab, max_len)
print('load data done ......')
print(embeddings.shape)
prev_auc = 0.0
# Build the graph, then alternate training steps with periodic evaluation.
# NOTE(review): nesting was reconstructed from a source dump without
# indentation - confirm against the repository.
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=False)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        swem = SWEM_HIER(max_len, len(vocab), embedding_size, embeddings)
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-1)
        grads_and_vars = optimizer.compute_gradients(swem.losses)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # A per-run directory runs/<timestamp>/checkpoints is created and a
        # Saver is built, but this script never calls saver.save().
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        # tf.global_variables / tf.global_variables_initializer replace the
        # deprecated tf.all_variables / tf.initialize_all_variables.
        saver = tf.train.Saver(tf.global_variables())
        sess.run(tf.global_variables_initializer())

        def train_step():
            """Run one optimizer update on a batch from utils.gen_train_batch_yxx and log the loss."""
            y, x1, x2 = utils.gen_train_batch_yxx(train_data, batch_size)
            one, _ = get_constant(batch_size)
            feed_dict = {swem.x1: x1, swem.x2: x2, swem.y: y, swem.one: one}
            _, step, loss, _ = sess.run(
                [train_op, global_step, swem.losses, swem.cos], feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}".format(time_str, step, loss))

        def test_step():
            """Score the test set batch by batch.

            Returns:
                ``(labels, groups, scores)``, each truncated to
                ``len(test_data)`` entries.
            """
            yp, y, group = [], [], []
            for i in range(0, len(test_data), batch_size):
                f, g, x1, x2 = utils.gen_test_batch_yxx(test_data, i, i + batch_size)
                one, _ = get_constant(len(f))
                feed_dict = {swem.x1: x1, swem.x2: x2, swem.y: f, swem.one: one}
                _, cos = sess.run([swem.losses, swem.cos], feed_dict)
                yp.extend(cos)
                y.extend(f)
                group.extend(g)
            total = len(test_data)
            return y[:total], group[:total], yp[:total]

        for i in range(num_epoch):
            train_step()
            current_step = tf.train.global_step(sess, global_step)
            if current_step % checkpoint_every == 0:
                y, g, yp = test_step()
                utils._eval(y, g, yp)
        #utils.save_features(features[0] + features[1] + features[2], './data/gen_sweg_hier_train.f')
        #utils.save_features(features[3], './data/gen_sweg_hier_test.f')
================================================
FILE: swem/swem_hier_margin.py
================================================
import numpy as np
import tensorflow as tf
import time, os, random, datetime, sys
from sklearn import metrics
sys.path.append('../')
import config, utils
#top 1 precision:54%
class SWEM_HIER(object):
    """Triplet SWEM scorer with two embedding tables and a margin loss.

    Each of ``q``, ``qp`` and ``qn`` is represented by the concatenation of
    (a) max-over-time of window-2 averages from ``word_mat`` and (b)
    max-over-time of the raw embeddings from ``word_mat1``. The loss is
    ``sum(max(0, margin - (cos(q, qp) - cos(q, qn))))``.
    """

    def __init__(self,
                 margin,
                 sequence_length,
                 vocab_size,
                 embedding_size,
                 embeddings):
        """Build the graph; ``vocab_size`` is accepted but not used."""
        # Vector of zeros fed by the caller, used as the hinge floor.
        self.zero = tf.placeholder(tf.float32, [None])
        self.q = tf.placeholder(tf.int32, [None, sequence_length])
        self.qp = tf.placeholder(tf.int32, [None, sequence_length])
        self.qn = tf.placeholder(tf.int32, [None, sequence_length])

        def embed(table, ids):
            # Look up ids and append a trailing channel axis for pooling.
            return tf.expand_dims(tf.nn.embedding_lookup(table, ids), -1)

        def pool(expanded, window):
            # Average over `window` consecutive positions, then max over axis 1.
            averaged = tf.nn.avg_pool(expanded, ksize=[1, window, 1, 1],
                                      strides=[1, 1, 1, 1], padding='VALID')
            return tf.reshape(tf.reduce_max(averaged, 1), [-1, embedding_size])

        # NOTE(review): extent of this `with` block was reconstructed from a
        # source dump without indentation - confirm against the repository.
        with tf.device('/cpu:0'), tf.name_scope('embedding'):
            self.word_mat = tf.Variable(embeddings, trainable=True, dtype=tf.float32)
            self.q_mat_exp = embed(self.word_mat, self.q)
            self.qp_mat_exp = embed(self.word_mat, self.qp)
            self.qn_mat_exp = embed(self.word_mat, self.qn)

            # Second, independently trained copy of the embeddings.
            self.word_mat1 = tf.Variable(embeddings, trainable=True, dtype=tf.float32)
            self.q_mat_exp1 = embed(self.word_mat1, self.q)
            self.qp_mat_exp1 = embed(self.word_mat1, self.qp)
            self.qn_mat_exp1 = embed(self.word_mat1, self.qn)

        bigram = [pool(x, 2) for x in (self.q_mat_exp, self.qp_mat_exp, self.qn_mat_exp)]
        # A window of 1 leaves the embeddings unchanged before the max.
        unigram = [pool(x, 1) for x in (self.q_mat_exp1, self.qp_mat_exp1, self.qn_mat_exp1)]
        q, qp, qn = [tf.concat([two, one], 1) for two, one in zip(bigram, unigram)]

        self.cos_q_qp = self.cosine(q, qp)
        self.cos_q_qn = self.cosine(q, qn)
        self.losses, loss_batch = self.margin_loss(self.zero, margin, self.cos_q_qp, self.cos_q_qn)
        # A triple is counted correct when its hinge loss is exactly zero.
        correct = tf.equal(self.zero, loss_batch)
        self.accuracy = tf.reduce_mean(tf.cast(correct, "float"))

    def margin_loss(self, zero, margin, cos_q_qp, cos_q_qn):
        """Return ``(summed, per_example)`` hinge loss ``max(zero, margin - (pos - neg))``."""
        gap = tf.subtract(cos_q_qp, cos_q_qn)
        loss_batch = tf.maximum(zero, tf.subtract(margin, gap))
        return tf.reduce_sum(loss_batch), loss_batch

    def logloss(self, y, v_one, sim):
        """Return ``-sum(y*log(sim) + (1-y)*log(1-sim))`` over the last axis."""
        positive_term = tf.multiply(y, tf.log(sim))      # y*log(p)
        negative_weight = tf.subtract(v_one, y)          # 1-y
        negative_log = tf.log(tf.subtract(v_one, sim))   # log(1-p)
        per_example = -tf.add(positive_term, tf.multiply(negative_weight, negative_log))
        return tf.reduce_sum(per_example, -1)

    def cosine(self, t1, t2):
        """Return row-wise cosine similarity of t1 and t2, clipped to [1e-5, 0.99999]."""
        norm1 = tf.sqrt(tf.reduce_sum(tf.multiply(t1, t1), 1))
        norm2 = tf.sqrt(tf.reduce_sum(tf.multiply(t2, t2), 1))
        dot = tf.reduce_sum(tf.multiply(t1, t2), 1)
        similarity = tf.div(dot, tf.multiply(norm1, norm2))
        return tf.clip_by_value(similarity, 1e-5, 0.99999)
def get_constant(batch_size):
    """Return ``(ones, zeros)``: two float arrays of length *batch_size*."""
    ones = np.array((1.0,) * batch_size)
    zeros = np.array((0.0,) * batch_size)
    return ones, zeros
# Hinge margin passed to SWEM_HIER.
margin = 0.05
# Sequence length passed to the data loaders and to SWEM_HIER.
max_len = 200
# Number of train_step() calls made by the training loop.
num_epoch = 200000
batch_size = 256
# The test set is evaluated whenever the global step is a multiple of this.
checkpoint_every = 50000
vocab, embeddings = utils.load_embeddings()
# Embedding width is taken from the first loaded vector.
embedding_size = len(embeddings[0])
train_data, test_data = utils.load_train_data(vocab, max_len), utils.load_test_data(vocab, max_len)
print('load data done ......')
print(embeddings.shape)
prev_auc = 0.0
# Build the graph, then alternate training steps with periodic evaluation.
# NOTE(review): nesting was reconstructed from a source dump without
# indentation - confirm against the repository.
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=False)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        swem = SWEM_HIER(margin, max_len, len(vocab), embedding_size, embeddings)
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-1)
        grads_and_vars = optimizer.compute_gradients(swem.losses)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # A per-run directory runs/<timestamp>/checkpoints is created and a
        # Saver is built, but this script never calls saver.save().
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        # tf.global_variables / tf.global_variables_initializer replace the
        # deprecated tf.all_variables / tf.initialize_all_variables.
        saver = tf.train.Saver(tf.global_variables())
        sess.run(tf.global_variables_initializer())

        def train_step():
            """Run one optimizer update on a batch from utils.gen_train_batch_qpn and log loss/accuracy."""
            q, qp, qn = utils.gen_train_batch_qpn(train_data, batch_size)
            _, zero = get_constant(batch_size)
            feed_dict = {swem.q: q, swem.qp: qp, swem.qn: qn, swem.zero: zero}
            _, step, loss, _, acc = sess.run(
                [train_op, global_step, swem.losses, swem.cos_q_qp, swem.accuracy], feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc{:g}".format(time_str, step, loss, acc))

        def test_step():
            """Score the test set batch by batch.

            The second sequence is fed as both ``qp`` and ``qn``; only
            ``cos_q_qp`` is used as the score.

            Returns:
                ``(labels, groups, scores)``, each truncated to
                ``len(test_data)`` entries.
            """
            yp, y, group = [], [], []
            for i in range(0, len(test_data), batch_size):
                f, g, q1, q2 = utils.gen_test_batch_qpn(test_data, i, i + batch_size)
                _, zero = get_constant(len(f))
                feed_dict = {swem.q: q1, swem.qp: q2, swem.qn: q2, swem.zero: zero}
                _, cos = sess.run([swem.losses, swem.cos_q_qp], feed_dict)
                yp.extend(cos)
                y.extend(f)
                group.extend(g)
            total = len(test_data)
            return y[:total], group[:total], yp[:total]

        for i in range(num_epoch):
            train_step()
            current_step = tf.train.global_step(sess, global_step)
            if current_step % checkpoint_every == 0:
                y, g, yp = test_step()
                auc = utils.eval_auc(y, g, yp)
                top1_prec = utils._eval_top1_prec(y, g, yp)
                #if auc < prev_auc:
                #    _flist = [(_f, [s]) for s, _f in zip(score[:len(test_data)], flags)]
                #    features.append(_flist)
                #    break
                #prev_auc = auc
        #utils.save_features(features[0] + features[1] + features[2], './data/gen_sweg_hier_train.f')
        #utils.save_features(features[3], './data/gen_sweg_hier_test.f')
================================================
FILE: swem/swem_max_margin.py
================================================
import numpy as np
import tensorflow as tf
import time, os, random, datetime, sys
from sklearn import metrics
sys.path.append('../')
import config, utils
class SWEM_HIER(object):
    """Triplet SWEM scorer using plain max-over-time pooling and a margin loss.

    ``q``, ``qp`` and ``qn`` share one trainable embedding matrix; each is
    reduced by a maximum over axis 1 of its embedded sequence. The loss is
    ``sum(max(0, margin - (cos(q, qp) - cos(q, qn))))``.
    """

    def __init__(self,
                 margin,
                 sequence_length,
                 vocab_size,
                 embedding_size,
                 embeddings):
        """Build the graph; ``vocab_size`` is accepted but not used."""
        # Vector of zeros fed by the caller, used as the hinge floor.
        self.zero = tf.placeholder(tf.float32, [None])
        self.q = tf.placeholder(tf.int32, [None, sequence_length])
        self.qp = tf.placeholder(tf.int32, [None, sequence_length])
        self.qn = tf.placeholder(tf.int32, [None, sequence_length])

        # NOTE(review): extent of this `with` block was reconstructed from a
        # source dump without indentation - confirm against the repository.
        with tf.device('/cpu:0'), tf.name_scope('embedding'):
            self.word_mat = tf.Variable(embeddings, trainable=True, dtype=tf.float32)
            self.q_mat_exp = tf.expand_dims(
                tf.nn.embedding_lookup(self.word_mat, self.q), -1)
            self.qp_mat_exp = tf.expand_dims(
                tf.nn.embedding_lookup(self.word_mat, self.qp), -1)
            self.qn_mat_exp = tf.expand_dims(
                tf.nn.embedding_lookup(self.word_mat, self.qn), -1)

        def max_over_time(expanded):
            # Max over axis 1 (no averaging window; the author left the
            # window-2 average-pooling variant disabled).
            return tf.reshape(tf.reduce_max(expanded, 1), [-1, embedding_size])

        q = max_over_time(self.q_mat_exp)
        qp = max_over_time(self.qp_mat_exp)
        qn = max_over_time(self.qn_mat_exp)

        self.cos_q_qp = self.cosine(q, qp)
        self.cos_q_qn = self.cosine(q, qn)
        self.losses, loss_batch = self.margin_loss(self.zero, margin, self.cos_q_qp, self.cos_q_qn)
        # A triple is counted correct when its hinge loss is exactly zero.
        correct = tf.equal(self.zero, loss_batch)
        self.accuracy = tf.reduce_mean(tf.cast(correct, "float"))

    def margin_loss(self, zero, margin, cos_q_qp, cos_q_qn):
        """Return ``(summed, per_example)`` hinge loss ``max(zero, margin - (pos - neg))``."""
        gap = tf.subtract(cos_q_qp, cos_q_qn)
        loss_batch = tf.maximum(zero, tf.subtract(margin, gap))
        return tf.reduce_sum(loss_batch), loss_batch

    def logloss(self, y, v_one, sim):
        """Return ``-sum(y*log(sim) + (1-y)*log(1-sim))`` over the last axis."""
        positive_term = tf.multiply(y, tf.log(sim))      # y*log(p)
        negative_weight = tf.subtract(v_one, y)          # 1-y
        negative_log = tf.log(tf.subtract(v_one, sim))   # log(1-p)
        per_example = -tf.add(positive_term, tf.multiply(negative_weight, negative_log))
        return tf.reduce_sum(per_example, -1)

    def cosine(self, t1, t2):
        """Return row-wise cosine similarity of t1 and t2, clipped to [1e-5, 0.99999]."""
        norm1 = tf.sqrt(tf.reduce_sum(tf.multiply(t1, t1), 1))
        norm2 = tf.sqrt(tf.reduce_sum(tf.multiply(t2, t2), 1))
        dot = tf.reduce_sum(tf.multiply(t1, t2), 1)
        similarity = tf.div(dot, tf.multiply(norm1, norm2))
        return tf.clip_by_value(similarity, 1e-5, 0.99999)
def get_constant(batch_size):
    """Return ``(ones, zeros)``: two float arrays of length *batch_size*."""
    ones = np.array((1.0,) * batch_size)
    zeros = np.array((0.0,) * batch_size)
    return ones, zeros
# Hinge margin passed to SWEM_HIER.
margin = 0.05
# Sequence length passed to the data loaders and to SWEM_HIER.
max_len = 200
# Number of train_step() calls made by the training loop.
num_epoch = 200000
batch_size = 256
# The test set is evaluated whenever the global step is a multiple of this.
checkpoint_every = 50000
vocab, embeddings = utils.load_embeddings()
# Embedding width is taken from the first loaded vector.
embedding_size = len(embeddings[0])
train_data, test_data = utils.load_train_data(vocab, max_len), utils.load_test_data(vocab, max_len)
print('load data done ......')
print(embeddings.shape)
prev_auc = 0.0
# Build the graph, then alternate training steps with periodic evaluation.
# NOTE(review): nesting was reconstructed from a source dump without
# indentation - confirm against the repository.
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=False)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        swem = SWEM_HIER(margin, max_len, len(vocab), embedding_size, embeddings)
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-1)
        grads_and_vars = optimizer.compute_gradients(swem.losses)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # A per-run directory runs/<timestamp>/checkpoints is created and a
        # Saver is built, but this script never calls saver.save().
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        # tf.global_variables / tf.global_variables_initializer replace the
        # deprecated tf.all_variables / tf.initialize_all_variables.
        saver = tf.train.Saver(tf.global_variables())
        sess.run(tf.global_variables_initializer())

        def train_step():
            """Run one optimizer update on a batch from utils.gen_train_batch_qpn and log loss/accuracy."""
            q, qp, qn = utils.gen_train_batch_qpn(train_data, batch_size)
            _, zero = get_constant(batch_size)
            feed_dict = {swem.q: q, swem.qp: qp, swem.qn: qn, swem.zero: zero}
            _, step, loss, _, acc = sess.run(
                [train_op, global_step, swem.losses, swem.cos_q_qp, swem.accuracy], feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, acc))

        def test_step():
            """Score the test set batch by batch.

            The second sequence is fed as both ``qp`` and ``qn``; only
            ``cos_q_qp`` is used as the score.

            Returns:
                ``(labels, groups, scores)``, each truncated to
                ``len(test_data)`` entries.
            """
            yp, y, group = [], [], []
            for i in range(0, len(test_data), batch_size):
                f, g, q1, q2 = utils.gen_test_batch_qpn(test_data, i, i + batch_size)
                _, zero = get_constant(len(f))
                feed_dict = {swem.q: q1, swem.qp: q2, swem.qn: q2, swem.zero: zero}
                _, cos = sess.run([swem.losses, swem.cos_q_qp], feed_dict)
                yp.extend(cos)
                y.extend(f)
                group.extend(g)
            total = len(test_data)
            return y[:total], group[:total], yp[:total]

        for i in range(num_epoch):
            train_step()
            current_step = tf.train.global_step(sess, global_step)
            if current_step % checkpoint_every == 0:
                y, g, yp = test_step()
                auc = utils.eval_auc(y, g, yp)
                top1_prec = utils._eval_top1_prec(y, g, yp)
                #if auc < prev_auc:
                #    _flist = [(_f, [s]) for s, _f in zip(score[:len(test_data)], flags)]
                #    features.append(_flist)
                #    break
                #prev_auc = auc
        #utils.save_features(features[0] + features[1] + features[2], './data/gen_sweg_hier_train.f')
        #utils.save_features(features[3], './data/gen_sweg_hier_test.f')
================================================
FILE: utils.py
================================================
import numpy as np
import random, sys, config
from sklearn import metrics
from operator import itemgetter
from itertools import groupby
def load_embeddings():
    """Load the word vectors stored in ``config.w2v_bin_file``.

    The file is read as text.  Its first line holds ``<size> <dim>``; every
    following line holds a word followed by at least ``dim`` space-separated
    floats.  Two extra rows filled with 0.01 are appended for the
    ``'UNKNOWN'`` and ``'<a>'`` tokens.

    Returns:
        (vocab, embeddings): ``vocab`` maps each word to its row index and
        ``embeddings`` is a numpy array with one row per vocabulary entry.

    Raises:
        IndexError: if the header or a vector line has too few fields.
        ValueError: if a header or vector field is not numeric.
    """
    embeddings, vocab = [], {}
    # `with` guarantees the handle is closed; reading line by line avoids
    # buffering the whole file as split strings before parsing.
    with open(config.w2v_bin_file) as fin:
        header = fin.readline().strip().split(' ')
        _size, dim = int(header[0]), int(header[1])
        for line in fin:
            fields = line.strip().split(' ')
            # Row index of a word is its position in the embeddings list.
            vocab[fields[0]] = len(embeddings)
            embeddings.append([float(fields[k]) for k in range(1, dim + 1)])
    for special in ('UNKNOWN', '<a>'):
        vocab[special] = len(embeddings)
        embeddings.append([0.01] * dim)
    return vocab, np.array(embeddings)
def encode_sent(s, vocab, max_len):
    """Convert an underscore-delimited sentence into exactly ``max_len`` ids.

    The sentence is split on ``'_'``, truncated to ``max_len`` tokens or
    right-padded with ``'<a>'``, and every token is looked up in ``vocab``;
    tokens missing from ``vocab`` use the ``'UNKNOWN'`` entry.

    Returns:
        A list of ``max_len`` vocabulary ids.
    """
    tokens = s.split('_')
    missing = max_len - len(tokens)
    if missing > 0:
        tokens = tokens + ['<a>'] * missing
    else:
        tokens = tokens[:max_len]
    return [vocab[tok if tok in vocab else 'UNKNOWN'] for tok in tokens]
def load_train_data(vocab, max_len):
    """Load the training set for the dataset selected by ``config.dataset``.

    Dispatches to ``ins_load_train_data`` or ``qur_load_train_test_data``
    (on ``config.train_file``).  For any other ``config.dataset`` value a
    message is printed and the process exits with status 1.
    """
    if config.dataset == config.dataset_ins:
        return ins_load_train_data(vocab, max_len)
    if config.dataset == config.dataset_qur:
        return qur_load_train_test_data(config.train_file, vocab, max_len)
    print('bad load_train_data')
    # sys.exit rather than the site-provided exit(), which is meant for the
    # interactive shell and is not defined when site is disabled.
    sys.exit(1)
def qur_load_train_test_data(_file, vocab, max_len):
    """Read a ``<flag> <q1> <q2>`` file into encoded triples.

    Each line must contain exactly three space-separated fields.  Both
    sentences are encoded with ``encode_sent``.

    Returns:
        A list of ``(int(flag), q1_ids, q2_ids)`` tuples, one per line.

    Raises:
        ValueError: if a line does not have exactly three fields or the
            flag is not an integer.
    """
    rows = []
    # `with` closes the file even if a malformed line raises.
    with open(_file) as fin:
        for line in fin:
            flag, q1, q2 = line.strip().split(' ')
            rows.append((int(flag),
                         encode_sent(q1, vocab, max_len),
                         encode_sent(q2, vocab, max_len)))
    return rows
def ins_load_train_data(vocab, max_len):
    """Read ``config.train_file`` into encoded sentence pairs.

    Each line must contain exactly three space-separated fields; the first
    one is parsed but not kept.  Both sentences are encoded with
    ``encode_sent``.

    Returns:
        A list of ``(q1_ids, q2_ids)`` tuples, one per line.

    Raises:
        ValueError: if a line does not have exactly three fields.
    """
    pairs = []
    # `with` closes the file even if a malformed line raises.
    with open(config.train_file) as fin:
        for line in fin:
            _flag, q1, q2 = line.strip().split(' ')
            pairs.append((encode_sent(q1, vocab, max_len),
                          encode_sent(q2, vocab, max_len)))
    return pairs
def load_test_data(vocab, max_len):
    """Load the test set for the dataset selected by ``config.dataset``.

    Dispatches to ``ins_load_test_data`` or ``qur_load_train_test_data``
    (on ``config.test1_file``).  For any other ``config.dataset`` value a
    message is printed and the process exits with status 1.
    """
    if config.dataset == config.dataset_ins:
        return ins_load_test_data(vocab, max_len)
    if config.dataset == config.dataset_qur:
        return qur_load_train_test_data(config.test1_file, vocab, max_len)
    print('bad load_test_data')
    # sys.exit rather than the site-provided exit(), which is meant for the
    # interactive shell and is not defined when site is disabled.
    sys.exit(1)
def ins_load_test_data(vocab, max_len):
    """Read ``config.test1_file`` into encoded test rows.

    Each line must contain exactly four space-separated fields:
    ``<f> <g> <q1> <q2>``.  ``f`` and ``g`` are kept as the strings read from
    the file; both sentences are encoded with ``encode_sent``.

    Returns:
        A list of ``(f, g, q1_ids, q2_ids)`` tuples, one per line.

    Raises:
        ValueError: if a line does not have exactly four fields.
    """
    rows = []
    # `with` closes the file even if a malformed line raises.
    with open(config.test1_file) as fin:
        for line in fin:
            f, g, q1, q2 = line.strip().split(' ')
            rows.append((f, g,
                         encode_sent(q1, vocab, max_len),
                         encode_sent(q2, vocab, max_len)))
    return rows
def gen_train_batch_qpn(_data, batch_size):
    """Sample a (q, qp, qn) training batch from a list of sentence pairs.

    Two independent samples of ``batch_size`` pairs are drawn without
    replacement.  The first supplies ``q`` and ``qp`` (both halves of the same
    pair); the second supplies ``qn`` (its second halves only).

    Returns:
        Three numpy arrays ``(q, qp, qn)``, each with ``batch_size`` rows.
    """
    positives = random.sample(_data, batch_size)
    negatives = random.sample(_data, batch_size)
    questions, pos_side, neg_side = [], [], []
    for first, second in positives:
        questions.append(first)
        pos_side.append(second)
    for _first, second in negatives:
        neg_side.append(second)
    return np.array(questions), np.array(pos_side), np.array(neg_side)
def gen_train_batch_yxx(_data, batch_size):
    """Build a (y, x1, x2) training batch for the dataset in ``config.dataset``.

    Dispatches to ``ins_gen_train_batch_yxx`` or ``qur_gen_train_batch_yxx``.
    For any other ``config.dataset`` value a message is printed and the
    process exits with status 1.
    """
    if config.dataset == config.dataset_ins:
        return ins_gen_train_batch_yxx(_data, batch_size)
    if config.dataset == config.dataset_qur:
        return qur_gen_train_batch_yxx(_data, batch_size)
    print('bad gen_train_batch_yxx')
    # sys.exit rather than the site-provided exit(), which is meant for the
    # interactive shell and is not defined when site is disabled.
    sys.exit(1)
def qur_gen_train_batch_yxx(_data, batch_size):
    """Sample ``batch_size`` ``(y, x1, x2)`` triples without replacement.

    Returns:
        Three numpy arrays ``(y, x1, x2)`` holding the first, second and
        third element of every sampled triple.
    """
    batch = random.sample(_data, batch_size)
    labels, left, right = [], [], []
    for label, first, second in batch:
        labels.append(label)
        left.append(first)
        right.append(second)
    return np.array(labels), np.array(left), np.array(right)
def ins_gen_train_batch_yxx(_data, batch_size):
    """Build a labelled (y, x1, x2) batch from a list of sentence pairs.

    Roughly three quarters of the batch are sampled pairs labelled 1.0.  The
    remaining rows are labelled 0.0 and combine the first sentence of the
    leading sampled pairs with the second sentence of an independent random
    sample.  Because that second sample is drawn from the same data, a 0.0
    row can occasionally contain a genuine pair.

    The label-0.0 count is ``batch_size // 4`` and the label-1.0 count is the
    remainder, so the batch always has exactly ``batch_size`` rows.  The
    previous ``int(batch_size / 4 * 3) + int(batch_size / 4)`` split came up
    short whenever ``batch_size`` was not a multiple of 4; for multiples of 4
    the split is unchanged.

    Returns:
        Three numpy arrays ``(y, x1, x2)`` with ``batch_size`` rows each.
    """
    n_neg = batch_size // 4
    n_pos = batch_size - n_neg
    positives = random.sample(_data, n_pos)
    negatives = random.sample(_data, n_neg)
    y = [1.0] * n_pos + [0.0] * n_neg
    x1 = [first for first, _ in positives] + [first for first, _ in positives[:n_neg]]
    x2 = [second for _, second in positives] + [second for _, second in negatives]
    return np.array(y), np.array(x1), np.array(x2)
def gen_test_batch_qpn(_data, start, end):
    """Return rows ``start:end`` of the test data as a fixed-size batch.

    If the slice is shorter than ``end - start`` it is padded by repeating its
    last row; the slice is a copy, so ``_data`` itself is not modified.

    Returns:
        ``(f, g, q1, q2)``: two lists of ints (first and second field of each
        row) and two numpy arrays (third and fourth field of each row).
    """
    batch = _data[start:end]
    shortfall = (end - start) - len(batch)
    if shortfall > 0:
        batch.extend([batch[-1]] * shortfall)
    flags, groups, left, right = [], [], [], []
    for flag, group, first, second in batch:
        flags.append(int(flag))
        groups.append(int(group))
        left.append(first)
        right.append(second)
    return flags, groups, np.array(left), np.array(right)
def gen_test_batch_yxx(_data, start, end):
    """Build a test batch for the dataset selected by ``config.dataset``.

    Dispatches to ``ins_gen_test_batch_yxx`` or ``qur_gen_test_batch_yxx``.
    For any other ``config.dataset`` value a message is printed and the
    process exits with status 1.
    """
    if config.dataset == config.dataset_ins:
        return ins_gen_test_batch_yxx(_data, start, end)
    if config.dataset == config.dataset_qur:
        return qur_gen_test_batch_yxx(_data, start, end)
    print('bad gen_test_batch_yxx')
    # sys.exit rather than the site-provided exit(), which is meant for the
    # interactive shell and is not defined when site is disabled.
    sys.exit(1)
def qur_gen_test_batch_yxx(_data, start, end):
    """Return rows ``start:end`` of ``(y, x1, x2)`` test triples, unpadded.

    Returns:
        ``(y, y, x1, x2)``: the label list twice (the same list object fills
        both the first and second slot), followed by numpy arrays of the
        second and third element of every row.
    """
    labels, left, right = [], [], []
    for label, first, second in _data[start:end]:
        labels.append(label)
        left.append(first)
        right.append(second)
    return labels, labels, np.array(left), np.array(right)
def ins_gen_test_batch_yxx(_data, start, end):
    """Return rows ``start:end`` of ``(f, g, q1, q2)`` test rows, padded.

    A slice shorter than ``end - start`` is topped up by repeating its last
    row; the slice is a copy, so ``_data`` itself is not modified.

    Returns:
        ``(f, g, q1, q2)``: two lists of ints (first and second field of each
        row) and two numpy arrays (third and fourth field of each row).
    """
    rows = _data[start:end]
    wanted = end - start
    while len(rows) < wanted:
        rows.append(rows[-1])
    f = [int(row_f) for row_f, _g, _a, _b in rows]
    g = [int(row_g) for _f, row_g, _a, _b in rows]
    q1 = np.array([a for _f, _g, a, _b in rows])
    q2 = np.array([b for _f, _g, _a, b in rows])
    return f, g, q1, q2
def _eval(y, g, yp):
    """Print the metrics that apply to the dataset in ``config.dataset``.

    AUC plus top-1 precision for ``config.dataset_ins``; AUC plus the best
    thresholded precision for ``config.dataset_qur``.  Nothing is returned.
    """
    selected = config.dataset
    if selected == config.dataset_ins:
        eval_auc(y, g, yp)
        eval_top1_prec(y, g, yp)
    if selected == config.dataset_qur:
        eval_auc(y, g, yp)
        eval_best_prec(y, g, yp)
def eval_best_prec(y, g, yp):
    """Find the threshold in 0.50..0.99 with the highest share of correct labels.

    For each threshold (step 0.01) a score ``>= threshold`` predicts 1, else 0;
    the share of predictions equal to ``y`` is computed over ``len(yp)``.  The
    first threshold reaching the best share wins.  ``g`` is not used.

    Returns:
        ``(best_share, best_threshold)``; ``(0.0, 0.0)`` if no threshold gets
        any row right.
    """
    best_p, best_s = 0.0, 0.0
    for percent in range(50, 100):
        threshold = float(percent) / 100
        hits = sum(
            1
            for label, score in zip(y, yp)
            if (1 if score >= threshold else 0) == label
        )
        share = hits / len(yp)
        if share > best_p:
            best_p, best_s = share, threshold
    print('best_prec: ' + str(best_p) + ' best_threshold:' + str(best_s))
    return best_p, best_s
def eval_auc(y, g, yp):
    """Compute, print and return ``metrics.roc_auc_score(y, yp)``.

    ``g`` is accepted for signature parity with the other eval helpers and is
    not used.
    """
    score = metrics.roc_auc_score(y, yp)
    print('auc: ' + str(score))
    return score
def eval_top1_prec(y, g, yp):
    """Compute, print and return top-1 precision over groups.

    Rows are bucketed by their ``g`` value.  A group counts as a hit when its
    highest-scoring row (ties resolved in favour of the earlier row) has
    label 1.

    Returns:
        Number of hit groups divided by the number of groups.
    """
    buckets = {}
    for row in zip(y, g, yp):
        buckets.setdefault(row[1], []).append(row)
    hits = 0
    for rows in buckets.values():
        ranked = sorted(rows, key=lambda r: r[2], reverse=True)
        if ranked[0][0] == 1:
            hits += 1
    n_groups = len(buckets)
    prec = hits / n_groups
    print('top1 precision ' + str(hits) + '/' + str(n_groups) + ': ' + str(hits / n_groups))
    return prec
gitextract_f17xp5p4/
├── README.md
├── cnn/
│   ├── tensorflow/
│   │   ├── README.md
│   │   ├── insqa_cnn.py
│   │   ├── insqa_cnn.py.old
│   │   ├── insqa_train.py
│   │   ├── insqa_train.py.old
│   │   ├── insurance_qa_data_helpers.py
│   │   └── test.py
│   └── theano/
│       ├── README.md
│       └── insqa_cnn.py
├── config.py
├── gen.py
├── lstm_cnn/
│   └── theano/
│       ├── README.md
│       └── insqa_lstm.py
├── rnn_attention/
│   └── tensorflow/
│       ├── insurance_qa_data_helpers.py
│       └── tf_rnn_char.py
├── swem/
│   ├── swem_hier.py
│   ├── swem_hier_margin.py
│   └── swem_max_margin.py
└── utils.py
SYMBOL INDEX (127 symbols across 12 files)
FILE: cnn/tensorflow/insqa_cnn.py
class InsQACNN (line 7) | class InsQACNN(object):
method __init__ (line 8) | def __init__(self, _margin, sequence_length, batch_size,
method conv (line 60) | def conv(self, tensor):
method cosine (line 83) | def cosine(self, v1, v2):
FILE: cnn/tensorflow/insqa_train.py
function train_step (line 116) | def train_step(q, qp, qn):
function test_step (line 131) | def test_step():
FILE: cnn/tensorflow/insurance_qa_data_helpers.py
function build_vocab (line 14) | def build_vocab():
function rand_qa (line 37) | def rand_qa(qalist):
function read_alist (line 41) | def read_alist():
function vocab_plus_overlap (line 49) | def vocab_plus_overlap(vectors, sent, over, size):
function load_vectors (line 74) | def load_vectors():
function read_vector (line 86) | def read_vector(vectors, word):
function load_test_and_vectors (line 94) | def load_test_and_vectors():
function load_train_and_vectors (line 101) | def load_train_and_vectors():
function load_data_val_10 (line 108) | def load_data_val_10(testList, vectors, index):
function read_raw (line 118) | def read_raw():
function encode_sent (line 126) | def encode_sent(vocab, string, size):
function load_data_6 (line 136) | def load_data_6(vocab, alist, raw, size):
function load_data_val_6 (line 148) | def load_data_val_6(testList, vocab, index, batch):
function load_data_9 (line 162) | def load_data_9(trainList, vectors, size):
function load_data_val_9 (line 179) | def load_data_val_9(testList, vectors, index):
function load_data_10 (line 187) | def load_data_10(vectors, qalist, raw, size):
function load_data_11 (line 198) | def load_data_11(vectors, qalist, raw, size):
function batch_iter (line 209) | def batch_iter(data, batch_size, num_epochs, shuffle=True):
FILE: cnn/theano/insqa_cnn.py
function build_vocab (line 23) | def build_vocab():
function load_vectors (line 39) | def load_vectors():
function load_word_embeddings (line 52) | def load_word_embeddings(vocab, dim):
function encode_sent (line 66) | def encode_sent(vocab, string, size):
function load_train_list (line 76) | def load_train_list():
function load_test_list (line 83) | def load_test_list():
function load_data (line 90) | def load_data(trainList, vocab, batch_size):
function load_data_val (line 100) | def load_data_val(testList, vocab, index, batch_size):
function validation (line 112) | def validation(validate_model, testList, vocab, batch_size):
class QACnn (line 140) | class QACnn(object):
method __init__ (line 141) | def __init__(self, input1, input2, input3, word_embeddings, batch_size...
method _dropout (line 231) | def _dropout(self, rng, layer, keep_prob):
function train (line 238) | def train():
FILE: gen.py
function load_vocab (line 7) | def load_vocab():
function ins_load_answers (line 18) | def ins_load_answers():
function ins_w2v (line 29) | def ins_w2v():
function ins_train (line 49) | def ins_train():
function ins_test (line 65) | def ins_test():
function ins_qa (line 83) | def ins_qa():
function qur_prepare (line 88) | def qur_prepare():
function qur_qa (line 124) | def qur_qa():
FILE: lstm_cnn/theano/insqa_lstm.py
function build_vocab (line 25) | def build_vocab():
function load_vectors (line 40) | def load_vectors():
function load_word_embeddings (line 52) | def load_word_embeddings(vocab, dim):
function encode_sent (line 66) | def encode_sent(vocab, string, size):
function load_train_list (line 80) | def load_train_list():
function load_test_list (line 88) | def load_test_list():
function load_data (line 94) | def load_data(trainList, vocab, batch_size):
function load_data_val (line 118) | def load_data_val(testList, vocab, index, batch_size):
function validation (line 136) | def validation(validate_model, testList, vocab, batch_size):
function ortho_weight (line 171) | def ortho_weight(ndim):
function numpy_floatX (line 176) | def numpy_floatX(data):
function param_init_cnn (line 179) | def param_init_cnn(filter_sizes, num_filters, proj_size, tparams, grad_p...
function param_init_lstm (line 200) | def param_init_lstm(proj_size, tparams, grad_params):
function dropout_layer (line 220) | def dropout_layer(state_before, use_noise, trng):
class LSTM (line 229) | class LSTM(object):
method __init__ (line 230) | def __init__(self, input1, input2, input3, mask1, mask2, mask3, word_e...
method _cnn_net (line 271) | def _cnn_net(self, tparams, cnn_input, batch_size, sequence_len, num_f...
method _lstm_net (line 286) | def _lstm_net(self, tparams, _input, sequence_len, batch_size, embeddi...
function lstm_layer (line 298) | def lstm_layer(tparams, state_below, proj_size, prefix='lstm', mask=None):
function _p (line 355) | def _p(pp, name):
function train (line 358) | def train():
FILE: rnn_attention/tensorflow/insurance_qa_data_helpers.py
function build_vocab (line 17) | def build_vocab():
function read_alist (line 39) | def read_alist():
function load_vectors (line 47) | def load_vectors():
function read_vector (line 59) | def read_vector(vectors, word):
function load_train_list (line 67) | def load_train_list():
function load_test_list (line 75) | def load_test_list():
function load_train_and_vectors (line 81) | def load_train_and_vectors():
function read_raw (line 88) | def read_raw():
function encode_sent (line 96) | def encode_sent(vocab, string, size):
function load_val_data (line 106) | def load_val_data(test_list, vocab, index, batch_size, max_len):
function load_train_data (line 124) | def load_train_data(trainList, vocab, batch_size, max_len):
function evaluation (line 148) | def evaluation(score_list, test_list):
FILE: rnn_attention/tensorflow/tf_rnn_char.py
class RNN_Model (line 11) | class RNN_Model(object):
method _rnn_net (line 12) | def _rnn_net(self, inputs, mask, embedding, keep_prob, batch_size, emb...
method _max_pooling (line 37) | def _max_pooling(self, lstm):
method __init__ (line 44) | def __init__(self, config, is_training=True):
function train_step (line 101) | def train_step(model, qlist, plist, nlist, mask_q, mask_p, mask_n):
function dev_step (line 117) | def dev_step(model, vocab, batch_size, max_len):
class Config (line 159) | class Config(object):
FILE: swem/swem_hier.py
class SWEM_HIER (line 15) | class SWEM_HIER(object):
method __init__ (line 16) | def __init__(self,
method logloss (line 53) | def logloss(self, y, v_one, sim):
method cosine (line 61) | def cosine(self, t1, t2):
function get_constant (line 68) | def get_constant(batch_size):
function train_step (line 104) | def train_step():
function test_step (line 113) | def test_step():
FILE: swem/swem_hier_margin.py
class SWEM_HIER (line 9) | class SWEM_HIER(object):
method __init__ (line 10) | def __init__(self,
method margin_loss (line 70) | def margin_loss(self, zero, margin, cos_q_qp, cos_q_qn):
method logloss (line 75) | def logloss(self, y, v_one, sim):
method cosine (line 83) | def cosine(self, t1, t2):
function get_constant (line 90) | def get_constant(batch_size):
function train_step (line 127) | def train_step():
function test_step (line 136) | def test_step():
FILE: swem/swem_max_margin.py
class SWEM_HIER (line 8) | class SWEM_HIER(object):
method __init__ (line 9) | def __init__(self,
method margin_loss (line 47) | def margin_loss(self, zero, margin, cos_q_qp, cos_q_qn):
method logloss (line 52) | def logloss(self, y, v_one, sim):
method cosine (line 60) | def cosine(self, t1, t2):
function get_constant (line 67) | def get_constant(batch_size):
function train_step (line 104) | def train_step():
function test_step (line 113) | def test_step():
FILE: utils.py
function load_embeddings (line 7) | def load_embeddings():
function encode_sent (line 24) | def encode_sent(s, vocab, max_len):
function load_train_data (line 33) | def load_train_data(vocab, max_len):
function qur_load_train_test_data (line 41) | def qur_load_train_test_data(_file, vocab, max_len):
function ins_load_train_data (line 49) | def ins_load_train_data(vocab, max_len):
function load_test_data (line 57) | def load_test_data(vocab, max_len):
function ins_load_test_data (line 65) | def ins_load_test_data(vocab, max_len):
function gen_train_batch_qpn (line 73) | def gen_train_batch_qpn(_data, batch_size):
function gen_train_batch_yxx (line 81) | def gen_train_batch_yxx(_data, batch_size):
function qur_gen_train_batch_yxx (line 89) | def qur_gen_train_batch_yxx(_data, batch_size):
function ins_gen_train_batch_yxx (line 96) | def ins_gen_train_batch_yxx(_data, batch_size):
function gen_test_batch_qpn (line 105) | def gen_test_batch_qpn(_data, start, end):
function gen_test_batch_yxx (line 115) | def gen_test_batch_yxx(_data, start, end):
function qur_gen_test_batch_yxx (line 123) | def qur_gen_test_batch_yxx(_data, start, end):
function ins_gen_test_batch_yxx (line 130) | def ins_gen_test_batch_yxx(_data, start, end):
function _eval (line 140) | def _eval(y, g, yp):
function eval_best_prec (line 148) | def eval_best_prec(y, g, yp):
function eval_auc (line 163) | def eval_auc(y, g, yp):
function eval_top1_prec (line 168) | def eval_top1_prec(y, g, yp):
Condensed preview — 20 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (119K chars).
[
{
"path": "README.md",
"chars": 1840,
"preview": "Insurance-QA deeplearning model\n======\nThis is a repo for Q&A Mathing, includes some deep learning models, such as CNN、R"
},
{
"path": "cnn/tensorflow/README.md",
"chars": 733,
"preview": "\n================result==================\n\n结果和theano版本的差不多,具体数值忘了\n\n虽然代码里写了dropout,但是实际并没有使用,dropout对结果影响不是特别大,不用dropout的"
},
{
"path": "cnn/tensorflow/insqa_cnn.py",
"chars": 4179,
"preview": "import tensorflow as tf\nimport numpy as np\n\n##########################################################################\n#"
},
{
"path": "cnn/tensorflow/insqa_cnn.py.old",
"chars": 6350,
"preview": "import tensorflow as tf\nimport numpy as np\n\n##########################################################################\n#"
},
{
"path": "cnn/tensorflow/insqa_train.py",
"chars": 7542,
"preview": "#! /usr/bin/env python3.4\n\nimport tensorflow as tf\nimport numpy as np\nimport os, time, datetime, operator, sys\nfrom insq"
},
{
"path": "cnn/tensorflow/insqa_train.py.old",
"chars": 8381,
"preview": "#! /usr/bin/env python3.4\n\nimport tensorflow as tf\nimport numpy as np\nimport os\nimport time\nimport datetime\nimport insur"
},
{
"path": "cnn/tensorflow/insurance_qa_data_helpers.py",
"chars": 7535,
"preview": "import numpy as np\nimport random\n\nempty_vector = []\nfor i in range(0, 100):\n empty_vector.append(float(0.0))\nonevecto"
},
{
"path": "cnn/tensorflow/test.py",
"chars": 165,
"preview": "import random\n\n_list = [i for i in range(0, 10)]\n_l1 = random.sample(_list, 2)\n_l2 = random.sample(_list, 2)\nprint(_l1)\n"
},
{
"path": "cnn/theano/README.md",
"chars": 855,
"preview": "\n================result==================\ntheano and tensorflow cnn code for insuranceQA\n\ntheano code, test1 top-1 preci"
},
{
"path": "cnn/theano/insqa_cnn.py",
"chars": 11763,
"preview": "\n###########################################################\n# test1 top-1 precision: 62%\n##############################"
},
{
"path": "config.py",
"chars": 1536,
"preview": "import os\n\ndataset_ins = 'insurance-qa'\ndataset_qur = 'quora-qa'\n\n######################################################"
},
{
"path": "gen.py",
"chars": 4780,
"preview": "import config, os, random\n\n#####################################################################\n# function: load vocab\n"
},
{
"path": "lstm_cnn/theano/README.md",
"chars": 701,
"preview": "\ntheano lstm+cnn code for insuranceQA\n\n================result==================\n\ntheano code, test1 top-1 precision : 68"
},
{
"path": "lstm_cnn/theano/insqa_lstm.py",
"chars": 17224,
"preview": "\n############################################################\n# if batch_size is 1, there must be a dtype error when doi"
},
{
"path": "rnn_attention/tensorflow/insurance_qa_data_helpers.py",
"chars": 5618,
"preview": "import numpy as np\nimport random\nfrom operator import itemgetter\n\nprecision = '/export/jw/cnn/insuranceQA/acc.lstm'\n\nemp"
},
{
"path": "rnn_attention/tensorflow/tf_rnn_char.py",
"chars": 10144,
"preview": "# -*- coding: utf-8 -*-\n\n####################################################################################\n#test1 top"
},
{
"path": "swem/swem_hier.py",
"chars": 5669,
"preview": "import numpy as np\nimport tensorflow as tf\nimport time, os, random, datetime, sys\nfrom sklearn import metrics\nsys.path.a"
},
{
"path": "swem/swem_hier_margin.py",
"chars": 7025,
"preview": "import numpy as np\nimport tensorflow as tf\nimport time, os, random, datetime, sys\nfrom sklearn import metrics\nsys.path.a"
},
{
"path": "swem/swem_max_margin.py",
"chars": 6004,
"preview": "import numpy as np\nimport tensorflow as tf\nimport time, os, random, datetime, sys\nfrom sklearn import metrics\nsys.path.a"
},
{
"path": "utils.py",
"chars": 5880,
"preview": "import numpy as np\nimport random, sys, config\nfrom sklearn import metrics\nfrom operator import itemgetter\nfrom itertools"
}
]
About this extraction
This page contains the full source code of the white127/QA-deep-learning GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 20 files (111.3 KB), approximately 31.8k tokens, and a symbol index with 127 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.