Repository: white127/QA-deep-learning Branch: master Commit: 94971ec4b305 Files: 20 Total size: 111.3 KB Directory structure: gitextract_f17xp5p4/ ├── README.md ├── cnn/ │ ├── tensorflow/ │ │ ├── README.md │ │ ├── insqa_cnn.py │ │ ├── insqa_cnn.py.old │ │ ├── insqa_train.py │ │ ├── insqa_train.py.old │ │ ├── insurance_qa_data_helpers.py │ │ └── test.py │ └── theano/ │ ├── README.md │ └── insqa_cnn.py ├── config.py ├── gen.py ├── lstm_cnn/ │ └── theano/ │ ├── README.md │ └── insqa_lstm.py ├── rnn_attention/ │ └── tensorflow/ │ ├── insurance_qa_data_helpers.py │ └── tf_rnn_char.py ├── swem/ │ ├── swem_hier.py │ ├── swem_hier_margin.py │ └── swem_max_margin.py └── utils.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: README.md ================================================ Insurance-QA deeplearning model ====== This is a repo for Q&A Matching; it includes some deep learning models, such as CNN and RNN.
1. CNN. Basic CNN model from 《Applying Deep Learning To Answer Selection: A Study And An Open Task》
2. RNN. RNN seems to be the best model on the Insurance-QA dataset.
3. SWEM. SWEM is the fastest and works well on other datasets, such as WikiQA and others, but it does not seem to perform as well on the Insurance-QA dataset. I think SWEM is more suitable for Q&Q matching than for Q&A matching.
It's hard to say which model is the best on other datasets; you have to choose the model that is most suitable for your own data.

More models are on the way, pay attention to the updates.
## Requirements 1. tensorflow 1.4.0
2. python3.5
## Performance margin loss version
Model/Score | Ins_qa_top1_precision | quora_best_prec ------------ | ------------- | ------------- CNN | 62% | None LSTM+CNN | 68% | None SWEM | <55% | None logloss version
Model/Score | Insqa_top1_precision | quora_best_prec ------------ | ------------- | ------------- CNN | None | 79.60% LSTM+CNN | None | None SWEM | <40% | 82.69% ## Running Change the configuration to match your own environment, such as the data paths
vim config.py Data processing
python3 gen.py Run CNN model
cd ./cnn/tensorflow && python3 insqa_train.py It will take a few hours (thousands of epochs) to train this model on a single GPU.
## Downloads 1. You can get Insurance-QA data from here https://github.com/shuzi/insuranceQA
2. You can get Quora data from here http://qim.ec.quoracdn.net/quora_duplicate_questions.tsv
## Links 1. CNN and RNN textual classification repo https://github.com/white127/TextClassification_CNN_RNN
2. 《Applying Deep Learning To Answer Selection: A Study And An Open Task》
================================================ FILE: cnn/tensorflow/README.md ================================================ ================result================== 结果和theano版本的差不多,具体数值忘了 虽然代码里写了dropout,但是实际并没有使用,dropout对结果影响不是特别大,不用dropout的话训练速度要快一些。 ================dataset================ 数据格式和theano版本的是一样的 github上给出的是样本数据,如果需要全量的,也可直接联系我 dataset is large, only test1 sample is given (see ./insuranceQA/test1.sample) I converted original idx_xx format to real-word format (see ./insuranceQA/train ./insuranceQA/test1.sample) you can get the original dataset from https://github.com/shuzi/insuranceQA word embedding is trained by word2vec toolkit =================run===================== ./insqa_train.py 我使用的是python3.4,部分代码可能会和python2不兼容,如使用python2需要自己做一些小修改,核心的CNN代码应该 不用改动的 代码里的数据路径(类似'/export/...')是需要根据自己的环境修改的,指向自己的数据路径即可。核心的CNN代码无需改动 ================================================ FILE: cnn/tensorflow/insqa_cnn.py ================================================ import tensorflow as tf import numpy as np ########################################################################## # embedding_lookup + cnn + cosine margine , batch ########################################################################## class InsQACNN(object): def __init__(self, _margin, sequence_length, batch_size, vocab_size, embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0): self.L, self.B, self.V, self.E, self.FS, self.NF = sequence_length, batch_size, \ vocab_size, embedding_size, filter_sizes, num_filters #用户问题,字向量使用embedding_lookup self.q = tf.placeholder(tf.int32, [self.B, self.L], name="q") #待匹配正向问题 self.qp = tf.placeholder(tf.int32, [self.B, self.L], name="qp") #负向问题 self.qn = tf.placeholder(tf.int32, [self.B, self.L], name="qn") self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") l2_loss = tf.constant(0.0) # Embedding layer with tf.device('/cpu:0'), tf.name_scope("embedding"): W = tf.get_variable( initializer=tf.random_uniform([self.V, self.E], 
-1.0, 1.0), name='We') self.qe = tf.nn.embedding_lookup(W, self.q) self.qpe = tf.nn.embedding_lookup(W, self.qp) self.qne = tf.nn.embedding_lookup(W, self.qn) self.qe = tf.expand_dims(self.qe, -1) self.qpe = tf.expand_dims(self.qpe, -1) self.qne = tf.expand_dims(self.qne, -1) with tf.variable_scope('shared-conv') as scope: self.qe = self.conv(self.qe) scope.reuse_variables() #tf.get_variable_scope().reuse_variables() self.qpe = self.conv(self.qpe) scope.reuse_variables() #tf.get_variable_scope().reuse_variables() self.qne = self.conv(self.qne) self.cos_q_qp = self.cosine(self.qe, self.qpe) self.cos_q_qn = self.cosine(self.qe, self.qne) zero = tf.constant(0, shape=[self.B], dtype=tf.float32) margin = tf.constant(_margin, shape=[self.B], dtype=tf.float32) with tf.name_scope("loss"): self.losses = tf.maximum(zero, tf.subtract(margin, tf.subtract(self.cos_q_qp, self.cos_q_qn))) self.loss = tf.reduce_sum(self.losses) + l2_reg_lambda * l2_loss print('loss ', self.loss) # Accuracy with tf.name_scope("accuracy"): self.correct = tf.equal(zero, self.losses) self.accuracy = tf.reduce_mean(tf.cast(self.correct, "float"), name="accuracy") for v in tf.trainable_variables(): print(v) def conv(self, tensor): pooled = [] #with tf.variable_scope(name_or_scope='my-conv', reuse=tf.AUTO_REUSE): with tf.variable_scope("my-conv-shared"): for i, fs in enumerate(self.FS): filter_shape = [fs, self.E, 1, self.NF] W = tf.get_variable(initializer=tf.truncated_normal(filter_shape, stddev=0.1), name="W-%s" % str(fs)) b = tf.get_variable(initializer=tf.constant(0.1, shape=[self.NF]), name="b-%s" % str(fs)) conv = tf.nn.conv2d( tensor, W, strides=[1, 1, 1, 1], padding='VALID', name="conv") h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu") output = tf.nn.max_pool( h, ksize=[1, self.L - fs + 1, 1, 1], strides=[1, 1, 1, 1], padding='VALID', name="pool") pooled.append(output) num_filters_total = self.NF * len(self.FS) pooled = tf.reshape(tf.concat(pooled, 3), [-1, num_filters_total]) pooled = 
tf.nn.dropout(pooled, self.dropout_keep_prob) return pooled def cosine(self, v1, v2): l1 = tf.sqrt(tf.reduce_sum(tf.multiply(v1, v1), 1)) l2 = tf.sqrt(tf.reduce_sum(tf.multiply(v2, v2), 1)) a = tf.reduce_sum(tf.multiply(v1, v2), 1) cos = tf.div(a, tf.multiply(l1, l2), name='score') return tf.clip_by_value(cos, 1e-5, 0.99999) ================================================ FILE: cnn/tensorflow/insqa_cnn.py.old ================================================ import tensorflow as tf import numpy as np ########################################################################## # embedding_lookup + cnn + cosine margine , batch ########################################################################## class InsQACNN1(object): def __init__( self, sequence_length, batch_size, vocab_size, embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0): #用户问题,字向量使用embedding_lookup self.input_x_1 = tf.placeholder(tf.int32, [batch_size, sequence_length], name="input_x_1") #待匹配正向问题 self.input_x_2 = tf.placeholder(tf.int32, [batch_size, sequence_length], name="input_x_2") #负向问题 self.input_x_3 = tf.placeholder(tf.int32, [batch_size, sequence_length], name="input_x_3") self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") l2_loss = tf.constant(0.0) print("input_x_1 ", self.input_x_1) # Embedding layer with tf.device('/cpu:0'), tf.name_scope("embedding"): W = tf.Variable( tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name="W") chars_1 = tf.nn.embedding_lookup(W, self.input_x_1) chars_2 = tf.nn.embedding_lookup(W, self.input_x_2) chars_3 = tf.nn.embedding_lookup(W, self.input_x_3) #self.embedded_chars_1 = tf.nn.dropout(chars_1, self.dropout_keep_prob) #self.embedded_chars_2 = tf.nn.dropout(chars_2, self.dropout_keep_prob) #self.embedded_chars_3 = tf.nn.dropout(chars_3, self.dropout_keep_prob) self.embedded_chars_1 = chars_1 self.embedded_chars_2 = chars_2 self.embedded_chars_3 = chars_3 self.embedded_chars_expanded_1 = 
tf.expand_dims(self.embedded_chars_1, -1) self.embedded_chars_expanded_2 = tf.expand_dims(self.embedded_chars_2, -1) self.embedded_chars_expanded_3 = tf.expand_dims(self.embedded_chars_3, -1) pooled_outputs_1 = [] pooled_outputs_2 = [] pooled_outputs_3 = [] for i, filter_size in enumerate(filter_sizes): with tf.name_scope("conv-maxpool-%s" % filter_size): filter_shape = [filter_size, embedding_size, 1, num_filters] W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W") b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b") conv = tf.nn.conv2d( self.embedded_chars_expanded_1, W, strides=[1, 1, 1, 1], padding='VALID', name="conv-1" ) h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu-1") pooled = tf.nn.max_pool( h, ksize=[1, sequence_length - filter_size + 1, 1, 1], strides=[1, 1, 1, 1], padding='VALID', name="poll-1" ) pooled_outputs_1.append(pooled) conv = tf.nn.conv2d( self.embedded_chars_expanded_2, W, strides=[1, 1, 1, 1], padding='VALID', name="conv-2" ) h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu-2") pooled = tf.nn.max_pool( h, ksize=[1, sequence_length - filter_size + 1, 1, 1], strides=[1, 1, 1, 1], padding='VALID', name="poll-2" ) pooled_outputs_2.append(pooled) conv = tf.nn.conv2d( self.embedded_chars_expanded_3, W, strides=[1, 1, 1, 1], padding='VALID', name="conv-3" ) h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu-3") pooled = tf.nn.max_pool( h, ksize=[1, sequence_length - filter_size + 1, 1, 1], strides=[1, 1, 1, 1], padding='VALID', name="poll-3" ) pooled_outputs_3.append(pooled) num_filters_total = num_filters * len(filter_sizes) pooled_reshape_1 = tf.reshape(tf.concat(pooled_outputs_1, 3), [-1, num_filters_total]) pooled_reshape_2 = tf.reshape(tf.concat(pooled_outputs_2, 3), [-1, num_filters_total]) pooled_reshape_3 = tf.reshape(tf.concat(pooled_outputs_3, 3), [-1, num_filters_total]) #dropout pooled_flat_1 = tf.nn.dropout(pooled_reshape_1, self.dropout_keep_prob) pooled_flat_2 = tf.nn.dropout(pooled_reshape_2, 
self.dropout_keep_prob) pooled_flat_3 = tf.nn.dropout(pooled_reshape_3, self.dropout_keep_prob) pooled_len_1 = tf.sqrt(tf.reduce_sum(tf.multiply(pooled_flat_1, pooled_flat_1), 1)) #计算向量长度Batch模式 pooled_len_2 = tf.sqrt(tf.reduce_sum(tf.multiply(pooled_flat_2, pooled_flat_2), 1)) pooled_len_3 = tf.sqrt(tf.reduce_sum(tf.multiply(pooled_flat_3, pooled_flat_3), 1)) pooled_mul_12 = tf.reduce_sum(tf.multiply(pooled_flat_1, pooled_flat_2), 1) #计算向量的点乘Batch模式 pooled_mul_13 = tf.reduce_sum(tf.multiply(pooled_flat_1, pooled_flat_3), 1) with tf.name_scope("output"): self.cos_12 = tf.div(pooled_mul_12, tf.multiply(pooled_len_1, pooled_len_2), name="scores") #计算向量夹角Batch模式 self.cos_13 = tf.div(pooled_mul_13, tf.multiply(pooled_len_1, pooled_len_3)) zero = tf.constant(0, shape=[batch_size], dtype=tf.float32) margin = tf.constant(0.05, shape=[batch_size], dtype=tf.float32) with tf.name_scope("loss"): self.losses = tf.maximum(zero, tf.subtract(margin, tf.subtract(self.cos_12, self.cos_13))) self.loss = tf.reduce_sum(self.losses) + l2_reg_lambda * l2_loss print('loss ', self.loss) # Accuracy with tf.name_scope("accuracy"): self.correct = tf.equal(zero, self.losses) self.accuracy = tf.reduce_mean(tf.cast(self.correct, "float"), name="accuracy") for v in tf.trainable_variables(): print(v) exit(1) ================================================ FILE: cnn/tensorflow/insqa_train.py ================================================ #! 
/usr/bin/env python3.4 import tensorflow as tf import numpy as np import os, time, datetime, operator, sys from insqa_cnn import InsQACNN sys.path.append('../../') import config, utils print(tf.__version__) # Parameters # ================================================== # Model Hyperparameters tf.flags.DEFINE_float("margin", 0.05, "CNN model margin") tf.flags.DEFINE_integer("sequence_length", 200, "Max sequence lehgth(default: 200)") tf.flags.DEFINE_integer("embedding_dim", 100, "Dimensionality of character embedding (default: 128)") tf.flags.DEFINE_string("filter_sizes", "1,2,3,5", "Comma-separated filter sizes (default: '3,4,5')") tf.flags.DEFINE_integer("num_filters", 256, "Number of filters per filter size (default: 128)") tf.flags.DEFINE_float("dropout_keep_prob", 1.0, "Dropout keep probability (default: 0.5)") tf.flags.DEFINE_float("l2_reg_lambda", 0, "L2 regularizaion lambda (default: 0.0)") # Training parameters tf.flags.DEFINE_integer("batch_size", 256, "Batch Size (default: 64)") tf.flags.DEFINE_integer("num_epochs", 5000000, "Number of training epochs (default: 200)") tf.flags.DEFINE_integer("evaluate_every", 3000, "Evaluate model on dev set after this many steps (default: 100)") tf.flags.DEFINE_integer("checkpoint_every", 3000, "Save model after this many steps (default: 100)") # Misc Parameters tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") FLAGS = tf.flags.FLAGS FLAGS._parse_flags() print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") # Data Preparatopn # ================================================== # Load data print("Loading data...") vocab, embeddings = utils.load_embeddings() train_data = utils.load_train_data(vocab, FLAGS.sequence_length) test_data = utils.load_test_data(vocab, FLAGS.sequence_length) print("Load done...") # Training # 
================================================== prev_auc = 0 with tf.Graph().as_default(): with tf.device("/gpu:1"): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.Session(config=session_conf) with sess.as_default(): cnn = InsQACNN( _margin=FLAGS.margin, sequence_length=FLAGS.sequence_length, batch_size=FLAGS.batch_size, vocab_size=len(vocab), embedding_size=FLAGS.embedding_dim, filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))), num_filters=FLAGS.num_filters, l2_reg_lambda=FLAGS.l2_reg_lambda) # Define Training procedure global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = tf.train.AdamOptimizer(1e-1) #optimizer = tf.train.GradientDescentOptimizer(1e-2) grads_and_vars = optimizer.compute_gradients(cnn.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) # Keep track of gradient values and sparsity (optional) grad_summaries = [] for g, v in grads_and_vars: if g is not None: grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g) sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) grad_summaries.append(grad_hist_summary) grad_summaries.append(sparsity_summary) grad_summaries_merged = tf.summary.merge(grad_summaries) # Output directory for models and summaries timestamp = str(int(time.time())) out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) print("Writing to {}\n".format(out_dir)) # Summaries for loss and accuracy loss_summary = tf.summary.scalar("loss", cnn.loss) acc_summary = tf.summary.scalar("accuracy", cnn.accuracy) # Train Summaries train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged]) train_summary_dir = os.path.join(out_dir, "summaries", "train") train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph_def) # Dev summaries dev_summary_op = 
tf.summary.merge([loss_summary, acc_summary]) dev_summary_dir = os.path.join(out_dir, "summaries", "dev") dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph_def) # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.all_variables()) # Initialize all variables sess.run(tf.initialize_all_variables()) def train_step(q, qp, qn): feed_dict = { cnn.q: q, cnn.qp: qp, cnn.qn: qn, #cnn.input_x_1: q, cnn.input_x_2: qp, cnn.input_x_3: qn, cnn.dropout_keep_prob: FLAGS.dropout_keep_prob } _, step, summaries, loss, accuracy, cos1, cos2 = sess.run( [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy, cnn.cos_q_qp, cnn.cos_q_qn], feed_dict) #print(cos1) #print(cos2) time_str = datetime.datetime.now().isoformat() print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) train_summary_writer.add_summary(summaries, step) def test_step(): yp, y, group, of = [], [], [], open(config.predict1_file, 'w') for i in range(0, len(test_data), FLAGS.batch_size): f, g, q1, q2 = utils.gen_test_batch_qpn(test_data, i, i+FLAGS.batch_size) feed_dict = { cnn.q: q1, cnn.qp: q2, cnn.qn: q2, #cnn.input_x_1: q1, cnn.input_x_2: q2, cnn.input_x_3: q2, cnn.dropout_keep_prob: 1.0 } cos = sess.run([cnn.cos_q_qp], feed_dict) yp.extend(cos[0]) y.extend(f) group.extend(g) y, g, yp = y[:len(test_data)], group[:len(test_data)], yp[:len(test_data)] auc = utils.eval_auc(y[:len(test_data)], g, yp[:len(test_data)]) top1_prec = utils._eval_top1_prec(y, g, yp) for p in yp[:len(test_data)]: of.write(str(p) + '\n') of.write(str(top1_prec) + '\n') of.close() return auc # Generate batches # Training loop. For each batch... 
for i in range(FLAGS.num_epochs): try: q, qp, qn = utils.gen_train_batch_qpn(train_data, FLAGS.batch_size) train_step(q, qp, qn) current_step = tf.train.global_step(sess, global_step) if current_step % FLAGS.evaluate_every == 0: auc = test_step() #if auc < prev_auc: break prev_auc = auc if current_step % FLAGS.checkpoint_every == 0: path = saver.save(sess, checkpoint_prefix, global_step=current_step) print("Saved model checkpoint to {}\n".format(path)) except Exception as e: print(e) ================================================ FILE: cnn/tensorflow/insqa_train.py.old ================================================ #! /usr/bin/env python3.4 import tensorflow as tf import numpy as np import os import time import datetime import insurance_qa_data_helpers from insqa_cnn import InsQACNN1 import operator #print tf.__version__ # Parameters # ================================================== # Model Hyperparameters tf.flags.DEFINE_integer("embedding_dim", 100, "Dimensionality of character embedding (default: 128)") tf.flags.DEFINE_string("filter_sizes", "1,2,3,5", "Comma-separated filter sizes (default: '3,4,5')") tf.flags.DEFINE_integer("num_filters", 256, "Number of filters per filter size (default: 128)") tf.flags.DEFINE_float("dropout_keep_prob", 1.0, "Dropout keep probability (default: 0.5)") tf.flags.DEFINE_float("l2_reg_lambda", 0, "L2 regularizaion lambda (default: 0.0)") # Training parameters tf.flags.DEFINE_integer("batch_size", 100, "Batch Size (default: 64)") tf.flags.DEFINE_integer("num_epochs", 5000000, "Number of training epochs (default: 200)") tf.flags.DEFINE_integer("evaluate_every", 5000, "Evaluate model on dev set after this many steps (default: 100)") tf.flags.DEFINE_integer("checkpoint_every", 5000, "Save model after this many steps (default: 100)") # Misc Parameters tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 
FLAGS = tf.flags.FLAGS FLAGS._parse_flags() print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") # Data Preparatopn # ================================================== # Load data print("Loading data...") vocab = insurance_qa_data_helpers.build_vocab() alist = insurance_qa_data_helpers.read_alist() raw = insurance_qa_data_helpers.read_raw() x_train_1, x_train_2, x_train_3 = insurance_qa_data_helpers.load_data_6(vocab, alist, raw, FLAGS.batch_size) testList, vectors = insurance_qa_data_helpers.load_test_and_vectors() vectors = '' print('x_train_1', np.shape(x_train_1)) print("Load done...") val_file = '/export/jw/cnn/insuranceQA/test1' precision = '/export/jw/cnn/insuranceQA/test1.acc' #x_val, y_val = data_deepqa.load_data_val() # Training # ================================================== with tf.Graph().as_default(): with tf.device("/gpu:1"): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.Session(config=session_conf) with sess.as_default(): cnn = InsQACNN1( sequence_length=x_train_1.shape[1], batch_size=FLAGS.batch_size, vocab_size=len(vocab), embedding_size=FLAGS.embedding_dim, filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))), num_filters=FLAGS.num_filters, l2_reg_lambda=FLAGS.l2_reg_lambda) # Define Training procedure global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = tf.train.AdamOptimizer(1e-1) #optimizer = tf.train.GradientDescentOptimizer(1e-2) grads_and_vars = optimizer.compute_gradients(cnn.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) # Keep track of gradient values and sparsity (optional) grad_summaries = [] for g, v in grads_and_vars: if g is not None: grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g) sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), 
tf.nn.zero_fraction(g)) grad_summaries.append(grad_hist_summary) grad_summaries.append(sparsity_summary) grad_summaries_merged = tf.summary.merge(grad_summaries) # Output directory for models and summaries timestamp = str(int(time.time())) out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) print("Writing to {}\n".format(out_dir)) # Summaries for loss and accuracy loss_summary = tf.summary.scalar("loss", cnn.loss) acc_summary = tf.summary.scalar("accuracy", cnn.accuracy) # Train Summaries train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged]) train_summary_dir = os.path.join(out_dir, "summaries", "train") train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph_def) # Dev summaries dev_summary_op = tf.summary.merge([loss_summary, acc_summary]) dev_summary_dir = os.path.join(out_dir, "summaries", "dev") dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph_def) # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.all_variables()) # Initialize all variables sess.run(tf.initialize_all_variables()) def train_step(x_batch_1, x_batch_2, x_batch_3): """ A single training step """ feed_dict = { cnn.input_x_1: x_batch_1, cnn.input_x_2: x_batch_2, cnn.input_x_3: x_batch_3, cnn.dropout_keep_prob: FLAGS.dropout_keep_prob } _, step, summaries, loss, accuracy = sess.run( [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy], feed_dict) time_str = datetime.datetime.now().isoformat() print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) train_summary_writer.add_summary(summaries, step) def dev_step(): scoreList = [] i = int(0) while True: x_test_1, x_test_2, x_test_3 = 
insurance_qa_data_helpers.load_data_val_6(testList, vocab, i, FLAGS.batch_size) feed_dict = { cnn.input_x_1: x_test_1, cnn.input_x_2: x_test_2, cnn.input_x_3: x_test_3, cnn.dropout_keep_prob: 1.0 } batch_scores = sess.run([cnn.cos_12], feed_dict) for score in batch_scores[0]: scoreList.append(score) i += FLAGS.batch_size if i >= len(testList): break sessdict = {} index = int(0) for line in open(val_file): items = line.strip().split(' ') qid = items[1].split(':')[1] if not qid in sessdict: sessdict[qid] = [] sessdict[qid].append((scoreList[index], items[0])) index += 1 if index >= len(testList): break lev1 = float(0) lev0 = float(0) of = open(precision, 'a') for k, v in sessdict.items(): v.sort(key=operator.itemgetter(0), reverse=True) score, flag = v[0] if flag == '1': lev1 += 1 if flag == '0': lev0 += 1 of.write('lev1:' + str(lev1) + '\n') of.write('lev0:' + str(lev0) + '\n') print('lev1 ' + str(lev1)) print('lev0 ' + str(lev0)) of.close() # Generate batches # Training loop. For each batch... 
def build_vocab(paths=('/export/jw/cnn/insuranceQA/train',
                       '/export/jw/cnn/insuranceQA/test1')):
    """Build the word -> integer code vocabulary from the corpus files.

    Every line is split on single spaces; fields 2 and 3 (presumably the
    question and the answer -- confirm against the data format) are then
    split on '_' into words.  Codes are handed out in first-seen order,
    starting at 1; code 0 is reserved for the 'UNKNOWN' token.

    Args:
        paths: Corpus files to scan, in order.  Defaults to the train and
            test1 files that were previously hard-coded here, so existing
            ``build_vocab()`` calls behave exactly as before.

    Returns:
        dict mapping each word to its integer code.

    Raises:
        IOError / OSError: if a file cannot be opened.
        IndexError: if a line has fewer than four space-separated fields.
    """
    vocab = {'UNKNOWN': 0}
    for path in paths:
        # `with` guarantees the handle is closed; the original leaked it.
        with open(path) as corpus:
            for line in corpus:
                items = line.strip().split(' ')
                for field in (items[2], items[3]):
                    for word in field.split('_'):
                        # Codes are consecutive from 0, so the next free
                        # code is always the current vocabulary size.
                        if word not in vocab:
                            vocab[word] = len(vocab)
    return vocab
def encode_sent(vocab, string, size):
    """Encode an underscore-separated sentence as a list of 200 word codes.

    Words missing from ``vocab`` are mapped to ``vocab['UNKNOWN']``.
    Sentences longer than 200 tokens are truncated.  Sentences shorter than
    200 tokens are padded with the UNKNOWN code; the original indexed
    ``words[i]`` unconditionally and raised IndexError on short input.

    NOTE(review): the output length is fixed at 200 and ``size`` is not
    consulted, exactly as in the original implementation.  Callers in this
    file pass 100 but rely on the 200-wide result, so honouring ``size``
    would change the tensors fed to the model -- confirm before changing.
    NOTE(review): padding with UNKNOWN is an assumption; if the corpus uses
    a dedicated padding token, substitute it here.

    Args:
        vocab: dict mapping word -> integer code; must contain 'UNKNOWN'
            whenever an out-of-vocabulary word or padding is needed.
        string: sentence with words joined by '_'.
        size: accepted for interface compatibility; currently unused.

    Returns:
        list of 200 integer codes.
    """
    seq_len = 200
    words = string.split('_')[:seq_len]
    codes = [vocab[word] if word in vocab else vocab['UNKNOWN']
             for word in words]
    if len(codes) < seq_len:
        codes.extend([vocab['UNKNOWN']] * (seq_len - len(codes)))
    return codes
x_train_3.append(encode_sent(vocab, nega, 100)) return np.array(x_train_1), np.array(x_train_2), np.array(x_train_3) def load_data_val_6(testList, vocab, index, batch): x_train_1 = [] x_train_2 = [] x_train_3 = [] for i in range(0, batch): true_index = index + i if (true_index >= len(testList)): true_index = len(testList) - 1 items = testList[true_index].split(' ') x_train_1.append(encode_sent(vocab, items[2], 100)) x_train_2.append(encode_sent(vocab, items[3], 100)) x_train_3.append(encode_sent(vocab, items[3], 100)) return np.array(x_train_1), np.array(x_train_2), np.array(x_train_3) def load_data_9(trainList, vectors, size): x_train_1 = [] x_train_2 = [] y_train = [] for i in range(0, size): pos = trainList[random.randint(0, len(trainList) - 1)] posItems = pos.strip().split(' ') x_train_1.append(vocab_plus_overlap(vectors, posItems[2], posItems[3], 200)) x_train_2.append(vocab_plus_overlap(vectors, posItems[3], posItems[2], 200)) y_train.append([1, 0]) neg = trainList[random.randint(0, len(trainList) - 1)] negItems = neg.strip().split(' ') x_train_1.append(vocab_plus_overlap(vectors, posItems[2], negItems[3], 200)) x_train_2.append(vocab_plus_overlap(vectors, negItems[3], posItems[2], 200)) y_train.append([0, 1]) return np.array(x_train_1), np.array(x_train_2), np.array(y_train) def load_data_val_9(testList, vectors, index): x_train_1 = [] x_train_2 = [] items = testList[index].split(' ') x_train_1.append(vocab_plus_overlap(vectors, items[2], items[3], 200)) x_train_2.append(vocab_plus_overlap(vectors, items[3], items[2], 200)) return np.array(x_train_1), np.array(x_train_2) def load_data_10(vectors, qalist, raw, size): x_train_1 = [] x_train_2 = [] x_train_3 = [] items = raw[random.randint(0, len(raw) - 1)] nega = rand_qa(qalist) x_train_1.append(vocab_plus_overlap(vectors, items[2], items[3], 200)) x_train_2.append(vocab_plus_overlap(vectors, items[3], items[2], 200)) x_train_3.append(vocab_plus_overlap(vectors, nega, items[2], 200)) return 
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """Yield mini-batches of ``data`` for ``num_epochs`` passes.

    Args:
        data: sequence of examples; converted to a numpy array up front.
        batch_size: maximum number of examples per batch.
        num_epochs: number of full passes over the data.
        shuffle: when True, the data is re-permuted at the start of every
            epoch using numpy's global random state.

    Yields:
        numpy array slices of at most ``batch_size`` rows.  The last batch
        of an epoch may be smaller; no empty batch is ever produced.
    """
    data = np.array(data)
    data_size = len(data)
    # Ceiling division.  The original ``int(len(data)/batch_size) + 1``
    # over-counted by one whenever data_size was an exact multiple of
    # batch_size (and for empty data), yielding a trailing empty batch.
    num_batches_per_epoch = (data_size + batch_size - 1) // batch_size
    for epoch in range(num_epochs):
        if shuffle:
            # Shuffle the data at each epoch.
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min(start_index + batch_size, data_size)
            yield shuffled_data[start_index:end_index]
#TODO change path to your dataset
trainfile = '/export/jw/cnn/insuranceQA/train'
test1file = '/export/jw/cnn/insuranceQA/test1'
vectorsfile = '/export/jw/cnn/insuranceQA/vectors.nobin'

###########################################################
# read qa data
###########################################################
def build_vocab(path=None):
    """Build a word -> integer id mapping from the training file.

    Each line is split on spaces and field 2 is split on '_' into words.
    'UNKNOWN' always gets id 0; every other word gets the next free id in
    order of first appearance. Empty words and lines with fewer than three
    fields are skipped.

    NOTE(review): only field 2 contributes words; words that appear solely in
    field 3 are encoded as UNKNOWN later. Confirm this is intended.

    path: file to read; defaults to the module-level `trainfile`.
    """
    if path is None:
        path = trainfile
    vocab = {'UNKNOWN': 0}
    with open(path) as handle:
        for line in handle:
            items = line.strip().split(' ')
            if len(items) < 3:
                continue
            for word in items[2].split('_'):
                if word and word not in vocab:
                    # ids are consecutive, so the next id is the current size
                    vocab[word] = len(vocab)
    return vocab


def load_vectors(path=None, dim=100):
    """Read space-separated word vectors into a dict word -> list of floats.

    A line is used only when it has a non-empty first field followed by at
    least `dim` values; the first `dim` values are kept. Shorter lines (blank
    lines, or a two-field header such as the one word2vec text output starts
    with) are skipped instead of raising IndexError.

    path: file to read; defaults to the module-level `vectorsfile`.
    dim:  number of components per vector (previously fixed at 100).
    """
    if path is None:
        path = vectorsfile
    vectors = {}
    with open(path) as handle:
        for line in handle:
            items = line.strip().split(' ')
            if len(items) <= dim or not items[0]:
                continue
            vectors[items[0]] = [float(value) for value in items[1:dim + 1]]
    return vectors


def load_word_embeddings(vocab, dim, path=None):
    """Return a float32 matrix with one row per vocabulary id.

    Every row starts as `dim` copies of 0.01 and is replaced by the
    pre-trained vector when the word occurs in the vectors file.

    NOTE: the 'UNKNOWN' row keeps the 0.01 initialisation unless the vectors
    file happens to contain an entry named 'UNKNOWN'.

    path: vectors file; defaults to the module-level `vectorsfile`.
    """
    vectors = load_vectors(path, dim)
    # brute initialization
    embeddings = [[0.01] * dim for _ in range(len(vocab))]
    for word, code in vocab.items():
        if word in vectors:
            embeddings[code] = vectors[word]
    return np.array(embeddings, dtype='float32')
def encode_sent(vocab, string, size):
    """Encode an '_'-joined sentence as exactly `size` vocabulary ids.

    Words missing from `vocab` map to vocab['UNKNOWN']. Sentences longer than
    `size` are truncated. Sentences shorter than `size` are padded with the
    UNKNOWN id; previously they raised IndexError.
    """
    words = string.split('_')
    x = [vocab[word] if word in vocab else vocab['UNKNOWN']
         for word in words[:size]]
    if len(x) < size:
        x.extend([vocab['UNKNOWN']] * (size - len(x)))
    return x


def _read_rows(path):
    """Return every line of `path`, stripped and split on single spaces."""
    with open(path) as handle:
        return [line.strip().split(' ') for line in handle]


def load_train_list(path=None):
    """Read the training file into a list of space-split rows.

    path: file to read; defaults to the module-level `trainfile`.
    """
    return _read_rows(trainfile if path is None else path)


def load_test_list(path=None):
    """Read the test file into a list of space-split rows.

    path: file to read; defaults to the module-level `test1file`.
    """
    return _read_rows(test1file if path is None else path)


def load_data(trainList, vocab, batch_size):
    """Sample a random training batch of (question, positive, negative).

    For each sample two rows are drawn independently: the first supplies the
    question (field 2) and positive answer (field 3), the second supplies the
    negative answer (its field 3). Nothing prevents both draws from hitting
    the same row.

    Returns three float32 arrays of shape (batch_size, 100) holding ids.
    """
    train_1, train_2, train_3 = [], [], []
    for _ in range(batch_size):
        pos = trainList[random.randint(0, len(trainList) - 1)]
        neg = trainList[random.randint(0, len(trainList) - 1)]
        train_1.append(encode_sent(vocab, pos[2], 100))
        train_2.append(encode_sent(vocab, pos[3], 100))
        train_3.append(encode_sent(vocab, neg[3], 100))
    return (np.array(train_1, dtype='float32'),
            np.array(train_2, dtype='float32'),
            np.array(train_3, dtype='float32'))


def load_data_val(testList, vocab, index, batch_size):
    """Encode `batch_size` consecutive test rows starting at `index`.

    Row indices past the end are clamped to the last row, so the final batch
    is padded with copies of it. The third array repeats the encoding of
    field 3.

    Returns three float32 arrays of shape (batch_size, 100).
    """
    x1, x2, x3 = [], [], []
    last = len(testList) - 1
    for i in range(batch_size):
        items = testList[min(index + i, last)]
        x1.append(encode_sent(vocab, items[2], 100))
        x2.append(encode_sent(vocab, items[3], 100))
        x3.append(encode_sent(vocab, items[3], 100))
    return (np.array(x1, dtype='float32'),
            np.array(x2, dtype='float32'),
            np.array(x3, dtype='float32'))
def validation(validate_model, testList, vocab, batch_size):
    """Score the whole test list and print top-1 precision.

    The test list is scored in batches of `batch_size` with keep_prob 1.0.
    Rows are then grouped by the id after ':' in field 1; within each group
    the row with the highest score decides whether the group counts as a hit
    (flag '1') or a miss (flag '0').

    Prints 0.0 instead of raising ZeroDivisionError when no group has a top
    row flagged '0' or '1'. print() is called with a single argument so the
    output is the same on Python 2 and Python 3.
    """
    index, score_list = int(0), []
    while True:
        x1, x2, x3 = load_data_val(testList, vocab, index, batch_size)
        batch_scores, nouse = validate_model(x1, x2, x3, 1.0)
        score_list.extend(batch_scores)
        index += batch_size
        if index >= len(testList):
            break
        print('Evaluation ' + str(index))
    sdict = {}
    # score_list can be longer than testList (the last batch is padded), so
    # scores are looked up by row position.
    for position, items in enumerate(testList):
        qid = items[1].split(':')[1]
        sdict.setdefault(qid, []).append((score_list[position], items[0]))
    lev0, lev1 = float(0), float(0)
    for qid, cases in sdict.items():
        cases.sort(key=operator.itemgetter(0), reverse=True)
        score, flag = cases[0]
        if flag == '1':
            lev1 += 1
        if flag == '0':
            lev0 += 1
    total = lev0 + lev1
    precision = lev1 / total if total else 0.0
    print('top-1 precition: ' + str(precision))


class QACnn(object):
    """Siamese CNN scoring a question against a positive and a negative answer.

    After construction the instance exposes:
      params   -- shared variables to train (embedding table, then W and b
                  for each filter size)
      cos12    -- cosine(question, positive answer), one value per sample
      cos13    -- cosine(question, negative answer), one value per sample
      cost     -- sum over the batch of max(0, 0.05 - cos12 + cos13)
      accuracy -- fraction of samples whose hinge term is exactly zero
      dbg_x1, dbg_conv_out, dbg_outputs_1 -- debugging handles
    """

    def __init__(self, input1, input2, input3, word_embeddings, batch_size, sequence_len, embedding_size, filter_sizes, num_filters, keep_prob):
        rng = np.random.RandomState(23455)
        self.params = []

        lookup_table = theano.shared(word_embeddings)
        self.params += [lookup_table]
        # input1 = question, input2 = positive answer, input3 = negative answer
        # replace every token id with its embedding vector
        input_matrix1 = lookup_table[T.cast(input1.flatten(), dtype="int32")]
        input_matrix2 = lookup_table[T.cast(input2.flatten(), dtype="int32")]
        input_matrix3 = lookup_table[T.cast(input3.flatten(), dtype="int32")]

        # conv2d takes a 4-D tensor; this only inserts a channel axis of size 1
        input_x1 = input_matrix1.reshape((batch_size, 1, sequence_len, embedding_size))
        input_x2 = input_matrix2.reshape((batch_size, 1, sequence_len, embedding_size))
        input_x3 = input_matrix3.reshape((batch_size, 1, sequence_len, embedding_size))
        #print(input_x1.shape.eval())
        self.dbg_x1 = input_x1

        outputs_1, outputs_2, outputs_3 = [], [], []
        # several filter heights, num_filters filters of each
        for filter_size in filter_sizes:
            filter_shape = (num_filters, 1, filter_size, embedding_size)
            image_shape = (batch_size, 1, sequence_len, embedding_size)
            fan_in = np.prod(filter_shape[1:])
            fan_out = filter_shape[0] * np.prod(filter_shape[2:])
            W_bound = np.sqrt(6. / (fan_in + fan_out))
            W = theano.shared(
                np.asarray(
                    rng.uniform(low=-W_bound, high=W_bound, size=filter_shape),
                    dtype=theano.config.floatX
                ),
                borrow=True
            )
            b_values = np.zeros((filter_shape[0],), dtype=theano.config.floatX)
            b = theano.shared(value=b_values, borrow=True)

            # convolution + max pooling; the same W and b are applied to the
            # question and to both answers
            for source, sink in ((input_x1, outputs_1),
                                 (input_x2, outputs_2),
                                 (input_x3, outputs_3)):
                conv_out = conv2d(input=source, filters=W, filter_shape=filter_shape, input_shape=image_shape)
                # the pooling window spans the whole convolved sequence
                pooled_out = pool.pool_2d(input=conv_out, ds=(sequence_len - filter_size + 1, 1), ignore_border=True, mode='max')
                pooled_active = T.tanh(pooled_out + b.dimshuffle('x', 0, 'x', 'x'))
                sink.append(pooled_active)
            self.params += [W, b]
            self.dbg_conv_out = conv_out.shape

        num_filters_total = num_filters * len(filter_sizes)
        self.dbg_outputs_1 = outputs_1[0].shape
        # each sentence is represented by a vector of length num_filters_total
        output_flat1 = T.reshape(T.concatenate(outputs_1, axis=1), [batch_size, num_filters_total])
        output_flat2 = T.reshape(T.concatenate(outputs_2, axis=1), [batch_size, num_filters_total])
        output_flat3 = T.reshape(T.concatenate(outputs_3, axis=1), [batch_size, num_filters_total])
        # dropout; keep_prob == 1 disables it
        output_drop1 = self._dropout(rng, output_flat1, keep_prob)
        output_drop2 = self._dropout(rng, output_flat2, keep_prob)
        output_drop3 = self._dropout(rng, output_flat3, keep_prob)

        # cosine similarity between the question and each answer
        # vector norms
        len1 = T.sqrt(T.sum(output_drop1 * output_drop1, axis=1))
        len2 = T.sqrt(T.sum(output_drop2 * output_drop2, axis=1))
        len3 = T.sqrt(T.sum(output_drop3 * output_drop3, axis=1))
        # cosines
        cos12 = T.sum(output_drop1 * output_drop2, axis=1) / (len1 * len2)
        self.cos12 = cos12
        cos13 = T.sum(output_drop1 * output_drop3, axis=1) / (len1 * len3)
        self.cos13 = cos13

        zero = theano.shared(np.zeros(batch_size, dtype=theano.config.floatX), borrow=True)
        margin = theano.shared(np.full(batch_size, 0.05, dtype=theano.config.floatX), borrow=True)
        # hinge loss
        diff = T.cast(T.maximum(zero, margin - cos12 + cos13), dtype=theano.config.floatX)
        self.cost = T.sum(diff, acc_dtype=theano.config.floatX)
        # mini-batch accuracy: a sample counts as correct when its hinge term
        # is zero, i.e. the positive cosine beats the negative one by at
        # least the margin. Loss and accuracy are the two signals used to
        # judge whether training is converging.
        self.accuracy = T.sum(T.cast(T.eq(zero, diff), dtype='int32')) / float(batch_size)

    def _dropout(self, rng, layer, keep_prob):
        """Apply inverted dropout: mask with Bernoulli(keep_prob), divide by keep_prob."""
        srng = T.shared_randomstreams.RandomStreams(rng.randint(123456))
        mask = srng.binomial(n=1, p=keep_prob, size=layer.shape)
        output = layer * T.cast(mask, theano.config.floatX)
        output = output / keep_prob
        return output


def train():
    """Build the CNN, then run SGD, validating every `validation_freq` steps.

    Each "epoch" in the loop is one randomly sampled mini-batch. Parameters
    are updated with plain gradient descent (param - learning_rate * grad).
    Training uses keep_prob 0.25; validation uses 1.0.
    """
    batch_size = int(256)
    filter_sizes = [2, 3, 5]
    num_filters = 500
    embedding_size = 100
    learning_rate = 0.001
    n_epochs = 2000000
    validation_freq = 1000
    keep_prob_value = 0.25

    vocab = build_vocab()
    word_embeddings = load_word_embeddings(vocab, embedding_size)
    trainList = load_train_list()
    testList = load_test_list()
    # one batch is loaded up front only to learn the sequence length
    train_x1, train_x2, train_x3 = load_data(trainList, vocab, batch_size)

    x1, x2, x3 = T.matrix('x1'), T.matrix('x2'), T.matrix('x3')
    keep_prob = T.fscalar('keep_prob')
    model = QACnn(
        input1=x1, input2=x2, input3=x3, keep_prob=keep_prob,
        word_embeddings=word_embeddings,
        batch_size=batch_size,
        sequence_len=train_x1.shape[1],
        embedding_size=embedding_size,
        filter_sizes=filter_sizes,
        num_filters=num_filters)
    dbg_x1 = model.dbg_x1
    dbg_outputs_1 = model.dbg_outputs_1

    cost, cos12, cos13 = model.cost, model.cos12, model.cos13
    print('cost')
    print(cost)
    params, accuracy = model.params, model.accuracy
    grads = T.grad(cost, params)

    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    p1, p2, p3 = T.matrix('p1'), T.matrix('p2'), T.matrix('p3')
    prob = T.fscalar('prob')
    train_model = theano.function(
        [p1, p2, p3, prob],
        [cost, accuracy, dbg_x1, dbg_outputs_1],
        updates=updates,
        givens={
            x1: p1, x2: p2, x3: p3, keep_prob: prob
        }
    )

    v1, v2, v3 = T.matrix('v1'), T.matrix('v2'), T.matrix('v3')
    validate_model = theano.function(
        inputs=[v1, v2, v3, prob],
        outputs=[cos12, cos13],
        #updates=updates,
        givens={
            x1: v1, x2: v2, x3: v3, keep_prob: prob
        }
    )

    epoch = 0
    done_looping = False
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        train_x1, train_x2, train_x3 = load_data(trainList, vocab, batch_size)
        #print train_x3.shape
        cost_ij, acc, dbg_x1, dbg_outputs_1 = train_model(train_x1, train_x2, train_x3, keep_prob_value)
        print('load data done ...... epoch:' + str(epoch) + ' cost:' + str(cost_ij) + ', acc:' + str(acc))
        if epoch % validation_freq == 0:
            print('Evaluation ......')
            validation(validate_model, testList, vocab, batch_size)
        #print dbg_outputs_1


if __name__ == '__main__':
    train()
validation(validate_model, testList, vocab, batch_size) #print dbg_outputs_1 if __name__ == '__main__': train() ================================================ FILE: config.py ================================================ import os dataset_ins = 'insurance-qa' dataset_qur = 'quora-qa' ################################################################## # ajust to your runnning environment # which data do you want dataset = dataset_qur # word2vec command path w2v_command = '/export/jw/word2vec/word2vec' ################################################################## home = '' if dataset == dataset_ins: home = os.path.expanduser('/export/jw/insuranceQA') elif dataset == dataset_qur: home = os.path.expanduser('/export/jw/quora') #Insurance-QA original data directory qa_version = 'V1' vocab_file = os.path.join(home, qa_version, 'vocabulary') answers_file = os.path.join(home, qa_version, 'answers.label.token_idx') question_train_file = os.path.join(home, qa_version, 'question.train.token_idx.label') question_test1_file = os.path.join(home, qa_version, 'question.test1.label.token_idx.pool') question_test2_file = os.path.join(home, qa_version, 'question.test2.label.token_idx.pool') question_dev_file = os.path.join(home, qa_version, 'question.dev.label.token_idx.pool') #quora original data directory qr_file = os.path.join(home, 'quora_duplicate_questions.tsv') qr_train_ratio = 0.8 #processed files train_file = os.path.join(home, 'data', 'train.prepro') test1_file = os.path.join(home, 'data', 'test1.prepro') test2_file = os.path.join(home, 'data', 'test2.prepro') w2v_train_file = os.path.join(home, 'data', 'w2v.train') w2v_bin_file = os.path.join(home, 'data', 'w2v.bin') predict1_file = os.path.join(home, 'data', 'predict1') ================================================ FILE: gen.py ================================================ import config, os, random ##################################################################### # function: load vocab # return: 
def _decode(voc, token_ids, sep):
    """Map every space-separated token of `token_ids` through `voc` and join with `sep`."""
    return sep.join([voc[token] for token in token_ids.split(' ')])


def _write_lines(path, lines):
    """Write each entry of `lines` to `path`, one per line, overwriting the file."""
    with open(path, 'w') as handle:
        for text in lines:
            handle.write(text + '\n')


def _run_word2vec():
    """Train word2vec on config.w2v_train_file, writing config.w2v_bin_file."""
    os.system('time ' + config.w2v_command + ' -train ' + config.w2v_train_file + ' -output ' + config.w2v_bin_file + ' -cbow 0 -size 100 -window 5 -negative 20 -sample 1e-3 -threads 12 -binary 0 -min-count 1')


def load_vocab(path=None):
    """Load the vocabulary file into a dict.

    Every non-blank line must hold exactly two tab-separated columns; the
    dict maps the first column to the second. The other functions in this
    file look up the space-separated tokens of the question/answer files as
    keys of this dict.

    path: vocabulary file; defaults to config.vocab_file.
    """
    if path is None:
        path = config.vocab_file
    voc = {}
    with open(path) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            word, _id = line.split('\t')
            voc[word] = _id
    return voc


def ins_load_answers(answers_path=None, voc=None):
    """Load all answers with their tokens mapped through the vocabulary.

    Returns ['', answer_1, ..., answer_n]: entry 0 is a placeholder so that
    the list can be indexed with the 1-based answer ids used by the callers
    (answers[int(_id)]). The words of each answer are joined with '_'.

    answers_path: answers file; defaults to config.answers_file.
    voc:          vocabulary dict; defaults to load_vocab().
    """
    if voc is None:
        voc = load_vocab()
    if answers_path is None:
        answers_path = config.answers_file
    _list = ['']
    with open(answers_path) as handle:
        for line in handle:
            _, sent = line.strip().split('\t')
            _list.append(_decode(voc, sent, '_'))
    return _list


def ins_w2v():
    """Write the word2vec training corpus and train the embeddings.

    The corpus is one sentence per line, words separated by spaces: column 0
    of the training questions followed by column 1 of the answers, dev,
    test1 and test2 files.

    Uses config.w2v_command; the previous code read config.w2c_command,
    which config.py does not define, and so raised AttributeError.
    """
    print('preparing word2vec ......')
    _data, voc = [], load_vocab()
    with open(config.question_train_file) as handle:
        for line in handle:
            items = line.strip().split('\t')
            _data.append(_decode(voc, items[0], ' '))
    for _file in [config.answers_file, config.question_dev_file,
                  config.question_test1_file, config.question_test2_file]:
        with open(_file) as handle:
            for line in handle:
                items = line.strip().split('\t')
                _data.append(_decode(voc, items[1], ' '))
    _write_lines(config.w2v_train_file, _data)
    _run_word2vec()


def ins_train():
    """Write the training file, one line per (question, correct answer) pair.

    Line format: '1 <question> <answer>', words within a sentence joined by
    '_'.
    """
    print('preparing train ......')
    answers, voc, _data = ins_load_answers(), load_vocab(), []
    with open(config.question_train_file) as handle:
        for line in handle:
            qsent, ids = line.strip().split('\t')
            qsent = _decode(voc, qsent, '_')
            for _id in ids.split(' '):
                _data.append(' '.join(['1', qsent, answers[int(_id)]]))
    _write_lines(config.train_file, _data)


def ins_test():
    """Write the test2 and test1 files.

    Line format: '<flag> <group_id> <question> <answer>'. The flag is '1'
    when the candidate id is among the question's positive ids, else '0';
    group_id counts input lines from 0 within each file.
    """
    print('preparing test ......')
    answers, voc = ins_load_answers(), load_vocab()
    for _in, _out in [(config.question_test2_file, config.test2_file),
                      (config.question_test1_file, config.test1_file)]:
        _data, group = [], int(0)
        with open(_in) as handle:
            for line in handle:
                pids, qsent, pnids = line.strip().split('\t')
                positive = set(pids.split(' '))
                qsent = _decode(voc, qsent, '_')
                for _id in pnids.split(' '):
                    flag = '1' if _id in positive else '0'
                    _data.append(' '.join([flag, str(group), qsent, answers[int(_id)]]))
                group += 1
        _write_lines(_out, _data)


def ins_qa():
    """Run the full Insurance-QA preprocessing pipeline."""
    ins_w2v()
    ins_train()
    ins_test()


def qur_prepare():
    """Preprocess the Quora duplicate-questions TSV.

    Steps: keep rows with exactly six tab-separated fields, drop the first
    kept row (the header), shuffle, train word2vec on both questions of
    every row, then write '<flag> <q1> <q2>' lines (words joined by '_') to
    the train and test1 files. Rows where either question has at most one
    character are left out of the train/test files.

    The train/test split uses config.qr_train_ratio instead of a second
    hard-coded 0.8.
    """
    #pretrain word2vec
    _list = []
    with open(config.qr_file) as handle:
        for line in handle:
            items = line.strip().split('\t')
            if len(items) != 6:
                continue
            _list.append(items)
    _list = _list[1:]
    random.shuffle(_list)
    _list = [(f, q1, q2) for _, _, _, q1, q2, f in _list]
    with open(config.w2v_train_file, 'w') as of:
        for f, q1, q2 in _list:
            of.write(q1 + '\n')
            of.write(q2 + '\n')
    _run_word2vec()
    #train file
    _newlist = []
    for f, q1, q2 in _list:
        if len(q1) <= 1 or len(q2) <= 1:
            continue
        q1 = '_'.join(q1.split(' '))
        q2 = '_'.join(q2.split(' '))
        _newlist.append((f, q1, q2))
    _list = _newlist
    split = int(len(_list) * config.qr_train_ratio)
    _write_lines(config.train_file,
                 [' '.join([f, q1, q2]) for f, q1, q2 in _list[:split]])
    #test file
    _write_lines(config.test1_file,
                 [' '.join([f, q1, q2]) for f, q1, q2 in _list[split:]])


def qur_qa():
    """Run the Quora preprocessing pipeline."""
    qur_prepare()


if __name__ == '__main__':
    if config.dataset == config.dataset_ins:
        ins_qa()
    elif config.dataset == config.dataset_qur:
        qur_qa()
#TODO change filepath to your local environment
_TRAIN_FILE = '/export/jw/cnn/insuranceQA/train'
_TEST1_FILE = '/export/jw/cnn/insuranceQA/test1'
_VECTORS_FILE = '/export/jw/cnn/insuranceQA/vectors.nobin'


def build_vocab(path=_TRAIN_FILE):
    """Build a word -> integer id mapping from the training file.

    Field 2 of every space-split line is split on '_' into words.
    'UNKNOWN' gets id 0; other words get consecutive ids in order of first
    appearance. Empty words and lines with fewer than three fields are
    skipped.
    """
    vocab = {'UNKNOWN': 0}
    with open(path) as handle:
        for line in handle:
            items = line.strip().split(' ')
            if len(items) < 3:
                continue
            for word in items[2].split('_'):
                if word and word not in vocab:
                    vocab[word] = len(vocab)
    return vocab


def load_vectors(path=_VECTORS_FILE, dim=100):
    """Read space-separated word vectors into a dict word -> list of floats.

    Lines with an empty first field or fewer than `dim` values after it are
    skipped; previously a short line raised IndexError.
    """
    vectors = {}
    with open(path) as handle:
        for line in handle:
            items = line.strip().split(' ')
            if len(items) <= dim or not items[0]:
                continue
            vectors[items[0]] = [float(value) for value in items[1:dim + 1]]
    return vectors


def load_word_embeddings(vocab, dim, path=_VECTORS_FILE):
    """Return a float32 matrix with one row per vocabulary id.

    Rows start as `dim` copies of 0.01 and are overwritten by the
    pre-trained vector when the word is present in the vectors file.

    NOTE: the 'UNKNOWN' row keeps the 0.01 initialisation unless the vectors
    file contains an entry named 'UNKNOWN'.
    """
    vectors = load_vectors(path, dim)
    # brute initialization
    embeddings = [[0.01] * dim for _ in range(len(vocab))]
    for word, code in vocab.items():
        if word in vectors:
            embeddings[code] = vectors[word]
    return np.array(embeddings, dtype='float32')


def encode_sent(vocab, string, size):
    """Encode an '_'-joined sentence as `size` ids plus a mask of `size` ones.

    Words missing from `vocab` map to vocab['UNKNOWN']. Long sentences are
    truncated; short ones are padded with the UNKNOWN id (previously
    IndexError).

    The mask is 1 at every position because the sequence length is fixed.
    TODO: emit 0 at padding positions once variable length is supported.
    """
    words = string.split('_')
    x = [vocab[word] if word in vocab else vocab['UNKNOWN']
         for word in words[:size]]
    if len(x) < size:
        x.extend([vocab['UNKNOWN']] * (size - len(x)))
    m = [1] * size
    return x, m


def load_train_list(path=_TRAIN_FILE):
    """Return the space-split rows of the training file whose flag (field 0) is '1'."""
    trainList = []
    with open(path) as handle:
        for line in handle:
            items = line.strip().split(' ')
            if items[0] == '1':
                trainList.append(items)
    return trainList


def load_test_list(path=_TEST1_FILE):
    """Return every row of the test file, stripped and split on spaces."""
    with open(path) as handle:
        return [line.strip().split(' ') for line in handle]


def _as_time_major(rows):
    """Convert a list of per-sample rows to a floatX array of shape (length, samples)."""
    return np.transpose(np.array(rows, dtype=config.floatX))


def load_data(trainList, vocab, batch_size):
    """Sample a random training batch with masks, time-major.

    Two rows are drawn per sample: the first gives the question (field 2)
    and positive answer (field 3), the second gives the negative answer (its
    field 3). A draw is rejected when any of those three strings is empty.

    The previous test was `text.startswith('')`, which is true for every
    string, so every draw was rejected and the loop never terminated.

    Returns (x1, x2, x3, m1, m2, m3), each of shape (100, batch_size).
    Raises ValueError when no row has both a non-empty question and answer.
    """
    if not any(row[2] and row[3] for row in trainList):
        raise ValueError('trainList has no row with a non-empty question and answer')
    train_1, train_2, train_3 = [], [], []
    mask_1, mask_2, mask_3 = [], [], []
    while len(train_1) < batch_size:
        pos = trainList[random.randint(0, len(trainList) - 1)]
        neg = trainList[random.randint(0, len(trainList) - 1)]
        if not (pos[2] and pos[3] and neg[3]):
            # empty string, draw again
            continue
        for text, ids, masks in ((pos[2], train_1, mask_1),
                                 (pos[3], train_2, mask_2),
                                 (neg[3], train_3, mask_3)):
            x, m = encode_sent(vocab, text, 100)
            ids.append(x)
            masks.append(m)
    return (_as_time_major(train_1), _as_time_major(train_2), _as_time_major(train_3),
            _as_time_major(mask_1), _as_time_major(mask_2), _as_time_major(mask_3))


def load_data_val(testList, vocab, index, batch_size):
    """Encode `batch_size` consecutive test rows starting at `index`, time-major.

    Row indices past the end are clamped to the last row. The third id/mask
    pair repeats the encoding of field 3.

    Returns (x1, x2, x3, m1, m2, m3), each of shape (100, batch_size).
    """
    x1, x2, x3, m1, m2, m3 = [], [], [], [], [], []
    last = len(testList) - 1
    for i in range(batch_size):
        items = testList[min(index + i, last)]
        for text, ids, masks in ((items[2], x1, m1),
                                 (items[3], x2, m2),
                                 (items[3], x3, m3)):
            x, m = encode_sent(vocab, text, 100)
            ids.append(x)
            masks.append(m)
    return (_as_time_major(x1), _as_time_major(x2), _as_time_major(x3),
            _as_time_major(m1), _as_time_major(m2), _as_time_major(m3))
def validation(validate_model, testList, vocab, batch_size):
    """Score the whole test list and append the result to the accuracy log.

    Rows are grouped by the id after ':' in field 1; in every group the row
    with the highest score decides whether the group is a hit (flag '1') or
    a miss (flag '0'). All scores followed by the two counters are appended
    to the log file, and the counters are printed.

    The log is opened only after scoring has finished and is closed by a
    `with` block, so a failure while scoring no longer leaves it open.
    print() is called with a single argument so the output is the same on
    Python 2 and Python 3.
    """
    index, score_list = int(0), []
    while True:
        x1, x2, x3, m1, m2, m3 = load_data_val(testList, vocab, index, batch_size)
        batch_scores, nouse = validate_model(x1, x2, x3, m1, m2, m3)
        score_list.extend(batch_scores)
        index += batch_size
        if index >= len(testList):
            break
        print('Evaluation ' + str(index))
    sdict = {}
    for position, items in enumerate(testList):
        qid = items[1].split(':')[1]
        sdict.setdefault(qid, []).append((score_list[position], items[0]))
    lev0, lev1 = float(0), float(0)
    for qid, cases in sdict.items():
        cases.sort(key=operator.itemgetter(0), reverse=True)
        score, flag = cases[0]
        if flag == '1':
            lev1 += 1
        if flag == '0':
            lev0 += 1
    with open('/export/jw/cnn/insuranceQA/acc.lstm', 'a') as of:
        for s in score_list:
            of.write(str(s) + '\n')
        of.write('lev1:' + str(lev1) + '\n')
        of.write('lev0:' + str(lev0) + '\n')
    print('lev1:' + str(lev1))
    print('lev0:' + str(lev0))


def ortho_weight(ndim):
    """Return the left singular vectors of a random ndim x ndim Gaussian matrix as floatX."""
    W = np.random.randn(ndim, ndim)
    u, s, v = np.linalg.svd(W)
    return u.astype(config.floatX)


def numpy_floatX(data):
    """Convert `data` to a numpy array of Theano's configured float type."""
    return np.asarray(data, dtype=config.floatX)


def param_init_cnn(filter_sizes, num_filters, proj_size, tparams, grad_params):
    """Create conv filters and biases for every filter size.

    For each size a filter bank of shape (num_filters, 1, size, proj_size)
    is drawn uniformly from [-W_bound, W_bound] and a zero bias of length
    num_filters is created. Both are stored in `tparams` under
    'cnn_W_<size>' / 'cnn_b_<size>' and appended to `grad_params`.

    Returns the (mutated) `tparams` and `grad_params`.
    """
    rng = np.random.RandomState(23455)
    for filter_size in filter_sizes:
        filter_shape = (num_filters, 1, filter_size, proj_size)
        fan_in = np.prod(filter_shape[1:])
        fan_out = filter_shape[0] * np.prod(filter_shape[2:])
        W_bound = np.sqrt(6. / (fan_in + fan_out))
        W = theano.shared(
            np.asarray(
                rng.uniform(low=-W_bound, high=W_bound, size=filter_shape),
                dtype=theano.config.floatX
            ),
            borrow=True
        )
        tparams['cnn_W_' + str(filter_size)] = W
        b_values = np.zeros((filter_shape[0],), dtype=theano.config.floatX)
        b = theano.shared(value=b_values, borrow=True)
        tparams['cnn_b_' + str(filter_size)] = b
        grad_params += [W, b]
    return tparams, grad_params


def param_init_lstm(proj_size, tparams, grad_params):
    """Create the LSTM parameters 'lstm_W', 'lstm_U' and 'lstm_b'.

    W and U are each four orthogonal proj_size x proj_size blocks
    concatenated along axis 1 (shape (proj_size, 4 * proj_size)); b is a
    zero vector of length 4 * proj_size. All three are stored in `tparams`
    and appended to `grad_params`.

    Returns the (mutated) `tparams` and `grad_params`.
    """
    W = np.concatenate([ortho_weight(proj_size) for _ in range(4)], axis=1)
    W_t = theano.shared(W, borrow=True)
    tparams[_p('lstm', 'W')] = W_t
    U = np.concatenate([ortho_weight(proj_size) for _ in range(4)], axis=1)
    U_t = theano.shared(U, borrow=True)
    tparams[_p('lstm', 'U')] = U_t
    b = np.zeros((4 * proj_size,))
    b_t = theano.shared(b.astype(config.floatX), borrow=True)
    tparams[_p('lstm', 'b')] = b_t
    grad_params += [W_t, U_t, b_t]
    return tparams, grad_params


def dropout_layer(state_before, use_noise, trng):
    """Dropout with p = 0.5: random mask when `use_noise` is set, else scale by 0.5."""
    proj = T.switch(use_noise,
                    (state_before * trng.binomial(state_before.shape, p=0.5, n=1, dtype=state_before.dtype)),
                    state_before * 0.5)
    return proj


class LSTM(object):
    """LSTM followed by a CNN, scoring a question against two answers.

    After construction the instance exposes:
      params   -- shared variables to train (LSTM W, U, b; CNN W and b per
                  filter size; embedding table)
      cos12    -- cosine(question, positive answer), one value per sample
      cos13    -- cosine(question, negative answer), one value per sample
      cost     -- sum over the batch of max(0, 0.05 - cos12 + cos13)
      accuracy -- fraction of samples whose hinge term is exactly zero
    """

    def __init__(self, input1, input2, input3, mask1, mask2, mask3, word_embeddings, batch_size, sequence_len, embedding_size, filter_sizes, num_filters):
        # proj_size is the LSTM hidden size; the step function adds the
        # projected input directly to h * U, so 'lstm_W' and 'lstm_U' are
        # (proj_size, 4 * proj_size) and 'lstm_b' has length 4 * proj_size
        # (see param_init_lstm).
        proj_size = 100  # TODO, what does proj mean
        self.params, tparams = [], {}
        tparams, self.params = param_init_lstm(proj_size, tparams, self.params)
        tparams, self.params = param_init_cnn(filter_sizes, num_filters, proj_size, tparams, self.params)
        lookup_table = theano.shared(word_embeddings, borrow=True)
        tparams['lookup_table'] = lookup_table
        self.params += [lookup_table]

        lstm1, _ = self._lstm_net(tparams, input1, sequence_len, batch_size, embedding_size, mask1, proj_size)
        lstm2, _ = self._lstm_net(tparams, input2, sequence_len, batch_size, embedding_size, mask2, proj_size)
        lstm3, _ = self._lstm_net(tparams, input3, sequence_len, batch_size, embedding_size, mask3, proj_size)

        # dimshuffle [sequence_len, batch_size, proj_size] to
        # [batch_size, sequence_len, proj_size], then add a channel axis
        cnn_input1 = T.reshape(lstm1.dimshuffle(1, 0, 2), [batch_size, 1, sequence_len, proj_size])
        cnn_input2 = T.reshape(lstm2.dimshuffle(1, 0, 2), [batch_size, 1, sequence_len, proj_size])
        cnn_input3 = T.reshape(lstm3.dimshuffle(1, 0, 2), [batch_size, 1, sequence_len, proj_size])
        cnn1 = self._cnn_net(tparams, cnn_input1, batch_size, sequence_len, num_filters, filter_sizes, proj_size)
        cnn2 = self._cnn_net(tparams, cnn_input2, batch_size, sequence_len, num_filters, filter_sizes, proj_size)
        cnn3 = self._cnn_net(tparams, cnn_input3, batch_size, sequence_len, num_filters, filter_sizes, proj_size)

        len1 = T.sqrt(T.sum(cnn1 * cnn1, axis=1))
        len2 = T.sqrt(T.sum(cnn2 * cnn2, axis=1))
        len3 = T.sqrt(T.sum(cnn3 * cnn3, axis=1))

        self.cos12 = T.sum(cnn1 * cnn2, axis=1) / (len1 * len2)
        self.cos13 = T.sum(cnn1 * cnn3, axis=1) / (len1 * len3)

        zero = theano.shared(np.zeros(batch_size, dtype=config.floatX), borrow=True)
        margin = theano.shared(np.full(batch_size, 0.05, dtype=config.floatX), borrow=True)
        diff = T.cast(T.maximum(zero, margin - self.cos12 + self.cos13), dtype=config.floatX)
        self.cost = T.sum(diff, acc_dtype=config.floatX)
        self.accuracy = T.sum(T.cast(T.eq(zero, diff), dtype='int32')) / float(batch_size)

    def _cnn_net(self, tparams, cnn_input, batch_size, sequence_len, num_filters, filter_sizes, proj_size):
        """Convolve, max-pool over the whole sequence and tanh, for every filter size.

        Returns a tensor reshaped to (batch_size, num_filters * len(filter_sizes)).
        """
        outputs = []
        for filter_size in filter_sizes:
            filter_shape = (num_filters, 1, filter_size, proj_size)
            image_shape = (batch_size, 1, sequence_len, proj_size)
            W = tparams['cnn_W_' + str(filter_size)]
            b = tparams['cnn_b_' + str(filter_size)]
            conv_out = conv2d(input=cnn_input, filters=W, filter_shape=filter_shape, input_shape=image_shape)
            pooled_out = pool.pool_2d(input=conv_out, ds=(sequence_len - filter_size + 1, 1), ignore_border=True, mode='max')
            pooled_active = T.tanh(pooled_out + b.dimshuffle('x', 0, 'x', 'x'))
            outputs.append(pooled_active)
        num_filters_total = num_filters * len(filter_sizes)
        output_tensor = T.reshape(T.concatenate(outputs, axis=1), [batch_size, num_filters_total])
        return output_tensor

    def _lstm_net(self, tparams, _input, sequence_len, batch_size, embedding_size, mask, proj_size):
        """Embed `_input` and run it through the LSTM layer.

        Returns the two sequences produced by `lstm_layer` (hidden states
        and cell states).
        """
        input_matrix = tparams['lookup_table'][T.cast(_input.flatten(), dtype="int32")]
        input_x = input_matrix.reshape((sequence_len, batch_size, embedding_size))
        proj, proj_whole = lstm_layer(tparams, input_x, proj_size, prefix='lstm', mask=mask)
        #if useMask == True:
        #proj = (proj * mask[:, :, None]).sum(axis=0)
        #proj = proj / mask.sum(axis=0)[:, None]
        #if options['use_dropout']:
        #proj = dropout_layer(proj, use_noise, trng)
        return proj, proj_whole


#state_below is word_embbeding tensor(3dim)
def lstm_layer(tparams, state_below, proj_size, prefix='lstm', mask=None):
    """Run an LSTM over `state_below` with theano.scan.

    state_below: dim 0 = steps, dim 1 = samples (batch), dim 2 = embedding.
    mask:        required; scanned in lockstep with `state_below`. Where the
                 mask is 0 the previous h and c are carried over unchanged.

    Returns (h, c) for every step; each slice has shape
    (n_samples, proj_size).
    """
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    assert mask is not None

    def _slice(_x, n, dim):
        # pick the n-th gate's block of width `dim` from the last axis
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    # h is the hidden output, c the cell state. The pre-activation is
    # h(t-1) * U + (x(t) * W + b), so its width must equal that of the
    # projected input; changing the size of h(t) means changing 'lstm_U',
    # 'lstm_b' and this pre-activation together.
    # see http://colah.github.io/posts/2015-08-Understanding-LSTMs/
    # f(t) = sigmoid(Wf * [h(t-1),x(t)] + bf)
    def _step(m_, x_, h_, c_):
        preact = T.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_

        i = T.nnet.sigmoid(_slice(preact, 0, proj_size))
        f = T.nnet.sigmoid(_slice(preact, 1, proj_size))
        o = T.nnet.sigmoid(_slice(preact, 2, proj_size))
        c = T.tanh(_slice(preact, 3, proj_size))

        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * T.tanh(c)
        # where the mask is 0, keep h(t) = h(t-1)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c

    state_below = (T.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')])

    dim_proj = proj_size
    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[T.alloc(numpy_floatX(0.), n_samples, dim_proj),
                                              T.alloc(numpy_floatX(0.), n_samples, dim_proj)],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)
    return rval[0], rval[1]


def _p(pp, name):
    """Join a prefix and a name with '_' to form a parameter key."""
    return '%s_%s' % (pp, name)


def train():
    """Build the LSTM+CNN model, then run SGD with periodic validation.

    Each "epoch" in the loop is one randomly sampled mini-batch. Parameters
    are updated with plain gradient descent (param - learning_rate * grad).
    """
    batch_size = int(256)
    embedding_size = 100
    learning_rate = 0.05
    n_epochs = 20000000
    validation_freq = 1000
    filter_sizes = [1, 2, 3, 5]
    num_filters = 500

    vocab = build_vocab()
    word_embeddings = load_word_embeddings(vocab, embedding_size)
    trainList = load_train_list()
    testList = load_test_list()
    # one batch is loaded up front only to learn the sequence length
    train_x1, train_x2, train_x3, mask1, mask2, mask3 = load_data(trainList, vocab, batch_size)
    x1, x2, x3 = T.fmatrix('x1'), T.fmatrix('x2'), T.fmatrix('x3')
    m1, m2, m3 = T.fmatrix('m1'), T.fmatrix('m2'), T.fmatrix('m3')
    model = LSTM(
        input1=x1, input2=x2, input3=x3,
        mask1=m1, mask2=m2, mask3=m3,
        word_embeddings=word_embeddings,
        batch_size=batch_size,
        sequence_len=train_x1.shape[0],  # row is sequence_len
        embedding_size=embedding_size,
        filter_sizes=filter_sizes,
        num_filters=num_filters)

    cost, cos12, cos13 = model.cost, model.cos12, model.cos13
    params, accuracy = model.params, model.accuracy
    grads = T.grad(cost, params)
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    p1, p2, p3 = T.fmatrix('p1'), T.fmatrix('p2'), T.fmatrix('p3')
    q1, q2, q3 = T.fmatrix('q1'), T.fmatrix('q2'), T.fmatrix('q3')
    train_model = theano.function(
        [p1, p2, p3, q1, q2, q3],
        [cost, accuracy],
        updates=updates,
        givens={
            x1: p1, x2: p2, x3: p3, m1: q1, m2: q2, m3: q3
        }
    )

    v1, v2, v3 = T.matrix('v1'), T.matrix('v2'), T.matrix('v3')
    u1, u2, u3 = T.matrix('u1'), T.matrix('u2'), T.matrix('u3')
    validate_model = theano.function(
        inputs=[v1, v2, v3, u1, u2, u3],
        outputs=[cos12, cos13],
        #updates=updates,
        givens={
            x1: v1, x2: v2, x3: v3, m1: u1, m2: u2, m3: u3
        }
    )

    epoch = 0
    done_looping = False
    while (epoch < n_epochs) and (not done_looping):
        epoch += 1
        train_x1, train_x2, train_x3, mask1, mask2, mask3 = load_data(trainList, vocab, batch_size)
        #print('train_x1, train_x2, train_x3')
        #print(train_x1.shape, train_x2.shape, train_x3.shape)
        cost_ij, acc = train_model(train_x1, train_x2, train_x3, mask1, mask2, mask3)
        print('load data done ...... epoch:' + str(epoch) + ' cost:' + str(cost_ij) + ', acc:' + str(acc))
        if epoch % validation_freq == 0:
            print('Evaluation ......')
            validation(validate_model, testList, vocab, batch_size)


if __name__ == '__main__':
    train()
# File that evaluation() appends its counters to.
precision = '/export/jw/cnn/insuranceQA/acc.lstm'

# Fallback vector returned by read_vector() for unknown words. The same list
# object is returned every time, so callers must not modify it in place.
empty_vector = [float(0.0)] * 100
onevector = [float(1)] * 10
zerovector = [float(0)] * 10


def _add_question_words(path, vocab):
    """Add every '_'-separated word of field 2 in `path` to `vocab` with the next free id."""
    with open(path) as handle:
        for line in handle:
            items = line.strip().split(' ')
            if len(items) < 3:
                continue
            for word in items[2].split('_'):
                if word not in vocab:
                    vocab[word] = len(vocab)


def build_vocab():
    """Build a word -> id mapping from field 2 of the train and test1 files.

    'UNKNOWN' gets id 0; other words get consecutive ids in order of first
    appearance, training file first. Lines with fewer than three fields are
    skipped.
    """
    vocab = {'UNKNOWN': 0}
    _add_question_words('/export/jw/cnn/insuranceQA/train', vocab)
    _add_question_words('/export/jw/cnn/insuranceQA/test1', vocab)
    return vocab


def read_alist():
    """Return field 3 of every line of the training file."""
    alist = []
    with open('/export/jw/cnn/insuranceQA/train') as handle:
        for line in handle:
            items = line.strip().split(' ')
            alist.append(items[3])
    print('read_alist done ......')
    return alist


def load_vectors():
    """Read 100-dimensional word vectors into a dict word -> list of floats.

    Lines with fewer than 101 space-separated fields are skipped.
    """
    vectors = {}
    with open('/export/jw/cnn/insuranceQA/vectors.nobin') as handle:
        for line in handle:
            items = line.strip().split(' ')
            if len(items) < 101:
                continue
            vectors[items[0]] = [float(value) for value in items[1:101]]
    return vectors


def read_vector(vectors, word):
    """Return the vector for `word`, or the shared all-zero `empty_vector`."""
    if word in vectors:
        return vectors[word]
    return empty_vector


def load_train_list():
    """Return the space-split rows of the training file whose flag (field 0) is '1'."""
    train_list = []
    with open('/export/jw/cnn/insuranceQA/train') as handle:
        for line in handle:
            items = line.strip().split(' ')
            if items[0] == '1':
                train_list.append(items)
    return train_list


def load_test_list():
    """Return every row of the test1 file, stripped and split on spaces."""
    with open('/export/jw/cnn/insuranceQA/test1') as handle:
        return [line.strip().split(' ') for line in handle]


def load_train_and_vectors():
    """Return (stripped lines of the training file, word vectors)."""
    with open('/export/jw/cnn/insuranceQA/train') as handle:
        trainList = [line.strip() for line in handle]
    vectors = load_vectors()
    return trainList, vectors


def read_raw():
    """Return the space-split rows of the training file whose flag (field 0) is '1'."""
    raw = []
    with open('/export/jw/cnn/insuranceQA/train') as handle:
        for line in handle:
            items = line.strip().split(' ')
            if items[0] == '1':
                raw.append(items)
    return raw


def encode_sent(vocab, string, size):
    """Encode an '_'-joined sentence as `size` ids plus a mask of `size` ones.

    Words missing from `vocab` map to vocab['UNKNOWN']. Long sentences are
    truncated; short ones are padded with the UNKNOWN id (previously
    IndexError). The mask is 1 at every position.
    """
    words = string.split('_')
    x = [vocab[word] if word in vocab else vocab['UNKNOWN']
         for word in words[:size]]
    if len(x) < size:
        x.extend([vocab['UNKNOWN']] * (size - len(x)))
    m = [1] * size
    return x, m


def _ids(rows):
    """Stack per-sample id rows into a float32 array of shape (samples, length)."""
    return np.array(rows, dtype='float32')


def _masks(rows):
    """Stack per-sample mask rows into a float32 array of shape (length, samples)."""
    return np.transpose(np.array(rows, dtype='float32'))


def load_val_data(test_list, vocab, index, batch_size, max_len):
    """Encode `batch_size` consecutive test rows starting at `index`.

    Row indices past the end are clamped to the last row. The third id/mask
    pair repeats the encoding of field 3.

    Returns (x1, x2, x3, m1, m2, m3): ids have shape (batch_size, max_len),
    masks have shape (max_len, batch_size).
    """
    x1, x2, x3, m1, m2, m3 = [], [], [], [], [], []
    last = len(test_list) - 1
    for i in range(batch_size):
        items = test_list[min(index + i, last)]
        for text, ids, masks in ((items[2], x1, m1),
                                 (items[3], x2, m2),
                                 (items[3], x3, m3)):
            x, m = encode_sent(vocab, text, max_len)
            ids.append(x)
            masks.append(m)
    return _ids(x1), _ids(x2), _ids(x3), _masks(m1), _masks(m2), _masks(m3)


def load_train_data(trainList, vocab, batch_size, max_len):
    """Sample a random training batch of (question, positive, negative).

    Two rows are drawn per sample: the first gives the question (field 2)
    and positive answer (field 3), the second gives the negative answer (its
    field 3). A draw is rejected when any of those three strings is empty.

    The previous test was `text.startswith('')`, which is true for every
    string, so every draw was rejected and the loop never terminated.

    Returns (x1, x2, x3, m1, m2, m3): ids have shape (batch_size, max_len),
    masks have shape (max_len, batch_size).
    Raises ValueError when no row has both a non-empty question and answer.
    """
    if not any(row[2] and row[3] for row in trainList):
        raise ValueError('trainList has no row with a non-empty question and answer')
    train_1, train_2, train_3 = [], [], []
    mask_1, mask_2, mask_3 = [], [], []
    while len(train_1) < batch_size:
        pos = trainList[random.randint(0, len(trainList) - 1)]
        neg = trainList[random.randint(0, len(trainList) - 1)]
        if not (pos[2] and pos[3] and neg[3]):
            # empty string, draw again
            continue
        for text, ids, masks in ((pos[2], train_1, mask_1),
                                 (pos[3], train_2, mask_2),
                                 (neg[3], train_3, mask_3)):
            x, m = encode_sent(vocab, text, max_len)
            ids.append(x)
            masks.append(m)
    return (_ids(train_1), _ids(train_2), _ids(train_3),
            _masks(mask_1), _masks(mask_2), _masks(mask_3))


def evaluation(score_list, test_list, out_path=None):
    """Compute top-1 precision, log the counters and return the precision.

    Rows of `test_list` are grouped by the id after ':' in field 1; in every
    group the row with the highest score decides whether the group is a hit
    (flag '1') or a miss (flag '0'). `score_list[i]` is the score of
    `test_list[i]`; extra trailing scores are ignored.

    out_path: file the counters are appended to; defaults to the
              module-level `precision` path.

    Returns lev1 / (lev0 + lev1), or 0.0 when no group was counted
    (previously ZeroDivisionError).
    """
    if out_path is None:
        out_path = precision
    sessdict = {}
    for index, items in enumerate(test_list):
        qid = items[1].split(':')[1]
        sessdict.setdefault(qid, []).append((score_list[index], items[0]))
    lev1, lev0 = float(0), float(0)
    for v in sessdict.values():
        v.sort(key=itemgetter(0), reverse=True)
        score, flag = v[0]
        if flag == '1':
            lev1 += 1
        if flag == '0':
            lev0 += 1
    total = lev0 + lev1
    top1 = lev1 / total if total else 0.0
    with open(out_path, 'a') as of:
        of.write('lev1:' + str(lev1) + '\n')
        of.write('lev0:' + str(lev0) + '\n')
    print('lev1 ' + str(lev1))
    print('lev0 ' + str(lev0))
    print('precision:' + str(top1))
    return top1
embed_dim, num_step, fw_cell, bw_cell): _initial_state = fw_cell.zero_state(batch_size,dtype=tf.float32) inputs=tf.nn.embedding_lookup(embedding, inputs) inputs = tf.nn.dropout(inputs, self.keep_prob) #[batch_size, sequence_length, embedding_size]转换为[sequence_length, batch_size, embedding_size] inputs = tf.transpose(inputs, [1, 0, 2]) #[sequence_length, batch_size, embedding_size]转换为list, sequence_length个[batch_size, embedding_size] inputs = tf.unstack(inputs) #inputs = tf.reshape(inputs, [-1, embed_dim]) #inputs = tf.split(inputs, num_step, 0) #输出为list, sequence_length个[batch_size, embedding_size * 2] outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(fw_cell, bw_cell, inputs, initial_state_fw=_initial_state, initial_state_bw=_initial_state) outputs = tf.transpose(tf.stack(outputs), [1, 0, 2]) self.outputs = outputs #对rnn的输出[batch_size, sequence_length, embedding_size],目前采用maxpooling是最好的效果 #mean_pooling以及取最后一个step的向量,效果都不好 outputs = self._max_pooling(outputs) print outputs #outputs = outputs[-1] #outputs = outputs * mask[:, :, None] #mean pooling #outputs = tf.reduce_sum(outputs, 0) / (tf.reduce_sum(mask, 0)[:,None]) return outputs def _max_pooling(self, lstm): sequence_length, embedding_size = int(lstm.get_shape()[1]), int(lstm.get_shape()[2]) lstm = tf.expand_dims(lstm, -1) output = tf.nn.max_pool(lstm, ksize=[1, sequence_length, 1, 1], strides=[1, 1, 1, 1], padding='VALID') output = tf.reshape(output, [-1, embedding_size]) return output def __init__(self, config, is_training=True): self.keep_prob=tf.placeholder(tf.float32, name='dropout_keep_prob') self.batch_size=config.batch_size self.num_step=config.num_step self.qlist = tf.placeholder(tf.int32, [self.batch_size, self.num_step]) #这个版本没有使用mask self.mask_q = tf.placeholder(tf.float32, [self.num_step, self.batch_size]) self.plist = tf.placeholder(tf.int32, [self.batch_size, self.num_step]) self.mask_p = tf.placeholder(tf.float32, [self.num_step, self.batch_size]) self.nlist = tf.placeholder(tf.int32, 
[self.batch_size, self.num_step]) self.mask_n = tf.placeholder(tf.float32, [self.num_step, self.batch_size]) hidden_neural_size=config.hidden_neural_size vocabulary_size=config.vocabulary_size self.embed_dim=config.embed_dim hidden_layer_num=config.hidden_layer_num #fw_cell = tf.contrib.rnn.BasicLSTMCell(hidden_neural_size,forget_bias=1.0,state_is_tuple=True) fw_cell = tf.contrib.rnn.GRUCell(num_units=hidden_neural_size, activation=tf.nn.relu) fw_cell = tf.contrib.rnn.DropoutWrapper( fw_cell,output_keep_prob=self.keep_prob ) #bw_cell = tf.contrib.rnn.BasicLSTMCell(hidden_neural_size,forget_bias=1.0,state_is_tuple=True) bw_cell = tf.contrib.rnn.GRUCell(num_units=hidden_neural_size, activation=tf.nn.relu) bw_cell = tf.contrib.rnn.DropoutWrapper( bw_cell,output_keep_prob=self.keep_prob ) #embedding layer with tf.device("/cpu:1"),tf.name_scope("embedding_layer"): self.embedding = tf.Variable(tf.truncated_normal([vocabulary_size, self.embed_dim], stddev=0.1), name='W') #self.a_embedding = tf.Variable(tf.truncated_normal([vocabulary_size, self.embed_dim], stddev=0.1), name='W') q = self._rnn_net(self.qlist, mask_q, self.embedding, self.keep_prob, self.batch_size, self.embed_dim, self.num_step, fw_cell, bw_cell) tf.get_variable_scope().reuse_variables() p = self._rnn_net(self.plist, mask_p, self.embedding, self.keep_prob, self.batch_size, self.embed_dim, self.num_step, fw_cell, bw_cell) tf.get_variable_scope().reuse_variables() n = self._rnn_net(self.nlist, mask_n, self.embedding, self.keep_prob, self.batch_size, self.embed_dim, self.num_step, fw_cell, bw_cell) #len_1 = tf.clip_by_value(tf.sqrt(tf.reduce_sum(tf.multiply(q, q), 1)), 0.01, 100000) #len_2 = tf.clip_by_value(tf.sqrt(tf.reduce_sum(tf.multiply(p, p), 1)), 0.01, 100000) #len_3 = tf.clip_by_value(tf.sqrt(tf.reduce_sum(tf.multiply(n, n), 1)), 0.01, 100000) len_1 = tf.sqrt(tf.reduce_sum(tf.multiply(q, q), 1)) len_2 = tf.sqrt(tf.reduce_sum(tf.multiply(p, p), 1)) len_3 = tf.sqrt(tf.reduce_sum(tf.multiply(n, n), 1)) 
self.cos12 = tf.reduce_sum(tf.multiply(q, p), axis=1) / (len_1 * len_2) self.cos13 = tf.reduce_sum(tf.multiply(q, n), axis=1) / (len_1 * len_3) self.q = q self.p = p zero = tf.constant(np.zeros(self.batch_size, dtype='float32')) margin = tf.constant(np.full(self.batch_size, 0.1, dtype='float32')) diff = tf.cast(tf.maximum(zero, margin - self.cos12 + self.cos13), dtype='float32') self.cost = tf.reduce_sum(diff) self.accuracy = tf.reduce_sum(tf.cast(tf.equal(zero, diff), dtype='float32')) / float(self.batch_size) def train_step(model, qlist, plist, nlist, mask_q, mask_p, mask_n): fetches = [model.cost, model.accuracy, global_step, train_op, model.cos12, model.q, model.p, model.outputs] feed_dict = { model.qlist: qlist, model.plist: plist, model.nlist: nlist, model.mask_q : mask_q, model.mask_p : mask_p, model.mask_n : mask_n, model.keep_prob: config.keep_prob } cost, accuracy, step, _, cos12, q, p, outputs = sess.run(fetches, feed_dict) time_str = datetime.datetime.now().isoformat() print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, cost, accuracy)) def dev_step(model, vocab, batch_size, max_len): score_list, i = [], int(0) while True: qlist, plist, nlist, mask_q, mask_p, mask_n = insurance_qa_data_helpers.load_val_data(test_list, vocab, i, FLAGS.batch_size, max_len) feed_dict = { model.qlist: qlist, model.plist: plist, model.nlist: nlist, model.mask_q : mask_q, model.mask_p : mask_p, model.mask_n : mask_n, model.keep_prob: float(1.0) } batch_scores = sess.run([model.cos12], feed_dict) for score in batch_scores[0]: score_list.append(score) i += FLAGS.batch_size if i >= len(test_list): break insurance_qa_data_helpers.evaluation(score_list, test_list) tf.flags.DEFINE_integer('evaluate_every',10000,'evaluate every') tf.flags.DEFINE_integer('batch_size',64,'the batch_size of the training procedure') tf.flags.DEFINE_integer('emdedding_dim',100,'embedding dim') tf.flags.DEFINE_integer('hidden_neural_size',200,'LSTM hidden neural size') 
tf.flags.DEFINE_integer('hidden_layer_num',1,'LSTM hidden layer num') tf.flags.DEFINE_integer('max_len',100,'max_len of training sentence') tf.flags.DEFINE_float('init_scale',0.1,'init scale') tf.flags.DEFINE_float('keep_prob',0.5,'dropout rate') tf.flags.DEFINE_integer('num_epoch',1000000,'num epoch') tf.flags.DEFINE_integer('max_grad_norm',5,'max_grad_norm') # Misc Parameters tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") FLAGS = tf.flags.FLAGS FLAGS._parse_flags() vocab = insurance_qa_data_helpers.build_vocab() train_list = insurance_qa_data_helpers.load_train_list() qlist, plist, nlist, mask_q, mask_p, mask_n = insurance_qa_data_helpers.load_train_data(train_list, vocab, FLAGS.batch_size, FLAGS.max_len) test_list = insurance_qa_data_helpers.load_test_list() class Config(object): hidden_neural_size=FLAGS.hidden_neural_size vocabulary_size=len(vocab) embed_dim=FLAGS.emdedding_dim hidden_layer_num=FLAGS.hidden_layer_num keep_prob=FLAGS.keep_prob batch_size = FLAGS.batch_size num_step = FLAGS.max_len max_grad_norm=FLAGS.max_grad_norm num_epoch = FLAGS.num_epoch config = Config() eval_config=Config() eval_config.keep_prob=1.0 with tf.Graph().as_default(): with tf.device('/gpu:1'): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.Session(config=session_conf) with sess.as_default(): initializer = tf.random_uniform_initializer(-1*FLAGS.init_scale,1*FLAGS.init_scale) with tf.variable_scope("model",reuse=None,initializer=initializer): model = RNN_Model(config=config, is_training=True) # Define Training procedure global_step = tf.Variable(0, name="global_step", trainable=False) #optimizer = tf.train.RMSPropOptimizer(0.01) #optimizer = tf.train.AdamOptimizer(0.1) optimizer = tf.train.GradientDescentOptimizer(0.2) grads_and_vars = 
optimizer.compute_gradients(model.cost) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) # Initialize all variables sess.run(tf.global_variables_initializer()) for i in range(config.num_epoch): qlist, plist, nlist, mask_q, mask_p, mask_n = insurance_qa_data_helpers.load_train_data(train_list, vocab, FLAGS.batch_size, FLAGS.max_len) train_step(model, qlist, plist, nlist, mask_q, mask_p, mask_n) current_step = tf.train.global_step(sess, global_step) if current_step % FLAGS.evaluate_every == 0: dev_step(model, vocab, FLAGS.batch_size, FLAGS.max_len) ================================================ FILE: swem/swem_hier.py ================================================ import numpy as np import tensorflow as tf import time, os, random, datetime, sys from sklearn import metrics sys.path.append('../') import config, utils ################################################################################ # Insurance-QA # AUC 0.96, top 1 precision:31% # # quora-data # best precision: 0.8369, best threshold:0.62 ################################################################################ class SWEM_HIER(object): def __init__(self, sequence_length, vocab_size, embedding_size, embeddings): self.x1 = tf.placeholder(tf.int32, [None, sequence_length]) self.x2 = tf.placeholder(tf.int32, [None, sequence_length]) self.y = tf.placeholder(tf.float32, [None]) self.one = tf.placeholder(tf.float32, [None]) #self.dropout_keep_prob = tf.placeholder(tf.float32) with tf.device('/cpu:0'), tf.name_scope('embedding'): self.word_mat = tf.Variable(embeddings, trainable=True, dtype=tf.float32) x1_mat = tf.nn.embedding_lookup(self.word_mat, self.x1) x2_mat = tf.nn.embedding_lookup(self.word_mat, self.x2) self.x1_mat_exp = tf.expand_dims(x1_mat, -1) self.x2_mat_exp = tf.expand_dims(x2_mat, -1) p1 = tf.nn.avg_pool(self.x1_mat_exp, ksize=[1, 2, 1, 1], strides=[1, 1, 1, 1], padding='VALID') p2 = tf.nn.avg_pool(self.x2_mat_exp, ksize=[1, 2, 1, 1], strides=[1, 1, 1, 1], 
padding='VALID') p1 = tf.reshape(tf.reduce_max(p1, 1), [-1, embedding_size]) p2 = tf.reshape(tf.reduce_max(p2, 1), [-1, embedding_size]) """ p11 = tf.nn.avg_pool(self.x1_mat_exp, ksize=[1, 3, 1, 1], strides=[1, 1, 1, 1], padding='VALID') p21 = tf.nn.avg_pool(self.x2_mat_exp, ksize=[1, 3, 1, 1], strides=[1, 1, 1, 1], padding='VALID') p11 = tf.reshape(tf.reduce_max(p11, 1), [-1, embedding_size]) p21 = tf.reshape(tf.reduce_max(p21, 1), [-1, embedding_size]) p1 = tf.concat([p1, p11], 1) p2 = tf.concat([p2, p21], 1) """ self.cos = self.cosine(p1, p2) self.losses = self.logloss(self.y, self.one, self.cos) def logloss(self, y, v_one, sim): a = tf.multiply(y, tf.log(sim)) #y*log(p) b = tf.subtract(v_one, y)#1-y c = tf.log(tf.subtract(v_one, sim))#log(1-p) losses = -tf.add(a, tf.multiply(b, c))#y*log(p)+(1-y)*log(1-p) losses = tf.reduce_sum(losses, -1) return losses def cosine(self, t1, t2): len1 = tf.sqrt(tf.reduce_sum(tf.multiply(t1, t1), 1)) len2 = tf.sqrt(tf.reduce_sum(tf.multiply(t2, t2), 1)) multiply = tf.reduce_sum(tf.multiply(t1, t2), 1) cos = tf.div(multiply, tf.multiply(len1, len2)) return tf.clip_by_value(cos, 1e-5, 0.99999) def get_constant(batch_size): one, zero = [1.0] * batch_size, [0.0] * batch_size return np.array(one), np.array(zero) max_len = 100 num_epoch = 200000 batch_size = 256 checkpoint_every = 10000 vocab, embeddings = utils.load_embeddings() embedding_size = len(embeddings[0]) train_data, test_data = utils.load_train_data(vocab, max_len), utils.load_test_data(vocab, max_len) print('load data done ......') print(embeddings.shape) prev_auc = 0.0 with tf.Graph().as_default(): session_conf = tf.ConfigProto( allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): swem = SWEM_HIER(max_len, len(vocab), embedding_size, embeddings) global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = tf.train.AdamOptimizer(1e-1) #optimizer = tf.train.GradientDescentOptimizer(1e-1) 
grads_and_vars = optimizer.compute_gradients(swem.losses) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) timestamp = str(int(time.time())) out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.all_variables()) sess.run(tf.initialize_all_variables()) def train_step(): y, x1, x2 = utils.gen_train_batch_yxx(train_data, batch_size) one, zero = get_constant(batch_size) feed_dict = {swem.x1:x1, swem.x2:x2, swem.y:y, swem.one:one} _, step, loss, cos = sess.run( [train_op, global_step, swem.losses, swem.cos], feed_dict) time_str = datetime.datetime.now().isoformat() print("{}: step {}, loss {:g}".format(time_str, step, loss)) def test_step(): yp, y, group = [], [], [] for i in range(0, len(test_data), batch_size): f, g, x1, x2 = utils.gen_test_batch_yxx(test_data, i, i + batch_size) one, zero = get_constant(len(f)) feed_dict = {swem.x1:x1, swem.x2:x2, swem.y:f, swem.one:one} loss, cos = sess.run([swem.losses, swem.cos], feed_dict) yp.extend(cos) y.extend(f) group.extend(g) ppp = [(_y, _g, _yp) for _y, _g, _yp in zip(y, group, yp)] #for _y, _g, _yp in ppp: # print(str(_y) + ' ' + str(_g) + ' ' + str(_yp)) return y[:len(test_data)], group[:len(test_data)], yp[:len(test_data)] for i in range(num_epoch): train_step() current_step = tf.train.global_step(sess, global_step) if current_step % checkpoint_every == 0: y, g, yp = test_step() utils._eval(y, g, yp) #utils.save_features(features[0] + features[1] + features[2], './data/gen_sweg_hier_train.f') #utils.save_features(features[3], './data/gen_sweg_hier_test.f') ================================================ FILE: swem/swem_hier_margin.py ================================================ import numpy as np import tensorflow as tf import time, os, random, 
datetime, sys from sklearn import metrics sys.path.append('../') import config, utils #top 1 precision:54% class SWEM_HIER(object): def __init__(self, margin, sequence_length, vocab_size, embedding_size, embeddings): self.zero = tf.placeholder(tf.float32, [None]) self.q = tf.placeholder(tf.int32, [None, sequence_length]) self.qp = tf.placeholder(tf.int32, [None, sequence_length]) self.qn = tf.placeholder(tf.int32, [None, sequence_length]) with tf.device('/cpu:0'), tf.name_scope('embedding'): self.word_mat = tf.Variable(embeddings, trainable=True, dtype=tf.float32) q_mat = tf.nn.embedding_lookup(self.word_mat, self.q) qp_mat = tf.nn.embedding_lookup(self.word_mat, self.qp) qn_mat = tf.nn.embedding_lookup(self.word_mat, self.qn) self.q_mat_exp = tf.expand_dims(q_mat, -1) self.qp_mat_exp = tf.expand_dims(qp_mat, -1) self.qn_mat_exp = tf.expand_dims(qn_mat, -1) self.word_mat1 = tf.Variable(embeddings, trainable=True, dtype=tf.float32) q_mat1 = tf.nn.embedding_lookup(self.word_mat1, self.q) qp_mat1 = tf.nn.embedding_lookup(self.word_mat1, self.qp) qn_mat1 = tf.nn.embedding_lookup(self.word_mat1, self.qn) self.q_mat_exp1 = tf.expand_dims(q_mat1, -1) self.qp_mat_exp1 = tf.expand_dims(qp_mat1, -1) self.qn_mat_exp1 = tf.expand_dims(qn_mat1, -1) q = tf.nn.avg_pool(self.q_mat_exp, ksize=[1, 2, 1, 1], strides=[1, 1, 1, 1], padding='VALID') qp = tf.nn.avg_pool(self.qp_mat_exp, ksize=[1, 2, 1, 1], strides=[1, 1, 1, 1], padding='VALID') qn = tf.nn.avg_pool(self.qn_mat_exp, ksize=[1, 2, 1, 1], strides=[1, 1, 1, 1], padding='VALID') q = tf.reshape(tf.reduce_max(q, 1), [-1, embedding_size]) qp = tf.reshape(tf.reduce_max(qp, 1), [-1, embedding_size]) qn = tf.reshape(tf.reduce_max(qn, 1), [-1, embedding_size]) q1 = tf.nn.avg_pool(self.q_mat_exp1, ksize=[1, 1, 1, 1], strides=[1, 1, 1, 1], padding='VALID') qp1 = tf.nn.avg_pool(self.qp_mat_exp1, ksize=[1, 1, 1, 1], strides=[1, 1, 1, 1], padding='VALID') qn1 = tf.nn.avg_pool(self.qn_mat_exp1, ksize=[1, 1, 1, 1], strides=[1, 1, 1, 1], 
padding='VALID') q1 = tf.reshape(tf.reduce_max(q1, 1), [-1, embedding_size]) qp1 = tf.reshape(tf.reduce_max(qp1, 1), [-1, embedding_size]) qn1 = tf.reshape(tf.reduce_max(qn1, 1), [-1, embedding_size]) q = tf.concat([q, q1], 1) qp = tf.concat([qp, qp1], 1) qn = tf.concat([qn, qn1], 1) self.cos_q_qp = self.cosine(q, qp) self.cos_q_qn = self.cosine(q, qn) self.losses, loss_batch = self.margin_loss(self.zero, margin, self.cos_q_qp, self.cos_q_qn) correct = tf.equal(self.zero, loss_batch) self.accuracy = tf.reduce_mean(tf.cast(correct, "float")) def margin_loss(self, zero, margin, cos_q_qp, cos_q_qn): loss_batch = tf.maximum(zero, tf.subtract(margin, tf.subtract(cos_q_qp, cos_q_qn))) losses = tf.reduce_sum(loss_batch) return losses, loss_batch def logloss(self, y, v_one, sim): a = tf.multiply(y, tf.log(sim)) #y*log(p) b = tf.subtract(v_one, y)#1-y c = tf.log(tf.subtract(v_one, sim))#log(1-p) losses = -tf.add(a, tf.multiply(b, c))#y*log(p)+(1-y)*log(1-p) losses = tf.reduce_sum(losses, -1) return losses def cosine(self, t1, t2): len1 = tf.sqrt(tf.reduce_sum(tf.multiply(t1, t1), 1)) len2 = tf.sqrt(tf.reduce_sum(tf.multiply(t2, t2), 1)) multiply = tf.reduce_sum(tf.multiply(t1, t2), 1) cos = tf.div(multiply, tf.multiply(len1, len2)) return tf.clip_by_value(cos, 1e-5, 0.99999) def get_constant(batch_size): one, zero = [1.0] * batch_size, [0.0] * batch_size return np.array(one), np.array(zero) margin = 0.05 max_len = 200 num_epoch = 200000 batch_size = 256 checkpoint_every = 50000 vocab, embeddings = utils.load_embeddings() embedding_size = len(embeddings[0]) train_data, test_data = utils.load_train_data(vocab, max_len), utils.load_test_data(vocab, max_len) print('load data done ......') print(embeddings.shape) prev_auc = 0.0 with tf.Graph().as_default(): session_conf = tf.ConfigProto( allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): swem = SWEM_HIER(margin, max_len, len(vocab), embedding_size, embeddings) 
global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = tf.train.AdamOptimizer(1e-1) #optimizer = tf.train.GradientDescentOptimizer(1e-1) grads_and_vars = optimizer.compute_gradients(swem.losses) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) timestamp = str(int(time.time())) out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.all_variables()) sess.run(tf.initialize_all_variables()) def train_step(): q, qp, qn = utils.gen_train_batch_qpn(train_data, batch_size) one, zero = get_constant(batch_size) feed_dict = {swem.q:q, swem.qp:qp, swem.qn:qn, swem.zero:zero} _, step, loss, cos, acc = sess.run( [train_op, global_step, swem.losses, swem.cos_q_qp, swem.accuracy], feed_dict) time_str = datetime.datetime.now().isoformat() print("{}: step {}, loss {:g}, acc{:g}".format(time_str, step, loss, acc)) def test_step(): yp, y, group = [], [], [] for i in range(0, len(test_data), batch_size): f, g, q1, q2 = utils.gen_test_batch_qpn(test_data, i, i+batch_size) one, zero = get_constant(len(f)) feed_dict = {swem.q:q1, swem.qp:q2, swem.qn:q2, swem.zero:zero} loss, cos = sess.run([swem.losses, swem.cos_q_qp], feed_dict) yp.extend(cos) y.extend(f) group.extend(g) ppp = [(_y, _g, _yp) for _y, _g, _yp in zip(y, group, yp)] #for _y, _g, _yp in ppp: # print(str(_y) + ' ' + str(_g) + ' ' + str(_yp)) return y[:len(test_data)], group[:len(test_data)], yp[:len(test_data)] for i in range(num_epoch): train_step() current_step = tf.train.global_step(sess, global_step) if current_step % checkpoint_every == 0: y, g, yp = test_step() auc = utils.eval_auc(y, g, yp) top1_prec = utils._eval_top1_prec(y, g, yp) #if auc < prev_auc: # _flist = [(_f, [s]) for s, _f in zip(score[:len(test_data)], flags)] # 
features.append(_flist) # break #prev_auc = auc #utils.save_features(features[0] + features[1] + features[2], './data/gen_sweg_hier_train.f') #utils.save_features(features[3], './data/gen_sweg_hier_test.f') ================================================ FILE: swem/swem_max_margin.py ================================================ import numpy as np import tensorflow as tf import time, os, random, datetime, sys from sklearn import metrics sys.path.append('../') import config, utils class SWEM_HIER(object): def __init__(self, margin, sequence_length, vocab_size, embedding_size, embeddings): self.zero = tf.placeholder(tf.float32, [None]) self.q = tf.placeholder(tf.int32, [None, sequence_length]) self.qp = tf.placeholder(tf.int32, [None, sequence_length]) self.qn = tf.placeholder(tf.int32, [None, sequence_length]) with tf.device('/cpu:0'), tf.name_scope('embedding'): self.word_mat = tf.Variable(embeddings, trainable=True, dtype=tf.float32) q_mat = tf.nn.embedding_lookup(self.word_mat, self.q) qp_mat = tf.nn.embedding_lookup(self.word_mat, self.qp) qn_mat = tf.nn.embedding_lookup(self.word_mat, self.qn) self.q_mat_exp = tf.expand_dims(q_mat, -1) self.qp_mat_exp = tf.expand_dims(qp_mat, -1) self.qn_mat_exp = tf.expand_dims(qn_mat, -1) """ q = tf.nn.avg_pool(self.q_mat_exp, ksize=[1, 2, 1, 1], strides=[1, 1, 1, 1], padding='VALID') qp = tf.nn.avg_pool(self.qp_mat_exp, ksize=[1, 2, 1, 1], strides=[1, 1, 1, 1], padding='VALID') qn = tf.nn.avg_pool(self.qn_mat_exp, ksize=[1, 2, 1, 1], strides=[1, 1, 1, 1], padding='VALID') """ q = tf.reshape(tf.reduce_max(self.q_mat_exp, 1), [-1, embedding_size]) qp = tf.reshape(tf.reduce_max(self.qp_mat_exp, 1), [-1, embedding_size]) qn = tf.reshape(tf.reduce_max(self.qn_mat_exp, 1), [-1, embedding_size]) self.cos_q_qp = self.cosine(q, qp) self.cos_q_qn = self.cosine(q, qn) self.losses, loss_batch = self.margin_loss(self.zero, margin, self.cos_q_qp, self.cos_q_qn) correct = tf.equal(self.zero, loss_batch) self.accuracy = 
tf.reduce_mean(tf.cast(correct, "float")) def margin_loss(self, zero, margin, cos_q_qp, cos_q_qn): loss_batch = tf.maximum(zero, tf.subtract(margin, tf.subtract(cos_q_qp, cos_q_qn))) losses = tf.reduce_sum(loss_batch) return losses, loss_batch def logloss(self, y, v_one, sim): a = tf.multiply(y, tf.log(sim)) #y*log(p) b = tf.subtract(v_one, y)#1-y c = tf.log(tf.subtract(v_one, sim))#log(1-p) losses = -tf.add(a, tf.multiply(b, c))#y*log(p)+(1-y)*log(1-p) losses = tf.reduce_sum(losses, -1) return losses def cosine(self, t1, t2): len1 = tf.sqrt(tf.reduce_sum(tf.multiply(t1, t1), 1)) len2 = tf.sqrt(tf.reduce_sum(tf.multiply(t2, t2), 1)) multiply = tf.reduce_sum(tf.multiply(t1, t2), 1) cos = tf.div(multiply, tf.multiply(len1, len2)) return tf.clip_by_value(cos, 1e-5, 0.99999) def get_constant(batch_size): one, zero = [1.0] * batch_size, [0.0] * batch_size return np.array(one), np.array(zero) margin = 0.05 max_len = 200 num_epoch = 200000 batch_size = 256 checkpoint_every = 50000 vocab, embeddings = utils.load_embeddings() embedding_size = len(embeddings[0]) train_data, test_data = utils.load_train_data(vocab, max_len), utils.load_test_data(vocab, max_len) print('load data done ......') print(embeddings.shape) prev_auc = 0.0 with tf.Graph().as_default(): session_conf = tf.ConfigProto( allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): swem = SWEM_HIER(margin, max_len, len(vocab), embedding_size, embeddings) global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = tf.train.AdamOptimizer(1e-1) #optimizer = tf.train.GradientDescentOptimizer(1e-1) grads_and_vars = optimizer.compute_gradients(swem.losses) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) timestamp = str(int(time.time())) out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) checkpoint_prefix = 
os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.all_variables()) sess.run(tf.initialize_all_variables()) def train_step(): q, qp, qn = utils.gen_train_batch_qpn(train_data, batch_size) one, zero = get_constant(batch_size) feed_dict = {swem.q:q, swem.qp:qp, swem.qn:qn, swem.zero:zero} _, step, loss, cos, acc = sess.run( [train_op, global_step, swem.losses, swem.cos_q_qp, swem.accuracy], feed_dict) time_str = datetime.datetime.now().isoformat() print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, acc)) def test_step(): yp, y, group = [], [], [] for i in range(0, len(test_data), batch_size): f, g, q1, q2 = utils.gen_test_batch_qpn(test_data, i, i+batch_size) one, zero = get_constant(len(f)) feed_dict = {swem.q:q1, swem.qp:q2, swem.qn:q2, swem.zero:zero} loss, cos = sess.run([swem.losses, swem.cos_q_qp], feed_dict) yp.extend(cos) y.extend(f) group.extend(g) ppp = [(_y, _g, _yp) for _y, _g, _yp in zip(y, group, yp)] #for _y, _g, _yp in ppp: # print(str(_y) + ' ' + str(_g) + ' ' + str(_yp)) return y[:len(test_data)], group[:len(test_data)], yp[:len(test_data)] for i in range(num_epoch): train_step() current_step = tf.train.global_step(sess, global_step) if current_step % checkpoint_every == 0: y, g, yp = test_step() auc = utils.eval_auc(y, g, yp) top1_prec = utils._eval_top1_prec(y, g, yp) #if auc < prev_auc: # _flist = [(_f, [s]) for s, _f in zip(score[:len(test_data)], flags)] # features.append(_flist) # break #prev_auc = auc #utils.save_features(features[0] + features[1] + features[2], './data/gen_sweg_hier_train.f') #utils.save_features(features[3], './data/gen_sweg_hier_test.f') ================================================ FILE: utils.py ================================================ import numpy as np import random, sys, config from sklearn import metrics from operator import itemgetter from itertools import groupby def load_embeddings(): _data, embeddings, 
vocab, _id = [], [], {}, int(0) for line in open(config.w2v_bin_file): _data.append(line.strip().split(' ')) size, dim = int(_data[0][0]), int(_data[0][1]) for i in range(1, len(_data)): w, vec = _data[i][0], [float(_data[i][k]) for k in range(1, dim+1)] embeddings.append(vec) vocab[w] = _id _id += 1 embeddings.append([0.01] * dim) vocab['UNKNOWN'] = _id _id += 1 embeddings.append([0.01] * dim) vocab[''] = _id return vocab, np.array(embeddings) def encode_sent(s, vocab, max_len): ws = [w for w in s.split('_')] ws = ws[:max_len] if len(ws) >= max_len else ws + [''] * (max_len - len(ws)) nws = [] for w in ws: nw = w if w in vocab else 'UNKNOWN' nws.append(vocab[nw]) return nws def load_train_data(vocab, max_len): if config.dataset == config.dataset_ins: return ins_load_train_data(vocab, max_len) if config.dataset == config.dataset_qur: return qur_load_train_test_data(config.train_file, vocab, max_len) print('bad load_train_data') exit(1) def qur_load_train_test_data(_file, vocab, max_len): _data = [] for line in open(_file): f, q1, q2 = line.strip().split(' ') q1, q2 = encode_sent(q1, vocab, max_len), encode_sent(q2, vocab, max_len) _data.append((int(f), q1, q2)) return _data def ins_load_train_data(vocab, max_len): _data = [] for line in open(config.train_file): f, q1, q2 = line.strip().split(' ') q1, q2 = encode_sent(q1, vocab, max_len), encode_sent(q2, vocab, max_len) _data.append((q1, q2)) return _data def load_test_data(vocab, max_len): if config.dataset == config.dataset_ins: return ins_load_test_data(vocab, max_len) if config.dataset == config.dataset_qur: return qur_load_train_test_data(config.test1_file, vocab, max_len) print('bad load_test_data') exit(1) def ins_load_test_data(vocab, max_len): _data = [] for line in open(config.test1_file): f, g, q1, q2 = line.strip().split(' ') q1, q2 = encode_sent(q1, vocab, max_len), encode_sent(q2, vocab, max_len) _data.append((f, g, q1, q2)) return _data def gen_train_batch_qpn(_data, batch_size): psample = 
random.sample(_data, batch_size) nsample = random.sample(_data, batch_size) q = [s1 for s1, s2 in psample] qp = [s2 for s1, s2 in psample] qn = [s2 for s1, s2 in nsample] return np.array(q), np.array(qp), np.array(qn) def gen_train_batch_yxx(_data, batch_size): if config.dataset == config.dataset_ins: return ins_gen_train_batch_yxx(_data, batch_size) if config.dataset == config.dataset_qur: return qur_gen_train_batch_yxx(_data, batch_size) print('bad gen_train_batch_yxx') exit(1) def qur_gen_train_batch_yxx(_data, batch_size): sample = random.sample(_data, batch_size) y = [i for i,_,_ in sample] x1 = [i for _,i,_ in sample] x2 = [i for _,_,i in sample] return np.array(y), np.array(x1), np.array(x2) def ins_gen_train_batch_yxx(_data, batch_size): part_one, part_two = int(batch_size / 4 * 3), int(batch_size / 4) psample = random.sample(_data, part_one) nsample = random.sample(_data, part_two) y = [1.0] * part_one + [0.0] * part_two x1 = [s1 for s1, s2 in psample] + [s1 for s1, s2 in psample[:part_two]] x2 = [s2 for s1, s2 in psample] + [s2 for s1, s2 in nsample] return np.array(y), np.array(x1), np.array(x2) def gen_test_batch_qpn(_data, start, end): sample = _data[start:end] for i in range(len(sample), end - start): sample.append(sample[-1]) f = [int(i) for i,_,_,_ in sample] g = [int(i) for _,i,_,_ in sample] q1 = [i for _,_,i,_ in sample] q2 = [i for _,_,_,i in sample] return f, g, np.array(q1), np.array(q2) def gen_test_batch_yxx(_data, start, end): if config.dataset == config.dataset_ins: return ins_gen_test_batch_yxx(_data, start, end) if config.dataset == config.dataset_qur: return qur_gen_test_batch_yxx(_data, start, end) print('bad gen_test_batch_yxx') exit(1) def qur_gen_test_batch_yxx(_data, start, end): sample = _data[start:end] y = [i for i,_,_ in sample] x1 = [i for _,i,_ in sample] x2 = [i for _,_,i in sample] return y, y, np.array(x1), np.array(x2) def ins_gen_test_batch_yxx(_data, start, end): sample = _data[start:end] for i in range(len(sample), end 
- start): sample.append(sample[-1]) f = [int(i) for i,_,_,_ in sample] g = [int(i) for _,i,_,_ in sample] q1 = [i for _,_,i,_ in sample] q2 = [i for _,_,_,i in sample] return f, g, np.array(q1), np.array(q2) def _eval(y, g, yp): if config.dataset == config.dataset_ins: eval_auc(y, g, yp) eval_top1_prec(y, g, yp) if config.dataset == config.dataset_qur: eval_auc(y, g, yp) eval_best_prec(y, g, yp) def eval_best_prec(y, g, yp): best_p, best_s = 0.0, 0.0 for i in range(50, 100, 1): i = float(i) / 100 positive = 0 for _y, _yp in zip(y, yp): p = 1 if _yp >= i else 0 if p == _y: positive += 1 prec = positive / len(yp) if prec > best_p: best_p = prec best_s = i print('best_prec: ' + str(best_p) + ' best_threshold:' + str(best_s)) return best_p, best_s def eval_auc(y, g, yp): auc = metrics.roc_auc_score(y, yp) print('auc: ' + str(auc)) return auc def eval_top1_prec(y, g, yp): _list = [(_y, _g, _yp) for _y, _g, _yp in zip(y, g, yp)] _dict = {} for _y, _g, _yp in _list: if not _g in _dict: _dict[_g] = [] _dict[_g].append((_y, _g, _yp)) positive, gc = 0 , 0 for _, group in _dict.items(): group = sorted(group, key=itemgetter(2), reverse=True) gc += 1 if group[0][0] == 1: positive += 1 prec = positive / gc print('top1 precision ' + str(positive) + '/' + str(gc) + ': '+ str(positive / gc)) return prec