Repository: liu-nlper/DocumentClassification
Branch: master
Commit: b6c6cce3638a
Files: 20
Total size: 38.2 KB

Directory structure:
gitextract__8lu9u9y/

├── README.md
└── code/
    ├── Data/
    │   └── corpus/
    │       └── corpus download link
    ├── README
    ├── TFNN/
    │   ├── __init__.py
    │   ├── activations.py
    │   ├── layers/
    │   │   ├── ConvolutionalLayer.py
    │   │   ├── DenseLayer.py
    │   │   ├── EmbeddingLayer.py
    │   │   └── __init__.py
    │   ├── objectives.py
    │   └── utils/
    │       ├── __init__.py
    │       ├── data_util.py
    │       ├── evaluate_util.py
    │       ├── io_util.py
    │       └── tensor_util.py
    ├── configurations.py
    ├── load_data.py
    ├── model_dc.py
    ├── prepare_data.py
    └── train_w2v_model.py

================================================
FILE CONTENTS
================================================

================================================
FILE: README.md
================================================
# Document Classification
This code implements a simple CNN model for document classification with tensorflow.

# Model Structure
![model.png](https://github.com/MRliujiaxin/DocumentClassification/raw/master/model.png)

# Requirements
- Python: 2.7
- Tensorflow: 1.0.0
- Numpy: 1.12.1
- sklearn: 0.18.1
- gensim: 1.0.1
- pickle


================================================
FILE: code/Data/corpus/corpus download link
================================================
数据下载地址：http://competition.ai100.com.cn/html/game_det.html?id=24

将分词/词性标注后的文件命名为training.seg.csv和testing.seg.csv，放置到当前目录下。

处理后的文本如下所示:
  "公司/n 是/vshi 经/p 批准/v 依法/d 从事/vi 融资/vi 性/ng 担保/vn 业务/n 的/ude1 ..."


================================================
FILE: code/README
================================================
下载数据到Data/corpus目录下，并做分词和词性标注处理，再按以下顺序执行:
    train_w2v_model.py -> prepare_data.py -> model_dc.py


文件说明:

1. 分词、词性标注采用中科院NLPIR，处理后文件:
    ./Data/corpus/training.seg.csv
    ./Data/corpus/testing.seg.csv

2. configurations.py
    配置文件

3. train_w2v_model.py
    利用官方给的train, test训练词向量

4. prepare_data.py
    构建词表，词性表等

5. load_data.py
    加载数据

6. model_dc.py
    训练模型并预测


================================================
FILE: code/TFNN/__init__.py
================================================


================================================
FILE: code/TFNN/activations.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
    激活函数
"""
import tensorflow as tf


def get_activation(activation=None):
    """
    Look up a TensorFlow activation function by its name.
    Args:
        activation: str or None, one of 'tanh', 'relu', 'softmax';
            None means "no activation".
    Return:
        The matching function from tf.nn, or None when activation is None.
    Raises:
        Exception: if the name is not one of the supported activations.
    """
    if activation is None:
        return None
    # Each supported name is also the attribute name of the function in tf.nn.
    for known_name in ('tanh', 'relu', 'softmax'):
        if activation == known_name:
            return getattr(tf.nn, known_name)
    raise Exception('Unknow activation function: %s' % activation)


================================================
FILE: code/TFNN/layers/ConvolutionalLayer.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
DNN Layers:
    Convolutional1D

"""
import numpy as np
import tensorflow as tf
# from ..initializations import normal_weight
from ..activations import get_activation


class Convolutional1D(object):
    """1D convolution over a 3D input, with optional max-pooling.

    The input gets a trailing channel axis of size 1 and is convolved with
    tf.nn.conv2d using kernels of shape
    [filter_length, in_width, 1, nb_filter], i.e. each kernel spans the whole
    last axis of the input. The output tensor is built in the constructor
    (through call()) and exposed by the `output` property.
    """

    def __init__(self, input_data, filter_length, nb_filter, strides=[1, 1, 1, 1],
                 padding='VALID', activation='tanh', pooling=True,
                 name='Convolutional1D'):
        """Build the convolution variables and the output tensor.
        Args:
            input_data: 3D tensor of shape=[batch_size, in_height, in_width];
                in_channels is set to 1 when using Convolutional1D.
            filter_length: int, kernel length; the kernel shape is
                [filter_length, in_width, in_channels, nb_filter].
            nb_filter: int, number of kernels.
            strides: list of 4 ints passed to tf.nn.conv2d.
            padding: only 'VALID' is supported for now.
            activation: str or None, name understood by get_activation.
            pooling: bool, whether to max-pool over the height axis.
            name: str, prefix of the name scope holding the variables.
        Raises:
            AssertionError: if padding is not 'VALID'.
        """
        # ('VALID',) must be a real tuple: with a bare string the `in` test
        # is a substring check and would accept '' or 'VAL'.
        assert padding in ('VALID',), 'Unknow padding %s' % padding
        # assert padding in ('VALID', 'SAME'), 'Unknow padding %s' % padding

        # The unpacking also enforces that the input is exactly 3D.
        _, in_width = map(int, input_data.get_shape()[1:])
        self._input_data = tf.expand_dims(input_data, -1)  # shape=[x, x, x, 1]
        self._filter_length = filter_length
        self._nb_filter = nb_filter
        # Copy so that the shared default list can never be mutated through
        # an instance.
        self._strides = list(strides)
        self._padding = padding
        self._activation = get_activation(activation)
        self._name = name
        self.pooling = pooling

        with tf.name_scope('%s_%d' % (name, filter_length)):
            if activation != 'relu':
                # Uniform initialisation within +/- sqrt(6 / (fan_in + fan_out)).
                # NOTE(review): fan_out is derived from in_width, not
                # in_height -- confirm that this is intended.
                fan_in = filter_length * in_width
                fan_out = nb_filter * (in_width-filter_length+1)
                w_bound = np.sqrt(6. / (fan_in + fan_out))
                self.weights = tf.Variable(
                    tf.random_uniform(
                        minval=-w_bound, maxval=w_bound, dtype='float32',
                        shape=[filter_length, in_width, 1, nb_filter]),
                    name='conv_weight')
            else:  # init weight for relu
                w_values = tf.random_normal(
                    shape=[filter_length, in_width, 1, nb_filter]
                ) * tf.sqrt(2. / (filter_length * in_width * nb_filter))
                self.weights = tf.Variable(w_values, name='conv_weight')
            # Record the weight histogram for both initialisation branches.
            tf.summary.histogram('weights', self.weights)
            # bias
            self.biases = tf.Variable(
                tf.constant(0.1, shape=[nb_filter, ]),
                name='conv_bias')
            tf.summary.histogram('biases', self.biases)

        self.call()

    def call(self):
        """Build the output tensor: convolution, bias, activation, pooling."""
        # With padding='VALID' and unit strides, conv_output's shape is
        #   [batch_size, in_height-filter_length+1, 1, nb_filter]
        conv_output = tf.nn.conv2d(
            input=self._input_data,
            filter=self.weights,
            strides=self._strides,
            padding=self._padding)

        # output's shape=[batch_size, new_height, 1, nb_filter]
        linear_output = tf.nn.bias_add(conv_output, self.biases)
        act_output = (
            linear_output if self._activation is None
            else self._activation(linear_output))
        if self.pooling:
            # max pooling over the height axis, shape=[?, nb_filter]
            self._output = tf.reduce_max(tf.squeeze(act_output, [2]), 1)
        else:
            self._output = tf.squeeze(act_output, axis=2)  # [?, n-w+1, nb_filter]

    @property
    def input_data(self):
        """The input tensor after the channel axis has been appended."""
        return self._input_data

    @property
    def output(self):
        """The tensor produced by call()."""
        return self._output

    @property
    def output_dim(self):
        """Size of the last output axis, i.e. the number of kernels."""
        return self._nb_filter

    def get_weights(self):
        """Return the kernel variable."""
        return self.weights


================================================
FILE: code/TFNN/layers/DenseLayer.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
DNN Layers:
    SoftmaxLayer

"""
import tensorflow as tf
from ..activations import get_activation


class SoftmaxDense(object):
    """Fully connected layer with a sparse softmax cross-entropy loss.

    The output is input_data * weights + biases, optionally passed through
    an activation. It is built in the constructor (through call()).
    """

    def __init__(self, input_data, input_dim, output_dim, weights=None,
                 biases=None, activation=None, name='Dense'):
        """Create (or reuse) the layer variables and build the output.
        Args:
            input_data: 2D tensor, shape=[batch_size, input_dim]
            input_dim: int, size of the input features
            output_dim: int, size of the output
            weights: optional existing weight variable to reuse
            biases: optional existing bias variable to reuse
            activation: str or None, name understood by get_activation
            name: str, name scope of the layer
        """
        rank = len(input_data.get_shape())
        assert rank == 2, \
            "全连接层的输入必须要flatten, 即shape=[batch_size, input_dim]"
        self._name = name
        self._input_data = input_data
        self._input_dim = input_dim
        self._output_dim = output_dim
        self._activation = get_activation(activation)

        with tf.name_scope(self._name):
            # weights: uniform within +/- sqrt(6 / (input_dim + output_dim))
            # unless an existing variable was handed in
            if weights is None:
                limit = tf.sqrt(6. / (input_dim + output_dim))
                initial_weights = tf.random_uniform(
                    shape=[input_dim, output_dim],
                    minval=-limit, maxval=limit, dtype='float32')
                weights = tf.Variable(initial_weights, name='weights')
            self._weights = weights
            tf.summary.histogram('weights', self._weights)
            # biases: constant 0.1 unless an existing variable was handed in
            if biases is None:
                initial_biases = tf.constant(0.1, shape=[self._output_dim])
                biases = tf.Variable(initial_biases, name='biases')
            self._biases = biases
            tf.summary.histogram('biases', biases)

        self.call()

    def call(self):
        """Build the output tensor of the layer."""
        affine = tf.matmul(self._input_data, self._weights) + self._biases
        if self._activation is None:
            self._output = affine
        else:
            self._output = self._activation(affine)

    def loss(self, y):
        """Mean sparse softmax cross-entropy between the output and labels y."""
        int_labels = tf.cast(y, tf.int32)
        per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.output, labels=int_labels, name='xentroy')
        return tf.reduce_mean(per_example, name='xentroy_mean')

    def get_pre_y(self):
        """Return the index of the largest output along axis 1."""
        # TODO: to be revised
        return tf.arg_max(input=self._output, dimension=1)

    @property
    def input_data(self):
        """Input tensor of the layer."""
        return self._input_data

    @property
    def input_dim(self):
        """Declared input size."""
        return self._input_dim

    @property
    def output_dim(self):
        """Declared output size."""
        return self._output_dim

    @property
    def name(self):
        """Name scope of the layer."""
        return self._name

    @property
    def weights(self):
        """Weight variable."""
        return self._weights

    @property
    def biases(self):
        """Bias variable."""
        return self._biases

    @property
    def output(self):
        """Output tensor built by call()."""
        return self._output


================================================
FILE: code/TFNN/layers/EmbeddingLayer.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
DNN Layers:
    Embedding

"""
import tensorflow as tf


class Embedding(object):
    """Embedding lookup followed by dropout.

    The embedding table is stored in a variable initialised from `params`;
    the rows selected by `ids` are looked up and passed through dropout.
    """

    def __init__(self, params, ids, name, keep_prob=1.0):
        """Build the embedding variable and the output tensor.
        Args:
            params: initial value of the embedding table
            ids: integer tensor of row indices to look up
            name: str, name scope of the layer
            keep_prob: keep probability given to tf.nn.dropout
        """
        with tf.name_scope('%s' % name):
            # The dtype must be passed by keyword: the second positional
            # argument of tf.Variable is `trainable`, not `dtype`.
            self._params = tf.Variable(params, dtype=tf.float32, name='embed')
            self._ids = ids

            # output
            embed_output = tf.nn.embedding_lookup(
                params=self._params,
                ids=self._ids
            )
            self._output = tf.nn.dropout(embed_output, keep_prob)

    @property
    def params(self):
        """The embedding table variable."""
        return self._params

    @property
    def output_dim(self):
        """Static size of the last axis of the output."""
        return int(self._output.get_shape()[-1])

    @property
    def output(self):
        """The looked-up embeddings after dropout."""
        return self._output

================================================
FILE: code/TFNN/layers/__init__.py
================================================


================================================
FILE: code/TFNN/objectives.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import tensorflow as tf


def categorical_crossentropy(y_true, y_pred):
    """
    Mean sparse softmax cross-entropy.
    Args:
        y_true: class indices, length=batch_size
        y_pred: 2D tensor with shape=[batch_size, nb_classes], passed as
            the logits
    Returns:
        scalar tensor, the mean of the per-example cross-entropy
    """
    per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y_true, logits=y_pred, name='xentroy')
    mean_loss = tf.reduce_mean(per_example, name='xentroy_mean')
    return mean_loss


================================================
FILE: code/TFNN/utils/__init__.py
================================================


================================================
FILE: code/TFNN/utils/data_util.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import pickle
import numpy as np
from collections import defaultdict


def flatten_list(nest_list):
    """
    Flatten an arbitrarily nested list.
    Args:
        nest_list: list, may contain further lists at any depth
    Return:
        flatten_list: list with the non-list elements in their original
            order; only `list` instances are expanded.
    """
    flat = []
    # Stack of iterators: the top one is the list currently being walked.
    pending = [iter(nest_list)]
    while pending:
        for element in pending[-1]:
            if isinstance(element, list):
                # Descend; the parent iterator resumes once this one is done.
                pending.append(iter(element))
                break
            flat.append(element)
        else:
            pending.pop()
    return flat


def create_dictionary(items, dic_path, start=0, sort=True,
                      min_count=None, lower=False, overwrite=False):
    """
    Build an item -> index dictionary and pickle it to dic_path.
    Args:
        items: list, [item_1, item_2, ...]
        dic_path: path of the pickle file to write
        start: int, first index of the vocabulary, 0 by default
        sort: bool, if True indices follow descending frequency;
            if False they follow the position in items (for a repeated
            item the last position wins)
        min_count: minimum frequency; rarer items are left out
            (only applied when sort is True)
        lower: bool, whether to lower-case the items
        overwrite: bool, whether to replace an existing file
    Returns:
        None
    """
    # NOTE(review): this rejects paths ending in 'pk'; presumably it guards
    # against a truncated '.pkl' extension -- confirm the intent.
    assert not dic_path.endswith('pk')
    if os.path.exists(dic_path) and not overwrite:
        return
    normalized = (item.lower() for item in items) if lower else items
    voc = dict()
    if sort:
        # count the items
        counts = defaultdict(int)
        for item in normalized:
            counts[item] += 1
        # rank by descending frequency (stable for ties)
        ranked = sorted(counts.items(), key=lambda d: d[1], reverse=True)
        for rank, (key, count) in enumerate(ranked):
            if min_count and min_count > count:
                continue
            voc[key] = rank + start
    else:  # keep the order of items
        for position, item in enumerate(normalized):
            voc[item] = position + start
    # `with` closes the file even if pickling fails.
    with open(dic_path, 'wb') as file_w:
        pickle.dump(voc, file_w)


def map_item2id(items, voc, max_len, none_word=0, lower=False):
    """
    Map a sequence of items (words, POS tags, ...) to their ids.
    Args:
        items: list, items to map
        voc: dict, item -> id
        max_len: int, length of the returned array; longer inputs are
            truncated and shorter ones leave the tail at 0
        none_word: int, id used for items missing from voc, 0 by default
        lower: bool, whether to lower-case each item before the lookup
    Returns:
        arr: np.array, dtype=int32, shape=[max_len,]
    """
    assert type(none_word) == int
    arr = np.zeros((max_len,), dtype='int32')
    # zip stops at the shorter of the two, which truncates to max_len.
    for position, item in zip(range(max_len), items):
        if lower:
            item = item.lower()
        if item in voc:
            arr[position] = voc[item]
        else:
            arr[position] = none_word
    return arr


def random_over_sampling():
    """
    Small demonstration of random over-sampling with imbalanced-learn.

    Resamples a three-sample toy data set with RandomOverSampler and prints
    the resampled features and labels. Takes no arguments and returns None.
    """
    # Imported here so that the rest of the module works without
    # imbalanced-learn installed.
    from imblearn.over_sampling import RandomOverSampler
    x = [[1, 1, 1], [2, 2, 2], [3, 3, 3]]
    y = [1, 2, 3]
    ros = RandomOverSampler(random_state=42)
    # Newer imbalanced-learn releases call this method fit_resample; fall
    # back to fit_sample for the old releases.
    resample = getattr(ros, 'fit_resample', None) or ros.fit_sample
    x_res, y_res = resample(x, y)
    print(x_res)
    print(y_res)


def demo():
    """Run the random over-sampling example."""
    random_over_sampling()


if __name__ == '__main__':
    # Run the demo only when this module is executed as a script.
    demo()


================================================
FILE: code/TFNN/utils/evaluate_util.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import numpy as np
from collections import defaultdict
import codecs


def _match_ratio(predicted, expected):
    """Return the fraction of positions where both label lists agree (0. if empty)."""
    if len(predicted) == 0:
        return 0.
    predicted = np.array(predicted, dtype='int32')
    expected = np.array(expected, dtype='int32')
    nb_equal = int(np.count_nonzero(predicted == expected))
    return nb_equal / float(len(predicted))


def sim_compute(pro_labels, right_labels, ignore_label=None):
    """
    Simple precision / recall / F1 computation.

    Precision is the share of correct predictions among the positions whose
    predicted label is not ignore_label; recall is the share of correct
    predictions among the positions whose gold label is not ignore_label.
    Either value is 0. when no position qualifies.
    Args:
        pro_labels: list, predicted labels
        right_labels: list, gold labels, same length as pro_labels
        ignore_label: int or None, the label to leave out
    Returns:
        pre, rec, f
    """
    assert len(pro_labels) == len(right_labels)
    pre_pro_labels, pre_right_labels = [], []
    rec_pro_labels, rec_right_labels = [], []
    for pro_label, right_label in zip(pro_labels, right_labels):
        if pro_label != ignore_label:
            pre_pro_labels.append(pro_label)
            pre_right_labels.append(right_label)
        if right_label != ignore_label:
            rec_pro_labels.append(pro_label)
            rec_right_labels.append(right_label)
    pre = _match_ratio(pre_pro_labels, pre_right_labels)
    # Guarded like the precision: without it an input with no usable gold
    # label raised ZeroDivisionError.
    rec = _match_ratio(rec_pro_labels, rec_right_labels)
    f = 0. if (pre + rec) == 0. \
        else (pre * rec * 2.) / (pre + rec)
    return pre, rec, f


def demo():
    """Print precision, recall and F1 for a small hand-made example."""
    predicted = [1, 2, 3, 4, 0, 6, 7, 0, 2, 8]
    gold = [0, 2, 3, 6, 5, 4, 7, 1, 0, 3]
    # ignore_label = 0
    scores = sim_compute(predicted, gold, ignore_label=2)
    for tag, score in zip(('pre:', 'rec:', '  f:'), scores):
        print(tag, score)


if __name__ == '__main__':
    # Run the example only when this module is executed as a script.
    demo()


================================================
FILE: code/TFNN/utils/io_util.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import codecs


def read_lines(path):
    """
    Read a UTF-8 text file and return its non-empty lines.
    Args:
        path: str, file to read
    Return:
        list of lines with trailing whitespace removed; lines that are
        empty after stripping are dropped
    """
    with codecs.open(path, 'r', encoding='utf-8') as reader:
        stripped = [raw.rstrip() for raw in reader.readlines()]
    return [text for text in stripped if text]


def get_file_list(path, postfix, file_list):
    """
    Collect, recursively, every file under path whose name ends with postfix.
    Args:
        path str : directory to scan
        postfix str : required file name ending
        file_list : list the matching file paths are appended to
    Return:
        None
    """
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if os.path.isdir(full_path):
            # directory: recurse into it
            get_file_list(full_path, postfix, file_list)
            continue
        # regular entry: keep it when the ending matches
        if full_path.endswith(postfix):
            file_list.append(full_path)


================================================
FILE: code/TFNN/utils/tensor_util.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import tensorflow as tf


def zero_nil_slot(t, name=None):
    """
    Replace the nil slot (the first row) of the input tensor with zeros.
    Args:
        t: 2D tensor
        name: str, name given to the concat op
    Returns:
        Tensor with the same shape as t
    """
    with tf.name_scope('zero_nil_slot'):
        nb_columns = tf.shape(t)[1]
        zero_row = tf.zeros([1, nb_columns], dtype=tf.float32)
        # every row of t except the first one
        remaining_rows = tf.slice(t, [1, 0], [-1, -1])
        return tf.concat(
            values=[zero_row, remaining_rows], axis=0, name=name)


def add_gradient_noise(t, stddev=1e-3, name=None):
    """
    Add gaussian noise to a gradient, as described in
    http://arxiv.org/abs/1511.06807 [2].
    The input tensor `t` should be a gradient; the output is `t` plus
    noise drawn with standard deviation `stddev`.
    0.001 was said to be a good fixed value for memory networks [2].
    Args:
        t: 2D tensor
        stddev: float, standard deviation of the noise
        name: str, name given to the add op
    Returns:
        2D tensor, same shape as t
    """
    with tf.name_scope("add_gradient_noise"):
        noise_shape = tf.shape(t)
        noise = tf.random_normal(noise_shape, stddev=stddev)
        noisy = tf.add(t, noise, name=name)
    return noisy


def mask_tensor(input_data, lengths, maxlen, dtype=tf.float32):
    """
    Multiply input_data by a sequence mask built from lengths.
    Args:
        input_data: 2D tensor
        lengths: integer vector, all its values < maxlen
        maxlen: scalar integer tensor
        dtype: type the boolean mask is cast to before the multiplication
    Returns:
        input_data multiplied element-wise by the mask
    """
    bool_mask = tf.sequence_mask(lengths, maxlen)
    mask = tf.cast(bool_mask, dtype)
    return tf.multiply(input_data, mask)


================================================
FILE: code/configurations.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
    configurations
"""
import os


# --- corpus ---
# Segmented / POS-tagged corpus files read by load_data.py.
TRAIN_PATH = './Data/corpus/training.seg.csv'
TEST_PATH = './Data/corpus/testing.seg.csv'


# --- voc ---
# Directory of the pickled vocabularies; created at import time.
# os.mkdir only creates the last path component, so './Data' must exist.
VOC_ROOT = './Data/voc'
if not os.path.exists(VOC_ROOT):
    os.mkdir(VOC_ROOT)
WORD_VOC_PATH = VOC_ROOT + '/word_voc.pkl'
# presumably the first index given to a real word -- confirm in prepare_data.py
WORD_VOC_START = 2
TAG_VOC_PATH = VOC_ROOT + '/tag_voc.pkl'
# presumably the first index given to a real tag -- confirm in prepare_data.py
TAG_VOC_START = 1
LABEL_VOC_PATH = VOC_ROOT + '/label_voc.pkl'


# --- embedding ---
# presumably the word vector size -- confirm in train_w2v_model.py
W2V_DIM = 256
W2V_PATH = './Data/embedding/word2vec.pkl'
# Directory of the embedding pickles; created at import time (same
# os.mkdir limitation as above).
EMBEDDING_ROOT = './Data/embedding/'
if not os.path.exists(EMBEDDING_ROOT):
    os.mkdir(EMBEDDING_ROOT)
# EMBEDDING_ROOT already ends with '/', so the two paths below contain '//'.
# Both files are loaded by load_data.load_embedding().
W2V_TRAIN_PATH = EMBEDDING_ROOT + '/word2v.pkl'
T2V_PATH = EMBEDDING_ROOT + '/tag2v.pkl'
# presumably the tag vector size -- confirm in prepare_data.py
TAG_DIM = 64


# --- training param ---
MAX_LEN = 300  # sequences are truncated / zero-padded to this length
BATCH_SIZE = 64
NB_LABELS = 11  # size of the model's output layer
NB_EPOCH = 30
KEEP_PROB = 0.5  # dropout keep probability before the dense layer
WORD_KEEP_PROB = 0.9  # dropout keep probability on the word embeddings
TAG_KEEP_PROB = 0.9  # dropout keep probability on the tag embeddings
KFOLD = 10  # number of cross-validation folds


================================================
FILE: code/load_data.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
    Load data.
"""
import pickle
from time import time
import numpy as np
import configurations as config
from TFNN.utils.io_util import read_lines
from TFNN.utils.data_util import map_item2id


def get_sentence_arr(words_tags, word_voc, tag_voc):
    """
    Convert 'word/tag' tokens to a word id array and a tag id array.
    Args:
        words_tags: list of 'word/tag' tokens; each token is split at its
            last '/'
        word_voc: dict, word vocabulary
        tag_voc: dict, POS tag vocabulary
    Returns:
        sentence_arr: np.array, word ids (words are lower-cased first)
        postag_arr: np.array, POS tag ids
        length: int, number of tokens before truncation
    """
    # rindex raises ValueError for a token that contains no '/'.
    cut_points = [token.rindex('/') for token in words_tags]
    words = [token[:cut] for token, cut in zip(words_tags, cut_points)]
    postags = [token[cut+1:] for token, cut in zip(words_tags, cut_points)]
    max_len = config.MAX_LEN
    sentence_arr = map_item2id(words, word_voc, max_len, lower=True)
    postag_arr = map_item2id(postags, tag_voc, max_len, lower=False)
    return sentence_arr, postag_arr, len(words)


def init_data(lines, word_voc, tag_voc, label_voc):
    """
    Turn raw 'label,sentence' lines into id matrices.
    Args:
        lines: list of str; the text before the first ',' is the label and
            the rest is a space separated sequence of 'word/tag' tokens
        word_voc: dict, word vocabulary
        tag_voc: dict, POS tag vocabulary
        label_voc: dict, label vocabulary; unknown labels map to 0
    Returns:
        sentences: np.array, int32, shape=[len(lines), MAX_LEN]
        tags: np.array, int32, shape=[len(lines), MAX_LEN]
        labels: np.array, int32, shape=[len(lines),]
    """
    nb_lines = len(lines)
    matrix_shape = (nb_lines, config.MAX_LEN)
    sentences = np.zeros(matrix_shape, dtype='int32')
    tags = np.zeros(matrix_shape, dtype='int32')
    # Filled below but not part of the return value.
    sentence_actual_lengths = np.zeros((nb_lines,), dtype='int32')
    labels = np.zeros((nb_lines,), dtype='int32')
    for row, line in enumerate(lines):
        comma = line.index(',')
        label, sentence = line[:comma], line[comma+1:]
        sentence_arr, tag_arr, actual_length = get_sentence_arr(
            sentence.split(' '), word_voc, tag_voc)

        sentences[row, :] = sentence_arr
        tags[row, :] = tag_arr
        sentence_actual_lengths[row] = actual_length
        if label in label_voc:
            labels[row] = label_voc[label]
        else:
            labels[row] = 0
    return sentences, tags, labels


def load_embedding():
    """
    Load the pickled word vectors and tag vectors.
    Return:
        word_weights: object unpickled from config.W2V_TRAIN_PATH
        tag_weights: object unpickled from config.T2V_PATH
    """
    loaded = []
    # word vectors first, then tag vectors
    for pkl_path in (config.W2V_TRAIN_PATH, config.T2V_PATH):
        with open(pkl_path, 'rb') as file_r:
            loaded.append(pickle.load(file_r))
    word_weights, tag_weights = loaded
    return word_weights, tag_weights


def load_voc():
    """
    Load the pickled vocabularies.
    Return:
        word_voc: object unpickled from config.WORD_VOC_PATH
        tag_voc: object unpickled from config.TAG_VOC_PATH
        label_voc: object unpickled from config.LABEL_VOC_PATH
    """
    def read_pickle(pkl_path):
        with open(pkl_path, 'rb') as file_r:
            return pickle.load(file_r)

    word_voc = read_pickle(config.WORD_VOC_PATH)
    tag_voc = read_pickle(config.TAG_VOC_PATH)
    label_voc = read_pickle(config.LABEL_VOC_PATH)
    return word_voc, tag_voc, label_voc


def load_train_data(word_voc, tag_voc, label_voc):
    """
    Load the training data from config.TRAIN_PATH.
    Args:
        word_voc: dict
        tag_voc: dict
        label_voc: dict
    Returns:
        sentences, tags, labels as produced by init_data
    """
    train_lines = read_lines(config.TRAIN_PATH)
    return init_data(train_lines, word_voc, tag_voc, label_voc)


def load_test_data(word_voc, tag_voc, label_voc):
    """
    Load the test data from config.TEST_PATH.
    Args:
        word_voc: dict
        tag_voc: dict
        label_voc: dict
    Returns:
        sentences, tags as produced by init_data; the label array is
        discarded
    """
    test_lines = read_lines(config.TEST_PATH)
    test_arrays = init_data(test_lines, word_voc, tag_voc, label_voc)
    sentences, tags = test_arrays[0], test_arrays[1]
    return sentences, tags


def demo():
    """Load embeddings, vocabularies and training data, then print their shapes."""
    t0 = time()
    word_weights, tag_weights = load_embedding()
    word_voc, tag_voc, label_voc = load_voc()
    # load_train_data needs the three vocabularies and returns three arrays.
    sentences, tags, labels = load_train_data(word_voc, tag_voc, label_voc)
    print(sentences.shape)
    print(tags.shape)
    print(labels.shape)
    print(word_weights.shape)
    print(tag_weights.shape)
    print('Done in %ds!' % (time()-t0))


if __name__ == '__main__':
    # Run the demo only when this module is executed as a script.
    demo()


================================================
FILE: code/model_dc.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
from time import time
import configurations as config
import tensorflow as tf
import numpy as np
from load_data import load_embedding, load_voc, load_train_data, load_test_data
from TFNN.layers.EmbeddingLayer import Embedding
from TFNN.layers.DenseLayer import SoftmaxDense
from TFNN.layers.ConvolutionalLayer import Convolutional1D
from TFNN.utils.evaluate_util import sim_compute
from TFNN.utils.tensor_util import zero_nil_slot
from sklearn.model_selection import KFold
import codecs
from TFNN.utils.io_util import read_lines


class DCModel(object):
    """CNN document classifier over word and POS-tag embeddings.

    Graph: word / tag embedding lookups (each with dropout) -> concatenation
    on the last axis -> 1D convolution with max-pooling -> dropout -> dense
    output layer. The loss is the dense layer's cross-entropy plus an L2
    penalty on its weights, minimised with Adam. The constructor also opens
    a session and initialises all variables.
    """

    def __init__(self, max_len, word_weights, tag_weights, result_path=None, label_voc=None):
        """
        Initialize the model.
        Args:
            max_len: int, maximum sentence length
            word_weights: np.array, shape=[|V_words|, w2v_dim], word vectors
            tag_weights: np.array, shape=[|V_tags|, t2v_dim], tag vectors
            result_path: str, where evaluation results should be stored
            label_voc: dict, label -> id; None is treated as empty
        """
        self._result_path = result_path
        # Iterating over the default None would raise TypeError.
        self._label_voc = label_voc if label_voc is not None else dict()
        self._label_voc_rev = dict()
        for key in self._label_voc:
            value = self._label_voc[key]
            self._label_voc_rev[value] = key
        # Filled by fit(); defined here so get_best_score() works before it.
        self.nb_epoch_scores = []

        # input placeholders
        self.input_sentence_ph = tf.placeholder(
            tf.int32, shape=(None, max_len), name='input_sentence_ph')
        self.input_tag_ph = tf.placeholder(tf.int32, shape=(None, max_len), name='input_tag_ph')
        self.label_ph = tf.placeholder(tf.int32, shape=(None,), name='label_ph')
        self.keep_prob_ph = tf.placeholder(tf.float32, name='keep_prob')
        self.word_keep_prob_ph = tf.placeholder(tf.float32, name='word_keep_prob')
        self.tag_keep_prob_ph = tf.placeholder(tf.float32, name='tag_keep_prob')

        # embedding layers
        self.nil_vars = set()
        word_embed_layer = Embedding(
            params=word_weights, ids=self.input_sentence_ph,
            keep_prob=self.word_keep_prob_ph, name='word_embed_layer')
        tag_embed_layer = Embedding(
            params=tag_weights, ids=self.input_tag_ph,
            keep_prob=self.tag_keep_prob_ph, name='tag_embed_layer')
        self.nil_vars.add(word_embed_layer.params.name)
        self.nil_vars.add(tag_embed_layer.params.name)

        # sentence representation
        sentence_input = tf.concat(
            values=[word_embed_layer.output, tag_embed_layer.output], axis=2)

        # sentence conv
        conv_layer = Convolutional1D(
            input_data=sentence_input, filter_length=3,
            nb_filter=1000, activation='relu', name='conv_layer')

        # dense layer
        dense_input_drop = tf.nn.dropout(conv_layer.output, self.keep_prob_ph)
        self.dense_layer = SoftmaxDense(
            input_data=dense_input_drop, input_dim=conv_layer.output_dim,
            output_dim=config.NB_LABELS, name='output_layer')

        self.loss = self.dense_layer.loss(self.label_ph) + \
            0.001*tf.nn.l2_loss(self.dense_layer.weights)
        optimizer = tf.train.AdamOptimizer()  # Adam
        grads_and_vars = optimizer.compute_gradients(self.loss)
        # The gradient of the first row of each embedding table is zeroed so
        # that row is never updated.
        nil_grads_and_vars = []
        for g, v in grads_and_vars:
            if v.name in self.nil_vars:
                nil_grads_and_vars.append((zero_nil_slot(g), v))
            else:
                nil_grads_and_vars.append((g, v))
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # train op
        self.train_op = optimizer.apply_gradients(
            nil_grads_and_vars, name='train_op', global_step=global_step)

        # pre op
        self.pre_op = self.dense_layer.get_pre_y()

        # session
        gpu_options = tf.GPUOptions(visible_device_list='0', allow_growth=True)
        self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

        # init model
        init = tf.global_variables_initializer()
        self.sess.run(init)

    @staticmethod
    def _nb_batches(nb_samples, batch_size):
        """Return the number of batches needed to cover nb_samples.

        Uses ceiling division, so no empty trailing batch is produced when
        nb_samples is a multiple of batch_size.
        """
        return (nb_samples + batch_size - 1) // batch_size

    def fit(self, sentences_train, tags_train, labels_train,
            sentences_dev=None, tags_dev=None, labels_dev=None,
            sentences_test=None, tags_test=None, labels_test=None,
            batch_size=64, nb_epoch=40, keep_prob=1.0, word_keep_prob=1.0,
            tag_keep_prob=1.0, seed=137):
        """
        Fit the model.

        After every epoch the model is evaluated on the training and
        development data, and the predictions for the test data are written
        to './Data/result/epoch_<n>.csv'. The development scores of each
        epoch are stored in self.nb_epoch_scores. The training arrays are
        shuffled in place.
        Args:
            sentences_train, tags_train, labels_train: training data
            sentences_dev, tags_dev, labels_dev: development data
            sentences_test, tags_test: test data to predict
            labels_test: not used
            batch_size: int, batch size
            nb_epoch: int, number of epochs
            keep_prob: float between [0, 1], dropout before the dense layer
            word_keep_prob: float between [0, 1], dropout on word vectors
            tag_keep_prob: float between [0, 1], dropout on tag vectors
            seed: int, seed used for the per-epoch shuffling
        """
        self.nb_epoch_scores = []  # dev [p, r, f] of every epoch
        nb_train = self._nb_batches(labels_train.shape[0], batch_size)
        for step in range(nb_epoch):
            print('Epoch %d / %d:' % (step+1, nb_epoch))
            # shuffle: re-seeding before each call applies the same
            # permutation to all three arrays, which keeps them aligned
            for array in (sentences_train, tags_train, labels_train):
                np.random.seed(seed)
                np.random.shuffle(array)

            # train
            total_loss = 0.
            for i in range(nb_train):
                batch = slice(i*batch_size, (i+1)*batch_size)
                feed_dict = {
                    self.input_sentence_ph: sentences_train[batch],
                    self.input_tag_ph: tags_train[batch],
                    self.label_ph: labels_train[batch],
                    self.keep_prob_ph: keep_prob,
                    self.word_keep_prob_ph: word_keep_prob,
                    self.tag_keep_prob_ph: tag_keep_prob,
                }
                _, loss_value = self.sess.run(
                    [self.train_op, self.loss], feed_dict=feed_dict)
                total_loss += loss_value

            if nb_train:
                total_loss /= float(nb_train)

            # scores on the training and development data, predictions on
            # the test data
            p_train, r_train, f_train = self.evaluate(sentences_train, tags_train, labels_train)
            p_dev, r_dev, f_dev = self.evaluate(sentences_dev, tags_dev, labels_dev)
            pre_labels = self.predict(sentences_test, tags_test)
            with codecs.open('./Data/result/epoch_%d.csv' % (step+1), 'w', encoding='utf-8') as file_w:
                for num, label in enumerate(pre_labels):
                    file_w.write('%d,%s\n' % (num+1, self._label_voc_rev[label]))
            self.nb_epoch_scores.append([p_dev, r_dev, f_dev])
            print('\tloss=%f, train f=%f, dev f=%f' % (total_loss, f_train, f_dev))

    def predict(self, data_sentences, data_tags, batch_size=50):
        """
        Predict the label ids of the given data, with dropout disabled.
        Args:
            data_sentences, data_tags: np.array
            batch_size: int
        Return:
            pre_labels: list
        """
        pre_labels = []
        nb_test = self._nb_batches(data_sentences.shape[0], batch_size)
        for i in range(nb_test):
            batch = slice(i*batch_size, (i+1)*batch_size)
            feed_dict = {
                self.input_sentence_ph: data_sentences[batch],
                self.input_tag_ph: data_tags[batch],
                self.keep_prob_ph: 1.0,
                self.word_keep_prob_ph: 1.0,
                self.tag_keep_prob_ph: 1.0}
            pre_temp = self.sess.run(self.pre_op, feed_dict=feed_dict)
            pre_labels += list(pre_temp)
        return pre_labels

    def evaluate(self, data_sentences, data_tags, data_labels,
                 ignore_label=None, batch_size=64, simple_compute=True):
        """
        Score the model on labelled data, with dropout disabled.
        Args:
            data_sentences, data_tags, data_labels: np.array
            ignore_label: int, id of the negative class, or None
            batch_size: int
            simple_compute: bool, not used
        Return:
            p, r, f1
        """
        pre_labels = []
        nb_dev = self._nb_batches(len(data_labels), batch_size)
        for i in range(nb_dev):
            batch = slice(i*batch_size, (i+1)*batch_size)
            feed_dict = {
                self.input_sentence_ph: data_sentences[batch],
                self.input_tag_ph: data_tags[batch],
                self.label_ph: data_labels[batch],
                self.keep_prob_ph: 1.0,
                self.word_keep_prob_ph: 1.0,
                self.tag_keep_prob_ph: 1.0}
            pre_temp = self.sess.run(self.pre_op, feed_dict=feed_dict)
            pre_labels += list(pre_temp)
        right_labels = data_labels[:len(pre_labels)]
        pre, rec, f = sim_compute(pre_labels, right_labels, ignore_label=ignore_label)
        return pre, rec, f

    def clear_model(self):
        """Close the session and reset the default graph."""
        # Close first: the default graph should not be reset while a session
        # on it is still open.
        self.sess.close()
        tf.reset_default_graph()

    def get_best_score(self):
        """
        Return the scores of the epoch with the highest development f value.
        Returns:
            score: list, development [p, r, f] of that epoch, or None when
                no epoch has been recorded
            nb_epoch: int, 0-based index of that epoch, or -1
        """
        nb_epoch, best_score = -1, None
        for i, scores in enumerate(self.nb_epoch_scores):
            # strict '>' keeps the earliest epoch on ties
            if best_score is None or scores[-1] > best_score[-1]:
                best_score = scores
                nb_epoch = i
        return best_score, nb_epoch


def predict():
    """
    Train one model per cross-validation fold and keep, for every fold,
    the result file of its best epoch as ./Data/result/best_<fold>.

    The training data is shuffled with a fixed seed and split with KFold
    into config.KFOLD train/dev partitions. The test data has no labels,
    so None is passed as the test labels to model.fit.

    File clean-up and copying are done synchronously with os/shutil. The
    former os.popen('rm ...') / os.popen('cp ...') calls were not waited
    for, so a later step could read a file before the copy had finished.

    Raises:
        IOError/OSError: if the epoch result file of the best epoch does
            not exist (presumably written by model.fit -- not visible here).
    """
    import shutil  # local import: only needed for the per-fold copy below

    word_weights, tag_weights = load_embedding()
    word_voc, tag_voc, label_voc = load_voc()

    # train data
    sentences, tags, labels = load_train_data(word_voc, tag_voc, label_voc)
    # Re-seeding before each shuffle permutes the three parallel arrays
    # identically, which keeps sentences, tags and labels aligned.
    seed = 137
    for data in (sentences, tags, labels):
        np.random.seed(seed)
        np.random.shuffle(data)

    # load test data (unlabelled)
    sentences_test, tags_test = load_test_data(word_voc, tag_voc, label_voc)
    labels_test = None

    # clear previous results
    result_dir = './Data/result'
    if not os.path.exists(result_dir):
        os.mkdir(result_dir)
    for name in os.listdir(result_dir):
        if name.startswith('.'):
            continue  # the shell glob `*` never matched hidden files either
        stale_path = os.path.join(result_dir, name)
        if os.path.isfile(stale_path):
            os.remove(stale_path)

    # split into training / dev folds
    kf = KFold(n_splits=config.KFOLD)
    folds = list(kf.split(labels))
    for num in range(config.KFOLD):
        train_index, dev_index = folds[num]
        sentences_train, sentences_dev = sentences[train_index], sentences[dev_index]
        tags_train, tags_dev = tags[train_index], tags[dev_index]
        labels_train, labels_dev = labels[train_index], labels[dev_index]

        # init model
        model = DCModel(
            config.MAX_LEN, word_weights, tag_weights, result_path='./Data/result/result.txt',
            label_voc=label_voc)

        # fit model
        model.fit(
            sentences_train, tags_train, labels_train,
            sentences_dev, tags_dev, labels_dev,
            sentences_test, tags_test, labels_test,
            config.BATCH_SIZE, config.NB_EPOCH, keep_prob=config.KEEP_PROB,
            word_keep_prob=config.WORD_KEEP_PROB, tag_keep_prob=config.TAG_KEEP_PROB)

        best_score, nb_epoch = model.get_best_score()
        print((best_score, nb_epoch))
        p_test, r_test, f_test = best_score

        # get_best_score returns a 0-based index while the epoch files are
        # numbered from 1, hence the +1.
        src = os.path.join(result_dir, 'epoch_%d.csv' % (nb_epoch + 1))
        dst = os.path.join(result_dir, 'best_%d' % num)
        print('copy %s -> %s' % (src, dst))
        shutil.copyfile(src, dst)
        print(p_test, r_test, f_test, '\n')

        # clear model before building the next fold's model
        model.clear_model()
        del model


def init_result():
    """
    Read the per-fold prediction files ./Data/result/best_<fold>.

    Returns:
        list with one entry per fold (config.KFOLD entries); each entry is
        the list of second comma-separated fields of that file's lines,
        kept as strings.
    """
    return [
        [row.split(',')[1] for row in read_lines('./Data/result/best_%d' % fold)]
        for fold in range(config.KFOLD)
    ]


def merge():
    """
    Merge the per-fold predictions by majority vote and write the result
    to ./Data/result/integrade.csv as lines of "<1-based index>,<label>".

    The result directory is emptied synchronously before the merged file
    is written. The former os.popen('rm ./Data/result/*') was not waited
    for, so it could delete the freshly created output file.
    """
    datas = init_result()
    data_count = len(datas[0])
    label_type_count = config.NB_LABELS
    # votes[i][k] counts how many folds predicted label k+1 for sample i.
    votes = np.zeros((data_count, label_type_count))
    for data in datas:
        for i, label in enumerate(data):
            label_id = int(label) - 1  # labels in the files are 1-based
            votes[i][label_id] += 1
    # Take the mode; argmax returns the first maximum, so a tie goes to the
    # smallest label.
    final_labels = [item.argmax() + 1 for item in votes]

    # clear result (the per-fold files have already been read above)
    result_dir = './Data/result'
    for name in os.listdir(result_dir):
        if name.startswith('.'):
            continue  # the shell glob `*` never matched hidden files either
        stale_path = os.path.join(result_dir, name)
        if os.path.isfile(stale_path):
            os.remove(stale_path)

    with codecs.open('./Data/result/integrade.csv', 'w', encoding='utf-8') as file_w:
        for i, label in enumerate(final_labels):
            file_w.write('%d,%d\n' % (i+1, label))
        print('Result: %s' % file_w.name)


if __name__ == '__main__':
    # Wall-clock start, used for the elapsed-time report at the end.
    start_time = time()

    # Step 1: train one model per fold and keep each fold's best result file.
    predict()

    # Step 2: combine the per-fold result files into a single prediction file.
    merge()

    elapsed = time() - start_time
    print('Done in %ds!' % elapsed)


================================================
FILE: code/prepare_data.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
    prepare data.

    生成:
        word voc
        position voc
        relation type voc

        lookup tables
"""
import os
import pickle
import numpy as np
import configurations as config
from TFNN.utils.data_util import create_dictionary
from TFNN.utils.io_util import read_lines
from time import time


def init_voc():
    """
    Build the word, POS-tag and label vocabularies.

    Reads every line of config.TRAIN_PATH and config.TEST_PATH. The part of
    a line after its first comma is treated as a space-separated sequence
    of "word/tag" items; each item is split on its last '/'.

    Raises:
        ValueError: if a line has no comma or an item has no '/'.
    """
    corpus_lines = read_lines(config.TRAIN_PATH)
    corpus_lines += read_lines(config.TEST_PATH)

    words = []     # every word occurrence, in corpus order
    pos_tags = []  # every POS tag occurrence, in corpus order
    for line in corpus_lines:
        # Everything after the first comma is the tagged sentence.
        sentence = line[line.index(',') + 1:]
        for token in sentence.split(' '):
            # Split on the LAST slash so a word containing '/' stays whole.
            cut = token.rindex('/')
            words.append(token[:cut])
            pos_tags.append(token[cut + 1:])

    # word voc
    create_dictionary(
        words, config.WORD_VOC_PATH, start=config.WORD_VOC_START,
        min_count=5, sort=True, lower=True, overwrite=True)

    # tag voc
    create_dictionary(
        pos_tags, config.TAG_VOC_PATH, start=config.TAG_VOC_START,
        sort=True, lower=False, overwrite=True)

    # label voc: the labels are the strings '1' .. '11'
    label_types = list(map(str, range(1, 12)))
    create_dictionary(
        label_types, config.LABEL_VOC_PATH, start=0, overwrite=True)


def init_word_embedding(path=None, overwrite=False):
    """
    Build the word embedding matrix for the vocabulary and pickle it.

    Row `i` of the matrix belongs to the word whose vocabulary id is `i`.
    Words found in the pre-trained vectors (config.W2V_PATH) get that
    vector; the others get a uniform random vector in [-0.25, 0.25).
    Rows that no vocabulary word maps to stay all-zero.

    Args:
        path: where to store the result; defaults to config.W2V_TRAIN_PATH
            when None
        overwrite: bool, rebuild even if `path` already exists
    """
    if path is None:
        # The declared default used to crash in os.path.exists(None).
        path = config.W2V_TRAIN_PATH
    if os.path.exists(path) and not overwrite:
        return
    with open(config.W2V_PATH, 'rb') as file_r:
        w2v_dict_full = pickle.load(file_r)
    with open(config.WORD_VOC_PATH, 'rb') as file_r:
        w2id_dict = pickle.load(file_r)
    word_voc_size = len(w2id_dict) + config.WORD_VOC_START
    word_weights = np.zeros((word_voc_size, config.W2V_DIM), dtype='float32')
    for word, index in w2id_dict.items():  # index: id of the word
        if word in w2v_dict_full:
            word_weights[index, :] = w2v_dict_full[word]
        else:
            word_weights[index, :] = np.random.uniform(
                -0.25, 0.25, size=(config.W2V_DIM,)).astype('float32')
    # write to the pkl file
    with open(path, 'wb') as file_w:
        pickle.dump(word_weights, file_w, protocol=2)


def init_tag_embedding(path, overwrite=False):
    """
    Create a randomly initialised POS-tag embedding matrix and pickle it.

    The matrix has one row per tag in the tag vocabulary plus
    config.TAG_VOC_START leading rows, and config.TAG_DIM columns. Values
    are drawn from a standard normal distribution; the leading
    config.TAG_VOC_START rows are then set to zero.

    Args:
        path: where to store the result
        overwrite: bool, rebuild even if `path` already exists
    """
    already_built = os.path.exists(path)
    if already_built and not overwrite:
        return

    with open(config.TAG_VOC_PATH, 'rb') as voc_file:
        tag_voc = pickle.load(voc_file)

    nb_rows = len(tag_voc.keys()) + config.TAG_VOC_START
    shape = (nb_rows, config.TAG_DIM)
    tag_weights = np.random.normal(size=shape).astype('float32')
    # Zero the rows that precede the first real tag id.
    for row in range(config.TAG_VOC_START):
        tag_weights[row, :] = 0.

    with open(path, 'wb') as out_file:
        pickle.dump(tag_weights, out_file, protocol=2)


def init_embedding():
    """
    Create the embedding directory if it is missing, then rebuild both the
    word and the POS-tag embedding files (existing files are overwritten).
    """
    embedding_root = config.EMBEDDING_ROOT
    root_missing = not os.path.exists(embedding_root)
    if root_missing:
        os.mkdir(embedding_root)

    # word embedding first, then tag embedding
    init_word_embedding(path=config.W2V_TRAIN_PATH, overwrite=True)
    init_tag_embedding(path=config.T2V_PATH, overwrite=True)


def demo():
    """Load the pickled object at config.W2V_TRAIN_PATH and print its shape."""
    with open(config.W2V_TRAIN_PATH, 'rb') as pkl_file:
        word_weights = pickle.load(pkl_file)
    print(word_weights.shape)


if __name__ == '__main__':
    # Wall-clock start, used for the elapsed-time report at the end.
    start_time = time()

    # Build the vocabularies first; the embeddings are indexed by them.
    init_voc()

    # Build the word and tag embedding files.
    init_embedding()

    # Sanity check: print the shape of the stored word embedding.
    demo()

    elapsed = time() - start_time
    print('Done in %.1fs!' % elapsed)


================================================
FILE: code/train_w2v_model.py
================================================
#!/usr/bin/env python
# coding=utf-8
import codecs
import pickle
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.models.keyedvectors import KeyedVectors
from TFNN.utils.io_util import read_lines


def get_sentence(sentence_tag):
    """
    Strip the tags from a space-separated sequence of "word/tag" items.

    Args:
        sentence_tag: str, e.g. "a/n b/v"
    Returns:
        str, the words joined by single spaces, e.g. "a b"
    Raises:
        ValueError: if an item contains no '/'
    """
    tagged_tokens = sentence_tag.split(' ')
    # Cut at the LAST slash so a word that itself contains '/' stays whole.
    plain_words = [token[:token.rindex('/')] for token in tagged_tokens]
    return ' '.join(plain_words)


def extract_sentece():
    """
    Write the untagged sentences of the training and test corpora to
    ./Data/corpus/sentence.txt, one sentence per line.

    For each input line, the part after the first comma is passed through
    get_sentence to drop the tags.
    """
    corpus_lines = read_lines('./Data/corpus/training.seg.csv')
    corpus_lines += read_lines('./Data/corpus/testing.seg.csv')
    with codecs.open('./Data/corpus/sentence.txt', 'w', encoding='utf-8') as file_w:
        for row in corpus_lines:
            tagged_sentence = row[row.index(',') + 1:]
            plain_sentence = get_sentence(tagged_sentence)
            file_w.write('%s\n' % plain_sentence)


def train():
    """
    Extract the plain-text corpus, train a Word2Vec model on it and save
    the word vectors in binary word2vec format to
    ./Data/embedding/word2vec.bin.
    """
    extract_sentece()

    corpus_path = './Data/corpus/sentence.txt'
    vectors_path = './Data/embedding/word2vec.bin'

    # train the model
    w2v_params = dict(sg=1, size=256, window=5, min_count=3, workers=4, iter=40)
    w2v_model = Word2Vec(sentences=LineSentence(corpus_path), **w2v_params)
    w2v_model.wv.save_word2vec_format(vectors_path, binary=True)


def bin2pkl():
    """
    Convert ./Data/embedding/word2vec.bin into a pickled dict that maps
    every word of the model's vocabulary to its vector, stored at
    ./Data/embedding/word2vec.pkl. The output path is printed when done.
    """
    model = KeyedVectors.load_word2vec_format('./Data/embedding/word2vec.bin', binary=True)
    word_dict = {word: model[word] for word in model.vocab}
    with open('./Data/embedding/word2vec.pkl', 'wb') as file_w:
        # protocol=2 matches the pickle.dump calls in prepare_data.py and
        # keeps the file loadable from both Python 2 and Python 3; the
        # interpreter default differs between the two.
        pickle.dump(word_dict, file_w, protocol=2)
        print(file_w.name)


if __name__ == '__main__':
    # Train the word2vec model and save it as ./Data/embedding/word2vec.bin.
    train()

    # Convert the saved binary vectors into ./Data/embedding/word2vec.pkl.
    bin2pkl()