Repository: liu-nlper/DocumentClassification Branch: master Commit: b6c6cce3638a Files: 20 Total size: 38.2 KB Directory structure: gitextract__8lu9u9y/ ├── README.md └── code/ ├── Data/ │ └── corpus/ │ └── corpus download link ├── README ├── TFNN/ │ ├── __init__.py │ ├── activations.py │ ├── layers/ │ │ ├── ConvolutionalLayer.py │ │ ├── DenseLayer.py │ │ ├── EmbeddingLayer.py │ │ └── __init__.py │ ├── objectives.py │ └── utils/ │ ├── __init__.py │ ├── data_util.py │ ├── evaluate_util.py │ ├── io_util.py │ └── tensor_util.py ├── configurations.py ├── load_data.py ├── model_dc.py ├── prepare_data.py └── train_w2v_model.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: README.md ================================================ # Document Classification This code implements a simple CNN model for document classification with tensorflow. # Model Structure ![model.png](https://github.com/MRliujiaxin/DocumentClassification/raw/master/model.png) # Requirements - Python: 2.7 - Tensorflow: 1.0.0 - Numpy: 1.12.1 - sklearn: 0.18.1 - gensim: 1.0.1 - pickle ================================================ FILE: code/Data/corpus/corpus download link ================================================ 数据下载地址:http://competition.ai100.com.cn/html/game_det.html?id=24 将分词/词性标注后的文件命名为training.seg.csv和testing.seg.csv,放置到当前目录下。 处理后的文本如下所示: "公司/n 是/vshi 经/p 批准/v 依法/d 从事/vi 融资/vi 性/ng 担保/vn 业务/n 的/ude1 ..." ================================================ FILE: code/README ================================================ 下载数据到Data/corpus目录下,并做分词和词性标注处理,再按以下顺序执行: train_w2v_model.py -> prepare_data.py -> model_dc.py 文件说明: 1. 分词、词性标注采用中科院NLPIR,处理后文件: ./Data/corpus/training.seg.csv ./Data/corpus/testing.seg.csv 2. configuration.py 配置文件 3. train_w2v_model.py 利用官方给的train, test训练词向量 3. prepare_data.py 构建词表,词性表等 4. load_data.py 加载数据 5. model_dc.py 训练模型并预测 ================================================ FILE: code/TFNN/__init__.py ================================================ ================================================ FILE: code/TFNN/activations.py ================================================ #!/usr/bin/env python # -*- coding: utf-8 -*- """ 激活函数 """ import tensorflow as tf def get_activation(activation=None): """ Get activation function accord to the parameter 'activation' Args: activation: str: 激活函数的名称 Return: 激活函数 """ if activation is None: return None elif activation == 'tanh': return tf.nn.tanh elif activation == 'relu': return tf.nn.relu elif activation == 'softmax': return tf.nn.softmax else: raise Exception('Unknow activation function: %s' % activation) ================================================ FILE: code/TFNN/layers/ConvolutionalLayer.py ================================================ #!/usr/bin/env python # -*- coding: utf-8 -*- """ DNN Layers: Convolutional1D """ import numpy as np import tensorflow as tf # from ..initializations import normal_weight from ..activations import get_activation class Convolutional1D(object): def __init__(self, input_data, filter_length, nb_filter, strides=[1, 1, 1, 1], padding='VALID', activation='tanh', pooling=True, name='Convolutional1D'): """1D卷积层 Args: input_data: 3D tensor of shape=[batch_size, in_height, in_width] in_channels is set to 1 when use Convolutional1D. filter_length: int, 卷积核的长度,用于构造卷积核,在 Convolutional1D中,卷积核shape=[filter_length, in_width, in_channels, nb_filters] nb_filter: int, 卷积核数量 padding: 默认'VALID',暂时不支持设成'SAME' pooling: bool, 是否池化 """ assert padding in ('VALID'), 'Unknow padding %s' % padding # assert padding in ('VALID', 'SAME'), 'Unknow padding %s' % padding in_height, in_width = map(int, input_data.get_shape()[1:]) self._input_data = tf.expand_dims(input_data, -1) # shape=[x, x, x, 1] self._filter_length = filter_length self._nb_filter = nb_filter self._strides = strides self._padding = padding self._activation = get_activation(activation) self._name = name self.pooling = pooling filter_length = self._filter_length nb_filter = self._nb_filter with tf.name_scope('%s_%d' % (name, filter_length)): if activation != 'relu': fan_in = filter_length * in_width fan_out = nb_filter * (in_width-filter_length+1) w_bound = np.sqrt(6. / (fan_in + fan_out)) self.weights = tf.Variable( tf.random_uniform( minval=-w_bound, maxval=w_bound, dtype='float32', shape=[filter_length, in_width, 1, nb_filter]), name='conv_weight') tf.summary.histogram('weights', self.weights) else: # init weight for relu w_values = tf.random_normal( shape=[filter_length, in_width, 1, nb_filter] ) * tf.sqrt(2. / (filter_length * in_width * nb_filter)) self.weights = tf.Variable(w_values, name='conv_weight') # bias self.biases = tf.Variable( tf.constant(0.1, shape=[nb_filter, ]), name='conv_bias') tf.summary.histogram('biases', self.biases) self.call() def call(self): # 卷积 if padding='VALID', then conv_output's shape= # [batch_size, in_height-filter_length+1, 1, nb_filters] conv_output = tf.nn.conv2d( input=self._input_data, filter=self.weights, strides=self._strides, padding=self._padding) # output's shape=[batch_size, new_height, 1, nb_filters] linear_output = tf.nn.bias_add(conv_output, self.biases) act_output = ( linear_output if self._activation is None else self._activation(linear_output)) if self.pooling: # max pooling, shape=[?, nb_filter] self._output = tf.reduce_max(tf.squeeze(act_output, [2]), 1) else: self._output = tf.squeeze(act_output, axis=2) # [?, n-w+1, nb_filter] @property def input_data(self): return self._input_data @property def output(self): return self._output @property def output_dim(self): return self._nb_filter def get_weights(self): return self.weights ================================================ FILE: code/TFNN/layers/DenseLayer.py ================================================ #!/usr/bin/env python # -*- coding: utf-8 -*- """ DNN Layers: SoftmaxLayer """ import tensorflow as tf from ..activations import get_activation class SoftmaxDense(object): def __init__(self, input_data, input_dim, output_dim, weights=None, biases=None, activation=None, name='Dense'): assert len(input_data.get_shape()) == 2, \ "全连接层的输入必须要flatten, 即shape=[batch_size, input_dim]" self._input_data = input_data self._input_dim = input_dim self._output_dim = output_dim self._activation = get_activation(activation) self._name = name with tf.name_scope(self._name): # initialize weights if weights is None: w_bound = tf.sqrt(6. / (input_dim + output_dim)) weights = tf.Variable( tf.random_uniform( minval=-w_bound, maxval=w_bound, dtype='float32', shape=[input_dim, output_dim]), name='weights' ) self._weights = weights tf.summary.histogram('weights', self._weights) # initialize biases if biases is None: biases = tf.Variable( tf.constant(0.1, shape=[self._output_dim]), name='biases') self._biases = biases tf.summary.histogram('biases', biases) self.call() def call(self): # output linear_output = tf.matmul(self._input_data, self._weights) + \ self._biases self._output = ( linear_output if self._activation is None else self._activation(linear_output) ) def loss(self, y): y = tf.cast(y, tf.int32) cross_entroy = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=self.output, labels=y, name='xentroy') loss = tf.reduce_mean(cross_entroy, name='xentroy_mean') return loss def get_pre_y(self): # TODO 待修改 # pre_y = tf.reshape(tf.round(tf.sigmoid(self._output)), [-1]) pre_y = tf.arg_max(input=self._output, dimension=1) return pre_y @property def input_data(self): return self._input_data @property def input_dim(self): return self._input_dim @property def output_dim(self): return self._output_dim @property def name(self): return self._name @property def weights(self): return self._weights @property def biases(self): return self._biases @property def output(self): return self._output ================================================ FILE: code/TFNN/layers/EmbeddingLayer.py ================================================ #!/usr/bin/env python # -*- coding: utf-8 -*- """ DNN Layers: Embedding """ import tensorflow as tf class Embedding(object): def __init__(self, params, ids, name, keep_prob=1.0): with tf.name_scope('%s' % name): self._params = tf.Variable(params, tf.float32, name='embed') self._ids = ids # output embed_output = tf.nn.embedding_lookup( params=self._params, ids=self._ids ) self._output = tf.nn.dropout(embed_output, keep_prob) @property def params(self): return self._params @property def output_dim(self): return int(self._output.get_shape()[-1]) @property def output(self): return self._output ================================================ FILE: code/TFNN/layers/__init__.py ================================================ ================================================ FILE: code/TFNN/objectives.py ================================================ #!/usr/bin/env python # -*- coding: utf-8 -*- import tensorflow as tf def categorical_crossentropy(y_true, y_pred): """ Args: y_true: int of list, length=batch_size y_pred: 2D tensor with shape=[batch_size, nb_classes] Returns: xx """ cross_entroy = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=y_pred, labels=y_true, name='xentroy') return tf.reduce_mean(cross_entroy, name='xentroy_mean') ================================================ FILE: code/TFNN/utils/__init__.py ================================================ ================================================ FILE: code/TFNN/utils/data_util.py ================================================ #!/usr/bin/env python # -*- coding: utf-8 -*- import os import pickle import numpy as np from collections import defaultdict def flatten_list(nest_list): """ 将嵌套列表压扁 Args: nest_list: list,嵌套列表 Return: flatten_list: list """ res = [] for item in nest_list: if isinstance(item, list): res.extend(flatten_list(item)) else: res.append(item) return res def create_dictionary(items, dic_path, start=0, sort=True, min_count=None, lower=False, overwrite=False): """ 构建字典,并将构建的字典写入pkl文件中 Args: items: list, [item_1, item_2, ...] dic_path: 需要保存的路径(以pkl结尾) start: int, voc起始下标,默认为0 sort: bool, 是否按频率排序, 若为False,则按items排序 min_count: 最小频次 lower: bool, 是否转为小写 overwrite: bool, 是否覆盖之前的文件 Returns: None """ assert not dic_path.endswith('pk') if os.path.exists(dic_path) and not overwrite: return voc = dict() if sort: # 构建字典 dic = defaultdict(int) for item in items: item = item if (not lower) else item.lower() dic[item] += 1 # 排序 dic = sorted(dic.items(), key=lambda d: d[1], reverse=True) for i, item in enumerate(dic): index = i + start key = item[0] if min_count and min_count > item[1]: continue voc[key] = index else: # 按items排序 for i, item in enumerate(items): item = item if not lower else item.lower() index = i + start voc[item] = index # 写入文件 file = open(dic_path, 'wb') pickle.dump(voc, file) file.close() def map_item2id(items, voc, max_len, none_word=0, lower=False): """ 将word/pos等映射为id Args: items: list, 待映射列表 voc: 词表 max_len: int, 序列最大长度 none_word: 未登录词标号,默认为0 Returns: arr: np.array, dtype=int32, shape=[max_len,] """ assert type(none_word) == int arr = np.zeros((max_len,), dtype='int32') min_range = min(max_len, len(items)) for i in range(min_range): # 若items长度大于max_len,则被截断 item = items[i] if not lower else items[i].lower() arr[i] = voc[item] if item in voc else none_word return arr def random_over_sampling(): """ 随机过采样 Args: xx Return: xx """ x_1 = [[1,1,1], [2,2,2], [3,3,3]] x_2 = [[1,1,1], [2,2,2], [3,3,3]] y = [1,2,3] from imblearn.over_samping import RandomOverSampler ros = RandomOverSampler(sandom_state=42) x_res, y_res = ros.fit_sample(x_1, y) print(x_res) print(y_res) def demo(): random_over_sampling() if __name__ == '__main__': demo() ================================================ FILE: code/TFNN/utils/evaluate_util.py ================================================ #!/usr/bin/env python # -*- coding: utf-8 -*- import numpy as np from collections import defaultdict import codecs def sim_compute(pro_labels, right_labels, ignore_label=None): """ simple evaluate... Args: param pro_labels list : predict labels param right_labels list : right labels param ignore_label int : the label should be ignored Returns: pre, rec, f """ assert len(pro_labels) == len(right_labels) pre_pro_labels, pre_right_labels = [], [] rec_pro_labels, rec_right_labels = [], [] labels_len = len(pro_labels) for i in range(labels_len): pro_label = pro_labels[i] if pro_label != ignore_label: # pre_pro_labels.append(pro_label) pre_right_labels.append(right_labels[i]) if right_labels[i] != ignore_label: rec_pro_labels.append(pro_label) rec_right_labels.append(right_labels[i]) pre_pro_labels, pre_right_labels = np.array(pre_pro_labels, dtype='int32'), \ np.array(pre_right_labels, dtype='int32') rec_pro_labels, rec_right_labels = np.array(rec_pro_labels, dtype='int32'), \ np.array(rec_right_labels, dtype='int32') pre = 0. if len(pre_pro_labels) == 0 \ else len(np.where(pre_pro_labels == pre_right_labels)[0]) / float(len(pre_pro_labels)) # rec = len(np.where(rec_pro_labels == rec_right_labels)[0]) / float(len(pre_pro_labels)) rec = len(np.where(rec_pro_labels == rec_right_labels)[0]) / float(len(rec_right_labels)) f = 0. if (pre + rec) == 0. \ else (pre * rec * 2.) / (pre + rec) return pre, rec, f def demo(): pro_labels = [1, 2, 3, 4, 0, 6, 7, 0, 2, 8] right_labels = [0, 2, 3, 6, 5, 4, 7, 1, 0, 3] # ignore_label = 0 pre, rec, f = sim_compute(pro_labels, right_labels, ignore_label=2) print('pre:', pre) print('rec:', rec) print(' f:', f) if __name__ == '__main__': demo() ================================================ FILE: code/TFNN/utils/io_util.py ================================================ #!/usr/bin/env python # -*- coding: utf-8 -*- import os import codecs def read_lines(path): lines = [] with codecs.open(path, 'r', encoding='utf-8') as file: for line in file.readlines(): line = line.rstrip() if line: lines.append(line) return lines def get_file_list(path, postfix, file_list): """ 获取path路径下所有后缀为postfix的文件名 Args: path str : 文件路径 postfix str : 后缀 file_list 存放文件路径 Return: None """ temp_list = os.listdir(path) for fi in temp_list: fi_d = os.path.join(path, fi) if os.path.isdir(fi_d): # 若是目录,则递归 get_file_list(fi_d, postfix, file_list) else: # 若是文件 if fi_d.endswith(postfix): # 以postfix结尾 file_list.append(fi_d) return None ================================================ FILE: code/TFNN/utils/tensor_util.py ================================================ #!/usr/bin/env python # -*- coding: utf-8 -*- import tensorflow as tf def zero_nil_slot(t, name=None): """ Overwrite the nil_slot (first 1 rows) of the input Tensor with zeros. Args: t: 2D tensor name: str Returns: Same shape as t """ with tf.name_scope('zero_nil_slot'): s = tf.shape(t)[1] z = tf.zeros([1, s], dtype=tf.float32) return tf.concat( axis=0, name=name, values=[z, tf.slice(t, [1, 0], [-1, -1])]) def add_gradient_noise(t, stddev=1e-3, name=None): """ Adds gradient noise as described in http://arxiv.org/abs/1511.06807 [2]. The input Tensor `t` should be a gradient. The output will be `t` + gaussian noise. 0.001 was said to be a good fixed value for memory networks [2]. Args: t: 2D tensor Returns: 2D tensor, same shape as t """ with tf.name_scope("add_gradient_noise"): gn = tf.random_normal(tf.shape(t), stddev=stddev) return tf.add(t, gn, name=name) def mask_tensor(input_data, lengths, maxlen, dtype=tf.float32): """ Args: input_data: 2D tensor lengths: integer vector, all its values < maxlen maxlen: scalar integer tensor dtype: str """ mask = tf.cast(tf.sequence_mask(lengths, maxlen), dtype) return tf.multiply(input_data, mask) ================================================ FILE: code/configurations.py ================================================ #!/usr/bin/env python # -*- coding: utf-8 -*- """ configurations """ import os # --- corpus --- TRAIN_PATH = './Data/corpus/training.seg.csv' TEST_PATH = './Data/corpus/testing.seg.csv' # --- voc --- VOC_ROOT = './Data/voc' if not os.path.exists(VOC_ROOT): os.mkdir(VOC_ROOT) WORD_VOC_PATH = VOC_ROOT + '/word_voc.pkl' WORD_VOC_START = 2 TAG_VOC_PATH = VOC_ROOT + '/tag_voc.pkl' TAG_VOC_START = 1 LABEL_VOC_PATH = VOC_ROOT + '/label_voc.pkl' # --- embedding --- W2V_DIM = 256 W2V_PATH = './Data/embedding/word2vec.pkl' EMBEDDING_ROOT = './Data/embedding/' if not os.path.exists(EMBEDDING_ROOT): os.mkdir(EMBEDDING_ROOT) W2V_TRAIN_PATH = EMBEDDING_ROOT + '/word2v.pkl' T2V_PATH = EMBEDDING_ROOT + '/tag2v.pkl' TAG_DIM = 64 # --- training param --- MAX_LEN = 300 BATCH_SIZE = 64 NB_LABELS = 11 NB_EPOCH = 30 KEEP_PROB = 0.5 WORD_KEEP_PROB = 0.9 TAG_KEEP_PROB = 0.9 KFOLD = 10 ================================================ FILE: code/load_data.py ================================================ #!/usr/bin/env python # -*- coding: utf-8 -*- """ Load data. """ import pickle from time import time import numpy as np import configurations as config from TFNN.utils.io_util import read_lines from TFNN.utils.data_util import map_item2id def get_sentence_arr(words_tags, word_voc, tag_voc): """ 获取词序列 Args: words_tags: list, 句子 and tags word_voc: 词表 tag_voc: 词性标注表 Returns: sentence_arr: np.array, 字符id序列 tag_arr: np.array, 词性标记序列 """ words, postags = [], [] for item in words_tags: rindex = item.rindex('/') words.append(item[:rindex]) postags.append(item[rindex+1:]) # sentence arr sentence_arr = map_item2id( words, word_voc, config.MAX_LEN, lower=True) # pos tags arr postag_arr = map_item2id( postags, tag_voc, config.MAX_LEN, lower=False) return sentence_arr, postag_arr, len(words) def init_data(lines, word_voc, tag_voc, label_voc): """ 加载数据 Args: lines: list word_voc: dict, 词表 tag_voc: dict, 词性标注表 label_voc: dict Returns: sentences: np.array etc. """ data_count = len(lines) sentences = np.zeros((data_count, config.MAX_LEN), dtype='int32') tags = np.zeros((data_count, config.MAX_LEN), dtype='int32') sentence_actual_lengths = np.zeros((data_count,), dtype='int32') labels = np.zeros((data_count,), dtype='int32') instance_index = 0 for i in range(data_count): index = lines[i].index(',') label = lines[i][:index] sentence = lines[i][index+1:] words_tags = sentence.split(' ') sentence_arr, tag_arr, actual_length = get_sentence_arr(words_tags, word_voc, tag_voc) sentences[instance_index, :] = sentence_arr tags[instance_index, :] = tag_arr sentence_actual_lengths[instance_index] = actual_length labels[instance_index] = label_voc[label] if label in label_voc else 0 instance_index += 1 return sentences, tags, labels def load_embedding(): """ 加载词向量、词性向量 Return: word_weights: np.array tag_weights: np.array """ # 加载词向量 with open(config.W2V_TRAIN_PATH, 'rb') as file_r: word_weights = pickle.load(file_r) # 加载tag向量 with open(config.T2V_PATH, 'rb') as file_r: tag_weights = pickle.load(file_r) return word_weights, tag_weights def load_voc(): """ Load voc... Return: word_voc: dict tag_voc: dict label_voc: dict """ with open(config.WORD_VOC_PATH, 'rb') as file_r: word_voc = pickle.load(file_r) with open(config.TAG_VOC_PATH, 'rb') as file_r: tag_voc = pickle.load(file_r) with open(config.LABEL_VOC_PATH, 'rb') as file_r: label_voc = pickle.load(file_r) return word_voc, tag_voc, label_voc def load_train_data(word_voc, tag_voc, label_voc): """ 加载训练测试数据 Args: word_voc: dict tag_voc: dict label_voc: dict Returns: xx """ return init_data(read_lines(config.TRAIN_PATH), word_voc, tag_voc, label_voc) def load_test_data(word_voc, tag_voc, label_voc): """ 加载测试数据 Args: word_voc: dict tag_voc: dict label_voc: dict Returns: xx """ sentences, tags, _ = init_data(read_lines(config.TEST_PATH), word_voc, tag_voc, label_voc) return sentences, tags def demo(): t0 = time() word_weights, tag_weights = load_embedding() word_voc, tag_voc, label_voc = load_voc() data, label_voc = load_train_data() sentences, tags, labels = data[:] print(sentences.shape) print(tags.shape) print(labels.shape) print(word_weights.shape) print(tag_weights.shape) print('Done in %ds!' % (time()-t0)) if __name__ == '__main__': demo() ================================================ FILE: code/model_dc.py ================================================ #!/usr/bin/env python # -*- coding: utf-8 -*- import os from time import time import configurations as config import tensorflow as tf import numpy as np from load_data import load_embedding, load_voc, load_train_data, load_test_data from TFNN.layers.EmbeddingLayer import Embedding from TFNN.layers.DenseLayer import SoftmaxDense from TFNN.layers.ConvolutionalLayer import Convolutional1D from TFNN.utils.evaluate_util import sim_compute from TFNN.utils.tensor_util import zero_nil_slot from sklearn.model_selection import KFold import codecs from TFNN.utils.io_util import read_lines class DCModel(object): def __init__(self, max_len, word_weights, tag_weights, result_path=None, label_voc=None): """ Initilize model Args: max_len: int, 句子最大长度 word_weights: np.array, shape=[|V_words|, w2v_dim],词向量 tag_weights: np.array, shape=[|V_tags|, t2v_dim],标记向量 result_path: str, 模型评价结果存放路径 label_voc: dict """ self._result_path = result_path self._label_voc = label_voc self._label_voc_rev = dict() for key in self._label_voc: value = self._label_voc[key] self._label_voc_rev[value] = key # input placeholders self.input_sentence_ph = tf.placeholder( tf.int32, shape=(None, max_len), name='input_sentence_ph') self.input_tag_ph = tf.placeholder(tf.int32, shape=(None, max_len), name='input_tag_ph') self.label_ph = tf.placeholder(tf.int32, shape=(None,), name='label_ph') self.keep_prob_ph = tf.placeholder(tf.float32, name='keep_prob') self.word_keep_prob_ph = tf.placeholder(tf.float32, name='word_keep_prob') self.tag_keep_prob_ph = tf.placeholder(tf.float32, name='tag_keep_prob') # embedding layers self.nil_vars = set() word_embed_layer = Embedding( params=word_weights, ids=self.input_sentence_ph, keep_prob=self.word_keep_prob_ph, name='word_embed_layer') tag_embed_layer = Embedding( params=tag_weights, ids=self.input_tag_ph, keep_prob=self.tag_keep_prob_ph, name='tag_embed_layer') self.nil_vars.add(word_embed_layer.params.name) self.nil_vars.add(tag_embed_layer.params.name) # sentence representation sentence_input = tf.concat( values=[word_embed_layer.output, tag_embed_layer.output], axis=2) # sentence conv conv_layer = Convolutional1D( input_data=sentence_input, filter_length=3, nb_filter=1000, activation='relu', name='conv_layer') # dense layer dense_input_drop = tf.nn.dropout(conv_layer.output, self.keep_prob_ph) self.dense_layer = SoftmaxDense( input_data=dense_input_drop, input_dim=conv_layer.output_dim, output_dim=config.NB_LABELS, name='output_layer') self.loss = self.dense_layer.loss(self.label_ph) + \ 0.001*tf.nn.l2_loss(self.dense_layer.weights) optimizer = tf.train.AdamOptimizer() # Adam grads_and_vars = optimizer.compute_gradients(self.loss) nil_grads_and_vars = [] for g, v in grads_and_vars: if v.name in self.nil_vars: nil_grads_and_vars.append((zero_nil_slot(g), v)) else: nil_grads_and_vars.append((g, v)) global_step = tf.Variable(0, name='global_step', trainable=False) # train op self.train_op = optimizer.apply_gradients( nil_grads_and_vars, name='train_op', global_step=global_step) # pre op self.pre_op = self.dense_layer.get_pre_y() # summary gpu_options = tf.GPUOptions(visible_device_list='0', allow_growth=True) self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) # init model init = tf.global_variables_initializer() self.sess.run(init) def fit(self, sentences_train, tags_train, labels_train, sentences_dev=None, tags_dev=None, labels_dev=None, sentences_test=None, tags_test=None, labels_test=None, batch_size=64, nb_epoch=40, keep_prob=1.0, word_keep_prob=1.0, tag_keep_prob=1.0, seed=137): """ fit model Args: sentences_train, tags_train, labels_train: 训练数据 sentences_dev, tags_dev, labels_dev: 开发数据 batch_size: int, batch size nb_epoch: int, 迭代次数 keep_prob: float between [0, 1], 全连接层前的dropout word_keep_prob: float between [0, 1], 词向量层dropout tag_keep_prob: float between [0, 1], 标记向量层dropout """ self.nb_epoch_scores = [] # 存放nb_epoch次迭代的f值 nb_train = int(labels_train.shape[0] / batch_size) + 1 for step in range(nb_epoch): print('Epoch %d / %d:' % (step+1, nb_epoch)) # shuffle np.random.seed(seed) np.random.shuffle(sentences_train) np.random.seed(seed) np.random.shuffle(tags_train) np.random.seed(seed) np.random.shuffle(labels_train) # train total_loss = 0. for i in range(nb_train): # for i in range(nb_train): sentences_feed = sentences_train[i*batch_size:(i+1)*batch_size] tags_feed = tags_train[i*batch_size:(i+1)*batch_size] labels_feed = labels_train[i*batch_size:(i+1)*batch_size] feed_dict = { self.input_sentence_ph: sentences_feed, self.input_tag_ph: tags_feed, self.label_ph: labels_feed, self.keep_prob_ph: keep_prob, self.word_keep_prob_ph: word_keep_prob, self.tag_keep_prob_ph: tag_keep_prob, } _, loss_value = self.sess.run( [self.train_op, self.loss], feed_dict=feed_dict) total_loss += loss_value total_loss /= float(nb_train) # 计算在训练集、开发集、测试集上的性能 p_train, r_train, f_train = self.evaluate(sentences_train, tags_train, labels_train) p_dev, r_dev, f_dev = self.evaluate(sentences_dev, tags_dev, labels_dev) pre_labels = self.predict(sentences_test, tags_test) with codecs.open('./Data/result/epoch_%d.csv' % (step+1), 'w', encoding='utf-8') as file_w: for num, label in enumerate(pre_labels): file_w.write('%d,%s\n' % (num+1, self._label_voc_rev[label])) self.nb_epoch_scores.append([p_dev, r_dev, f_dev]) print('\tloss=%f, train f=%f, dev f=%f' % (total_loss, f_train, f_dev)) def predict(self, data_sentences, data_tags, batch_size=50): """ Args: data_sentences, data_tags: np.array batch_size: int Return: pre_labels: list """ pre_labels = [] nb_test = int(data_sentences.shape[0]/batch_size) + 1 for i in range(nb_test): sentences_feed = data_sentences[i*batch_size:(i+1)*batch_size] tags_feed = data_tags[i*batch_size:(i+1)*batch_size] feed_dict = { self.input_sentence_ph: sentences_feed, self.input_tag_ph: tags_feed, self.keep_prob_ph: 1.0, self.word_keep_prob_ph: 1.0, self.tag_keep_prob_ph: 1.0} pre_temp = self.sess.run(self.pre_op, feed_dict=feed_dict) pre_labels += list(pre_temp) return pre_labels def evaluate(self, data_sentences, data_tags, data_labels, ignore_label=None, batch_size=64, simple_compute=True): """ Args: data_sentences, data_tags, data_labels: np.array ignore_label: int, 负例的编号,或者None simple_compute: bool, 是否画出性能详细指标表格 Return: p, r, f1 """ pre_labels = [] nb_dev = int(len(data_labels)/batch_size) + 1 for i in range(nb_dev): sentences_feed = data_sentences[i*batch_size:(i+1)*batch_size] tags_feed = data_tags[i*batch_size:(i+1)*batch_size] labels_feed = data_labels[i*batch_size:(i+1)*batch_size] feed_dict = { self.input_sentence_ph: sentences_feed, self.input_tag_ph: tags_feed, self.label_ph: labels_feed, self.keep_prob_ph: 1.0, self.word_keep_prob_ph: 1.0, self.tag_keep_prob_ph: 1.0} pre_temp = self.sess.run(self.pre_op, feed_dict=feed_dict) pre_labels += list(pre_temp) right_labels = data_labels[:len(pre_labels)] pre, rec, f = sim_compute(pre_labels, right_labels, ignore_label=ignore_label) return pre, rec, f def clear_model(self): tf.reset_default_graph() # self.sess.close() def get_best_score(self): """ 计算模型得分(当开发集上f值达到最高时所对应的测试集得分) Returns: score: float, 开发集达到最高时,测试集的[p, r, f] nb_epoch: int, the num of epoch """ # nb_epoch_scores = sorted(self.nb_epoch_scores, key=lambda d: d[1][-1], reverse=True) nb_epoch, best_score = -1, None for i in range(len(self.nb_epoch_scores)): if not best_score or self.nb_epoch_scores[i][-1] > best_score[-1]: best_score = self.nb_epoch_scores[i] nb_epoch = i return best_score, nb_epoch def predict(): word_weights, tag_weights = load_embedding() word_voc, tag_voc, label_voc = load_voc() # train data sentences, tags, labels = load_train_data(word_voc, tag_voc, label_voc) seed = 137 np.random.seed(seed) np.random.shuffle(sentences) np.random.seed(seed) np.random.shuffle(tags) np.random.seed(seed) np.random.shuffle(labels) # load data sentences_test, tags_test = load_test_data(word_voc, tag_voc, label_voc) labels_test = None # clear reslut if not os.path.exists('./Data/result'): os.mkdir('./Data/result') command = 'rm ./Data/result/*' os.popen(command) # 划分训练、开发、测试集 kf = KFold(n_splits=config.KFOLD) train_indices, dev_indices = [], [] for train_index, dev_index in kf.split(labels): train_indices.append(train_index) dev_indices.append(dev_index) for num in range(config.KFOLD): train_index, dev_index = train_indices[num], dev_indices[num] sentences_train, sentences_dev = sentences[train_index], sentences[dev_index] tags_train, tags_dev = tags[train_index], tags[dev_index] labels_train, labels_dev = labels[train_index], labels[dev_index] # init model model = DCModel( config.MAX_LEN, word_weights, tag_weights, result_path='./Data/result/result.txt', label_voc=label_voc) # fit model model.fit( sentences_train, tags_train, labels_train, sentences_dev, tags_dev, labels_dev, sentences_test, tags_test, labels_test, config.BATCH_SIZE, config.NB_EPOCH, keep_prob=config.KEEP_PROB, word_keep_prob=config.WORD_KEEP_PROB, tag_keep_prob=config.TAG_KEEP_PROB) print(model.get_best_score()) [p_test, r_test, f_test], nb_epoch = model.get_best_score() command = 'cp ./Data/result/epoch_%d.csv ./Data/result/best_%d' % (nb_epoch+1, num) print(command) os.popen(command) print(p_test, r_test, f_test, '\n') # evaluate # result_path_k = result_path % k # p_test, r_test, f_test = model.evaluate(sentences_test, tags_test, positions_test, # labels_test, simple_compute=False, ignore_label=IGNORE_LABEL, # label_voc=relation_voc, result_path=result_path_k) # clear model model.clear_model() del model def init_result(): labels = [] for i in range(config.KFOLD): lines = read_lines('./Data/result/best_%d' % i) temp = [] for line in lines: label = line.split(',')[1] temp.append(label) labels.append(temp) return labels def merge(): datas = init_result() data_count = len(datas[0]) label_type_count = config.NB_LABELS labels = np.zeros((data_count, label_type_count)) for data in datas: for i, label in enumerate(data): label_id = int(label) - 1 labels[i][label_id] += 1 # 取众数 final_labels = [] for item in labels: label = item.argmax() + 1 final_labels.append(label) # clear result command = 'rm ./Data/result/*' os.popen(command) with codecs.open('./Data/result/integrade.csv', 'w', encoding='utf-8') as file_w: for i, label in enumerate(final_labels): file_w.write('%d,%d\n' % (i+1, label)) print('Result: %s' % file_w.name) if __name__ == '__main__': t0 = time() # predict test data predict() # merge merge() print('Done in %ds!' % (time()-t0)) ================================================ FILE: code/prepare_data.py ================================================ #!/usr/bin/env python # -*- coding: utf-8 -*- """ prepare data. 生成: word voc position voc relation type voc lookup tables """ import os import pickle import numpy as np import configurations as config from TFNN.utils.data_util import create_dictionary from TFNN.utils.io_util import read_lines from time import time def init_voc(): """ 初始化voc """ lines = read_lines(config.TRAIN_PATH) lines += read_lines(config.TEST_PATH) words = [] # 句子 pos_tags = [] # 词性标记类型 for line in lines: index = line.index(',') sentence = line[index+1:] # words and tags words_tags = sentence.split(' ') words_temp, tag_temp = [], [] for item in words_tags: r_index = item.rindex('/') word, tag = item[:r_index], item[r_index+1:] words_temp.append(word) tag_temp.append(tag) pos_tags.extend(tag_temp) words.extend(words_temp) # word voc create_dictionary( words, config.WORD_VOC_PATH, start=config.WORD_VOC_START, min_count=5, sort=True, lower=True, overwrite=True) # tag voc create_dictionary( pos_tags, config.TAG_VOC_PATH, start=config.TAG_VOC_START, sort=True, lower=False, overwrite=True) # label voc label_types = [str(i) for i in range(1, 12)] create_dictionary( label_types, config.LABEL_VOC_PATH, start=0, overwrite=True) def init_word_embedding(path=None, overwrite=False): """ 初始化word embedding Args: path: 结果存放路径 """ if os.path.exists(path) and not overwrite: return with open(config.W2V_PATH, 'rb') as file: w2v_dict_full = pickle.load(file) with open(config.WORD_VOC_PATH, 'rb') as file: w2id_dict = pickle.load(file) word_voc_size = len(w2id_dict.keys()) + config.WORD_VOC_START word_weights = np.zeros((word_voc_size, config.W2V_DIM), dtype='float32') for word in w2id_dict: index = w2id_dict[word] # 词的标号 if word in w2v_dict_full: word_weights[index, :] = w2v_dict_full[word] else: random_vec = np.random.uniform( -0.25, 0.25, size=(config.W2V_DIM,)).astype('float32') word_weights[index, :] = random_vec # 写入pkl文件 with open(path, 'wb') as file: pickle.dump(word_weights, file, protocol=2) def init_tag_embedding(path, overwrite=False): """ 初始化pos tag embedding Args: path: 结果存放路径 """ if os.path.exists(path) and not overwrite: return with open(config.TAG_VOC_PATH, 'rb') as file: tag_voc = pickle.load(file) tag_voc_size = len(tag_voc.keys()) + config.TAG_VOC_START tag_weights = np.random.normal( size=(tag_voc_size, config.TAG_DIM)).astype('float32') for i in range(config.TAG_VOC_START): tag_weights[i, :] = 0. with open(path, 'wb') as file: pickle.dump(tag_weights, file, protocol=2) def init_embedding(): """ 初始化embedding """ if not os.path.exists(config.EMBEDDING_ROOT): os.mkdir(config.EMBEDDING_ROOT) # 初始化word embedding init_word_embedding(config.W2V_TRAIN_PATH, overwrite=True) # 初始化tag embedding init_tag_embedding(config.T2V_PATH, overwrite=True) def demo(): with open(config.W2V_TRAIN_PATH, 'rb') as file: temp = pickle.load(file) print(temp.shape) if __name__ == '__main__': t0 = time() init_voc() # 初始化voc init_embedding() # 初始化embedding demo() print('Done in %.1fs!' % (time()-t0)) ================================================ FILE: code/train_w2v_model.py ================================================ #!/usr/bin/env python # coding=utf-8 import codecs import pickle from gensim.models import Word2Vec from gensim.models.word2vec import LineSentence from gensim.models.keyedvectors import KeyedVectors from TFNN.utils.io_util import read_lines def get_sentence(sentence_tag): words = [] for item in sentence_tag.split(' '): index = item.rindex('/') words.append(item[:index]) return ' '.join(words) def extract_sentece(): lines = read_lines('./Data/corpus/training.seg.csv') lines += read_lines('./Data/corpus/testing.seg.csv') with codecs.open('./Data/corpus/sentence.txt', 'w', encoding='utf-8') as file_w: for line in lines: index = line.index(',') word_tag = line[index+1:] file_w.write('%s\n' % get_sentence(word_tag)) def train(): extract_sentece() in_path = './Data/corpus/sentence.txt' out_path = './Data/embedding/word2vec.bin' # 训练模型 model = Word2Vec( sg=1, sentences=LineSentence(in_path), size=256, window=5, min_count=3, workers=4, iter=40) model.wv.save_word2vec_format(out_path, binary=True) def bin2pkl(): model = KeyedVectors.load_word2vec_format('./Data/embedding/word2vec.bin', binary=True) word_dict = {} for word in model.vocab: word_dict[word] = model[word] with open('./Data/embedding/word2vec.pkl', 'wb') as file_w: pickle.dump(word_dict, file_w) print(file_w.name) if __name__ == '__main__': train() bin2pkl()