Repository: liu-nlper/DocumentClassification
Branch: master
Commit: b6c6cce3638a
Files: 20
Total size: 38.2 KB
Directory structure:
gitextract__8lu9u9y/
├── README.md
└── code/
├── Data/
│ └── corpus/
│ └── corpus download link
├── README
├── TFNN/
│ ├── __init__.py
│ ├── activations.py
│ ├── layers/
│ │ ├── ConvolutionalLayer.py
│ │ ├── DenseLayer.py
│ │ ├── EmbeddingLayer.py
│ │ └── __init__.py
│ ├── objectives.py
│ └── utils/
│ ├── __init__.py
│ ├── data_util.py
│ ├── evaluate_util.py
│ ├── io_util.py
│ └── tensor_util.py
├── configurations.py
├── load_data.py
├── model_dc.py
├── prepare_data.py
└── train_w2v_model.py
================================================
FILE CONTENTS
================================================
================================================
FILE: README.md
================================================
# Document Classification
This code implements a simple CNN model for document classification with tensorflow.
# Model Structure

# Requirements
- Python: 2.7
- Tensorflow: 1.0.0
- Numpy: 1.12.1
- sklearn: 0.18.1
- gensim: 1.0.1
- pickle
================================================
FILE: code/Data/corpus/corpus download link
================================================
数据下载地址:http://competition.ai100.com.cn/html/game_det.html?id=24
将分词/词性标注后的文件命名为training.seg.csv和testing.seg.csv,放置到当前目录下。
处理后的文本如下所示:
"公司/n 是/vshi 经/p 批准/v 依法/d 从事/vi 融资/vi 性/ng 担保/vn 业务/n 的/ude1 ..."
================================================
FILE: code/README
================================================
下载数据到Data/corpus目录下,并做分词和词性标注处理,再按以下顺序执行:
train_w2v_model.py -> prepare_data.py -> model_dc.py
文件说明:
1. 分词、词性标注采用中科院NLPIR,处理后文件:
./Data/corpus/training.seg.csv
./Data/corpus/testing.seg.csv
2. configurations.py
配置文件
3. train_w2v_model.py
利用官方给的train, test训练词向量
4. prepare_data.py
构建词表,词性表等
5. load_data.py
加载数据
6. model_dc.py
训练模型并预测
================================================
FILE: code/TFNN/__init__.py
================================================
================================================
FILE: code/TFNN/activations.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
激活函数
"""
import tensorflow as tf
def get_activation(activation=None):
    """
    Look up a TensorFlow activation function by its name.

    Args:
        activation: str or None, one of 'tanh', 'relu', 'softmax'
    Return:
        The matching function from `tf.nn`, or None when `activation`
        is None.
    Raises:
        Exception: if the name is not a supported activation.
    """
    if activation is None:
        return None
    # `tf.nn` is only touched once the name is known to be supported.
    for supported_name in ('tanh', 'relu', 'softmax'):
        if activation == supported_name:
            return getattr(tf.nn, supported_name)
    raise Exception('Unknow activation function: %s' % activation)
================================================
FILE: code/TFNN/layers/ConvolutionalLayer.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
DNN Layers:
Convolutional1D
"""
import numpy as np
import tensorflow as tf
# from ..initializations import normal_weight
from ..activations import get_activation
class Convolutional1D(object):
    """
    1D convolution over a sequence of feature vectors, built on tf.nn.conv2d.

    Each filter covers `filter_length` consecutive rows and the full input
    width; the activated output can optionally be max-pooled over the row
    axis.
    """

    def __init__(self, input_data, filter_length, nb_filter, strides=None,
                 padding='VALID', activation='tanh', pooling=True,
                 name='Convolutional1D'):
        """
        Build the layer variables and its output tensor.

        Args:
            input_data: 3D tensor of shape=[batch_size, in_height, in_width];
                a trailing channel axis of size 1 is appended internally.
            filter_length: int, number of rows covered by one filter; the
                kernel has shape=[filter_length, in_width, 1, nb_filter]
            nb_filter: int, number of filters
            strides: list of 4 ints handed to tf.nn.conv2d; None (the
                default) means [1, 1, 1, 1]
            padding: only 'VALID' is supported for now
            activation: str or None, a name understood by get_activation
            pooling: bool, whether to max-pool over the row axis
            name: str, prefix of the name scope that holds the variables
        """
        # The trailing comma matters: `('VALID')` is just the string 'VALID',
        # which would turn this check into a substring test.
        assert padding in ('VALID',), 'Unknow padding %s' % padding
        # A fresh list per instance instead of a shared mutable default.
        if strides is None:
            strides = [1, 1, 1, 1]
        # Unpacking two values also enforces that the input is 3D.
        _, in_width = [int(dim) for dim in input_data.get_shape()[1:]]
        self._input_data = tf.expand_dims(input_data, -1)  # shape=[x, x, x, 1]
        self._filter_length = filter_length
        self._nb_filter = nb_filter
        self._strides = strides
        self._padding = padding
        self._activation = get_activation(activation)
        self._name = name
        self.pooling = pooling

        with tf.name_scope('%s_%d' % (name, filter_length)):
            if activation != 'relu':
                # Uniform init bounded by sqrt(6 / (fan_in + fan_out)).
                fan_in = filter_length * in_width
                # NOTE(review): fan_out is derived from in_width; in_height
                # may have been intended - left unchanged, please confirm.
                fan_out = nb_filter * (in_width - filter_length + 1)
                w_bound = np.sqrt(6. / (fan_in + fan_out))
                self.weights = tf.Variable(
                    tf.random_uniform(
                        minval=-w_bound, maxval=w_bound, dtype='float32',
                        shape=[filter_length, in_width, 1, nb_filter]),
                    name='conv_weight')
                tf.summary.histogram('weights', self.weights)
            else:  # scaled normal init for relu
                w_values = tf.random_normal(
                    shape=[filter_length, in_width, 1, nb_filter]
                ) * tf.sqrt(2. / (filter_length * in_width * nb_filter))
                self.weights = tf.Variable(w_values, name='conv_weight')
            # bias
            self.biases = tf.Variable(
                tf.constant(0.1, shape=[nb_filter, ]),
                name='conv_bias')
            tf.summary.histogram('biases', self.biases)
        self.call()

    def call(self):
        """Create the convolution, bias, activation and pooling ops."""
        # With padding='VALID' the conv output has
        # shape=[batch_size, in_height-filter_length+1, 1, nb_filters]
        conv_output = tf.nn.conv2d(
            input=self._input_data,
            filter=self.weights,
            strides=self._strides,
            padding=self._padding)
        # shape=[batch_size, new_height, 1, nb_filters]
        linear_output = tf.nn.bias_add(conv_output, self.biases)
        act_output = (
            linear_output if self._activation is None
            else self._activation(linear_output))
        if self.pooling:
            # max over the row axis, shape=[?, nb_filter]
            self._output = tf.reduce_max(tf.squeeze(act_output, [2]), 1)
        else:
            # shape=[?, n-w+1, nb_filter]
            self._output = tf.squeeze(act_output, axis=2)

    @property
    def input_data(self):
        """The input tensor after the channel axis was appended."""
        return self._input_data

    @property
    def output(self):
        """The output tensor built by call()."""
        return self._output

    @property
    def output_dim(self):
        """Size of the last output axis, i.e. the number of filters."""
        return self._nb_filter

    def get_weights(self):
        """Return the convolution kernel variable."""
        return self.weights
================================================
FILE: code/TFNN/layers/DenseLayer.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
DNN Layers:
SoftmaxLayer
"""
import tensorflow as tf
from ..activations import get_activation
class SoftmaxDense(object):
    """
    Fully connected layer with an optional activation, plus helpers for a
    sparse softmax cross-entropy loss and arg-max prediction.
    """

    def __init__(self, input_data, input_dim, output_dim, weights=None,
                 biases=None, activation=None, name='Dense'):
        """
        Args:
            input_data: 2D tensor, shape=[batch_size, input_dim]
            input_dim: int, size of the input features
            output_dim: int, size of the output
            weights: optional existing weight variable; created when None
            biases: optional existing bias variable; created when None
            activation: str or None, a name understood by get_activation
            name: str, name scope of the layer
        """
        assert len(input_data.get_shape()) == 2, \
            "全连接层的输入必须要flatten, 即shape=[batch_size, input_dim]"
        self._input_data = input_data
        self._input_dim = input_dim
        self._output_dim = output_dim
        self._activation = get_activation(activation)
        self._name = name
        with tf.name_scope(self._name):
            # weights: uniform in [-limit, limit] unless supplied by caller
            if weights is None:
                limit = tf.sqrt(6. / (input_dim + output_dim))
                initial_weights = tf.random_uniform(
                    minval=-limit, maxval=limit, dtype='float32',
                    shape=[input_dim, output_dim])
                weights = tf.Variable(initial_weights, name='weights')
            self._weights = weights
            tf.summary.histogram('weights', self._weights)
            # biases: constant 0.1 unless supplied by caller
            if biases is None:
                initial_biases = tf.constant(0.1, shape=[self._output_dim])
                biases = tf.Variable(initial_biases, name='biases')
            self._biases = biases
            tf.summary.histogram('biases', biases)
        self.call()

    def call(self):
        """Create the affine transform and apply the activation, if any."""
        affine = tf.matmul(self._input_data, self._weights) + self._biases
        if self._activation is None:
            self._output = affine
        else:
            self._output = self._activation(affine)

    def loss(self, y):
        """Mean sparse softmax cross-entropy of the output against labels y."""
        labels = tf.cast(y, tf.int32)
        per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.output, labels=labels, name='xentroy')
        return tf.reduce_mean(per_example, name='xentroy_mean')

    def get_pre_y(self):
        """Return the index of the largest output along axis 1."""
        # TODO: to be revised
        return tf.arg_max(input=self._output, dimension=1)

    @property
    def input_data(self):
        return self._input_data

    @property
    def input_dim(self):
        return self._input_dim

    @property
    def output_dim(self):
        return self._output_dim

    @property
    def name(self):
        return self._name

    @property
    def weights(self):
        return self._weights

    @property
    def biases(self):
        return self._biases

    @property
    def output(self):
        return self._output
================================================
FILE: code/TFNN/layers/EmbeddingLayer.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
DNN Layers:
Embedding
"""
import tensorflow as tf
class Embedding(object):
    """Trainable lookup table followed by dropout."""

    def __init__(self, params, ids, name, keep_prob=1.0):
        """
        Args:
            params: initial value of the embedding table
            ids: integer tensor of row indices to look up
            name: str, name scope of the layer
            keep_prob: float or scalar tensor, dropout keep probability
        """
        with tf.name_scope('%s' % name):
            # dtype has to be passed by keyword: the second positional
            # parameter of tf.Variable is `trainable`, so a positional
            # tf.float32 was silently taken as a truthy `trainable` flag.
            self._params = tf.Variable(params, dtype=tf.float32, name='embed')
            self._ids = ids
            # output
            embed_output = tf.nn.embedding_lookup(
                params=self._params,
                ids=self._ids
            )
            self._output = tf.nn.dropout(embed_output, keep_prob)

    @property
    def params(self):
        """The embedding table variable."""
        return self._params

    @property
    def output_dim(self):
        """Static size of the last axis of the output."""
        return int(self._output.get_shape()[-1])

    @property
    def output(self):
        """The looked-up embeddings after dropout."""
        return self._output
================================================
FILE: code/TFNN/layers/__init__.py
================================================
================================================
FILE: code/TFNN/objectives.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import tensorflow as tf
def categorical_crossentropy(y_true, y_pred):
    """
    Mean sparse softmax cross-entropy.

    Args:
        y_true: integer class labels, length=batch_size
        y_pred: 2D tensor of logits with shape=[batch_size, nb_classes]
    Returns:
        Scalar tensor: the mean of the per-example cross-entropy
    """
    per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y_true, logits=y_pred, name='xentroy')
    mean_loss = tf.reduce_mean(per_example, name='xentroy_mean')
    return mean_loss
================================================
FILE: code/TFNN/utils/__init__.py
================================================
================================================
FILE: code/TFNN/utils/data_util.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import pickle
import numpy as np
from collections import defaultdict
def flatten_list(nest_list):
    """
    Flatten an arbitrarily nested list, depth first.

    Only `list` instances are expanded; other containers (e.g. tuples)
    are kept as single elements.

    Args:
        nest_list: list, possibly containing nested lists
    Return:
        A new flat list
    """
    flat = []
    for element in nest_list:
        if not isinstance(element, list):
            flat.append(element)
            continue
        flat += flatten_list(element)
    return flat
def create_dictionary(items, dic_path, start=0, sort=True,
                      min_count=None, lower=False, overwrite=False):
    """
    Build an item -> index vocabulary and pickle it to `dic_path`.

    Args:
        items: list, [item_1, item_2, ...]
        dic_path: path of the pickle file to write (expected to end in pkl)
        start: int, first index of the vocabulary, 0 by default
        sort: bool, True: index by descending frequency;
            False: index by position in `items` (a repeated item keeps
            the index of its last occurrence)
        min_count: minimum frequency; rarer items are dropped
            (only applied when sort=True)
        lower: bool, lower-case the items first
        overwrite: bool, overwrite an existing file
    Returns:
        None
    """
    # NOTE(review): the docstring asks for a '.pkl' path while this only
    # rejects paths ending in 'pk' - confirm the intended check.
    assert not dic_path.endswith('pk')
    if os.path.exists(dic_path) and not overwrite:
        return
    # Lazily normalised so a large corpus is not copied.
    normalized = (item.lower() for item in items) if lower else items
    voc = dict()
    if sort:
        counts = defaultdict(int)
        for item in normalized:
            counts[item] += 1
        # Stable sort: equally frequent items keep their counting order.
        ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
        for rank, (key, count) in enumerate(ranked):
            if min_count and min_count > count:
                # Frequencies are descending, so everything after is rarer.
                break
            voc[key] = rank + start
    else:  # index by position in items
        for position, item in enumerate(normalized):
            voc[item] = position + start
    # `with` guarantees the file is closed even if pickling fails.
    with open(dic_path, 'wb') as file_w:
        pickle.dump(voc, file_w)
def map_item2id(items, voc, max_len, none_word=0, lower=False):
    """
    Map words / POS tags etc. to ids in a fixed-length array.

    Args:
        items: list of items to map
        voc: dict, item -> id
        max_len: int, length of the returned array; longer inputs are
            truncated, shorter ones are padded with 0
        none_word: int, id used for items missing from voc, 0 by default
        lower: bool, lower-case the items before the lookup
    Returns:
        arr: np.array, dtype=int32, shape=[max_len,]
    """
    assert type(none_word) == int
    ids = np.zeros((max_len,), dtype='int32')
    for position, item in enumerate(items[:max_len]):
        key = item.lower() if lower else item
        ids[position] = voc.get(key, none_word)
    return ids
def random_over_sampling():
    """
    Demo of random over-sampling on a toy data set.

    Prints the resampled features and labels; takes no arguments and
    returns None.
    """
    # Imported here so the module does not hard-depend on imbalanced-learn.
    from imblearn.over_sampling import RandomOverSampler
    x = [[1, 1, 1], [2, 2, 2], [3, 3, 3]]
    y = [1, 2, 3]
    ros = RandomOverSampler(random_state=42)
    # `fit_sample` was renamed `fit_resample` in newer imbalanced-learn
    # releases; use whichever this installation provides.
    resample = getattr(ros, 'fit_resample', None) or ros.fit_sample
    x_res, y_res = resample(x, y)
    print(x_res)
    print(y_res)
def demo():
    """Run the random over-sampling example."""
    random_over_sampling()


# Only run the example when this file is executed as a script.
if __name__ == '__main__':
    demo()
================================================
FILE: code/TFNN/utils/evaluate_util.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import numpy as np
from collections import defaultdict
import codecs
def sim_compute(pro_labels, right_labels, ignore_label=None):
    """
    Simple precision / recall / F1 over label sequences.

    Precision is the accuracy over positions whose predicted label is not
    `ignore_label`; recall is the accuracy over positions whose gold label
    is not `ignore_label`. A ratio with an empty denominator is 0.

    Args:
        pro_labels: list, predicted labels
        right_labels: list, gold labels, same length as pro_labels
        ignore_label: int or None, label to leave out
    Returns:
        pre, rec, f: floats
    """
    assert len(pro_labels) == len(right_labels)
    pre_pro_labels, pre_right_labels = [], []
    rec_pro_labels, rec_right_labels = [], []
    for pro_label, right_label in zip(pro_labels, right_labels):
        if pro_label != ignore_label:
            pre_pro_labels.append(pro_label)
            pre_right_labels.append(right_label)
        if right_label != ignore_label:
            rec_pro_labels.append(pro_label)
            rec_right_labels.append(right_label)

    def _match_ratio(predicted, expected):
        # Fraction of equal positions; 0. instead of a division by zero.
        if not predicted:
            return 0.
        predicted = np.array(predicted, dtype='int32')
        expected = np.array(expected, dtype='int32')
        return np.count_nonzero(predicted == expected) / float(len(predicted))

    pre = _match_ratio(pre_pro_labels, pre_right_labels)
    rec = _match_ratio(rec_pro_labels, rec_right_labels)
    f = 0. if (pre + rec) == 0. else (pre * rec * 2.) / (pre + rec)
    return pre, rec, f
def demo():
    """Print precision, recall and F1 for a small hand-made example."""
    predicted = [1, 2, 3, 4, 0, 6, 7, 0, 2, 8]
    gold = [0, 2, 3, 6, 5, 4, 7, 1, 0, 3]
    scores = sim_compute(predicted, gold, ignore_label=2)
    pre, rec, f = scores
    print('pre:', pre)
    print('rec:', rec)
    print(' f:', f)


# Only run the example when this file is executed as a script.
if __name__ == '__main__':
    demo()
================================================
FILE: code/TFNN/utils/io_util.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import codecs
def read_lines(path):
    """
    Read a UTF-8 text file and return its non-empty lines.

    Trailing whitespace is stripped from every line; lines that are empty
    after stripping are dropped.
    """
    with codecs.open(path, 'r', encoding='utf-8') as reader:
        stripped = (raw.rstrip() for raw in reader.readlines())
        return [text for text in stripped if text]
def get_file_list(path, postfix, file_list):
    """
    Recursively collect the files under `path` whose name ends in `postfix`.

    Args:
        path: str, directory to scan
        postfix: str, required file name suffix
        file_list: list, the matching paths are appended to it in place
    Return:
        None
    """
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if os.path.isdir(full_path):
            # descend into sub-directories
            get_file_list(full_path, postfix, file_list)
            continue
        if full_path.endswith(postfix):
            file_list.append(full_path)
    return None
================================================
FILE: code/TFNN/utils/tensor_util.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import tensorflow as tf
def zero_nil_slot(t, name=None):
    """
    Replace the nil slot (the first row) of the input tensor with zeros.

    Args:
        t: 2D tensor
        name: str, name of the resulting op
    Returns:
        Tensor of the same shape as t
    """
    with tf.name_scope('zero_nil_slot'):
        nb_cols = tf.shape(t)[1]
        zero_row = tf.zeros([1, nb_cols], dtype=tf.float32)
        # every row except the first one
        remaining_rows = tf.slice(t, [1, 0], [-1, -1])
        return tf.concat(
            values=[zero_row, remaining_rows], axis=0, name=name)
def add_gradient_noise(t, stddev=1e-3, name=None):
    """
    Add gaussian noise to a gradient, as described in
    http://arxiv.org/abs/1511.06807 [2].

    0.001 was said to be a good fixed value for memory networks [2].

    Args:
        t: 2D tensor, expected to be a gradient
        stddev: float, standard deviation of the noise
        name: str, name of the resulting op
    Returns:
        2D tensor, same shape as t: `t` + gaussian noise
    """
    with tf.name_scope("add_gradient_noise"):
        noise = tf.random_normal(tf.shape(t), stddev=stddev)
        noisy = tf.add(t, noise, name=name)
        return noisy
def mask_tensor(input_data, lengths, maxlen, dtype=tf.float32):
    """
    Multiply the input by a sequence mask built from `lengths`.

    Args:
        input_data: 2D tensor
        lengths: integer vector, all its values < maxlen
        maxlen: scalar integer tensor
        dtype: type the boolean mask is cast to before multiplying
    Returns:
        input_data * mask
    """
    bool_mask = tf.sequence_mask(lengths, maxlen)
    mask = tf.cast(bool_mask, dtype)
    return tf.multiply(input_data, mask)
================================================
FILE: code/configurations.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
configurations
"""
import os
def _ensure_dir(path):
    """Create `path`, including missing parents, unless it already exists."""
    if not os.path.isdir(path):
        os.makedirs(path)


# --- corpus ---
TRAIN_PATH = './Data/corpus/training.seg.csv'
TEST_PATH = './Data/corpus/testing.seg.csv'

# --- voc ---
VOC_ROOT = './Data/voc'
# makedirs: importing the configuration must not fail when ./Data is missing
_ensure_dir(VOC_ROOT)
WORD_VOC_PATH = VOC_ROOT + '/word_voc.pkl'
WORD_VOC_START = 2  # first id given to a word
TAG_VOC_PATH = VOC_ROOT + '/tag_voc.pkl'
TAG_VOC_START = 1  # first id given to a POS tag
LABEL_VOC_PATH = VOC_ROOT + '/label_voc.pkl'

# --- embedding ---
W2V_DIM = 256
W2V_PATH = './Data/embedding/word2vec.pkl'
EMBEDDING_ROOT = './Data/embedding/'
_ensure_dir(EMBEDDING_ROOT)
W2V_TRAIN_PATH = EMBEDDING_ROOT + '/word2v.pkl'
T2V_PATH = EMBEDDING_ROOT + '/tag2v.pkl'
TAG_DIM = 64

# --- training param ---
MAX_LEN = 300
BATCH_SIZE = 64
NB_LABELS = 11
NB_EPOCH = 30
KEEP_PROB = 0.5
WORD_KEEP_PROB = 0.9
TAG_KEEP_PROB = 0.9
KFOLD = 10
================================================
FILE: code/load_data.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Load data.
"""
import pickle
from time import time
import numpy as np
import configurations as config
from TFNN.utils.io_util import read_lines
from TFNN.utils.data_util import map_item2id
def get_sentence_arr(words_tags, word_voc, tag_voc):
    """
    Split 'word/tag' tokens and map words and tags to id arrays.

    Args:
        words_tags: list of 'word/tag' strings
        word_voc: dict, word vocabulary
        tag_voc: dict, POS tag vocabulary
    Returns:
        sentence_arr: np.array, word ids (words are looked up lower-cased)
        postag_arr: np.array, POS tag ids
        length: int, number of tokens in words_tags
    """
    # Split on the LAST slash of every token.
    cuts = [token.rindex('/') for token in words_tags]
    words = [token[:cut] for token, cut in zip(words_tags, cuts)]
    postags = [token[cut + 1:] for token, cut in zip(words_tags, cuts)]
    sentence_arr = map_item2id(words, word_voc, config.MAX_LEN, lower=True)
    postag_arr = map_item2id(postags, tag_voc, config.MAX_LEN, lower=False)
    return sentence_arr, postag_arr, len(words)
def init_data(lines, word_voc, tag_voc, label_voc):
    """
    Turn 'label,sentence' lines into id matrices.

    Every line is split at its first comma; the remainder is a space
    separated sequence of 'word/tag' tokens.

    Args:
        lines: list of str
        word_voc: dict, word vocabulary
        tag_voc: dict, POS tag vocabulary
        label_voc: dict, label -> id
    Returns:
        sentences: np.array, int32, shape=[len(lines), config.MAX_LEN]
        tags: np.array, int32, shape=[len(lines), config.MAX_LEN]
        labels: np.array, int32, shape=[len(lines)]
    """
    data_count = len(lines)
    sentences = np.zeros((data_count, config.MAX_LEN), dtype='int32')
    tags = np.zeros((data_count, config.MAX_LEN), dtype='int32')
    labels = np.zeros((data_count,), dtype='int32')
    for row, line in enumerate(lines):
        comma = line.index(',')
        label, sentence = line[:comma], line[comma + 1:]
        # The token count returned as third value is not needed here.
        sentence_arr, tag_arr, _ = get_sentence_arr(
            sentence.split(' '), word_voc, tag_voc)
        sentences[row, :] = sentence_arr
        tags[row, :] = tag_arr
        # Labels missing from label_voc fall back to id 0.
        labels[row] = label_voc.get(label, 0)
    return sentences, tags, labels
def load_embedding():
    """
    Load the pickled word and POS tag embedding tables.

    Return:
        word_weights: object unpickled from config.W2V_TRAIN_PATH
        tag_weights: object unpickled from config.T2V_PATH
    """
    loaded = []
    for pkl_path in (config.W2V_TRAIN_PATH, config.T2V_PATH):
        with open(pkl_path, 'rb') as file_r:
            loaded.append(pickle.load(file_r))
    word_weights, tag_weights = loaded
    return word_weights, tag_weights
def load_voc():
    """
    Load the pickled vocabularies.

    Return:
        word_voc: object unpickled from config.WORD_VOC_PATH
        tag_voc: object unpickled from config.TAG_VOC_PATH
        label_voc: object unpickled from config.LABEL_VOC_PATH
    """
    def _unpickle(pkl_path):
        with open(pkl_path, 'rb') as file_r:
            return pickle.load(file_r)

    word_voc = _unpickle(config.WORD_VOC_PATH)
    tag_voc = _unpickle(config.TAG_VOC_PATH)
    label_voc = _unpickle(config.LABEL_VOC_PATH)
    return word_voc, tag_voc, label_voc
def load_train_data(word_voc, tag_voc, label_voc):
    """
    Load the training corpus from config.TRAIN_PATH.

    Args:
        word_voc: dict
        tag_voc: dict
        label_voc: dict
    Returns:
        sentences, tags, labels: the arrays built by init_data
    """
    train_lines = read_lines(config.TRAIN_PATH)
    return init_data(train_lines, word_voc, tag_voc, label_voc)
def load_test_data(word_voc, tag_voc, label_voc):
    """
    Load the test corpus from config.TEST_PATH.

    Args:
        word_voc: dict
        tag_voc: dict
        label_voc: dict
    Returns:
        sentences, tags: the arrays built by init_data; the label array
        is discarded
    """
    test_lines = read_lines(config.TEST_PATH)
    arrays = init_data(test_lines, word_voc, tag_voc, label_voc)
    sentences, tags = arrays[0], arrays[1]
    return sentences, tags
def demo():
    """Load embeddings, vocabularies and training data; print their shapes."""
    t0 = time()
    word_weights, tag_weights = load_embedding()
    word_voc, tag_voc, label_voc = load_voc()
    # load_train_data needs the three vocabularies and returns three arrays.
    sentences, tags, labels = load_train_data(word_voc, tag_voc, label_voc)
    print(sentences.shape)
    print(tags.shape)
    print(labels.shape)
    print(word_weights.shape)
    print(tag_weights.shape)
    print('Done in %ds!' % (time()-t0))


if __name__ == '__main__':
    demo()
================================================
FILE: code/model_dc.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
from time import time
import configurations as config
import tensorflow as tf
import numpy as np
from load_data import load_embedding, load_voc, load_train_data, load_test_data
from TFNN.layers.EmbeddingLayer import Embedding
from TFNN.layers.DenseLayer import SoftmaxDense
from TFNN.layers.ConvolutionalLayer import Convolutional1D
from TFNN.utils.evaluate_util import sim_compute
from TFNN.utils.tensor_util import zero_nil_slot
from sklearn.model_selection import KFold
import codecs
from TFNN.utils.io_util import read_lines
class DCModel(object):
    """
    CNN document classifier: word and POS tag embeddings are concatenated,
    passed through a 1D convolution with max pooling and a dense output
    layer. The graph, the session and the variables are created in
    __init__.
    """

    def __init__(self, max_len, word_weights, tag_weights, result_path=None, label_voc=None):
        """
        Initialize the model.

        Args:
            max_len: int, maximum sentence length
            word_weights: np.array, shape=[|V_words|, w2v_dim], word vectors
            tag_weights: np.array, shape=[|V_tags|, t2v_dim], tag vectors
            result_path: str, where evaluation results are meant to go
                (only stored by this class)
            label_voc: dict, label -> id; None is treated as empty
        """
        self._result_path = result_path
        # The default is None; fall back to an empty mapping so building
        # the reverse map below cannot fail.
        self._label_voc = dict() if label_voc is None else label_voc
        # id -> label, used when writing predictions
        self._label_voc_rev = dict(
            (value, key) for key, value in self._label_voc.items())

        # input placeholders
        self.input_sentence_ph = tf.placeholder(
            tf.int32, shape=(None, max_len), name='input_sentence_ph')
        self.input_tag_ph = tf.placeholder(
            tf.int32, shape=(None, max_len), name='input_tag_ph')
        self.label_ph = tf.placeholder(tf.int32, shape=(None,), name='label_ph')
        self.keep_prob_ph = tf.placeholder(tf.float32, name='keep_prob')
        self.word_keep_prob_ph = tf.placeholder(tf.float32, name='word_keep_prob')
        self.tag_keep_prob_ph = tf.placeholder(tf.float32, name='tag_keep_prob')

        # embedding layers
        self.nil_vars = set()
        word_embed_layer = Embedding(
            params=word_weights, ids=self.input_sentence_ph,
            keep_prob=self.word_keep_prob_ph, name='word_embed_layer')
        tag_embed_layer = Embedding(
            params=tag_weights, ids=self.input_tag_ph,
            keep_prob=self.tag_keep_prob_ph, name='tag_embed_layer')
        self.nil_vars.add(word_embed_layer.params.name)
        self.nil_vars.add(tag_embed_layer.params.name)

        # sentence representation
        sentence_input = tf.concat(
            values=[word_embed_layer.output, tag_embed_layer.output], axis=2)

        # sentence conv
        conv_layer = Convolutional1D(
            input_data=sentence_input, filter_length=3,
            nb_filter=1000, activation='relu', name='conv_layer')

        # dense layer
        dense_input_drop = tf.nn.dropout(conv_layer.output, self.keep_prob_ph)
        self.dense_layer = SoftmaxDense(
            input_data=dense_input_drop, input_dim=conv_layer.output_dim,
            output_dim=config.NB_LABELS, name='output_layer')

        # cross-entropy plus L2 penalty on the output weights
        self.loss = self.dense_layer.loss(self.label_ph) + \
            0.001*tf.nn.l2_loss(self.dense_layer.weights)
        optimizer = tf.train.AdamOptimizer()  # Adam
        grads_and_vars = optimizer.compute_gradients(self.loss)
        # The gradient of row 0 of both embedding tables is zeroed.
        nil_grads_and_vars = []
        for g, v in grads_and_vars:
            if v.name in self.nil_vars:
                nil_grads_and_vars.append((zero_nil_slot(g), v))
            else:
                nil_grads_and_vars.append((g, v))
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # train op
        self.train_op = optimizer.apply_gradients(
            nil_grads_and_vars, name='train_op', global_step=global_step)

        # pre op
        self.pre_op = self.dense_layer.get_pre_y()

        # session
        gpu_options = tf.GPUOptions(visible_device_list='0', allow_growth=True)
        self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

        # init model
        init = tf.global_variables_initializer()
        self.sess.run(init)

    @staticmethod
    def _nb_batches(nb_samples, batch_size):
        """Number of batches needed to cover nb_samples (ceil division)."""
        return (nb_samples + batch_size - 1) // batch_size

    def fit(self, sentences_train, tags_train, labels_train,
            sentences_dev=None, tags_dev=None, labels_dev=None,
            sentences_test=None, tags_test=None, labels_test=None,
            batch_size=64, nb_epoch=40, keep_prob=1.0, word_keep_prob=1.0,
            tag_keep_prob=1.0, seed=137):
        """
        Train the model.

        After every epoch the model is evaluated on the training and dev
        data, the test data is predicted and the predictions are written
        to './Data/result/epoch_<n>.csv' as 'index,label' rows. The dev
        [p, r, f] of every epoch is kept in self.nb_epoch_scores.
        The dev and test arrays default to None but are used
        unconditionally, so they have to be supplied. The training arrays
        are shuffled in place.

        Args:
            sentences_train, tags_train, labels_train: training data
            sentences_dev, tags_dev, labels_dev: development data
            sentences_test, tags_test: test data to predict
            labels_test: unused
            batch_size: int, batch size
            nb_epoch: int, number of epochs
            keep_prob: float in [0, 1], dropout before the dense layer
            word_keep_prob: float in [0, 1], word embedding dropout
            tag_keep_prob: float in [0, 1], tag embedding dropout
            seed: int, seed of the per-epoch shuffle
        """
        self.nb_epoch_scores = []  # dev [p, r, f] of every epoch
        # Ceil division: the former `int(n / batch_size) + 1` produced an
        # empty extra batch whenever n was a multiple of batch_size, and
        # the mean loss of an empty batch is NaN.
        nb_train = self._nb_batches(labels_train.shape[0], batch_size)
        for step in range(nb_epoch):
            print('Epoch %d / %d:' % (step+1, nb_epoch))
            # Re-seeding before each shuffle applies the same permutation
            # to the three arrays, keeping them aligned.
            np.random.seed(seed)
            np.random.shuffle(sentences_train)
            np.random.seed(seed)
            np.random.shuffle(tags_train)
            np.random.seed(seed)
            np.random.shuffle(labels_train)

            # train
            total_loss = 0.
            for i in range(nb_train):
                batch = slice(i*batch_size, (i+1)*batch_size)
                feed_dict = {
                    self.input_sentence_ph: sentences_train[batch],
                    self.input_tag_ph: tags_train[batch],
                    self.label_ph: labels_train[batch],
                    self.keep_prob_ph: keep_prob,
                    self.word_keep_prob_ph: word_keep_prob,
                    self.tag_keep_prob_ph: tag_keep_prob,
                }
                _, loss_value = self.sess.run(
                    [self.train_op, self.loss], feed_dict=feed_dict)
                total_loss += loss_value
            total_loss /= float(max(nb_train, 1))

            # performance on the training and development sets
            p_train, r_train, f_train = self.evaluate(
                sentences_train, tags_train, labels_train)
            p_dev, r_dev, f_dev = self.evaluate(
                sentences_dev, tags_dev, labels_dev)

            # predictions on the test set, one file per epoch
            pre_labels = self.predict(sentences_test, tags_test)
            with codecs.open('./Data/result/epoch_%d.csv' % (step+1), 'w', encoding='utf-8') as file_w:
                for num, label in enumerate(pre_labels):
                    file_w.write('%d,%s\n' % (num+1, self._label_voc_rev[label]))

            self.nb_epoch_scores.append([p_dev, r_dev, f_dev])
            print('\tloss=%f, train f=%f, dev f=%f' % (total_loss, f_train, f_dev))

    def predict(self, data_sentences, data_tags, batch_size=50):
        """
        Predict label ids with dropout disabled.

        Args:
            data_sentences, data_tags: np.array
            batch_size: int
        Return:
            pre_labels: list of predicted label ids
        """
        pre_labels = []
        nb_test = self._nb_batches(data_sentences.shape[0], batch_size)
        for i in range(nb_test):
            batch = slice(i*batch_size, (i+1)*batch_size)
            feed_dict = {
                self.input_sentence_ph: data_sentences[batch],
                self.input_tag_ph: data_tags[batch],
                self.keep_prob_ph: 1.0,
                self.word_keep_prob_ph: 1.0,
                self.tag_keep_prob_ph: 1.0}
            pre_temp = self.sess.run(self.pre_op, feed_dict=feed_dict)
            pre_labels += list(pre_temp)
        return pre_labels

    def evaluate(self, data_sentences, data_tags, data_labels,
                 ignore_label=None, batch_size=64, simple_compute=True):
        """
        Predict with dropout disabled and score against the gold labels.

        Args:
            data_sentences, data_tags, data_labels: np.array
            ignore_label: int or None, forwarded to sim_compute
            batch_size: int
            simple_compute: bool, currently unused
        Return:
            p, r, f1
        """
        pre_labels = []
        nb_dev = self._nb_batches(len(data_labels), batch_size)
        for i in range(nb_dev):
            batch = slice(i*batch_size, (i+1)*batch_size)
            feed_dict = {
                self.input_sentence_ph: data_sentences[batch],
                self.input_tag_ph: data_tags[batch],
                self.label_ph: data_labels[batch],
                self.keep_prob_ph: 1.0,
                self.word_keep_prob_ph: 1.0,
                self.tag_keep_prob_ph: 1.0}
            pre_temp = self.sess.run(self.pre_op, feed_dict=feed_dict)
            pre_labels += list(pre_temp)
        right_labels = data_labels[:len(pre_labels)]
        pre, rec, f = sim_compute(pre_labels, right_labels, ignore_label=ignore_label)
        return pre, rec, f

    def clear_model(self):
        """Close the session and reset the default graph."""
        # Close first: resetting the default graph while a session is
        # still active is documented as undefined behaviour.
        self.sess.close()
        tf.reset_default_graph()

    def get_best_score(self):
        """
        Find the epoch with the highest development f value.

        Returns:
            score: [p, r, f] measured on the development set at that
                epoch (the first one on ties), None if fit has not
                recorded any epoch
            nb_epoch: int, 0-based index of that epoch, -1 if none
        """
        nb_epoch, best_score = -1, None
        for i, score in enumerate(self.nb_epoch_scores):
            if best_score is None or score[-1] > best_score[-1]:
                best_score, nb_epoch = score, i
        return best_score, nb_epoch
def predict():
    """
    Train one model per cross-validation fold and keep its best epoch.

    For every fold the per-epoch test predictions written by DCModel.fit
    are inspected, and the file of the epoch with the best development
    score is copied to './Data/result/best_<fold>'.
    """
    # Local imports: only this routine needs them.
    import glob
    import shutil

    word_weights, tag_weights = load_embedding()
    word_voc, tag_voc, label_voc = load_voc()

    # train data; the same seed keeps the three arrays aligned
    sentences, tags, labels = load_train_data(word_voc, tag_voc, label_voc)
    seed = 137
    np.random.seed(seed)
    np.random.shuffle(sentences)
    np.random.seed(seed)
    np.random.shuffle(tags)
    np.random.seed(seed)
    np.random.shuffle(labels)

    # test data
    sentences_test, tags_test = load_test_data(word_voc, tag_voc, label_voc)
    labels_test = None

    # Clear old results synchronously. The former un-awaited
    # `os.popen('rm ...')` could still be running when new result files
    # were being written.
    result_dir = './Data/result'
    if not os.path.exists(result_dir):
        os.mkdir(result_dir)
    for stale in glob.glob(result_dir + '/*'):
        if os.path.isfile(stale):
            os.remove(stale)

    # split into training / development folds
    kf = KFold(n_splits=config.KFOLD)
    for num, (train_index, dev_index) in enumerate(kf.split(labels)):
        sentences_train, sentences_dev = sentences[train_index], sentences[dev_index]
        tags_train, tags_dev = tags[train_index], tags[dev_index]
        labels_train, labels_dev = labels[train_index], labels[dev_index]

        # init model
        model = DCModel(
            config.MAX_LEN, word_weights, tag_weights, result_path='./Data/result/result.txt',
            label_voc=label_voc)

        # fit model
        model.fit(
            sentences_train, tags_train, labels_train,
            sentences_dev, tags_dev, labels_dev,
            sentences_test, tags_test, labels_test,
            config.BATCH_SIZE, config.NB_EPOCH, keep_prob=config.KEEP_PROB,
            word_keep_prob=config.WORD_KEEP_PROB, tag_keep_prob=config.TAG_KEEP_PROB)
        print(model.get_best_score())
        [p_test, r_test, f_test], nb_epoch = model.get_best_score()

        # Keep the predictions of the best epoch; copied in-process so the
        # file is complete before the next fold overwrites epoch_*.csv.
        source = './Data/result/epoch_%d.csv' % (nb_epoch+1)
        target = './Data/result/best_%d' % num
        print('cp %s %s' % (source, target))
        shutil.copyfile(source, target)
        print(p_test, r_test, f_test, '\n')

        # clear model
        model.clear_model()
        del model
def init_result():
    """
    Read the best predictions of every fold.

    Returns:
        list with one entry per fold; each entry is the list of the second
        comma-separated field of every line of './Data/result/best_<fold>'
    """
    return [
        [line.split(',')[1]
         for line in read_lines('./Data/result/best_%d' % fold)]
        for fold in range(config.KFOLD)
    ]
def merge():
    """
    Combine the per-fold predictions by majority vote.

    The labels read by init_result are expected to be the integers
    1..config.NB_LABELS as text. The most voted label of every sample is
    written to './Data/result/integrade.csv' as 'index,label' rows, after
    the other files in the result directory have been removed.
    """
    import glob  # local import: only this routine needs it

    datas = init_result()
    data_count = len(datas[0])
    votes = np.zeros((data_count, config.NB_LABELS))
    for data in datas:
        for i, label in enumerate(data):
            votes[i][int(label) - 1] += 1

    # most voted label per sample (the lowest label wins ties)
    final_labels = votes.argmax(axis=1) + 1

    # Clear the directory synchronously. The former un-awaited
    # `os.popen('rm ...')` could run after the merged file below had been
    # created and delete it.
    for stale in glob.glob('./Data/result/*'):
        if os.path.isfile(stale):
            os.remove(stale)

    with codecs.open('./Data/result/integrade.csv', 'w', encoding='utf-8') as file_w:
        for i, label in enumerate(final_labels):
            file_w.write('%d,%d\n' % (i+1, label))
    print('Result: %s' % file_w.name)
if __name__ == '__main__':
    start_time = time()
    # Train the fold models and predict the test data, then merge the
    # per-fold predictions into a single result file.
    predict()
    merge()
    elapsed = time() - start_time
    print('Done in %ds!' % elapsed)
================================================
FILE: code/prepare_data.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Prepare data.
Generates:
word voc
POS tag voc
label voc
word / tag embedding tables
"""
import os
import pickle
import numpy as np
import configurations as config
from TFNN.utils.data_util import create_dictionary
from TFNN.utils.io_util import read_lines
from time import time
def init_voc():
    """
    Build and pickle the word, POS tag and label vocabularies.

    Words and tags are collected from both the training and the test
    corpus; every line is split at its first comma and the remainder is a
    space separated sequence of 'word/tag' tokens.
    """
    lines = read_lines(config.TRAIN_PATH)
    lines += read_lines(config.TEST_PATH)
    words = []  # every word occurrence
    pos_tags = []  # every POS tag occurrence
    for line in lines:
        sentence = line[line.index(',') + 1:]
        for item in sentence.split(' '):
            # split on the LAST slash of the token
            r_index = item.rindex('/')
            words.append(item[:r_index])
            pos_tags.append(item[r_index + 1:])

    # word voc
    create_dictionary(
        words, config.WORD_VOC_PATH, start=config.WORD_VOC_START,
        min_count=5, sort=True, lower=True, overwrite=True)

    # tag voc
    create_dictionary(
        pos_tags, config.TAG_VOC_PATH, start=config.TAG_VOC_START,
        sort=True, lower=False, overwrite=True)

    # label voc: labels are '1' .. str(NB_LABELS); taken from the
    # configuration instead of a second hard-coded count
    label_types = [str(i) for i in range(1, config.NB_LABELS + 1)]
    create_dictionary(
        label_types, config.LABEL_VOC_PATH, start=0, overwrite=True)
def init_word_embedding(path=None, overwrite=False):
    """
    Build the word embedding lookup table and pickle it.

    Row ``i`` of the table belongs to the word whose id is ``i`` in the word
    vocabulary (config.WORD_VOC_PATH).  A word present in the pre-trained
    vectors (config.W2V_PATH) gets that vector; any other word gets a random
    vector drawn uniformly from [-0.25, 0.25).  Rows that are not the id of
    any word stay zero.

    Args:
        path: output pickle path; config.W2V_TRAIN_PATH is used when None
        overwrite: when False and `path` already exists, nothing is done
    """
    if path is None:
        # os.path.exists(None) raises TypeError, so resolve the default first.
        path = config.W2V_TRAIN_PATH
    if os.path.exists(path) and not overwrite:
        return
    with open(config.W2V_PATH, 'rb') as file_r:
        w2v_dict_full = pickle.load(file_r)
    with open(config.WORD_VOC_PATH, 'rb') as file_r:
        w2id_dict = pickle.load(file_r)
    word_voc_size = len(w2id_dict) + config.WORD_VOC_START
    word_weights = np.zeros((word_voc_size, config.W2V_DIM), dtype='float32')
    for word, index in w2id_dict.items():  # index: id of the word
        if word in w2v_dict_full:
            word_weights[index, :] = w2v_dict_full[word]
        else:
            word_weights[index, :] = np.random.uniform(
                -0.25, 0.25, size=(config.W2V_DIM,)).astype('float32')
    # write the table to the pkl file
    with open(path, 'wb') as file_w:
        pickle.dump(word_weights, file_w, protocol=2)
def init_tag_embedding(path, overwrite=False):
    """
    Create a randomly initialised POS tag embedding table and pickle it.

    Args:
        path: output pickle path
        overwrite: when False, an existing file at `path` is left untouched
    """
    already_built = os.path.exists(path)
    if already_built and not overwrite:
        return
    with open(config.TAG_VOC_PATH, 'rb') as voc_file:
        tag_voc = pickle.load(voc_file)
    nb_rows = len(tag_voc) + config.TAG_VOC_START
    tag_weights = np.random.normal(size=(nb_rows, config.TAG_DIM))
    tag_weights = tag_weights.astype('float32')
    # The rows below TAG_VOC_START are set to zero.
    if config.TAG_VOC_START > 0:
        tag_weights[:config.TAG_VOC_START, :] = 0.
    with open(path, 'wb') as out_file:
        pickle.dump(tag_weights, out_file, protocol=2)
def init_embedding():
    """
    Create the embedding directory if it is missing, then rebuild both the
    word and the tag embedding tables (existing files are overwritten).
    """
    root = config.EMBEDDING_ROOT
    if not os.path.exists(root):
        os.mkdir(root)
    # word embedding first, then tag embedding
    init_word_embedding(path=config.W2V_TRAIN_PATH, overwrite=True)
    init_tag_embedding(path=config.T2V_PATH, overwrite=True)
def demo():
    """Load the pickled word embedding table and print its shape."""
    with open(config.W2V_TRAIN_PATH, 'rb') as file_r:
        word_weights = pickle.load(file_r)
    print(word_weights.shape)
if __name__ == '__main__':
    # Build the vocabularies, then the embedding tables, then print the
    # shape of the word table and the elapsed time.
    start_time = time()
    init_voc()
    init_embedding()
    demo()
    elapsed = time() - start_time
    print('Done in %.1fs!' % elapsed)
================================================
FILE: code/train_w2v_model.py
================================================
#!/usr/bin/env python
# coding=utf-8
import codecs
import pickle
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.models.keyedvectors import KeyedVectors
from TFNN.utils.io_util import read_lines
def get_sentence(sentence_tag):
    """
    Strip the POS tags from a tagged sentence.

    Args:
        sentence_tag: space separated ``word/tag`` tokens, e.g. 'a/n b/v'

    Returns:
        The words joined by single spaces, e.g. 'a b'.  Empty tokens
        (caused by doubled, leading or trailing spaces, or by an empty
        input) are ignored.

    Raises:
        ValueError: if a non-empty token contains no '/'.
    """
    words = []
    for item in sentence_tag.split(' '):
        if not item:
            # ''.rindex('/') would raise; an empty token carries no word.
            continue
        # Cut at the LAST '/', so any '/' inside the word is kept.
        words.append(item[:item.rindex('/')])
    return ' '.join(words)
def extract_sentece():
    """
    Write the untagged sentences of the training and test corpora to
    ./Data/corpus/sentence.txt, one sentence per line.
    """
    corpus_lines = read_lines('./Data/corpus/training.seg.csv')
    corpus_lines += read_lines('./Data/corpus/testing.seg.csv')
    target = './Data/corpus/sentence.txt'
    with codecs.open(target, 'w', encoding='utf-8') as out:
        for row in corpus_lines:
            # keep only what follows the first comma
            tagged = row[row.index(',') + 1:]
            out.write('%s\n' % get_sentence(tagged))
def train():
    """
    Train word vectors on the extracted sentences and save them in binary
    word2vec format to ./Data/embedding/word2vec.bin.

    The sentence file is (re)generated first via extract_sentece().
    """
    import os  # local import: only needed to prepare the output directory

    in_path = './Data/corpus/sentence.txt'
    out_path = './Data/embedding/word2vec.bin'
    # Create the output directory before training, so that a missing
    # directory cannot make the save fail after the whole training run.
    out_dir = os.path.dirname(out_path)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    extract_sentece()
    # train the model
    model = Word2Vec(
        sg=1, sentences=LineSentence(in_path),
        size=256, window=5, min_count=3, workers=4, iter=40)
    model.wv.save_word2vec_format(out_path, binary=True)
def bin2pkl():
    """
    Convert ./Data/embedding/word2vec.bin into a pickled {word: vector}
    dict stored at ./Data/embedding/word2vec.pkl, and print that path.
    """
    model = KeyedVectors.load_word2vec_format(
        './Data/embedding/word2vec.bin', binary=True)
    word_dict = {word: model[word] for word in model.vocab}
    with open('./Data/embedding/word2vec.pkl', 'wb') as file_w:
        # Binary protocol 2, like the other pickles of this project; the
        # Python 2 default (protocol 0) is a much larger text format.
        pickle.dump(word_dict, file_w, protocol=2)
        print(file_w.name)
if __name__ == '__main__':
    # Run the two stages in order: train the vectors, then pickle them.
    for stage in (train, bin2pkl):
        stage()
gitextract__8lu9u9y/
├── README.md
└── code/
├── Data/
│ └── corpus/
│ └── corpus download link
├── README
├── TFNN/
│ ├── __init__.py
│ ├── activations.py
│ ├── layers/
│ │ ├── ConvolutionalLayer.py
│ │ ├── DenseLayer.py
│ │ ├── EmbeddingLayer.py
│ │ └── __init__.py
│ ├── objectives.py
│ └── utils/
│ ├── __init__.py
│ ├── data_util.py
│ ├── evaluate_util.py
│ ├── io_util.py
│ └── tensor_util.py
├── configurations.py
├── load_data.py
├── model_dc.py
├── prepare_data.py
└── train_w2v_model.py
SYMBOL INDEX (64 symbols across 13 files)
FILE: code/TFNN/activations.py
function get_activation (line 9) | def get_activation(activation=None):
FILE: code/TFNN/layers/ConvolutionalLayer.py
class Convolutional1D (line 14) | class Convolutional1D(object):
method __init__ (line 16) | def __init__(self, input_data, filter_length, nb_filter, strides=[1, 1...
method call (line 68) | def call(self):
method input_data (line 89) | def input_data(self):
method output (line 93) | def output(self):
method output_dim (line 97) | def output_dim(self):
method get_weights (line 100) | def get_weights(self):
FILE: code/TFNN/layers/DenseLayer.py
class SoftmaxDense (line 12) | class SoftmaxDense(object):
method __init__ (line 14) | def __init__(self, input_data, input_dim, output_dim, weights=None,
method call (line 46) | def call(self):
method loss (line 55) | def loss(self, y):
method get_pre_y (line 62) | def get_pre_y(self):
method input_data (line 69) | def input_data(self):
method input_dim (line 73) | def input_dim(self):
method output_dim (line 77) | def output_dim(self):
method name (line 81) | def name(self):
method weights (line 85) | def weights(self):
method biases (line 89) | def biases(self):
method output (line 93) | def output(self):
FILE: code/TFNN/layers/EmbeddingLayer.py
class Embedding (line 11) | class Embedding(object):
method __init__ (line 13) | def __init__(self, params, ids, name, keep_prob=1.0):
method params (line 26) | def params(self):
method output_dim (line 30) | def output_dim(self):
method output (line 34) | def output(self):
FILE: code/TFNN/objectives.py
function categorical_crossentropy (line 6) | def categorical_crossentropy(y_true, y_pred):
FILE: code/TFNN/utils/data_util.py
function flatten_list (line 9) | def flatten_list(nest_list):
function create_dictionary (line 26) | def create_dictionary(items, dic_path, start=0, sort=True,
function map_item2id (line 70) | def map_item2id(items, voc, max_len, none_word=0, lower=False):
function random_over_sampling (line 90) | def random_over_sampling():
function demo (line 108) | def demo():
FILE: code/TFNN/utils/evaluate_util.py
function sim_compute (line 8) | def sim_compute(pro_labels, right_labels, ignore_label=None):
function demo (line 43) | def demo():
FILE: code/TFNN/utils/io_util.py
function read_lines (line 7) | def read_lines(path):
function get_file_list (line 17) | def get_file_list(path, postfix, file_list):
FILE: code/TFNN/utils/tensor_util.py
function zero_nil_slot (line 6) | def zero_nil_slot(t, name=None):
function add_gradient_noise (line 23) | def add_gradient_noise(t, stddev=1e-3, name=None):
function mask_tensor (line 39) | def mask_tensor(input_data, lengths, maxlen, dtype=tf.float32):
FILE: code/load_data.py
function get_sentence_arr (line 14) | def get_sentence_arr(words_tags, word_voc, tag_voc):
function init_data (line 39) | def init_data(lines, word_voc, tag_voc, label_voc):
function load_embedding (line 72) | def load_embedding():
function load_voc (line 88) | def load_voc():
function load_train_data (line 105) | def load_train_data(word_voc, tag_voc, label_voc):
function load_test_data (line 118) | def load_test_data(word_voc, tag_voc, label_voc):
function demo (line 132) | def demo():
FILE: code/model_dc.py
class DCModel (line 19) | class DCModel(object):
method __init__ (line 21) | def __init__(self, max_len, word_weights, tag_weights, result_path=Non...
method fit (line 100) | def fit(self, sentences_train, tags_train, labels_train,
method predict (line 159) | def predict(self, data_sentences, data_tags, batch_size=50):
method evaluate (line 182) | def evaluate(self, data_sentences, data_tags, data_labels,
method clear_model (line 211) | def clear_model(self):
method get_best_score (line 215) | def get_best_score(self):
function predict (line 231) | def predict():
function init_result (line 295) | def init_result():
function merge (line 307) | def merge():
FILE: code/prepare_data.py
function init_voc (line 22) | def init_voc():
function init_word_embedding (line 57) | def init_word_embedding(path=None, overwrite=False):
function init_tag_embedding (line 84) | def init_tag_embedding(path, overwrite=False):
function init_embedding (line 103) | def init_embedding():
function demo (line 115) | def demo():
FILE: code/train_w2v_model.py
function get_sentence (line 11) | def get_sentence(sentence_tag):
function extract_sentece (line 19) | def extract_sentece():
function train (line 29) | def train():
function bin2pkl (line 41) | def bin2pkl():
Condensed preview — 20 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (43K chars).
[
{
"path": "README.md",
"chars": 328,
"preview": "# Document Classification\nThis code implements a simple CNN model for document classification with tensorflow.\n\n# Model "
},
{
"path": "code/Data/corpus/corpus download link",
"chars": 205,
"preview": "数据下载地址:http://competition.ai100.com.cn/html/game_det.html?id=24\n\n将分词/词性标注后的文件命名为training.seg.csv和testing.seg.csv,放置到当前目录"
},
{
"path": "code/README",
"chars": 373,
"preview": "下载数据到Data/corpus目录下,并做分词和词性标注处理,再按以下顺序执行:\n train_w2v_model.py -> prepare_data.py -> model_dc.py\n\n\n文件说明:\n\n1. 分词、词性标注采用"
},
{
"path": "code/TFNN/__init__.py",
"chars": 1,
"preview": "\n"
},
{
"path": "code/TFNN/activations.py",
"chars": 580,
"preview": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n\"\"\"\n 激活函数\n\"\"\"\nimport tensorflow as tf\n\n\ndef get_activation(activation=N"
},
{
"path": "code/TFNN/layers/ConvolutionalLayer.py",
"chars": 3726,
"preview": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n\"\"\"\nDNN Layers:\n Convolutional1D\n\n\"\"\"\nimport numpy as np\nimport tensorf"
},
{
"path": "code/TFNN/layers/DenseLayer.py",
"chars": 2685,
"preview": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n\"\"\"\nDNN Layers:\n SoftmaxLayer\n\n\"\"\"\nimport tensorflow as tf\nfrom ..activ"
},
{
"path": "code/TFNN/layers/EmbeddingLayer.py",
"chars": 772,
"preview": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n\"\"\"\nDNN Layers:\n Embedding\n\n\"\"\"\nimport tensorflow as tf\n\n\nclass Embeddi"
},
{
"path": "code/TFNN/layers/__init__.py",
"chars": 1,
"preview": "\n"
},
{
"path": "code/TFNN/objectives.py",
"chars": 459,
"preview": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\nimport tensorflow as tf\n\n\ndef categorical_crossentropy(y_true, y_pred):\n "
},
{
"path": "code/TFNN/utils/__init__.py",
"chars": 1,
"preview": "\n"
},
{
"path": "code/TFNN/utils/data_util.py",
"chars": 2766,
"preview": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\nimport os\nimport pickle\nimport numpy as np\nfrom collections import default"
},
{
"path": "code/TFNN/utils/evaluate_util.py",
"chars": 1941,
"preview": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\nimport numpy as np\nfrom collections import defaultdict\nimport codecs\n\n\ndef"
},
{
"path": "code/TFNN/utils/io_util.py",
"chars": 836,
"preview": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\nimport os\nimport codecs\n\n\ndef read_lines(path):\n lines = []\n with co"
},
{
"path": "code/TFNN/utils/tensor_util.py",
"chars": 1376,
"preview": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\nimport tensorflow as tf\n\n\ndef zero_nil_slot(t, name=None):\n \"\"\"\n Ove"
},
{
"path": "code/configurations.py",
"chars": 894,
"preview": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n\"\"\"\n configurations\n\"\"\"\nimport os\n\n\n# --- corpus ---\nTRAIN_PATH = './Da"
},
{
"path": "code/load_data.py",
"chars": 3861,
"preview": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n\"\"\"\n Load data.\n\"\"\"\nimport pickle\nfrom time import time\nimport numpy as"
},
{
"path": "code/model_dc.py",
"chars": 13161,
"preview": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\nimport os\nfrom time import time\nimport configurations as config\nimport ten"
},
{
"path": "code/prepare_data.py",
"chars": 3596,
"preview": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n\"\"\"\n prepare data.\n\n 生成:\n word voc\n position voc\n "
},
{
"path": "code/train_w2v_model.py",
"chars": 1518,
"preview": "#!/usr/bin/env python\n# coding=utf-8\nimport codecs\nimport pickle\nfrom gensim.models import Word2Vec\nfrom gensim.models.w"
}
]
About this extraction
This page contains the full source code of the liu-nlper/DocumentClassification GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 20 files (38.2 KB), approximately 10.8k tokens, and a symbol index with 64 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.