Repository: chqiwang/convseg Branch: master Commit: 1abf17001efa Files: 10 Total size: 56.2 KB Directory structure: gitextract_wd0xs651/ ├── LICENSE ├── README.md ├── cws.py ├── score.perl ├── server.py ├── tagger.py ├── test.py ├── train.py ├── train_cws.sh └── train_cws_wemb.sh ================================================ FILE CONTENTS ================================================ ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) [2017] [Wang Chunqi] Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # CONV-SEG Convolutional neural network for Chinese word segmentation (CWS). The corresponding paper: [**Convolutional Neural Network with Word Embeddings for Chinese Word Segmentation**](https://arxiv.org/pdf/1711.04411.pdf) ## Author Chunqi Wang ## Dependencies * [Python 2.7](https://www.python.org/) * [Tensorflow 1.0](https://www.tensorflow.org/) It is better to use a nvidia GPU to accelerate the training procedure. ## Data Downlaod `data.zip` from [here](https://drive.google.com/file/d/0B-f0oKMQIe6sQVNxeE9JeUJfQ0k/view?usp=sharing&resourcekey=0-pqrbXYjENu6AJxR6mq8zJQ) (Note that the SIGHAN datasets should only be used for research purposes). Extract `data.zip` to this directory. So the file tree would be: convseg | data | | datasets | | | sighan2005-pku | | | | train.txt | | | | dev.txt | | | | test.txt | | | sighan2005-msr | | | | train.txt | | | | dev.txt | | | | test.txt | | embeddings | | | news_tensite.w2v200 | | | news_tensite.pku.words.w2v50 | | | news_tensite.msr.words.w2v50 | tagger.py | train_cws.py | train_cws.sh | train_cws_wemb.sh | score.perl | README.md ## How to use First, give execute permission to scripts: chmod +x train_cws.sh train_cws_wemb.sh Train a preliminary model (CONV-SEG): ./train_cws.sh WHICH_DATASET WHICH_GPU Train a model with word embeddings (WE-CONV-SEG): ./train_cws_wemb.sh WHICH_DATASET WHICH_GPU We have two optional datasets: `pku` and `msr`. If you run the program in CPU environment, just leave the second argument empty. For example, if you want to train the model CONV-SEG on the pku dataset and on gpu0, you should run: ./train_cws.sh pku 0 More arguments can be set in `train.py`. ## Test Score | Model | PKU(dev) | PKU(test) | MSR(dev) | MSR(test) | |:------|:---------|:----------|:---------|:----------| | CONV-SEG | 96.8 | 95.7 | 97.2 | 97.3 | | WE-CONV-SEG | 97.5 | 96.5 | 98.1 | 98.0 | ================================================ FILE: cws.py ================================================ from __future__ import print_function import os, codecs from itertools import izip from tagger import data_iterator scope = 'CWS' def process_train_sentence(sentence, bigram, word_window): sentence = sentence.strip() words = sentence.split() chars = [] tags = [] ret = [] for w in words: chars.extend(list(w)) if len(w) == 1: tags.append('S') else: tags.extend(['B'] + ['M'] * (len(w) - 2) + ['E']) ret.append(chars) if bigram: chars = ['', ''] + chars + ['', ''] ret.append([a + b if a and b else '' for a, b in zip(chars[:-4], chars[1:])]) ret.append([a + b if a and b else '' for a, b in zip(chars[1:-3], chars[2:])]) ret.append([a + b if a and b else '' for a, b in zip(chars[2:-2], chars[3:])]) ret.append([a + b if a and b else '' for a, b in zip(chars[3:-1], chars[4:])]) elif word_window > 0: chars = ['', '', ''] + chars + ['', '', ''] # single char if word_window >= 1: ret.append(chars[3:-3]) if word_window >= 2: # bi chars ret.append([a + b if a and b else '' for a, b in zip(chars[2:], chars[3:-3])]) ret.append([a + b if a and b else '' for a, b in zip(chars[3:-3], chars[4:])]) if word_window >= 3: # tri chars ret.append( [a + b + c if a and b and c else '' for a, b, c in zip(chars[1:], chars[2:], chars[3:-3])]) ret.append( [a + b + c if a and b and c else '' for a, b, c in zip(chars[2:], chars[3:-3], chars[4:])]) ret.append( [a + b + c if a and b and c else '' for a, b, c in zip(chars[3:-3], chars[4:], chars[5:])]) if word_window >= 4: # four chars ret.append([a + b + c + d if a and b and c and d else '' for a, b, c, d in zip(chars[0:], chars[1:], chars[2:], chars[3:-3])]) ret.append([a + b + c + d if a and b and c and d else '' for a, b, c, d in zip(chars[1:], chars[2:], chars[3:-3], chars[4:])]) ret.append([a + b + c + d if a and b and c and d else '' for a, b, c, d in zip(chars[2:], chars[3:-3], chars[4:], chars[5:])]) ret.append([a + b + c + d if a and b and c and d else '' for a, b, c, d in zip(chars[3:-3], chars[4:], chars[5:], chars[6:])]) ret.append(tags) return ret def process_raw_sentence(sentence, bigram, word_window): sentence = sentence.strip() chars = list(sentence) ret = [chars] if bigram: chars = ['', ''] + chars + ['', ''] ret.append([a + b if a and b else '' for a, b in zip(chars[:-4], chars[1:])]) ret.append([a + b if a and b else '' for a, b in zip(chars[1:-3], chars[2:])]) ret.append([a + b if a and b else '' for a, b in zip(chars[2:-2], chars[3:])]) ret.append([a + b if a and b else '' for a, b in zip(chars[3:-1], chars[4:])]) elif word_window > 0: chars = ['', '', ''] + chars + ['', '', ''] # single char if word_window >= 1: ret.append(chars[3:-3]) if word_window >= 2: # bi chars ret.append([a + b if a and b else '' for a, b in zip(chars[2:], chars[3:-3])]) ret.append([a + b if a and b else '' for a, b in zip(chars[3:-3], chars[4:])]) if word_window >= 3: # tri chars ret.append( [a + b + c if a and b and c else '' for a, b, c in zip(chars[1:], chars[2:], chars[3:-3])]) ret.append( [a + b + c if a and b and c else '' for a, b, c in zip(chars[2:], chars[3:-3], chars[4:])]) ret.append( [a + b + c if a and b and c else '' for a, b, c in zip(chars[3:-3], chars[4:], chars[5:])]) if word_window >= 4: # four chars ret.append([a + b + c + d if a and b and c and d else '' for a, b, c, d in zip(chars[0:], chars[1:], chars[2:], chars[3:-3])]) ret.append([a + b + c + d if a and b and c and d else '' for a, b, c, d in zip(chars[1:], chars[2:], chars[3:-3], chars[4:])]) ret.append([a + b + c + d if a and b and c and d else '' for a, b, c, d in zip(chars[2:], chars[3:-3], chars[4:], chars[5:])]) ret.append([a + b + c + d if a and b and c and d else '' for a, b, c, d in zip(chars[3:-3], chars[4:], chars[5:], chars[6:])]) return ret def read_train_file(fin, bigram=False, word_window=4): """ Read training data. """ data = [] for l in fin: data.append(process_train_sentence(l, bigram, word_window)) return zip(*data) def read_raw_file(fin, batch_size, bigram=False, word_window=4): """ Read raw data. """ buffer = [] max_buffer = 100000 for i, l in enumerate(fin): buffer.append(process_raw_sentence(l, bigram, word_window)) if i % max_buffer == 0 and i > 0: for b in data_iterator(zip(*buffer), batch_size, shuffle=False): yield b buffer = [] if buffer: for b in data_iterator(zip(*buffer), batch_size, shuffle=False): yield b def read_raw_file_all(fin, bigram=False, word_window=4): """ Read raw data. """ data = [] for b in read_raw_file(fin, 1000, bigram, word_window): data.extend(zip(*b)) return zip(*data) def create_output(seqs, stags): """ Create final output from characters and BMES tags. """ output = [] for seq, stag in izip(seqs, stags): new_sen = [] for c, tag in izip(seq, stag): new_sen.append(c) if tag == 'S' or tag == 'E': new_sen.append(' ') output.append(''.join(new_sen)) return output def evaluator(data, output_dir, output_flag): """ Evaluate presion, recall and F1. """ seqs, gold_stags, pred_stags = data assert len(seqs) == len(gold_stags) == len(pred_stags) # Create and open temp files. if not os.path.exists(output_dir): os.makedirs(output_dir) ref_path = os.path.join(output_dir, '%s.ref' % output_flag) pred_path = os.path.join(output_dir, '%s.pred' % output_flag) score_path = os.path.join(output_dir, '%s.score' % output_flag) # Empty words file. temp_path = os.path.join(output_dir, '%s.temp' % output_flag) ref_file = codecs.open(ref_path, 'w', 'utf8') pred_file = codecs.open(pred_path, 'w', 'utf8') for l in create_output(seqs, gold_stags): print(l, file=ref_file) for i, l in enumerate(create_output(seqs, pred_stags)): print(l, file=pred_file) ref_file.close() pred_file.close() os.system('echo > %s' % temp_path) os.system('%s %s %s %s > %s' % ('./score.perl', temp_path, ref_path, pred_path, score_path)) # Sighan evaluation results os.system('tail -n 7 %s > %s' % (score_path, temp_path)) eval_lines = [l.rstrip() for l in codecs.open(temp_path, 'r', 'utf8')] # Remove temp files. os.remove(ref_path) os.remove(pred_path) os.remove(score_path) os.remove(temp_path) # Precision, Recall and F1 score return (float(eval_lines[1].split(':')[1]), float(eval_lines[0].split(':')[1]), float(eval_lines[2].split(':')[1])) ================================================ FILE: score.perl ================================================ #!/usr/bin/perl -w ########################################################################### # # # SIGHAN # # Copyright (c) 2003,2005 # # All Rights Reserved. # # # # Permission is hereby granted, free of charge, to use and distribute # # this software and its documentation without restriction, including # # without limitation the rights to use, copy, modify, merge, publish, # # distribute, sublicense, and/or sell copies of this work, and to # # permit persons to whom this work is furnished to do so, subject to # # the following conditions: # # 1. The code must retain the above copyright notice, this list of # # conditions and the following disclaimer. # # 2. Any modifications must be clearly marked as such. # # 3. Original authors' names are not deleted. # # 4. The authors' names are not used to endorse or promote products # # derived from this software without specific prior written # # permission. # # # # SIGHAN AND THE CONTRIBUTORS TO THIS WORK DISCLAIM ALL WARRANTIES # # WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF # # MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SIGHAN NOR THE # # CONTRIBUTORS BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL # # DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA # # OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER # # TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR # # PERFORMANCE OF THIS SOFTWARE. # # # ########################################################################### # # # Author: Richard Sproat (rws@uiuc.edu) # # Tom Emerson (tree@basistech.com) # # # ########################################################################### ## This code depends upon a version of diff (e.g. GNU diffutils 2.7.2) ## that supports the -y flag: ## ## -y Use the side by side output format. ## ## change the following per your installation: $diff = "/usr/bin/diff"; $USAGE = "Usage:\t$0 dictionary truth test\n\t"; if (@ARGV != 3) {print "$USAGE\n"; exit;} $tmp1 = "/tmp/comp01$$"; $tmp2 = "/tmp/comp02$$"; %dict = (); open (S, $ARGV[0]) or die "$ARGV[0]: $!\n"; while () { chop; s/^\s*//; s/\s*$//; $dict{$_} = 1; } close(S); open (TRUTH, $ARGV[1]) or die "$ARGV[1]: $!\n"; open (TEST, $ARGV[2]) or die "$ARGV[2]: $!\n"; $Tot = $Del = $Ins = $Subst = $Truecount = $Testcount = 0; $RawRecall = $RawPrecision = 0; $linenum = 0; $IVMISSED = $OOVMISSED = $OOV = $IV = 0; $file1 = $ARGV[1]; $file2 = $ARGV[2]; $file1 =~ s=^/.*/==; $file2 =~ s=^/.*/==; while (defined($truth = ) && defined($test = )) { $truth =~ s/^\s*//; $test =~ s/^\s*//; $truth =~ s/\s*$//; $test =~ s/\s*$//; $truth =~ s/(\xe3\x80\x80)|(\xa1\x40)/ /g; $test =~ s/(\xe3\x80\x80)|(\xa1\x40)/ /g; $truth =~ s/ //g; $test =~ s/ //g; @truthwords = split /\s+/, $truth; @testwords = split /\s+/, $test; $truecount = scalar(@truthwords); $testcount = scalar(@testwords); ++$linenum; if ($truecount == 0) { if ($testcount > 0) { print STDERR "Warning: training is 0 but test is nonzero, possible misalignment at line $linenum.\n"; } next; } if ($testcount == 0) { print STDERR "Warning: No output in test data where there is in training data, line $linenum\n"; } open (T1, ">$tmp1") or die "Can't open $tmp1"; open (T2, ">$tmp2") or die "Can't open $tmp2"; foreach my $w (@truthwords) { print T1 "$w\n"; } foreach my $w (@testwords) {print T2 "$w\n";} close (T1); close (T2); open (P, "$diff -y $tmp1 $tmp2 |") or die "Can't open pipe.\n"; print "--$file1-------$file2----$linenum\n"; my $del = 0; my $ins = 0; my $subst = 0; my $rawrecall = 0; my $rawprecision = 0; while (

) { my $err = 0; if (/\s\|\s/) {$subst++ ; $err++; } elsif (/\s\>\s/) {$ins++ ; $err++; } elsif (/\s\<\s/) {$del++ ; $err++; } if (/^([^\s]+)\s/) { my $w = $1; if (!$dict{$w}) {++$OOV;} else {++$IV;} if (/^[^\s]+\s.*\s[\|\>\<]\s/) { if (!$dict{$w}) {++$OOVMISSED;} else {++$IVMISSED;} ++$rawrecall; } } if (/\s[\|\>\<]\s.*[^\s]$/) { ++$rawprecision; } print "$_"; } close (P); my $tot = $del + $ins + $subst; $Tot += $tot; $Del += $del; $Ins += $ins; $Subst += $subst; $Truecount += $truecount; $Testcount += $testcount; $rawrecall = $truecount - $rawrecall; $rawprecision = $testcount - $rawprecision; $RawRecall += $rawrecall; $RawPrecision += $rawprecision; $rawrecall = sprintf("%2.3f", $rawrecall/$truecount); $rawprecision = sprintf("%2.3f", $rawprecision/$testcount); print "INSERTIONS:\t$ins\n"; print "DELETIONS:\t$del\n"; print "SUBSTITUTIONS:\t$subst\n"; print "NCHANGE:\t$tot\n"; print "NTRUTH:\t$truecount\n"; print "NTEST:\t$testcount\n"; print "TRUE WORDS RECALL:\t$rawrecall\n"; print "TEST WORDS PRECISION:\t$rawprecision\n"; } close(TRUTH); close(TEST); unlink($tmp1); unlink($tmp2); print "=== SUMMARY:\n"; print "=== TOTAL INSERTIONS:\t$Ins\n"; print "=== TOTAL DELETIONS:\t$Del\n"; print "=== TOTAL SUBSTITUTIONS:\t$Subst\n"; print "=== TOTAL NCHANGE:\t$Tot\n"; print "=== TOTAL TRUE WORD COUNT:\t$Truecount\n"; print "=== TOTAL TEST WORD COUNT:\t$Testcount\n"; $RawRecall = $RawRecall/$Truecount; $RawPrecision = $RawPrecision/$Testcount; $beta = 1; $R = $RawRecall; $P = $RawPrecision; if ($R != 0. || $P != 0.) { $F = (1 + $beta)*$P*$R/($beta * $P + $R); } else{ $F=0.; } $F = sprintf("%2.3f", $F); $RawRecall = sprintf("%2.3f", $RawRecall); $RawPrecision = sprintf("%2.3f", $RawPrecision); print "=== TOTAL TRUE WORDS RECALL:\t$RawRecall\n"; print "=== TOTAL TEST WORDS PRECISION:\t$RawPrecision\n"; print "=== F MEASURE:\t$F\n"; if ($OOV > 0) { $OOVMISSED = sprintf("%2.3f", 1 - $OOVMISSED / $OOV); } else { $OOVMISSED = "--"; } $OOV = sprintf("%2.3f", $OOV / $Truecount); if ($IV > 0) { $IVMISSED = sprintf("%2.3f", 1 - $IVMISSED / $IV); } else { $IVMISSED = "--"; } print "=== OOV Rate:\t$OOV\n"; print "=== OOV Recall Rate:\t$OOVMISSED\n"; print "=== IV Recall Rate:\t$IVMISSED\n"; print "###\t$file2\t$Ins\t$Del\t$Subst\t$Tot\t$Truecount\t$Testcount\t$RawRecall\t$RawPrecision\t$F\t$OOV\t$OOVMISSED\t$IVMISSED\n"; exit(0); ================================================ FILE: server.py ================================================ # -*- coding:utf-8 -* from __future__ import print_function import StringIO from argparse import ArgumentParser import tornado.ioloop import tornado.web import tensorflow as tf from tagger import Model def load_template(): page = '''



This is a %s demo provided by Chunqi Wang (chqiwang@126.com). ''' % (TASK.scope, TASK.scope, TASK.scope) return page class Tagger(object): def __init__(self, sess, model_dir, scope, batch_size): self.model = Model(scope, sess) self.batch_size = batch_size self.model.load_model(model_dir) def tag(self, sentences): sentences = self.preprocess(sentences) sf = StringIO.StringIO() sf.write(sentences) sf.seek(0) data = TASK.read_raw_file_all(sf) output = self.model.tag_all(data, self.batch_size) return TASK.create_output(*output) def preprocess(self, sentences): return sentences class MainHandler(tornado.web.RequestHandler): def get(self): self.write(load_template()) class TaskHandler(tornado.web.RequestHandler): def initialize(self, tagger): self.tagger = tagger def post(self): sentences = self.get_argument('sentences') segs = self.tagger.tag(sentences) self.write('\n'.join(segs)) def make_app(model_dir): config = tf.ConfigProto() config.gpu_options.allow_growth = True config.gpu_options.per_process_gpu_memory_fraction = 1.0 config.allow_soft_placement = True config.log_device_placement = True sess = tf.Session(config=config) tagger = Tagger(sess=sess, model_dir=model_dir, scope=TASK.scope, batch_size=200) return tornado.web.Application([ (r"/", MainHandler), (r"/%s" % TASK.scope, TaskHandler, {'tagger': tagger}) ]) if __name__ == "__main__": parser = ArgumentParser() parser.add_argument('--task', dest='task') parser.add_argument('--model_dir', dest='model_dir') parser.add_argument('--port', dest='port', type=int, default=8888) args = parser.parse_args() TASK = __import__(args.task) app = make_app(args.model_dir) app.listen(args.port) tornado.ioloop.IOLoop.current().start() ================================================ FILE: tagger.py ================================================ from __future__ import print_function import tensorflow as tf import tensorflow.contrib.layers as layers import tensorflow.contrib.crf as crf import time import codecs import os import cPickle as pickle import numpy as np from itertools import izip INT_TYPE = np.int32 FLOAT_TYPE = np.float32 ################################################################################ # Model # ################################################################################ class Model(object): def __init__(self, scope, sess): self.scope = scope self.sess = sess def build_input_graph(self, vocab_size, emb_size, word_vocab_size, word_emb_size, word_window_size): """ Gather embeddings from lookup tables. """ seq_ids = tf.placeholder(dtype=INT_TYPE, shape=[None, None], name='seq_ids') seq_word_ids = [tf.placeholder(dtype=INT_TYPE, shape=[None, None], name='seq_feature_%d_ids' % i) for i in range(word_window_size)] embeddings = tf.get_variable('embeddings', [vocab_size, emb_size]) embedding_output = tf.nn.embedding_lookup([embeddings], seq_ids) word_outputs = [] word_embeddings = tf.get_variable('word_embeddings', [word_vocab_size, word_emb_size]) for i in range(word_window_size): word_outputs.append(tf.nn.embedding_lookup([word_embeddings], seq_word_ids[i])) return seq_ids, seq_word_ids, tf.concat([embedding_output] + word_outputs, 2, 'inputs') def build_tagging_graph(self, inputs, hidden_layers, channels, num_tags, use_crf, lamd, dropout_emb, dropout_hidden, kernel_size, use_bn, use_wn, active_type): """ Build a deep neural model for sequence tagging. """ stag_ids = tf.placeholder(dtype=INT_TYPE, shape=[None, None], name='stag_ids') seq_lengths = tf.placeholder(dtype=INT_TYPE, shape=[None], name='seq_lengths') # Default is not train. is_train = tf.placeholder(dtype=tf.bool, shape=[], name='is_train') masks = tf.cast(tf.sequence_mask(seq_lengths), FLOAT_TYPE) # Dropout on embedding output. if dropout_emb: inputs = tf.cond(is_train, lambda: tf.nn.dropout(inputs, 1 - dropout_emb), lambda: inputs) hidden_output = inputs pre_channels = inputs.get_shape()[-1].value for i in xrange(hidden_layers): k = kernel_size cur_channels = channels[i] filter_w = tf.get_variable('filter_w_%d' % i, shape=[k, pre_channels, cur_channels], dtype=FLOAT_TYPE) filter_v = tf.get_variable('filter_v_%d' % i, shape=[k, pre_channels, cur_channels], dtype=FLOAT_TYPE) bias_b = tf.get_variable('bias_b_%d' % i, shape=[cur_channels], initializer=tf.zeros_initializer(dtype=FLOAT_TYPE)) bias_c = tf.get_variable('bias_c_%d' % i, shape=[cur_channels], initializer=tf.zeros_initializer(dtype=FLOAT_TYPE)) # Weight normalization. if use_wn: epsilon = 1e-12 g_w = tf.get_variable('g_w_%d' % i, shape=[k, 1, cur_channels], dtype=FLOAT_TYPE) g_v = tf.get_variable('g_v_%d' % i, shape=[k, 1, cur_channels], dtype=FLOAT_TYPE) # Perform wn filter_w = g_w * filter_w / (tf.sqrt(tf.reduce_sum(filter_w ** 2, 1, keep_dims=True)) + epsilon) filter_v = g_v * filter_v / (tf.sqrt(tf.reduce_sum(filter_v ** 2, 1, keep_dims=True)) + epsilon) w = tf.nn.conv1d(hidden_output, filter_w, 1, 'SAME') + bias_b v = tf.nn.conv1d(hidden_output, filter_v, 1, 'SAME') + bias_c if use_bn: w = layers.batch_norm(inputs=v, decay=0.9, is_training=is_train, center=True, scale=True, scope='BatchNorm_w_%d' % i) v = layers.batch_norm(inputs=w, decay=0.9, is_training=is_train, center=True, scale=True, scope='BatchNorm_v_%d' % i) if active_type == 'glu': hidden_output = w * tf.nn.sigmoid(v) elif active_type == 'relu': hidden_output = tf.nn.relu(w) elif active_type == 'gtu': hidden_output = tf.tanh(w) * tf.nn.sigmoid(v) elif active_type == 'tanh': hidden_output = tf.tanh(w) elif active_type == 'linear': hidden_output = w elif active_type == 'bilinear': hidden_output = w * v # Mask paddings. hidden_output = hidden_output * tf.expand_dims(masks, -1) # Dropout on hidden output. if dropout_hidden: hidden_output = tf.cond(is_train, lambda: tf.nn.dropout(hidden_output, 1 - dropout_hidden), lambda: hidden_output ) pre_channels = cur_channels # Un-scaled log probabilities. scores = layers.fully_connected(hidden_output, num_tags, tf.identity) if use_crf: cost, transitions = crf.crf_log_likelihood(inputs=scores, tag_indices=stag_ids, sequence_lengths=seq_lengths) cost = - tf.reduce_mean(cost) else: reshaped_scores = tf.reshape(scores, [-1, num_tags]) reshaped_stag_ids = tf.reshape(stag_ids, [-1]) real_distribution = layers.one_hot_encoding(reshaped_stag_ids, num_tags) cost = tf.nn.softmax_cross_entropy_with_logits(reshaped_scores, real_distribution) cost = tf.reduce_sum(tf.reshape(cost, tf.shape(stag_ids)) * masks) / tf.cast(tf.shape(inputs)[0], FLOAT_TYPE) # Calculate L2 penalty. l2_penalty = 0 if lamd > 0: for v in tf.trainable_variables(): if '/B:' not in v.name and '/biases:' not in v.name: l2_penalty += lamd * tf.nn.l2_loss(v) train_cost = cost + l2_penalty # Summary cost. tf.summary.scalar('cost', cost) summaries = tf.summary.merge_all() update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) if update_ops: updates = tf.group(*update_ops) with tf.control_dependencies([updates]): cost = tf.identity(cost) return stag_ids, seq_lengths, is_train, cost, train_cost, scores, summaries def build_graph(self): parameters = self.parameters with tf.variable_scope(name_or_scope=self.scope, initializer=tf.uniform_unit_scaling_initializer()): seq_ids_pl, seq_other_ids_pls, inputs = self.build_input_graph(vocab_size=parameters['vocab_size'], emb_size=parameters['emb_size'], word_window_size=parameters['word_window_size'], word_vocab_size=parameters['word_vocab_size'], word_emb_size=parameters['word_emb_size']) stag_ids_pl, seq_lengths_pl, is_train_pl, cost_op, train_cost_op, scores_op, summary_op = \ self.build_tagging_graph(inputs=inputs, num_tags=parameters['num_tags'], use_crf=parameters['use_crf'], lamd=parameters['lamd'], dropout_emb=parameters['dropout_emb'], dropout_hidden=parameters['dropout_hidden'], hidden_layers=parameters['hidden_layers'], channels=parameters['channels'], kernel_size=parameters['kernel_size'], use_bn=parameters['use_bn'], use_wn=parameters['use_wn'], active_type=parameters['active_type']) self.seq_ids_pl = seq_ids_pl self.seq_other_ids_pls = seq_other_ids_pls self.stag_ids_pl = stag_ids_pl self.seq_lengths_pl = seq_lengths_pl self.is_train_pl = is_train_pl self.cost_op = cost_op self.train_cost_op = train_cost_op self.scores_op = scores_op self.summary_op = summary_op def inference(self, scores, sequence_lengths=None): """ Inference label sequence given scores. If transitions is given, then perform veterbi search, else perform greedy search. Args: scores: A numpy array with shape (batch, max_length, num_tags). sequence_lengths: A numpy array with shape (batch,). Returns: A numpy array with shape (batch, max_length). """ if not self.parameters['use_crf']: return np.argmax(scores, 2) else: with tf.variable_scope(self.scope, reuse=True): transitions = tf.get_variable('transitions').eval(session=self.sess) paths = np.zeros(scores.shape[:2], dtype=INT_TYPE) for i in xrange(scores.shape[0]): tag_score, length = scores[i], sequence_lengths[i] if length == 0: continue path, _ = crf.viterbi_decode(tag_score[:length], transitions) paths[i, :length] = path return paths def train(self, train_data, dev_data, test_data, model_dir, log_dir, emb_size, word_emb_size, optimizer, hidden_layers, channels, kernel_size, active_type, use_bn, use_wn, use_crf, lamd, dropout_emb, dropout_hidden, evaluator, batch_size, eval_batch_size, pre_trained_emb_path, fix_word_emb, reserve_all_word_emb, pre_trained_word_emb_path, max_epoches, print_freq): """ This function is the main function for preparing data and training the model. """ assert len(channels) == hidden_layers # Parse optimization method and parameters. optimizer = optimizer.split('_') optimizer_name = optimizer[0] optimizer_options = [eval(i) for i in optimizer[1:]] optimizer = { 'sgd': tf.train.GradientDescentOptimizer, 'adadelta': tf.train.AdadeltaOptimizer, 'adam': tf.train.AdamOptimizer, 'mom': tf.train.MomentumOptimizer }[optimizer_name](*optimizer_options) print('Preparing data...', end='') if not os.path.exists(model_dir): os.makedirs(model_dir) mappings_path = os.path.join(model_dir, 'mappings.pkl') parameters_path = os.path.join(model_dir, 'parameters.pkl') # Load character embeddings. pre_trained = {} if pre_trained_emb_path and os.path.isfile(pre_trained_emb_path): for l in codecs.open(pre_trained_emb_path, 'r', 'utf8'): we = l.split() if len(we) == emb_size + 1: w, e = we[0], np.array(map(float, we[1:])) pre_trained[w] = e # Load word embeddings. pre_trained_word = {} if pre_trained_word_emb_path and os.path.isfile(pre_trained_word_emb_path): for l in codecs.open(pre_trained_word_emb_path, 'r', 'utf8', 'ignore'): we = l.split() if len(we) == word_emb_size + 1: w, e = we[0], np.array(map(float, we[1:])) pre_trained_word[w] = e # Load or create mappings. if os.path.isfile(mappings_path): item2id, id2item, tag2id, id2tag, word2id, id2word = pickle.load(open(mappings_path, 'r')) else: item2id, id2item = create_mapping(create_dic(train_data[0], add_unk=True, add_pad=True)) tag2id, id2tag = create_mapping(create_dic(train_data[-1])) words = [] for t in train_data[1:-1]: words.extend(t) for t in dev_data[1:-1]: words.extend(t) for t in test_data[1:-1]: words.extend(t) word_dic = create_dic(words, add_unk=True, add_pad=True) for k in word_dic.keys(): if k not in pre_trained_word and k != '' and k != '': word_dic.pop(k) if reserve_all_word_emb: for w in pre_trained_word: if w not in word_dic: word_dic[w] = 0 word2id, id2word = create_mapping(word_dic) # Save the mappings to disk. pickle.dump((item2id, id2item, tag2id, id2tag, word2id, id2word), open(mappings_path, 'w')) # Hyper parameters. word_window_size = len(train_data) - 2 parameters = { 'vocab_size': len(item2id), 'emb_size': emb_size, 'word_window_size': word_window_size, 'word_vocab_size': len(word2id), 'word_emb_size': word_emb_size, 'hidden_layers': hidden_layers, 'channels': channels, 'kernel_size': kernel_size, 'use_bn': use_bn, 'use_wn': use_wn, 'num_tags': len(tag2id), 'use_crf': use_crf, 'lamd': lamd, 'dropout_emb': dropout_emb, 'dropout_hidden': dropout_hidden, 'active_type': active_type } if os.path.isfile(parameters_path): parameters_old = pickle.load(open(parameters_path, 'r')) if parameters != parameters_old: raise Exception('Network parameters are not consistent!') else: pickle.dump(parameters, open(parameters_path, 'w')) self.item2id = item2id self.id2item = id2item self.tag2id = tag2id self.id2tag = id2tag self.word2id = word2id self.id2word = id2word self.parameters = parameters # Convert data to corresponding ids. train_data_ids = data_to_ids( train_data, [item2id] + [word2id] * word_window_size + [tag2id] ) print('Finished.') print("Start building the network...", end='') self.build_graph() print('Finished.') def summary(name, dtype=FLOAT_TYPE): value = tf.placeholder(dtype, shape=[]) return value, tf.summary.scalar(name, value) dev_f1_pl, dev_summary_op = summary('dev f1') test_f1_pl, test_summary_op = summary('test f1') # Clip gradients and apply. grads_and_vars = optimizer.compute_gradients(loss=self.train_cost_op, var_list=tf.trainable_variables()) grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None] # If use fixed word embeddings, remove the grad if fix_word_emb: grads_and_vars = [(g, v) for g, v in grads_and_vars if '/word_embeddings' not in v.name] grads_summary_op = tf.summary.histogram('grads', tf.concat([tf.reshape(g, [-1]) for g, _ in grads_and_vars], 0)) grads_norm = tf.sqrt(sum([tf.reduce_sum(tf.pow(g, 2)) for g, _ in grads_and_vars])) grads_and_vars = [(g / (tf.reduce_max([grads_norm, 5]) / 5), v) for g, v in grads_and_vars] train_op = optimizer.apply_gradients(grads_and_vars) # Variables for recording training procedure. best_epoch = tf.get_variable('best_epoch', shape=[], initializer=tf.zeros_initializer(), trainable=False, dtype=INT_TYPE) best_step = tf.get_variable('best_step', shape=[], initializer=tf.zeros_initializer(), trainable=False, dtype=INT_TYPE) best_dev_score = tf.get_variable('best_dev_score', shape=[], initializer=tf.zeros_initializer(), trainable=False, dtype=FLOAT_TYPE) best_test_score = tf.get_variable('best_test_score', shape=[], initializer=tf.zeros_initializer(), trainable=False, dtype=FLOAT_TYPE) init_op = tf.global_variables_initializer() saver = tf.train.Saver(tf.global_variables()) summary_writer = tf.summary.FileWriter(log_dir + '/summaries') print('Finished.') print('Start training the network...') self.sess.run(init_op) start_time_begin = time.time() try: checkpoint = tf.train.latest_checkpoint(model_dir) saver.restore(self.sess, checkpoint) print('Restore model from %s.' % checkpoint) except (tf.errors.DataLossError, TypeError, Exception): # Failed to restore model from disk. Load pre-trained embeddings. # Load character embeddings. with tf.variable_scope(self.scope, reuse=True): embeddings = tf.get_variable('embeddings') value = self.sess.run(embeddings) count = 0 for item in item2id: item_id = item2id[item] if item in pre_trained: value[item_id] = pre_trained[item] count += 1 # Run assign op. self.sess.run(embeddings.assign(value)) del (pre_trained) print('%d of %d character embeddings were loaded from pre-trained.' % (count, len(item2id))) # Load word embeddings. with tf.variable_scope(self.scope, reuse=True): word_embeddings = tf.get_variable('word_embeddings') value = self.sess.run(word_embeddings) count = 0 for item in word2id: item_id = word2id[item] if item in pre_trained_word: value[item_id] = pre_trained_word[item] count += 1 # Run assign op. self.sess.run(word_embeddings.assign(value)) del (pre_trained_word) print('%d of %d word embeddings were loaded from pre-trained.' % (count, len(word2id))) start_epoch, global_step, best_dev_f1 = self.sess.run((best_epoch, best_step, best_dev_score)) for epoch in range(start_epoch + 1, max_epoches + 1): print('Starting epoch %d...' % epoch) start_time = time.time() loss_ep = 0 n_step = 0 iterator = data_iterator(train_data_ids, batch_size, shuffle=True) for batch in iterator: batch = create_input(batch) seq_ids, seq_other_ids_list, stag_ids, seq_lengths = batch[0], batch[1: -2], batch[-2], batch[-1] feed_dict = {self.seq_ids_pl: seq_ids.astype(INT_TYPE), self.stag_ids_pl: stag_ids.astype(INT_TYPE), self.seq_lengths_pl: seq_lengths.astype(INT_TYPE), self.is_train_pl: True} assert len(self.seq_other_ids_pls) == len(seq_other_ids_list) for pl, v in zip(self.seq_other_ids_pls, seq_other_ids_list): feed_dict[pl] = v # feed_dict.update(drop_feed_dict) # enable noise input loss, summaries, grads_summaries, _ = self.sess.run( [self.cost_op, self.summary_op, grads_summary_op, train_op], feed_dict=feed_dict) loss_ep += loss n_step += 1 global_step += 1 summary_writer.add_summary(summaries, global_step) summary_writer.add_summary(grads_summaries, global_step) # Show training information. if global_step % print_freq == 0: print(' Step %d, current cost %.6f, average cost %.6f' % (global_step, loss, loss_ep / n_step)) loss_ep = loss_ep / n_step print('Epoch %d finished. Time: %ds Cost: %.6f' % (epoch, time.time() - start_time, loss_ep)) # Evaluate precision, recall and f1 with an external script. dev_pre, dev_rec, dev_f1 = \ evaluator((dev_data[0], dev_data[-1], self.tag_all(dev_data[:-1], eval_batch_size)[1]), log_dir + '/dev', epoch) test_pre, test_rec, test_f1 = \ evaluator((test_data[0], test_data[-1], self.tag_all(test_data[:-1], eval_batch_size)[1]), log_dir + '/test', epoch) # Summary dev and test F1 score. summary_writer.add_summary(self.sess.run(dev_summary_op, {dev_f1_pl: dev_f1}), epoch) summary_writer.add_summary(self.sess.run(test_summary_op, {test_f1_pl: test_f1}), epoch) print("Dev precision / recall / f1 score: %.2f / %.2f / %.2f" % (dev_pre * 100, dev_rec * 100, dev_f1 * 100)) print("Test precision / recall / f1 score: %.2f / %.2f / %.2f" % (test_pre * 100, test_rec * 100, test_f1 * 100)) if dev_f1 > best_dev_f1: best_dev_f1 = dev_f1 self.sess.run((tf.assign(best_epoch, epoch), tf.assign(best_dev_score, dev_f1), tf.assign(best_test_score, test_f1), tf.assign(best_step, global_step))) path = saver.save(self.sess, model_dir + '/model', epoch) print('New best score on dev.') print('Save model at %s.' % path) print('Finished.') print('Total training time: %fs.' % (time.time() - start_time_begin)) def load_model(self, model_dir): mappings_path = os.path.join(model_dir, 'mappings.pkl') parameters_path = os.path.join(model_dir, 'parameters.pkl') item2id, id2item, tag2id, id2tag, word2id, id2word = \ pickle.load(open(mappings_path, 'r')) parameters = pickle.load(open(parameters_path)) self.item2id = item2id self.id2item = id2item self.tag2id = tag2id self.id2tag = id2tag self.word2id = word2id self.id2word = id2word self.parameters = parameters print(parameters) print('Building input graph...', end='') self.build_graph() print('Finished.') print('Initializing variables...', end='') init_op = tf.initialize_all_variables() self.sess.run(init_op) print('Finished.') print('Reloading parameters...', end='') saver = tf.train.Saver(tf.global_variables()) checkpoint = tf.train.latest_checkpoint(model_dir) saver.restore(self.sess, checkpoint) print('Finished.') def tag(self, data_iter): """A tagging function. Args: data_iter: A iterator for generate batches. Returns: A generator for tagging result. """ output = [] for data in data_iter: batch = data_to_ids(data, [self.item2id] + [self.word2id] * self.parameters['word_window_size']) batch = create_input(batch) seq_ids, seq_other_ids_list, seq_lengths = batch[0], batch[1: -1], batch[-1] feed_dict = {self.seq_ids_pl: seq_ids.astype(INT_TYPE), self.seq_lengths_pl: seq_lengths.astype(INT_TYPE), self.is_train_pl: False} for pl, v in zip(self.seq_other_ids_pls, seq_other_ids_list): feed_dict[pl] = v.astype(INT_TYPE) scores = self.sess.run(self.scores_op, feed_dict) stag_ids = self.inference(scores, seq_lengths) for seq, stag_id, length in izip(data[0], stag_ids, seq_lengths): output.append((seq, [self.id2tag[t] for t in stag_id[:length]])) yield zip(*output) output = [] def tag_all(self, data, batch_size): data_iter = data_iterator(data, batch_size=batch_size, shuffle=False) output = [] for b in self.tag(data_iter): output.extend(zip(*b)) return zip(*output) ################################################################################ # DATA UTILS # ################################################################################ def create_dic(item_list, add_unk=False, add_pad=False): """ Create a dictionary of items from a list of list of items. """ assert type(item_list) in (list, tuple) dic = {} for items in item_list: for item in items: if item not in dic: dic[item] = 1 else: dic[item] += 1 # Make sure that have a id 0. if add_pad: dic[''] = 1e20 # If specified, add a special item . if add_unk: dic[''] = 1e10 return dic def create_mapping(items): """ Create a mapping (item to ID / ID to item) from a dictionary. Items are ordered by decreasing frequency. """ if type(items) is dict: sorted_items = sorted(items.items(), key=lambda x: (-x[1], x[0])) id2item = {i: v[0] for i, v in enumerate(sorted_items)} item2id = {v: k for k, v in id2item.items()} return item2id, id2item elif type(items) is list: id2item = {i: v for i, v in enumerate(items)} item2id = {v: k for k, v in id2item.items()} return item2id, id2item def create_input(batch): """ Take each sentence data in batch and return an input for the training or the evaluation function. """ assert len(batch) > 0 lengths = [len(seq) for seq in batch[0]] max_len = max(2, max(lengths)) ret = [] for d in batch: dd = [] for seq_id, pos in izip(d, lengths): assert len(seq_id) == pos pad = [0] * (max_len - pos) dd.append(seq_id + pad) ret.append(np.array(dd)) ret.append(np.array(lengths)) return ret def data_to_ids(data, mappings): """ Map text data to ids. """ def strQ2B(ustring): rstring = "" for uchar in ustring: inside_code = ord(uchar) if inside_code == 12288: inside_code = 32 elif 65281 <= inside_code <= 65374: inside_code -= 65248 rstring += unichr(inside_code) return rstring def strB2Q(ustring): rstring = "" for uchar in ustring: inside_code = ord(uchar) if inside_code == 32: inside_code = 12288 elif 32 <= inside_code <= 126: inside_code += 65248 rstring += unichr(inside_code) return rstring def map(item, mapping): if item in mapping: return mapping[item] item = strB2Q(item) if item in mapping: return mapping[item] item = strQ2B(item) if item in mapping: return mapping[item] return mapping[''] def map_seq(seqs, mapping): return [[map(item, mapping) for item in seq] for seq in seqs] ret = [] for d, m in izip(data, mappings): ret.append(map_seq(d, m)) return tuple(ret) def data_iterator(inputs, batch_size, shuffle=True, max_length=200): """ A simple iterator for generating dynamic mini batches. """ assert len(inputs) > 0 assert all([len(item) == len(inputs[0]) for item in inputs]) inputs = zip(*inputs) if shuffle: np.random.shuffle(inputs) batch = [] bs = batch_size for d in inputs: if len(d[0]) > max_length: bs = max(1, min(batch_size * max_length / len(d[0]), bs)) if len(batch) < bs: batch.append(d) else: yield zip(*batch) batch = [d] if len(d[0]) < max_length: bs = batch_size else: bs = max(1, batch_size * max_length / len(d[0])) if batch: yield zip(*batch) ================================================ FILE: test.py ================================================ from __future__ import print_function import codecs import time from argparse import ArgumentParser import tensorflow as tf from tagger import Model if __name__ == '__main__': parser = ArgumentParser() parser.add_argument('--task', dest='task') parser.add_argument('--input', dest='input') parser.add_argument('--output', dest='output') parser.add_argument('--model_dir', dest='model_dir') parser.add_argument('--batch_size', dest='batch_size', type=int) args = parser.parse_args() TASK = __import__(args.task) data_iter = TASK.read_raw_file(codecs.open(args.input, 'r', 'utf8'), args.batch_size) fout = codecs.open(args.output, 'w', 'utf8') config = tf.ConfigProto() config.gpu_options.allow_growth = True config.gpu_options.per_process_gpu_memory_fraction = 1.0 config.allow_soft_placement = True config.log_device_placement = True with tf.Session(config=config) as sess: model = Model(TASK.scope, sess) model.load_model(args.model_dir) start = time.time() count = 0 for seqs, stags in model.tag(data_iter): for l in TASK.create_output(seqs, stags): count += 1 print(l, file=fout) print('Tagged %d lines in %d seconds.' % (count, time.time() - start)) end = time.time() fout.close() print('Finished.') ================================================ FILE: train.py ================================================ #!coding=utf8 from __future__ import print_function import sys, codecs import tensorflow as tf from argparse import ArgumentParser from tagger import Model class FlushFile: """ A wrapper for File, allowing users see result immediately. """ def __init__(self, f): self.f = f def write(self, x): self.f.write(x) self.f.flush() if __name__ == '__main__': sys.stdout = FlushFile(sys.stdout) parser = ArgumentParser() parser.add_argument('--task', dest='task') parser.add_argument('--training_path', dest='training_path', default='data/datasets/sighan2005-pku/train.txt') parser.add_argument('--dev_path', dest='dev_path', default='data/datasets/sighan2005-pku/dev.txt') parser.add_argument('--test_path', dest='test_path', default='data/datasets/sighan2005-pku/test.txt') parser.add_argument('--pre_trained_emb_path', dest='pre_trained_emb_path', default=None) parser.add_argument('--pre_trained_word_emb_path', dest='pre_trained_word_emb_path', default=None) parser.add_argument('--model_root', dest='model_root', default='model-pku') parser.add_argument('--emb_size', dest='emb_size', type=int, default=200) parser.add_argument('--word_window', dest='word_window', type=int, default=0) parser.add_argument('--hidden_layers', dest='hidden_layers', type=int, default=5) parser.add_argument('--channels', dest='channels', type=int, default=200) parser.add_argument('--kernel_size', dest='kernel_size', type=int, default=3) parser.add_argument('--word_emb_size', dest='word_emb_size', type=int, default=50) parser.add_argument('--use_bn', dest='use_bn', type=int, default=0) parser.add_argument('--use_wn', dest='use_wn', type=int, default=1) parser.add_argument('--dropout_emb', dest='dropout_emb', type=float, default=0.2) parser.add_argument('--dropout_hidden', dest='dropout_hidden', type=float, default=0.2) parser.add_argument('--active_type', dest='active_type', default='glu') parser.add_argument('--lamd', dest='lamd', type=float, default=0) parser.add_argument('--fix_word_emb', dest='fix_word_emb', type=int, default=0) parser.add_argument('--reserve_all_word_emb', dest='reserve_all_word_emb', type=int, default=0) parser.add_argument('--use_crf', dest='use_crf', type=int, default=1) parser.add_argument('--optimizer', dest='optimizer', default='adam_0.001') parser.add_argument('--batch_size', dest='batch_size', type=int, default=100) parser.add_argument('--eval_batch_size', dest='eval_batch_size', type=int, default=1000) parser.add_argument('--max_epoches', dest='max_epoches', type=int, default=100) args = parser.parse_args() print(args) TASK = __import__(args.task) train_data, dev_data, test_data = (TASK.read_train_file(codecs.open(args.training_path, 'r', 'utf8'), word_window=args.word_window), TASK.read_train_file(codecs.open(args.dev_path, 'r', 'utf8'), word_window=args.word_window), TASK.read_train_file(codecs.open(args.test_path, 'r', 'utf8'), word_window=args.word_window)) sess = tf.Session() model = Model(TASK.scope, sess) model.train(train_data=train_data, dev_data=dev_data, test_data=test_data, model_dir=args.model_root + '/models', log_dir=args.model_root + '/logs', emb_size=args.emb_size, word_emb_size=args.word_emb_size, hidden_layers=args.hidden_layers, channels=[args.channels] * args.hidden_layers, kernel_size=args.kernel_size, use_bn=args.use_bn, use_wn=args.use_wn, active_type=args.active_type, batch_size=args.batch_size, use_crf=args.use_crf, lamd=args.lamd, dropout_emb=args.dropout_emb, dropout_hidden=args.dropout_hidden, optimizer=args.optimizer, evaluator=TASK.evaluator, eval_batch_size=args.eval_batch_size, print_freq=50, pre_trained_emb_path=args.pre_trained_emb_path, pre_trained_word_emb_path=args.pre_trained_word_emb_path, fix_word_emb=args.fix_word_emb, reserve_all_word_emb=args.reserve_all_word_emb, max_epoches=args.max_epoches) sess.close() ================================================ FILE: train_cws.sh ================================================ #!/usr/bin/env bash corpus=$1 export CUDA_VISIBLE_DEVICES=$2 if [ "${corpus}" != "pku" ] && [ "${corpus}" != "msr" ];then echo "The first input must be pku or msr!" exit fi chmod +x score.perl model_root=model-${corpus} if [ ! -d ${model_root} ];then mkdir ${model_root} fi nohup python train.py --task cws \ --training_path data/datasets/sighan2005-${corpus}/train.txt \ --dev_path data/datasets/sighan2005-${corpus}/dev.txt \ --test_path data/datasets/sighan2005-${corpus}/test.txt \ --pre_trained_emb_path data/embeddings/news_tensite.w2v200 \ --model_root ${model_root} \ --word_window 0 \ >>${model_root}/stdout.txt 2>>${model_root}/stderr.txt & echo "Model and log are saved in ${model_root}." ================================================ FILE: train_cws_wemb.sh ================================================ #!/usr/bin/env bash corpus=$1 export CUDA_VISIBLE_DEVICES=$2 if [ "${corpus}" != "pku" ] && [ "${corpus}" != "msr" ];then echo "The first input must be pku or msr!" exit fi chmod +x score.perl model_root=model-wemb-${corpus} if [ ! -d ${model_root} ];then mkdir ${model_root} fi nohup python train.py --task cws \ --training_path data/datasets/sighan2005-${corpus}/train.txt \ --dev_path data/datasets/sighan2005-${corpus}/dev.txt \ --test_path data/datasets/sighan2005-${corpus}/test.txt \ --pre_trained_emb_path data/embeddings/news_tensite.w2v200 \ --pre_trained_word_emb_path data/embeddings/news_tensite.${corpus}.words.w2v50 \ --model_root ${model_root} \ --word_window 4 \ >>${model_root}/stdout.txt 2>>${model_root}/stderr.txt & echo "Model and log are saved in ${model_root}."