Repository: yistLin/pytorch-dual-learning
Branch: master
Commit: a2a6ff78cd86
Files: 25
Total size: 109.0 KB

Directory structure:
gitextract_r657jzkl/
├── README.md
├── data.py
├── dual.py
├── lm/
│   ├── README.md
│   ├── __init__.py
│   ├── data.py
│   ├── generate.py
│   ├── lm_prob.py
│   ├── main.py
│   └── model.py
├── model.py
├── nmt/
│   ├── README.md
│   ├── __init__.py
│   ├── channel.py
│   ├── model.py
│   ├── nmt.py
│   ├── scripts/
│   │   ├── multi-bleu.perl
│   │   ├── test.sh
│   │   ├── train-small.sh
│   │   └── train.sh
│   ├── util.py
│   └── vocab.py
├── train-dual.sh
├── util.py
└── vocab.py

================================================
FILE CONTENTS
================================================

================================================
FILE: README.md
================================================
# PyTorch Dual Learning

This is the PyTorch implementation of [Dual Learning for Machine Translation](https://arxiv.org/abs/1611.00179).

The NMT models used as channels are heavily based on [pcyin/pytorch\_nmt](https://github.com/pcyin/pytorch_nmt).

### Usage

You need to prepare these models for the dual-learning step:
- Language Models x 2
- Translation Models x 2

##### Warm-up Step

- Language Models \
  Check here: [lm/](https://github.com/yistLin/pytorch-dual-learning/tree/master/lm)
- Translation Models \
  Check here: [nmt/](https://github.com/yistLin/pytorch-dual-learning/tree/master/nmt)

##### Dual Learning Step

During the reinforcement learning process, the two translation models gain rewards from the language models and from each other's reconstructions, and both translation models are updated. \
You can find more details in the paper.

- Training \
  You can simply use this [script](https://github.com/yistLin/pytorch-dual-learning/blob/master/train-dual.sh); you only have to change the paths and names to point to your own models.
- Test \
  To use the trained models, just treat them as ordinary [NMT models](https://github.com/pcyin/pytorch_nmt).

### Test (Basic)

First, we trained our basic models on 450K bilingual pairs (only 10% of the data) as a warm start. Then we set up a dual-learning game and trained the two models with the reinforcement technique described above.
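The core of the game played during these iterations (implemented in `dual.py`) is sketched below. It is a simplification: `corpus_A`, `lm_B`, `model_AB`, `model_BA`, `nll` and `reconstruction_nll` are illustrative stand-ins rather than names from this repository, and the reward standardization and per-model optimizer steps of the real loop are omitted; the `beam()` and `get_prob()` calls do mirror the repository's interfaces.

```python
for s in corpus_A:                                  # monolingual sentence in language A
    hyps = model_AB.beam(s, beam_size=5)            # translate A -> B, keep K candidates
    for ids, smid, dist in hyps:
        r1 = lm_B.get_prob(smid)                    # language-model reward (fluency of smid)
        r2 = -reconstruction_nll(model_BA, smid, s) # communication reward (can we recover s?)
        rk = alpha * r1 + (1 - alpha) * r2          # combined reward, alpha = 0.01 below
        # REINFORCE: weight the candidate's forward NLL by its reward, so that
        # gradient descent raises the probability of high-reward translations
        loss_AB = nll(dist, ids) * rk
        loss_BA = reconstruction_nll(model_BA, smid, s) * (1 - alpha)
        (loss_AB + loss_BA).backward()
```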
##### Configs

- Reward
    - language model reward: sentence log-probability, normalized by the square root of the sentence length (see `lm/lm_prob.py`)
    - final reward:
      ```
      rk = 0.01 x r1 + 0.99 x r2
      ```
- Optimizer
  ```
  torch.optim.SGD(models[m].parameters(), lr=1e-3, momentum=0.9)
  ```

##### Results

- English-Deutsch
    - after 600 iterations
      ```
      BLEU = 21.39, 49.1/26.8/17.6/12.2
      ```
    - after 1200 iterations
      ```
      BLEU = 21.49, 48.6/26.6/17.4/12.0
      ```
- Deutsch-English
    - after 600 iterations
      ```
      BLEU = 25.89, 56.0/32.8/22.3/15.8
      ```
    - after 1200 iterations
      ```
      BLEU = 25.94, 55.9/32.7/22.2/15.8
      ```

##### Comparisons

| Model        | Original | iter300 | iter600 | iter900 | iter1200 | iter1500 | iter3000 | iter4500 | iter6600 |
|--------------|---------:|--------:|--------:|--------:|---------:|---------:|---------:|---------:|---------:|
| EN-DE        | 20.54    | 21.27   | 21.39   | 21.49   | 21.46    | 21.49    | 21.56    | 21.62    | 21.60    |
| EN-DE (bleu) |          | 21.42   | 21.57   | 21.55   | 21.55    |          |          |          |          |
| DE-EN        | 24.69    | 25.90   | 25.89   | 25.91   | 26.03    | 25.94    | 26.02    | 26.18    | 26.20    |
| DE-EN (bleu) |          | 25.96   | 26.25   | 26.22   | 26.18    |          |          |          |          |

================================================
FILE: data.py
================================================
import os
import torch
import pickle


class Dictionary(object):
    def __init__(self):
        self.word2idx = {'<unk>': 0}
        self.idx2word = ['<unk>']
        self.wordcnt = {'<unk>': 1}

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
            self.wordcnt[word] = 1
        else:
            self.wordcnt[word] = self.wordcnt[word] + 1
        return self.word2idx[word]

    def getid(self, word, thresh=10):
        # rare words (seen fewer than `thresh` times) fall back to <unk>
        if (word not in self.word2idx) or (self.wordcnt[word] < thresh):
            return self.word2idx['<unk>']
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)


class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))
        with open(os.path.join(path, 'dict.pkl'), 'wb') as f:
            pickle.dump(self.dictionary, f)

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r') as f:
            tokens = 0
            for line in f:
                words = ['<s>'] + line.split() + ['</s>']
                tokens += len(words)
                for word in words:
                    self.dictionary.add_word(word)

        # Tokenize file content
        with open(path, 'r') as f:
            ids = torch.LongTensor(tokens)
            token = 0
            for line in f:
                words = ['<s>'] + line.split() + ['</s>']
                for word in words:
                    ids[token] = self.dictionary.getid(word)
                    token += 1

        return ids
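# A minimal usage sketch for the classes above (illustrative; assumes a
# directory `data/wmt16-en/` containing train.txt, valid.txt and test.txt):
#
#     corpus = Corpus('data/wmt16-en')          # also pickles the dictionary to dict.pkl
#     unk = corpus.dictionary.getid('zyzzyva')  # rare/unseen words fall back to '<unk>'
#     assert unk == corpus.dictionary.word2idx['<unk>']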
================================================
FILE: dual.py
================================================
# -*- coding: utf-8 -*-
import sys
import torch
import argparse
import random

from torch.autograd import Variable

from nmt import read_corpus, data_iter
from nmt import NMT, to_input_variable
from lm import LMProb
from lm import model


def dual(args):
    vocabs = {}
    opts = {}
    state_dicts = {}
    train_srcs = {}
    lms = {}

    # load model params & training data
    for i in range(2):
        model_id = (['A', 'B'])[i]
        print('loading pieces, part {:s}'.format(model_id))
        print(' load model{:s} from [{:s}]'.format(model_id, args.nmt[i]), file=sys.stderr)
        params = torch.load(args.nmt[i], map_location=lambda storage, loc: storage)  # load model onto CPU
        vocabs[model_id] = params['vocab']
        opts[model_id] = params['args']
        state_dicts[model_id] = params['state_dict']

        print(' load train_src{:s} from [{:s}]'.format(model_id, args.src[i]), file=sys.stderr)
        train_srcs[model_id] = read_corpus(args.src[i], source='src')

        print(' load lm{:s} from [{:s}]'.format(model_id, args.lm[i]), file=sys.stderr)
        lms[model_id] = LMProb(args.lm[i], args.dict[i])

    models = {}
    optimizers = {}
    for m in ['A', 'B']:
        # build model
        opts[m].cuda = args.cuda
        models[m] = NMT(opts[m], vocabs[m])
        models[m].load_state_dict(state_dicts[m])
        models[m].train()
        if args.cuda:
            models[m] = models[m].cuda()
        random.shuffle(train_srcs[m])

        # optimizer
        # optimizers[m] = torch.optim.Adam(models[m].parameters())
        optimizers[m] = torch.optim.SGD(models[m].parameters(), lr=1e-3, momentum=0.9)

    # loss function
    loss_nll = torch.nn.NLLLoss()
    loss_ce = torch.nn.CrossEntropyLoss()

    epoch = 0
    start = args.start_iter
    while True:
        epoch += 1
        print('\nstart of epoch {:d}'.format(epoch))

        data = {}
        data['A'] = iter(train_srcs['A'])
        data['B'] = iter(train_srcs['B'])

        start += (epoch - 1) * len(train_srcs['A']) + 1
        for t in range(start, start + len(train_srcs['A'])):
            show_log = False
            if t % args.log_every == 0:
                show_log = True

            if show_log:
                print('\nstep', t)

            for m in ['A', 'B']:
                lm_probs = []
                NLL_losses = []
                CE_losses = []

                modelA = models[m]
                modelB = models[change(m)]
                lmB = lms[change(m)]
                optimizerA = optimizers[m]
                optimizerB = optimizers[change(m)]
                vocabB = vocabs[change(m)]

                s = next(data[m])
                if show_log:
                    print('\n{:s} -> {:s}'.format(m, change(m)))
                    print('[s]', ' '.join(s))

                hyps = modelA.beam(s, beam_size=5)
                for ids, smid, dist in hyps:
                    if show_log:
                        print('[smid]', ' '.join(smid))

                    var_ids = Variable(torch.LongTensor(ids[1:]), requires_grad=False)
                    NLL_losses.append(loss_nll(dist, var_ids).cpu())

                    lm_probs.append(lmB.get_prob(smid))

                    src_sent_var = to_input_variable([smid], vocabB.src, cuda=args.cuda)
                    tgt_sent_var = to_input_variable([['<s>'] + s + ['</s>']], vocabB.tgt, cuda=args.cuda)
                    src_sent_len = [len(smid)]

                    score = modelB(src_sent_var, src_sent_len, tgt_sent_var[:-1]).squeeze(1)
                    CE_losses.append(loss_ce(score, tgt_sent_var[1:].view(-1)).cpu())

                # losses on target language
                fw_losses = torch.cat(NLL_losses)
                # losses on reconstruction
                bw_losses = torch.cat(CE_losses)

                # r1, language model reward
                r1s = Variable(torch.FloatTensor(lm_probs), requires_grad=False)
                r1s = (r1s - torch.mean(r1s)) / torch.std(r1s)

                # r2, communication reward
                r2s = Variable(bw_losses.data, requires_grad=False)
                r2s = (torch.mean(r2s) - r2s) / torch.std(r2s)

                # rk = alpha * r1 + (1 - alpha) * r2
                rks = r1s * args.alpha + r2s * (1 - args.alpha)

                # averaging loss over samples
                A_loss = torch.mean(fw_losses * rks)
                B_loss = torch.mean(bw_losses * (1 - args.alpha))

                if show_log:
                    for r1, r2, rk, fw_loss, bw_loss in zip(r1s.data.numpy(), r2s.data.numpy(), rks.data.numpy(), fw_losses.data.numpy(), bw_losses.data.numpy()):
                        print('r1={:7.4f}\t r2={:7.4f}\t rk={:7.4f}\t fw_loss={:7.4f}\t bw_loss={:7.4f}'.format(r1, r2, rk, fw_loss, bw_loss))
                    print('A loss = {:.7f} \t B loss = {:.7f}'.format(A_loss.data.numpy().item(), B_loss.data.numpy().item()))

                optimizerA.zero_grad()
                optimizerB.zero_grad()
                A_loss.backward()
                B_loss.backward()
                optimizerA.step()
                optimizerB.step()

            if t % args.save_n_iter == 0:
                print('\nsaving model')
                models['A'].save('{}.iter{}.bin'.format(args.model[0], t))
                models['B'].save('{}.iter{}.bin'.format(args.model[1], t))


def change(m):
    if m == 'A':
        return 'B'
    else:
        return 'A'
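# Why weighting the NLL by rk works (a sketch of the reasoning, not extra
# functionality): for candidates smid drawn from p_AB(. | s),
#
#     grad E[rk] = E[ rk * grad log p_AB(smid | s) ]
#
# so gradient descent on mean(rk * NLL) -- exactly A_loss above, since
# NLL = -log p_AB(smid | s) -- is stochastic gradient ascent on the expected
# reward E[rk]. Standardizing r1 and r2 across the beam candidates acts as a
# simple baseline that reduces the variance of this estimator.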
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--nmt', nargs=2, required=True, help='pre-trained nmt model path')
    parser.add_argument('--lm', nargs=2, required=True, help='language model path')
    parser.add_argument('--dict', nargs=2, required=True, help='dictionary path')
    parser.add_argument('--src', nargs=2, required=True, help='training data path')
    parser.add_argument('--model', nargs=2, type=str, default=['modelA', 'modelB'])
    parser.add_argument('--log_every', type=int, default=10)
    parser.add_argument('--save_n_iter', type=int, default=1000)
    parser.add_argument('--alpha', type=float, default=0.5)
    parser.add_argument('--start_iter', type=int, default=0)
    parser.add_argument('--cuda', action='store_true')
    args = parser.parse_args()
    print(args)

    dual(args)

================================================
FILE: lm/README.md
================================================
# Language Model

This language model is heavily based on [Word-level language modeling RNN - pytorch/examples](https://github.com/pytorch/examples/tree/master/word_language_model). To train it, just use the code here and follow the steps provided there.

### Usage

Load the pre-trained model and dictionary first, then use `get_prob()` to get the language-model probability of a sentence. The returned value is the sentence log-probability normalized by the square root of the sentence length.

```python
words = ['we', 'have', 'told', 'that', 'this', 'will']
lmprob = LMProb('wmt16-en.pt', 'data/wmt16-en/dict.pkl')
norm_prob = lmprob.get_prob(words, verbose=True)
print('norm_prob = {:.4f}'.format(norm_prob))
```

================================================
FILE: lm/__init__.py
================================================
from lm.lm_prob import LMProb
from lm import model

================================================
FILE: lm/data.py
================================================
import os
import torch
import pickle


class Dictionary(object):
    def __init__(self):
        self.word2idx = {'<unk>': 0}
        self.idx2word = ['<unk>']
        self.wordcnt = {'<unk>': 1}

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
            self.wordcnt[word] = 1
        else:
            self.wordcnt[word] = self.wordcnt[word] + 1
        return self.word2idx[word]

    def getid(self, word, thresh=10):
        # rare words (seen fewer than `thresh` times) fall back to <unk>
        if (word not in self.word2idx) or (self.wordcnt[word] < thresh):
            return self.word2idx['<unk>']
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)


class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))
        with open(os.path.join(path, 'dict.pkl'), 'wb') as f:
            pickle.dump(self.dictionary, f)

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r') as f:
            tokens = 0
            for line in f:
                words = ['<s>'] + line.split() + ['</s>']
                tokens += len(words)
                for word in words:
                    self.dictionary.add_word(word)

        # Tokenize file content
        with open(path, 'r') as f:
            ids = torch.LongTensor(tokens)
            token = 0
            for line in f:
                words = ['<s>'] + line.split() + ['</s>']
                for word in words:
                    ids[token] = self.dictionary.getid(word)
                    token += 1

        return ids

================================================
FILE: lm/generate.py
================================================
###############################################################################
# Language Modeling on Penn Tree Bank
#
# This file generates new sentences sampled from the language model
#
###############################################################################

import argparse

import torch
from torch.autograd import Variable

import data

parser = argparse.ArgumentParser(description='PyTorch PTB Language Model')

# Model parameters.
parser.add_argument('--data', type=str, default='./data/penn',
                    help='location of the data corpus')
parser.add_argument('--checkpoint', type=str, default='./model.pt',
                    help='model checkpoint to use')
parser.add_argument('--outf', type=str, default='output.txt',
                    help='output file for generated text')
parser.add_argument('--words', type=int, default=1000,
                    help='number of words to generate')
parser.add_argument('--seed', type=int, default=1111,
                    help='random seed')
parser.add_argument('--cuda', action='store_true',
                    help='use CUDA')
parser.add_argument('--temperature', type=float, default=1.0,
                    help='temperature - higher will increase diversity')
parser.add_argument('--log-interval', type=int, default=100,
                    help='reporting interval')
args = parser.parse_args()

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    else:
        torch.cuda.manual_seed(args.seed)

if args.temperature < 1e-3:
    parser.error("--temperature has to be greater or equal 1e-3")

with open(args.checkpoint, 'rb') as f:
    model = torch.load(f)
model.eval()

if args.cuda:
    model.cuda()
else:
    model.cpu()

corpus = data.Corpus(args.data)
ntokens = len(corpus.dictionary)
hidden = model.init_hidden(1)
input = Variable(torch.rand(1, 1).mul(ntokens).long(), volatile=True)
if args.cuda:
    input.data = input.data.cuda()

with open(args.outf, 'w') as outf:
    for i in range(args.words):
        output, hidden = model(input, hidden)
        word_weights = output.squeeze().data.div(args.temperature).exp().cpu()
        word_idx = torch.multinomial(word_weights, 1)[0]
        input.data.fill_(word_idx)
        word = corpus.dictionary.idx2word[word_idx]

        outf.write(word + ('\n' if i % 20 == 19 else ' '))

        if i % args.log_interval == 0:
            print('| Generated {}/{} words'.format(i, args.words))

================================================
FILE: lm/lm_prob.py
================================================
# -*- coding: utf-8 -*-

import math
import torch
import pickle
import numpy as np
from torch.autograd import Variable


class LMProb():

    def __init__(self, model_path, dict_path):
        with open(model_path, 'rb') as f:
            self.model = torch.load(f)
            self.model.eval()
            self.model = self.model.cpu()

        with open(dict_path, 'rb') as f:
            self.dictionary = pickle.load(f)

    def get_prob(self, words, verbose=False):
        pad_words = ['<s>'] + words + ['</s>']
        indxs = [self.dictionary.getid(w) for w in pad_words]
        input = Variable(torch.LongTensor([int(indxs[0])]).unsqueeze(0), volatile=True)

        if verbose:
            print('words =', pad_words)
            print('indxs =', indxs)

        hidden = self.model.init_hidden(1)
        log_probs = []
        for i in range(1, len(pad_words)):
            output, hidden = self.model(input, hidden)
            word_weights = output.squeeze().data.exp()
            prob = word_weights[indxs[i]] / word_weights.sum()
            log_probs.append(math.log(prob))
            input.data.fill_(int(indxs[i]))

        if verbose:
            for i in range(len(log_probs)):
                print(' {} => {:d},\tlogP(w|s)={:.4f}'.format(pad_words[i+1], indxs[i+1], log_probs[i]))
            print('\n => sum_prob = {:.4f}'.format(sum(log_probs)))

        # sentence log-probability, normalized by sqrt(length)
        return sum(log_probs) / math.sqrt(len(log_probs))


if __name__ == '__main__':
    words = ['we', 'have', 'told', 'that', 'this', 'will']
    lmprob = LMProb('wmt16-en.pt', 'data/wmt16-en/dict.pkl')
    norm_prob = lmprob.get_prob(words, verbose=True)
    print('\n => norm_prob = {:.4f}'.format(norm_prob))
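# The normalized score returned by get_prob() is exactly the language-model
# reward r1 used in dual.py, which calls lms[change(m)].get_prob(smid) for
# each intermediate translation and then standardizes the scores across the
# beam candidates. A minimal sketch (model/dict paths are illustrative):
#
#     lmprob = LMProb('wmt16-de.pt', 'data/wmt16-de/dict.pkl')
#     r1 = lmprob.get_prob(['das', 'ist', 'gut'])   # higher = more fluent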
================================================
FILE: lm/main.py
================================================
# coding: utf-8
import argparse
import time
import math
import torch
import torch.nn as nn
from torch.autograd import Variable

import data
import model

parser = argparse.ArgumentParser(description='PyTorch PennTreeBank RNN/LSTM Language Model')
parser.add_argument('--data', type=str, default='./data/penn',
                    help='location of the data corpus')
parser.add_argument('--emsize', type=int, default=200,
                    help='size of word embeddings')
parser.add_argument('--nhid', type=int, default=200,
                    help='number of hidden units per layer')
parser.add_argument('--nlayers', type=int, default=2,
                    help='number of layers')
parser.add_argument('--lr', type=float, default=20,
                    help='initial learning rate')
parser.add_argument('--clip', type=float, default=0.25,
                    help='gradient clipping')
parser.add_argument('--epochs', type=int, default=40,
                    help='upper epoch limit')
parser.add_argument('--batch_size', type=int, default=20, metavar='N',
                    help='batch size')
parser.add_argument('--bptt', type=int, default=35,
                    help='sequence length')
parser.add_argument('--dropout', type=float, default=0.2,
                    help='dropout applied to layers (0 = no dropout)')
parser.add_argument('--tied', action='store_true',
                    help='tie the word embedding and softmax weights')
parser.add_argument('--seed', type=int, default=1111,
                    help='random seed')
parser.add_argument('--cuda', action='store_true',
                    help='use CUDA')
parser.add_argument('--log-interval', type=int, default=200, metavar='N',
                    help='report interval')
parser.add_argument('--save', type=str, default='model.pt',
                    help='path to save the final model')
args = parser.parse_args()

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    else:
        torch.cuda.manual_seed(args.seed)

###############################################################################
# Load data
###############################################################################

corpus = data.Corpus(args.data)

# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
# ┌ a g m s ┐
# │ b h n t │
# │ c i o u │
# │ d j p v │
# │ e k q w │
# └ f l r x ┘.
# These columns are treated as independent by the model, which means that the
# dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient
# batch processing.

def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    if args.cuda:
        data = data.cuda()
    return data
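# A concrete shape-check of the diagram above (illustrative; assumes the
# script's `args` have been parsed so `batchify` can consult args.cuda):
#
#     >>> seq = torch.arange(0, 26).long()   # the "alphabet" as ids 0..25
#     >>> batchify(seq, 4).size()            # 26 // 4 = 6 rows; 2 leftovers dropped
#     torch.Size([6, 4])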
eval_batch_size = 10
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
model = model.RNNModel(ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied)
if args.cuda:
    model.cuda()

criterion = nn.CrossEntropyLoss()

###############################################################################
# Training code
###############################################################################

def repackage_hidden(h):
    """Wraps hidden states in new Variables, to detach them from their history."""
    if type(h) == Variable:
        return Variable(h.data)
    else:
        return tuple(repackage_hidden(v) for v in h)

# get_batch subdivides the source data into chunks of length args.bptt.
# If source is equal to the example output of the batchify function, with
# a bptt-limit of 2, we'd get the following two Variables for i = 0:
# ┌ a g m s ┐ ┌ b h n t ┐
# └ b h n t ┘ └ c i o u ┘
# Note that despite the name of the function, the subdivison of data is not
# done along the batch dimension (i.e. dimension 1), since that was handled
# by the batchify function. The chunks are along dimension 0, corresponding
# to the seq_len dimension in the LSTM.

def get_batch(source, i, evaluation=False):
    seq_len = min(args.bptt, len(source) - 1 - i)
    data = Variable(source[i:i+seq_len], volatile=evaluation)
    target = Variable(source[i+1:i+1+seq_len].view(-1))
    return data, target

def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(eval_batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, evaluation=True)
        output, hidden = model(data, hidden)
        output_flat = output.view(-1, ntokens)
        total_loss += len(data) * criterion(output_flat, targets).data
        hidden = repackage_hidden(hidden)
    return total_loss[0] / len(data_source)
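# The training loop below applies plain SGD by hand (p <- p - lr * p.grad)
# after clipping, instead of going through torch.optim; `lr` is annealed by a
# factor of 4 whenever the validation loss stops improving. A rough
# torch.optim equivalent (a sketch, not what this file does):
#
#     optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)
#     ...
#     loss.backward()
#     torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
#     optimizer.step()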
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.data

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, lr,
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

# Loop over epochs.
lr = args.lr
best_val_loss = None

# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in range(1, args.epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open(args.save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
with open(args.save, 'rb') as f:
    model = torch.load(f)

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)
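# Reading the logs above: the reported loss is the average per-token
# cross-entropy in nats, and the printed perplexity is simply its
# exponential, e.g.
#
#     >>> math.exp(5.0)   # loss 5.0 -> ppl ~148.4
#     148.4131591025766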
================================================
FILE: lm/model.py
================================================
import torch.nn as nn
from torch.autograd import Variable


class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.rnn = nn.GRU(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        # Optionally tie weights as in:
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.init_weights()

        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        return Variable(weight.new(self.nlayers, bsz, self.nhid).zero_())

================================================
FILE: model.py
================================================
import torch.nn as nn
from torch.autograd import Variable


class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.rnn = nn.GRU(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        # Optionally tie weights as in:
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.init_weights()

        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        return Variable(weight.new(self.nlayers, bsz, self.nhid).zero_())

================================================
FILE: nmt/README.md
================================================
# Neural Machine Translation

This NMT model is heavily based on [pcyin/pytorch\_nmt](https://github.com/pcyin/pytorch_nmt). To train a model, just follow the steps provided there. Basically, you need to:

1. use `vocab.py` to generate the vocab file
2. use `nmt.py` to train the model

And you may find `scripts/train.sh` helpful.
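Once trained, a checkpoint can be reloaded for decoding the same way `channel.py` and `dual.py` do. A minimal sketch (the checkpoint filename is illustrative; pass `--cuda` models to `.cuda()` first):

```python
import torch
from model import NMT

params = torch.load('model.en-de.bin', map_location=lambda storage, loc: storage)
model = NMT(params['args'], params['vocab'])
model.load_state_dict(params['state_dict'])
model.eval()

hyps = model.translate(['we', 'have', 'told', 'that', '.'])
print(' '.join(hyps[0]))   # best hypothesis, as a list of target words
```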
### Test Results

##### WMT16

- English-Deutsch
    - with 10% data
      ```
      BLEU = 20.54, 49.0/26.7/17.4/11.9 (BP=0.900, ratio=0.904, hyp_len=129552, ref_len=143246)
      ```
    - with 100% data
      ```
      BLEU = 22.94, 50.9/28.9/19.5/13.8 (BP=0.915, ratio=0.919, hyp_len=131583, ref_len=143246)
      ```
- Deutsch-English
    - with 10% data
      ```
      BLEU = 24.69, 56.2/32.5/22.0/15.5 (BP=0.880, ratio=0.886, hyp_len=123720, ref_len=139584)
      ```
    - with 100% data
      ```
      BLEU = 26.73, 57.6/34.4/23.7/17.1 (BP=0.894, ratio=0.899, hyp_len=125477, ref_len=139584)
      ```

================================================
FILE: nmt/__init__.py
================================================
from nmt.util import read_corpus, data_iter
from nmt import vocab
from nmt.model import NMT, to_input_variable

================================================
FILE: nmt/channel.py
================================================
# -*- coding: utf-8 -*-

import sys
import torch
import argparse

from util import read_corpus, data_iter
from model import NMT


def sample(args):
    train_data_src = read_corpus(args.src_file, source='src')
    train_data_tgt = read_corpus(args.tgt_file, source='tgt')
    train_data = zip(train_data_src, train_data_tgt)

    # load model params
    print('load model from [%s]' % args.model_bin, file=sys.stderr)
    params = torch.load(args.model_bin, map_location=lambda storage, loc: storage)
    vocab = params['vocab']
    opt = params['args']
    state_dict = params['state_dict']

    # build model
    model = NMT(opt, vocab)
    model.load_state_dict(state_dict)
    model.eval()
    model = model.cuda()

    # sampling
    print('begin sampling')
    train_iter = cum_samples = 0
    for src_sents, tgt_sents in data_iter(train_data, batch_size=1):
        train_iter += 1
        samples = model.sample(src_sents, sample_size=5, to_word=True)
        cum_samples += sum(len(sample) for sample in samples)

        for i, tgt_sent in enumerate(tgt_sents):
            print('*' * 80)
            print('target:' + ' '.join(tgt_sent))
            tgt_samples = samples[i]
            print('samples:')
            for sid, sample in enumerate(tgt_samples, 1):
                print('[%d] %s' % (sid, ' '.join(sample[1:-1])))
            print('*' * 80)


def beam(args):
    # load model params
    print('load model from [%s]' % args.model_bin, file=sys.stderr)
    params = torch.load(args.model_bin, map_location=lambda storage, loc: storage)
    vocab = params['vocab']
    opt = params['args']
    state_dict = params['state_dict']

    # build model
    model = NMT(opt, vocab)
    model.load_state_dict(state_dict)
    model.train()
    # model.eval()
    model = model.cuda()

    # loss function
    loss_fn = torch.nn.NLLLoss()

    # sampling
    print('begin beam searching')
    src_sent = ['we', 'have', 'told', 'that', '.']
    hyps = model.beam(src_sent)
    print('src_sent:', ' '.join(src_sent))
    for ids, hyp, dist in hyps:
        print('tgt_sent:', ' '.join(hyp))
        print('tgt_ids :', end=' ')
        for id in ids:
            print(id, end=', ')
        print()
        print('out_dist:', dist)
        var_ids = torch.autograd.Variable(torch.LongTensor(ids[1:]), requires_grad=False)
        loss = loss_fn(dist, var_ids)
        print('NLL loss =', loss)
        loss.backward()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('model_bin')
    parser.add_argument('src_file')
    parser.add_argument('tgt_file')
    args = parser.parse_args()

    # sample(args)
    beam(args)

================================================
FILE: nmt/model.py
================================================
# -*- coding: utf-8 -*-

import sys
import torch
import torch.nn as nn
import torch.nn.utils
from torch.autograd import Variable
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence


def input_transpose(sents, pad_token):
    max_len = max(len(s)
for s in sents) batch_size = len(sents) sents_t = [] masks = [] for i in range(max_len): sents_t.append([sents[k][i] if len(sents[k]) > i else pad_token for k in range(batch_size)]) masks.append([1 if len(sents[k]) > i else 0 for k in range(batch_size)]) return sents_t, masks def word2id(sents, vocab): if type(sents[0]) == list: return [[vocab[w] for w in s] for s in sents] else: return [vocab[w] for w in sents] def tensor_transform(linear, X): # X is a 3D tensor return linear(X.contiguous().view(-1, X.size(2))).view(X.size(0), X.size(1), -1) class NMT(nn.Module): def __init__(self, args, vocab): super(NMT, self).__init__() self.args = args self.vocab = vocab self.src_embed = nn.Embedding(len(vocab.src), self.args.embed_size, padding_idx=vocab.src['']) self.tgt_embed = nn.Embedding(len(vocab.tgt), self.args.embed_size, padding_idx=vocab.tgt['']) self.encoder_lstm = nn.LSTM(self.args.embed_size, self.args.hidden_size, bidirectional=True, dropout=self.args.dropout) self.decoder_lstm = nn.LSTMCell(self.args.embed_size + self.args.hidden_size, self.args.hidden_size) # attention: dot product attention # project source encoding to decoder rnn's h space self.att_src_linear = nn.Linear(self.args.hidden_size * 2, self.args.hidden_size, bias=False) # transformation of decoder hidden states and context vectors before reading out target words # this produces the `attentional vector` in (Luong et al., 2015) self.att_vec_linear = nn.Linear(self.args.hidden_size * 2 + self.args.hidden_size, self.args.hidden_size, bias=False) # prediction layer of the target vocabulary self.readout = nn.Linear(self.args.hidden_size, len(vocab.tgt), bias=False) # dropout layer self.dropout = nn.Dropout(self.args.dropout) # initialize the decoder's state and cells with encoder hidden states self.decoder_cell_init = nn.Linear(self.args.hidden_size * 2, self.args.hidden_size) def forward(self, src_sents, src_sents_len, tgt_words): src_encodings, init_ctx_vec = self.encode(src_sents, src_sents_len) scores = self.decode(src_encodings, init_ctx_vec, tgt_words) return scores def encode(self, src_sents, src_sents_len): """ :param src_sents: (src_sent_len, batch_size), sorted by the length of the source :param src_sents_len: (src_sent_len) """ # (src_sent_len, batch_size, embed_size) src_word_embed = self.src_embed(src_sents) packed_src_embed = pack_padded_sequence(src_word_embed, src_sents_len) # output: (src_sent_len, batch_size, hidden_size) output, (last_state, last_cell) = self.encoder_lstm(packed_src_embed) output, _ = pad_packed_sequence(output) dec_init_cell = self.decoder_cell_init(torch.cat([last_cell[0], last_cell[1]], 1)) dec_init_state = F.tanh(dec_init_cell) return output, (dec_init_state, dec_init_cell) def decode(self, src_encoding, dec_init_vec, tgt_sents): """ :param src_encoding: (src_sent_len, batch_size, hidden_size) :param dec_init_vec: (batch_size, hidden_size) :param tgt_sents: (tgt_sent_len, batch_size) :return: """ init_state = dec_init_vec[0] init_cell = dec_init_vec[1] hidden = (init_state, init_cell) new_tensor = init_cell.data.new batch_size = src_encoding.size(1) # (batch_size, src_sent_len, hidden_size * 2) src_encoding = src_encoding.permute(1, 0, 2) # (batch_size, src_sent_len, hidden_size) src_encoding_att_linear = tensor_transform(self.att_src_linear, src_encoding) # initialize attentional vector att_tm1 = Variable(new_tensor(batch_size, self.args.hidden_size).zero_(), requires_grad=False) tgt_word_embed = self.tgt_embed(tgt_sents) scores = [] # start from ``, until y_{T-1} for y_tm1_embed in 
tgt_word_embed.split(split_size=1): # input feeding: concate y_tm1 and previous attentional vector x = torch.cat([y_tm1_embed.squeeze(0), att_tm1], 1) # h_t: (batch_size, hidden_size) h_t, cell_t = self.decoder_lstm(x, hidden) h_t = self.dropout(h_t) ctx_t, alpha_t = self.dot_prod_attention(h_t, src_encoding, src_encoding_att_linear) att_t = F.tanh(self.att_vec_linear(torch.cat([h_t, ctx_t], 1))) # E.q. (5) att_t = self.dropout(att_t) score_t = self.readout(att_t) # E.q. (6) scores.append(score_t) att_tm1 = att_t hidden = h_t, cell_t scores = torch.stack(scores) return scores def translate(self, src_sents, beam_size=None, to_word=True): """ perform beam search TODO: batched beam search """ if not type(src_sents[0]) == list: src_sents = [src_sents] if not beam_size: beam_size = self.args.beam_size src_sents_var = to_input_variable(src_sents, self.vocab.src, cuda=self.args.cuda, is_test=True) src_encoding, dec_init_vec = self.encode(src_sents_var, [len(src_sents[0])]) src_encoding_att_linear = tensor_transform(self.att_src_linear, src_encoding) init_state = dec_init_vec[0] init_cell = dec_init_vec[1] hidden = (init_state, init_cell) att_tm1 = Variable(torch.zeros(1, self.args.hidden_size), volatile=True) hyp_scores = Variable(torch.zeros(1), volatile=True) if self.args.cuda: att_tm1 = att_tm1.cuda() hyp_scores = hyp_scores.cuda() eos_id = self.vocab.tgt[''] bos_id = self.vocab.tgt[''] tgt_vocab_size = len(self.vocab.tgt) hypotheses = [[bos_id]] completed_hypotheses = [] completed_hypothesis_scores = [] t = 0 while len(completed_hypotheses) < beam_size and t < self.args.decode_max_time_step: t += 1 hyp_num = len(hypotheses) expanded_src_encoding = src_encoding.expand(src_encoding.size(0), hyp_num, src_encoding.size(2)) expanded_src_encoding_att_linear = src_encoding_att_linear.expand(src_encoding_att_linear.size(0), hyp_num, src_encoding_att_linear.size(2)) y_tm1 = Variable(torch.LongTensor([hyp[-1] for hyp in hypotheses]), volatile=True) if self.args.cuda: y_tm1 = y_tm1.cuda() y_tm1_embed = self.tgt_embed(y_tm1) x = torch.cat([y_tm1_embed, att_tm1], 1) # h_t: (hyp_num, hidden_size) h_t, cell_t = self.decoder_lstm(x, hidden) h_t = self.dropout(h_t) ctx_t, alpha_t = self.dot_prod_attention(h_t, expanded_src_encoding.permute(1, 0, 2), expanded_src_encoding_att_linear.permute(1, 0, 2)) att_t = F.tanh(self.att_vec_linear(torch.cat([h_t, ctx_t], 1))) att_t = self.dropout(att_t) score_t = self.readout(att_t) p_t = F.log_softmax(score_t) live_hyp_num = beam_size - len(completed_hypotheses) new_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(p_t) + p_t).view(-1) top_new_hyp_scores, top_new_hyp_pos = torch.topk(new_hyp_scores, k=live_hyp_num) prev_hyp_ids = top_new_hyp_pos / tgt_vocab_size word_ids = top_new_hyp_pos % tgt_vocab_size # new_hyp_scores = new_hyp_scores[top_new_hyp_pos.data] new_hypotheses = [] live_hyp_ids = [] new_hyp_scores = [] for prev_hyp_id, word_id, new_hyp_score in zip(prev_hyp_ids.cpu().data, word_ids.cpu().data, top_new_hyp_scores.cpu().data): hyp_tgt_words = hypotheses[prev_hyp_id] + [word_id] if word_id == eos_id: completed_hypotheses.append(hyp_tgt_words) completed_hypothesis_scores.append(new_hyp_score) else: new_hypotheses.append(hyp_tgt_words) live_hyp_ids.append(prev_hyp_id) new_hyp_scores.append(new_hyp_score) if len(completed_hypotheses) == beam_size: break live_hyp_ids = torch.LongTensor(live_hyp_ids) if self.args.cuda: live_hyp_ids = live_hyp_ids.cuda() hidden = (h_t[live_hyp_ids], cell_t[live_hyp_ids]) att_tm1 = att_t[live_hyp_ids] hyp_scores = 
Variable(torch.FloatTensor(new_hyp_scores), volatile=True) # new_hyp_scores[live_hyp_ids] if self.args.cuda: hyp_scores = hyp_scores.cuda() hypotheses = new_hypotheses if len(completed_hypotheses) == 0: completed_hypotheses = [hypotheses[0]] completed_hypothesis_scores = [0.0] if to_word: for i, hyp in enumerate(completed_hypotheses): completed_hypotheses[i] = [self.vocab.tgt.id2word[w] for w in hyp] ranked_hypotheses = sorted(zip(completed_hypotheses, completed_hypothesis_scores), key=lambda x: x[1], reverse=True) return [hyp for hyp, score in ranked_hypotheses] def sample(self, src_sents, sample_size=None, to_word=False): if not type(src_sents[0]) == list: src_sents = [src_sents] if not sample_size: sample_size = self.args.sample_size src_sents_num = len(src_sents) batch_size = src_sents_num * sample_size src_sents_var = to_input_variable(src_sents, self.vocab.src, cuda=self.args.cuda, is_test=True) src_encoding, (dec_init_state, dec_init_cell) = self.encode(src_sents_var, [len(s) for s in src_sents]) dec_init_state = dec_init_state.repeat(sample_size, 1) dec_init_cell = dec_init_cell.repeat(sample_size, 1) hidden = (dec_init_state, dec_init_cell) src_encoding = src_encoding.repeat(1, sample_size, 1) src_encoding_att_linear = tensor_transform(self.att_src_linear, src_encoding) src_encoding = src_encoding.permute(1, 0, 2) src_encoding_att_linear = src_encoding_att_linear.permute(1, 0, 2) new_tensor = dec_init_state.data.new att_tm1 = Variable(new_tensor(batch_size, self.args.hidden_size).zero_(), volatile=True) y_0 = Variable(torch.LongTensor([self.vocab.tgt[''] for _ in range(batch_size)]), volatile=True) eos = self.vocab.tgt[''] # eos_batch = torch.LongTensor([eos] * batch_size) sample_ends = torch.ByteTensor([0] * batch_size) all_ones = torch.ByteTensor([1] * batch_size) if self.args.cuda: y_0 = y_0.cuda() sample_ends = sample_ends.cuda() all_ones = all_ones.cuda() samples = [y_0] t = 0 while t < self.args.decode_max_time_step: t += 1 # (sample_size) y_tm1 = samples[-1] y_tm1_embed = self.tgt_embed(y_tm1) x = torch.cat([y_tm1_embed, att_tm1], 1) # h_t: (batch_size, hidden_size) h_t, cell_t = self.decoder_lstm(x, hidden) h_t = self.dropout(h_t) ctx_t, alpha_t = self.dot_prod_attention(h_t, src_encoding, src_encoding_att_linear) att_t = F.tanh(self.att_vec_linear(torch.cat([h_t, ctx_t], 1))) # E.q. (5) att_t = self.dropout(att_t) score_t = self.readout(att_t) # E.q. 
(6) p_t = F.softmax(score_t) if self.args.sample_method == 'random': y_t = torch.multinomial(p_t, num_samples=1).squeeze(1) elif self.args.sample_method == 'greedy': _, y_t = torch.topk(p_t, k=1, dim=1) y_t = y_t.squeeze(1) samples.append(y_t) sample_ends |= torch.eq(y_t, eos).byte().data if torch.equal(sample_ends, all_ones): break # if torch.equal(y_t.data, eos_batch): # break att_tm1 = att_t hidden = h_t, cell_t # post-processing completed_samples = [list([list() for _ in range(sample_size)]) for _ in range(src_sents_num)] for y_t in samples: for i, sampled_word in enumerate(y_t.cpu().data): src_sent_id = i % src_sents_num sample_id = i // src_sents_num if len(completed_samples[src_sent_id][sample_id]) == 0 or completed_samples[src_sent_id][sample_id][-1] != eos: completed_samples[src_sent_id][sample_id].append(sampled_word) if to_word: for i, src_sent_samples in enumerate(completed_samples): completed_samples[i] = word2id(src_sent_samples, self.vocab.tgt.id2word) return completed_samples def beam(self, src_sents, beam_size=3): """ perform beam search """ if not type(src_sents[0]) == list: src_sents = [src_sents] src_sents_var = to_input_variable(src_sents, self.vocab.src, cuda=self.args.cuda, is_test=False) src_encoding, dec_init_vec = self.encode(src_sents_var, [len(src_sents[0])]) src_encoding_att_linear = tensor_transform(self.att_src_linear, src_encoding) init_state = dec_init_vec[0] init_cell = dec_init_vec[1] hidden = (init_state, init_cell) att_tm1 = Variable(torch.zeros(1, self.args.hidden_size), requires_grad=False) hyp_scores = Variable(torch.zeros(1), requires_grad=False) if self.args.cuda: att_tm1 = att_tm1.cuda() hyp_scores = hyp_scores.cuda() eos_id = self.vocab.tgt[''] bos_id = self.vocab.tgt[''] tgt_vocab_size = len(self.vocab.tgt) # store output distributions out_dists = [[]] completed_out_dists = [] hypotheses = [[bos_id]] completed_hypotheses = [] completed_hypothesis_scores = [] t = 0 while len(completed_hypotheses) < beam_size and t < self.args.decode_max_time_step: t += 1 hyp_num = len(hypotheses) expanded_src_encoding = src_encoding.expand(src_encoding.size(0), hyp_num, src_encoding.size(2)) expanded_src_encoding_att_linear = src_encoding_att_linear.expand(src_encoding_att_linear.size(0), hyp_num, src_encoding_att_linear.size(2)) y_tm1 = Variable(torch.LongTensor([hyp[-1] for hyp in hypotheses]), requires_grad=False) if self.args.cuda: y_tm1 = y_tm1.cuda() y_tm1_embed = self.tgt_embed(y_tm1) x = torch.cat([y_tm1_embed, att_tm1], 1) # h_t: (hyp_num, hidden_size) h_t, cell_t = self.decoder_lstm(x, hidden) h_t = self.dropout(h_t) ctx_t, alpha_t = self.dot_prod_attention(h_t, expanded_src_encoding.permute(1, 0, 2), expanded_src_encoding_att_linear.permute(1, 0, 2)) att_t = F.tanh(self.att_vec_linear(torch.cat([h_t, ctx_t], 1))) att_t = self.dropout(att_t) score_t = self.readout(att_t) p_t = F.log_softmax(score_t) live_hyp_num = beam_size - len(completed_hypotheses) new_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(p_t) + p_t).view(-1) top_new_hyp_scores, top_new_hyp_pos = torch.topk(new_hyp_scores, k=live_hyp_num) prev_hyp_ids = top_new_hyp_pos / tgt_vocab_size word_ids = top_new_hyp_pos % tgt_vocab_size # new_hyp_scores = new_hyp_scores[top_new_hyp_pos.data] # get output distributions p_t_cpu = p_t.cpu() new_out_dists = [] new_hypotheses = [] live_hyp_ids = [] new_hyp_scores = [] for prev_hyp_id, word_id, new_hyp_score in zip(prev_hyp_ids.cpu().data, word_ids.cpu().data, top_new_hyp_scores.cpu().data): tgt_dists = out_dists[prev_hyp_id] + 
[p_t_cpu[prev_hyp_id].unsqueeze(0)] hyp_tgt_words = hypotheses[prev_hyp_id] + [word_id] if word_id == eos_id: completed_out_dists.append(tgt_dists) completed_hypotheses.append(hyp_tgt_words) completed_hypothesis_scores.append(new_hyp_score) else: new_out_dists.append(tgt_dists) new_hypotheses.append(hyp_tgt_words) live_hyp_ids.append(prev_hyp_id) new_hyp_scores.append(new_hyp_score) if len(completed_hypotheses) == beam_size: break live_hyp_ids = torch.LongTensor(live_hyp_ids) if self.args.cuda: live_hyp_ids = live_hyp_ids.cuda() hidden = (h_t[live_hyp_ids], cell_t[live_hyp_ids]) att_tm1 = att_t[live_hyp_ids] hyp_scores = Variable(torch.FloatTensor(new_hyp_scores), requires_grad=False) # new_hyp_scores[live_hyp_ids] if self.args.cuda: hyp_scores = hyp_scores.cuda() out_dists = new_out_dists hypotheses = new_hypotheses if len(completed_hypotheses) == 0: completed_out_dists = [out_dists[0]] completed_hypotheses = [hypotheses[0]] completed_hypothesis_scores = [0.0] # convert to words completed_hypotheses_words = [] for i, hyp in enumerate(completed_hypotheses): completed_hypotheses_words.append([self.vocab.tgt.id2word[w] for w in hyp]) # merge variables for i, dists in enumerate(completed_out_dists): completed_out_dists[i] = torch.cat(dists, 0) # sort with scores ranked_hypotheses = sorted(zip(completed_hypotheses, completed_hypothesis_scores, completed_hypotheses_words, completed_out_dists), key=lambda x: x[1], reverse=True) return [(hyp, words, dist) for hyp, score, words, dist in ranked_hypotheses] def attention(self, h_t, src_encoding, src_linear_for_att): # (1, batch_size, attention_size) + (src_sent_len, batch_size, attention_size) => # (src_sent_len, batch_size, attention_size) att_hidden = F.tanh(self.att_h_linear(h_t).unsqueeze(0).expand_as(src_linear_for_att) + src_linear_for_att) # (batch_size, src_sent_len) att_weights = F.softmax(tensor_transform(self.att_vec_linear, att_hidden).squeeze(2).permute(1, 0)) # (batch_size, hidden_size * 2) ctx_vec = torch.bmm(src_encoding.permute(1, 2, 0), att_weights.unsqueeze(2)).squeeze(2) return ctx_vec, att_weights def dot_prod_attention(self, h_t, src_encoding, src_encoding_att_linear, mask=None): """ :param h_t: (batch_size, hidden_size) :param src_encoding: (batch_size, src_sent_len, hidden_size * 2) :param src_encoding_att_linear: (batch_size, src_sent_len, hidden_size) :param mask: (batch_size, src_sent_len) """ # (batch_size, src_sent_len) att_weight = torch.bmm(src_encoding_att_linear, h_t.unsqueeze(2)).squeeze(2) if mask: att_weight.data.masked_fill_(mask, -float('inf')) att_weight = F.softmax(att_weight) att_view = (att_weight.size(0), 1, att_weight.size(1)) # (batch_size, hidden_size) ctx_vec = torch.bmm(att_weight.view(*att_view), src_encoding).squeeze(1) return ctx_vec, att_weight def save(self, path): print('save parameters to [%s]' % path, file=sys.stderr) params = { 'args': self.args, 'vocab': self.vocab, 'state_dict': self.state_dict() } torch.save(params, path) def to_input_variable(sents, vocab, cuda=False, is_test=False): """ return a tensor of shape (src_sent_len, batch_size) """ word_ids = word2id(sents, vocab) sents_t, masks = input_transpose(word_ids, vocab['']) sents_var = Variable(torch.LongTensor(sents_t), volatile=is_test, requires_grad=False) if cuda: sents_var = sents_var.cuda() return sents_var ================================================ FILE: nmt/nmt.py ================================================ from __future__ import print_function import os import sys import time import argparse from itertools import 
tee import numpy as np import torch import torch.nn as nn import torch.nn.utils from torch.autograd import Variable from torch import optim from torch.nn import Parameter import torch.nn.functional as F from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence from nltk.translate.bleu_score import corpus_bleu from util import read_corpus, data_iter from vocab import Vocab, VocabEntry def init_config(): parser = argparse.ArgumentParser() parser.add_argument('--seed', default=5783287, type=int, help='random seed') parser.add_argument('--cuda', action='store_true', default=False, help='use gpu') parser.add_argument('--mode', choices=['train', 'raml_train', 'test', 'sample', 'prob', 'interactive'], default='train', help='run mode') parser.add_argument('--vocab', type=str, help='path of the serialized vocabulary') parser.add_argument('--batch_size', default=32, type=int, help='batch size') parser.add_argument('--beam_size', default=5, type=int, help='beam size for beam search') parser.add_argument('--sample_size', default=10, type=int, help='sample size') parser.add_argument('--embed_size', default=256, type=int, help='size of word embeddings') parser.add_argument('--hidden_size', default=256, type=int, help='size of LSTM hidden states') parser.add_argument('--dropout', default=0., type=float, help='dropout rate') parser.add_argument('--train_src', type=str, help='path to the training source file') parser.add_argument('--train_tgt', type=str, help='path to the training target file') parser.add_argument('--dev_src', type=str, help='path to the dev source file') parser.add_argument('--dev_tgt', type=str, help='path to the dev target file') parser.add_argument('--test_src', type=str, help='path to the test source file') parser.add_argument('--test_tgt', type=str, help='path to the test target file') parser.add_argument('--decode_max_time_step', default=200, type=int, help='maximum number of time steps used ' 'in decoding and sampling') parser.add_argument('--valid_niter', default=500, type=int, help='every n iterations to perform validation') parser.add_argument('--valid_metric', default='bleu', choices=['bleu', 'ppl', 'word_acc', 'sent_acc'], help='metric used for validation') parser.add_argument('--log_every', default=50, type=int, help='every n iterations to log training statistics') parser.add_argument('--load_model', default=None, type=str, help='load a pre-trained model') parser.add_argument('--save_to', default='model', type=str, help='save trained model to') parser.add_argument('--save_model_after', default=2, type=int, help='save the model only after n validation iterations') parser.add_argument('--save_to_file', default=None, type=str, help='if provided, save decoding results to file') parser.add_argument('--save_nbest', default=False, action='store_true', help='save nbest decoding results') parser.add_argument('--patience', default=5, type=int, help='training patience') parser.add_argument('--uniform_init', default=None, type=float, help='if specified, use uniform initialization for all parameters') parser.add_argument('--clip_grad', default=5., type=float, help='clip gradients') parser.add_argument('--max_niter', default=-1, type=int, help='maximum number of training iterations') parser.add_argument('--lr', default=0.001, type=float, help='learning rate') parser.add_argument('--lr_decay', default=0.5, type=float, help='decay learning rate if the validation performance drops') # raml training parser.add_argument('--debug', default=False, action='store_true') 
parser.add_argument('--temp', default=0.85, type=float, help='temperature in reward distribution') parser.add_argument('--raml_sample_mode', default='pre_sample', choices=['pre_sample', 'hamming_distance', 'hamming_distance_impt_sample'], help='sample mode when using RAML') parser.add_argument('--raml_sample_file', type=str, help='path to the sampled targets') parser.add_argument('--raml_bias_groundtruth', action='store_true', default=False, help='make sure ground truth y* is in samples') parser.add_argument('--smooth_bleu', action='store_true', default=False, help='smooth sentence level BLEU score.') #TODO: greedy sampling is still buggy! parser.add_argument('--sample_method', default='random', choices=['random', 'greedy']) args = parser.parse_args() # seed the RNG torch.manual_seed(args.seed) if args.cuda: torch.cuda.manual_seed(args.seed) np.random.seed(args.seed * 13 // 7) return args def input_transpose(sents, pad_token): max_len = max(len(s) for s in sents) batch_size = len(sents) sents_t = [] masks = [] for i in range(max_len): sents_t.append([sents[k][i] if len(sents[k]) > i else pad_token for k in range(batch_size)]) masks.append([1 if len(sents[k]) > i else 0 for k in range(batch_size)]) return sents_t, masks def word2id(sents, vocab): if type(sents[0]) == list: return [[vocab[w] for w in s] for s in sents] else: return [vocab[w] for w in sents] def tensor_transform(linear, X): # X is a 3D tensor return linear(X.contiguous().view(-1, X.size(2))).view(X.size(0), X.size(1), -1) class NMT(nn.Module): def __init__(self, args, vocab): super(NMT, self).__init__() self.args = args self.vocab = vocab self.src_embed = nn.Embedding(len(vocab.src), args.embed_size, padding_idx=vocab.src['']) self.tgt_embed = nn.Embedding(len(vocab.tgt), args.embed_size, padding_idx=vocab.tgt['']) self.encoder_lstm = nn.LSTM(args.embed_size, args.hidden_size, bidirectional=True, dropout=args.dropout) self.decoder_lstm = nn.LSTMCell(args.embed_size + args.hidden_size, args.hidden_size) # attention: dot product attention # project source encoding to decoder rnn's h space self.att_src_linear = nn.Linear(args.hidden_size * 2, args.hidden_size, bias=False) # transformation of decoder hidden states and context vectors before reading out target words # this produces the `attentional vector` in (Luong et al., 2015) self.att_vec_linear = nn.Linear(args.hidden_size * 2 + args.hidden_size, args.hidden_size, bias=False) # prediction layer of the target vocabulary self.readout = nn.Linear(args.hidden_size, len(vocab.tgt), bias=False) # dropout layer self.dropout = nn.Dropout(args.dropout) # initialize the decoder's state and cells with encoder hidden states self.decoder_cell_init = nn.Linear(args.hidden_size * 2, args.hidden_size) def forward(self, src_sents, src_sents_len, tgt_words): src_encodings, init_ctx_vec = self.encode(src_sents, src_sents_len) scores = self.decode(src_encodings, init_ctx_vec, tgt_words) return scores def encode(self, src_sents, src_sents_len): """ :param src_sents: (src_sent_len, batch_size), sorted by the length of the source :param src_sents_len: (src_sent_len) """ # (src_sent_len, batch_size, embed_size) src_word_embed = self.src_embed(src_sents) packed_src_embed = pack_padded_sequence(src_word_embed, src_sents_len) # output: (src_sent_len, batch_size, hidden_size) output, (last_state, last_cell) = self.encoder_lstm(packed_src_embed) output, _ = pad_packed_sequence(output) dec_init_cell = self.decoder_cell_init(torch.cat([last_cell[0], last_cell[1]], 1)) dec_init_state = F.tanh(dec_init_cell) 
return output, (dec_init_state, dec_init_cell) def decode(self, src_encoding, dec_init_vec, tgt_sents): """ :param src_encoding: (src_sent_len, batch_size, hidden_size) :param dec_init_vec: (batch_size, hidden_size) :param tgt_sents: (tgt_sent_len, batch_size) :return: """ init_state = dec_init_vec[0] init_cell = dec_init_vec[1] hidden = (init_state, init_cell) new_tensor = init_cell.data.new batch_size = src_encoding.size(1) # (batch_size, src_sent_len, hidden_size * 2) src_encoding = src_encoding.permute(1, 0, 2) # (batch_size, src_sent_len, hidden_size) src_encoding_att_linear = tensor_transform(self.att_src_linear, src_encoding) # initialize attentional vector att_tm1 = Variable(new_tensor(batch_size, self.args.hidden_size).zero_(), requires_grad=False) tgt_word_embed = self.tgt_embed(tgt_sents) scores = [] # start from ``, until y_{T-1} for y_tm1_embed in tgt_word_embed.split(split_size=1): # input feeding: concate y_tm1 and previous attentional vector x = torch.cat([y_tm1_embed.squeeze(0), att_tm1], 1) # h_t: (batch_size, hidden_size) h_t, cell_t = self.decoder_lstm(x, hidden) h_t = self.dropout(h_t) ctx_t, alpha_t = self.dot_prod_attention(h_t, src_encoding, src_encoding_att_linear) att_t = F.tanh(self.att_vec_linear(torch.cat([h_t, ctx_t], 1))) # E.q. (5) att_t = self.dropout(att_t) score_t = self.readout(att_t) # E.q. (6) scores.append(score_t) att_tm1 = att_t hidden = h_t, cell_t scores = torch.stack(scores) return scores def translate(self, src_sents, beam_size=None, to_word=True): """ perform beam search TODO: batched beam search """ if not type(src_sents[0]) == list: src_sents = [src_sents] if not beam_size: beam_size = args.beam_size src_sents_var = to_input_variable(src_sents, self.vocab.src, cuda=args.cuda, is_test=True) src_encoding, dec_init_vec = self.encode(src_sents_var, [len(src_sents[0])]) src_encoding_att_linear = tensor_transform(self.att_src_linear, src_encoding) init_state = dec_init_vec[0] init_cell = dec_init_vec[1] hidden = (init_state, init_cell) att_tm1 = Variable(torch.zeros(1, self.args.hidden_size), volatile=True) hyp_scores = Variable(torch.zeros(1), volatile=True) if args.cuda: att_tm1 = att_tm1.cuda() hyp_scores = hyp_scores.cuda() eos_id = self.vocab.tgt[''] bos_id = self.vocab.tgt[''] tgt_vocab_size = len(self.vocab.tgt) hypotheses = [[bos_id]] completed_hypotheses = [] completed_hypothesis_scores = [] t = 0 while len(completed_hypotheses) < beam_size and t < args.decode_max_time_step: t += 1 hyp_num = len(hypotheses) expanded_src_encoding = src_encoding.expand(src_encoding.size(0), hyp_num, src_encoding.size(2)) expanded_src_encoding_att_linear = src_encoding_att_linear.expand(src_encoding_att_linear.size(0), hyp_num, src_encoding_att_linear.size(2)) y_tm1 = Variable(torch.LongTensor([hyp[-1] for hyp in hypotheses]), volatile=True) if args.cuda: y_tm1 = y_tm1.cuda() y_tm1_embed = self.tgt_embed(y_tm1) x = torch.cat([y_tm1_embed, att_tm1], 1) # h_t: (hyp_num, hidden_size) h_t, cell_t = self.decoder_lstm(x, hidden) h_t = self.dropout(h_t) ctx_t, alpha_t = self.dot_prod_attention(h_t, expanded_src_encoding.permute(1, 0, 2), expanded_src_encoding_att_linear.permute(1, 0, 2)) att_t = F.tanh(self.att_vec_linear(torch.cat([h_t, ctx_t], 1))) att_t = self.dropout(att_t) score_t = self.readout(att_t) p_t = F.log_softmax(score_t) live_hyp_num = beam_size - len(completed_hypotheses) new_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(p_t) + p_t).view(-1) top_new_hyp_scores, top_new_hyp_pos = torch.topk(new_hyp_scores, k=live_hyp_num) prev_hyp_ids = 
    def sample(self, src_sents, sample_size=None, to_word=False):
        if not type(src_sents[0]) == list:
            src_sents = [src_sents]
        if not sample_size:
            sample_size = args.sample_size

        src_sents_num = len(src_sents)
        batch_size = src_sents_num * sample_size

        src_sents_var = to_input_variable(src_sents, self.vocab.src, cuda=args.cuda, is_test=True)
        src_encoding, (dec_init_state, dec_init_cell) = self.encode(src_sents_var, [len(s) for s in src_sents])

        dec_init_state = dec_init_state.repeat(sample_size, 1)
        dec_init_cell = dec_init_cell.repeat(sample_size, 1)
        hidden = (dec_init_state, dec_init_cell)

        # tile everything
        # if args.sample_method == 'expand':
        #     # src_enc: (src_sent_len, sample_size, enc_size)
        #     # cat result: (src_sent_len, batch_size * sample_size, enc_size)
        #     src_encoding = torch.cat([src_enc.expand(src_enc.size(0), sample_size, src_enc.size(2)) for src_enc in src_encoding.split(1, dim=1)], 1)
        #     dec_init_state = torch.cat([x.expand(sample_size, x.size(1)) for x in dec_init_state.split(1, dim=0)], 0)
        #     dec_init_cell = torch.cat([x.expand(sample_size, x.size(1)) for x in dec_init_cell.split(1, dim=0)], 0)
        # elif args.sample_method == 'repeat':
        src_encoding = src_encoding.repeat(1, sample_size, 1)
        src_encoding_att_linear = tensor_transform(self.att_src_linear, src_encoding)

        src_encoding = src_encoding.permute(1, 0, 2)
        src_encoding_att_linear = src_encoding_att_linear.permute(1, 0, 2)

        new_tensor = dec_init_state.data.new
        att_tm1 = Variable(new_tensor(batch_size, self.args.hidden_size).zero_(), volatile=True)
        y_0 = Variable(torch.LongTensor([self.vocab.tgt['<s>'] for _ in range(batch_size)]), volatile=True)

        eos = self.vocab.tgt['</s>']
        # eos_batch = torch.LongTensor([eos] * batch_size)
        sample_ends = torch.ByteTensor([0] * batch_size)
        all_ones = torch.ByteTensor([1] * batch_size)
        if args.cuda:
            y_0 = y_0.cuda()
            sample_ends = sample_ends.cuda()
            all_ones = all_ones.cuda()

        samples = [y_0]

        t = 0
        while t < args.decode_max_time_step:
            t += 1

            # (sample_size)
            y_tm1 = samples[-1]
            y_tm1_embed = self.tgt_embed(y_tm1)

            x = torch.cat([y_tm1_embed, att_tm1], 1)

            # h_t: (batch_size, hidden_size)
            h_t, cell_t = self.decoder_lstm(x, hidden)
            h_t = self.dropout(h_t)

            ctx_t, alpha_t = self.dot_prod_attention(h_t, src_encoding, src_encoding_att_linear)

            att_t = F.tanh(self.att_vec_linear(torch.cat([h_t, ctx_t], 1)))  # Eq. (5)
            att_t = self.dropout(att_t)

            score_t = self.readout(att_t)  # Eq. (6)
            p_t = F.softmax(score_t)

            if args.sample_method == 'random':
                y_t = torch.multinomial(p_t, num_samples=1).squeeze(1)
            elif args.sample_method == 'greedy':
                _, y_t = torch.topk(p_t, k=1, dim=1)
                y_t = y_t.squeeze(1)

            samples.append(y_t)

            sample_ends |= torch.eq(y_t, eos).byte().data

            if torch.equal(sample_ends, all_ones):
                break

            # if torch.equal(y_t.data, eos_batch):
            #     break

            att_tm1 = att_t
            hidden = h_t, cell_t

        # post-processing: drop everything after the first </s> of each sample
        completed_samples = [list([list() for _ in range(sample_size)]) for _ in range(src_sents_num)]
        for y_t in samples:
            for i, sampled_word in enumerate(y_t.cpu().data):
                src_sent_id = i % src_sents_num
                sample_id = i // src_sents_num  # integer division; plain `/` yields a float index under Python 3
                if len(completed_samples[src_sent_id][sample_id]) == 0 or completed_samples[src_sent_id][sample_id][-1] != eos:
                    completed_samples[src_sent_id][sample_id].append(sampled_word)

        if to_word:
            for i, src_sent_samples in enumerate(completed_samples):
                # word2id also works in reverse here: vocab.tgt.id2word maps ids back to words
                completed_samples[i] = word2id(src_sent_samples, self.vocab.tgt.id2word)

        return completed_samples
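    # Batch layout in `sample` (descriptive note): `repeat(1, sample_size, 1)` tiles
    # the n source sentences as [s_0 .. s_n-1, s_0 .. s_n-1, ...], so flat position i
    # belongs to source i % src_sents_num and is its (i // src_sents_num)-th sample;
    # the post-processing loop above relies on exactly this ordering.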
    def attention(self, h_t, src_encoding, src_linear_for_att):
        # (1, batch_size, attention_size) + (src_sent_len, batch_size, attention_size) =>
        # (src_sent_len, batch_size, attention_size)
        att_hidden = F.tanh(self.att_h_linear(h_t).unsqueeze(0).expand_as(src_linear_for_att) + src_linear_for_att)

        # (batch_size, src_sent_len)
        att_weights = F.softmax(tensor_transform(self.att_vec_linear, att_hidden).squeeze(2).permute(1, 0))

        # (batch_size, hidden_size * 2)
        ctx_vec = torch.bmm(src_encoding.permute(1, 2, 0), att_weights.unsqueeze(2)).squeeze(2)

        return ctx_vec, att_weights

    def dot_prod_attention(self, h_t, src_encoding, src_encoding_att_linear, mask=None):
        """
        :param h_t: (batch_size, hidden_size)
        :param src_encoding: (batch_size, src_sent_len, hidden_size * 2)
        :param src_encoding_att_linear: (batch_size, src_sent_len, hidden_size)
        :param mask: (batch_size, src_sent_len)
        """
        # (batch_size, src_sent_len)
        att_weight = torch.bmm(src_encoding_att_linear, h_t.unsqueeze(2)).squeeze(2)
        if mask is not None:  # explicit None check; truth-testing a tensor is ambiguous
            att_weight.data.masked_fill_(mask, -float('inf'))
        att_weight = F.softmax(att_weight)

        att_view = (att_weight.size(0), 1, att_weight.size(1))
        # (batch_size, hidden_size)
        ctx_vec = torch.bmm(att_weight.view(*att_view), src_encoding).squeeze(1)

        return ctx_vec, att_weight

    def save(self, path):
        print('save parameters to [%s]' % path, file=sys.stderr)
        params = {
            'args': self.args,
            'vocab': self.vocab,
            'state_dict': self.state_dict()
        }
        torch.save(params, path)


def to_input_variable(sents, vocab, cuda=False, is_test=False):
    """
    return a tensor of shape (src_sent_len, batch_size)
    """
    word_ids = word2id(sents, vocab)
    sents_t, masks = input_transpose(word_ids, vocab['<pad>'])

    sents_var = Variable(torch.LongTensor(sents_t), volatile=is_test, requires_grad=False)
    if cuda:
        sents_var = sents_var.cuda()

    return sents_var
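# Illustrative example for `to_input_variable` (hypothetical data, not in the
# original source): a batch [['a', 'b'], ['c']] with vocab {'<pad>': 0, 'a': 5,
# 'b': 6, 'c': 7} is transposed and padded column-per-sentence into
#   [[5, 7],
#    [6, 0]]    -> Variable of shape (max_len=2, batch_size=2)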
def evaluate_loss(model, data, crit):
    print('[INFO] evaluating loss')
    model.eval()
    cum_loss = 0.
    cum_tgt_words = 0.
    for src_sents, tgt_sents in data_iter(data, batch_size=args.batch_size, shuffle=False):
        pred_tgt_word_num = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
        src_sents_len = [len(s) for s in src_sents]

        src_sents_var = to_input_variable(src_sents, model.vocab.src, cuda=args.cuda, is_test=True)
        tgt_sents_var = to_input_variable(tgt_sents, model.vocab.tgt, cuda=args.cuda, is_test=True)

        # (tgt_sent_len, batch_size, tgt_vocab_size)
        scores = model(src_sents_var, src_sents_len, tgt_sents_var[:-1])
        loss = crit(scores.view(-1, scores.size(2)), tgt_sents_var[1:].view(-1))

        cum_loss += loss.data[0]
        cum_tgt_words += pred_tgt_word_num

    cum_tgt_words = 1. if cum_tgt_words < 1. else cum_tgt_words
    loss = cum_loss / cum_tgt_words
    return loss


def init_training(args):
    if args.load_model:
        print('load model from [%s]' % args.load_model, file=sys.stderr)
        params = torch.load(args.load_model, map_location=lambda storage, loc: storage)
        vocab = params['vocab']
        opt = params['args']
        state_dict = params['state_dict']

        model = NMT(opt, vocab)
        model.load_state_dict(state_dict)
        model.train()
    else:
        vocab = torch.load(args.vocab)
        model = NMT(args, vocab)
        model.train()

        if args.uniform_init:
            print('uniformly initialize parameters [-%f, +%f]' % (args.uniform_init, args.uniform_init), file=sys.stderr)
            for p in model.parameters():
                p.data.uniform_(-args.uniform_init, args.uniform_init)

    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt['<pad>']] = 0
    nll_loss = nn.NLLLoss(weight=vocab_mask, size_average=False)
    cross_entropy_loss = nn.CrossEntropyLoss(weight=vocab_mask, size_average=False)

    if args.cuda:
        model = model.cuda()
        nll_loss = nll_loss.cuda()
        cross_entropy_loss = cross_entropy_loss.cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    return vocab, model, optimizer, nll_loss, cross_entropy_loss
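# Note on the loss setup above (descriptive note): `vocab_mask` zeroes the class
# weight of '<pad>' (index 0), so padded target positions contribute nothing to
# NLLLoss / CrossEntropyLoss, and size_average=False keeps the raw sum over words
# so it can be normalized explicitly by batch size (training) or word count (ppl).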
def train(args):
    train_data_src = read_corpus(args.train_src, source='src')
    train_data_tgt = read_corpus(args.train_tgt, source='tgt')

    dev_data_src = read_corpus(args.dev_src, source='src')
    dev_data_tgt = read_corpus(args.dev_tgt, source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    vocab, model, optimizer, nll_loss, cross_entropy_loss = init_training(args)

    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = cum_batches = report_examples = epoch = valid_num = best_model_iter = 0

    if args.load_model:
        import re
        train_iter = int(re.search(r'(?<=iter)\d+', args.load_model).group(0))
        print('start from train_iter = %d' % train_iter)
        valid_num = train_iter // args.valid_niter

    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    while True:
        epoch += 1
        print('start of epoch {:d}'.format(epoch))

        for src_sents, tgt_sents in data_iter(train_data, batch_size=args.batch_size):
            train_iter += 1

            src_sents_var = to_input_variable(src_sents, vocab.src, cuda=args.cuda)
            tgt_sents_var = to_input_variable(tgt_sents, vocab.tgt, cuda=args.cuda)

            batch_size = len(src_sents)
            src_sents_len = [len(s) for s in src_sents]
            pred_tgt_word_num = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`

            optimizer.zero_grad()

            # (tgt_sent_len, batch_size, tgt_vocab_size)
            scores = model(src_sents_var, src_sents_len, tgt_sents_var[:-1])

            word_loss = cross_entropy_loss(scores.view(-1, scores.size(2)), tgt_sents_var[1:].view(-1))
            loss = word_loss / batch_size
            word_loss_val = word_loss.data[0]
            loss_val = loss.data[0]

            loss.backward()
            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm(model.parameters(), args.clip_grad)
            optimizer.step()

            report_loss += word_loss_val
            cum_loss += word_loss_val
            report_tgt_words += pred_tgt_word_num
            cum_tgt_words += pred_tgt_word_num
            report_examples += batch_size
            cum_examples += batch_size
            cum_batches += batch_size

            if train_iter % args.log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f '
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' %
                      (epoch, train_iter,
                       report_loss / report_examples,
                       np.exp(report_loss / report_tgt_words),
                       cum_examples,
                       report_tgt_words / (time.time() - train_time),
                       time.time() - begin_time), file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % args.valid_niter == 0:
                print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' %
                      (epoch, train_iter,
                       cum_loss / cum_batches,
                       np.exp(cum_loss / cum_tgt_words),
                       cum_examples), file=sys.stderr)

                cum_loss = cum_batches = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)
                model.eval()

                # compute dev. ppl and bleu
                dev_loss = evaluate_loss(model, dev_data, cross_entropy_loss)
                dev_ppl = np.exp(dev_loss)

                if args.valid_metric in ['bleu', 'word_acc', 'sent_acc']:
                    dev_hyps = decode(model, dev_data)
                    dev_hyps = [hyps[0] for hyps in dev_hyps]
                    if args.valid_metric == 'bleu':
                        valid_metric = get_bleu([tgt for src, tgt in dev_data], dev_hyps)
                    else:
                        valid_metric = get_acc([tgt for src, tgt in dev_data], dev_hyps, acc_type=args.valid_metric)
                    print('validation: iter %d, dev. ppl %f, dev. %s %f' %
                          (train_iter, dev_ppl, args.valid_metric, valid_metric), file=sys.stderr)
                else:
                    valid_metric = -dev_ppl
                    print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr)

                model.train()

                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                is_better_than_last = len(hist_valid_scores) == 0 or valid_metric > hist_valid_scores[-1]
                hist_valid_scores.append(valid_metric)

                if valid_num > args.save_model_after:
                    model_file = args.save_to + '.iter%d.bin' % train_iter
                    print('save model to [%s]' % model_file, file=sys.stderr)
                    model.save(model_file)

                if (not is_better_than_last) and args.lr_decay:
                    lr = optimizer.param_groups[0]['lr'] * args.lr_decay
                    print('decay learning rate to %f' % lr, file=sys.stderr)
                    optimizer.param_groups[0]['lr'] = lr

                if is_better:
                    patience = 0
                    best_model_iter = train_iter

                    if valid_num > args.save_model_after:
                        print('save currently the best model ..', file=sys.stderr)
                        model_file_abs_path = os.path.abspath(model_file)
                        symlink_file_abs_path = os.path.abspath(args.save_to + '.bin')  # fixed typo: was `symlin_file_abs_path`
                        os.system('ln -sf %s %s' % (model_file_abs_path, symlink_file_abs_path))
                else:
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)
                    if patience == args.patience:
                        print('early stop!', file=sys.stderr)
                        print('the best model is from iteration [%d]' % best_model_iter, file=sys.stderr)
                        exit(0)


def get_bleu(references, hypotheses):
    # compute BLEU, stripping the leading <s> and trailing </s> tokens
    bleu_score = corpus_bleu([[ref[1:-1]] for ref in references],
                             [hyp[1:-1] for hyp in hypotheses])
    return bleu_score


def get_acc(references, hypotheses, acc_type='word'):
    assert acc_type == 'word_acc' or acc_type == 'sent_acc'
    cum_acc = 0.

    for ref, hyp in zip(references, hypotheses):
        ref = ref[1:-1]
        hyp = hyp[1:-1]
        if acc_type == 'word_acc':
            acc = len([1 for ref_w, hyp_w in zip(ref, hyp) if ref_w == hyp_w]) / float(len(hyp) + 1e-6)
        else:
            acc = 1. if all(ref_w == hyp_w for ref_w, hyp_w in zip(ref, hyp)) else 0.
        cum_acc += acc

    acc = cum_acc / len(hypotheses)
    return acc
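# Worked example for `get_acc` (hypothetical data): with ref = <s> a b c </s> and
# hyp = <s> a x c </s>, the trimmed sequences are [a, b, c] and [a, x, c];
# word_acc = 2 / 3 (positions 0 and 2 match) while sent_acc = 0 (not an exact match).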
def decode(model, data, verbose=True):
    """
    decode the dataset and compute sentence level acc. and BLEU.
    """
    hypotheses = []
    begin_time = time.time()
    data = list(data)
    if type(data[0]) is tuple:
        for src_sent, tgt_sent in data:
            hyps = model.translate(src_sent)
            hypotheses.append(hyps)

            if verbose:
                print('*' * 50)
                print('Source: ', ' '.join(src_sent))
                print('Target: ', ' '.join(tgt_sent))
                print('Top Hypothesis: ', ' '.join(hyps[0]))
    else:
        for src_sent in data:
            hyps = model.translate(src_sent)
            hypotheses.append(hyps)

            if verbose:
                print('*' * 50)
                print('Source: ', ' '.join(src_sent))
                print('Top Hypothesis: ', ' '.join(hyps[0]))

    elapsed = time.time() - begin_time
    print('decoded %d examples, took %d s' % (len(data), elapsed), file=sys.stderr)

    return hypotheses


def compute_lm_prob(args):
    """
    given source-target sentence pairs, compute ppl and log-likelihood
    """
    test_data_src = read_corpus(args.test_src, source='src')
    test_data_tgt = read_corpus(args.test_tgt, source='tgt')
    test_data = zip(test_data_src, test_data_tgt)

    if args.load_model:
        print('load model from [%s]' % args.load_model, file=sys.stderr)
        params = torch.load(args.load_model, map_location=lambda storage, loc: storage)
        vocab = params['vocab']
        saved_args = params['args']
        state_dict = params['state_dict']

        model = NMT(saved_args, vocab)
        model.load_state_dict(state_dict)
    else:
        vocab = torch.load(args.vocab)
        model = NMT(args, vocab)

    model.eval()

    if args.cuda:
        model = model.cuda()

    f = open(args.save_to_file, 'w')
    for src_sent, tgt_sent in test_data:
        src_sents = [src_sent]
        tgt_sents = [tgt_sent]

        batch_size = len(src_sents)
        src_sents_len = [len(s) for s in src_sents]
        pred_tgt_word_nums = [len(s[1:]) for s in tgt_sents]  # omitting leading `<s>`

        # (sent_len, batch_size)
        src_sents_var = to_input_variable(src_sents, model.vocab.src, cuda=args.cuda, is_test=True)
        tgt_sents_var = to_input_variable(tgt_sents, model.vocab.tgt, cuda=args.cuda, is_test=True)

        # (tgt_sent_len, batch_size, tgt_vocab_size)
        scores = model(src_sents_var, src_sents_len, tgt_sents_var[:-1])
        # (tgt_sent_len * batch_size, tgt_vocab_size)
        log_scores = F.log_softmax(scores.view(-1, scores.size(2)))
        # remove leading <s> in tgt sent, which is not used as the target
        # (batch_size * tgt_sent_len)
        flattened_tgt_sents = tgt_sents_var[1:].view(-1)
        # (batch_size * tgt_sent_len)
        tgt_log_scores = torch.gather(log_scores, 1, flattened_tgt_sents.unsqueeze(1)).squeeze(1)
        # 0-index is the <pad> symbol: zero out its contribution
        tgt_log_scores = tgt_log_scores * (1. - torch.eq(flattened_tgt_sents, 0).float())
        # (tgt_sent_len, batch_size)
        tgt_log_scores = tgt_log_scores.view(-1, batch_size)  # .permute(1, 0)
        # (batch_size)
        tgt_sent_scores = tgt_log_scores.sum(dim=0).squeeze()
        tgt_sent_word_scores = [tgt_sent_scores[i].data[0] / pred_tgt_word_nums[i] for i in range(batch_size)]

        for src_sent, tgt_sent, score in zip(src_sents, tgt_sents, tgt_sent_word_scores):
            f.write('%s ||| %s ||| %f\n' % (' '.join(src_sent), ' '.join(tgt_sent), score))

    f.close()
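# The score written by `compute_lm_prob` is the length-normalized conditional
# log-likelihood (descriptive note):
#   score(y | x) = (1 / |y|) * sum_t log p(y_t | y_<t, x)
# with <pad> positions zeroed out via the torch.eq(..., 0) mask above.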
def test(args):
    test_data_src = read_corpus(args.test_src, source='src')
    test_data_tgt = read_corpus(args.test_tgt, source='tgt')
    test_data = zip(test_data_src, test_data_tgt)

    if args.load_model:
        print('load model from [%s]' % args.load_model, file=sys.stderr)
        params = torch.load(args.load_model, map_location=lambda storage, loc: storage)
        vocab = params['vocab']
        saved_args = params['args']
        state_dict = params['state_dict']

        model = NMT(saved_args, vocab)
        model.load_state_dict(state_dict)
    else:
        vocab = torch.load(args.vocab)
        model = NMT(args, vocab)

    model.eval()

    if args.cuda:
        model = model.cuda()

    hypotheses = decode(model, test_data, verbose=False)
    top_hypotheses = [hyps[0] for hyps in hypotheses]

    # bleu_score = get_bleu([tgt for src, tgt in test_data], top_hypotheses)
    # word_acc = get_acc([tgt for src, tgt in test_data], top_hypotheses, 'word_acc')
    # sent_acc = get_acc([tgt for src, tgt in test_data], top_hypotheses, 'sent_acc')
    # print('Corpus Level BLEU: %f, word level acc: %f, sentence level acc: %f' % (bleu_score, word_acc, sent_acc), file=sys.stderr)

    if args.save_to_file:
        print('save decoding results to %s' % args.save_to_file, file=sys.stderr)
        with open(args.save_to_file, 'w') as f:
            for hyps in hypotheses:
                f.write(' '.join(hyps[0][1:-1]) + '\n')

        if args.save_nbest:
            nbest_file = args.save_to_file + '.nbest'
            print('save nbest decoding results to %s' % nbest_file, file=sys.stderr)
            with open(nbest_file, 'w') as f:
                for src_sent, tgt_sent, hyps in zip(test_data_src, test_data_tgt, hypotheses):
                    print('Source: %s' % ' '.join(src_sent), file=f)
                    print('Target: %s' % ' '.join(tgt_sent), file=f)
                    print('Hypotheses:', file=f)
                    for i, hyp in enumerate(hyps, 1):
                        print('[%d] %s' % (i, ' '.join(hyp)), file=f)
                    print('*' * 30, file=f)


def interactive(args):
    assert args.load_model, 'You have to specify a pre-trained model'
    print('load model from [%s]' % args.load_model, file=sys.stderr)
    params = torch.load(args.load_model, map_location=lambda storage, loc: storage)
    vocab = params['vocab']
    saved_args = params['args']
    state_dict = params['state_dict']

    model = NMT(saved_args, vocab)
    model.load_state_dict(state_dict)
    model.eval()

    if args.cuda:
        model = model.cuda()

    while True:
        src_sent = input('Source Sentence:')  # was `raw_input`, which does not exist under the Python 3 these scripts use
        src_sent = src_sent.strip().split(' ')
        hyps = model.translate(src_sent)
        for i, hyp in enumerate(hyps, 1):
            print('Hypothesis #%d: %s' % (i, ' '.join(hyp)))
def sample(args):
    train_data_src = read_corpus(args.train_src, source='src')
    train_data_tgt = read_corpus(args.train_tgt, source='tgt')
    train_data = zip(train_data_src, train_data_tgt)

    if args.load_model:
        print('load model from [%s]' % args.load_model, file=sys.stderr)
        params = torch.load(args.load_model, map_location=lambda storage, loc: storage)
        vocab = params['vocab']
        opt = params['args']
        state_dict = params['state_dict']

        model = NMT(opt, vocab)
        model.load_state_dict(state_dict)
    else:
        vocab = torch.load(args.vocab)
        model = NMT(args, vocab)

    model.eval()

    if args.cuda:
        model = model.cuda()

    print('begin sampling')

    check_every = 10
    train_iter = cum_samples = 0
    train_time = time.time()
    for src_sents, tgt_sents in data_iter(train_data, batch_size=args.batch_size):
        train_iter += 1
        samples = model.sample(src_sents, sample_size=args.sample_size, to_word=True)
        cum_samples += sum(len(sample) for sample in samples)

        if train_iter % check_every == 0:
            elapsed = time.time() - train_time
            print('sampling speed: %d/s' % (cum_samples / elapsed), file=sys.stderr)
            cum_samples = 0
            train_time = time.time()

        for i, tgt_sent in enumerate(tgt_sents):
            print('*' * 80)
            print('target:' + ' '.join(tgt_sent))
            tgt_samples = samples[i]
            print('samples:')
            for sid, sample in enumerate(tgt_samples, 1):
                print('[%d] %s' % (sid, ' '.join(sample[1:-1])))
            print('*' * 80)


if __name__ == '__main__':
    args = init_config()
    print(args, file=sys.stderr)

    if args.mode == 'train':
        train(args)
    elif args.mode == 'raml_train':
        train_raml(args)
    elif args.mode == 'sample':
        sample(args)
    elif args.mode == 'test':
        test(args)
    elif args.mode == 'prob':
        compute_lm_prob(args)
    elif args.mode == 'interactive':
        interactive(args)
    else:
        raise RuntimeError('unknown mode')
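The Moses script that follows computes corpus-level BLEU as a brevity penalty times the geometric mean of the modified 1- to 4-gram precisions. A minimal Python sketch of the final combination step (hypothetical `correct`, `total`, and length values standing in for the Perl arrays):

```
import math

def combine_bleu(correct, total, hyp_len, ref_len):
    """BLEU = BP * exp(mean of log n-gram precisions), n = 1..4."""
    precisions = [correct[n] / total[n] if total[n] else 0 for n in range(4)]
    log_sum = sum(math.log(p) if p else -9999999999 for p in precisions)
    bp = 1.0 if hyp_len >= ref_len else math.exp(1 - ref_len / hyp_len)
    return bp * math.exp(log_sum / 4)

# e.g. combine_bleu([9, 5, 3, 2], [10, 9, 8, 7], hyp_len=10, ref_len=11)
```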

================================================
FILE: nmt/scripts/multi-bleu.perl
================================================
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

# $Id$
use warnings;
use strict;

my $lowercase = 0;
if ($ARGV[0] eq "-lc") {
    $lowercase = 1;
    shift;
}

my $stem = $ARGV[0];
if (!defined $stem) {
    print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n";
    print STDERR "Reads the references from reference or reference0, reference1, ...\n";
    exit(1);
}

$stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0";

my @REF;
my $ref=0;
while(-e "$stem$ref") {
    &add_to_ref("$stem$ref",\@REF);
    $ref++;
}
&add_to_ref($stem,\@REF) if -e $stem;
die("ERROR: could not find reference file $stem") unless scalar @REF;

# add additional references explicitly specified on the command line
shift;
foreach my $stem (@ARGV) {
    &add_to_ref($stem,\@REF) if -e $stem;
}

sub add_to_ref {
    my ($file,$REF) = @_;
    my $s=0;
    if ($file =~ /.gz$/) {
        open(REF,"gzip -dc $file|") or die "Can't read $file";
    } else {
        open(REF,$file) or die "Can't read $file";
    }
    while(<REF>) {
        chop;
        push @{$$REF[$s++]}, $_;
    }
    close(REF);
}

my(@CORRECT,@TOTAL,$length_translation,$length_reference);
my $s=0;
while(<STDIN>) {
    chop;
    $_ = lc if $lowercase;
    my @WORD = split;
    my %REF_NGRAM = ();
    my $length_translation_this_sentence = scalar(@WORD);
    my ($closest_diff,$closest_length) = (9999,9999);
    foreach my $reference (@{$REF[$s]}) {
        # print "$s $_ <=> $reference\n";
        $reference = lc($reference) if $lowercase;
        my @WORD = split(' ',$reference);
        my $length = scalar(@WORD);
        my $diff = abs($length_translation_this_sentence-$length);
        if ($diff < $closest_diff) {
            $closest_diff = $diff;
            $closest_length = $length;
            # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n";
        } elsif ($diff == $closest_diff) {
            $closest_length = $length if $length < $closest_length;
            # from two references with the same closeness to me
            # take the *shorter* into account, not the "first" one.
        }
        for(my $n=1;$n<=4;$n++) {
            my %REF_NGRAM_N = ();
            for(my $start=0;$start<=$#WORD-($n-1);$start++) {
                my $ngram = "$n";
                for(my $w=0;$w<$n;$w++) {
                    $ngram .= " ".$WORD[$start+$w];
                }
                $REF_NGRAM_N{$ngram}++;
            }
            foreach my $ngram (keys %REF_NGRAM_N) {
                if (!defined($REF_NGRAM{$ngram}) || $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) {
                    $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram};
                    # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}<BR>\n";
                }
            }
        }
    }
    $length_translation += $length_translation_this_sentence;
    $length_reference += $closest_length;
    for(my $n=1;$n<=4;$n++) {
        my %T_NGRAM = ();
        for(my $start=0;$start<=$#WORD-($n-1);$start++) {
            my $ngram = "$n";
            for(my $w=0;$w<$n;$w++) {
                $ngram .= " ".$WORD[$start+$w];
            }
            $T_NGRAM{$ngram}++;
        }
        foreach my $ngram (keys %T_NGRAM) {
            $ngram =~ /^(\d+) /;
            my $n = $1;
            # my $corr = 0;
            # print "$i e $ngram $T_NGRAM{$ngram}<BR>\n";
            $TOTAL[$n] += $T_NGRAM{$ngram};
            if (defined($REF_NGRAM{$ngram})) {
                if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) {
                    $CORRECT[$n] += $T_NGRAM{$ngram};
                    # $corr = $T_NGRAM{$ngram};
                    # print "$i e correct1 $T_NGRAM{$ngram}<BR>\n";
                } else {
                    $CORRECT[$n] += $REF_NGRAM{$ngram};
                    # $corr = $REF_NGRAM{$ngram};
                    # print "$i e correct2 $REF_NGRAM{$ngram}<BR>\n";
                }
            }
            # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram};
            # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n"
        }
    }
    $s++;
}

my $brevity_penalty = 1;
my $bleu = 0;

my @bleu=();

for(my $n=1;$n<=4;$n++) {
    if (defined ($TOTAL[$n])){
        $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0;
        # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n";
    }else{
        $bleu[$n]=0;
    }
}

if ($length_reference==0){
    printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n";
    exit(1);
}

if ($length_translation<$length_reference) {
    $brevity_penalty = exp(1-$length_reference/$length_translation);
}
$bleu = $brevity_penalty * exp((my_log( $bleu[1] ) +
                                my_log( $bleu[2] ) +
                                my_log( $bleu[3] ) +
                                my_log( $bleu[4] ) ) / 4) ;
printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n",
    100*$bleu,
    100*$bleu[1],
    100*$bleu[2],
    100*$bleu[3],
    100*$bleu[4],
    $brevity_penalty,
    $length_translation / $length_reference,
    $length_translation,
    $length_reference;

sub my_log {
    return -9999999999 unless $_[0];
    return log($_[0]);
}

================================================
FILE: nmt/scripts/test.sh
================================================
#!/bin/bash

src=$1
tgt=$2
mdl=$3
txt=$4

python3 nmt.py --mode test --test_src $src --test_tgt $tgt --load_model $mdl --save_to_file $txt --cuda

================================================
FILE: nmt/scripts/train-small.sh
================================================
#!/bin/sh

L1=$1
L2=$2
JOB=$3

data_dir="./wmt16-small-data"
vocab_bin="$data_dir/vocab.$L1$L2.bin"
train_src="$data_dir/train.$L1"
train_tgt="$data_dir/train.$L2"
dev_src="$data_dir/valid.$L1"
dev_tgt="$data_dir/valid.$L2"
test_src="$data_dir/test.$L1"
test_tgt="$data_dir/test.$L2"

job_name="$JOB"
model_name="model.${job_name}"

python3 nmt.py \
    --cuda \
    --mode train \
    --vocab ${vocab_bin} \
    --save_to ${model_name} \
    --log_every 50 \
    --valid_niter 2500 \
    --valid_metric ppl \
    --save_model_after 2 \
    --beam_size 5 \
    --batch_size 64 \
    --hidden_size 256 \
    --embed_size 256 \
    --uniform_init 0.1 \
    --dropout 0.2 \
    --clip_grad 5.0 \
    --lr_decay 0.5 \
    --train_src ${train_src} \
    --train_tgt ${train_tgt} \
    --dev_src ${dev_src} \
    --dev_tgt ${dev_tgt}

================================================
FILE: nmt/scripts/train.sh
================================================
#!/bin/sh

data_dir="/data/groups/chatbot/dl_data/wmt16"
vocab_bin="$data_dir/vocab.deen.bin"
train_src="$data_dir/train.de"
train_tgt="$data_dir/train.en"
dev_src="$data_dir/valid.de"
dev_tgt="$data_dir/valid.en"
test_src="$data_dir/test.de"
test_tgt="$data_dir/test.en"

job_name="wmt16-deen"
model_name="model.${job_name}"

python3 nmt.py \
    --cuda \
    --mode train \
    --vocab ${vocab_bin} \
    --save_to ${model_name} \
    --log_every 100 \
    --valid_niter 5000 \
    --valid_metric ppl \
    --save_model_after 1 \
    --beam_size 5 \
    --batch_size 64 \
    --hidden_size 256 \
    --embed_size 256 \
    --uniform_init 0.1 \
    --dropout 0.2 \
    --clip_grad 5.0 \
    --lr_decay 0.5 \
    --train_src ${train_src} \
    --train_tgt ${train_tgt} \
    --dev_src ${dev_src} \
    --dev_tgt ${dev_tgt} \
    --load_model "$1"
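The `data_iter` helper in the utility files below batches sentence pairs by source-length buckets so that each batch can be packed without excessive padding. A small standalone illustration of the bucketing idea (toy data; a sketch, not the repo's exact helper):

```
from collections import defaultdict
import random

pairs = [(['a'], ['x']), (['b', 'c'], ['y']), (['d'], ['z'])]

# group sentence pairs by source length so each batch is length-homogeneous
buckets = defaultdict(list)
for pair in pairs:
    buckets[len(pair[0])].append(pair)

batches = [bucket[i:i + 2] for bucket in buckets.values()
           for i in range(0, len(bucket), 2)]
random.shuffle(batches)
print(batches)
```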

================================================
FILE: nmt/util.py
================================================
from collections import defaultdict

import numpy as np


def read_corpus(file_path, source):
    data = []
    for line in open(file_path):
        sent = line.strip().split(' ')
        # only append <s> and </s> to the target sentence
        if source == 'tgt':
            sent = ['<s>'] + sent + ['</s>']
        data.append(sent)
    return data


def batch_slice(data, batch_size, sort=True):
    batched_data = []
    batch_num = int(np.ceil(len(data) / float(batch_size)))
    for i in range(batch_num):
        cur_batch_size = batch_size if i < batch_num - 1 else len(data) - batch_size * i
        src_sents = [data[i * batch_size + b][0] for b in range(cur_batch_size)]
        tgt_sents = [data[i * batch_size + b][1] for b in range(cur_batch_size)]

        if sort:
            src_ids = sorted(range(cur_batch_size), key=lambda src_id: len(src_sents[src_id]), reverse=True)
            src_sents = [src_sents[src_id] for src_id in src_ids]
            tgt_sents = [tgt_sents[src_id] for src_id in src_ids]

        batched_data.append((src_sents, tgt_sents))

    return batched_data


def data_iter(data, batch_size, shuffle=True):
    """
    randomly permute data, then sort by source length, and partition into batches
    ensure that the length of source sentences in each batch is decreasing
    """
    buckets = defaultdict(list)
    for pair in data:
        src_sent = pair[0]
        buckets[len(src_sent)].append(pair)

    batched_data = []
    for src_len in buckets:
        tuples = buckets[src_len]
        if shuffle:
            np.random.shuffle(tuples)
        batched_data.extend(batch_slice(tuples, batch_size))

    if shuffle:
        np.random.shuffle(batched_data)
    for batch in batched_data:
        yield batch
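The vocabulary class defined next maps any unseen word to `<unk>` via `__getitem__`. A usage sketch with a hypothetical corpus (assuming `vocab.py` is importable from the working directory):

```
from vocab import VocabEntry

corpus = [['the', 'cat'], ['the', 'dog'], ['the', 'cat']]
entry = VocabEntry.from_corpus(corpus, size=100, remove_singleton=True)

print(entry['the'])      # a regular word id
print(entry['unicorn'])  # unseen word falls back to entry.unk_id == 3
print(len(entry))        # the 4 special tokens plus the kept words
```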

================================================
FILE: nmt/vocab.py
================================================
from __future__ import print_function

import argparse
from collections import Counter
from itertools import chain

import torch

from util import read_corpus


class VocabEntry(object):
    def __init__(self):
        self.word2id = dict()
        self.unk_id = 3
        self.word2id['<pad>'] = 0
        self.word2id['<s>'] = 1
        self.word2id['</s>'] = 2
        self.word2id['<unk>'] = 3

        # NOTE: this dict shadows the `id2word` method below on instances
        self.id2word = {v: k for k, v in self.word2id.items()}

    def __getitem__(self, word):
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        return word in self.word2id

    def __setitem__(self, key, value):
        raise ValueError('vocabulary is readonly')

    def __len__(self):
        return len(self.word2id)

    def __repr__(self):
        return 'Vocabulary[size=%d]' % len(self)

    def id2word(self, wid):
        # effectively dead code: the instance attribute `self.id2word` (a dict)
        # set in __init__ shadows this method, so callers index the dict directly
        return self.id2word[wid]

    def add(self, word):
        if word not in self:
            wid = self.word2id[word] = len(self)
            self.id2word[wid] = word
            return wid
        else:
            return self[word]

    @staticmethod
    def from_corpus(corpus, size, remove_singleton=True):
        vocab_entry = VocabEntry()

        word_freq = Counter(chain(*corpus))
        non_singletons = [w for w in word_freq if word_freq[w] > 1]
        print('number of word types: %d, number of word types w/ frequency > 1: %d' % (len(word_freq), len(non_singletons)))

        top_k_words = sorted(word_freq.keys(), reverse=True, key=word_freq.get)[:size]

        for word in top_k_words:
            if len(vocab_entry) < size:
                if not (word_freq[word] == 1 and remove_singleton):
                    vocab_entry.add(word)

        return vocab_entry


class Vocab(object):
    def __init__(self, src_sents, tgt_sents, src_vocab_size, tgt_vocab_size, remove_singleton=True):
        assert len(src_sents) == len(tgt_sents)

        print('initialize source vocabulary ..')
        self.src = VocabEntry.from_corpus(src_sents, src_vocab_size, remove_singleton=remove_singleton)

        print('initialize target vocabulary ..')
        self.tgt = VocabEntry.from_corpus(tgt_sents, tgt_vocab_size, remove_singleton=remove_singleton)

    def __repr__(self):
        return 'Vocab(source %d words, target %d words)' % (len(self.src), len(self.tgt))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--src_vocab_size', default=50000, type=int, help='source vocabulary size')
    parser.add_argument('--tgt_vocab_size', default=50000, type=int, help='target vocabulary size')
    parser.add_argument('--include_singleton', action='store_true', default=False,
                        help='whether to include singletons in the vocabulary (default=False)')
    parser.add_argument('--train_src', type=str, required=True, help='file of source sentences')
    parser.add_argument('--train_tgt', type=str, required=True, help='file of target sentences')
    parser.add_argument('--output', default='vocab.bin', type=str, help='output vocabulary file')

    args = parser.parse_args()

    print('read in source sentences: %s' % args.train_src)
    print('read in target sentences: %s' % args.train_tgt)

    src_sents = read_corpus(args.train_src, source='src')
    tgt_sents = read_corpus(args.train_tgt, source='tgt')

    vocab = Vocab(src_sents, tgt_sents, args.src_vocab_size, args.tgt_vocab_size, remove_singleton=not args.include_singleton)
    print('generated vocabulary, source %d words, target %d words' % (len(vocab.src), len(vocab.tgt)))

    torch.save(vocab, args.output)
    print('vocabulary saved to %s' % args.output)

================================================
FILE: train-dual.sh
================================================
#!/bin/bash

nmtdir=/data/groups/chatbot/dl_data/wmt16-small
lmdir=/data/groups/chatbot/dl_data/lm
srcdir=/data/groups/chatbot/dl_data/wmt16-dual

nmtA=$nmtdir/model.wmt16-ende-small.best.bin
nmtB=$nmtdir/model.wmt16-deen-small.best.bin

lmA=$lmdir/wmt16-en.pt
lmB=$lmdir/wmt16-de.pt

lmA_dict=$lmdir/dict.en.pkl
lmB_dict=$lmdir/dict.de.pkl

srcA=$srcdir/train-small.en
srcB=$srcdir/train-small.de

saveA="modelA"
saveB="modelB"

python3 dual.py \
    --nmt $nmtA $nmtB \
    --lm $lmA $lmB \
    --dict $lmA_dict $lmB_dict \
    --src $srcA $srcB \
    --log_every 5 \
    --save_n_iter 400 \
    --alpha 0.01 \
    --model $saveA $saveB
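The `--alpha 0.01` flag above corresponds to the reward interpolation from the README, `rk = 0.01 x r1 + 0.99 x r2`. A minimal sketch of that combination (hypothetical reward values; in the actual game `r1` comes from the target-side language model and `r2` from the reverse translation model):

```
def combine_reward(r1, r2, alpha=0.01):
    """Interpolate the LM reward r1 and the reconstruction reward r2."""
    return alpha * r1 + (1 - alpha) * r2

# e.g. combine_reward(r1=-4.2, r2=-1.3) == -1.329
```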

================================================
FILE: util.py
================================================
from collections import defaultdict

import numpy as np


def read_corpus(file_path, source):
    data = []
    for line in open(file_path):
        sent = line.strip().split(' ')
        # only append <s> and </s> to the target sentence
        if source == 'tgt':
            sent = ['<s>'] + sent + ['</s>']
        data.append(sent)
    return data


def batch_slice(data, batch_size, sort=True):
    batched_data = []
    batch_num = int(np.ceil(len(data) / float(batch_size)))
    for i in range(batch_num):
        cur_batch_size = batch_size if i < batch_num - 1 else len(data) - batch_size * i
        src_sents = [data[i * batch_size + b][0] for b in range(cur_batch_size)]
        tgt_sents = [data[i * batch_size + b][1] for b in range(cur_batch_size)]

        if sort:
            src_ids = sorted(range(cur_batch_size), key=lambda src_id: len(src_sents[src_id]), reverse=True)
            src_sents = [src_sents[src_id] for src_id in src_ids]
            tgt_sents = [tgt_sents[src_id] for src_id in src_ids]

        batched_data.append((src_sents, tgt_sents))

    return batched_data


def data_iter(data, batch_size, shuffle=True):
    """
    randomly permute data, then sort by source length, and partition into batches
    ensure that the length of source sentences in each batch is decreasing
    """
    buckets = defaultdict(list)
    for pair in data:
        src_sent = pair[0]
        buckets[len(src_sent)].append(pair)

    batched_data = []
    for src_len in buckets:
        tuples = buckets[src_len]
        if shuffle:
            np.random.shuffle(tuples)
        batched_data.extend(batch_slice(tuples, batch_size))

    if shuffle:
        np.random.shuffle(batched_data)
    for batch in batched_data:
        yield batch

================================================
FILE: vocab.py
================================================
from __future__ import print_function

import argparse
from collections import Counter
from itertools import chain

import torch

from util import read_corpus


class VocabEntry(object):
    def __init__(self):
        self.word2id = dict()
        self.unk_id = 3
        self.word2id['<pad>'] = 0
        self.word2id['<s>'] = 1
        self.word2id['</s>'] = 2
        self.word2id['<unk>'] = 3

        # NOTE: this dict shadows the `id2word` method below on instances
        self.id2word = {v: k for k, v in self.word2id.items()}

    def __getitem__(self, word):
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        return word in self.word2id

    def __setitem__(self, key, value):
        raise ValueError('vocabulary is readonly')

    def __len__(self):
        return len(self.word2id)

    def __repr__(self):
        return 'Vocabulary[size=%d]' % len(self)

    def id2word(self, wid):
        # effectively dead code: the instance attribute `self.id2word` (a dict)
        # set in __init__ shadows this method, so callers index the dict directly
        return self.id2word[wid]

    def add(self, word):
        if word not in self:
            wid = self.word2id[word] = len(self)
            self.id2word[wid] = word
            return wid
        else:
            return self[word]

    @staticmethod
    def from_corpus(corpus, size, remove_singleton=True):
        vocab_entry = VocabEntry()

        word_freq = Counter(chain(*corpus))
        non_singletons = [w for w in word_freq if word_freq[w] > 1]
        print('number of word types: %d, number of word types w/ frequency > 1: %d' % (len(word_freq), len(non_singletons)))

        top_k_words = sorted(word_freq.keys(), reverse=True, key=word_freq.get)[:size]

        for word in top_k_words:
            if len(vocab_entry) < size:
                if not (word_freq[word] == 1 and remove_singleton):
                    vocab_entry.add(word)

        return vocab_entry


class Vocab(object):
    def __init__(self, src_sents, tgt_sents, src_vocab_size, tgt_vocab_size, remove_singleton=True):
        assert len(src_sents) == len(tgt_sents)

        print('initialize source vocabulary ..')
        self.src = VocabEntry.from_corpus(src_sents, src_vocab_size, remove_singleton=remove_singleton)

        print('initialize target vocabulary ..')
        self.tgt = VocabEntry.from_corpus(tgt_sents, tgt_vocab_size, remove_singleton=remove_singleton)

    def __repr__(self):
        return 'Vocab(source %d words, target %d words)' % (len(self.src), len(self.tgt))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--src_vocab_size', default=50000, type=int, help='source vocabulary size')
    parser.add_argument('--tgt_vocab_size', default=50000, type=int, help='target vocabulary size')
    parser.add_argument('--include_singleton', action='store_true', default=False,
                        help='whether to include singletons in the vocabulary (default=False)')
    parser.add_argument('--train_src', type=str, required=True, help='file of source sentences')
    parser.add_argument('--train_tgt', type=str, required=True, help='file of target sentences')
    parser.add_argument('--output', default='vocab.bin', type=str, help='output vocabulary file')

    args = parser.parse_args()

    print('read in source sentences: %s' % args.train_src)
    print('read in target sentences: %s' % args.train_tgt)

    src_sents = read_corpus(args.train_src, source='src')
    tgt_sents = read_corpus(args.train_tgt, source='tgt')

    vocab = Vocab(src_sents, tgt_sents, args.src_vocab_size, args.tgt_vocab_size, remove_singleton=not args.include_singleton)
    print('generated vocabulary, source %d words, target %d words' % (len(vocab.src), len(vocab.tgt)))

    torch.save(vocab, args.output)
    print('vocabulary saved to %s' % args.output)
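Both `nmt.py` and `dual.py` persist models as a dict holding `args`, `vocab`, and `state_dict`, so a trained checkpoint can be restored roughly like this (a sketch; `model.bin` is a placeholder path):

```
import torch
from nmt import NMT  # assuming the package layout above

# checkpoints store everything needed to rebuild the network
params = torch.load('model.bin', map_location=lambda storage, loc: storage)  # load onto CPU
model = NMT(params['args'], params['vocab'])
model.load_state_dict(params['state_dict'])
model.eval()
print(model)  # decoding itself goes through nmt.py's CLI (--mode test)
```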