Repository: Deeperjia/tensorflow-wavenet Branch: master Commit: 45898bb263c6 Files: 8 Total size: 15.1 KB Directory structure: gitextract_r_ayre2m/ ├── README.md ├── cache/ │ └── readme.md ├── data/ │ └── readme.md ├── model/ │ └── readme.me ├── model.py ├── test.py ├── train.py └── utils.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: README.md ================================================ Speech-to-Text-WaveNet : End-to-end sentence level Chinese speech recognition using DeepMind's WaveNet = A tensorflow implementation for Chinese speech recognition based on DeepMind's WaveNet: A Generative Model for Raw Audio. ([Hereafter the Paper](https://arxiv.org/abs/1609.03499)) Version --- Current Version : 0.0.1 Dependencies --- 1. python == 3.5 2. tensorflow == 1.0.0 3. librosa == 0.5.0 Dataset --- [清华30小时中文数据集](http://data.cslt.org/thchs30/standalone.html) Directories --- 1. cache: saved data features and word dictionary 2. data: wav files and related labels 3. model: saved models Network model --- 1. Data random shuffle per epoch 2. Xavier initialization 3. Adam optimization algorithm 4. Batch Normalization Train the network --- python3 train.py Test the network --- python3 test.py Other resources --- 1. [TensorFlow练习15: 中文语音识别](http://blog.topspeedsnail.com/archives/10696#more-10696) 2. [ibab's WaveNet(speech synthesis) tensorflow implementation](https://github.com/ibab/tensorflow-wavenet) 3. 
[buriburisuri's WaveNet(English speech recognition) tensorflow and sugartensor implementationt](https://github.com/buriburisuri/speech-to-text-wavenet#version) ================================================ FILE: cache/readme.md ================================================ ================================================ FILE: data/readme.md ================================================ ================================================ FILE: model/readme.me ================================================ ================================================ FILE: model.py ================================================ #-*- coding:utf-8 -*- __author__ = 'Deeper' import tensorflow as tf # 1.0.0 import numpy as np class Model(): def __init__(self, n_out, batch_size=1, n_mfcc=20, is_training=True): n_dim = 128 self.is_training = is_training self.input_data = tf.placeholder(dtype=tf.float32, shape=[batch_size, None, n_mfcc]) self.seq_len = tf.reduce_sum(tf.cast(tf.not_equal(tf.reduce_sum(self.input_data, reduction_indices=2), 0.), tf.int32), reduction_indices=1) self.targets = tf.placeholder(dtype=tf.int32, shape=[batch_size, None]) # 1D convolution self.conv1d_index = 0 out = self.conv1d_layer(self.input_data, dim=n_dim) # stack hole CNN n_blocks = 3 skip = 0 self.aconv1d_index = 0 for _ in range(n_blocks): for r in [1, 2, 4, 8, 16]: out, s = self.residual_block(out, size=7, rate=r, dim=n_dim) skip += s logit = self.conv1d_layer(skip, dim=skip.get_shape().as_list()[-1]) self.logit = self.conv1d_layer(logit, dim=n_out, bias=True, activation=None) # CTC loss indices = tf.where(tf.not_equal(tf.cast(self.targets, tf.float32), 0.)) target = tf.SparseTensor(indices=indices, values=tf.gather_nd(self.targets, indices)-1, dense_shape=tf.cast(tf.shape(self.targets), tf.int64)) loss = tf.nn.ctc_loss(target, self.logit, self.seq_len, time_major=False) self.cost = tf.reduce_mean(loss) # optimizer optimizer = tf.train.AdamOptimizer() var_list = [var for var in 
tf.trainable_variables()] gradient = optimizer.compute_gradients(self.cost, var_list=var_list) self.optimizer_op = optimizer.apply_gradients(gradient) def residual_block(self, input_tensor, size, rate, dim): conv_filter = self.aconv1d_layer(input_tensor, size=size, rate=rate, activation='tanh') conv_gate = self.aconv1d_layer(input_tensor, size=size, rate=rate, activation='sigmoid') out = conv_filter * conv_gate out = self.conv1d_layer(out, size=1, dim=dim) return out + input_tensor, out def conv1d_layer(self, input_tensor, size=1, dim=128, bias=False, activation='tanh'): with tf.variable_scope('conv1d'+str(self.conv1d_index)): shape = input_tensor.get_shape().as_list() kernel = tf.get_variable('kernel', (size, shape[-1], dim), dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) if bias: b = tf.get_variable('b', [dim], dtype=tf.float32, initializer=tf.constant_initializer(0)) out = tf.nn.conv1d(input_tensor, kernel, stride=1, padding='SAME') + (b if bias else 0) if not bias: out = self.batch_norm_wrapper(out) out = self.activation_wrapper(out, activation) self.conv1d_index += 1 return out def aconv1d_layer(self, input_tensor, size=7, rate=2, bias=False, activation='tanh'): with tf.variable_scope('aconv1d_'+str(self.aconv1d_index)): shape = input_tensor.get_shape().as_list() kernel = tf.get_variable('kernel',(1, size, shape[-1], shape[-1]), dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) if bias: b = tf.get_variable('b', [shape[-1]], dtype=tf.float32, initializer=tf.constant_initializer(0)) out = tf.nn.atrous_conv2d(tf.expand_dims(input_tensor, dim=1), kernel, rate=rate, padding='SAME') out = tf.squeeze(out, [1]) if not bias: out = self.batch_norm_wrapper(out) out = self.activation_wrapper(out, activation) self.aconv1d_index += 1 return out def batch_norm_wrapper(self, inputs, decay=0.999): epsilon = 1e-3 shape = inputs.get_shape().as_list() beta = tf.get_variable('beta', shape[-1], dtype=tf.float32, 
initializer=tf.constant_initializer(0)) gamma = tf.get_variable('gamma', shape[-1], dtype=tf.float32, initializer=tf.constant_initializer(1)) pop_mean = tf.get_variable('mean', shape[-1], dtype=tf.float32, initializer=tf.constant_initializer(0)) pop_var = tf.get_variable('variance', shape[-1], dtype=tf.float32, initializer=tf.constant_initializer(1)) if self.is_training: batch_mean, batch_var = tf.nn.moments(inputs, axes=list(range(len(shape)-1))) train_mean = tf.assign(pop_mean, pop_mean*decay+batch_mean*(1-decay)) train_var =tf.assign(pop_var, pop_var*decay+batch_var*(1-decay)) with tf.control_dependencies([train_mean, train_var]): return tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta, gamma, epsilon) else: return tf.nn.batch_normalization(inputs, pop_mean, pop_var, beta, gamma, epsilon) def activation_wrapper(self, inputs, activation): out = inputs if activation == 'sigmoid': out = tf.nn.sigmoid(out) elif activation == 'tanh': out = tf.nn.tanh(out) elif activation == 'relu': out = tf.nn.relu(out) return out ================================================ FILE: test.py ================================================ #-*- coding:utf-8 -*- from __future__ import print_function from model import Model from utils import SpeechLoader import tensorflow as tf # 1.0.0 import numpy as np import librosa import os # 语音识别 # 把batch_size改为1 def speech_to_text(): n_mfcc = 60 # load data speech_loader = SpeechLoader() # load model model = Model(speech_loader.vocab_size, n_mfcc=n_mfcc, is_training=False) saver = tf.train.Saver() with tf.Session() as sess: for j in range(750,755): # extract feature wav_file = os.path.join(os.getcwd(),'data','wav','test','D4','D4_'+str(j)+'.wav') wav, sr = librosa.load(wav_file, mono=True) mfcc = np.transpose(np.expand_dims(librosa.feature.mfcc(wav, sr, n_mfcc=n_mfcc), axis=0), [0,2,1]) mfcc = mfcc.tolist() # fill 0 while len(mfcc[0]) < speech_loader.wav_max_len: mfcc[0].append([0] * n_mfcc) # word dict wmap = {value:key for key, 
value in speech_loader.wordmap.items()} # recognition saver.restore(sess, tf.train.latest_checkpoint('model')) decoded = tf.transpose(model.logit, perm=[1, 0, 2]) decoded, probs = tf.nn.ctc_beam_search_decoder(decoded, model.seq_len, top_paths=1, merge_repeated=True) predict = tf.sparse_to_dense(decoded[0].indices, decoded[0].dense_shape, decoded[0].values) + 1 output, probs = sess.run([predict, probs], feed_dict={model.input_data: mfcc}) # print result words = '' for i in range(len(output[0])): words += wmap.get(output[0][i], -1) print("---------------------------") print("Input: " + wav_file) print("Output: " + words) if __name__ == '__main__': speech_to_text() ================================================ FILE: train.py ================================================ #-*- coding:utf-8 -*- from __future__ import print_function from utils import SpeechLoader from model import Model import tensorflow as tf #1.0.0 import time import os def train(): # setting parameters batch_size = 32 n_epoch = 100 n_mfcc = 60 # load speech data wav_path = os.path.join(os.getcwd(),'data','wav','train') label_file = os.path.join(os.getcwd(),'data','doc','trans','train.word.txt') speech_loader = SpeechLoader(wav_path, label_file, batch_size, n_mfcc) n_out = speech_loader.vocab_size # load model model = Model(n_out, batch_size=batch_size, n_mfcc=n_mfcc) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) saver = tf.train.Saver(tf.global_variables()) for epoch in range(n_epoch): speech_loader.create_batches() # random shuffle data speech_loader.reset_batch_pointer() for batch in range(speech_loader.n_batches): start = time.time() batches_wav, batches_label = speech_loader.next_batch() feed = {model.input_data: batches_wav, model.targets: batches_label} train_loss, _ = sess.run([model.cost, model.optimizer_op], feed_dict=feed) end = time.time() print("epoch: %d/%d, batch: %d/%d, loss: %s, time: %.3f."%(epoch, n_epoch, batch, speech_loader.n_batches, train_loss, 
end-start)) # save models if epoch % 5 ==0: saver.save(sess, os.path.join(os.getcwd(), 'model','speech.module'), global_step=epoch) if __name__ == '__main__': train() ================================================ FILE: utils.py ================================================ #-*- coding:utf-8 -*- __author__ = 'Deeper' import tensorflow as tf import numpy as np import os import codecs import librosa from six.moves import cPickle, reduce, map from collections import Counter import random class SpeechLoader(): def __init__(self, wav_path=None, label_file=None, batch_size=1, n_mfcc=20, encoding='utf-8'): self.batch_size = batch_size self.encoding = encoding self.n_mfcc = n_mfcc # path setting data_dir = os.path.join(os.getcwd(), 'cache', 'mfcc'+str(n_mfcc)) # data cache wavs_file = os.path.join(data_dir, "wavs.file") vocab_file = os.path.join(data_dir,"vocab.file") mfcc_tensor = os.path.join(data_dir, "mfcc.tensor") label_tensor = os.path.join(data_dir, "label.tensor") # data process if not (os.path.exists(vocab_file) and os.path.exists(mfcc_tensor) and os.path.exists(label_tensor)): print("reading wav files") self.preprocess(wav_path, label_file, wavs_file, vocab_file, mfcc_tensor, label_tensor) else: print("loading preprocessed files") self.load_preprocessed(vocab_file, mfcc_tensor, label_tensor) # minibatch self.create_batches() # pointer reset self.reset_batch_pointer() def preprocess(self, wav_path, label_file, wavs_file, vocab_file, mfcc_tensor, label_tensor): def handle_file(dirpath, filename): if filename.endswith('.wav') or filename.endswith('.WAV'): filename_path = os.path.join(dirpath, filename) if os.stat(filename_path).st_size < 240000: return return filename_path # read label file labels_dict = {} with codecs.open(label_file,"r", encoding=self.encoding) as f: for label in f: label = label.strip('\n') labels_id = label.split(' ',1)[0] labels_text = label.split(' ',1)[1] labels_dict[labels_id] = labels_text # print("",len(labels_dict)) # 10000 # wav 
files wav_files = [] if wav_path: for (dirpath, dirnames, filenames) in os.walk(wav_path): for filename in filenames: if handle_file(dirpath,filename): wav_files.append(handle_file(dirpath,filename)) print("初始样本数:", len(wav_files)) #样本数 # data filter and feature extraction wav_files_filter = [] labels_filter = [] self.mfcc_tensor = [] self.wav_max_len = 0 cnt = 0 for wav_file in wav_files: wav_id = os.path.basename(wav_file).split('.')[0] if wav_id in labels_dict: print('样本'+str(cnt), wav_file) labels_filter.append(labels_dict[wav_id]) wav_files_filter.append(wav_file) # mfcc feature wav_file, sr = librosa.load(wav_file, mono=True) mfcc = np.transpose(librosa.feature.mfcc(wav_file, sr, n_mfcc=self.n_mfcc),[1,0]) self.mfcc_tensor.append(mfcc.tolist()) cnt += 1 self.wav_max_len = max(len(mfcc) for mfcc in self.mfcc_tensor) print("样本总数:", cnt) print("最长的语音:", self.wav_max_len) # print(len(wav_files_filter), len(labels_filter),len(wav2mfcc)) # assert check dimensions with open(wavs_file, 'wb') as f: cPickle.dump(wav_files_filter, f) with open(mfcc_tensor, 'wb') as f: cPickle.dump(self.mfcc_tensor, f) # vocab file vocabs = [] for label in labels_filter: vocabs += [word for word in label] count = Counter(vocabs) count_pairs = sorted(count.items(), key=lambda x:-x[1]) words, _ = zip(*count_pairs) self.wordmap = dict(zip(words, range(len(words)))) self.vocab_size = len(words) print("词汇表大小:",len(words)) with open(vocab_file,'wb') as f: cPickle.dump(self.wordmap, f) # label vector label_encoder = lambda word: self.wordmap.get(word, len(words)) self.label_tensor = [list(map(label_encoder, label)) for label in labels_filter] self.label_max_len = max(len(label) for label in self.label_tensor) print("最长的句子:", self.label_max_len) with open(label_tensor,'wb') as f: cPickle.dump(self.label_tensor, f) def load_preprocessed(self, vocab_file, mfcc_tensor, label_tensor): with open(vocab_file, 'rb') as f: self.wordmap = cPickle.load(f) self.vocab_size = len(self.wordmap) 
print("词汇表大小:",self.vocab_size) with open(mfcc_tensor, 'rb') as f: self.mfcc_tensor = cPickle.load(f) self.wav_max_len = max(len(mfcc) for mfcc in self.mfcc_tensor) print("最长的语音:", self.wav_max_len) with open(label_tensor, 'rb') as f: self.label_tensor = cPickle.load(f) self.label_max_len = max(len(label) for label in self.label_tensor) print("最长的句子:", self.label_max_len) def create_batches(self): self.n_batches = len(self.mfcc_tensor) // self.batch_size if self.n_batches==0: assert False, "Not enough data. Make seq_length and batch_size small." self.mfcc_tensor = self.mfcc_tensor[:self.n_batches*self.batch_size] self.label_tensor = self.label_tensor[:self.n_batches*self.batch_size] # random shuffle the data if len(self.mfcc_tensor) != len(self.label_tensor): assert False, "Data length does not match the label length!" data_tensor = [] for i in range(len(self.mfcc_tensor)): data_tensor.append([self.mfcc_tensor[i], self.label_tensor[i]]) random.shuffle(data_tensor) self.mfcc_tensor = [] self.label_tensor = [] for i in range(len(data_tensor)): self.mfcc_tensor.append(data_tensor[i][0]) self.label_tensor.append(data_tensor[i][1]) # create batches self.x_batches = [] self.y_batches = [] for i in range(self.n_batches): from_index = i*self.batch_size to_index = from_index + self.batch_size mfcc_batches = self.mfcc_tensor[from_index:to_index] label_batches = self.label_tensor[from_index:to_index] # 补零对齐 for mfcc in mfcc_batches: while len(mfcc) < self.wav_max_len: mfcc.append([0]*self.n_mfcc) for label in label_batches: while len(label) < self.label_max_len: label.append(0) self.x_batches.append(mfcc_batches) self.y_batches.append(label_batches) def next_batch(self): x, y = self.x_batches[self.pointer], self.y_batches[self.pointer] self.pointer += 1 return x, y def reset_batch_pointer(self): self.pointer = 0