Repository: Deeperjia/tensorflow-wavenet
Branch: master
Commit: 45898bb263c6
Files: 8
Total size: 15.1 KB
Directory structure:
gitextract_r_ayre2m/
├── README.md
├── cache/
│ └── readme.md
├── data/
│ └── readme.md
├── model/
│ └── readme.me
├── model.py
├── test.py
├── train.py
└── utils.py
================================================
FILE CONTENTS
================================================
================================================
FILE: README.md
================================================
Speech-to-Text-WaveNet : End-to-end sentence level Chinese speech recognition using DeepMind's WaveNet
=
A TensorFlow implementation of Chinese speech recognition based on DeepMind's WaveNet: A Generative Model for Raw Audio ([hereafter "the Paper"](https://arxiv.org/abs/1609.03499)).
Version
---
Current Version : 0.0.1
Dependencies
---
1. python == 3.5
2. tensorflow == 1.0.0
3. librosa == 0.5.0
Dataset
---
[清华30小时中文数据集](http://data.cslt.org/thchs30/standalone.html)
Directories
---
1. cache: saved data features and the word dictionary
2. data: wav files and related labels
3. model: save the models
Network model
---
1. Random shuffling of the data every epoch
2. Xavier initialization
3. Adam optimization algorithm
4. Batch Normalization
Train the network
---
python3 train.py
Test the network
---
python3 test.py
Other resources
---
1. [TensorFlow练习15: 中文语音识别](http://blog.topspeedsnail.com/archives/10696#more-10696)
2. [ibab's WaveNet (speech synthesis) TensorFlow implementation](https://github.com/ibab/tensorflow-wavenet)
3. [buriburisuri's WaveNet (English speech recognition) TensorFlow and sugartensor implementation](https://github.com/buriburisuri/speech-to-text-wavenet#version)
================================================
FILE: cache/readme.md
================================================
================================================
FILE: data/readme.md
================================================
================================================
FILE: model/readme.me
================================================
================================================
FILE: model.py
================================================
#-*- coding:utf-8 -*-
__author__ = 'Deeper'
import tensorflow as tf # 1.0.0
import numpy as np
class Model():
    """WaveNet-style speech recognition network trained with CTC loss.

    The graph is built entirely in the constructor (TensorFlow 1.x graph
    mode). After construction the following attributes are available:

    * ``input_data``   - float32 placeholder ``[batch_size, None, n_mfcc]``
    * ``targets``      - int32 placeholder ``[batch_size, None]``
    * ``seq_len``      - per-example number of non-padding frames
    * ``logit``        - network output ``[batch_size, time, n_out]``
    * ``cost``         - mean CTC loss over the batch
    * ``optimizer_op`` - Adam update op
    """

    def __init__(self, n_out, batch_size=1, n_mfcc=20, is_training=True):
        """Build the network, the CTC loss and the Adam training op.

        Args:
            n_out: number of channels of the output layer (passed to
                ``tf.nn.ctc_loss`` as the class dimension of the logits).
            batch_size: static batch dimension of both placeholders.
            n_mfcc: number of MFCC coefficients per frame.
            is_training: when True, batch normalization uses batch statistics
                and updates the moving averages; when False it uses the
                stored moving averages.
        """
        n_dim = 128
        self.is_training = is_training

        self.input_data = tf.placeholder(dtype=tf.float32, shape=[batch_size, None, n_mfcc])
        # A frame counts towards the sequence length only when its coefficients
        # do not sum to exactly zero, i.e. all-zero frames are treated as padding.
        frame_sums = tf.reduce_sum(self.input_data, reduction_indices=2)
        self.seq_len = tf.reduce_sum(tf.cast(tf.not_equal(frame_sums, 0.), tf.int32), reduction_indices=1)
        self.targets = tf.placeholder(dtype=tf.int32, shape=[batch_size, None])

        # Input 1D convolution (kernel width 1) projecting n_mfcc -> n_dim.
        self.conv1d_index = 0
        out = self.conv1d_layer(self.input_data, dim=n_dim)

        # Stacked dilated ("atrous") convolutions: n_blocks repetitions of the
        # dilation rates 1, 2, 4, 8, 16. The skip outputs of all residual
        # blocks are summed.
        n_blocks = 3
        skip = 0
        self.aconv1d_index = 0
        for _ in range(n_blocks):
            for r in [1, 2, 4, 8, 16]:
                out, s = self.residual_block(out, size=7, rate=r, dim=n_dim)
                skip += s

        # Two width-1 convolutions on the summed skip connections; the last one
        # has a bias, no batch normalization and no activation.
        logit = self.conv1d_layer(skip, dim=skip.get_shape().as_list()[-1])
        self.logit = self.conv1d_layer(logit, dim=n_out, bias=True, activation=None)

        # CTC loss. Target entries equal to 0 are excluded from the sparse
        # labels (treated as padding) and the remaining ids are shifted down
        # by one.
        # NOTE(review): a real label with id 0 is therefore dropped as well -
        # confirm the vocabulary ids produced by the data loader.
        indices = tf.where(tf.not_equal(tf.cast(self.targets, tf.float32), 0.))
        target = tf.SparseTensor(
            indices=indices,
            values=tf.gather_nd(self.targets, indices) - 1,
            dense_shape=tf.cast(tf.shape(self.targets), tf.int64))
        loss = tf.nn.ctc_loss(target, self.logit, self.seq_len, time_major=False)
        self.cost = tf.reduce_mean(loss)

        # Optimizer: Adam with default hyper-parameters over all trainable
        # variables.
        optimizer = tf.train.AdamOptimizer()
        gradient = optimizer.compute_gradients(self.cost, var_list=tf.trainable_variables())
        self.optimizer_op = optimizer.apply_gradients(gradient)

    def residual_block(self, input_tensor, size, rate, dim):
        """Gated dilated-convolution block with a residual connection.

        Args:
            input_tensor: block input.
            size: kernel width of the two dilated convolutions.
            rate: dilation rate of the two dilated convolutions.
            dim: output channels of the width-1 convolution; it must match the
                channel count of ``input_tensor`` for the residual sum.

        Returns:
            A tuple ``(residual_output, skip_output)`` where
            ``residual_output = skip_output + input_tensor``.
        """
        conv_filter = self.aconv1d_layer(input_tensor, size=size, rate=rate, activation='tanh')
        conv_gate = self.aconv1d_layer(input_tensor, size=size, rate=rate, activation='sigmoid')
        gated = conv_filter * conv_gate
        skip_out = self.conv1d_layer(gated, size=1, dim=dim)
        return skip_out + input_tensor, skip_out

    def conv1d_layer(self, input_tensor, size=1, dim=128, bias=False, activation='tanh'):
        """1D convolution (stride 1, SAME padding) followed by BN/bias and activation.

        Each call creates its variables in a fresh scope ``conv1d<index>`` and
        then increments ``self.conv1d_index``.

        Args:
            input_tensor: input whose last dimension is the channel count.
            size: kernel width.
            dim: number of output channels.
            bias: when True a zero-initialised bias is added and batch
                normalization is skipped; when False batch normalization is
                applied instead.
            activation: ``'tanh'``, ``'sigmoid'``, ``'relu'`` or anything else
                (e.g. ``None``) for no activation.

        Returns:
            The layer output tensor.
        """
        with tf.variable_scope('conv1d' + str(self.conv1d_index)):
            in_channels = input_tensor.get_shape().as_list()[-1]
            kernel = tf.get_variable(
                'kernel', (size, in_channels, dim), dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            out = tf.nn.conv1d(input_tensor, kernel, stride=1, padding='SAME')
            if bias:
                b = tf.get_variable('b', [dim], dtype=tf.float32, initializer=tf.constant_initializer(0))
                out = out + b
            else:
                out = self.batch_norm_wrapper(out)
            out = self.activation_wrapper(out, activation)
            self.conv1d_index += 1
            return out

    def aconv1d_layer(self, input_tensor, size=7, rate=2, bias=False, activation='tanh'):
        """Dilated 1D convolution implemented with ``tf.nn.atrous_conv2d``.

        The input gets a dummy axis inserted at position 1, is convolved with
        a ``(1, size)`` kernel at the given dilation rate and is squeezed back.
        The channel count is preserved. Each call creates its variables in a
        fresh scope ``aconv1d_<index>`` and then increments
        ``self.aconv1d_index``.

        Args:
            input_tensor: input whose last dimension is the channel count.
            size: kernel width.
            rate: dilation rate.
            bias: when True a zero-initialised bias is added and batch
                normalization is skipped; when False batch normalization is
                applied instead.
            activation: see ``conv1d_layer``.

        Returns:
            The layer output tensor.
        """
        with tf.variable_scope('aconv1d_' + str(self.aconv1d_index)):
            channels = input_tensor.get_shape().as_list()[-1]
            kernel = tf.get_variable(
                'kernel', (1, size, channels, channels), dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            out = tf.nn.atrous_conv2d(tf.expand_dims(input_tensor, dim=1), kernel, rate=rate, padding='SAME')
            out = tf.squeeze(out, [1])
            if bias:
                # The bias variable used to be created but never applied.
                b = tf.get_variable('b', [channels], dtype=tf.float32, initializer=tf.constant_initializer(0))
                out = out + b
            else:
                out = self.batch_norm_wrapper(out)
            out = self.activation_wrapper(out, activation)
            self.aconv1d_index += 1
            return out

    def batch_norm_wrapper(self, inputs, decay=0.999):
        """Batch normalization over every axis except the last one.

        Variables ``beta``, ``gamma``, ``mean`` and ``variance`` are created in
        the current variable scope. In training mode the batch moments are
        used and the moving averages are updated as a side effect; otherwise
        the stored moving averages are used.

        Args:
            inputs: tensor to normalize; statistics are per last-axis channel.
            decay: decay factor of the moving averages.

        Returns:
            The normalized tensor.
        """
        epsilon = 1e-3
        shape = inputs.get_shape().as_list()
        beta = tf.get_variable('beta', shape[-1], dtype=tf.float32, initializer=tf.constant_initializer(0))
        gamma = tf.get_variable('gamma', shape[-1], dtype=tf.float32, initializer=tf.constant_initializer(1))
        # The moving statistics are updated by explicit assign ops, not by the
        # optimizer, so they must not be part of the trainable variables.
        pop_mean = tf.get_variable('mean', shape[-1], dtype=tf.float32,
                                   initializer=tf.constant_initializer(0), trainable=False)
        pop_var = tf.get_variable('variance', shape[-1], dtype=tf.float32,
                                  initializer=tf.constant_initializer(1), trainable=False)

        if not self.is_training:
            return tf.nn.batch_normalization(inputs, pop_mean, pop_var, beta, gamma, epsilon)

        batch_mean, batch_var = tf.nn.moments(inputs, axes=list(range(len(shape) - 1)))
        train_mean = tf.assign(pop_mean, pop_mean * decay + batch_mean * (1 - decay))
        train_var = tf.assign(pop_var, pop_var * decay + batch_var * (1 - decay))
        # Tie the moving-average updates to the normalized output so they run
        # whenever the output is evaluated.
        with tf.control_dependencies([train_mean, train_var]):
            return tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta, gamma, epsilon)

    def activation_wrapper(self, inputs, activation):
        """Apply the named activation; unknown names (or ``None``) are a no-op.

        Args:
            inputs: tensor to activate.
            activation: ``'sigmoid'``, ``'tanh'`` or ``'relu'``.

        Returns:
            The activated tensor, or ``inputs`` unchanged for any other name.
        """
        if activation == 'sigmoid':
            return tf.nn.sigmoid(inputs)
        if activation == 'tanh':
            return tf.nn.tanh(inputs)
        if activation == 'relu':
            return tf.nn.relu(inputs)
        return inputs
================================================
FILE: test.py
================================================
#-*- coding:utf-8 -*-
from __future__ import print_function
from model import Model
from utils import SpeechLoader
import tensorflow as tf # 1.0.0
import numpy as np
import librosa
import os
# Speech recognition.
# Note: inference runs with batch_size set to 1.
def speech_to_text():
    """Transcribe a fixed set of test recordings with the latest checkpoint.

    Loads the cached vocabulary through ``SpeechLoader``, rebuilds the model in
    inference mode, restores the newest checkpoint found in ``model`` and
    prints the decoded text for ``data/wav/test/D4/D4_750.wav`` ..
    ``D4_754.wav`` (paths relative to the current working directory).
    """
    n_mfcc = 60

    # Load data. The loader derives its cache directory from n_mfcc, so it has
    # to be given the same value that is used for the features below; with the
    # default (20) it would look in a different cache directory.
    speech_loader = SpeechLoader(n_mfcc=n_mfcc)

    # Load model.
    model = Model(speech_loader.vocab_size, n_mfcc=n_mfcc, is_training=False)
    saver = tf.train.Saver()

    # Build the decoding ops once; creating them per file would add new nodes
    # to the graph on every iteration.
    time_major_logit = tf.transpose(model.logit, perm=[1, 0, 2])
    decoded, _ = tf.nn.ctc_beam_search_decoder(
        time_major_logit, model.seq_len, top_paths=1, merge_repeated=True)
    # +1 maps the decoded class ids back onto the vocabulary ids.
    predict = tf.sparse_to_dense(decoded[0].indices, decoded[0].dense_shape, decoded[0].values) + 1

    # Inverse word dictionary: id -> word.
    id_to_word = {idx: word for word, idx in speech_loader.wordmap.items()}

    with tf.Session() as sess:
        # Restore the weights once for all files.
        saver.restore(sess, tf.train.latest_checkpoint('model'))

        for j in range(750, 755):
            # Extract features as a [1, time, n_mfcc] nested list.
            wav_file = os.path.join(os.getcwd(), 'data', 'wav', 'test', 'D4', 'D4_' + str(j) + '.wav')
            wav, sr = librosa.load(wav_file, mono=True)
            mfcc = np.transpose(
                np.expand_dims(librosa.feature.mfcc(y=wav, sr=sr, n_mfcc=n_mfcc), axis=0), [0, 2, 1])
            mfcc = mfcc.tolist()

            # Zero-pad up to the longest utterance known to the loader.
            missing = speech_loader.wav_max_len - len(mfcc[0])
            mfcc[0].extend([0] * n_mfcc for _ in range(missing))

            # Recognition.
            output = sess.run(predict, feed_dict={model.input_data: mfcc})

            # Ids without a dictionary entry are skipped instead of raising a
            # TypeError from concatenating a str with an int default.
            words = ''.join(id_to_word.get(idx, '') for idx in output[0])

            print("---------------------------")
            print("Input: " + wav_file)
            print("Output: " + words)


if __name__ == '__main__':
    speech_to_text()
================================================
FILE: train.py
================================================
#-*- coding:utf-8 -*-
from __future__ import print_function
from utils import SpeechLoader
from model import Model
import tensorflow as tf #1.0.0
import time
import os
def train(batch_size=32, n_epoch=100, n_mfcc=60, save_every=5):
    """Train the model on ``data/wav/train`` and checkpoint it periodically.

    Args:
        batch_size: number of utterances per training step.
        n_epoch: number of passes over the training data.
        n_mfcc: number of MFCC coefficients per frame.
        save_every: a checkpoint is written every ``save_every`` epochs; the
            final epoch is always saved as well.

    Paths are resolved against the current working directory. Checkpoints go
    to ``model/speech.module-<epoch>``.
    """
    # Load speech data.
    wav_path = os.path.join(os.getcwd(), 'data', 'wav', 'train')
    label_file = os.path.join(os.getcwd(), 'data', 'doc', 'trans', 'train.word.txt')
    speech_loader = SpeechLoader(wav_path, label_file, batch_size, n_mfcc)
    n_out = speech_loader.vocab_size

    # Load model.
    model = Model(n_out, batch_size=batch_size, n_mfcc=n_mfcc)

    # Make sure the checkpoint directory exists before the first save.
    model_dir = os.path.join(os.getcwd(), 'model')
    if not os.path.isdir(model_dir):
        os.makedirs(model_dir)
    checkpoint_prefix = os.path.join(model_dir, 'speech.module')

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())

        for epoch in range(n_epoch):
            speech_loader.create_batches()  # random shuffle data
            speech_loader.reset_batch_pointer()

            for batch in range(speech_loader.n_batches):
                start = time.time()
                batches_wav, batches_label = speech_loader.next_batch()
                feed = {model.input_data: batches_wav, model.targets: batches_label}
                train_loss, _ = sess.run([model.cost, model.optimizer_op], feed_dict=feed)
                end = time.time()
                print("epoch: %d/%d, batch: %d/%d, loss: %s, time: %.3f."%(epoch, n_epoch, batch, speech_loader.n_batches, train_loss, end-start))

            # Save models. The last epoch is always saved; otherwise the
            # training done after the last periodic checkpoint would be lost.
            if epoch % save_every == 0 or epoch == n_epoch - 1:
                saver.save(sess, checkpoint_prefix, global_step=epoch)


if __name__ == '__main__':
    train()
================================================
FILE: utils.py
================================================
#-*- coding:utf-8 -*-
__author__ = 'Deeper'
import tensorflow as tf
import numpy as np
import os
import codecs
import librosa
from six.moves import cPickle, reduce, map
from collections import Counter
import random
class SpeechLoader():
    """Load wav files with transcripts, cache MFCC features and serve batches.

    On first use the wav files are converted to MFCC features and the
    transcripts to integer id sequences; both are pickled, together with the
    vocabulary, under ``cache/mfcc<n_mfcc>`` in the current working directory.
    Later instances with the same ``n_mfcc`` load that cache instead.

    Attributes set by the constructor: ``wordmap`` (character -> id),
    ``vocab_size``, ``mfcc_tensor``, ``label_tensor``, ``wav_max_len``,
    ``label_max_len``, ``n_batches``, ``x_batches``, ``y_batches`` and
    ``pointer``.
    """

    def __init__(self, wav_path=None, label_file=None, batch_size=1, n_mfcc=20, encoding='utf-8'):
        """Load (or build) the feature cache and create the first batches.

        Args:
            wav_path: root directory searched recursively for wav files; only
                needed when the cache does not exist yet.
            label_file: transcript file with one ``<id> <text>`` entry per
                line; only needed when the cache does not exist yet.
            batch_size: number of samples per batch.
            n_mfcc: number of MFCC coefficients per frame; also selects the
                cache directory.
            encoding: text encoding of ``label_file``.

        Raises:
            ValueError: if the cache is missing and ``wav_path`` or
                ``label_file`` was not given, or if there is not enough data
                for a single batch.
        """
        self.batch_size = batch_size
        self.encoding = encoding
        self.n_mfcc = n_mfcc

        # Path setting.
        data_dir = os.path.join(os.getcwd(), 'cache', 'mfcc' + str(n_mfcc))

        # Data cache files.
        wavs_file = os.path.join(data_dir, "wavs.file")
        vocab_file = os.path.join(data_dir, "vocab.file")
        mfcc_tensor = os.path.join(data_dir, "mfcc.tensor")
        label_tensor = os.path.join(data_dir, "label.tensor")

        # Data process.
        cache_ready = all(os.path.exists(path) for path in (vocab_file, mfcc_tensor, label_tensor))
        if cache_ready:
            print("loading preprocessed files")
            self.load_preprocessed(vocab_file, mfcc_tensor, label_tensor)
        else:
            if wav_path is None or label_file is None:
                raise ValueError(
                    "No preprocessed cache in %s; wav_path and label_file are required to build it." % data_dir)
            # The cache sub-directory must exist before anything is dumped
            # into it, otherwise the (slow) feature extraction is wasted.
            if not os.path.isdir(data_dir):
                os.makedirs(data_dir)
            print("reading wav files")
            self.preprocess(wav_path, label_file, wavs_file, vocab_file, mfcc_tensor, label_tensor)

        # Minibatch.
        self.create_batches()
        # Pointer reset.
        self.reset_batch_pointer()

    def preprocess(self, wav_path, label_file, wavs_file, vocab_file, mfcc_tensor, label_tensor):
        """Extract MFCC features and label ids and write them to the cache.

        Args:
            wav_path: root directory searched recursively for wav files.
            label_file: transcript file, one ``<id> <text>`` entry per line.
            wavs_file: output path for the pickled list of used wav paths.
            vocab_file: output path for the pickled character -> id map.
            mfcc_tensor: output path for the pickled MFCC features.
            label_tensor: output path for the pickled label id sequences.

        Raises:
            ValueError: if no wav file has a matching transcript.
        """
        min_wav_bytes = 240000  # smaller wav files are ignored

        def handle_file(dirpath, filename):
            """Return the full path of a usable wav file, else None."""
            if not filename.endswith(('.wav', '.WAV')):
                return None
            filename_path = os.path.join(dirpath, filename)
            if os.stat(filename_path).st_size < min_wav_bytes:
                return None
            return filename_path

        # Read label file: the first space separates the id from the text.
        labels_dict = {}
        with codecs.open(label_file, "r", encoding=self.encoding) as f:
            for line in f:
                parts = line.rstrip('\r\n').split(' ', 1)
                if len(parts) < 2:
                    # Blank or malformed line: nothing to map, skip it.
                    continue
                labels_dict[parts[0]] = parts[1]

        # Wav files.
        wav_files = []
        if wav_path:
            for dirpath, _, filenames in os.walk(wav_path):
                for filename in filenames:
                    filename_path = handle_file(dirpath, filename)
                    if filename_path:
                        wav_files.append(filename_path)
        print("初始样本数:", len(wav_files))  # number of candidate samples

        # Data filter and feature extraction: keep only wav files whose base
        # name (up to the first '.') has a transcript.
        wav_files_filter = []
        labels_filter = []
        self.mfcc_tensor = []
        cnt = 0
        for wav_file in wav_files:
            wav_id = os.path.basename(wav_file).split('.')[0]
            if wav_id not in labels_dict:
                continue
            print('样本'+str(cnt), wav_file)
            labels_filter.append(labels_dict[wav_id])
            wav_files_filter.append(wav_file)
            # MFCC feature, transposed to [time, n_mfcc].
            signal, sr = librosa.load(wav_file, mono=True)
            mfcc = np.transpose(librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=self.n_mfcc), [1, 0])
            self.mfcc_tensor.append(mfcc.tolist())
            cnt += 1

        if not self.mfcc_tensor:
            raise ValueError("No wav file under %s has a transcript in %s." % (wav_path, label_file))

        # Longest utterance, computed once after all features are known.
        self.wav_max_len = max(len(mfcc) for mfcc in self.mfcc_tensor)
        print("样本总数:", cnt)
        print("最长的语音:", self.wav_max_len)

        with open(wavs_file, 'wb') as f:
            cPickle.dump(wav_files_filter, f)
        with open(mfcc_tensor, 'wb') as f:
            cPickle.dump(self.mfcc_tensor, f)

        # Vocab file: every character of every transcript, most frequent first.
        count = Counter(ch for label in labels_filter for ch in label)
        count_pairs = sorted(count.items(), key=lambda x: -x[1])
        words = [word for word, _ in count_pairs]
        self.wordmap = dict(zip(words, range(len(words))))
        self.vocab_size = len(words)
        print("词汇表大小:",len(words))
        with open(vocab_file, 'wb') as f:
            cPickle.dump(self.wordmap, f)

        # Label vector: characters missing from the map get id len(words).
        unknown_id = len(words)
        self.label_tensor = [
            [self.wordmap.get(ch, unknown_id) for ch in label] for label in labels_filter]
        self.label_max_len = max(len(label) for label in self.label_tensor)
        print("最长的句子:", self.label_max_len)
        with open(label_tensor, 'wb') as f:
            cPickle.dump(self.label_tensor, f)

    def load_preprocessed(self, vocab_file, mfcc_tensor, label_tensor):
        """Load vocabulary, features and labels from the pickle cache.

        Args:
            vocab_file: path of the pickled character -> id map.
            mfcc_tensor: path of the pickled MFCC features.
            label_tensor: path of the pickled label id sequences.
        """
        # SECURITY NOTE: unpickling executes arbitrary code from the file, so
        # the cache directory must only contain files written by preprocess().
        with open(vocab_file, 'rb') as f:
            self.wordmap = cPickle.load(f)
        self.vocab_size = len(self.wordmap)
        print("词汇表大小:",self.vocab_size)

        with open(mfcc_tensor, 'rb') as f:
            self.mfcc_tensor = cPickle.load(f)
        self.wav_max_len = max(len(mfcc) for mfcc in self.mfcc_tensor)
        print("最长的语音:", self.wav_max_len)

        with open(label_tensor, 'rb') as f:
            self.label_tensor = cPickle.load(f)
        self.label_max_len = max(len(label) for label in self.label_tensor)
        print("最长的句子:", self.label_max_len)

    def create_batches(self):
        """Shuffle the samples and split them into zero-padded batches.

        Sets ``n_batches``, ``x_batches`` and ``y_batches``. Every feature
        sequence placed in a batch is padded in place with all-zero frames up
        to ``wav_max_len`` and every label sequence with 0 up to
        ``label_max_len``. Samples that do not fill a whole batch are left out
        of this round but stay in the data set, so a later call can pick them.

        Raises:
            ValueError: if there are fewer samples than ``batch_size`` or the
                numbers of feature and label sequences differ.
        """
        n_samples = len(self.mfcc_tensor)
        self.n_batches = n_samples // self.batch_size
        if self.n_batches == 0:
            raise ValueError("Not enough data. Make seq_length and batch_size small.")
        if n_samples != len(self.label_tensor):
            raise ValueError("Data length does not match the label length!")

        # Random shuffle of (features, labels) pairs; nothing is discarded.
        paired = list(zip(self.mfcc_tensor, self.label_tensor))
        random.shuffle(paired)
        self.mfcc_tensor = [mfcc for mfcc, _ in paired]
        self.label_tensor = [label for _, label in paired]

        # Create batches.
        self.x_batches = []
        self.y_batches = []
        for i in range(self.n_batches):
            from_index = i * self.batch_size
            to_index = from_index + self.batch_size
            mfcc_batch = self.mfcc_tensor[from_index:to_index]
            label_batch = self.label_tensor[from_index:to_index]

            # Zero-pad to a common length. The slices share their inner lists
            # with the stored tensors, so the padding persists across calls.
            for mfcc in mfcc_batch:
                mfcc.extend([0] * self.n_mfcc for _ in range(self.wav_max_len - len(mfcc)))
            for label in label_batch:
                label.extend([0] * (self.label_max_len - len(label)))

            self.x_batches.append(mfcc_batch)
            self.y_batches.append(label_batch)

    def next_batch(self):
        """Return the batch at the current pointer and advance the pointer.

        Returns:
            A tuple ``(x, y)`` of padded feature and label batches.
        """
        x, y = self.x_batches[self.pointer], self.y_batches[self.pointer]
        self.pointer += 1
        return x, y

    def reset_batch_pointer(self):
        """Rewind the batch pointer to the first batch."""
        self.pointer = 0
gitextract_r_ayre2m/ ├── README.md ├── cache/ │ └── readme.md ├── data/ │ └── readme.md ├── model/ │ └── readme.me ├── model.py ├── test.py ├── train.py └── utils.py
SYMBOL INDEX (16 symbols across 4 files)
FILE: model.py
class Model (line 6) | class Model():
method __init__ (line 7) | def __init__(self, n_out, batch_size=1, n_mfcc=20, is_training=True):
method residual_block (line 43) | def residual_block(self, input_tensor, size, rate, dim):
method conv1d_layer (line 50) | def conv1d_layer(self, input_tensor, size=1, dim=128, bias=False, acti...
method aconv1d_layer (line 65) | def aconv1d_layer(self, input_tensor, size=7, rate=2, bias=False, acti...
method batch_norm_wrapper (line 81) | def batch_norm_wrapper(self, inputs, decay=0.999):
method activation_wrapper (line 98) | def activation_wrapper(self, inputs, activation):
FILE: test.py
function speech_to_text (line 14) | def speech_to_text():
FILE: train.py
function train (line 10) | def train():
FILE: utils.py
class SpeechLoader (line 12) | class SpeechLoader():
method __init__ (line 14) | def __init__(self, wav_path=None, label_file=None, batch_size=1, n_mfc...
method preprocess (line 40) | def preprocess(self, wav_path, label_file, wavs_file, vocab_file, mfcc...
method load_preprocessed (line 119) | def load_preprocessed(self, vocab_file, mfcc_tensor, label_tensor):
method create_batches (line 135) | def create_batches(self):
method next_batch (line 178) | def next_batch(self):
method reset_batch_pointer (line 183) | def reset_batch_pointer(self):
Condensed preview — 8 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (17K chars).
[
{
"path": "README.md",
"chars": 1191,
"preview": "Speech-to-Text-WaveNet : End-to-end sentence level Chinese speech recognition using DeepMind's WaveNet\n=\nA tensorflow im"
},
{
"path": "cache/readme.md",
"chars": 1,
"preview": "\n"
},
{
"path": "data/readme.md",
"chars": 1,
"preview": "\n"
},
{
"path": "model/readme.me",
"chars": 1,
"preview": "\n"
},
{
"path": "model.py",
"chars": 4700,
"preview": "#-*- coding:utf-8 -*-\r\n__author__ = 'Deeper'\r\nimport tensorflow as tf # 1.0.0\r\nimport numpy as np\r\n\r\nclass Model():\r\n\td"
},
{
"path": "test.py",
"chars": 1945,
"preview": "#-*- coding:utf-8 -*-\r\n\r\nfrom __future__ import print_function\r\nfrom model import Model\r\nfrom utils import SpeechLoader\r"
},
{
"path": "train.py",
"chars": 1497,
"preview": "#-*- coding:utf-8 -*-\r\n\r\nfrom __future__ import print_function\r\nfrom utils import SpeechLoader\r\nfrom model import Model\r"
},
{
"path": "utils.py",
"chars": 6116,
"preview": "#-*- coding:utf-8 -*-\r\n__author__ = 'Deeper'\r\nimport tensorflow as tf \r\nimport numpy as np \r\nimport os\r\nimport codecs\r\ni"
}
]
About this extraction
This page contains the full source code of the Deeperjia/tensorflow-wavenet GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 8 files (15.1 KB), approximately 4.4k tokens, and a symbol index with 16 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.