Full Code of Deeperjia/tensorflow-wavenet for AI

master 45898bb263c6 cached
8 files
15.1 KB
4.4k tokens
16 symbols
1 requests
Download .txt
Repository: Deeperjia/tensorflow-wavenet
Branch: master
Commit: 45898bb263c6
Files: 8
Total size: 15.1 KB

Directory structure:
gitextract_r_ayre2m/

├── README.md
├── cache/
│   └── readme.md
├── data/
│   └── readme.md
├── model/
│   └── readme.me
├── model.py
├── test.py
├── train.py
└── utils.py

================================================
FILE CONTENTS
================================================

================================================
FILE: README.md
================================================
Speech-to-Text-WaveNet : End-to-end sentence level Chinese speech recognition using DeepMind's WaveNet
=
A TensorFlow implementation of Chinese speech recognition based on DeepMind's WaveNet: A Generative Model for Raw Audio ([hereafter the Paper](https://arxiv.org/abs/1609.03499)).

Version
---
Current Version : 0.0.1

Dependencies
---
1. python == 3.5
2. tensorflow == 1.0.0
3. librosa == 0.5.0

Dataset
---
[清华30小时中文数据集](http://data.cslt.org/thchs30/standalone.html)

Directories
---
1. cache: saves the extracted features and the word dictionary
2. data: wav files and related labels
3. model: save the models

Network model
---
1. Data random shuffle per epoch
2. Xavier initialization
3. Adam optimization algorithms
4. Batch Normalization

Train the network
---
python3 train.py

Test the network
---
python3 test.py

Other resources
---
1. [TensorFlow练习15: 中文语音识别](http://blog.topspeedsnail.com/archives/10696#more-10696)
2. [ibab's WaveNet (speech synthesis) TensorFlow implementation](https://github.com/ibab/tensorflow-wavenet)
3. [buriburisuri's WaveNet (English speech recognition) TensorFlow and sugartensor implementation](https://github.com/buriburisuri/speech-to-text-wavenet#version)


================================================
FILE: cache/readme.md
================================================



================================================
FILE: data/readme.md
================================================



================================================
FILE: model/readme.me
================================================



================================================
FILE: model.py
================================================
#-*- coding:utf-8 -*-
__author__ = 'Deeper'
import tensorflow as tf  # 1.0.0
import numpy as np

class Model():
	"""WaveNet-style speech recognition network trained with CTC loss.

	Creating an instance adds the complete graph (placeholders, convolution
	stack, loss and optimizer op) to the current default TensorFlow graph.

	Attributes set by the constructor:
		is_training: flag consulted by ``batch_norm_wrapper``.
		input_data: float32 placeholder of shape [batch_size, None, n_mfcc].
		seq_len: per-example number of frames whose features do not sum to zero.
		targets: int32 placeholder of shape [batch_size, None]; 0 marks padding.
		logit: output of the last 1x1 convolution (last dimension is n_out).
		cost: mean CTC loss over the batch.
		optimizer_op: Adam update op.
	"""

	def __init__(self, n_out, batch_size=1, n_mfcc=20, is_training=True):
		"""Build the network graph.

		Args:
			n_out: size of the last dimension of ``self.logit`` (number of CTC classes).
			batch_size: fixed first dimension of both placeholders.
			n_mfcc: number of features per input frame.
			is_training: when True batch normalisation uses batch statistics and
				updates the running averages; otherwise it uses the running averages.
		"""
		n_dim = 128
		self.is_training = is_training

		self.input_data = tf.placeholder(dtype=tf.float32, shape=[batch_size, None, n_mfcc])
		# Sequence length = number of frames whose feature sum is non-zero, so
		# all-zero padding frames are not counted (a real frame summing to
		# exactly zero would not be counted either).
		frame_sum = tf.reduce_sum(self.input_data, reduction_indices=2)
		self.seq_len = tf.reduce_sum(tf.cast(tf.not_equal(frame_sum, 0.), tf.int32), reduction_indices=1)
		self.targets = tf.placeholder(dtype=tf.int32, shape=[batch_size, None])

		# 1D convolution
		self.conv1d_index = 0
		out = self.conv1d_layer(self.input_data, dim=n_dim)

		# stack of dilated ("hole") convolutions; skip outputs are summed
		n_blocks = 3
		skip = 0
		self.aconv1d_index = 0
		for _ in range(n_blocks):
			for r in [1, 2, 4, 8, 16]:
				out, s = self.residual_block(out, size=7, rate=r, dim=n_dim)
				skip += s

		logit = self.conv1d_layer(skip, dim=skip.get_shape().as_list()[-1])
		self.logit = self.conv1d_layer(logit, dim=n_out, bias=True, activation=None)

		# CTC loss. Target id 0 is treated as padding and dropped; the
		# remaining ids are shifted down by one to form the sparse labels.
		indices = tf.where(tf.not_equal(tf.cast(self.targets, tf.float32), 0.))
		target = tf.SparseTensor(indices=indices, values=tf.gather_nd(self.targets, indices)-1, dense_shape=tf.cast(tf.shape(self.targets), tf.int64))
		loss = tf.nn.ctc_loss(target, self.logit, self.seq_len, time_major=False)
		self.cost = tf.reduce_mean(loss)

		# optimizer
		optimizer = tf.train.AdamOptimizer()
		var_list = tf.trainable_variables()
		gradient = optimizer.compute_gradients(self.cost, var_list=var_list)
		self.optimizer_op = optimizer.apply_gradients(gradient)

	def residual_block(self, input_tensor, size, rate, dim):
		"""Gated residual block: tanh(filter) * sigmoid(gate) followed by a 1x1 conv.

		Args:
			input_tensor: block input.
			size: kernel width of the two dilated convolutions.
			rate: dilation rate of the two dilated convolutions.
			dim: number of output channels of the 1x1 convolution.

		Returns:
			A pair ``(residual_output, skip_output)`` where ``residual_output``
			is ``skip_output + input_tensor``.
		"""
		conv_filter = self.aconv1d_layer(input_tensor, size=size, rate=rate, activation='tanh')
		conv_gate = self.aconv1d_layer(input_tensor, size=size, rate=rate, activation='sigmoid')
		out = conv_filter * conv_gate
		out = self.conv1d_layer(out, size=1, dim=dim)
		return out + input_tensor, out

	def conv1d_layer(self, input_tensor, size=1, dim=128, bias=False, activation='tanh'):
		"""Plain 1D convolution with 'SAME' padding and stride 1.

		Variables live in the scope ``'conv1d' + str(self.conv1d_index)``; the
		index is incremented after every call so each layer gets its own scope.

		Args:
			input_tensor: layer input; its last dimension is the input channel count.
			size: kernel width.
			dim: number of output channels.
			bias: when True a bias is added; when False batch normalisation is
				applied instead.
			activation: name understood by ``activation_wrapper`` (or None).

		Returns:
			The activated layer output.
		"""
		with tf.variable_scope('conv1d'+str(self.conv1d_index)):
			shape = input_tensor.get_shape().as_list()
			kernel = tf.get_variable('kernel', (size, shape[-1], dim), dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
			out = tf.nn.conv1d(input_tensor, kernel, stride=1, padding='SAME')
			if bias:
				b = tf.get_variable('b', [dim], dtype=tf.float32, initializer=tf.constant_initializer(0))
				out = out + b
			else:
				out = self.batch_norm_wrapper(out)

			out = self.activation_wrapper(out, activation)

			self.conv1d_index += 1
			return out

	def aconv1d_layer(self, input_tensor, size=7, rate=2, bias=False, activation='tanh'):
		"""Dilated 1D convolution that keeps the channel count unchanged.

		Implemented with ``tf.nn.atrous_conv2d`` on the input expanded by a
		dummy dimension at axis 1, which is squeezed away again afterwards.
		Variables live in the scope ``'aconv1d_' + str(self.aconv1d_index)``.

		Args:
			input_tensor: layer input; its last dimension is the channel count.
			size: kernel width.
			rate: dilation rate.
			bias: when True a bias is added; when False batch normalisation is
				applied instead.
			activation: name understood by ``activation_wrapper`` (or None).

		Returns:
			The activated layer output.
		"""
		with tf.variable_scope('aconv1d_'+str(self.aconv1d_index)):
			shape = input_tensor.get_shape().as_list()
			kernel = tf.get_variable('kernel',(1, size, shape[-1], shape[-1]), dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
			out = tf.nn.atrous_conv2d(tf.expand_dims(input_tensor, dim=1), kernel, rate=rate, padding='SAME')
			out = tf.squeeze(out, [1])
			if bias:
				# The bias used to be created but never applied to the output.
				b = tf.get_variable('b', [shape[-1]], dtype=tf.float32, initializer=tf.constant_initializer(0))
				out = out + b
			else:
				out = self.batch_norm_wrapper(out)

			out = self.activation_wrapper(out, activation)

			self.aconv1d_index += 1
			return out

	def batch_norm_wrapper(self, inputs, decay=0.999):
		"""Batch-normalise ``inputs`` over every axis except the last.

		Must be called inside a variable scope that is unique per layer, because
		it creates variables named 'beta', 'gamma', 'mean' and 'variance'.

		Args:
			inputs: tensor to normalise.
			decay: weight given to the previous running statistic when the
				running mean/variance are updated.

		Returns:
			The normalised tensor. In training mode batch statistics are used
			and evaluating the result also updates the running averages; in
			inference mode the running averages are used.
		"""
		epsilon = 1e-3
		shape = inputs.get_shape().as_list()

		beta = tf.get_variable('beta', shape[-1], dtype=tf.float32, initializer=tf.constant_initializer(0))
		gamma = tf.get_variable('gamma', shape[-1], dtype=tf.float32, initializer=tf.constant_initializer(1))
		# Running statistics are maintained by tf.assign below, not by the
		# optimizer, so they must not be part of the trainable variables.
		pop_mean = tf.get_variable('mean', shape[-1], dtype=tf.float32, initializer=tf.constant_initializer(0), trainable=False)
		pop_var = tf.get_variable('variance', shape[-1], dtype=tf.float32, initializer=tf.constant_initializer(1), trainable=False)
		if self.is_training:
			batch_mean, batch_var = tf.nn.moments(inputs, axes=list(range(len(shape)-1)))
			train_mean = tf.assign(pop_mean, pop_mean*decay+batch_mean*(1-decay))
			train_var = tf.assign(pop_var, pop_var*decay+batch_var*(1-decay))
			# Force the running-average updates to run whenever the output is evaluated.
			with tf.control_dependencies([train_mean, train_var]):
				return tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta, gamma, epsilon)
		else:
			return tf.nn.batch_normalization(inputs, pop_mean, pop_var, beta, gamma, epsilon)

	def activation_wrapper(self, inputs, activation):
		"""Apply the named activation to ``inputs``.

		Args:
			inputs: tensor to activate.
			activation: 'sigmoid', 'tanh' or 'relu'; any other value (including
				None) returns ``inputs`` unchanged.

		Returns:
			The activated tensor, or ``inputs`` itself for an unrecognised name.
		"""
		out = inputs

		if activation == 'sigmoid':
			out = tf.nn.sigmoid(out)
		elif activation == 'tanh':
			out = tf.nn.tanh(out)
		elif activation == 'relu':
			out = tf.nn.relu(out)

		return out

================================================
FILE: test.py
================================================
#-*- coding:utf-8 -*-

from __future__ import print_function
from model import Model
from utils import SpeechLoader

import tensorflow as tf  # 1.0.0
import numpy as np
import librosa
import os
 
# Speech recognition demo.
# The model is built with its default batch_size of 1.
def speech_to_text():
    """Transcribe a few test recordings with the latest trained checkpoint.

    Loads the cached vocabulary through ``SpeechLoader``, rebuilds the model in
    inference mode, restores the newest checkpoint from the ``model``
    directory and prints the decoded text for ``data/wav/test/D4/D4_750.wav``
    to ``D4_754.wav`` (paths relative to the current working directory).

    Raises:
        IOError: if no checkpoint is found in the ``model`` directory.
    """
    n_mfcc = 60

    # load data; n_mfcc selects the cache directory, so it has to match the
    # value used for feature extraction below
    speech_loader = SpeechLoader(n_mfcc=n_mfcc)

    # load model
    model = Model(speech_loader.vocab_size, n_mfcc=n_mfcc, is_training=False)

    # Build the decoding ops once; creating them inside the loop would add a
    # fresh copy to the graph for every file.
    time_major_logit = tf.transpose(model.logit, perm=[1, 0, 2])
    decoded, _ = tf.nn.ctc_beam_search_decoder(time_major_logit, model.seq_len, top_paths=1, merge_repeated=True)
    # +1 maps the decoded class ids back onto the word ids used by the word map
    predict = tf.sparse_to_dense(decoded[0].indices, decoded[0].dense_shape, decoded[0].values) + 1

    # word dict: id -> word
    wmap = {value: key for key, value in speech_loader.wordmap.items()}

    saver = tf.train.Saver()

    with tf.Session() as sess:
        checkpoint = tf.train.latest_checkpoint('model')
        if checkpoint is None:
            raise IOError("No checkpoint found in the 'model' directory; run train.py first.")
        saver.restore(sess, checkpoint)

        for j in range(750, 755):
            # extract feature
            wav_file = os.path.join(os.getcwd(), 'data', 'wav', 'test', 'D4', 'D4_' + str(j) + '.wav')
            wav, sr = librosa.load(wav_file, mono=True)
            features = librosa.feature.mfcc(y=wav, sr=sr, n_mfcc=n_mfcc)
            mfcc = np.transpose(np.expand_dims(features, axis=0), [0, 2, 1]).tolist()

            # pad with all-zero frames up to the longest cached utterance
            n_missing = speech_loader.wav_max_len - len(mfcc[0])
            mfcc[0].extend([0] * n_mfcc for _ in range(n_missing))

            # recognition
            output = sess.run(predict, feed_dict={model.input_data: mfcc})

            # ids without a word map entry are skipped instead of breaking the join
            words = ''.join(wmap.get(word_id, '') for word_id in output[0])

            print("---------------------------")
            print("Input: " + wav_file)
            print("Output: " + words)


# Run the recognition demo only when this file is executed as a script.
if __name__ == '__main__':
        speech_to_text()
        
    

================================================
FILE: train.py
================================================
#-*- coding:utf-8 -*-

from __future__ import print_function
from utils import SpeechLoader
from model import Model
import tensorflow as tf #1.0.0
import time
import os

def train():
	"""Train the speech recognition model and write checkpoints.

	Reads wav files from ``data/wav/train`` and transcripts from
	``data/doc/trans/train.word.txt`` (relative to the current working
	directory), trains for a fixed number of epochs and saves a checkpoint to
	``model/speech.module-<epoch>`` every fifth epoch and after the last one.
	"""
	# setting parameters
	batch_size = 32
	n_epoch = 100
	n_mfcc = 60

	# load speech data
	base_dir = os.getcwd()
	wav_path = os.path.join(base_dir, 'data', 'wav', 'train')
	label_file = os.path.join(base_dir, 'data', 'doc', 'trans', 'train.word.txt')
	speech_loader = SpeechLoader(wav_path, label_file, batch_size, n_mfcc)
	n_out = speech_loader.vocab_size

	# load model
	model = Model(n_out, batch_size=batch_size, n_mfcc=n_mfcc)

	# make sure the checkpoint directory exists before training starts
	checkpoint_dir = os.path.join(base_dir, 'model')
	if not os.path.isdir(checkpoint_dir):
		os.makedirs(checkpoint_dir)
	checkpoint_prefix = os.path.join(checkpoint_dir, 'speech.module')

	with tf.Session() as sess:
		sess.run(tf.global_variables_initializer())

		saver = tf.train.Saver(tf.global_variables())

		for epoch in range(n_epoch):
			speech_loader.create_batches() # random shuffle data
			speech_loader.reset_batch_pointer()
			for batch in range(speech_loader.n_batches):
				start = time.time()
				batches_wav, batches_label = speech_loader.next_batch()
				feed = {model.input_data: batches_wav, model.targets: batches_label}
				train_loss, _ = sess.run([model.cost, model.optimizer_op], feed_dict=feed)
				end = time.time()
				print("epoch: %d/%d, batch: %d/%d, loss: %s, time: %.3f."%(epoch, n_epoch, batch, speech_loader.n_batches, train_loss, end-start))

			# save models; the final epoch is always saved so the last
			# epochs of training are not lost
			if epoch % 5 == 0 or epoch == n_epoch - 1:
				saver.save(sess, checkpoint_prefix, global_step=epoch)


# Start training only when this file is executed as a script.
if __name__ == '__main__':
	train()

================================================
FILE: utils.py
================================================
#-*- coding:utf-8 -*-
__author__ = 'Deeper'
import tensorflow as tf 
import numpy as np 
import os
import codecs
import librosa
from six.moves import cPickle, reduce, map
from collections import Counter
import random

class SpeechLoader():
	"""Extracts, caches and batches MFCC features and character labels.

	Features and labels are cached as pickle files under
	``<cwd>/cache/mfcc<n_mfcc>/``; when all cache files exist they are loaded
	instead of re-reading the audio.

	Attributes:
		wordmap: dict mapping each character to an integer id (0 = most frequent).
		vocab_size: number of entries in ``wordmap``.
		mfcc_tensor: list of utterances, each a list of ``n_mfcc``-long frames.
		label_tensor: list of id sequences, one per utterance.
		wav_max_len / label_max_len: longest utterance / label length.
		x_batches / y_batches: zero-padded batches built by ``create_batches``.
		n_batches: number of full batches.
		pointer: index of the batch returned by the next ``next_batch`` call.

	NOTE: labels are padded with 0, which is also the id of the most frequent
	character in ``wordmap``.
	"""

	def __init__(self, wav_path=None, label_file=None, batch_size=1, n_mfcc=20, encoding='utf-8'):
		"""Load cached data (or build the cache) and create the first batches.

		Args:
			wav_path: directory searched recursively for wav files; only
				needed when the cache has to be built.
			label_file: transcript file with one ``<id> <text>`` line per
				utterance; only needed when the cache has to be built.
			batch_size: number of utterances per batch.
			n_mfcc: number of MFCC coefficients per frame; also selects the
				cache directory.
			encoding: text encoding of ``label_file``.

		Raises:
			ValueError: if the cache is missing and ``label_file`` is not given.
		"""
		self.batch_size = batch_size
		self.encoding = encoding
		self.n_mfcc = n_mfcc

		# path setting
		data_dir = os.path.join(os.getcwd(), 'cache', 'mfcc'+str(n_mfcc))
		# data cache
		wavs_file = os.path.join(data_dir, "wavs.file")
		vocab_file = os.path.join(data_dir,"vocab.file")
		mfcc_tensor = os.path.join(data_dir, "mfcc.tensor")
		label_tensor = os.path.join(data_dir, "label.tensor")

		# data process
		cache_ready = all(os.path.exists(path) for path in (vocab_file, mfcc_tensor, label_tensor))
		if cache_ready:
			print("loading preprocessed files")
			self.load_preprocessed(vocab_file, mfcc_tensor, label_tensor)
		else:
			if label_file is None:
				raise ValueError("No cached data in %s; wav_path and label_file are required to build it." % data_dir)
			print("reading wav files")
			self.preprocess(wav_path, label_file, wavs_file, vocab_file, mfcc_tensor, label_tensor)

		# minibatch
		self.create_batches()
		# pointer reset
		self.reset_batch_pointer()

	def preprocess(self, wav_path, label_file, wavs_file, vocab_file, mfcc_tensor, label_tensor):
		"""Extract MFCC features and encode labels, then write all cache files.

		Args:
			wav_path: directory searched recursively for ``.wav``/``.WAV`` files.
			label_file: transcript file; the text before the first space of a
				line is the utterance id, the rest is its transcript.
			wavs_file: output path for the pickled list of used wav paths.
			vocab_file: output path for the pickled word map.
			mfcc_tensor: output path for the pickled features.
			label_tensor: output path for the pickled encoded labels.

		Raises:
			ValueError: if no wav file with a matching transcript is found.
		"""
		def handle_file(dirpath, filename):
			# Return the full path of a usable wav file, or None.
			if not filename.endswith(('.wav', '.WAV')):
				return None
			filename_path = os.path.join(dirpath, filename)
			# files smaller than 240000 bytes are skipped
			if os.stat(filename_path).st_size < 240000:
				return None
			return filename_path

		# Create the cache directories up front so a missing directory does
		# not make the pickle writes fail after all audio has been processed.
		for target in (wavs_file, vocab_file, mfcc_tensor, label_tensor):
			target_dir = os.path.dirname(target)
			if target_dir and not os.path.isdir(target_dir):
				os.makedirs(target_dir)

		# read label file
		labels_dict = {}
		with codecs.open(label_file,"r", encoding=self.encoding) as f:
			for line in f:
				labels_id, separator, labels_text = line.strip('\n').partition(' ')
				if not separator:
					# blank line or line without a transcript: nothing to map
					continue
				labels_dict[labels_id] = labels_text

		# wav files
		wav_files = []
		if wav_path:
			for (dirpath, dirnames, filenames) in os.walk(wav_path):
				for filename in filenames:
					filename_path = handle_file(dirpath, filename)
					if filename_path:
						wav_files.append(filename_path)
		print("初始样本数:", len(wav_files)) # initial number of samples

		# data filter and feature extraction
		wav_files_filter = []
		labels_filter = []
		self.mfcc_tensor = []
		self.wav_max_len = 0
		cnt = 0
		for wav_file in wav_files:
			wav_id = os.path.basename(wav_file).split('.')[0]
			if wav_id not in labels_dict:
				continue
			print('样本'+str(cnt), wav_file)
			labels_filter.append(labels_dict[wav_id])
			wav_files_filter.append(wav_file)
			# mfcc feature, transposed so that each row is one frame
			signal, sr = librosa.load(wav_file, mono=True)
			mfcc = np.transpose(librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=self.n_mfcc),[1,0])
			self.mfcc_tensor.append(mfcc.tolist())
			cnt += 1
		if not self.mfcc_tensor:
			raise ValueError("No wav file with a matching transcript found under %r." % (wav_path,))
		self.wav_max_len = max(len(mfcc) for mfcc in self.mfcc_tensor)
		print("样本总数:", cnt)
		print("最长的语音:", self.wav_max_len)

		with open(wavs_file, 'wb') as f:
			cPickle.dump(wav_files_filter, f)

		with open(mfcc_tensor, 'wb') as f:
			cPickle.dump(self.mfcc_tensor, f)

		# vocab file: characters ordered by descending frequency
		count = Counter(char for label in labels_filter for char in label)
		count_pairs = sorted(count.items(), key=lambda x:-x[1])
		words = [word for word, _ in count_pairs]
		self.wordmap = dict(zip(words, range(len(words))))

		self.vocab_size = len(words)
		print("词汇表大小:",len(words))

		with open(vocab_file,'wb') as f:
			cPickle.dump(self.wordmap, f)

		# label vector
		unknown_id = len(words)
		self.label_tensor = [[self.wordmap.get(char, unknown_id) for char in label] for label in labels_filter]
		self.label_max_len = max(len(label) for label in self.label_tensor)
		print("最长的句子:", self.label_max_len)

		with open(label_tensor,'wb') as f:
			cPickle.dump(self.label_tensor, f)

	def load_preprocessed(self, vocab_file, mfcc_tensor, label_tensor):
		"""Load the word map, features and labels written by ``preprocess``.

		The files are pickles; only load cache files that were produced locally.

		Args:
			vocab_file: path of the pickled word map.
			mfcc_tensor: path of the pickled features.
			label_tensor: path of the pickled encoded labels.
		"""
		with open(vocab_file, 'rb') as f:
			self.wordmap = cPickle.load(f)
		self.vocab_size = len(self.wordmap)
		print("词汇表大小:",self.vocab_size)

		with open(mfcc_tensor, 'rb') as f:
			self.mfcc_tensor = cPickle.load(f)
		self.wav_max_len = max(len(mfcc) for mfcc in self.mfcc_tensor)
		print("最长的语音:", self.wav_max_len)

		with open(label_tensor, 'rb') as f:
			self.label_tensor = cPickle.load(f)
		self.label_max_len = max(len(label) for label in self.label_tensor)
		print("最长的句子:", self.label_max_len)

	def create_batches(self):
		"""Shuffle the data and split it into zero-padded batches.

		Samples that do not fill a complete batch are dropped from
		``mfcc_tensor``/``label_tensor``. Features are padded in place with
		all-zero frames up to ``wav_max_len`` and labels with 0 up to
		``label_max_len``.

		Raises:
			AssertionError: if there is less than one full batch of data or
				the numbers of features and labels differ.
		"""
		self.n_batches = len(self.mfcc_tensor) // self.batch_size
		if self.n_batches == 0:
			# raised explicitly so the check also runs under ``python -O``
			raise AssertionError("Not enough data. Make seq_length and batch_size small.")

		n_used = self.n_batches * self.batch_size
		self.mfcc_tensor = self.mfcc_tensor[:n_used]
		self.label_tensor = self.label_tensor[:n_used]

		if len(self.mfcc_tensor) != len(self.label_tensor):
			raise AssertionError("Data length does not match the label length!")

		# random shuffle the data, keeping every feature with its label
		paired = list(zip(self.mfcc_tensor, self.label_tensor))
		random.shuffle(paired)
		self.mfcc_tensor = [mfcc for mfcc, _ in paired]
		self.label_tensor = [label for _, label in paired]

		# zero-pad to a common length
		for mfcc in self.mfcc_tensor:
			n_missing = self.wav_max_len - len(mfcc)
			if n_missing > 0:
				mfcc.extend([0]*self.n_mfcc for _ in range(n_missing))
		for label in self.label_tensor:
			n_missing = self.label_max_len - len(label)
			if n_missing > 0:
				label.extend([0]*n_missing)

		# create batches
		starts = range(0, n_used, self.batch_size)
		self.x_batches = [self.mfcc_tensor[start:start+self.batch_size] for start in starts]
		self.y_batches = [self.label_tensor[start:start+self.batch_size] for start in starts]

	def next_batch(self):
		"""Return the current ``(features, labels)`` batch and advance the pointer."""
		x, y = self.x_batches[self.pointer], self.y_batches[self.pointer]
		self.pointer += 1
		return x, y

	def reset_batch_pointer(self):
		"""Make ``next_batch`` start again from the first batch."""
		self.pointer = 0
Download .txt
gitextract_r_ayre2m/

├── README.md
├── cache/
│   └── readme.md
├── data/
│   └── readme.md
├── model/
│   └── readme.me
├── model.py
├── test.py
├── train.py
└── utils.py
Download .txt
SYMBOL INDEX (16 symbols across 4 files)

FILE: model.py
  class Model (line 6) | class Model():
    method __init__ (line 7) | def __init__(self, n_out, batch_size=1, n_mfcc=20, is_training=True):
    method residual_block (line 43) | def residual_block(self, input_tensor, size, rate, dim):
    method conv1d_layer (line 50) | def conv1d_layer(self, input_tensor, size=1, dim=128, bias=False, acti...
    method aconv1d_layer (line 65) | def aconv1d_layer(self, input_tensor, size=7, rate=2, bias=False, acti...
    method batch_norm_wrapper (line 81) | def batch_norm_wrapper(self, inputs, decay=0.999):
    method activation_wrapper (line 98) | def activation_wrapper(self, inputs, activation):

FILE: test.py
  function speech_to_text (line 14) | def speech_to_text():

FILE: train.py
  function train (line 10) | def train():

FILE: utils.py
  class SpeechLoader (line 12) | class SpeechLoader():
    method __init__ (line 14) | def __init__(self, wav_path=None, label_file=None, batch_size=1, n_mfc...
    method preprocess (line 40) | def preprocess(self, wav_path, label_file, wavs_file, vocab_file, mfcc...
    method load_preprocessed (line 119) | def load_preprocessed(self, vocab_file, mfcc_tensor, label_tensor):
    method create_batches (line 135) | def create_batches(self):
    method next_batch (line 178) | def next_batch(self):
    method reset_batch_pointer (line 183) | def reset_batch_pointer(self):
Condensed preview — 8 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (17K chars).
[
  {
    "path": "README.md",
    "chars": 1191,
    "preview": "Speech-to-Text-WaveNet : End-to-end sentence level Chinese speech recognition using DeepMind's WaveNet\n=\nA tensorflow im"
  },
  {
    "path": "cache/readme.md",
    "chars": 1,
    "preview": "\n"
  },
  {
    "path": "data/readme.md",
    "chars": 1,
    "preview": "\n"
  },
  {
    "path": "model/readme.me",
    "chars": 1,
    "preview": "\n"
  },
  {
    "path": "model.py",
    "chars": 4700,
    "preview": "#-*- coding:utf-8 -*-\r\n__author__ = 'Deeper'\r\nimport tensorflow as tf  # 1.0.0\r\nimport numpy as np\r\n\r\nclass Model():\r\n\td"
  },
  {
    "path": "test.py",
    "chars": 1945,
    "preview": "#-*- coding:utf-8 -*-\r\n\r\nfrom __future__ import print_function\r\nfrom model import Model\r\nfrom utils import SpeechLoader\r"
  },
  {
    "path": "train.py",
    "chars": 1497,
    "preview": "#-*- coding:utf-8 -*-\r\n\r\nfrom __future__ import print_function\r\nfrom utils import SpeechLoader\r\nfrom model import Model\r"
  },
  {
    "path": "utils.py",
    "chars": 6116,
    "preview": "#-*- coding:utf-8 -*-\r\n__author__ = 'Deeper'\r\nimport tensorflow as tf \r\nimport numpy as np \r\nimport os\r\nimport codecs\r\ni"
  }
]

About this extraction

This page contains the full source code of the Deeperjia/tensorflow-wavenet GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 8 files (15.1 KB), approximately 4.4k tokens, and a symbol index with 16 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Copied to clipboard!