Repository: noahchalifour/rnnt-speech-recognition Branch: master Commit: a685904d71b1 Files: 31 Total size: 76.3 KB Directory structure: gitextract_zm_qbej6/ ├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── __init__.py ├── cmake/ │ └── warp-rnnt-cmakelist.txt ├── debug/ │ ├── debug_dataset.py │ └── get_common_voice_stats.py ├── hparams.py ├── model.py ├── preprocess_common_voice.py ├── preprocess_librispeech.py ├── quantize_model.py ├── requirements.txt ├── run_rnnt.py ├── scripts/ │ ├── build_rnnt.sh │ ├── common_voice_convert.sh │ └── remove_missing_samples.py ├── streaming_transcribe.py ├── transcribe_file.py └── utils/ ├── __init__.py ├── data/ │ ├── __init__.py │ ├── common_voice.py │ └── librispeech.py ├── decoding.py ├── encoding.py ├── loss.py ├── metrics.py ├── model.py ├── preprocessing.py └── vocabulary.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ /.vscode .DS_Store /model /logs /data rsync_exclude.txt /scripts/train.sh /scripts/send_to_server.sh /data_p /model_tmp /figs ================================================ FILE: .gitmodules ================================================ [submodule "warp-transducer"] path = warp-transducer url = https://github.com/noahchalifour/warp-transducer.git ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2019 Noah Chalifour Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

================================================
FILE: README.md
================================================

# RNN-Transducer Speech Recognition

End-to-end speech recognition using an RNN-Transducer in Tensorflow 2.0

## Overview

This speech recognition model is based on Google's [Streaming End-to-end Speech Recognition For Mobile Devices](https://arxiv.org/pdf/1811.06621.pdf) research paper and is implemented in Python 3 using Tensorflow 2.0.

## Setup Your Environment

To set up your environment, run the following commands:

```
git clone --recurse https://github.com/noahchalifour/rnnt-speech-recognition.git
cd rnnt-speech-recognition
pip install tensorflow==2.2.0 # or tensorflow-gpu==2.2.0 for GPU support
pip install -r requirements.txt
./scripts/build_rnnt.sh # to set up the RNNT loss
```

## Common Voice

You can find and download the Common Voice dataset [here](https://voice.mozilla.org/en/datasets)

### Convert all MP3s to WAVs

Before you can train a model on the Common Voice dataset, you must first convert all of the mp3 audio files to wavs.
Do so by running the following command: > **_NOTE:_** Make sure you have `ffmpeg` installed on your computer, as it uses that to convert mp3 to wav ``` ./scripts/common_voice_convert.sh <# of threads> python scripts/remove_missing_samples.py \ --data_dir \ --replace_old ``` ### Preprocessing dataset After converting all the mp3s to wavs you need to preprocess the dataset, you can do so by running the following command: ``` python preprocess_common_voice.py \ --data_dir \ --output_dir ``` ### Training a model To train a simple model, run the following command: ``` python run_rnnt.py \ --mode train \ --data_dir ``` ================================================ FILE: __init__.py ================================================ ================================================ FILE: cmake/warp-rnnt-cmakelist.txt ================================================ IF (APPLE) cmake_minimum_required(VERSION 3.4) ELSE() cmake_minimum_required(VERSION 2.8) ENDIF() project(rnnt_release) IF (NOT APPLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") ENDIF() IF (APPLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2") add_definitions(-DAPPLE) ENDIF() include_directories(include) FIND_PACKAGE(CUDA) MESSAGE(STATUS "cuda found ${CUDA_FOUND}") option(USE_NAIVE_KERNEL "use naive alpha-beta kernel" OFF) option(DEBUG_TIME "output kernel time" OFF) option(DEBUG_KERNEL "output alpha beta" OFF) if (USE_NAIVE_KERNEL) add_definitions(-DUSE_NAIVE_KERNEL) endif() if (DEBUG_TIME) add_definitions(-DDEBUG_TIME) endif() if (DEBUG_KERNEL) add_definitions(-DDEBUG_KERNEL) endif() option(WITH_GPU "compile warp-rnnt with cuda." ${CUDA_FOUND}) option(WITH_OMP "compile warp-rnnt with openmp." 
ON) if(NOT WITH_OMP) add_definitions(-DRNNT_DISABLE_OMP) endif() if (WITH_OMP) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp") endif() # need to be at least 30 or __shfl_down in reduce wont compile set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30 -O2") set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35") set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50") set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52") IF(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5) SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES") ENDIF() IF (CUDA_VERSION GREATER 7.6) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60") set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61") set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62") ENDIF() IF (CUDA_VERSION GREATER 8.9) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70") ENDIF() IF (CUDA_VERSION GREATER 9.9) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_75,code=sm_75") ENDIF() if (NOT APPLE) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11") set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") ENDIF() IF (APPLE) EXEC_PROGRAM(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION) STRING(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION}) MESSAGE(STATUS "DARWIN_VERSION=${DARWIN_VERSION}") #for el capitain have to use rpath IF (DARWIN_VERSION LESS 15) set(CMAKE_SKIP_RPATH TRUE) ENDIF () ELSE() #always skip for linux set(CMAKE_SKIP_RPATH TRUE) ENDIF() IF (WITH_GPU) MESSAGE(STATUS "Building shared library with GPU support") set(CUDA_curand_LIBRARY "/usr/local/cuda/lib64/libcurand.so.10") CUDA_ADD_LIBRARY(warprnnt SHARED src/rnnt_entrypoint.cu) IF (!Torch_FOUND) TARGET_LINK_LIBRARIES(warprnnt ${CUDA_curand_LIBRARY}) ENDIF() 
cuda_add_executable(test_time_gpu tests/test_time.cu tests/random.cpp ) TARGET_LINK_LIBRARIES(test_time_gpu warprnnt ${CUDA_curand_LIBRARY}) SET_TARGET_PROPERTIES(test_time_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") cuda_add_executable(test_gpu tests/test_gpu.cu tests/random.cpp ) TARGET_LINK_LIBRARIES(test_gpu warprnnt ${CUDA_curand_LIBRARY}) SET_TARGET_PROPERTIES(test_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") ELSE() MESSAGE(STATUS "Building shared library with no GPU support") if (NOT APPLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2") ENDIF() ADD_LIBRARY(warprnnt SHARED src/rnnt_entrypoint.cpp) ENDIF() add_executable(test_cpu tests/test_cpu.cpp tests/random.cpp ) TARGET_LINK_LIBRARIES(test_cpu warprnnt) SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") add_executable(test_time tests/test_time.cpp tests/random.cpp ) TARGET_LINK_LIBRARIES(test_time warprnnt) SET_TARGET_PROPERTIES(test_time PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") INSTALL(TARGETS warprnnt RUNTIME DESTINATION "bin" LIBRARY DESTINATION "lib" ARCHIVE DESTINATION "lib") INSTALL(FILES include/rnnt.h DESTINATION "include") ================================================ FILE: debug/debug_dataset.py ================================================ from argparse import ArgumentParser import os import json import sys import tensorflow as tf FILE_DIR = os.path.dirname(os.path.realpath(__file__)) sys.path.append(os.path.join(FILE_DIR, '..')) from utils import preprocessing def check_for_invalid_values(inp, labels): tf.debugging.check_numerics(inp['mel_specs'], message='mel_specs has invalid value.') return inp, labels def check_empty(inp, labels): tf.debugging.assert_none_equal( tf.size(inp['mel_specs']), 0, message='mel_specs is empty tensor.') tf.debugging.assert_none_equal( tf.size(inp['pred_inp']), 0, message='pred_inp is empty tensor.') tf.debugging.assert_none_equal( 
tf.size(inp['spec_lengths']), 0, message='spec_lengths is empty tensor.') tf.debugging.assert_none_equal( tf.size(inp['label_lengths']), 0, message='label_lengths is empty tensor.') tf.debugging.assert_none_equal( tf.size(labels), 0, message='labels is empty tensor.') return inp, labels def get_dataset(data_dir, name, batch_size, n_epochs): dataset = preprocessing.load_dataset(data_dir, name) dataset = dataset.padded_batch( batch_size, padded_shapes=({ 'mel_specs': [-1, -1], 'pred_inp': [-1], 'spec_lengths': [], 'label_lengths': [] }, [-1])) dataset = dataset.repeat(n_epochs) with open(os.path.join(data_dir, '{}-specs.json'.format(name)), 'r') as f: dataset_specs = json.load(f) return dataset, dataset_specs def main(args): dataset, dataset_specs = get_dataset( args.data_dir, args.split, batch_size=1, n_epochs=1) dataset.map(check_for_invalid_values) dataset.map(check_empty) for _ in dataset: pass print('All checks passed.') def parse_args(): ap = ArgumentParser() ap.add_argument('-d', '--data_dir', type=str, required=True, help='Path to preprocessed dataset.') ap.add_argument('-s', '--split', type=str, default='train', help='Name of dataset split to inspect.') return ap.parse_args() if __name__ == '__main__': args = parse_args() main(args) ================================================ FILE: debug/get_common_voice_stats.py ================================================ from argparse import ArgumentParser from scipy.io.wavfile import read as read_wav import glob import os def main(args): max_length = 0 min_length = 0 total_length = 0 count = 0 with open(os.path.join(args.data_dir, args.split + '.tsv'), 'r') as f: next(f) for line in f: line_split = line.split('\t') audio_fn = line_split[1] filepath = os.path.join(args.data_dir, 'clips', audio_fn[:-4] + '.wav') sr, data = read_wav(filepath) length = len(data) / sr if length > max_length: max_length = length if length < min_length or min_length == 0: min_length = length total_length += length count += 1 avg_length 
= total_length / count print('Total: {:.4f} s'.format(total_length)) print('Min length: {:.4f} s'.format(min_length)) print('Max length: {:.4f} s'.format(max_length)) print('Average length: {:.4f} s'.format(avg_length)) def parse_args(): ap = ArgumentParser() ap.add_argument('-d', '--data_dir', required=True, type=str, help='Directory of common voice dataset.') ap.add_argument('-s', '--split', type=str, default='train', help='Split to get statistics for.') return ap.parse_args() if __name__ == '__main__': args = parse_args() main(args) ================================================ FILE: hparams.py ================================================ from tensorboard.plugins.hparams import api as hp HP_TOKEN_TYPE = hp.HParam('token_type', hp.Discrete(['word-piece', 'character'])) HP_VOCAB_SIZE = hp.HParam('vocab_size', hp.Discrete([2**12])) # Preprocessing Hparams HP_MEL_BINS = hp.HParam('mel_bins', hp.Discrete([80])) HP_FRAME_LENGTH = hp.HParam('frame_length', hp.Discrete([0.025])) HP_FRAME_STEP = hp.HParam('frame_step', hp.Discrete([0.01])) HP_HERTZ_LOW = hp.HParam('hertz_low', hp.Discrete([125.0])) HP_HERTZ_HIGH = hp.HParam('hertz_high', hp.Discrete([7600.0])) HP_DOWNSAMPLE_FACTOR = hp.HParam('downsample_factor', hp.Discrete([3])) # Model Hparams HP_EMBEDDING_SIZE = hp.HParam('embedding_size', hp.Discrete([500])) HP_ENCODER_LAYERS = hp.HParam('encoder_layers', hp.Discrete([8])) HP_ENCODER_SIZE = hp.HParam('encoder_size', hp.Discrete([2048])) HP_PROJECTION_SIZE = hp.HParam('projection_size', hp.Discrete([640])) HP_TIME_REDUCT_INDEX = hp.HParam('time_reduction_index', hp.Discrete([1])) HP_TIME_REDUCT_FACTOR = hp.HParam('time_reduction_factor', hp.Discrete([2])) HP_PRED_NET_LAYERS = hp.HParam('pred_net_layers', hp.Discrete([2])) HP_PRED_NET_SIZE = hp.HParam('pred_net_size', hp.Discrete([2048])) HP_JOINT_NET_SIZE = hp.HParam('joint_net_size', hp.Discrete([640])) HP_DROPOUT = hp.HParam('dropout', hp.Discrete([0])) # HP_EMBEDDING_SIZE = hp.HParam('embedding_size', 
hp.Discrete([32])) # HP_ENCODER_LAYERS = hp.HParam('encoder_layers', hp.Discrete([4])) # HP_ENCODER_SIZE = hp.HParam('encoder_size', hp.Discrete([20])) # HP_PROJECTION_SIZE = hp.HParam('projection_size', hp.Discrete([50])) # HP_TIME_REDUCT_INDEX = hp.HParam('time_reduction_index', hp.Discrete([1])) # HP_TIME_REDUCT_FACTOR = hp.HParam('time_reduction_factor', hp.Discrete([2])) # HP_PRED_NET_LAYERS = hp.HParam('pred_net_layers', hp.Discrete([2])) # HP_PRED_NET_SIZE = hp.HParam('pred_net_size', hp.Discrete([100])) # HP_JOINT_NET_SIZE = hp.HParam('joint_net_size', hp.Discrete([50])) # HP_DROPOUT = hp.HParam('dropout', hp.Discrete([0.2])) HP_LEARNING_RATE = hp.HParam('learning_rate', hp.Discrete([1e-4])) METRIC_TRAIN_LOSS = 'train_loss' METRIC_TRAIN_ACCURACY = 'train_accuracy' METRIC_EVAL_LOSS = 'eval_loss' METRIC_EVAL_ACCURACY = 'eval_accuracy' METRIC_EVAL_CER = 'eval_cer' METRIC_EVAL_WER = 'eval_wer' METRIC_ACCURACY = 'accuracy' METRIC_CER = 'cer' METRIC_WER = 'wer' ================================================ FILE: model.py ================================================ import re import os import tensorflow as tf from hparams import * class TimeReduction(tf.keras.layers.Layer): def __init__(self, reduction_factor, batch_size=None, **kwargs): super(TimeReduction, self).__init__(**kwargs) self.reduction_factor = reduction_factor self.batch_size = batch_size def call(self, inputs): input_shape = tf.shape(inputs) batch_size = self.batch_size if batch_size is None: batch_size = input_shape[0] max_time = input_shape[1] num_units = inputs.get_shape().as_list()[-1] outputs = inputs paddings = [[0, 0], [0, tf.math.floormod(max_time, self.reduction_factor)], [0, 0]] outputs = tf.pad(outputs, paddings) return tf.reshape(outputs, (batch_size, -1, num_units * self.reduction_factor)) def encoder(specs_shape, num_layers, d_model, proj_size, reduction_index, reduction_factor, dropout, stateful=False, initializer=None, dtype=tf.float32): batch_size = None if stateful: 
        batch_size = 1

    # Encoder input: batched mel-spectrogram frames, normalized before the
    # recurrent stack.
    mel_specs = tf.keras.Input(shape=specs_shape, batch_size=batch_size, dtype=tf.float32)
    norm_mel_specs = tf.keras.layers.BatchNormalization()(mel_specs)

    # Cell factory: a fresh projected LSTM cell per layer (num_proj keeps
    # each layer's output at proj_size rather than d_model).
    lstm_cell = lambda: tf.compat.v1.nn.rnn_cell.LSTMCell(d_model,
        num_proj=proj_size,
        initializer=initializer,
        dtype=dtype)

    outputs = norm_mel_specs
    for i in range(num_layers):
        rnn_layer = tf.keras.layers.RNN(lstm_cell(),
            return_sequences=True,
            stateful=stateful)
        outputs = rnn_layer(outputs)
        outputs = tf.keras.layers.Dropout(dropout)(outputs)
        outputs = tf.keras.layers.LayerNormalization(dtype=dtype)(outputs)
        # After the layer at reduction_index, shorten the time axis by
        # reduction_factor (see TimeReduction).
        if i == reduction_index:
            # outputs = tf.keras.layers.Conv1D(proj_size,
            #     kernel_size=reduction_factor,
            #     strides=reduction_factor)(outputs)
            outputs = TimeReduction(reduction_factor,
                batch_size=batch_size)(outputs)

    return tf.keras.Model(inputs=[mel_specs], outputs=[outputs], name='encoder')


def prediction_network(vocab_size, embedding_size, num_layers, layer_size, proj_size, dropout, stateful=False, initializer=None, dtype=tf.float32):
    """Label-history (prediction) network of the RNN-T.

    Embeds previous token ids and runs them through a stack of projected
    LSTM layers, each followed by dropout and layer norm.
    """
    batch_size = None
    if stateful:
        batch_size = 1

    # NOTE(review): input dtype is float32 even though the values are token
    # ids fed to an Embedding layer — presumably kept for TFLite/serving
    # compatibility; confirm before changing.
    inputs = tf.keras.Input(shape=[None], batch_size=batch_size, dtype=tf.float32)
    embed = tf.keras.layers.Embedding(vocab_size, embedding_size)(inputs)

    rnn_cell = lambda: tf.compat.v1.nn.rnn_cell.LSTMCell(layer_size,
        num_proj=proj_size,
        initializer=initializer,
        dtype=dtype)

    outputs = embed
    for _ in range(num_layers):
        outputs = tf.keras.layers.RNN(rnn_cell(), return_sequences=True)(outputs)
        outputs = tf.keras.layers.Dropout(dropout)(outputs)
        outputs = tf.keras.layers.LayerNormalization(dtype=dtype)(outputs)

    return tf.keras.Model(inputs=[inputs], outputs=[outputs], name='prediction_network')


def build_keras_model(hparams, stateful=False, initializer=None, dtype=tf.float32):
    """Assemble the full transducer: encoder + prediction net + joint net."""
    # Feature dim after frame stacking: mel_bins * downsample_factor.
    specs_shape = [None, hparams[HP_MEL_BINS.name] * hparams[HP_DOWNSAMPLE_FACTOR.name]]

    batch_size = None
    if stateful:
        batch_size = 1

    mel_specs = tf.keras.Input(shape=specs_shape, batch_size=batch_size, dtype=tf.float32, name='mel_specs')
    pred_inp =
tf.keras.Input(shape=[None], batch_size=batch_size, dtype=tf.float32, name='pred_inp') inp_enc = encoder( specs_shape=specs_shape, num_layers=hparams[HP_ENCODER_LAYERS.name], d_model=hparams[HP_ENCODER_SIZE.name], proj_size=hparams[HP_PROJECTION_SIZE.name], dropout=hparams[HP_DROPOUT.name], reduction_index=hparams[HP_TIME_REDUCT_INDEX.name], reduction_factor=hparams[HP_TIME_REDUCT_FACTOR.name], stateful=stateful, initializer=initializer, dtype=dtype)(mel_specs) pred_outputs = prediction_network( vocab_size=hparams[HP_VOCAB_SIZE.name], embedding_size=hparams[HP_EMBEDDING_SIZE.name], num_layers=hparams[HP_PRED_NET_LAYERS.name], layer_size=hparams[HP_PRED_NET_SIZE.name], proj_size=hparams[HP_PROJECTION_SIZE.name], dropout=hparams[HP_DROPOUT.name], stateful=stateful, initializer=initializer, dtype=dtype)(pred_inp) joint_inp = ( tf.expand_dims(inp_enc, axis=2) + # [B, T, V] => [B, T, 1, V] tf.expand_dims(pred_outputs, axis=1)) # [B, U, V] => [B, 1, U, V] joint_outputs = tf.keras.layers.Dense(hparams[HP_JOINT_NET_SIZE.name], kernel_initializer=initializer, activation='tanh')(joint_inp) outputs = tf.keras.layers.Dense(hparams[HP_VOCAB_SIZE.name], kernel_initializer=initializer)(joint_outputs) return tf.keras.Model(inputs=[mel_specs, pred_inp], outputs=[outputs], name='transducer') ================================================ FILE: preprocess_common_voice.py ================================================ from absl import app, logging, flags import os import json import tensorflow as tf from utils import preprocessing, encoding from utils.data import common_voice from hparams import * FLAGS = flags.FLAGS flags.DEFINE_string( 'data_dir', None, 'Directory to read Common Voice data from.') flags.DEFINE_string( 'output_dir', './data', 'Directory to save preprocessed data.') flags.DEFINE_integer( 'max_length', 0, 'Max audio length in seconds.') def write_dataset(dataset, name): filepath = os.path.join(FLAGS.output_dir, '{}.tfrecord'.format(name)) writer = 
tf.data.experimental.TFRecordWriter(filepath) writer.write(dataset) logging.info('Wrote {} dataset to {}'.format( name, filepath)) def main(_): hparams = { HP_TOKEN_TYPE: HP_TOKEN_TYPE.domain.values[1], HP_VOCAB_SIZE: HP_VOCAB_SIZE.domain.values[0], # Preprocessing HP_MEL_BINS: HP_MEL_BINS.domain.values[0], HP_FRAME_LENGTH: HP_FRAME_LENGTH.domain.values[0], HP_FRAME_STEP: HP_FRAME_STEP.domain.values[0], HP_HERTZ_LOW: HP_HERTZ_LOW.domain.values[0], HP_HERTZ_HIGH: HP_HERTZ_HIGH.domain.values[0], HP_DOWNSAMPLE_FACTOR: HP_DOWNSAMPLE_FACTOR.domain.values[0] } _hparams = {k.name: v for k, v in hparams.items()} texts_gen = common_voice.texts_generator(FLAGS.data_dir) encoder_fn, decoder_fn, vocab_size = encoding.get_encoder( encoder_dir=FLAGS.output_dir, hparams=_hparams, texts_generator=texts_gen) _hparams[HP_VOCAB_SIZE.name] = vocab_size train_dataset = common_voice.load_dataset( FLAGS.data_dir, 'train') dev_dataset = common_voice.load_dataset( FLAGS.data_dir, 'dev') test_dataset = common_voice.load_dataset( FLAGS.data_dir, 'test') train_dataset = preprocessing.preprocess_dataset( train_dataset, encoder_fn=encoder_fn, hparams=_hparams, max_length=FLAGS.max_length, save_plots=True) write_dataset(train_dataset, 'train') dev_dataset = preprocessing.preprocess_dataset( dev_dataset, encoder_fn=encoder_fn, hparams=_hparams, max_length=FLAGS.max_length) write_dataset(dev_dataset, 'dev') test_dataset = preprocessing.preprocess_dataset( test_dataset, encoder_fn=encoder_fn, hparams=_hparams, max_length=FLAGS.max_length) write_dataset(test_dataset, 'test') if __name__ == '__main__': flags.mark_flag_as_required('data_dir') app.run(main) ================================================ FILE: preprocess_librispeech.py ================================================ from absl import app, logging, flags import os import json import tensorflow as tf from utils import preprocessing, encoding from utils.data import librispeech from hparams import * FLAGS = flags.FLAGS 
flags.DEFINE_string( 'data_dir', None, 'Directory to read Librispeech data from.') flags.DEFINE_string( 'output_dir', './data', 'Directory to save preprocessed data.') flags.DEFINE_integer( 'max_length', 0, 'Max audio length in seconds.') def write_dataset(dataset, name): filepath = os.path.join(FLAGS.output_dir, '{}.tfrecord'.format(name)) writer = tf.data.experimental.TFRecordWriter(filepath) writer.write(dataset) logging.info('Wrote {} dataset to {}'.format( name, filepath)) def main(_): hparams = { HP_TOKEN_TYPE: HP_TOKEN_TYPE.domain.values[1], HP_VOCAB_SIZE: HP_VOCAB_SIZE.domain.values[0], # Preprocessing HP_MEL_BINS: HP_MEL_BINS.domain.values[0], HP_FRAME_LENGTH: HP_FRAME_LENGTH.domain.values[0], HP_FRAME_STEP: HP_FRAME_STEP.domain.values[0], HP_HERTZ_LOW: HP_HERTZ_LOW.domain.values[0], HP_HERTZ_HIGH: HP_HERTZ_HIGH.domain.values[0], HP_DOWNSAMPLE_FACTOR: HP_DOWNSAMPLE_FACTOR.domain.values[0] } train_splits = [ 'dev-clean' ] dev_splits = [ 'dev-clean' ] test_splits = [ 'dev-clean' ] # train_splits = [ # 'train-clean-100', # 'train-clean-360', # 'train-other-500' # ] # dev_splits = [ # 'dev-clean', # 'dev-other' # ] # test_splits = [ # 'test-clean', # 'test-other' # ] _hparams = {k.name: v for k, v in hparams.items()} texts_gen = librispeech.texts_generator(FLAGS.data_dir, split_names=train_splits) encoder_fn, decoder_fn, vocab_size = encoding.get_encoder( encoder_dir=FLAGS.output_dir, hparams=_hparams, texts_generator=texts_gen) _hparams[HP_VOCAB_SIZE.name] = vocab_size train_dataset = librispeech.load_dataset( FLAGS.data_dir, train_splits) dev_dataset = librispeech.load_dataset( FLAGS.data_dir, dev_splits) test_dataset = librispeech.load_dataset( FLAGS.data_dir, test_splits) train_dataset = preprocessing.preprocess_dataset( train_dataset, encoder_fn=encoder_fn, hparams=_hparams, max_length=FLAGS.max_length, save_plots=True) write_dataset(train_dataset, 'train') dev_dataset = preprocessing.preprocess_dataset( dev_dataset, encoder_fn=encoder_fn, 
hparams=_hparams, max_length=FLAGS.max_length) write_dataset(dev_dataset, 'dev') test_dataset = preprocessing.preprocess_dataset( test_dataset, encoder_fn=encoder_fn, hparams=_hparams, max_length=FLAGS.max_length) write_dataset(test_dataset, 'test') if __name__ == '__main__': flags.mark_flag_as_required('data_dir') app.run(main) ================================================ FILE: quantize_model.py ================================================ from argparse import ArgumentParser import os import tensorflow as tf from utils import model as model_utils def main(args): hparams = model_utils.load_hparams(args.model_dir) model, _ = model_utils.load_model(args.model_dir, hparams, stateful=True) model.summary() converter = tf.lite.TFLiteConverter.from_keras_model(model) converter.experimental_new_converter = True # converter.experimental_new_quantizer = True converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS] # converter.optimizations = [tf.lite.Optimize.DEFAULT] tflite_quant_model = converter.convert() tflite_dir = os.path.join(args.model_dir, 'tflite') os.makedirs(tflite_dir, exist_ok=True) with open(os.path.join(tflite_dir, 'model.tflite'), 'wb') as f: f.write(tflite_quant_model) def parse_args(): ap = ArgumentParser() ap.add_argument('-m', '--model_dir', type=str, default='./model', help='Directory of model.') return ap.parse_args() if __name__ == '__main__': args = parse_args() main(args) ================================================ FILE: requirements.txt ================================================ pydub>=0.23.1 scipy>=1.3.1 tqdm tensorflow-datasets soundfile librosa matplotlib ================================================ FILE: run_rnnt.py ================================================ from absl import flags, logging, app from tensorboard.plugins.hparams import api as hp from tensorflow.keras.mixed_precision import experimental as mixed_precision from datetime import datetime import json import re 
import os import time import shutil import tensorflow as tf tf.get_logger().setLevel('WARNING') tf.autograph.set_verbosity(0) # tf.random.set_seed(1234) from utils import preprocessing, vocabulary, encoding, \ metrics, decoding from utils.loss import get_loss_fn from utils import model as model_utils from model import build_keras_model from hparams import * FLAGS = flags.FLAGS # Required flags flags.DEFINE_enum( 'mode', None, ['train', 'eval', 'test'], 'Mode to run.') flags.DEFINE_string( 'data_dir', None, 'Input data directory.') # Optional flags flags.DEFINE_string( 'tb_log_dir', './logs', 'Directory to save Tensorboard logs.') flags.DEFINE_string( 'output_dir', './model', 'Directory to save model.') flags.DEFINE_string( 'checkpoint', None, 'Checkpoint to restore from.') flags.DEFINE_integer( 'batch_size', 32, 'Training batch size.') flags.DEFINE_integer( 'n_epochs', 1000, 'Number of training epochs.') flags.DEFINE_integer( 'steps_per_log', 1, 'Number of steps between each log.') flags.DEFINE_integer( 'steps_per_checkpoint', 1000, 'Number of steps between eval and checkpoint.') flags.DEFINE_integer( 'eval_size', None, 'Max number of samples to use for eval.') flags.DEFINE_list( 'gpus', None, 'GPUs to run training on.') flags.DEFINE_bool( 'fp16_run', False, 'Run using 16-bit precision instead of 32-bit.') def get_dataset(data_dir, name, batch_size, n_epochs, strategy=None, max_size=None): dataset = preprocessing.load_dataset(data_dir, name) if max_size is not None: dataset = dataset.take(max_size) dataset = dataset.padded_batch( batch_size, padded_shapes=( [-1, -1], [-1], [], [], [-1] ) ) dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) if strategy is not None: dataset = strategy.experimental_distribute_dataset(dataset) return dataset def configure_environment(gpu_names, fp16_run): if fp16_run: print('Using 16-bit float precision.') policy = mixed_precision.Policy('mixed_float16') mixed_precision.set_policy(policy) gpus = 
tf.config.experimental.list_physical_devices('GPU') if gpu_names is not None and len(gpu_names) > 0: gpus = [x for x in gpus if x.name[len('/physical_device:'):] in gpu_names] if gpus: try: for gpu in gpus: tf.config.experimental.set_memory_growth(gpu, True) # tf.config.experimental.set_virtual_device_configuration( # gpus[0], # [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4096), # tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4096)]) logical_gpus = tf.config.experimental.list_logical_devices('GPU') print(len(gpus), "Physical GPU,", len(logical_gpus), "Logical GPUs") except RuntimeError as e: logging.warn(str(e)) if len(gpus) > 1: print('Running multi gpu: {}'.format(', '.join(gpu_names))) strategy = tf.distribute.MirroredStrategy( devices=gpu_names) else: device = gpus[0].name[len('/physical_device:'):] print('Running single gpu: {}'.format(device)) strategy = tf.distribute.OneDeviceStrategy( device=device) dtype = tf.float16 if fp16_run else tf.float32 return strategy, dtype def setup_hparams(log_dir, checkpoint): if checkpoint is not None: checkpoint_dir = os.path.dirname(os.path.realpath(checkpoint)) hparams = model_utils.load_hparams(checkpoint_dir) tb_hparams = {} tb_keys = [ HP_TOKEN_TYPE, HP_MEL_BINS, HP_FRAME_LENGTH, HP_FRAME_STEP, HP_HERTZ_LOW, HP_HERTZ_HIGH, HP_DOWNSAMPLE_FACTOR, HP_EMBEDDING_SIZE, HP_ENCODER_LAYERS, HP_ENCODER_SIZE, HP_PROJECTION_SIZE, HP_TIME_REDUCT_FACTOR, HP_TIME_REDUCT_INDEX, HP_PRED_NET_LAYERS, HP_PRED_NET_SIZE, HP_JOINT_NET_SIZE, HP_DROPOUT, HP_LEARNING_RATE ] for k, v in hparams.items(): for tb_key in tb_keys: if k == tb_key.name: tb_hparams[tb_key] = v else: tb_hparams = { HP_TOKEN_TYPE: HP_TOKEN_TYPE.domain.values[1], # Preprocessing HP_MEL_BINS: HP_MEL_BINS.domain.values[0], HP_FRAME_LENGTH: HP_FRAME_LENGTH.domain.values[0], HP_FRAME_STEP: HP_FRAME_STEP.domain.values[0], HP_HERTZ_LOW: HP_HERTZ_LOW.domain.values[0], HP_HERTZ_HIGH: HP_HERTZ_HIGH.domain.values[0], HP_DOWNSAMPLE_FACTOR: 
HP_DOWNSAMPLE_FACTOR.domain.values[0], # Model HP_EMBEDDING_SIZE: HP_EMBEDDING_SIZE.domain.values[0], HP_ENCODER_LAYERS: HP_ENCODER_LAYERS.domain.values[0], HP_ENCODER_SIZE: HP_ENCODER_SIZE.domain.values[0], HP_PROJECTION_SIZE: HP_PROJECTION_SIZE.domain.values[0], HP_TIME_REDUCT_INDEX: HP_TIME_REDUCT_INDEX.domain.values[0], HP_TIME_REDUCT_FACTOR: HP_TIME_REDUCT_FACTOR.domain.values[0], HP_PRED_NET_LAYERS: HP_PRED_NET_LAYERS.domain.values[0], HP_PRED_NET_SIZE: HP_PRED_NET_SIZE.domain.values[0], HP_JOINT_NET_SIZE: HP_JOINT_NET_SIZE.domain.values[0], HP_DROPOUT: HP_DROPOUT.domain.values[0], HP_LEARNING_RATE: HP_LEARNING_RATE.domain.values[0] } with tf.summary.create_file_writer(os.path.join(log_dir, 'hparams_tuning')).as_default(): hp.hparams_config( hparams=[ HP_TOKEN_TYPE, HP_VOCAB_SIZE, HP_ENCODER_LAYERS, HP_ENCODER_SIZE, HP_PROJECTION_SIZE, HP_TIME_REDUCT_INDEX, HP_TIME_REDUCT_FACTOR, HP_PRED_NET_LAYERS, HP_PRED_NET_SIZE, HP_JOINT_NET_SIZE, HP_DROPOUT ], metrics=[ hp.Metric(METRIC_ACCURACY, display_name='Accuracy'), hp.Metric(METRIC_WER, display_name='WER'), ], ) return {k.name: v for k, v in tb_hparams.items()}, tb_hparams def run_metrics(inputs, y_true, metrics, strategy=None): return { metric_fn.__name__: metric_fn(inputs, y_true) for metric_fn in metrics} def run_training(model, optimizer, loss_fn, train_dataset, batch_size, n_epochs, checkpoint_template, hparams, noise=0, # noise=0.075, strategy=None, steps_per_log=None, steps_per_checkpoint=None, eval_dataset=None, train_metrics=[], eval_metrics=[], fp16_run=False): feat_size = hparams[HP_MEL_BINS.name] * hparams[HP_DOWNSAMPLE_FACTOR.name] @tf.function(input_signature=[[ tf.TensorSpec(shape=[None, None, feat_size], dtype=tf.float32), tf.TensorSpec(shape=[None, None], dtype=tf.int32), tf.TensorSpec(shape=[None], dtype=tf.int32), tf.TensorSpec(shape=[None], dtype=tf.int32), tf.TensorSpec(shape=[None, None], dtype=tf.int32)]]) def train_step(dist_inputs): def step_fn(inputs): (mel_specs, pred_inp, spec_lengths, 
label_lengths, labels) = inputs if noise > 0: mel_specs += tf.random.normal([mel_specs.shape[-1]], mean=0, stddev=noise) with tf.GradientTape() as tape: outputs = model([mel_specs, pred_inp], training=True) rnnt_loss = loss_fn(labels, outputs, spec_lengths, label_lengths) if fp16_run: rnnt_loss = optimizer.get_scaled_loss(rnnt_loss) loss = tf.reduce_sum(rnnt_loss) * (1. / batch_size) if train_metrics is not None: metric_results = run_metrics(mel_specs, labels, metrics=train_metrics, strategy=strategy) gradients = tape.gradient(loss, model.trainable_variables) if fp16_run: gradients = optimizer.get_unscaled_gradients(gradients) optimizer.apply_gradients(zip(gradients, model.trainable_variables)) return rnnt_loss, metric_results loss, metrics_results = strategy.run(step_fn, args=(dist_inputs,)) loss = strategy.reduce( tf.distribute.ReduceOp.MEAN, loss, axis=0) metrics_results = {name: strategy.reduce( tf.distribute.ReduceOp.MEAN, result, axis=0) for name, result in metrics_results.items()} return loss, metrics_results def checkpoint_model(): eval_start_time = time.time() eval_loss, eval_metrics_results = run_evaluate( model=model, optimizer=optimizer, loss_fn=loss_fn, eval_dataset=eval_dataset, batch_size=batch_size, hparams=hparams, strategy=strategy, metrics=eval_metrics) validation_log_str = 'VALIDATION RESULTS: Time: {:.4f}, Loss: {:.4f}'.format( time.time() - eval_start_time, eval_loss) for metric_name, metric_result in eval_metrics_results.items(): validation_log_str += ', {}: {:.4f}'.format(metric_name, metric_result) print(validation_log_str) tf.summary.scalar(METRIC_EVAL_LOSS, eval_loss, step=global_step) if 'Accuracy' in eval_metrics_results: tf.summary.scalar(METRIC_EVAL_ACCURACY, eval_metrics_results['Accuracy'], step=global_step) if 'WER' in eval_metrics_results: tf.summary.scalar(METRIC_EVAL_WER, eval_metrics_results['WER'], step=global_step) checkpoint_filepath = checkpoint_template.format( step=global_step, val_loss=eval_loss) print('Saving checkpoint 
{}'.format(checkpoint_filepath)) model.save_weights(checkpoint_filepath) with strategy.scope(): print('Starting training.') global_step = 0 for epoch in range(n_epochs): loss_object = tf.keras.metrics.Mean() metric_objects = {fn.__name__: tf.keras.metrics.Mean() for fn in train_metrics} for batch, inputs in enumerate(train_dataset): if global_step % steps_per_checkpoint == 0: if eval_dataset is not None: checkpoint_model() start_time = time.time() loss, metrics_results = train_step(inputs) step_time = time.time() - start_time loss_object(loss) for metric_name, metric_result in metrics_results.items(): metric_objects[metric_name](metric_result) if global_step % steps_per_log == 0: log_str = 'Epoch: {}, Batch: {}, Global Step: {}, Step Time: {:.4f}, Loss: {:.4f}'.format( epoch, batch, global_step, step_time, loss_object.result()) for metric_name, metric_object in metric_objects.items(): log_str += ', {}: {:.4f}'.format(metric_name, metric_object.result()) print(log_str) tf.summary.scalar(METRIC_TRAIN_LOSS, loss_object.result(), step=global_step) if 'Accuracy' in metric_objects: tf.summary.scalar(METRIC_TRAIN_ACCURACY, metric_objects['Accuracy'].result(), step=global_step) global_step += 1 epoch_end_log_str = 'EPOCH RESULTS: Loss: {:.4f}'.format(loss_object.result()) for metric_name, metric_object in metric_objects.items(): epoch_end_log_str += ', {}: {:.4f}'.format(metric_name, metric_object.result()) print(epoch_end_log_str) checkpoint_model() def run_evaluate(model, optimizer, loss_fn, eval_dataset, batch_size, strategy, hparams, metrics=[], fp16_run=False): feat_size = hparams[HP_MEL_BINS.name] * hparams[HP_DOWNSAMPLE_FACTOR.name] @tf.function(input_signature=[[ tf.TensorSpec(shape=[None, None, feat_size], dtype=tf.float32), tf.TensorSpec(shape=[None, None], dtype=tf.int32), tf.TensorSpec(shape=[None], dtype=tf.int32), tf.TensorSpec(shape=[None], dtype=tf.int32), tf.TensorSpec(shape=[None, None], dtype=tf.int32)]]) def eval_step(dist_inputs): def step_fn(inputs): 
(mel_specs, pred_inp, spec_lengths, label_lengths, labels) = inputs outputs = model([mel_specs, pred_inp], training=False) loss = loss_fn(labels, outputs, spec_lengths=spec_lengths, label_lengths=label_lengths) if fp16_run: loss = optimizer.get_scaled_loss(loss) if metrics is not None: metric_results = run_metrics(mel_specs, labels, metrics=metrics, strategy=strategy) return loss, metric_results loss, metrics_results = strategy.run(step_fn, args=(dist_inputs,)) loss = strategy.reduce( tf.distribute.ReduceOp.MEAN, loss, axis=0) metrics_results = {name: strategy.reduce( tf.distribute.ReduceOp.MEAN, result, axis=0) for name, result in metrics_results.items()} return loss, metrics_results print('Performing evaluation.') loss_object = tf.keras.metrics.Mean() metric_objects = {fn.__name__: tf.keras.metrics.Mean() for fn in metrics} for batch, inputs in enumerate(eval_dataset): loss, metrics_results = eval_step(inputs) loss_object(loss) for metric_name, metric_result in metrics_results.items(): metric_objects[metric_name](metric_result) metrics_final_results = {name: metric_object.result() for name, metric_object in metric_objects.items()} return loss_object.result(), metrics_final_results def main(_): strategy, dtype = configure_environment( gpu_names=FLAGS.gpus, fp16_run=FLAGS.fp16_run) hparams, tb_hparams = setup_hparams( log_dir=FLAGS.tb_log_dir, checkpoint=FLAGS.checkpoint) os.makedirs(FLAGS.output_dir, exist_ok=True) if FLAGS.checkpoint is None: encoder_dir = FLAGS.data_dir else: encoder_dir = os.path.dirname(os.path.realpath(FLAGS.checkpoint)) shutil.copy( os.path.join(encoder_dir, 'encoder.subwords'), os.path.join(FLAGS.output_dir, 'encoder.subwords')) encoder_fn, idx_to_text, vocab_size = encoding.get_encoder( encoder_dir=FLAGS.output_dir, hparams=hparams) if HP_VOCAB_SIZE.name not in hparams: hparams[HP_VOCAB_SIZE.name] = vocab_size with strategy.scope(): model = build_keras_model(hparams, dtype=dtype) if FLAGS.checkpoint is not None: 
model.load_weights(FLAGS.checkpoint) logging.info('Restored weights from {}.'.format(FLAGS.checkpoint)) model_utils.save_hparams(hparams, FLAGS.output_dir) optimizer = tf.keras.optimizers.SGD(hparams[HP_LEARNING_RATE.name], momentum=0.9) if FLAGS.fp16_run: optimizer = mixed_precision.LossScaleOptimizer(optimizer, loss_scale='dynamic') logging.info('Using {} encoder with vocab size: {}'.format( hparams[HP_TOKEN_TYPE.name], vocab_size)) loss_fn = get_loss_fn( reduction_factor=hparams[HP_TIME_REDUCT_FACTOR.name]) decode_fn = decoding.greedy_decode_fn(model, hparams) accuracy_fn = metrics.build_accuracy_fn(decode_fn) wer_fn = metrics.build_wer_fn(decode_fn, idx_to_text) encoder = model.layers[2] prediction_network = model.layers[3] encoder.summary() prediction_network.summary() model.summary() dev_dataset = None if FLAGS.eval_size != 0: dev_dataset = get_dataset(FLAGS.data_dir, 'dev', batch_size=FLAGS.batch_size, n_epochs=FLAGS.n_epochs, strategy=strategy, max_size=FLAGS.eval_size) log_dir = os.path.join(FLAGS.tb_log_dir, datetime.now().strftime('%Y%m%d-%H%M%S')) with tf.summary.create_file_writer(log_dir).as_default(): hp.hparams(tb_hparams) if FLAGS.mode == 'train': train_dataset = get_dataset(FLAGS.data_dir, 'train', batch_size=FLAGS.batch_size, n_epochs=FLAGS.n_epochs, strategy=strategy) os.makedirs(FLAGS.output_dir, exist_ok=True) checkpoint_template = os.path.join(FLAGS.output_dir, 'checkpoint_{step}_{val_loss:.4f}.hdf5') run_training( model=model, optimizer=optimizer, loss_fn=loss_fn, train_dataset=train_dataset, batch_size=FLAGS.batch_size, n_epochs=FLAGS.n_epochs, checkpoint_template=checkpoint_template, hparams=hparams, strategy=strategy, steps_per_log=FLAGS.steps_per_log, steps_per_checkpoint=FLAGS.steps_per_checkpoint, eval_dataset=dev_dataset, train_metrics=[], eval_metrics=[accuracy_fn, wer_fn]) elif FLAGS.mode == 'eval' or FLAGS.mode == 'test': if FLAGS.checkpoint is None: raise Exception('You must provide a checkpoint to perform eval.') if FLAGS.mode == 
'test': dataset = get_dataset(FLAGS.data_dir, 'test', batch_size=FLAGS.batch_size, n_epochs=FLAGS.n_epochs) else: dataset = dev_dataset eval_start_time = time.time() eval_loss, eval_metrics_results = run_evaluate( model=model, optimizer=optimizer, loss_fn=loss_fn, eval_dataset=dataset, batch_size=FLAGS.batch_size, hparams=hparams, strategy=strategy, metrics=[accuracy_fn, wer_fn]) validation_log_str = 'VALIDATION RESULTS: Time: {:.4f}, Loss: {:.4f}'.format( time.time() - eval_start_time, eval_loss) for metric_name, metric_result in eval_metrics_results.items(): validation_log_str += ', {}: {:.4f}'.format(metric_name, metric_result) print(validation_log_str) if __name__ == '__main__': # tf.config.experimental_run_functions_eagerly(True) flags.mark_flag_as_required('mode') flags.mark_flag_as_required('data_dir') app.run(main) ================================================ FILE: scripts/build_rnnt.sh ================================================ cp cmake/warp-rnnt-cmakelist.txt warp-transducer/CMakeLists.txt cd warp-transducer mkdir build cd build CC=gcc-4.8 CXX=g++-4.8 cmake \ -DCUDA_TOOLKIT_ROOT_DIR=$CUDA_HOME .. 
make cd ../tensorflow_binding python setup.py install cd ../../ ================================================ FILE: scripts/common_voice_convert.sh ================================================ #!/bin/bash OIFS="$IFS" IFS=$'\n' FORMAT=.mp3 DATA_DIR="$1" N=${2:-1} mkdir -p $DATA_DIR FILES=$(ls "$DATA_DIR" | grep $FORMAT) thread () { local FILE_N=$1 FILENAME="${FILE_N:0:${#FILE_N}-4}" ffmpeg -i $DATA_DIR/$FILE_N -acodec pcm_s16le -ac 1 -ar 16000 $DATA_DIR/$FILENAME.wav rm $DATA_DIR/$FILE_N } for FILE in $FILES do ((i=i%N)); ((i++==0)) && wait thread "$FILE" & done IFS="$OIFS" ================================================ FILE: scripts/remove_missing_samples.py ================================================ from argparse import ArgumentParser import os def remove_missing(data_dir, fname, replace_old=True): clips_dir = os.path.join(data_dir, 'clips') old_filepath = os.path.join(data_dir, '{}.tsv'.format(fname)) new_filepath = os.path.join(data_dir, '{}-tmp.tsv'.format(fname)) with open(old_filepath, 'r') as old_f: with open(new_filepath, 'w') as new_f: new_f.write(next(old_f)) for line in old_f: audio_fn = line.split('\t')[1][:-4] + '.wav' if os.path.exists(os.path.join(clips_dir, audio_fn)): new_f.write(line) if replace_old: os.remove(old_filepath) os.rename(new_filepath, old_filepath) def main(args): tsv_files = ['dev', 'invalidated', 'other', 'test', 'train', 'validated'] for _file in tsv_files: remove_missing(args.data_dir, _file, replace_old=args.replace_old) print('Done.') def parse_args(): ap = ArgumentParser() ap.add_argument('--data_dir', required=True, type=str, help='Path to common voice data directory.') ap.add_argument('--replace_old', type=bool, default=False, help='Replace old tsv files with updated ones.') return ap.parse_args() if __name__ == '__main__': args = parse_args() main(args) ================================================ FILE: streaming_transcribe.py ================================================ from argparse import 
ArgumentParser import os import time import pyaudio import tensorflow as tf tf.get_logger().setLevel('ERROR') tf.autograph.set_verbosity(0) from utils import preprocessing, encoding, decoding from utils import model as model_utils from model import build_keras_model from hparams import * SAMPLE_RATE = 16000 NUM_CHANNELS = 1 CHUNK_SIZE = 1024 LAST_OUTPUT = '' def main(args): model_dir = os.path.dirname(os.path.realpath(args.checkpoint)) hparams = model_utils.load_hparams(model_dir) _, tok_to_text, vocab_size = encoding.get_encoder( encoder_dir=model_dir, hparams=hparams) hparams[HP_VOCAB_SIZE.name] = vocab_size model = build_keras_model(hparams, stateful=True) model.load_weights(args.checkpoint) decoder_fn = decoding.greedy_decode_fn(model, hparams) p = pyaudio.PyAudio() def listen_callback(in_data, frame_count, time_info, status): global LAST_OUTPUT audio = tf.io.decode_raw(in_data, out_type=tf.float32) log_melspec = preprocessing.preprocess_audio( audio=audio, sample_rate=SAMPLE_RATE, hparams=hparams) log_melspec = tf.expand_dims(log_melspec, axis=0) decoded = decoder_fn(log_melspec)[0] transcription = LAST_OUTPUT + tok_to_text(decoded)\ .numpy().decode('utf8') if transcription != LAST_OUTPUT: LAST_OUTPUT = transcription print(transcription) return in_data, pyaudio.paContinue stream = p.open( format=pyaudio.paFloat32, channels=NUM_CHANNELS, rate=SAMPLE_RATE, input=True, frames_per_buffer=CHUNK_SIZE, stream_callback=listen_callback) print('Listening...') stream.start_stream() while stream.is_active(): time.sleep(0.1) stream.stop_stream() stream.close() p.terminate() def parse_args(): ap = ArgumentParser() ap.add_argument('--checkpoint', type=str, required=True, help='Checkpoint to load.') return ap.parse_args() if __name__ == '__main__': args = parse_args() main(args) ================================================ FILE: transcribe_file.py ================================================ from argparse import ArgumentParser import os import tensorflow as tf 
tf.get_logger().setLevel('ERROR') tf.autograph.set_verbosity(0) from utils import preprocessing, encoding, decoding from utils import model as model_utils from model import build_keras_model from hparams import * def main(args): model_dir = os.path.dirname(os.path.realpath(args.checkpoint)) hparams = model_utils.load_hparams(model_dir) encode_fn, tok_to_text, vocab_size = encoding.get_encoder( encoder_dir=model_dir, hparams=hparams) hparams[HP_VOCAB_SIZE.name] = vocab_size model = build_keras_model(hparams) model.load_weights(args.checkpoint) audio, sr = preprocessing.tf_load_audio(args.input) log_melspec = preprocessing.preprocess_audio( audio=audio, sample_rate=sr, hparams=hparams) log_melspec = tf.expand_dims(log_melspec, axis=0) decoder_fn = decoding.greedy_decode_fn(model, hparams) decoded = decoder_fn(log_melspec)[0] transcription = tok_to_text(decoded) print('Transcription:', transcription.numpy().decode('utf8')) def parse_args(): ap = ArgumentParser() ap.add_argument('--checkpoint', type=str, required=True, help='Checkpoint to load.') ap.add_argument('-i', '--input', type=str, required=True, help='Wav file to transcribe.') return ap.parse_args() if __name__ == '__main__': args = parse_args() main(args) ================================================ FILE: utils/__init__.py ================================================ ================================================ FILE: utils/data/__init__.py ================================================ from . import common_voice ================================================ FILE: utils/data/common_voice.py ================================================ import os import tensorflow as tf from .. 
import preprocessing def tf_parse_line(line, data_dir): line_split = tf.strings.split(line, '\t') audio_fn = line_split[1] transcription = line_split[2] audio_filepath = tf.strings.join([data_dir, 'clips', audio_fn], '/') wav_filepath = tf.strings.substr(audio_filepath, 0, tf.strings.length(audio_filepath) - 4) + '.wav' audio, sr = preprocessing.tf_load_audio(wav_filepath) return audio, sr, transcription def load_dataset(base_path, name): filepath = os.path.join(base_path, '{}.tsv'.format(name)) dataset = tf.data.TextLineDataset([filepath]) dataset = dataset.skip(1) dataset = dataset.map(lambda line: tf_parse_line(line, base_path), num_parallel_calls=tf.data.experimental.AUTOTUNE) return dataset def texts_generator(base_path): # split_names = ['dev', 'train', 'test'] split_names = ['train'] for split_name in split_names: with open(os.path.join(base_path, '{}.tsv'.format(split_name)), 'r') as f: for line in f: transcription = line.split('\t')[2] yield transcription ================================================ FILE: utils/data/librispeech.py ================================================ import os import tensorflow as tf import soundfile as sf def load_audio(filepath): return sf.read(filepath) def tf_load_audio(filepath): return tf.py_function( lambda x: load_audio(x.numpy()), inp=[filepath], Tout=[tf.float32, tf.int32]) def tf_file_exists(filepath): return tf.py_function( lambda x: os.path.exists(x.numpy()), inp=[filepath], Tout=tf.bool) def tf_parse_line(line, data_dir, split_names): line_split = tf.strings.split(line, ' ') audio_fn = line_split[0] transcription = tf.py_function( lambda x: b' '.join(x.numpy()).decode('utf8'), inp=[line_split[1:]], Tout=tf.string) speaker_id, chapter_id, _ = tf.unstack(tf.strings.split(audio_fn, '-'), 3) all_fps = tf.map_fn( lambda split_name: tf.strings.join([data_dir, split_name, speaker_id, chapter_id, audio_fn], '/') + '.flac', tf.constant(split_names)) audio_filepath_idx = tf.where( tf.map_fn(tf_file_exists, all_fps, 
            dtype=tf.bool))[0][0]
    audio_filepath = all_fps[audio_filepath_idx]

    audio, sr = tf_load_audio(audio_filepath)

    return audio, sr, transcription


def get_transcript_files(base_path, split_names):
    # Walk <base_path>/<split>/<speaker>/<chapter> and collect every
    # "<speaker>-<chapter>.trans.txt" transcript file.
    transcript_files = []
    for split_name in split_names:
        for speaker_id in os.listdir(f'{base_path}/{split_name}'):
            if speaker_id == '.DS_Store':
                continue
            for chapter_id in os.listdir(f'{base_path}/{split_name}/{speaker_id}'):
                if chapter_id == '.DS_Store':
                    continue
                transcript_files.append(f'{base_path}/{split_name}/{speaker_id}/{chapter_id}/{speaker_id}-{chapter_id}.trans.txt')
    return transcript_files


def load_dataset(base_path, split_names):
    # Dataset of (audio, sample_rate, transcription) built from every line
    # of every transcript file in the given splits.
    transcript_filepaths = get_transcript_files(base_path, split_names)

    dataset = tf.data.TextLineDataset(transcript_filepaths)
    dataset = dataset.map(lambda line: tf_parse_line(line, base_path, split_names),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    return dataset


def texts_generator(base_path, split_names):
    # Yield transcription strings only; the first token of each line is the
    # utterance id and is dropped.
    transcript_filepaths = get_transcript_files(base_path, split_names)

    for fp in transcript_filepaths:
        with open(fp, 'r') as f:
            for line in f:
                line = line.strip('\n')
                transcription = ' '.join(line.split(' ')[1:])
                yield transcription

================================================ FILE: utils/decoding.py ================================================
import tensorflow as tf

from hparams import *


def joint(model, f, g):
    # RNN-T joint network: broadcast-add the encoder output `f` and the last
    # prediction-network step of `g`, then apply the model's top two dense
    # layers. Returns logits for the first time/label position only.
    dense_1 = model.layers[-2]
    dense_2 = model.layers[-1]

    joint_inp = (
        tf.expand_dims(f, axis=2) +            # [B, T, V] => [B, T, 1, V]
        tf.expand_dims(g[:, -1, :], axis=1))   # [B, U, V] => [B, 1, U, V]

    outputs = dense_1(joint_inp)
    outputs = dense_2(outputs)

    return outputs[:, 0, 0, :]


def greedy_decode_fn(model, hparams):
    # Build a graph-compiled greedy RNN-T decoder for `model`.
    # NOTE: Only the first input is decoded

    # NOTE(review): assumes encoder at model.layers[2] and prediction
    # network at model.layers[3] — confirm against build_keras_model.
    encoder = model.layers[2]
    prediction_network = model.layers[3]

    # Token id 0 doubles as the blank symbol and the start token.
    start_token = tf.constant([0])

    feat_size = hparams[HP_MEL_BINS.name] * hparams[HP_DOWNSAMPLE_FACTOR.name]

    @tf.function(input_signature=[
        tf.TensorSpec(shape=[None, None, feat_size], dtype=tf.float32),
        tf.TensorSpec(shape=[], dtype=tf.int32)])
    def greedy_decode(inputs, max_length=None):
        # Decode only the first utterance of the batch.
        inputs = tf.expand_dims(inputs[0], axis=0)
        encoded = encoder(inputs, training=False)
        enc_length = tf.shape(encoded)[1]

        i_0 = tf.constant(0)
        outputs_0 = tf.expand_dims(start_token, axis=0)
        max_reached_0 = tf.constant(False)

        # Outer loop: advance over encoder time steps until exhausted or the
        # max output length is reached.
        time_cond = lambda i, outputs, max_reached: tf.logical_and(
            i < enc_length, tf.logical_not(max_reached))

        def time_step_body(i, outputs, max_reached):
            inp_enc = tf.expand_dims(encoded[:, i, :], axis=1)

            _outputs_0 = outputs
            _max_reached_0 = max_reached
            dec_end_0 = tf.constant(False)

            # Inner loop: emit symbols at this time step until the joint
            # network predicts blank (id 0).
            dec_cond = lambda _outputs, _max_reached, dec_end: tf.logical_and(
                tf.logical_not(dec_end), tf.logical_not(_max_reached))

            def dec_step_body(_outputs, _max_reached, dec_end):
                pred_out = prediction_network(_outputs, training=False)
                preds = joint(model, inp_enc, pred_out)[0]
                preds = tf.nn.log_softmax(preds)
                predicted_id = tf.cast(
                    tf.argmax(preds, axis=-1), dtype=tf.int32)

                # Blank ends this time step; otherwise append the symbol.
                # (Tensor conditionals here rely on autograph conversion.)
                if predicted_id == 0:
                    dec_end = True
                else:
                    _outputs = tf.concat([_outputs, [[predicted_id]]], axis=1)

                # +1 accounts for the start token still in `_outputs`.
                if max_length is not None and tf.shape(_outputs)[1] >= max_length + 1:
                    _max_reached = True

                return _outputs, _max_reached, dec_end

            _outputs, _max_reached, _ = tf.while_loop(
                dec_cond, dec_step_body,
                loop_vars=[_outputs_0, _max_reached_0, dec_end_0],
                shape_invariants=[
                    tf.TensorShape([1, None]),
                    _max_reached_0.get_shape(),
                    dec_end_0.get_shape()
                ])

            return i + 1, _outputs, _max_reached

        _, outputs, _ = tf.while_loop(
            time_cond, time_step_body,
            loop_vars=[i_0, outputs_0, max_reached_0],
            shape_invariants=[
                i_0.get_shape(),
                tf.TensorShape([1, None]),
                max_reached_0.get_shape()
            ])

        # Drop the start token before returning.
        final_outputs = outputs[:, 1:]

        # output_ids = tf.argmax(final_outputs, axis=-1)
        # return tf.cast(output_ids, dtype=tf.int32)
        return tf.cast(final_outputs, dtype=tf.int32)

    return greedy_decode


# def greedy_decode():
#     # NOTE: Only the first input is decoded
#     y_pred = y_pred[0]
#     # Add blank at end for decoding
#     pred_len = tf.shape(y_pred)[0]
#     y_pred = tf.concat([y_pred,
#                         tf.fill([pred_len, 1], 0)],
#                        axis=1)
#     def _loop_body(_y_pred, _decoded):
#         first_blank_idx = tf.cast(tf.where(
#             tf.equal(_y_pred[0], 0)), dtype=tf.int32)
#         has_blank = tf.not_equal(tf.size(first_blank_idx), 0)
#         dec_idx = first_blank_idx[0][0]
#         decoded = _y_pred[0][:dec_idx]
#         n_dec = tf.shape(decoded)[0]
#         _decoded = tf.concat([_decoded, decoded],
#                              axis=0)
#         return _y_pred[1:, n_dec:], _decoded
#     decoded_0 = tf.constant([], dtype=tf.int32)
#     _, decoded = tf.while_loop(
#         lambda _y_pred, _decoded: tf.not_equal(tf.size(_y_pred), 0),
#         _loop_body,
#         [y_pred, decoded_0],
#         shape_invariants=[tf.TensorShape([None, None]), tf.TensorShape([None])],
#         name='greedy_decode')
#     return tf.expand_dims(decoded, axis=0)

# a = tf.constant([
#     [
#         [1, 4, 4, 4, 4, 3, 2],
#         [0, 0, 0, 0, 0, 0, 0],
#         [0, 0, 1, 0, 0, 0, 0],
#         [0, 0, 0, 4, 1, 4, 0]
#     ]
# ])

# tf.config.experimental_run_functions_eagerly(True)

# a = tf.zeros((4, 100, 80))
# print(a)

# import sys
# import os
# FILE_DIR = os.path.dirname(os.path.realpath(__file__))
# sys.path = [os.path.join(FILE_DIR, '..')] + sys.path

# from model import build_keras_model
# from hparams import *

# hparams = {
#     HP_TOKEN_TYPE: HP_TOKEN_TYPE.domain.values[1],
#     # Preprocessing
#     HP_MEL_BINS: HP_MEL_BINS.domain.values[0],
#     HP_FRAME_LENGTH: HP_FRAME_LENGTH.domain.values[0],
#     HP_FRAME_STEP: HP_FRAME_STEP.domain.values[0],
#     HP_HERTZ_LOW: HP_HERTZ_LOW.domain.values[0],
#     HP_HERTZ_HIGH: HP_HERTZ_HIGH.domain.values[0],
#     # Model
#     HP_EMBEDDING_SIZE: HP_EMBEDDING_SIZE.domain.values[0],
#     HP_ENCODER_LAYERS: HP_ENCODER_LAYERS.domain.values[0],
#     HP_ENCODER_SIZE: HP_ENCODER_SIZE.domain.values[0],
#     HP_PROJECTION_SIZE: HP_PROJECTION_SIZE.domain.values[0],
#     HP_TIME_REDUCT_INDEX: HP_TIME_REDUCT_INDEX.domain.values[0],
#     HP_TIME_REDUCT_FACTOR: HP_TIME_REDUCT_FACTOR.domain.values[0],
#     HP_PRED_NET_LAYERS: HP_PRED_NET_LAYERS.domain.values[0],
#     HP_PRED_NET_SIZE: HP_PRED_NET_SIZE.domain.values[0],
#     HP_JOINT_NET_SIZE:
HP_JOINT_NET_SIZE.domain.values[0], # HP_LEARNING_RATE: HP_LEARNING_RATE.domain.values[0] # } # hparams = {k.name: v for k, v in hparams.items()} # hparams['vocab_size'] = 73 # model = build_keras_model(hparams) # greedy_decode = greedy_decode_fn(model) # print(greedy_decode(a, max_length=20)) ================================================ FILE: utils/encoding.py ================================================ import os import tensorflow_datasets as tfds import tensorflow as tf from hparams import * from . import vocabulary, preprocessing def build_lookup_table(keys, values=None, default_value=-1): if values is None: values = tf.range(len(keys)) kv_init = tf.lookup.KeyValueTensorInitializer( keys=keys, values=values) return tf.lookup.StaticHashTable(kv_init, default_value=default_value) def wordpiece_encode(text, encoder): return tf.constant(encoder.encode(text.numpy()), dtype=tf.int32) def tf_wordpiece_encode(text, encoder): return tf.py_function(lambda x: wordpiece_encode(x, encoder), inp=[text], Tout=tf.int32) def wordpiece_decode(ids, encoder): return tf.constant(encoder.decode(ids.numpy())) def tf_wordpiece_decode(ids, encoder): return tf.py_function(lambda x: wordpiece_decode(x, encoder), inp=[ids], Tout=[tf.string])[0] def tf_vocab_encode(text, vocab_table): tokens = tf.strings.bytes_split(text) return vocab_table.lookup(tokens) def get_encoder(encoder_dir, hparams, texts_generator=None): def preprocessed_gen(): if texts_generator is None: return for x in texts_generator: yield preprocessing.normalize_text(x) if hparams[HP_TOKEN_TYPE.name] == 'character': vocab = vocabulary.init_vocab() vocab_table = build_lookup_table(vocab, default_value=0) vocab_size = len(vocab) encoder_fn = lambda text: tf_vocab_encode(text, vocab_table) decoder_fn = None elif hparams[HP_TOKEN_TYPE.name] == 'word-piece': encoder_filename = 'encoder' encoder_filepath = os.path.join(encoder_dir, encoder_filename) if os.path.exists('{}.subwords'.format(encoder_filepath)): encoder = 
tfds.core.features.text.SubwordTextEncoder.load_from_file(encoder_filepath) else: encoder = tfds.core.features.text.SubwordTextEncoder.build_from_corpus( corpus_generator=preprocessed_gen(), target_vocab_size=hparams[HP_VOCAB_SIZE.name]) os.makedirs(encoder_dir, exist_ok=True) encoder.save_to_file(encoder_filepath) vocab_size = encoder.vocab_size encoder_fn = lambda text: tf_wordpiece_encode(text, encoder) decoder_fn = lambda ids: tf_wordpiece_decode(ids, encoder) return encoder_fn, decoder_fn, vocab_size ================================================ FILE: utils/loss.py ================================================ from absl import logging import tensorflow as tf _has_loss_func = False try: from warprnnt_tensorflow import rnnt_loss _has_loss_func = True except ImportError: pass def get_loss_fn(reduction_factor): def _fallback_loss(y_true, y_pred, spec_lengths, label_lengths): logging.info('RNN-T loss function not found.') return y_pred if not _has_loss_func: return _fallback_loss def _loss_fn(y_true, y_pred, spec_lengths, label_lengths): y_true = tf.cast(y_true, dtype=tf.int32) if not tf.test.is_built_with_cuda(): y_pred = tf.nn.log_softmax(y_pred) spec_lengths = tf.cast( tf.math.ceil(spec_lengths / reduction_factor), dtype=tf.int32) loss = rnnt_loss(y_pred, y_true, spec_lengths, label_lengths) return loss return _loss_fn ================================================ FILE: utils/metrics.py ================================================ import tensorflow as tf from . 
import decoding def error_rate(y_true, decoded): y_true_shape = tf.shape(y_true) decoded_shape = tf.shape(decoded) max_length = tf.maximum(y_true_shape[-1], decoded_shape[-1]) if y_true.dtype == tf.string: truth = string_to_sparse(y_true) else: truth = tf.sparse.from_dense(y_true) if decoded.dtype == tf.string: hypothesis = string_to_sparse(decoded) else: hypothesis = tf.sparse.from_dense(decoded) err = tf.edit_distance(hypothesis, truth, normalize=False) err_norm = err / tf.cast(max_length, dtype=tf.float32) return err_norm def string_to_sparse(str_tensor): orig_shape = tf.cast(tf.shape(str_tensor), dtype=tf.int64) str_tensor = tf.squeeze(str_tensor, axis=0) indices = tf.concat([tf.zeros((orig_shape[-1], 1), dtype=tf.int64), tf.expand_dims(tf.range(0, orig_shape[-1]), axis=-1)], axis=1) return tf.SparseTensor(indices=indices, values=str_tensor, dense_shape=orig_shape) def token_error_rate(y_true, decoded, tok_fn, idx_to_text): text_true = idx_to_text(y_true) text_pred = idx_to_text(decoded) text_true.set_shape(()) text_pred.set_shape(()) tok_true = tok_fn(text_true) tok_pred = tok_fn(text_pred) tok_true = tf.expand_dims(tok_true, axis=0) tok_pred = tf.expand_dims(tok_pred, axis=0) return error_rate(tok_true, tok_pred) def build_accuracy_fn(decode_fn): def Accuracy(inputs, y_true): # Decode functions only returns first result y_true = tf.expand_dims(y_true[0], axis=0) max_length = tf.shape(y_true)[1] decoded = decode_fn(inputs, max_length=max_length) return 1 - error_rate(y_true, decoded) return Accuracy def build_wer_fn(decode_fn, idx_to_text): def WER(inputs, y_true): # Decode functions only returns first result y_true = y_true[0] max_length = tf.shape(y_true)[0] decoded = decode_fn(inputs, max_length=max_length)[0] return token_error_rate(y_true, decoded, tok_fn=lambda t: tf.strings.split(t, sep=' '), idx_to_text=idx_to_text) return WER ================================================ FILE: utils/model.py ================================================ from 
absl import logging import os import json import re from model import build_keras_model def load_hparams(model_dir): with open(os.path.join(model_dir, 'hparams.json'), 'r') as f: return json.load(f) def save_hparams(hparams, model_dir): with open(os.path.join(model_dir, 'hparams.json'), 'w') as f: json.dump(hparams, f) ================================================ FILE: utils/preprocessing.py ================================================ import glob import os import librosa.display import librosa import matplotlib.pyplot as plt import tensorflow as tf import numpy as np from hparams import * def tf_load_audio(path, pre_emphasis=0.97): audio_raw = tf.io.read_file(path) audio, sr = tf.audio.decode_wav(audio_raw) if tf.rank(audio) > 1: audio = audio[:, 0] return audio, sr def normalize_text(text): text = text.lower() text = text.replace('"', '') return text def tf_normalize_text(text): return tf.py_function( lambda x: normalize_text(x.numpy().decode('utf8')), inp=[text], Tout=tf.string) def print_tensor(t, template='{}'): return tf.py_function( lambda x: print(template.format(x.numpy())), inp=[t], Tout=[]) def compute_mel_spectrograms(audio_arr, sample_rate, n_mel_bins, frame_length, frame_step, hertz_low, hertz_high): sample_rate_f = tf.cast(sample_rate, dtype=tf.float32) frame_length = tf.cast(tf.round(sample_rate_f * frame_length), dtype=tf.int32) frame_step = tf.cast(tf.round(sample_rate_f * frame_step), dtype=tf.int32) stfts = tf.signal.stft(audio_arr, frame_length=frame_length, frame_step=frame_step) mag_specs = tf.abs(stfts) num_spec_bins = tf.shape(mag_specs)[-1] linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix( num_mel_bins=n_mel_bins, num_spectrogram_bins=num_spec_bins, sample_rate=sample_rate_f, lower_edge_hertz=hertz_low, upper_edge_hertz=hertz_high) mel_specs = tf.tensordot(mag_specs, linear_to_mel_weight_matrix, 1) mel_specs.set_shape(mag_specs.shape[:-1].concatenate( linear_to_mel_weight_matrix.shape[-1:])) log_mel_specs = 
tf.math.log(mel_specs + 1e-6) log_mel_specs -= (tf.reduce_mean(log_mel_specs, axis=0) + 1e-8) return log_mel_specs def downsample_spec(mel_spec, n=3): spec_shape = tf.shape(mel_spec) spec_length, feat_size = spec_shape[0], spec_shape[1] trimmed_length = (spec_length // n) * n trimmed_spec = mel_spec[:trimmed_length] spec_sampled = tf.reshape(trimmed_spec, (-1, feat_size * n)) return spec_sampled def load_dataset(data_dir, name): filenames = glob.glob(os.path.join(data_dir, '{}.tfrecord'.format(name))) raw_dataset = tf.data.TFRecordDataset(filenames) parsed_dataset = raw_dataset.map(parse_example, num_parallel_calls=tf.data.experimental.AUTOTUNE) return parsed_dataset def parse_example(serialized_example): parse_dict = { 'mel_specs': tf.io.FixedLenFeature([], tf.string), 'pred_inp': tf.io.FixedLenFeature([], tf.string), 'spec_lengths': tf.io.FixedLenFeature([], tf.string), 'label_lengths': tf.io.FixedLenFeature([], tf.string), 'labels': tf.io.FixedLenFeature([], tf.string), } example = tf.io.parse_single_example(serialized_example, parse_dict) mel_specs = tf.io.parse_tensor(example['mel_specs'], out_type=tf.float32) pred_inp = tf.io.parse_tensor(example['pred_inp'], out_type=tf.int32) spec_lengths = tf.io.parse_tensor(example['spec_lengths'], out_type=tf.int32) label_lengths = tf.io.parse_tensor(example['label_lengths'], out_type=tf.int32) labels = tf.io.parse_tensor(example['labels'], out_type=tf.int32) return (mel_specs, pred_inp, spec_lengths, label_lengths, labels) def serialize_example(mel_specs, pred_inp, spec_lengths, label_lengths, labels): def _bytes_feature(value): """Returns a bytes_list from a string / byte.""" if isinstance(value, type(tf.constant(0))): # if value ist tensor value = value.numpy() # get value of tensor return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) mel_specs_s = tf.io.serialize_tensor(mel_specs) pred_inp_s = tf.io.serialize_tensor(pred_inp) spec_lengths_s = tf.io.serialize_tensor(spec_lengths) label_lengths_s = 
tf.io.serialize_tensor(label_lengths) labels_s = tf.io.serialize_tensor(labels) feature = { 'mel_specs': _bytes_feature(mel_specs_s), 'pred_inp': _bytes_feature(pred_inp_s), 'spec_lengths': _bytes_feature(spec_lengths_s), 'label_lengths': _bytes_feature(label_lengths_s), 'labels': _bytes_feature(labels_s) } example = tf.train.Example(features=tf.train.Features(feature=feature)) return example.SerializeToString() def tf_serialize_example(mel_specs, pred_inp, spec_lengths, label_lengths, labels): tf_string = tf.py_function( serialize_example, (mel_specs, pred_inp, spec_lengths, label_lengths, labels), tf.string) return tf.reshape(tf_string, ()) def preprocess_text(text, encoder_fn, vocab_size): norm_text = tf_normalize_text(text) enc_text = encoder_fn(norm_text) enc_padded = tf.concat([[0], enc_text], axis=0) return enc_text, enc_padded def plot_spec(spec, sr, transcription, name): spec_db = librosa.amplitude_to_db(spec, ref=np.max) plt.figure(figsize=(12,4)) librosa.display.specshow(spec_db, sr=sr, x_axis='time', y_axis='mel', hop_length=sr * 0.01) plt.colorbar(format='%+02.0f dB') plt.savefig('figs/{}.png'.format(name)) plt.clf() def tf_plot_spec(spec, sr, transcription, name): spec_t = tf.transpose(spec) return tf.py_function( lambda _spec, _sr, trans: plot_spec( _spec.numpy(), _sr.numpy(), trans.numpy().decode('utf8'), name ), inp=[spec_t, sr, transcription], Tout=[]) def plot_audio(audio_arr, sr, trans, name): with open('figs/trans.txt', 'a') as f: f.write('{} {}\n'.format(name, trans)) t = np.linspace(0, audio_arr.shape[0] / sr, num=audio_arr.shape[0]) plt.figure(1) plt.plot(t, audio_arr) plt.savefig('figs/{}.png'.format(name)) plt.clf() def tf_plot_audio(audio_arr, sr, trans, name): return tf.py_function( lambda _audio, _sr, _trans: plot_audio( _audio.numpy(), _sr.numpy(), _trans.numpy(), name ), inp=[audio_arr, sr, trans], Tout=[]) def preprocess_audio(audio, sample_rate, hparams): log_melspec = compute_mel_spectrograms( audio_arr=audio, 
sample_rate=sample_rate, n_mel_bins=hparams[HP_MEL_BINS.name], frame_length=hparams[HP_FRAME_LENGTH.name], frame_step=hparams[HP_FRAME_STEP.name], hertz_low=hparams[HP_HERTZ_LOW.name], hertz_high=hparams[HP_HERTZ_HIGH.name]) downsampled_spec = downsample_spec(log_melspec) return downsampled_spec def preprocess_dataset(dataset, encoder_fn, hparams, max_length=0, save_plots=False): _dataset = dataset if max_length > 0: _dataset = _dataset.filter(lambda audio, sr, trans: ( tf.shape(audio)[0] <= sr * tf.constant(max_length))) if save_plots: os.makedirs('figs', exist_ok=True) for i, (audio_arr, sr, trans) in enumerate(_dataset.take(5)): tf_plot_audio(audio_arr, sr, trans, 'audio_{}'.format(i)) _dataset = _dataset.map(lambda audio, sr, trans: ( preprocess_audio( audio=audio, sample_rate=sr, hparams=hparams), sr, *preprocess_text(trans, encoder_fn=encoder_fn, vocab_size=hparams[HP_VOCAB_SIZE.name]), trans ), num_parallel_calls=tf.data.experimental.AUTOTUNE) if save_plots: for i, (log_melspec, sr, _, _, trans) in enumerate(_dataset.take(5)): tf_plot_spec(log_melspec, sr, trans, 'input_{}'.format(i)) _dataset = _dataset.map( lambda log_melspec, sr, labels, pred_inp, trans: ( log_melspec, pred_inp, tf.shape(log_melspec)[0], tf.shape(labels)[0], labels ), num_parallel_calls=tf.data.experimental.AUTOTUNE) _dataset = _dataset.map(tf_serialize_example) return _dataset ================================================ FILE: utils/vocabulary.py ================================================ def init_vocab(): alphabet = "abcdefghijklmnopqrstuvwxyz'" alphabet_c = ['', ' ', '', ''] + [c for c in alphabet] return alphabet_c def load_vocab(filepath): vocab = [] with open(filepath, 'r') as f: for line in f: line = line.strip().strip('\n') if line == '': line = '' elif line == '': line = ' ' vocab.append(line) return vocab def save_vocab(vocab, filepath): with open(filepath, 'w') as f: for c in vocab: if c == '': c = '' elif c == ' ': c = '' f.write('{}\n'.format(c))