Repository: SeanNaren/deepspeech.torch
Branch: master
Commit: 26d24fa5a805
Files: 31
Total size: 58.2 KB
Directory structure:
gitextract_kmme0l_o/
├── .gitignore
├── .travis.yml
├── BatchBRNN.lua
├── BatchBRNNReLU.lua
├── DeepSpeechModel.lua
├── LICENSE.md
├── Loader.lua
├── MakeLMDB.lua
├── Mapper.lua
├── ModelEvaluator.lua
├── Network.lua
├── Predict.lua
├── README.md
├── SequenceError.lua
├── Test.lua
├── Train.lua
├── UtilsMultiGPU.lua
├── dictionary
├── doc/
│ ├── DeepSpeechModel.md
│ ├── Loader.md
│ ├── Mapper.md
│ ├── ModelEvaluator.md
│ ├── Network.md
│ ├── SequenceError.md
│ ├── UtilsMultiGPU.md
│ └── index.md
├── mkdocs.yml
├── prepare_datasets/
│ ├── FormatAN4.lua
│ └── FormatLibriSpeech.lua
└── tests/
├── test.lua
└── test_dictionary
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
.idea/
Audio
systemtests/
systemtests
CTC.iml
CTCSpeechRecognition.iml
*.log
*.log.eps
*.t7
Seq2Seq/
================================================
FILE: .travis.yml
================================================
language: c
notifications:
email: false
compiler:
- gcc
- clang
cache:
directories:
- $HOME/OpenBlasInstall
sudo: false
env:
- TORCH_LUA_VERSION=LUAJIT21
- TORCH_LUA_VERSION=LUA51
- TORCH_LUA_VERSION=LUA52
addons:
apt:
packages:
- cmake
- gfortran
- gcc-multilib
- gfortran-multilib
- liblapack-dev
- build-essential
- gcc
- g++
- curl
- cmake
- libreadline-dev
- git-core
- libqt4-core
- libqt4-gui
- libqt4-dev
- libjpeg-dev
- libpng-dev
- ncurses-dev
- imagemagick
- libzmq3-dev
- gfortran
- unzip
- gnuplot
- gnuplot-x11
before_script:
- export ROOT_TRAVIS_DIR=$(pwd)
- export INSTALL_PREFIX=~/torch/install
- ls $HOME/OpenBlasInstall/lib || (cd /tmp/ && git clone https://github.com/xianyi/OpenBLAS.git -b master && cd OpenBLAS && (make NO_AFFINITY=1 -j$(getconf _NPROCESSORS_ONLN) 2>/dev/null >/dev/null) && make PREFIX=$HOME/OpenBlasInstall install)
- git clone https://github.com/torch/distro.git ~/torch --recursive
- cd ~/torch && git submodule update --init --recursive
- mkdir build && cd build
- export CMAKE_LIBRARY_PATH=$HOME/OpenBlasInstall/include:$HOME/OpenBlasInstall/lib:$CMAKE_LIBRARY_PATH
- cmake .. -DCMAKE_INSTALL_PREFIX="${INSTALL_PREFIX}" -DCMAKE_BUILD_TYPE=Release -DWITH_${TORCH_LUA_VERSION}=ON
- make && make install
- cd $ROOT_TRAVIS_DIR
- export LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib:$LD_LIBRARY_PATH
- ${INSTALL_PREFIX}/bin/luarocks install nn
- ${INSTALL_PREFIX}/bin/luarocks install dpnn
script:
- cd ${ROOT_TRAVIS_DIR}
- git clone https://github.com/SeanNaren/CTCSpeechRecognition.git
- cd CTCSpeechRecognition/tests
- rsync -av --progress ../* ${INSTALL_PREFIX}/share/lua/5.1/ --exclude ../tests --exclude ../prepare_an4/
- rsync -av --progress ../* ${INSTALL_PREFIX}/share/lua/5.2/ --exclude ../tests --exclude ../prepare_an4/
- export PATH=${INSTALL_PREFIX}/bin:$PATH
- export TESTLUA=$(which luajit lua | head -n 1)
- echo ${TESTLUA}
- ${TESTLUA} test.lua
================================================
FILE: BatchBRNN.lua
================================================
------------------------------------------------------------------------
--[[ BatchBRNN ]] --
-- Adds sequence-wise batch normalization to cudnn RNN modules.
-- For a simple RNN: ht = ReLU(B(Wixt) + Riht-1 + bRi) where B
-- is the batch normalization.
-- Expects size seqLength x minibatch x inputDim.
-- Returns seqLength x minibatch x outputDim.
-- Can specify an rnnModule such as cudnn.LSTM (defaults to RNNReLU).
------------------------------------------------------------------------
-- BatchBRNN: bidirectional RNN with sequence-wise batch normalization,
-- assembled as an nn.Sequential pipeline (see header comment above).
local BatchBRNN, parent = torch.class('cudnn.BatchBRNN', 'nn.Sequential')

--- Build the Linear -> BN -> bidirectional-RNN pipeline.
-- @param inputDim  feature size of each timestep fed to the module
-- @param outputDim hidden size of the RNN (also the module's output width)
function BatchBRNN:__init(inputDim, outputDim)
    parent.__init(self)
    -- The view sizes given here are placeholders; they are resized to the
    -- actual (seqLength, batch) shape on every forward in updateOutput.
    self.view_in = nn.View(1, 1, -1):setNumInputDims(3)
    self.view_out = nn.View(1, -1):setNumInputDims(2)
    -- SKIP_INPUT: the RNN applies no input projection of its own, so the
    -- nn.Linear below must already produce outputDim features.
    self.rnn = cudnn.RNN(outputDim, outputDim, 1)
    local rnn = self.rnn
    rnn.inputMode = 'CUDNN_SKIP_INPUT'
    rnn.bidirectional = 'CUDNN_BIDIRECTIONAL'
    rnn.numDirections = 2
    rnn:reset() -- re-allocate weights now that the mode flags changed
    self:add(self.view_in)                          -- flatten to (T*N) x inputDim
    self:add(nn.Linear(inputDim, outputDim, false)) -- no bias; BN supplies the shift
    self:add(nn.BatchNormalization(outputDim))      -- sequence-wise BN over all T*N rows
    self:add(self.view_out)                         -- back to T x N x outputDim
    self:add(rnn)                                   -- yields T x N x (2*outputDim)
    -- Separate the two directions and sum them so the output stays outputDim wide.
    self:add(nn.View(-1, 2, outputDim):setNumInputDims(2))
    self:add(nn.Sum(3))
end
--- Forward pass: resize the internal views to the current batch shape, then
-- delegate to nn.Sequential.
-- @param input tensor of shape seqLength x miniBatch x inputDim
-- @return tensor of shape seqLength x miniBatch x outputDim
function BatchBRNN:updateOutput(input)
    local T, N = input:size(1), input:size(2)
    self.view_in:resetSize(T * N, -1)  -- merge time and batch for Linear/BN
    self.view_out:resetSize(T, N, -1)  -- restore time-major layout for the RNN
    return parent.updateOutput(self, input)
end
--- Human-readable description of the wrapped module chain, with nested
-- module descriptions indented one level.
function BatchBRNN:__tostring__()
    local tab = ' '
    local line = '\n'
    local arrow = ' -> ' -- renamed from `next` to avoid shadowing the builtin
    local parts = { 'BatchBRNN', ' {', line, tab, '[input' }
    for idx = 1, #self.modules do
        parts[#parts + 1] = arrow .. '(' .. idx .. ')'
    end
    parts[#parts + 1] = arrow .. 'output]'
    for idx = 1, #self.modules do
        -- parentheses truncate gsub to its first return value (the string)
        parts[#parts + 1] = line .. tab .. '(' .. idx .. '): '
            .. (tostring(self.modules[idx]):gsub(line, line .. tab))
    end
    parts[#parts + 1] = line .. '}'
    return table.concat(parts)
end
================================================
FILE: BatchBRNNReLU.lua
================================================
require 'BatchBRNN'
------------------------------------------------------------------------
--[[ BatchBRNNReLU ]] --
-- Based On BatchBRNN. Adds ClippedReLU non-linearity to Vanilla BRNN.
------------------------------------------------------------------------
local BatchBRNNReLU, parent = torch.class('cudnn.BatchBRNNReLU', 'cudnn.BatchBRNN')

--- Same pipeline as BatchBRNN but with a plain ReLU RNN cell and a clipped
-- ReLU (clamp to [0, 20], as in DeepSpeech2) inserted after the RNN.
-- @param inputDim  feature size of each timestep
-- @param outputDim hidden size of the RNN
function BatchBRNNReLU:__init(inputDim, outputDim)
    parent.__init(self, inputDim, outputDim)
    local rnn = self.rnn
    rnn.mode = 'CUDNN_RNN_RELU'
    rnn:reset() -- re-initialise weights for the new cell type
    -- Position 6 places the clamp directly after the cudnn.RNN module in the
    -- parent's Sequential (view_in, Linear, BN, view_out, rnn, <here>, ...).
    self:insert(nn.Clamp(0, 20), 6)
end
================================================
FILE: DeepSpeechModel.lua
================================================
require 'UtilsMultiGPU'
--- Construct one recurrent layer appropriate for the configured device.
-- GPU: either a cudnn BLSTM (forward/backward activations summed) or the
-- batch-normalised ReLU BRNN. CPU: falls back to the rnn package's SeqBRNN.
-- All variants consume and produce seqLength x batch x features tensors.
-- @param inputDim  input feature size
-- @param hiddenDim hidden (and output) feature size
-- @param opt       options table; reads opt.nGPU and opt.LSTM
local function RNNModule(inputDim, hiddenDim, opt)
    if opt.nGPU > 0 then
        if opt.LSTM then
            local blstm = nn.Sequential()
            blstm:add(cudnn.BLSTM(inputDim, hiddenDim, 1))
            -- Split the two directions and sum them so the layer's output
            -- width stays hiddenDim.
            blstm:add(nn.View(-1, 2, hiddenDim):setNumInputDims(2)) -- have to sum activations
            blstm:add(nn.Sum(3))
            return blstm
        else
            require 'BatchBRNNReLU'
            return cudnn.BatchBRNNReLU(inputDim, hiddenDim)
        end
    else
        require 'rnn'
        return nn.SeqBRNN(inputDim, hiddenDim)
    end
end
-- Creates the convnet+rnn structure.
--- Assemble the full DeepSpeech2-style network:
-- two strided convolutions -> stack of bidirectional RNNs -> linear classifier.
-- @param opt options table; reads hiddenSize, nbOfHiddenLayers, nGPU, LSTM
-- @return model taking batch x 1 x freq x time spectrograms and returning
--         batch x seqLength x 29 class activations (multi-GPU wrapped)
local function deepSpeech(opt)
    local conv = nn.Sequential()
    -- (nInputPlane, nOutputPlane, kW, kH, [dW], [dH], [padW], [padH]) conv layers.
    conv:add(nn.SpatialConvolution(1, 32, 11, 41, 2, 2))
    conv:add(nn.SpatialBatchNormalization(32))
    conv:add(nn.Clamp(0, 20)) -- clipped ReLU, as in DeepSpeech2
    conv:add(nn.SpatialConvolution(32, 32, 11, 21, 2, 1))
    conv:add(nn.SpatialBatchNormalization(32))
    conv:add(nn.Clamp(0, 20))
    local rnnInputsize = 32 * 41 -- based on the above convolutions and 16khz audio.
    local rnnHiddenSize = opt.hiddenSize -- size of rnn hidden layers
    local nbOfHiddenLayers = opt.nbOfHiddenLayers
    conv:add(nn.View(rnnInputsize, -1):setNumInputDims(3)) -- batch x features x seqLength
    conv:add(nn.Transpose({ 2, 3 }, { 1, 2 })) -- seqLength x batch x features
    local rnns = nn.Sequential()
    -- First layer adapts conv features to the hidden size; the remaining
    -- layers are independent clones of one hidden-to-hidden module, each
    -- preceded by batch normalization applied via nn.Bottle over the
    -- flattened time*batch rows.
    local rnnModule = RNNModule(rnnInputsize, rnnHiddenSize, opt)
    rnns:add(rnnModule:clone())
    rnnModule = RNNModule(rnnHiddenSize, rnnHiddenSize, opt)
    for i = 1, nbOfHiddenLayers - 1 do
        rnns:add(nn.Bottle(nn.BatchNormalization(rnnHiddenSize), 2))
        rnns:add(rnnModule:clone())
    end
    -- 29 output classes -- presumably letters + space + apostrophe + CTC
    -- blank; TODO confirm against the dictionary file.
    local fullyConnected = nn.Sequential()
    fullyConnected:add(nn.BatchNormalization(rnnHiddenSize))
    fullyConnected:add(nn.Linear(rnnHiddenSize, 29))
    local model = nn.Sequential()
    model:add(conv)
    model:add(rnns)
    model:add(nn.Bottle(fullyConnected, 2))
    model:add(nn.Transpose({1, 2})) -- batch x seqLength x features
    model = makeDataParallel(model, opt.nGPU)
    return model
end
--- Map raw spectrogram frame counts to the sequence lengths produced by the
-- two convolutions in deepSpeech (kernel width 11, stride 2, no padding).
-- @param sizes tensor of per-utterance frame counts
-- @return tensor of post-convolution sequence lengths
local function calculateInputSizes(sizes)
    -- out = floor((in - kernel) / stride + 1), applied once per conv layer
    for _ = 1, 2 do
        sizes = torch.floor((sizes - 11) / 2 + 1)
    end
    return sizes
end

-- Expose the builder and the size-mapping helper as a pair.
return { deepSpeech, calculateInputSizes }
================================================
FILE: LICENSE.md
================================================
The MIT License (MIT)
Copyright (c) 2016 Sean Naren
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: Loader.lua
================================================
require 'nn'
require 'torch'
require 'lmdb'
require 'xlua'
require 'paths'
require 'Mapper'
local tds = require 'tds'
torch.setdefaulttensortype('torch.FloatTensor')
-- indexer: hands out batches of LMDB record indices in (optionally permuted)
-- batch order, cycling forever.
local indexer = torch.class('indexer')

--- Open the dataset LMDBs just long enough to read their sizes, then
-- precompute the per-batch index ranges.
-- @param dirPath   directory containing the 'spect' and 'trans' LMDBs
-- @param batchSize number of samples per batch
function indexer:__init(dirPath, batchSize)
    local dbSpect = lmdb.env { Path = dirPath .. '/spect', Name = 'spect' }
    local dbTrans = lmdb.env { Path = dirPath .. '/trans', Name = 'trans' }
    self.batchSize = batchSize
    self.count = 1 -- position within batchIndices; wraps in nextIndices()
    -- get the size of lmdb
    dbSpect:open()
    dbTrans:open()
    local audioLMDBSize = dbSpect:stat()['entries']
    local transcriptLMDBSize = dbTrans:stat()['entries']
    self.size = audioLMDBSize
    dbSpect:close()
    dbTrans:close()
    self.nbOfBatches = math.ceil(self.size / self.batchSize)
    assert(audioLMDBSize == transcriptLMDBSize, 'Audio and transcript LMDBs had different lengths!')
    assert(self.size > self.batchSize, 'batchSize larger than lmdb size!')
    -- inds[k] holds the record indices of batch k; the final batch may be
    -- smaller than batchSize.
    self.inds = torch.range(1, self.size):split(batchSize)
    self.batchIndices = torch.range(1, self.nbOfBatches)
end
--- Return the record indices of the next batch, wrapping around after the
-- last one. Honours any ordering installed by permuteBatchOrder().
function indexer:nextIndices()
    if self.count > #self.inds then
        self.count = 1
    end
    local batch = self.inds[self.batchIndices[self.count]]
    self.count = self.count + 1
    return batch
end
--- Shuffle the order in which batches are served (batch contents unchanged).
function indexer:permuteBatchOrder()
    self.batchIndices = torch.randperm(self.nbOfBatches)
end
-- Loader: reads batches of spectrograms and transcripts out of the LMDBs.
local Loader = torch.class('Loader')

--- @param dirPath directory containing the 'spect' and 'trans' LMDBs
-- @param mapper   Mapper used to encode transcripts into token labels
function Loader:__init(dirPath, mapper)
    self.dbSpect = lmdb.env { Path = dirPath .. '/spect', Name = 'spect' }
    self.dbTrans = lmdb.env { Path = dirPath .. '/trans', Name = 'trans' }
    -- open briefly just to record the number of entries
    self.dbSpect:open()
    self.size = self.dbSpect:stat()['entries']
    self.dbSpect:close()
    self.mapper = mapper
end
--- Load one batch from the LMDBs, zero-padding every spectrogram to the
-- longest utterance in the batch. Samples are drawn in a fresh random order
-- on each call.
-- @param indices 1D tensor of record indices to load
-- @return inputs      batch x 1 x freq x maxLength padded spectrogram tensor
-- @return targets     list of encoded label tables (one per sample)
-- @return sizes       tensor of original (unpadded) frame counts per sample
-- @return transcripts list of raw transcript strings
function Loader:nextBatch(indices)
    local tensors = tds.Vec()
    local targets = {}
    local transcripts = {}
    local maxLength = 0
    local freq = 0 -- spectrogram height; taken from the last sample read
    self.dbSpect:open(); local readerSpect = self.dbSpect:txn(true) -- readonly
    self.dbTrans:open(); local readerTrans = self.dbTrans:txn(true)
    local size = indices:size(1)
    -- NOTE(review): relies on #indices of a torch tensor matching the batch
    -- length; indices:size(1) would be the unambiguous spelling -- confirm.
    local sizes = torch.Tensor(#indices)
    local permutedIndices = torch.randperm(size) -- batch tensor has different order each time
    -- reads out a batch and store in lists
    for x = 1, size do
        local ind = indices[permutedIndices[x]]
        local tensor = readerSpect:get(ind):float()
        local transcript = readerTrans:get(ind)
        freq = tensor:size(1)
        sizes[x] = tensor:size(2)
        if maxLength < tensor:size(2) then maxLength = tensor:size(2) end -- find the max len in this batch
        tensors:insert(tensor)
        table.insert(targets, self.mapper:encodeString(transcript))
        table.insert(transcripts, transcript)
    end
    -- copy each spectrogram into a zero-padded, fixed-size batch tensor
    local inputs = torch.Tensor(size, 1, freq, maxLength):zero()
    for ind, tensor in ipairs(tensors) do
        inputs[ind][1]:narrow(2, 1, tensor:size(2)):copy(tensor)
    end
    readerSpect:abort(); self.dbSpect:close()
    readerTrans:abort(); self.dbTrans:close()
    return inputs, targets, sizes, transcripts
end
================================================
FILE: MakeLMDB.lua
================================================
-- Expects data in the format of <root><train/test><datasetname><filename.wav/filename.txt>
-- Creates an LMDB of everything in these folders into a train and test set.
require 'lfs'
require 'audio'
require 'xlua'
require 'lmdb'
require 'torch'
require 'parallel'
local tds = require 'tds'
-- Command-line configuration for the LMDB build.
local cmd = torch.CmdLine()
cmd:option('-rootPath', 'prepare_datasets/an4_dataset', 'Path to the data')
cmd:option('-lmdbPath', 'prepare_datasets/an4_lmdb', 'Path to save LMDBs to')
cmd:option('-windowSize', 0.02, 'Window size for audio data')
cmd:option('-stride', 0.01, 'Stride for audio data')
cmd:option('-sampleRate', 16000, 'Sample rate of audio data (Default 16khz)')
cmd:option('-audioExtension', 'sph', 'The extension of the audio files (wav/mp3/sph/etc)')
cmd:option('-processes', 8, 'Number of processes used to create LMDB')
local opt = cmd:parse(arg)
local dataPath = opt.rootPath
local lmdbPath = opt.lmdbPath
local extension = '.' .. opt.audioExtension
-- Fork the worker pool up-front; children idle until parent() sends jobs.
parallel.nfork(opt.processes)
--- Open an LMDB environment at `path` and begin a write transaction.
-- @return the environment and its open transaction
local function startWriter(path, name)
    local env = lmdb.env { Path = path, Name = name }
    env:open()
    return env, env:txn()
end
--- Commit the outstanding transaction, then close its LMDB environment.
local function closeWriter(env, transaction)
    transaction:commit()
    env:close()
end
--- Scan `dataPath` for audio files, sort them by spectrogram length (so that
-- later batches of neighbouring records need minimal padding), then write
-- spectrogram/transcript pairs into two LMDBs under `lmdbPath`. All heavy
-- work is farmed out to the `parallel` child processes started in parent().
-- @param dataPath directory tree of audio files with sibling .txt transcripts
-- @param lmdbPath output directory; creates '/spect' and '/trans' LMDBs
local function createLMDB(dataPath, lmdbPath)
    local vecs = tds.Vec()
    local size = tonumber(sys.execute("find " .. dataPath .. " -type f -name '*'" .. extension .. " | wc -l "))
    vecs:resize(size)
    local files = io.popen("find -L " .. dataPath .. " -type f -name '*" .. extension .. "'")
    local counter = 1
    print("Retrieving sizes for sorting...")
    local buffer = tds.Vec()
    buffer:resize(size)
    for file in files:lines() do
        buffer[counter] = file
        counter = counter + 1
    end
    -- Runs inside a child process: returns the audio path, the matching
    -- transcript path and the spectrogram frame count used as sort key.
    local function getSize(opts)
        local audioFilePath = opts.file
        local transcriptFilePath = opts.file:gsub(opts.extension, ".txt")
        local opt = opts.opt
        local audioFile = audio.load(audioFilePath)
        local length = audio.spectrogram(audioFile, opt.windowSize * opt.sampleRate, 'hamming', opt.stride * opt.sampleRate):size(2)
        return { audioFilePath, transcriptFilePath, length }
    end
    -- Prime every worker with one job...
    for x = 1, opt.processes do
        local opts = { extension = extension, file = buffer[x], opt = opt }
        parallel.children[x]:send({ opts, getSize })
    end
    -- ...then collect results round-robin, handing each worker its next file
    -- as soon as it reports back.
    local processCounter = 1
    for x = 1, size do
        local result = parallel.children[processCounter]:receive()
        vecs[x] = tds.Vec(unpack(result))
        xlua.progress(x, size)
        if x % 1000 == 0 then collectgarbage() end
        -- send next index to retrieve
        if x + opt.processes <= size then
            local opts = { extension = extension, file = buffer[x + opt.processes], opt = opt }
            parallel.children[processCounter]:send({ opts, getSize })
        end
        if processCounter == opt.processes then
            processCounter = 1
        else
            processCounter = processCounter + 1
        end
    end
    print("Sorting...")
    -- sort the files by length
    local function comp(a, b) return a[3] < b[3] end
    vecs:sort(comp)
    local size = #vecs -- re-read after sorting; shadows the count above
    print("Creating LMDB dataset to: " .. lmdbPath)
    -- start writing
    local dbSpect, readerSpect = startWriter(lmdbPath .. '/spect', 'spect')
    local dbTrans, readerTrans = startWriter(lmdbPath .. '/trans', 'trans')
    -- Runs inside a child process: loads one audio file, computes a
    -- mean/std-normalised spectrogram and reads the transcript file
    -- (keeping only its last line).
    local function getData(opts)
        local opt = opts.opt
        local audioFile = audio.load(opts.audioFilePath)
        local spect = audio.spectrogram(audioFile, opt.windowSize * opt.sampleRate, 'hamming', opt.stride * opt.sampleRate) -- freq-by-frames tensor
        -- put into lmdb
        spect = spect:float()
        -- normalize the data
        local mean = spect:mean()
        local std = spect:std()
        spect:add(-mean)
        spect:div(std)
        local transcript
        for line in io.lines(opts.transcriptFilePath) do
            transcript = line
        end
        return { spect, transcript }
    end
    for x = 1, opt.processes do
        local vec = vecs[x]
        local opts = { audioFilePath = vec[1], transcriptFilePath = vec[2], opt = opt }
        parallel.children[x]:send({ opts, getData })
    end
    local processCounter = 1
    for x = 1, size do
        local result = parallel.children[processCounter]:receive()
        local spect, transcript = unpack(result)
        readerSpect:put(x, spect)
        readerTrans:put(x, transcript)
        -- commit buffer every 500 records to bound transaction size
        if x % 500 == 0 then
            readerSpect:commit(); readerSpect = dbSpect:txn()
            readerTrans:commit(); readerTrans = dbTrans:txn()
            collectgarbage()
        end
        if x + opt.processes <= size then
            local vec = vecs[x + opt.processes]
            local opts = { audioFilePath = vec[1], transcriptFilePath = vec[2], opt = opt }
            parallel.children[processCounter]:send({ opts, getData })
        end
        if processCounter == opt.processes then
            processCounter = 1
        else
            processCounter = processCounter + 1
        end
        xlua.progress(x, size)
    end
    closeWriter(dbSpect, readerSpect)
    closeWriter(dbTrans, readerTrans)
end
--- Script entry point: installs the worker loop in every child process, then
-- builds the train and test LMDBs.
-- NOTE(review): deliberately declared global (no `local`) so pcall(parent)
-- below can reach it -- confirm nothing else relies on the global name.
function parent()
    -- Each child repeatedly receives an { opts, fn } pair, runs fn(opts)
    -- locally and sends the result back to this process.
    local function looper()
        require 'torch'
        require 'audio'
        while true do
            local object = parallel.parent:receive()
            local opts, code = unpack(object)
            local result = code(opts)
            parallel.parent:send(result)
            collectgarbage()
        end
    end
    parallel.children:exec(looper)
    createLMDB(dataPath .. '/train', lmdbPath .. '/train')
    createLMDB(dataPath .. '/test', lmdbPath .. '/test')
    parallel.close()
end
-- Run the pipeline under pcall so worker processes are always reaped, even
-- when LMDB creation fails part-way through.
local ok, err = pcall(parent)
if not ok then
    print(err)
    parallel.close()
end
================================================
FILE: Mapper.lua
================================================
require 'torch'
-- construct an object to deal with the mapping
local mapper = torch.class('Mapper')

--- Build bidirectional character<->token maps from a dictionary file.
-- Tokens are assigned 0-based in file order (line N receives token N-1).
-- @param dictPath path to a file listing one alphabet symbol per line
function mapper:__init(dictPath)
    assert(paths.filep(dictPath), dictPath .. ' not found')
    self.alphabet2token = {}
    self.token2alphabet = {}
    local token = 0
    for symbol in io.lines(dictPath) do
        self.alphabet2token[symbol] = token
        self.token2alphabet[token] = symbol
        token = token + 1
    end
end
--- Encode a transcript into a list of 0-based tokens, lower-casing first.
-- Characters absent from the dictionary look up to nil and are effectively
-- dropped, exactly as in the original index loop.
function mapper:encodeString(line)
    local label = {}
    for character in string.lower(line):gmatch('.') do
        table.insert(label, self.alphabet2token[character])
    end
    return label
end
--- Turn a prediction tensor into a list of the most likely tokens,
-- collapsing CTC repeats and dropping blanks.
-- NOTE(review): the original comment claimed beginning/ending spaces are
-- stripped for WER, but no stripping happens here -- confirm upstream.
-- @param predictions seqLength x nClasses tensor of per-frame likelihoods
-- @return list of 0-based tokens
function mapper:decodeOutput(predictions)
    local tokens = {}
    local blankToken = self.alphabet2token['$'] -- '$' denotes the CTC blank
    local preToken = blankToken
    -- The prediction is a sequence of likelihood vectors
    local _, maxIndices = torch.max(predictions, 2)
    maxIndices = maxIndices:float():squeeze()
    for i = 1, maxIndices:size(1) do
        local token = maxIndices[i] - 1 -- CTC indexes start from 1, while token starts from 0
        -- add token if it's not blank, and is not the same as pre_token
        if token ~= blankToken and token ~= preToken then
            table.insert(tokens, token)
        end
        preToken = token
    end
    return tokens
end
--- Convert a token list back into a string via the token->alphabet map.
-- @param tokens list of 0-based tokens (as produced by decodeOutput)
-- @return the decoded text
function mapper:tokensToText(tokens)
    -- Build via a buffer + table.concat rather than O(n^2) repeated string
    -- concatenation. An unmapped token still raises an error, as before.
    local chars = {}
    for i, token in ipairs(tokens) do
        chars[i] = self.token2alphabet[token]
    end
    return table.concat(chars)
end
================================================
FILE: ModelEvaluator.lua
================================================
require 'Loader'
require 'Mapper'
require 'torch'
require 'xlua'
local threads = require 'threads'
require 'SequenceError'
local ModelEvaluator = torch.class('ModelEvaluator')
-- file-scope upvalue captured by the prefetch closures in runEvaluation
local loader

--- @param isGPU         when true, inputs are copied to CUDA before forward
-- @param datasetPath   LMDB directory of the evaluation set
-- @param mapper        Mapper used to encode/decode transcripts
-- @param testBatchSize batch size used during evaluation
-- @param logsPath      directory where WER/CER logs are appended
function ModelEvaluator:__init(isGPU, datasetPath, mapper, testBatchSize, logsPath)
    loader = Loader(datasetPath, mapper)
    self.testBatchSize = testBatchSize
    self.nbOfTestIterations = math.ceil(loader.size / testBatchSize)
    self.indexer = indexer(datasetPath, testBatchSize)
    -- single background thread prefetches the next batch during scoring
    self.pool = threads.Threads(1, function() require 'Loader' end)
    self.mapper = mapper
    self.logsPath = logsPath
    self.suffix = '_' .. os.date('%Y%m%d_%H%M%S') -- distinguishes runs in log names
    self.sequenceError = SequenceError()
    self.input = torch.Tensor() -- preallocated; resized per batch
    self.isGPU = isGPU
    if isGPU then
        self.input = self.input:cuda()
    end
end
--- Run the entire evaluation set through `model`, computing per-sample
-- WER/CER, logging sorted per-sample results and returning the averages.
-- The next batch is always prefetched on the background thread while the
-- current one is being scored (double buffering).
-- @param model   network taking batch x 1 x freq x time, returning
--                batch x seqLength x classes
-- @param verbose when true, per-sample predictions are written to the logs
-- @param epoch   epoch number recorded in the WER log header
-- @return averageWER, averageCER over all evaluated samples
function ModelEvaluator:runEvaluation(model, verbose, epoch)
    local spect_buf, label_buf, sizes_buf
    -- get first batch
    local inds = self.indexer:nextIndices()
    self.pool:addjob(function()
        return loader:nextBatch(inds)
    end,
    function(spect, label, sizes)
        spect_buf = spect
        label_buf = label
        sizes_buf = sizes
    end)
    if verbose then
        local f = assert(io.open(self.logsPath .. 'WER_Test' .. self.suffix .. '.log', 'a'), "Could not create validation test logs, does the folder "
            .. self.logsPath .. " exist?")
        f:write('======================== BEGIN WER TEST EPOCH: ' .. epoch .. ' =========================\n')
        f:close()
    end
    local evaluationPredictions = {} -- stores the predictions to order for log.
    local cumCER = 0
    local cumWER = 0
    local numberOfSamples = 0
    -- ======================= for every test iteration ==========================
    for i = 1, self.nbOfTestIterations do
        -- get buf and fetch next one
        self.pool:synchronize() -- wait for the prefetch to finish
        -- sizes_array is captured but unused below
        local inputsCPU, targets, sizes_array = spect_buf, label_buf, sizes_buf
        inds = self.indexer:nextIndices()
        self.pool:addjob(function()
            return loader:nextBatch(inds)
        end,
        function(spect, label, sizes)
            spect_buf = spect
            label_buf = label
            sizes_buf = sizes
        end)
        self.input:resize(inputsCPU:size()):copy(inputsCPU) -- CPU -> device copy
        local predictions = model:forward(self.input)
        if self.isGPU then cutorch.synchronize() end
        local size = predictions:size(1)
        -- score every sample in the batch against its reference transcript
        for j = 1, size do
            local prediction = predictions[j]
            local predict_tokens = self.mapper:decodeOutput(prediction)
            local targetTranscript = self.mapper:tokensToText(targets[j])
            local predictTranscript = self.mapper:tokensToText(predict_tokens)
            local CER = self.sequenceError:calculateCER(targetTranscript, predictTranscript)
            local WER = self.sequenceError:calculateWER(targetTranscript, predictTranscript)
            cumCER = cumCER + CER
            cumWER = cumWER + WER
            table.insert(evaluationPredictions, { wer = WER * 100, cer = CER * 100, target = targetTranscript, prediction = predictTranscript })
        end
        numberOfSamples = numberOfSamples + size
    end
    -- log per-sample results ordered from best to worst WER
    local function comp(a, b) return a.wer < b.wer end
    table.sort(evaluationPredictions, comp)
    if verbose then
        for index, eval in ipairs(evaluationPredictions) do
            local f = assert(io.open(self.logsPath .. 'Evaluation_Test' .. self.suffix .. '.log', 'a'))
            f:write(string.format("WER = %.2f | CER = %.2f | Text = \"%s\" | Predict = \"%s\"\n",
                eval.wer, eval.cer, eval.target, eval.prediction))
            f:close()
        end
    end
    local averageWER = cumWER / numberOfSamples
    local averageCER = cumCER / numberOfSamples
    local f = assert(io.open(self.logsPath .. 'Evaluation_Test' .. self.suffix .. '.log', 'a'))
    f:write(string.format("Average WER = %.2f | CER = %.2f", averageWER * 100, averageCER * 100))
    f:close()
    self.pool:synchronize() -- end the last loading
    return averageWER, averageCER
end
================================================
FILE: Network.lua
================================================
require 'optim'
require 'nnx'
require 'gnuplot'
require 'lfs'
require 'xlua'
require 'UtilsMultiGPU'
require 'Loader'
require 'nngraph'
require 'Mapper'
require 'ModelEvaluator'
-- Timestamp suffix shared by all log and model filenames of this run.
local suffix = '_' .. os.date('%Y%m%d_%H%M%S')
local threads = require 'threads'
local Network = {}
--Training parameters
-- Fix: `seed` was an accidental global; as a file-scope local it remains
-- visible to Network:init below (cutorch.manualSeedAll uses it).
local seed = 10
torch.setdefaulttensortype('torch.FloatTensor')
torch.manualSeed(seed)
--- Configure the network from the option table: device selection, data
-- paths, logging, model construction (fresh or loaded) and the
-- training-batch indexer with its background loader thread.
-- @param opt parsed command-line options (see Train.lua / Test.lua)
function Network:init(opt)
    self.fileName = opt.saveFileName
    self.nGPU = opt.nGPU
    self.gpu = self.nGPU > 0
    if not self.gpu then
        require 'rnn'
    else
        require 'cutorch'
        require 'cunn'
        require 'cudnn'
        require 'BatchBRNNReLU'
        cutorch.manualSeedAll(seed)
    end
    self.trainingSetLMDBPath = opt.trainingSetLMDBPath
    self.validationSetLMDBPath = opt.validationSetLMDBPath
    -- NOTE(review): `or nil` is a no-op on the next three lines
    self.logsTrainPath = opt.logsTrainPath or nil
    self.logsValidationPath = opt.logsValidationPath or nil
    self.modelTrainingPath = opt.modelTrainingPath or nil
    self.permuteBatch = opt.permuteBatch or false
    self:makeDirectories({ self.logsTrainPath, self.logsValidationPath, self.modelTrainingPath })
    self.mapper = Mapper(opt.dictionaryPath)
    self.tester = ModelEvaluator(self.gpu, self.validationSetLMDBPath, self.mapper,
        opt.validationBatchSize, self.logsValidationPath)
    self.loadModel = opt.loadModel
    self.epochSave = opt.epochSave or false -- Saves model every number of iterations.
    self.maxNorm = opt.maxNorm or 400 -- value chosen by Baidu for english speech.
    -- setting model saving/loading
    if self.loadModel then
        assert(opt.loadPath, "loadPath hasn't been given to load model.")
        self:loadNetwork(opt.loadPath, opt.modelName)
    else
        assert(opt.modelName, "Must have given a model to train.")
        self:prepSpeechModel(opt.modelName, opt)
    end
    -- setting online loading
    self.indexer = indexer(opt.trainingSetLMDBPath, opt.batchSize)
    self.pool = threads.Threads(1, function() require 'Loader' end)
    self.logger = optim.Logger(self.logsTrainPath .. 'train' .. suffix .. '.log')
    self.logger:setNames { 'loss', 'WER', 'CER' }
    self.logger:style { '-', '-', '-' }
end
--- Instantiate the architecture module by name. The required module returns
-- a pair { buildFn, sizeFn }: buildFn(opt) constructs the network and sizeFn
-- maps raw frame counts to post-convolution sequence lengths.
function Network:prepSpeechModel(modelName, opt)
    local spec = require(modelName)
    self.model = spec[1](opt)
    self.calSize = spec[2]
end
--- Run the validation set through the model and return its error rates.
-- Leaves the model back in training mode with zeroed gradients.
-- @param epoch epoch number used for logging (defaults to 1)
-- @return wer, cer averaged over the validation set
function Network:testNetwork(epoch)
    self.model:evaluate() -- switch BN (and similar modules) to inference mode
    local wer, cer = self.tester:runEvaluation(self.model, true, epoch or 1) -- details in log
    self.model:zeroGradParameters()
    self.model:training()
    return wer, cer
end
--- Train the model with SGD + CTC loss, evaluating on the validation set
-- after every epoch. Batches are double-buffered: the next batch is loaded
-- on a background thread while the current one trains.
-- @param epochs          number of epochs to run
-- @param optimizerParams optim.sgd parameter table; learningRate is divided
--                        by learningRateAnnealing (when set) after each epoch
-- @return lossHistory, validationHistory (WER %), minutesTaken
function Network:trainNetwork(epochs, optimizerParams)
    self.model:training()
    local lossHistory = {}
    local validationHistory = {}
    local criterion = nn.CTCCriterion(true)
    local x, gradParameters = self.model:getParameters()
    print("Number of parameters: ", gradParameters:size(1))
    -- preallocated input buffer, reused across iterations
    -- (the previous `sizes` preallocation was removed: it was shadowed by a
    -- local inside feval and therefore never used)
    local inputs = torch.Tensor()
    if self.gpu then
        criterion = criterion:cuda()
        inputs = inputs:cuda()
    end
    -- def loading buf and loader
    local loader = Loader(self.trainingSetLMDBPath, self.mapper)
    local specBuf, labelBuf, sizesBuf
    -- load first batch
    local inds = self.indexer:nextIndices()
    self.pool:addjob(function()
        return loader:nextBatch(inds)
    end,
    function(spect, label, sizes)
        specBuf = spect
        labelBuf = label
        sizesBuf = sizes
    end)
    -- feval for optim.sgd; x_new is ignored because `x` shares storage with
    -- the model parameters
    local function feval(x_new)
        self.pool:synchronize() -- wait previous loading
        local inputsCPU, sizes, targets = specBuf, sizesBuf, labelBuf -- move buf to training data
        inds = self.indexer:nextIndices() -- load next batch whilst training
        self.pool:addjob(function()
            return loader:nextBatch(inds)
        end,
        function(spect, label, sizes)
            specBuf = spect
            labelBuf = label
            sizesBuf = sizes
        end)
        inputs:resize(inputsCPU:size()):copy(inputsCPU) -- transfer over to GPU
        sizes = self.calSize(sizes) -- the conv layers shorten each sequence
        local predictions = self.model:forward(inputs)
        local loss = criterion:forward(predictions, targets, sizes)
        -- Guard against degenerate costs; `loss ~= loss` is the NaN check
        -- (the original only caught +/-inf). Typo fix: "Recieved".
        if loss == math.huge or loss == -math.huge or loss ~= loss then
            loss = 0
            print("Received an inf cost!")
        end
        self.model:zeroGradParameters()
        local gradOutput = criterion:backward(predictions, targets)
        self.model:backward(inputs, gradOutput)
        -- clip gradients by global norm (maxNorm defaults to Baidu's 400)
        local norm = gradParameters:norm()
        if norm > self.maxNorm then
            gradParameters:mul(self.maxNorm / norm)
        end
        return loss, gradParameters
    end
    -- training
    local currentLoss
    local startTime = os.time()
    for i = 1, epochs do
        local averageLoss = 0
        for j = 1, self.indexer.nbOfBatches do
            currentLoss = 0
            local _, fs = optim.sgd(feval, x, optimizerParams)
            if self.gpu then cutorch.synchronize() end
            currentLoss = currentLoss + fs[1]
            xlua.progress(j, self.indexer.nbOfBatches)
            averageLoss = averageLoss + currentLoss
        end
        if self.permuteBatch then self.indexer:permuteBatchOrder() end
        averageLoss = averageLoss / self.indexer.nbOfBatches -- Calculate the average loss at this epoch.
        -- anneal learningRate
        optimizerParams.learningRate = optimizerParams.learningRate / (optimizerParams.learningRateAnnealing or 1)
        -- Update validation error rates
        local wer, cer = self:testNetwork(i)
        print(string.format("Training Epoch: %d Average Loss: %f Average Validation WER: %.2f Average Validation CER: %.2f",
            i, averageLoss, 100 * wer, 100 * cer))
        table.insert(lossHistory, averageLoss) -- Add the average loss value to the logger.
        table.insert(validationHistory, 100 * wer)
        self.logger:add { averageLoss, 100 * wer, 100 * cer }
        -- periodically save the model
        if self.epochSave then
            print("Saving model..")
            self:saveNetwork(self.modelTrainingPath .. 'model_epoch_' .. i .. suffix .. '_' .. self.fileName)
        end
    end
    local endTime = os.time()
    local minutesTaken = (endTime - startTime) / 60
    print("Minutes taken to train: ", minutesTaken)
    print("Saving model..")
    self:saveNetwork(self.modelTrainingPath .. 'final_model_' .. suffix .. '_' .. self.fileName)
    return lossHistory, validationHistory, minutesTaken
end
--- Plot the logged loss/WER/CER curves via optim.Logger.
function Network:createLossGraph()
    self.logger:plot()
end
--- Serialise the model to disk via UtilsMultiGPU's saveDataParallel.
function Network:saveNetwork(saveName)
    self.model:clearState() -- drop intermediate buffers to shrink the file
    saveDataParallel(saveName, self.model)
end
--Loads the model into Network.
--- Load a serialised model (re-wrapped for nGPU devices) plus the size
-- calculation function from the named architecture module; the architecture
-- itself is not rebuilt.
function Network:loadNetwork(saveName, modelName)
    self.model = loadDataParallel(saveName, self.nGPU)
    local model = require(modelName)
    self.calSize = model[2]
end
--- Create each of the given directories (and any missing parents).
-- @param folderPaths list of directory paths; may be empty
function Network:makeDirectories(folderPaths)
    -- The previous `folderPath ~= nil` guard was dead code: ipairs never
    -- yields a nil value.
    for _, folderPath in ipairs(folderPaths) do
        -- NOTE(review): folderPath is interpolated into a shell command
        -- unquoted; paths containing spaces or shell metacharacters break.
        os.execute("mkdir -p " .. folderPath)
    end
end
return Network
================================================
FILE: Predict.lua
================================================
require 'nn'
require 'audio'
require 'Mapper'
require 'UtilsMultiGPU'
-- Stand-alone prediction: load a trained model, run a single audio file
-- through it and print the decoded transcript.
local cmd = torch.CmdLine()
cmd:option('-modelPath', 'deepspeech.t7', 'Path of model to load')
cmd:option('-audioPath', '', 'Path to the input audio to predict on')
cmd:option('-dictionaryPath', './dictionary', 'File containing the dictionary to use')
cmd:option('-windowSize', 0.02, 'Window Size of audio')
cmd:option('-stride', 0.01, 'Stride of audio')
cmd:option('-sampleRate', 16000, 'Rate of audio (default 16khz)')
cmd:option('-nGPU', 1)
local opt = cmd:parse(arg)
if opt.nGPU > 0 then
    require 'cunn'
    require 'cudnn'
    require 'BatchBRNNReLU'
end
local model = loadDataParallel(opt.modelPath, opt.nGPU)
local mapper = Mapper(opt.dictionaryPath)
local wave = audio.load(opt.audioPath)
local spect = audio.spectrogram(wave, opt.windowSize * opt.sampleRate, 'hamming', opt.stride * opt.sampleRate):float() -- freq-by-frames tensor
-- normalize the data (same per-utterance mean/std scheme as MakeLMDB.lua)
local mean = spect:mean()
local std = spect:std()
spect:add(-mean)
spect:div(std)
-- add batch and channel dimensions: 1 x 1 x freq x frames
spect = spect:view(1, 1, spect:size(1), spect:size(2))
if opt.nGPU > 0 then
    spect = spect:cuda()
    model = model:cuda()
end
model:evaluate() -- inference mode (affects batch normalization)
local predictions = model:forward(spect)
local tokens = mapper:decodeOutput(predictions[1])
local text = mapper:tokensToText(tokens)
print(text)
================================================
FILE: README.md
================================================
# deepspeech.torch
[](https://travis-ci.org/SeanNaren/deepspeech.torch)
[](http://ctcspeechrecognition.readthedocs.io/en/latest/?badge=latest)
Implementation of [Baidu Warp-CTC](https://github.com/baidu-research/warp-ctc) using torch7.
Creates a network based on the [DeepSpeech2](http://arxiv.org/pdf/1512.02595v1.pdf) architecture using the Torch7 library, trained with the CTC activation function.
## Features
* Train large models with large datasets via online loading using [LMDB](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database) and multi-GPU support.
* Supports variable length batches via padding.
* Implements the [AN4 Audio database](http://www.speech.cs.cmu.edu/databases/an4/) (50 mins of data).
Has also been extended to train using the [LibriSpeech](http://www.openslr.org/12/) dataset (1000 hours of data). Custom dataset preparation is explained in documentation.
## Branches
There are currently two branches, Master and Phonemes:
* Master: This branch trains DeepSpeech2. Also included is an evaluation script which calculates the WER/CER, as well as a prediction script.
This branch is useful for understanding how the DeepSpeech and CTC works and is easy to run after installation. Highly recommended to checkout this branch.
* Phonemes: This branch is experimental and uses phonemes rather than character based predictions. This is fully credited and extended by [CCorfield](https://github.com/CCorfield) and his awesome work in porting to use phonemes. In addition to this
I'd like to also thank [Shane Walker](https://github.com/walkers-mv) for his awesome recent conversion to use phonemes as well.
## Installation/Data Preparation/Documentation
Follow Instructions/Data Preparation/Documentation found in the wiki [here](https://github.com/SeanNaren/deepspeech.torch/wiki/Installation) to set up and run the code.
Technical documentation can be found [here](http://ctcspeechrecognition.readthedocs.io/en/latest/).
## Pre-trained Networks
Pre-trained networks are available for AN4 as well as LibriSpeech for CUDA only (since they use cudnn RNNs). Download Links and accuracies are below. DeepSpeech-light is a smaller model which is less intensive to train (based on LSTMs rather than RNNs).
### AN4
**an4Test**
|Network | WER | CER |Link |
|-----------------|:--------:|:--------:|:--------:|
|DeepSpeech-light| N/A | N/A | N/A |
|DeepSpeech | 12 | 3.07 | [Download](https://github.com/SeanNaren/deepspeech.torch/releases/download/v1.0/an4_deepspeech.t7) |
### LibriSpeech
**Librispeech-test-clean**
|Network | WER | CER |Link |
|-----------------|:--------:|:--------:|:--------:|
|DeepSpeech-light| 15 | 1.34 | [Download](https://github.com/SeanNaren/deepspeech.torch/releases/download/v1.0/libri_deepspeech-light.t7) |
|DeepSpeech | 12 | 1.55 | [Download](https://github.com/SeanNaren/deepspeech.torch/releases/download/v1.0/libri_deepspeech.t7) |
**Librispeech-test-other**
|Network | WER | CER |Link |
|-----------------|:--------:|:--------:|:--------:|
|DeepSpeech-light| 36 | 3.80 | (Download Above) |
|DeepSpeech | 33 | 3.24 | (Download Above) |
Once you're set up, you can start training from these nets by using the below parameters (you might need to change the other parameters described in the wiki):
```lua
th Train.lua -loadModel -loadPath /path/to/model.t7
```
## Acknowledgements
Lots of people helped/contributed to this project that deserve recognition:
* Soumith Chintala for his support on Torch7 and the vast open source projects he has contributed that made this project possible!
* Charles Corfield for his work on the Phoneme Dataset and his overall contribution and aid throughout.
* Will Frey for his thorough communication and aid in the development process.
* Ding Ling, Yuan Yang and Yan Xia for their significant contribution to online training, multi-gpu support and many other important features.
* Erich Elsen and the team from Baidu for their contribution of Warp-CTC that made this possible, and the encouraging words and support given throughout the project.
* Maciej Korzepa for his huge help in training a model on Librispeech!
================================================
FILE: SequenceError.lua
================================================
local SequenceError = torch.class("SequenceError")
-- Calculates a sequence error rate (aka Levenshtein edit distance)

--- Levenshtein edit distance between two sequences, normalised by the
-- target length (so the result can exceed 1.0, and is inf for an empty
-- target with a non-empty prediction, matching the previous behaviour).
-- NOTE(review): arguments must be positionally indexable (tables); a raw
-- Lua string indexes to nil at every position.
-- @param target     array-like reference sequence (e.g. table of words)
-- @param prediction array-like hypothesis sequence
-- @return edit distance divided by #target
function SequenceError:sequenceErrorRate(target, prediction)
    local m, n = #target, #prediction
    -- Plain-Lua DP table replaces the torch.Tensor matrix, and math.min
    -- replaces building a 3-element tensor per cell just to take a minimum.
    local d = {}
    for i = 1, m + 1 do
        d[i] = {}
        d[i][1] = i - 1 -- cost of deleting the first i-1 target elements
    end
    for j = 1, n + 1 do
        d[1][j] = j - 1 -- cost of inserting the first j-1 prediction elements
    end
    for i = 2, m + 1 do
        for j = 2, n + 1 do
            if target[i - 1] == prediction[j - 1] then
                d[i][j] = d[i - 1][j - 1]
            else
                -- substitution, insertion, deletion
                d[i][j] = math.min(d[i - 1][j - 1], d[i][j - 1], d[i - 1][j]) + 1
            end
        end
    end
    return d[m + 1][n + 1] / m
end
-- Character Error Rate between two transcript strings.
-- Bug fix: sequenceErrorRate indexes its arguments with `[i]`, but Lua
-- strings return nil for numeric indexing, so passing raw strings made every
-- character comparison `nil == nil` (always equal) and the CER degenerated
-- to a length difference. Split each transcript into a table of single
-- characters before computing the edit distance.
function SequenceError:calculateCER(targetTranscript, predictTranscript)
    local targetChars = {}
    for character in targetTranscript:gmatch(".") do
        table.insert(targetChars, character)
    end
    local predictedChars = {}
    for character in predictTranscript:gmatch(".") do
        table.insert(predictedChars, character)
    end
    return self:sequenceErrorRate(targetChars, predictedChars)
end
-- Word Error Rate between two transcript strings: tokenises both on
-- whitespace, then computes the word-level edit distance over target length.
function SequenceError:calculateWER(targetTranscript, predictTranscript)
    local function splitWords(transcript)
        local words = {}
        for word in transcript:gmatch("%S+") do
            words[#words + 1] = word
        end
        return words
    end
    return self:sequenceErrorRate(splitWords(targetTranscript), splitWords(predictTranscript))
end
================================================
FILE: Test.lua
================================================
local Network = require 'Network'
-- Evaluation entry point: loads a previously trained model and reports the
-- averaged WER/CER on the validation LMDB set. Every option below can be
-- overridden on the command line, e.g. `th Test.lua -loadPath mymodel.t7`.
local cmd = torch.CmdLine()
cmd:option('-loadModel', true, 'Load previously saved model')
cmd:option('-saveModel', false, 'Save model after training/testing')
cmd:option('-loadPath', 'deepspeech.t7', 'Path of final model to save/load')
cmd:option('-modelName', 'DeepSpeechModel', 'Name of class containing architecture')
cmd:option('-nGPU', 1, 'Number of GPUs, set -1 to use CPU')
cmd:option('-trainingSetLMDBPath', './prepare_datasets/an4_lmdb/train/', 'Path to LMDB training dataset')
cmd:option('-validationSetLMDBPath', './prepare_datasets/an4_lmdb/test/', 'Path to LMDB test dataset')
cmd:option('-logsTrainPath', './logs/TrainingLoss/', ' Path to save Training logs')
cmd:option('-logsValidationPath', './logs/ValidationScores/', ' Path to save Validation logs')
cmd:option('-dictionaryPath', './dictionary', ' File containing the dictionary to use')
cmd:option('-batchSize', 20, 'Batch size in training')
cmd:option('-validationBatchSize', 32, 'Batch size for validation')
local opt = cmd:parse(arg)
Network:init(opt)
print("Testing network...")
local wer, cer = Network:testNetwork()
-- was '%2.f' (field width 2, precision 0), which printed the WER rounded to
-- an integer; '%.2f' matches the CER formatting.
print(string.format('Avg WER: %.2f Avg CER: %.2f', 100 * wer, 100 * cer))
print(string.format('More information written to log file at %s', opt.logsValidationPath))
================================================
FILE: Train.lua
================================================
local Network = require 'Network'
-- Training entry point. Every option below can be overridden on the command
-- line, e.g. `th Train.lua -batchSize 32 -nGPU 2`.
local cmd = torch.CmdLine()
cmd:option('-loadModel', false, 'Load previously saved model')
cmd:option('-loadPath', 'deepspeech.t7', 'Path to model to load')
cmd:option('-modelName', 'DeepSpeechModel', 'Name of class containing architecture')
cmd:option('-nGPU', 1, 'Number of GPUs, set -1 to use CPU')
cmd:option('-trainingSetLMDBPath', './prepare_datasets/an4_lmdb/train/', 'Path to LMDB training dataset')
cmd:option('-validationSetLMDBPath', './prepare_datasets/an4_lmdb/test/', 'Path to LMDB test dataset')
cmd:option('-logsTrainPath', './logs/TrainingLoss/', ' Path to save Training logs')
cmd:option('-logsValidationPath', './logs/ValidationScores/', ' Path to save Validation logs')
cmd:option('-epochSave', false, 'save model every epoch')
cmd:option('-modelTrainingPath', './models/', ' Path to save periodic training models')
cmd:option('-saveFileName', 'deepspeech.t7', 'Name of model to save as')
cmd:option('-dictionaryPath', './dictionary', ' File containing the dictionary to use')
cmd:option('-epochs', 70, 'Number of epochs for training')
cmd:option('-learningRate', 3e-4, ' Training learning rate')
cmd:option('-learningRateAnnealing', 1.1, 'Factor to anneal lr every epoch')
cmd:option('-maxNorm', 400, 'Max norm used to normalize gradients')
cmd:option('-momentum', 0.90, 'Momentum for SGD')
cmd:option('-batchSize', 20, 'Batch size in training')
cmd:option('-permuteBatch', false, 'Set to true if you want to permute batches AFTER the first epoch')
cmd:option('-validationBatchSize', 20, 'Batch size for validation')
cmd:option('-LSTM', false, 'Use LSTMs rather than RNNs')
cmd:option('-hiddenSize', 1760, 'RNN hidden sizes')
cmd:option('-nbOfHiddenLayers', 7, 'Number of rnn layers')
local options = cmd:parse(arg)

-- SGD configuration handed to the optim library during training.
local sgdParams = {
    learningRate = options.learningRate,
    learningRateAnnealing = options.learningRateAnnealing,
    momentum = options.momentum,
    dampening = 0,
    nesterov = true
}

-- Build the network, train it for the requested epochs, then plot the loss.
Network:init(options)
Network:trainNetwork(options.epochs, sgdParams)
Network:createLossGraph()
================================================
FILE: UtilsMultiGPU.lua
================================================
require 'rnn'
require 'nngraph'
-- Prepares `model` for GPU execution: converts supported layers to cudnn
-- (leaving BatchNormalization on the nn implementation) and, for nGPU > 1,
-- wraps the model in a DataParallelTable split on the batch dimension.
-- `nGPU <= 0` returns the model untouched (CPU mode).
function makeDataParallel(model, nGPU)
    if nGPU > 0 then
        cudnn.fastest = true
        -- Predicate handed to cudnn.convert: modules matching this are skipped.
        local function isBatchNorm(module)
            return torch.type(module):find('BatchNormalization')
        end
        model = cudnn.convert(model, cudnn, isBatchNorm)
        if nGPU > 1 then
            -- was: `gpus` and `dpt` leaked as accidental globals (no `local`)
            local gpus = torch.range(1, nGPU):totable()
            local dpt = nn.DataParallelTable(1):add(model, gpus):threads(function()
                require 'nngraph'
                require 'cudnn'
                cudnn.fastest = true
                require 'BatchBRNNReLU'
            end)
            dpt.gradInput = nil
            model = dpt
        end
        model:cuda()
    end
    return model
end
-- Collapses a DataParallelTable down to a single-replica DPT for saving.
-- Every replica inside a DPT built by makeDataParallel is a clone of the
-- same network, so keeping only the first one loses nothing.
local function cleanDPT(module, device)
    local targetDevice = device or 1
    local singleDPT = nn.DataParallelTable(1)
    cutorch.setDevice(targetDevice)
    singleDPT:add(module:get(1), targetDevice)
    return singleDPT
end
-- Serialises `model` to `modelPath`, first stripping any DataParallelTable
-- down to one replica so the saved file is independent of the GPU count.
-- Raises an error for unsupported container types.
function saveDataParallel(modelPath, model)
    local kind = torch.type(model)
    if kind == 'nn.DataParallelTable' then
        torch.save(modelPath, cleanDPT(model))
    elseif kind == 'nn.Sequential' then
        -- Rebuild the container, replacing any nested DPTs with cleaned ones.
        local stripped = nn.Sequential()
        for _, submodule in ipairs(model.modules) do
            if torch.type(submodule) == 'nn.DataParallelTable' then
                stripped:add(cleanDPT(submodule))
            else
                stripped:add(submodule)
            end
        end
        torch.save(modelPath, stripped)
    elseif kind == 'nn.gModule' then
        torch.save(modelPath, model)
    else
        error('This saving function only works with Sequential or DataParallelTable modules.')
    end
end
-- Loads a model saved via saveDataParallel and re-wraps it for `nGPU` GPUs
-- using makeDataParallel. Raises an error for unsupported container types.
function loadDataParallel(modelPath, nGPU)
    if nGPU > 1 then
        require 'cudnn'
        require 'BatchBRNNReLU'
    end
    local model = torch.load(modelPath)
    local kind = torch.type(model)
    if kind == 'nn.DataParallelTable' then
        -- Take the single saved replica and redistribute it.
        return makeDataParallel(model:get(1):float(), nGPU)
    elseif kind == 'nn.Sequential' then
        for index, submodule in ipairs(model.modules) do
            if torch.type(submodule) == 'nn.DataParallelTable' then
                model.modules[index] = makeDataParallel(submodule:get(1):float(), nGPU)
            end
        end
        return model
    elseif kind == 'nn.gModule' then
        return makeDataParallel(model, nGPU)
    else
        error('The loaded model is not a Sequential or DataParallelTable module.')
    end
end
================================================
FILE: dictionary
================================================
$
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
'
================================================
FILE: doc/DeepSpeechModel.md
================================================
# DeepSpeechModel
Defines the deep speech 2 conv+rnn architecture.
### deepSpeech(opt)
Defines the torch architecture for Deep Speech 2 as a function that can be called. Returns the final model
`opt` Defines the options we use including using GPUS, hidden size and number of layers for the RNNs.
### calculateInputSizes(sizes)
A function that calculates the sequence sizes after the convolutional layers. Used in the loss calculations in CTC, so the network isn't
penalised for the padded sequences. Returns a same sized tensor.
`sizes` Real size of each sentence in the training sample as a 1D tensor.
================================================
FILE: doc/Loader.md
================================================
# Loader
Defines the indexer class and the loader class, handling batching of the dataset to train the network.
## Indexer
Handles returning the next indices of the batch to load into memory, to train the network with.
### indexer:__init(_dir, batchSize)
`dirPath` Directory containing the LMDB data folders for spectrogram, labels and transcripts.
`batchSize` The sizes of each batch to create.
### indexer:nextIndices()
Retrieves the next indices that need to be loaded by the loader from the LMDB dataset.
### indexer:permuteBatchOrder()
Permutes the batch order randomly. This is for the net to not train in sequence order every time.
## Loader
Loads batches of data from LMDB files used in training/testing.
### Loader:__init(dirPath)
`dirPath` Directory containing the LMDB data folders for spectrogram, labels and transcripts.
### Loader:nextBatch(indices)
Returns the next batch of the dataset based on the given indices.
`indices` The indices of the test samples that need to be retrieved. This is handled by the Indexer class above.
================================================
FILE: doc/Mapper.md
================================================
# Mapper
Defines how numeric indices are mapped to tokens and vice versa.
### Mapper:__init(dictPath)
Creates mappings based on the given dictionary file. The AN4 dictionary file can be seen [here](https://github.com/SeanNaren/deepspeech.torch/blob/master/dictionary).
### Mapper:encodeString(string)
Converts string into a set of tokens to be used as a label in training.
`string` string to be converted.
### Mapper:decodeOutput(predictions)
Converts predictions of the neural network into a sequence of tokens (characters) via a mapper.
`predictions` is a tensor of sequence likelihood vectors given by the neural network.
### Mapper:tokensToText(tokens)
Using the mapper converts the tokens into readable text.
`tokens` A set of numeric tokens to convert into readable text.
================================================
FILE: doc/ModelEvaluator.md
================================================
# ModelEvaluator
Handles calculation of word error rate using an LMDB dataset. For more information on the calculation, see [Evaluator](https://github.com/SeanNaren/CTCSpeechRecognition/blob/master/doc/Evaluator.md).
### ModelEvaluator:__init(isGPU, datasetPath, mapper, testBatchSize, logsPath)
`isGPU` Whether to use the GPU (CUDA) or CPU.
`datasetPath` the path to the LMDB test dataset to use in evaluation.
`mapper` Maps predicted numeric values to characters, see [Mapper](https://github.com/SeanNaren/CTCSpeechRecognition/doc/Mapper.md) for more details.
`testBatchSize` The size of the batches to pass the network.
`logsPath` File path to put the details of evaluations into.
### ModelEvaluator:runEvaluation(model, verbose, epoch)
Calculates the word error rate and character error rate averaged over the test iterations. Uses the same threading as the training process does to load batches from the dataset.
`model` The Torch model to evaluate.
`verbose` If set to true, will store details of WER calculations into the log files.
`epoch` Determines the epoch number that is written in the log files for this calculation.
================================================
FILE: doc/Network.md
================================================
# Network
Handles interactions with the neural network for training and testing. Configured by network parameters given in
constructor.
### Network:init(networkParams)
Constructor of the Network class. Below defines each parameter that can be taken as input.
```lua
local networkParams = {
loadModel = false, -- Set to true if loading a model into the Network class rather than training.
saveModel = true, -- Set to true if saving the model after training.
modelName = 'DeepSpeechModel', -- The name of the lua class containing the network architecture
nGPU = 1, -- Number of GPUs, set -1 to use CPU
trainingSetLMDBPath = './prepare_an4/train/', -- online loading path from the LMDB dataset for training.
validationSetLMDBPath = './prepare_an4/test/', -- online loading path from the LMDB dataset for testing.
logsTrainPath = './logs/TrainingLoss/', -- Where training logs will be stored.
logsValidationPath = './logs/ValidationScores/', -- Where testing score logs will be stored.
modelTrainingPath = './models/', -- Where models will be stored on saving.
modelPath = 'CTCNetwork.t7',
dictionaryPath = './dictionary', -- Contains the alphabet/characters that we are to predict on.
batchSize = 20, -- The sizes of batches that we are passing into the network in training.
validationBatchSize = 1, -- Validation batch sizes (should be kept at 1, since we pass 1 sample at a time).
validationIterations = 20, -- Number of validation iterations (kept small, because we only want to run a few tests per epoch).
saveModelInTraining = false, -- saves model periodically through training
saveModelIterations = 50 -- If saveModelInTraining set to true, we save every 50 epochs.
}
```
### Network:prepSpeechModel(modelName, opt)
Used to create the model via the defined modelName and options.
### Network:testNetwork(epoch)
Tests the current stored model via the word error rate.
`epoch` can be used to detail the epoch number in the logs when testing scores are stored.
### Network:trainNetwork(epochs, sgd_params)
Trains a network stored in the `Network` class. Uses multiple threads in an online loading fashion to load the data from the disk.
`epochs` defines the number of iterations of training that will occur across the entire dataset (each epochs trains on the entire dataset).
`sgd_params` defines the SGD parameters for the optim library such as below.
```lua
local sgdParams = {
learningRate = 5e-4,
learningRateDecay = 1e-9,
weightDecay = 0,
momentum = 0.9,
dampening = 0,
nesterov = true
}
```
### Network:createLossGraph()
After training, when called will use gnuplot (through wrapper in the optim library) to generate a graph of the loss and word error rate over epochs.
### Network:saveNetwork(saveName)
Will save the model currently stored in the network class to disk, at the pre-defined save location with the given `saveName`.
### Network:loadNetwork(saveName, modelName)
Loads the network from the save location, stored using the pre-defined save name.
`saveName` The name as to which the network was saved as
`modelName` The name of the class that stores the model or architecture.
================================================
FILE: doc/SequenceError.md
================================================
# SequenceError
Calculates word error rates and handles conversion of CTC predictions to numeric tokens.
### SequenceError.sequenceErrorRate(target, prediction)
Calculates the error rates based on the target and the predicted inputs.
`target` and `prediction` are inputs of strings or tables.
### SequenceError:calculateCER(targetTranscript, predictTranscript)
`targetTranscript` and `predictTranscript` are two strings, returns the Character Error Rate.
### SequenceError:calculateWER(targetTranscript, predictTranscript)
`targetTranscript` and `predictTranscript` are two strings, returns the Word Error Rate.
================================================
FILE: doc/UtilsMultiGPU.md
================================================
# UtilsMultiGPU
Handles multi-gpu setups of the architecture.
### makeDataParallel(model, nGPU)
Converts the model into a multi-gpu set up if necessary using DataParallelTable.
`model` The Torch network model to modify for configured GPUs.
`nGPU` Number of GPUs.
### saveDataParallel(modelPath, model)
Saves the model to disk.
`modelPath` Location to save the model.
`model` The Torch network model to save.
### loadDataParallel(modelPath, nGPU)
Loads a model saved using the above methods.
`modelPath` Location to load the model.
`nGPU` Number of GPUs to load to.
================================================
FILE: doc/index.md
================================================
# Technical Documentation
Below are a few classes that have been documented, explaining their purpose and functions available.
## Classes
* [Network](Network.md)
* [DeepSpeechModel](DeepSpeechModel.md)
* [Mapper](Mapper.md)
* [Evaluator](Evaluator.md)
* [ModelEvaluator](ModelEvaluator.md)
* [Utils](Utils.md)
* [UtilsMultiGPU](UtilsMultiGPU.md)
* [Loader](Loader.md)
================================================
FILE: mkdocs.yml
================================================
site_name: CTCSpeechRecognition
theme : simplex
repo_url : https://github.com/SeanNaren/CTCSpeechRecognition
use_directory_urls : false
markdown_extensions: [extra]
docs_dir : doc
pages:
- [index.md, Home]
- [Network.md, Network]
- [DeepSpeechModel.md, DeepSpeechModel]
- [Mapper.md, Mapper]
- [SequenceError.md, SequenceError]
- [ModelEvaluator.md, ModelEvaluator]
- [UtilsMultiGPU.md, UtilsMultiGPU]
- [Loader.md, Loader]
================================================
FILE: prepare_datasets/FormatAN4.lua
================================================
require 'torch'
-- Command-line options for converting the raw AN4 corpus layout into the
-- flat <name>.<audioExtension> / <name>.txt structure consumed downstream.
local cmd = torch.CmdLine()
cmd:option('-rootPath', 'an4', 'Path to the an4 root')
cmd:option('-newPath', 'an4_dataset', 'Path to the new data path')
cmd:option('-audioExtension', 'sph', 'The extension of the audio files (wav/mp3/sph/etc)')
cmd:option('-move', false, 'Moves the files over rather than copies, used to save space')
local opt = cmd:parse(arg)
-- AN4 index/transcript files live under etc/; the '.fileids' and
-- '.transcription' suffixes are appended in createDataset below.
local an4TestPath = opt.rootPath .. '/etc/an4_test.'
local an4TrainPath = opt.rootPath .. '/etc/an4_train.'
local an4AudioPath = opt.rootPath .. '/wav'
-- strips down the transcripts into pure text
local function processText(line)
local text = line:gsub('<s>', ''):gsub('</s>', ''):gsub('^%s', ''):gsub('%(.*%)', ''):gsub('%s*$', '')
return text
end
-- Writes one <fileName>.txt transcript per utterance into `newPath` and
-- copies (or moves, with -move) the matching audio file next to it.
-- `pathToAN4` is the etc/ prefix ('fileids'/'transcription' are appended);
-- `an4AudioPath` is the wav/ root. The fileids and transcription files are
-- assumed to be aligned line-for-line.
local function createDataset(pathToAN4, an4AudioPath, newPath)
    sys.execute("mkdir " .. newPath)
    local fileids = pathToAN4 .. 'fileids'
    local transcripts = pathToAN4 .. 'transcription'
    local filePaths = {}
    for filePath in io.lines(fileids) do
        table.insert(filePaths, filePath)
    end
    local counter = 1
    for line in io.lines(transcripts) do
        local text = processText(line)
        local filePath = filePaths[counter]
        -- last component of the AN4 file id becomes the new base name
        local fileName = sys.split(filePath, '/')[3]
        -- write the cleaned transcript (fail fast on an unwritable path
        -- instead of erroring later on a nil file handle)
        local textPath = newPath .. '/' .. fileName .. '.txt'
        local file = assert(io.open(textPath, "w"), "could not open " .. textPath)
        file:write(text)
        file:close()
        -- copy/move the audio alongside the transcript
        -- (was: an inner `local newPath` shadowed the `newPath` parameter)
        local audioPath = an4AudioPath .. '/' .. filePath .. '.' .. opt.audioExtension
        local newAudioPath = newPath .. '/' .. fileName .. '.' .. opt.audioExtension
        local command
        if opt.move then command = "mv " else command = "cp " end
        sys.execute(command .. audioPath .. ' ' .. newAudioPath)
        counter = counter + 1
    end
end
-- Build the train/ and test/ splits under the new dataset root.
sys.execute("mkdir " .. opt.newPath)
createDataset(an4TrainPath, an4AudioPath, opt.newPath .. '/train/')
createDataset(an4TestPath, an4AudioPath, opt.newPath .. '/test/')
================================================
FILE: prepare_datasets/FormatLibriSpeech.lua
================================================
require 'torch'
local threads = require 'threads'
-- Command-line options for flattening the LibriSpeech layout into the
-- <id>.<audioExtension> / <id>.txt structure consumed downstream.
local cmd = torch.CmdLine()
cmd:option('-rootPath', 'LibriSpeech', 'Path to the librispeech root')
cmd:option('-newPath', 'libri_dataset', 'Path to the new data path')
cmd:option('-audioExtension', 'flac', 'The extension of the audio files (wav/mp3/sph/etc)')
cmd:option('-move', false, 'Moves the files over rather than copies, used to save space')
cmd:option('-threads', 8, 'Number of threads to use')
local opt = cmd:parse(arg)
local extension = '.' .. opt.audioExtension
local libriTestPath = opt.rootPath .. '/test/'
local libriTrainPath = opt.rootPath .. '/train/'
-- Worker pool for the per-file copy jobs; this reuses (shadows) the
-- `threads` module local above. NOTE(review): the progress callbacks below
-- call xlua.progress, but 'xlua' is never required in this file —
-- presumably loaded transitively; verify before relying on it.
local threads = threads.Threads(opt.threads, function(idx) require 'torch' require 'sys' end)
-- strips down the transcripts into pure text
local function processText(line)
local text = line:gsub('[^a-zA-Z ]', '')
return text
end
-- Walks every speaker directory under `libriPath`; for each transcript line
-- it writes <id>.txt and copies/moves the matching audio into `newDirPath`.
-- File operations are dispatched to the worker thread pool; the progress
-- callback runs on the main thread.
local function createDataset(libriPath, newDirPath)
    sys.execute("mkdir " .. newDirPath)
    -- total number of audio files, used only for the progress bar
    local size = tonumber(sys.execute("find " .. libriPath .. " -type f -name '*'" .. extension .. " | wc -l "))
    -- Handles one transcript line: "<id> <TEXT...>".
    local function formatData(line, dir)
        local text = processText(line)
        local id = line:match("([^ ]*) ") -- leading utterance id, e.g. 1089-134686-0000
        local audioFolders = sys.split(id, '-') -- speaker / chapter folder names
        -- write the cleaned transcript (fail fast on an unwritable path)
        local textPath = newDirPath .. '/' .. id .. '.txt'
        local file = assert(io.open(textPath, "w"), "could not open " .. textPath)
        file:write(text)
        file:close()
        -- audio lives at <dir>/<speaker>/<chapter>/<id><extension>
        local audioPath = dir .. '/' .. audioFolders[1] .. '/' .. audioFolders[2] .. '/' .. id .. extension
        local newPath = newDirPath .. '/' .. id .. extension
        local command
        if opt.move then command = "mv " else command = "cp " end
        sys.execute(command .. audioPath .. ' ' .. newPath)
    end
    -- was: a dead `local counter = 1` above, immediately shadowed by this one
    local counter = 0
    local p = io.popen('find "' .. libriPath .. '" -maxdepth 1 -mindepth 1 -type d')
    for dir in p:lines() do
        local transcripts = io.popen("find -L " .. dir .. " -type f -name '*.txt'")
        for transcript in transcripts:lines() do
            for line in io.lines(transcript) do
                threads:addjob(function()
                    formatData(line, dir)
                end,
                function()
                    counter = counter + 1
                    xlua.progress(counter, size)
                end)
            end
        end
    end
end
-- Build the train/ and test/ splits under the new dataset root.
-- NOTE(review): no threads:synchronize() is issued after these calls, so
-- completion of queued copy jobs relies on the pool being torn down at
-- script exit — verify all jobs finish before consuming the output.
sys.execute("mkdir " .. opt.newPath)
createDataset(libriTrainPath, opt.newPath .. '/train/')
createDataset(libriTestPath, opt.newPath .. '/test/')
================================================
FILE: tests/test.lua
================================================
require 'nn'
-- Unit tests for SequenceError (WER maths) and Mapper (token mapping),
-- driven by torch.Tester. Run from the tests/ directory so the relative
-- requires and the 'test_dictionary' path resolve.
local test = torch.TestSuite()
local mytester
require '../SequenceError'
require '../Mapper'
-- shared SequenceError instance used by the WER tests below
local sequenceError = SequenceError()
-- WER = (insertions + deletions + substitutions) / number of target words.
function test.evaluator()
    local target = "test a sentence" -- three words
    local oneMistakeWER = 1 / 3
    local twoMistakeWER = 2 / 3

    local deletion = sequenceError:calculateWER(target, "a sentence")
    local insertion = sequenceError:calculateWER(target, "test a sentence inserted")
    local substitution = sequenceError:calculateWER(target, "test substituted sentence")
    mytester:eq(deletion, oneMistakeWER, 'WER with deletion was incorrect')
    mytester:eq(insertion, oneMistakeWER, 'WER with insertion was incorrect')
    mytester:eq(substitution, oneMistakeWER, 'WER with substitution was incorrect')

    local twoDeletions = sequenceError:calculateWER(target, "a")
    local deletionAndSubstitution = sequenceError:calculateWER(target, "a wrong")
    local substitutionAndInsertion = sequenceError:calculateWER(target, "wrong a sentence inserted")
    mytester:eq(twoDeletions, twoMistakeWER, 'masking of outputs was incorrect')
    mytester:eq(deletionAndSubstitution, twoMistakeWER, 'WER with substitution and deletion was incorrect')
    mytester:eq(substitutionAndInsertion, twoMistakeWER, 'WER with substitution and insertion was incorrect')
end
-- Verifies the Mapper builds the expected character -> token table from the
-- test dictionary (tokens are zero-based, following dictionary line order).
function test.mapper()
    local mapper = Mapper('test_dictionary')
    local alphabet = {
        '$', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r',
        's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' ', '\''
    }
    local expected = {}
    for position, character in ipairs(alphabet) do
        expected[character] = position - 1
    end
    mytester:eq(mapper.alphabet2token, expected)
end
-- Decodes a 3x3 likelihood matrix through the Mapper: the expected token
-- sequence is {2, 1, 2}, which maps back to the text 'bab'.
function test.mapperDecode()
    local mapper = Mapper('test_dictionary')
    local predictions = torch.Tensor({ { 1, 2, 3 }, { 2, 3, 1 }, { 1, 2, 3 } })
    local decoded = mapper:decodeOutput(predictions)
    mytester:eq(decoded, { 2, 1, 2 })
    mytester:eq(mapper:tokensToText(decoded), 'bab')
end
-- Register the suite and run every test.
mytester = torch.Tester()
mytester:add(test)
mytester:run()
================================================
FILE: tests/test_dictionary
================================================
$
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
'
gitextract_kmme0l_o/
├── .gitignore
├── .travis.yml
├── BatchBRNN.lua
├── BatchBRNNReLU.lua
├── DeepSpeechModel.lua
├── LICENSE.md
├── Loader.lua
├── MakeLMDB.lua
├── Mapper.lua
├── ModelEvaluator.lua
├── Network.lua
├── Predict.lua
├── README.md
├── SequenceError.lua
├── Test.lua
├── Train.lua
├── UtilsMultiGPU.lua
├── dictionary
├── doc/
│ ├── DeepSpeechModel.md
│ ├── Loader.md
│ ├── Mapper.md
│ ├── ModelEvaluator.md
│ ├── Network.md
│ ├── SequenceError.md
│ ├── UtilsMultiGPU.md
│ └── index.md
├── mkdocs.yml
├── prepare_datasets/
│ ├── FormatAN4.lua
│ └── FormatLibriSpeech.lua
└── tests/
├── test.lua
└── test_dictionary
Condensed preview — 31 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (63K chars).
[
{
"path": ".gitignore",
"chars": 101,
"preview": ".idea/\nAudio\nsystemtests/\nsystemtests\nCTC.iml\nCTCSpeechRecognition.iml\n*.log\n*.log.eps\n*.t7\nSeq2Seq/\n"
},
{
"path": ".travis.yml",
"chars": 2014,
"preview": "language: c\nnotifications:\n email: false\ncompiler:\n - gcc\n - clang\ncache:\n directories:\n - $HOME/OpenBlasInstall\nsu"
},
{
"path": "BatchBRNN.lua",
"chars": 1907,
"preview": "------------------------------------------------------------------------\n--[[ BatchBRNN ]] --\n-- Adds sequence-wise batc"
},
{
"path": "BatchBRNNReLU.lua",
"chars": 555,
"preview": "require 'BatchBRNN'\n------------------------------------------------------------------------\n--[[ BatchBRNNReLU ]] --\n--"
},
{
"path": "DeepSpeechModel.lua",
"chars": 2441,
"preview": "require 'UtilsMultiGPU'\n\nlocal function RNNModule(inputDim, hiddenDim, opt)\n if opt.nGPU > 0 then\n if opt.LSTM"
},
{
"path": "LICENSE.md",
"chars": 1077,
"preview": "The MIT License (MIT)\n\nCopyright (c) 2016 Sean Naren\n\nPermission is hereby granted, free of charge, to any person obtain"
},
{
"path": "Loader.lua",
"chars": 3082,
"preview": "require 'nn'\nrequire 'torch'\nrequire 'lmdb'\nrequire 'xlua'\nrequire 'paths'\nrequire 'Mapper'\nlocal tds = require 'tds'\n\nt"
},
{
"path": "MakeLMDB.lua",
"chars": 5434,
"preview": "-- Expects data in the format of <root><train/test><datasetname><filename.wav/filename.txt>\n-- Creates an LMDB of everyt"
},
{
"path": "Mapper.lua",
"chars": 1702,
"preview": "require 'torch'\n\n-- construct an object to deal with the mapping\nlocal mapper = torch.class('Mapper')\n\nfunction mapper:_"
},
{
"path": "ModelEvaluator.lua",
"chars": 4154,
"preview": "require 'Loader'\nrequire 'Mapper'\nrequire 'torch'\nrequire 'xlua'\nlocal threads = require 'threads'\nrequire 'SequenceErro"
},
{
"path": "Network.lua",
"chars": 7230,
"preview": "require 'optim'\nrequire 'nnx'\nrequire 'gnuplot'\nrequire 'lfs'\nrequire 'xlua'\nrequire 'UtilsMultiGPU'\nrequire 'Loader'\nre"
},
{
"path": "Predict.lua",
"chars": 1319,
"preview": "require 'nn'\nrequire 'audio'\nrequire 'Mapper'\nrequire 'UtilsMultiGPU'\nlocal cmd = torch.CmdLine()\ncmd:option('-modelPath"
},
{
"path": "README.md",
"chars": 4461,
"preview": "# deepspeech.torch\n\n[](https://travis"
},
{
"path": "SequenceError.lua",
"chars": 1595,
"preview": "local SequenceError = torch.class(\"SequenceError\")\n\n-- Calculates a sequence error rate (aka Levenshtein edit distance)\n"
},
{
"path": "Test.lua",
"chars": 1383,
"preview": "local Network = require 'Network'\n\n-- Load the network from the saved model. Options can be overrided on command line ru"
},
{
"path": "Train.lua",
"chars": 2270,
"preview": "local Network = require 'Network'\n\n-- Options can be overrided on command line run.\nlocal cmd = torch.CmdLine()\ncmd:opti"
},
{
"path": "UtilsMultiGPU.lua",
"chars": 2665,
"preview": "require 'rnn'\nrequire 'nngraph'\nfunction makeDataParallel(model, nGPU)\n if nGPU > 0 then\n cudnn.fastest = true"
},
{
"path": "dictionary",
"chars": 57,
"preview": "$\na\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\nq\nr\ns\nt\nu\nv\nw\nx\ny\nz\n \n'"
},
{
"path": "doc/DeepSpeechModel.md",
"chars": 609,
"preview": "# DeepSpeechModel\n\nDefines the deep speech 2 conv+rnn architecture.\n\n### deepSpeech(opt)\n\nDefines the torch architecture"
},
{
"path": "doc/Loader.md",
"chars": 1059,
"preview": "# Loader\n\nDefines the indexer class and the loader class, handling batching of the dataset to train the network.\n\n## Ind"
},
{
"path": "doc/Mapper.md",
"chars": 790,
"preview": "# Mapper\n\nDefines how numeric indices are mapped to tokens and vice versa.\n\n### Mapper:__init(dictPath)\n\nCreates mapping"
},
{
"path": "doc/ModelEvaluator.md",
"chars": 1131,
"preview": "# ModelEvaluator\n\nHandles calculation of word error rate using an LMDB dataset. For more information on the calculation,"
},
{
"path": "doc/Network.md",
"chars": 3205,
"preview": "# Network\n\nHandles interactions with the neural network for training and testing. Configured by network parameters given"
},
{
"path": "doc/SequenceError.md",
"chars": 619,
"preview": "# SequenceError\n\nCalculates word error rates and handles conversion of CTC predictions to numeric tokens.\n\n### SequenceE"
},
{
"path": "doc/UtilsMultiGPU.md",
"chars": 577,
"preview": "# UtilsMultiGPU\n\nHandles multi-gpu setups of the architecture.\n\n### makeDataParallel(model, nGPU)\n\nConverts the model in"
},
{
"path": "doc/index.md",
"chars": 387,
"preview": "# Technical Documentation\n\nBelow are a few classes that have been documented, explaining their purpose and functions ava"
},
{
"path": "mkdocs.yml",
"chars": 423,
"preview": "site_name: CTCSpeechRecognition\ntheme : simplex\nrepo_url : https://github.com/SeanNaren/CTCSpeechRecognition\nuse_directo"
},
{
"path": "prepare_datasets/FormatAN4.lua",
"chars": 2101,
"preview": "require 'torch'\nlocal cmd = torch.CmdLine()\ncmd:option('-rootPath', 'an4', 'Path to the an4 root')\ncmd:option('-newPath'"
},
{
"path": "prepare_datasets/FormatLibriSpeech.lua",
"chars": 2604,
"preview": "require 'torch'\nlocal threads = require 'threads'\n\nlocal cmd = torch.CmdLine()\ncmd:option('-rootPath', 'LibriSpeech', 'P"
},
{
"path": "tests/test.lua",
"chars": 2603,
"preview": "require 'nn'\n\nlocal test = torch.TestSuite()\nlocal mytester\nrequire '../SequenceError'\nrequire '../Mapper'\n\nlocal sequen"
},
{
"path": "tests/test_dictionary",
"chars": 57,
"preview": "$\na\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\nq\nr\ns\nt\nu\nv\nw\nx\ny\nz\n \n'"
}
]
About this extraction
This page contains the full source code of the SeanNaren/deepspeech.torch GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 31 files (58.2 KB), approximately 15.8k tokens. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.