Repository: SeanNaren/deepspeech.torch Branch: master Commit: 26d24fa5a805 Files: 31 Total size: 58.2 KB Directory structure: gitextract_kmme0l_o/ ├── .gitignore ├── .travis.yml ├── BatchBRNN.lua ├── BatchBRNNReLU.lua ├── DeepSpeechModel.lua ├── LICENSE.md ├── Loader.lua ├── MakeLMDB.lua ├── Mapper.lua ├── ModelEvaluator.lua ├── Network.lua ├── Predict.lua ├── README.md ├── SequenceError.lua ├── Test.lua ├── Train.lua ├── UtilsMultiGPU.lua ├── dictionary ├── doc/ │ ├── DeepSpeechModel.md │ ├── Loader.md │ ├── Mapper.md │ ├── ModelEvaluator.md │ ├── Network.md │ ├── SequenceError.md │ ├── UtilsMultiGPU.md │ └── index.md ├── mkdocs.yml ├── prepare_datasets/ │ ├── FormatAN4.lua │ └── FormatLibriSpeech.lua └── tests/ ├── test.lua └── test_dictionary ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ .idea/ Audio systemtests/ systemtests CTC.iml CTCSpeechRecognition.iml *.log *.log.eps *.t7 Seq2Seq/ ================================================ FILE: .travis.yml ================================================ language: c notifications: email: false compiler: - gcc - clang cache: directories: - $HOME/OpenBlasInstall sudo: false env: - TORCH_LUA_VERSION=LUAJIT21 - TORCH_LUA_VERSION=LUA51 - TORCH_LUA_VERSION=LUA52 addons: apt: packages: - cmake - gfortran - gcc-multilib - gfortran-multilib - liblapack-dev - build-essential - gcc - g++ - curl - cmake - libreadline-dev - git-core - libqt4-core - libqt4-gui - libqt4-dev - libjpeg-dev - libpng-dev - ncurses-dev - imagemagick - libzmq3-dev - gfortran - unzip - gnuplot - gnuplot-x11 before_script: - export ROOT_TRAVIS_DIR=$(pwd) - export INSTALL_PREFIX=~/torch/install - ls $HOME/OpenBlasInstall/lib || (cd /tmp/ && git clone https://github.com/xianyi/OpenBLAS.git -b master && cd OpenBLAS && (make NO_AFFINITY=1 -j$(getconf _NPROCESSORS_ONLN) 2>/dev/null 
>/dev/null) && make PREFIX=$HOME/OpenBlasInstall install) - git clone https://github.com/torch/distro.git ~/torch --recursive - cd ~/torch && git submodule update --init --recursive - mkdir build && cd build - export CMAKE_LIBRARY_PATH=$HOME/OpenBlasInstall/include:$HOME/OpenBlasInstall/lib:$CMAKE_LIBRARY_PATH - cmake .. -DCMAKE_INSTALL_PREFIX="${INSTALL_PREFIX}" -DCMAKE_BUILD_TYPE=Release -DWITH_${TORCH_LUA_VERSION}=ON - make && make install - cd $ROOT_TRAVIS_DIR - export LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib:$LD_LIBRARY_PATH - ${INSTALL_PREFIX}/bin/luarocks install nn - ${INSTALL_PREFIX}/bin/luarocks install dpnn script: - cd ${ROOT_TRAVIS_DIR} - git clone https://github.com/SeanNaren/CTCSpeechRecognition.git - cd CTCSpeechRecognition/tests - rsync -av --progress ../* ${INSTALL_PREFIX}/share/lua/5.1/ --exclude ../tests --exclude ../prepare_an4/ - rsync -av --progress ../* ${INSTALL_PREFIX}/share/lua/5.2/ --exclude ../tests --exclude ../prepare_an4/ - export PATH=${INSTALL_PREFIX}/bin:$PATH - export TESTLUA=$(which luajit lua | head -n 1) - echo ${TESTLUA} - ${TESTLUA} test.lua ================================================ FILE: BatchBRNN.lua ================================================ ------------------------------------------------------------------------ --[[ BatchBRNN ]] -- -- Adds sequence-wise batch normalization to cudnn RNN modules. -- For a simple RNN: ht = ReLU(B(Wixt) + Riht-1 + bRi) where B -- is the batch normalization. -- Expects size seqLength x minibatch x inputDim. -- Returns seqLength x minibatch x outputDim. -- Can specify an rnnModule such as cudnn.LSTM (defaults to RNNReLU). 
------------------------------------------------------------------------
local BatchBRNN, parent = torch.class('cudnn.BatchBRNN', 'nn.Sequential')

-- Builds the pipeline: flatten -> bias-free Linear -> BatchNormalization
-- (the sequence-wise batch norm) -> reshape -> bidirectional cudnn.RNN ->
-- sum of the two directions' activations.
-- inputDim:  feature size of each timestep's input.
-- outputDim: RNN hidden size, and the module's output feature size.
function BatchBRNN:__init(inputDim, outputDim)
    parent.__init(self)
    -- Resizable views; their real sizes are set per batch in updateOutput.
    self.view_in = nn.View(1, 1, -1):setNumInputDims(3)
    self.view_out = nn.View(1, -1):setNumInputDims(2)
    self.rnn = cudnn.RNN(outputDim, outputDim, 1)
    local rnn = self.rnn
    -- SKIP_INPUT: the RNN applies no input weight matrix; the Linear +
    -- BatchNormalization below provide the (normalized) input transform.
    rnn.inputMode = 'CUDNN_SKIP_INPUT'
    rnn.bidirectional = 'CUDNN_BIDIRECTIONAL'
    rnn.numDirections = 2
    rnn:reset()
    self:add(self.view_in)
    self:add(nn.Linear(inputDim, outputDim, false)) -- bias disabled; batch norm follows
    self:add(nn.BatchNormalization(outputDim))
    self:add(self.view_out)
    self:add(rnn)
    -- Split the 2*outputDim bidirectional output and sum the two directions.
    self:add(nn.View(-1, 2, outputDim):setNumInputDims(2))
    self:add(nn.Sum(3))
end

-- Resizes the internal views for the current seqLength (T) and batch (N),
-- then runs the standard nn.Sequential forward pass.
function BatchBRNN:updateOutput(input)
    local T, N = input:size(1), input:size(2)
    self.view_in:resetSize(T * N, -1)
    self.view_out:resetSize(T, N, -1)
    return parent.updateOutput(self, input)
end

-- Pretty-printer in the style of nn.Sequential's __tostring__.
function BatchBRNN:__tostring__()
    local tab = ' '
    local line = '\n'
    local next = ' -> ' -- NOTE: shadows the global `next` inside this function
    local str = 'BatchBRNN'
    str = str .. ' {' .. line .. tab .. '[input'
    for i=1,#self.modules do
        str = str .. next .. '(' .. i .. ')'
    end
    str = str .. next .. 'output]'
    for i=1,#self.modules do
        str = str .. line .. tab .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab)
    end
    str = str .. line .. '}'
    return str
end
================================================ FILE: BatchBRNNReLU.lua ================================================
require 'BatchBRNN'
------------------------------------------------------------------------
--[[ BatchBRNNReLU ]] --
-- Based On BatchBRNN. Adds ClippedReLU non-linearity to Vanilla BRNN.
------------------------------------------------------------------------
local BatchBRNNReLU, parent = torch.class('cudnn.BatchBRNNReLU', 'cudnn.BatchBRNN')

-- Same construction as BatchBRNN, but with the cudnn RNN switched to ReLU
-- mode and a Clamp(0, 20) (clipped ReLU) inserted after the RNN's output.
function BatchBRNNReLU:__init(inputDim, outputDim)
    parent.__init(self, inputDim, outputDim)
    local rnn = self.rnn
    rnn.mode = 'CUDNN_RNN_RELU'
    rnn:reset()
    -- Position 6 places the clamp directly after the RNN module that the
    -- parent constructor added to the Sequential pipeline.
    self:insert(nn.Clamp(0, 20), 6)
end
================================================ FILE: DeepSpeechModel.lua ================================================
require 'UtilsMultiGPU'

-- Selects the recurrent layer implementation:
--   GPU + opt.LSTM -> cudnn.BLSTM with the two directions summed
--   GPU otherwise  -> cudnn.BatchBRNNReLU (batch-normalized ReLU BRNN)
--   CPU            -> nn.SeqBRNN from the rnn package
local function RNNModule(inputDim, hiddenDim, opt)
    if opt.nGPU > 0 then
        if opt.LSTM then
            local blstm = nn.Sequential()
            blstm:add(cudnn.BLSTM(inputDim, hiddenDim, 1))
            blstm:add(nn.View(-1, 2, hiddenDim):setNumInputDims(2)) -- have to sum activations
            blstm:add(nn.Sum(3))
            return blstm
        else
            require 'BatchBRNNReLU'
            return cudnn.BatchBRNNReLU(inputDim, hiddenDim)
        end
    else
        require 'rnn'
        return nn.SeqBRNN(inputDim, hiddenDim)
    end
end

-- Creates the convnet+rnn structure.
local function deepSpeech(opt)
    local conv = nn.Sequential()
    -- (nInputPlane, nOutputPlane, kW, kH, [dW], [dH], [padW], [padH]) conv layers.
    conv:add(nn.SpatialConvolution(1, 32, 11, 41, 2, 2))
    conv:add(nn.SpatialBatchNormalization(32))
    conv:add(nn.Clamp(0, 20))
    conv:add(nn.SpatialConvolution(32, 32, 11, 21, 2, 1))
    conv:add(nn.SpatialBatchNormalization(32))
    conv:add(nn.Clamp(0, 20))
    local rnnInputsize = 32 * 41 -- based on the above convolutions and 16khz audio.
local rnnHiddenSize = opt.hiddenSize -- size of rnn hidden layers local nbOfHiddenLayers = opt.nbOfHiddenLayers conv:add(nn.View(rnnInputsize, -1):setNumInputDims(3)) -- batch x features x seqLength conv:add(nn.Transpose({ 2, 3 }, { 1, 2 })) -- seqLength x batch x features local rnns = nn.Sequential() local rnnModule = RNNModule(rnnInputsize, rnnHiddenSize, opt) rnns:add(rnnModule:clone()) rnnModule = RNNModule(rnnHiddenSize, rnnHiddenSize, opt) for i = 1, nbOfHiddenLayers - 1 do rnns:add(nn.Bottle(nn.BatchNormalization(rnnHiddenSize), 2)) rnns:add(rnnModule:clone()) end local fullyConnected = nn.Sequential() fullyConnected:add(nn.BatchNormalization(rnnHiddenSize)) fullyConnected:add(nn.Linear(rnnHiddenSize, 29)) local model = nn.Sequential() model:add(conv) model:add(rnns) model:add(nn.Bottle(fullyConnected, 2)) model:add(nn.Transpose({1, 2})) -- batch x seqLength x features model = makeDataParallel(model, opt.nGPU) return model end -- Based on convolution kernel and strides. local function calculateInputSizes(sizes) sizes = torch.floor((sizes - 11) / 2 + 1) -- conv1 sizes = torch.floor((sizes - 11) / 2 + 1) -- conv2 return sizes end return { deepSpeech, calculateInputSizes } ================================================ FILE: LICENSE.md ================================================ The MIT License (MIT) Copyright (c) 2016 Sean Naren Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: Loader.lua ================================================ require 'nn' require 'torch' require 'lmdb' require 'xlua' require 'paths' require 'Mapper' local tds = require 'tds' torch.setdefaulttensortype('torch.FloatTensor') local indexer = torch.class('indexer') function indexer:__init(dirPath, batchSize) local dbSpect = lmdb.env { Path = dirPath .. '/spect', Name = 'spect' } local dbTrans = lmdb.env { Path = dirPath .. '/trans', Name = 'trans' } self.batchSize = batchSize self.count = 1 -- get the size of lmdb dbSpect:open() dbTrans:open() local audioLMDBSize = dbSpect:stat()['entries'] local transcriptLMDBSize = dbTrans:stat()['entries'] self.size = audioLMDBSize dbSpect:close() dbTrans:close() self.nbOfBatches = math.ceil(self.size / self.batchSize) assert(audioLMDBSize == transcriptLMDBSize, 'Audio and transcript LMDBs had different lengths!') assert(self.size > self.batchSize, 'batchSize larger than lmdb size!') self.inds = torch.range(1, self.size):split(batchSize) self.batchIndices = torch.range(1, self.nbOfBatches) end function indexer:nextIndices() if self.count > #self.inds then self.count = 1 end local index = self.batchIndices[self.count] local inds = self.inds[index] self.count = self.count + 1 return inds end function indexer:permuteBatchOrder() self.batchIndices = torch.randperm(self.nbOfBatches) end local Loader = torch.class('Loader') function Loader:__init(dirPath, mapper) self.dbSpect = lmdb.env { Path = dirPath .. 
'/spect', Name = 'spect' } self.dbTrans = lmdb.env { Path = dirPath .. '/trans', Name = 'trans' } self.dbSpect:open() self.size = self.dbSpect:stat()['entries'] self.dbSpect:close() self.mapper = mapper end function Loader:nextBatch(indices) local tensors = tds.Vec() local targets = {} local transcripts = {} local maxLength = 0 local freq = 0 self.dbSpect:open(); local readerSpect = self.dbSpect:txn(true) -- readonly self.dbTrans:open(); local readerTrans = self.dbTrans:txn(true) local size = indices:size(1) local sizes = torch.Tensor(#indices) local permutedIndices = torch.randperm(size) -- batch tensor has different order each time -- reads out a batch and store in lists for x = 1, size do local ind = indices[permutedIndices[x]] local tensor = readerSpect:get(ind):float() local transcript = readerTrans:get(ind) freq = tensor:size(1) sizes[x] = tensor:size(2) if maxLength < tensor:size(2) then maxLength = tensor:size(2) end -- find the max len in this batch tensors:insert(tensor) table.insert(targets, self.mapper:encodeString(transcript)) table.insert(transcripts, transcript) end local inputs = torch.Tensor(size, 1, freq, maxLength):zero() for ind, tensor in ipairs(tensors) do inputs[ind][1]:narrow(2, 1, tensor:size(2)):copy(tensor) end readerSpect:abort(); self.dbSpect:close() readerTrans:abort(); self.dbTrans:close() return inputs, targets, sizes, transcripts end ================================================ FILE: MakeLMDB.lua ================================================ -- Expects data in the format of -- Creates an LMDB of everything in these folders into a train and test set. 
require 'lfs' require 'audio' require 'xlua' require 'lmdb' require 'torch' require 'parallel' local tds = require 'tds' local cmd = torch.CmdLine() cmd:option('-rootPath', 'prepare_datasets/an4_dataset', 'Path to the data') cmd:option('-lmdbPath', 'prepare_datasets/an4_lmdb', 'Path to save LMDBs to') cmd:option('-windowSize', 0.02, 'Window size for audio data') cmd:option('-stride', 0.01, 'Stride for audio data') cmd:option('-sampleRate', 16000, 'Sample rate of audio data (Default 16khz)') cmd:option('-audioExtension', 'sph', 'The extension of the audio files (wav/mp3/sph/etc)') cmd:option('-processes', 8, 'Number of processes used to create LMDB') local opt = cmd:parse(arg) local dataPath = opt.rootPath local lmdbPath = opt.lmdbPath local extension = '.' .. opt.audioExtension parallel.nfork(opt.processes) local function startWriter(path, name) local db = lmdb.env { Path = path, Name = name } db:open() local txn = db:txn() return db, txn end local function closeWriter(db, txn) txn:commit() db:close() end local function createLMDB(dataPath, lmdbPath) local vecs = tds.Vec() local size = tonumber(sys.execute("find " .. dataPath .. " -type f -name '*'" .. extension .. " | wc -l ")) vecs:resize(size) local files = io.popen("find -L " .. dataPath .. " -type f -name '*" .. extension .. 
"'") local counter = 1 print("Retrieving sizes for sorting...") local buffer = tds.Vec() buffer:resize(size) for file in files:lines() do buffer[counter] = file counter = counter + 1 end local function getSize(opts) local audioFilePath = opts.file local transcriptFilePath = opts.file:gsub(opts.extension, ".txt") local opt = opts.opt local audioFile = audio.load(audioFilePath) local length = audio.spectrogram(audioFile, opt.windowSize * opt.sampleRate, 'hamming', opt.stride * opt.sampleRate):size(2) return { audioFilePath, transcriptFilePath, length } end for x = 1, opt.processes do local opts = { extension = extension, file = buffer[x], opt = opt } parallel.children[x]:send({ opts, getSize }) end local processCounter = 1 for x = 1, size do local result = parallel.children[processCounter]:receive() vecs[x] = tds.Vec(unpack(result)) xlua.progress(x, size) if x % 1000 == 0 then collectgarbage() end -- send next index to retrieve if x + opt.processes <= size then local opts = { extension = extension, file = buffer[x + opt.processes], opt = opt } parallel.children[processCounter]:send({ opts, getSize }) end if processCounter == opt.processes then processCounter = 1 else processCounter = processCounter + 1 end end print("Sorting...") -- sort the files by length local function comp(a, b) return a[3] < b[3] end vecs:sort(comp) local size = #vecs print("Creating LMDB dataset to: " .. lmdbPath) -- start writing local dbSpect, readerSpect = startWriter(lmdbPath .. '/spect', 'spect') local dbTrans, readerTrans = startWriter(lmdbPath .. 
'/trans', 'trans') local function getData(opts) local opt = opts.opt local audioFile = audio.load(opts.audioFilePath) local spect = audio.spectrogram(audioFile, opt.windowSize * opt.sampleRate, 'hamming', opt.stride * opt.sampleRate) -- freq-by-frames tensor -- put into lmdb spect = spect:float() -- normalize the data local mean = spect:mean() local std = spect:std() spect:add(-mean) spect:div(std) local transcript for line in io.lines(opts.transcriptFilePath) do transcript = line end return { spect, transcript } end for x = 1, opt.processes do local vec = vecs[x] local opts = { audioFilePath = vec[1], transcriptFilePath = vec[2], opt = opt } parallel.children[x]:send({ opts, getData }) end local processCounter = 1 for x = 1, size do local result = parallel.children[processCounter]:receive() local spect, transcript = unpack(result) readerSpect:put(x, spect) readerTrans:put(x, transcript) -- commit buffer if x % 500 == 0 then readerSpect:commit(); readerSpect = dbSpect:txn() readerTrans:commit(); readerTrans = dbTrans:txn() collectgarbage() end if x + opt.processes <= size then local vec = vecs[x + opt.processes] local opts = { audioFilePath = vec[1], transcriptFilePath = vec[2], opt = opt } parallel.children[processCounter]:send({ opts, getData }) end if processCounter == opt.processes then processCounter = 1 else processCounter = processCounter + 1 end xlua.progress(x, size) end closeWriter(dbSpect, readerSpect) closeWriter(dbTrans, readerTrans) end function parent() local function looper() require 'torch' require 'audio' while true do local object = parallel.parent:receive() local opts, code = unpack(object) local result = code(opts) parallel.parent:send(result) collectgarbage() end end parallel.children:exec(looper) createLMDB(dataPath .. '/train', lmdbPath .. '/train') createLMDB(dataPath .. '/test', lmdbPath .. 
'/test') parallel.close() end local ok, err = pcall(parent) if not ok then print(err) parallel.close() end ================================================ FILE: Mapper.lua ================================================ require 'torch' -- construct an object to deal with the mapping local mapper = torch.class('Mapper') function mapper:__init(dictPath) assert(paths.filep(dictPath), dictPath ..' not found') self.alphabet2token = {} self.token2alphabet = {} -- make maps local cnt = 0 for line in io.lines(dictPath) do self.alphabet2token[line] = cnt self.token2alphabet[cnt] = line cnt = cnt + 1 end end function mapper:encodeString(line) line = string.lower(line) local label = {} for i = 1, #line do local character = line:sub(i, i) table.insert(label, self.alphabet2token[character]) end return label end function mapper:decodeOutput(predictions) --[[ Turns the predictions tensor into a list of the most likely tokens NOTE: to compute WER we strip the begining and ending spaces --]] local tokens = {} local blankToken = self.alphabet2token['$'] local preToken = blankToken -- The prediction is a sequence of likelihood vectors local _, maxIndices = torch.max(predictions, 2) maxIndices = maxIndices:float():squeeze() for i=1, maxIndices:size(1) do local token = maxIndices[i] - 1 -- CTC indexes start from 1, while token starts from 0 -- add token if it's not blank, and is not the same as pre_token if token ~= blankToken and token ~= preToken then table.insert(tokens, token) end preToken = token end return tokens end function mapper:tokensToText(tokens) local text = "" for i, t in ipairs(tokens) do text = text .. 
self.token2alphabet[tokens[i]] end return text end ================================================ FILE: ModelEvaluator.lua ================================================ require 'Loader' require 'Mapper' require 'torch' require 'xlua' local threads = require 'threads' require 'SequenceError' local ModelEvaluator = torch.class('ModelEvaluator') local loader function ModelEvaluator:__init(isGPU, datasetPath, mapper, testBatchSize, logsPath) loader = Loader(datasetPath, mapper) self.testBatchSize = testBatchSize self.nbOfTestIterations = math.ceil(loader.size / testBatchSize) self.indexer = indexer(datasetPath, testBatchSize) self.pool = threads.Threads(1, function() require 'Loader' end) self.mapper = mapper self.logsPath = logsPath self.suffix = '_' .. os.date('%Y%m%d_%H%M%S') self.sequenceError = SequenceError() self.input = torch.Tensor() self.isGPU = isGPU if isGPU then self.input = self.input:cuda() end end function ModelEvaluator:runEvaluation(model, verbose, epoch) local spect_buf, label_buf, sizes_buf -- get first batch local inds = self.indexer:nextIndices() self.pool:addjob(function() return loader:nextBatch(inds) end, function(spect, label, sizes) spect_buf = spect label_buf = label sizes_buf = sizes end) if verbose then local f = assert(io.open(self.logsPath .. 'WER_Test' .. self.suffix .. '.log', 'a'), "Could not create validation test logs, does the folder " .. self.logsPath .. " exist?") f:write('======================== BEGIN WER TEST EPOCH: ' .. epoch .. ' =========================\n') f:close() end local evaluationPredictions = {} -- stores the predictions to order for log. 
local cumCER = 0 local cumWER = 0 local numberOfSamples = 0 -- ======================= for every test iteration ========================== for i = 1, self.nbOfTestIterations do -- get buf and fetch next one self.pool:synchronize() local inputsCPU, targets, sizes_array = spect_buf, label_buf, sizes_buf inds = self.indexer:nextIndices() self.pool:addjob(function() return loader:nextBatch(inds) end, function(spect, label, sizes) spect_buf = spect label_buf = label sizes_buf = sizes end) self.input:resize(inputsCPU:size()):copy(inputsCPU) local predictions = model:forward(self.input) if self.isGPU then cutorch.synchronize() end local size = predictions:size(1) for j = 1, size do local prediction = predictions[j] local predict_tokens = self.mapper:decodeOutput(prediction) local targetTranscript = self.mapper:tokensToText(targets[j]) local predictTranscript = self.mapper:tokensToText(predict_tokens) local CER = self.sequenceError:calculateCER(targetTranscript, predictTranscript) local WER = self.sequenceError:calculateWER(targetTranscript, predictTranscript) cumCER = cumCER + CER cumWER = cumWER + WER table.insert(evaluationPredictions, { wer = WER * 100, cer = CER * 100, target = targetTranscript, prediction = predictTranscript }) end numberOfSamples = numberOfSamples + size end local function comp(a, b) return a.wer < b.wer end table.sort(evaluationPredictions, comp) if verbose then for index, eval in ipairs(evaluationPredictions) do local f = assert(io.open(self.logsPath .. 'Evaluation_Test' .. self.suffix .. '.log', 'a')) f:write(string.format("WER = %.2f | CER = %.2f | Text = \"%s\" | Predict = \"%s\"\n", eval.wer, eval.cer, eval.target, eval.prediction)) f:close() end end local averageWER = cumWER / numberOfSamples local averageCER = cumCER / numberOfSamples local f = assert(io.open(self.logsPath .. 'Evaluation_Test' .. self.suffix .. 
'.log', 'a')) f:write(string.format("Average WER = %.2f | CER = %.2f", averageWER * 100, averageCER * 100)) f:close() self.pool:synchronize() -- end the last loading return averageWER, averageCER end ================================================ FILE: Network.lua ================================================ require 'optim' require 'nnx' require 'gnuplot' require 'lfs' require 'xlua' require 'UtilsMultiGPU' require 'Loader' require 'nngraph' require 'Mapper' require 'ModelEvaluator' local suffix = '_' .. os.date('%Y%m%d_%H%M%S') local threads = require 'threads' local Network = {} --Training parameters seed = 10 torch.setdefaulttensortype('torch.FloatTensor') torch.manualSeed(seed) function Network:init(opt) self.fileName = opt.saveFileName self.nGPU = opt.nGPU self.gpu = self.nGPU > 0 if not self.gpu then require 'rnn' else require 'cutorch' require 'cunn' require 'cudnn' require 'BatchBRNNReLU' cutorch.manualSeedAll(seed) end self.trainingSetLMDBPath = opt.trainingSetLMDBPath self.validationSetLMDBPath = opt.validationSetLMDBPath self.logsTrainPath = opt.logsTrainPath or nil self.logsValidationPath = opt.logsValidationPath or nil self.modelTrainingPath = opt.modelTrainingPath or nil self.permuteBatch = opt.permuteBatch or false self:makeDirectories({ self.logsTrainPath, self.logsValidationPath, self.modelTrainingPath }) self.mapper = Mapper(opt.dictionaryPath) self.tester = ModelEvaluator(self.gpu, self.validationSetLMDBPath, self.mapper, opt.validationBatchSize, self.logsValidationPath) self.loadModel = opt.loadModel self.epochSave = opt.epochSave or false -- Saves model every number of iterations. self.maxNorm = opt.maxNorm or 400 -- value chosen by Baidu for english speech. 
-- setting model saving/loading if self.loadModel then assert(opt.loadPath, "loadPath hasn't been given to load model.") self:loadNetwork(opt.loadPath, opt.modelName) else assert(opt.modelName, "Must have given a model to train.") self:prepSpeechModel(opt.modelName, opt) end -- setting online loading self.indexer = indexer(opt.trainingSetLMDBPath, opt.batchSize) self.pool = threads.Threads(1, function() require 'Loader' end) self.logger = optim.Logger(self.logsTrainPath .. 'train' .. suffix .. '.log') self.logger:setNames { 'loss', 'WER', 'CER' } self.logger:style { '-', '-', '-' } end function Network:prepSpeechModel(modelName, opt) local model = require(modelName) self.model = model[1](opt) self.calSize = model[2] end function Network:testNetwork(epoch) self.model:evaluate() local wer, cer = self.tester:runEvaluation(self.model, true, epoch or 1) -- details in log self.model:zeroGradParameters() self.model:training() return wer, cer end function Network:trainNetwork(epochs, optimizerParams) self.model:training() local lossHistory = {} local validationHistory = {} local criterion = nn.CTCCriterion(true) local x, gradParameters = self.model:getParameters() print("Number of parameters: ", gradParameters:size(1)) -- inputs (preallocate) local inputs = torch.Tensor() local sizes = torch.Tensor() if self.gpu then criterion = criterion:cuda() inputs = inputs:cuda() sizes = sizes:cuda() end -- def loading buf and loader local loader = Loader(self.trainingSetLMDBPath, self.mapper) local specBuf, labelBuf, sizesBuf -- load first batch local inds = self.indexer:nextIndices() self.pool:addjob(function() return loader:nextBatch(inds) end, function(spect, label, sizes) specBuf = spect labelBuf = label sizesBuf = sizes end) -- define the feval local function feval(x_new) self.pool:synchronize() -- wait previous loading local inputsCPU, sizes, targets = specBuf, sizesBuf, labelBuf -- move buf to training data inds = self.indexer:nextIndices() -- load next batch whilst training 
self.pool:addjob(function() return loader:nextBatch(inds) end, function(spect, label, sizes) specBuf = spect labelBuf = label sizesBuf = sizes end) inputs:resize(inputsCPU:size()):copy(inputsCPU) -- transfer over to GPU sizes = self.calSize(sizes) local predictions = self.model:forward(inputs) local loss = criterion:forward(predictions, targets, sizes) if loss == math.huge or loss == -math.huge then loss = 0 print("Recieved an inf cost!") end self.model:zeroGradParameters() local gradOutput = criterion:backward(predictions, targets) self.model:backward(inputs, gradOutput) local norm = gradParameters:norm() if norm > self.maxNorm then gradParameters:mul(self.maxNorm / norm) end return loss, gradParameters end -- training local currentLoss local startTime = os.time() for i = 1, epochs do local averageLoss = 0 for j = 1, self.indexer.nbOfBatches do currentLoss = 0 local _, fs = optim.sgd(feval, x, optimizerParams) if self.gpu then cutorch.synchronize() end currentLoss = currentLoss + fs[1] xlua.progress(j, self.indexer.nbOfBatches) averageLoss = averageLoss + currentLoss end if self.permuteBatch then self.indexer:permuteBatchOrder() end averageLoss = averageLoss / self.indexer.nbOfBatches -- Calculate the average loss at this epoch. -- anneal learningRate optimizerParams.learningRate = optimizerParams.learningRate / (optimizerParams.learningRateAnnealing or 1) -- Update validation error rates local wer, cer = self:testNetwork(i) print(string.format("Training Epoch: %d Average Loss: %f Average Validation WER: %.2f Average Validation CER: %.2f", i, averageLoss, 100 * wer, 100 * cer)) table.insert(lossHistory, averageLoss) -- Add the average loss value to the logger. table.insert(validationHistory, 100 * wer) self.logger:add { averageLoss, 100 * wer, 100 * cer } -- periodically save the model if self.epochSave then print("Saving model..") self:saveNetwork(self.modelTrainingPath .. 'model_epoch_' .. i .. suffix .. '_' .. 
self.fileName) end end local endTime = os.time() local secondsTaken = endTime - startTime local minutesTaken = secondsTaken / 60 print("Minutes taken to train: ", minutesTaken) print("Saving model..") self:saveNetwork(self.modelTrainingPath .. 'final_model_' .. suffix .. '_' .. self.fileName) return lossHistory, validationHistory, minutesTaken end function Network:createLossGraph() self.logger:plot() end function Network:saveNetwork(saveName) self.model:clearState() saveDataParallel(saveName, self.model) end --Loads the model into Network. function Network:loadNetwork(saveName, modelName) self.model = loadDataParallel(saveName, self.nGPU) local model = require(modelName) self.calSize = model[2] end function Network:makeDirectories(folderPaths) for index, folderPath in ipairs(folderPaths) do if (folderPath ~= nil) then os.execute("mkdir -p " .. folderPath) end end end return Network ================================================ FILE: Predict.lua ================================================ require 'nn' require 'audio' require 'Mapper' require 'UtilsMultiGPU' local cmd = torch.CmdLine() cmd:option('-modelPath', 'deepspeech.t7', 'Path of model to load') cmd:option('-audioPath', '', 'Path to the input audio to predict on') cmd:option('-dictionaryPath', './dictionary', 'File containing the dictionary to use') cmd:option('-windowSize', 0.02, 'Window Size of audio') cmd:option('-stride', 0.01, 'Stride of audio') cmd:option('-sampleRate', 16000, 'Rate of audio (default 16khz)') cmd:option('-nGPU', 1) local opt = cmd:parse(arg) if opt.nGPU > 0 then require 'cunn' require 'cudnn' require 'BatchBRNNReLU' end local model = loadDataParallel(opt.modelPath, opt.nGPU) local mapper = Mapper(opt.dictionaryPath) local wave = audio.load(opt.audioPath) local spect = audio.spectrogram(wave, opt.windowSize * opt.sampleRate, 'hamming', opt.stride * opt.sampleRate):float() -- freq-by-frames tensor -- normalize the data local mean = spect:mean() local std = spect:std() 
spect:add(-mean) spect:div(std) spect = spect:view(1, 1, spect:size(1), spect:size(2)) if opt.nGPU > 0 then spect = spect:cuda() model = model:cuda() end model:evaluate() local predictions = model:forward(spect) local tokens = mapper:decodeOutput(predictions[1]) local text = mapper:tokensToText(tokens) print(text) ================================================ FILE: README.md ================================================ # deepspeech.torch [![Build Status](https://travis-ci.org/SeanNaren/deepspeech.torch.svg?branch=master)](https://travis-ci.org/SeanNaren/deepspeech.torch) [![Documentation Status](https://readthedocs.org/projects/ctcspeechrecognition/badge/?version=latest)](http://ctcspeechrecognition.readthedocs.io/en/latest/?badge=latest) Implementation of [Baidu Warp-CTC](https://github.com/baidu-research/warp-ctc) using torch7. Creates a network based on the [DeepSpeech2](http://arxiv.org/pdf/1512.02595v1.pdf) architecture using the Torch7 library, trained with the CTC activation function. ## Features * Train large models with large datasets via online loading using [LMDB](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database) and multi-GPU support. * Supports variable length batches via padding. * Implements the [AN4 Audio database](http://www.speech.cs.cmu.edu/databases/an4/) (50 mins of data). Has also been extended to train using the [LibriSpeech](http://www.openslr.org/12/) dataset (1000 hours of data). Custom dataset preparation is explained in documentation. ## Branches There are currently two branches, Master and Phoneme: * Master: This branch trains DeepSpeech2. Also included is an evaluation script which calculates the WER/CER, as well as a prediction script. This branch is useful for understanding how the DeepSpeech and CTC works and is easy to run after installation. Highly recommended to checkout this branch. * Phonemes: This branch is experimental and uses phonemes rather than character based predictions. 
This is fully credited and extended by [CCorfield](https://github.com/CCorfield) and his awesome work in porting to use phonemes. In addition to this I'd like to also thank [Shane Walker](https://github.com/walkers-mv) for his awesome recent conversion to use phonemes as well. ## Installation/Data Preparation/Documentation Follow Instructions/Data Preparation/Documentation found in the wiki [here](https://github.com/SeanNaren/deepspeech.torch/wiki/Installation) to set up and run the code. Technical documentation can be found [here](http://ctcspeechrecognition.readthedocs.io/en/latest/). ## Pre-trained Networks Pre-trained networks are available for AN4 as well as LibriSpeech for CUDA only (since they use cudnn RNNs). Download Links and accuracies are below. DeepSpeech-light is a smaller model which is less intensive to train (based on LSTMs rather than RNNs). ### AN4 **an4Test** |Network | WER | CER |Link | |-----------------|:--------:|:--------:|:--------:| |DeepSpeech-light| N/A | N/A | N/A | |DeepSpeech | 12 | 3.07 | [Download](https://github.com/SeanNaren/deepspeech.torch/releases/download/v1.0/an4_deepspeech.t7) | ### LibriSpeech **Librispeech-test-clean** |Network | WER | CER |Link | |-----------------|:--------:|:--------:|:--------:| |DeepSpeech-light| 15 | 1.34 | [Download](https://github.com/SeanNaren/deepspeech.torch/releases/download/v1.0/libri_deepspeech-light.t7) | |DeepSpeech | 12 | 1.55 | [Download](https://github.com/SeanNaren/deepspeech.torch/releases/download/v1.0/libri_deepspeech.t7) | **Librispeech-test-other** |Network | WER | CER |Link | |-----------------|:--------:|:--------:|:--------:| |DeepSpeech-light| 36 | 3.80 | (Download Above) | |DeepSpeech | 33 | 3.24 | (Download Above) | Once you're set up, you can start training from these nets by using the below parameters (you might need to change the other parameters described in the wiki) after setting the project up: ```lua th Train.lua -loadModel -loadPath /path/to/model.t7 ``` ## 
local SequenceError = torch.class("SequenceError")

-- Splits a string into a table of single characters (byte-wise; fine for the
-- ASCII transcripts used here). Needed because sequenceErrorRate indexes its
-- inputs positionally, and Lua strings do not support numeric indexing:
-- ("abc")[1] is nil, so comparing raw strings would make every position
-- "equal" (nil == nil) and the distance would collapse to the length difference.
local function toCharTable(str)
    local chars = {}
    for char in str:gmatch(".") do
        table.insert(chars, char)
    end
    return chars
end

-- Calculates a sequence error rate (aka Levenshtein edit distance) between
-- two sequences given as tables of tokens.
-- Returns (#insertions + #deletions + #substitutions) / #target.
function SequenceError:sequenceErrorRate(target, prediction)
    -- d[i][j] = edit distance between target[1..i-1] and prediction[1..j-1].
    -- Plain nested tables replace the original torch.Tensor: same values,
    -- no tensor allocation needed.
    local d = {}
    for i = 1, #target + 1 do
        d[i] = {}
        for j = 1, #prediction + 1 do
            if i == 1 then
                d[1][j] = j - 1 -- insert all of prediction
            elseif j == 1 then
                d[i][1] = i - 1 -- delete all of target
            else
                d[i][j] = 0
            end
        end
    end
    for i = 2, #target + 1 do
        for j = 2, #prediction + 1 do
            if target[i - 1] == prediction[j - 1] then
                d[i][j] = d[i - 1][j - 1]
            else
                local substitution = d[i - 1][j - 1] + 1
                local insertion = d[i][j - 1] + 1
                local deletion = d[i - 1][j] + 1
                -- math.min replaces torch.min(torch.Tensor{...}): identical
                -- result without allocating a tensor per DP cell.
                d[i][j] = math.min(substitution, insertion, deletion)
            end
        end
    end
    local errorRate = d[#target + 1][#prediction + 1] / #target
    return errorRate
end

-- Character Error Rate between two transcript strings.
function SequenceError:calculateCER(targetTranscript, predictTranscript)
    -- Fix: convert to character tables first. Passing the raw strings through
    -- made every positional lookup return nil, so all positions compared as
    -- equal and the CER was effectively |length difference| / length.
    return self:sequenceErrorRate(toCharTable(targetTranscript),
        toCharTable(predictTranscript))
end
targetTranscript:gmatch("%S+") do table.insert(targetWords, word) end local predictedWords = {} for word in predictTranscript:gmatch("%S+") do table.insert(predictedWords, word) end return self:sequenceErrorRate(targetWords, predictedWords) end ================================================ FILE: Test.lua ================================================ local Network = require 'Network' -- Load the network from the saved model. Options can be overrided on command line run. local cmd = torch.CmdLine() cmd:option('-loadModel', true, 'Load previously saved model') cmd:option('-saveModel', false, 'Save model after training/testing') cmd:option('-loadPath', 'deepspeech.t7', 'Path of final model to save/load') cmd:option('-modelName', 'DeepSpeechModel', 'Name of class containing architecture') cmd:option('-nGPU', 1, 'Number of GPUs, set -1 to use CPU') cmd:option('-trainingSetLMDBPath', './prepare_datasets/an4_lmdb/train/', 'Path to LMDB training dataset') cmd:option('-validationSetLMDBPath', './prepare_datasets/an4_lmdb/test/', 'Path to LMDB test dataset') cmd:option('-logsTrainPath', './logs/TrainingLoss/', ' Path to save Training logs') cmd:option('-logsValidationPath', './logs/ValidationScores/', ' Path to save Validation logs') cmd:option('-dictionaryPath', './dictionary', ' File containing the dictionary to use') cmd:option('-batchSize', 20, 'Batch size in training') cmd:option('-validationBatchSize', 32, 'Batch size for validation') local opt = cmd:parse(arg) Network:init(opt) print("Testing network...") local wer, cer = Network:testNetwork() print(string.format('Avg WER: %2.f Avg CER: %.2f', 100 * wer, 100 * cer)) print(string.format('More information written to log file at %s', opt.logsValidationPath)) ================================================ FILE: Train.lua ================================================ local Network = require 'Network' -- Options can be overrided on command line run. 
local cmd = torch.CmdLine()
-- Model loading/saving.
cmd:option('-loadModel', false, 'Load previously saved model')
cmd:option('-loadPath', 'deepspeech.t7', 'Path to model to load')
cmd:option('-modelName', 'DeepSpeechModel', 'Name of class containing architecture')
cmd:option('-nGPU', 1, 'Number of GPUs, set -1 to use CPU')
-- Dataset and log locations (LMDB-backed online loading).
cmd:option('-trainingSetLMDBPath', './prepare_datasets/an4_lmdb/train/', 'Path to LMDB training dataset')
cmd:option('-validationSetLMDBPath', './prepare_datasets/an4_lmdb/test/', 'Path to LMDB test dataset')
cmd:option('-logsTrainPath', './logs/TrainingLoss/', ' Path to save Training logs')
cmd:option('-logsValidationPath', './logs/ValidationScores/', ' Path to save Validation logs')
cmd:option('-epochSave', false, 'save model every epoch')
cmd:option('-modelTrainingPath', './models/', ' Path to save periodic training models')
cmd:option('-saveFileName', 'deepspeech.t7', 'Name of model to save as')
cmd:option('-dictionaryPath', './dictionary', ' File containing the dictionary to use')
-- Optimization hyperparameters (consumed by optim.sgd via optimParams below).
cmd:option('-epochs', 70, 'Number of epochs for training')
cmd:option('-learningRate', 3e-4, ' Training learning rate')
cmd:option('-learningRateAnnealing', 1.1, 'Factor to anneal lr every epoch')
cmd:option('-maxNorm', 400, 'Max norm used to normalize gradients')
cmd:option('-momentum', 0.90, 'Momentum for SGD')
-- Batching behaviour.
cmd:option('-batchSize', 20, 'Batch size in training')
cmd:option('-permuteBatch', false, 'Set to true if you want to permute batches AFTER the first epoch')
cmd:option('-validationBatchSize', 20, 'Batch size for validation')
-- Network architecture knobs passed through to DeepSpeechModel.
cmd:option('-LSTM', false, 'Use LSTMs rather than RNNs')
cmd:option('-hiddenSize', 1760, 'RNN hidden sizes')
cmd:option('-nbOfHiddenLayers', 7, 'Number of rnn layers')

local opt = cmd:parse(arg)

--Parameters for the stochastic gradient descent (using the optim library).
-- Converts the model for GPU execution. For nGPU > 1 the model is wrapped in
-- an nn.DataParallelTable so each GPU holds a replica; for nGPU == 1 it is
-- simply converted to cudnn and moved to the GPU; nGPU <= 0 returns the model
-- untouched (CPU).
-- `model` The Torch network model to modify.
-- `nGPU`  Number of GPUs (set <= 0 for CPU).
-- Returns the (possibly wrapped) model.
function makeDataParallel(model, nGPU)
    if nGPU > 0 then
        cudnn.fastest = true
        -- Keep BatchNormalization layers as nn modules; convert the rest to cudnn.
        local function BatchNorm(module)
            return torch.type(module):find('BatchNormalization')
        end
        model = cudnn.convert(model, cudnn, BatchNorm)
        if nGPU > 1 then
            -- Fix: 'gpus' and 'dpt' were accidental globals (missing 'local'),
            -- leaking state into _G.
            local gpus = torch.range(1, nGPU):totable()
            local dpt = nn.DataParallelTable(1):add(model, gpus):threads(function()
                require 'nngraph'
                require 'cudnn'
                cudnn.fastest = true
                require 'BatchBRNNReLU'
            end)
            dpt.gradInput = nil
            model = dpt
        end
        model:cuda()
    end
    return model
end
-- Saves the model to disk, collapsing any DataParallelTable wrappers (via
-- cleanDPT) so that only a single replica is serialized.
-- `modelPath` Location to save the model.
-- `model`     The Torch network model to save.
function saveDataParallel(modelPath, model)
    if torch.type(model) == 'nn.DataParallelTable' then
        torch.save(modelPath, cleanDPT(model))
    elseif torch.type(model) == 'nn.Sequential' then
        -- Rebuild the container, cleaning any nested DataParallelTables.
        local temp_model = nn.Sequential()
        for i, module in ipairs(model.modules) do
            if torch.type(module) == 'nn.DataParallelTable' then
                temp_model:add(cleanDPT(module))
            else
                temp_model:add(module)
            end
        end
        torch.save(modelPath, temp_model)
    elseif torch.type(model) == 'nn.gModule' then
        torch.save(modelPath, model)
    else
        -- Fix: message previously omitted gModule, which is handled above.
        error('This saving function only works with Sequential, gModule or DataParallelTable modules.')
    end
end

-- Loads a model saved by saveDataParallel and re-wraps it for nGPU GPUs via
-- makeDataParallel.
-- `modelPath` Location to load the model from.
-- `nGPU`      Number of GPUs to load onto.
function loadDataParallel(modelPath, nGPU)
    if nGPU > 1 then
        -- Needed before torch.load can deserialize cudnn-backed modules.
        require 'cudnn'
        require 'BatchBRNNReLU'
    end
    local model = torch.load(modelPath)
    if torch.type(model) == 'nn.DataParallelTable' then
        return makeDataParallel(model:get(1):float(), nGPU)
    elseif torch.type(model) == 'nn.Sequential' then
        for i, module in ipairs(model.modules) do
            if torch.type(module) == 'nn.DataParallelTable' then
                model.modules[i] = makeDataParallel(module:get(1):float(), nGPU)
            end
        end
        return model
    elseif torch.type(model) == 'nn.gModule' then
        model = makeDataParallel(model, nGPU)
        return model
    else
        -- Fix: message previously omitted gModule, which is handled above.
        error('The loaded model is not a Sequential, gModule or DataParallelTable module.')
    end
end
### calculateInputSizes(sizes) A function that calculates the sequence sizes after the convolutional layers. Used in the loss calculations in CTC, so the network isn't penalised for the padded sequences. Returns a same sized tensor. `sizes` Real size of each sentence in the training sample as a 1D tensor. ================================================ FILE: doc/Loader.md ================================================ # Loader Defines the indexer class and the loader class, handling batching of the dataset to train the network. ## Indexer Handles returning the next indices of the batch to load into memory, to train the network with. ### indexer:__init(_dir, batchSize) `dirPath` Directory containing the LMDB data folders for spectrogram, labels and transcripts. `batchSize` The sizes of each batch to create. ### indexer:nextIndices() Retrieves the next indices that need to be loaded by the loader from the LMDB dataset. ### indexer:permuteBatchOrder() Permutes the batch order randomly. This is for the net to not train in sequence order every time. ## Loader Loads batches of data from LMDB files used in training/testing. ### Loader:__init(dirPath) `dirPath` Directory containing the LMDB data folders for spectrogram, labels and transcripts. ### Loader:nextBatch(indices) Returns the next batch of the dataset based on the given indices. `indices` The indices of the test samples that need to be retrieved. This is handled by the Indexer class above. ================================================ FILE: doc/Mapper.md ================================================ # Mapper Defines how numeric indices are mapped to tokens and vice versa. ### Mapper:__init(dictPath) Creates mappings based on the given dictionary file. The AN4 dictionary file can be seen [here](https://github.com/SeanNaren/deepspeech.torch/blob/master/dictionary). ### Mapper:encodeString(string) Converts string into a set of tokens to be used as a label in training. `string` string to be converted. 
`isGPU` Whether to use the GPU (CUDA) or CPU.
```lua local networkParams = { loadModel = false, -- Set to true if loading a model into the Network class rather than training. saveModel = true, -- Set to true if saving the model after training. modelName = 'DeepSpeechModel', -- The name of the lua class containing the network architecture nGPU = 1, -- Number of GPUs, set -1 to use CPU trainingSetLMDBPath = './prepare_an4/train/', -- online loading path from the LMDB dataset for training. validationSetLMDBPath = './prepare_an4/test/', -- online loading path from the LMDB dataset for testing. logsTrainPath = './logs/TrainingLoss/', -- Where training logs will be stored. logsValidationPath = './logs/ValidationScores/', -- Where testing score logs will be stored. modelTrainingPath = './models/', -- Where models will be stored on saving. modelPath = 'CTCNetwork.t7', dictionaryPath = './dictionary', -- Contains the alphabet/characters that we are to predict on. batchSize = 20, -- The sizes of batches that we are passing into the network in training. validationBatchSize = 1, -- Validation batch sizes (should be kept at 1, since we pass 1 sample at a time). validationIterations = 20, -- Number of validation iterations (kept small, because we only want to run a few tests per epoch). saveModelInTraining = false, -- saves model periodically through training saveModelIterations = 50 -- If saveModelInTraining set to true, we save every 50 epochs. } ``` ### Network:prepSpeechModel(modelName, opt) Used to create the model via the defined modelName and options. ### Network:testNetwork(epoch) Tests the current stored model via the word error rate. `epoch` can be used to detail the epoch number in the logs when testing scores are stored. ### Network:trainNetwork(epochs, sgd_params) Trains a network stored in the `Network` class. Uses multiple threads in an online loading fashion to load the data from the disk. 
Calculates sequence error rates between target and predicted transcripts, exposing both the Word Error Rate and the Character Error Rate.
`model` The Torch network model to modify for configured GPUs. `nGPU` Number of GPUs. ### saveDataParallel(modelPath, model) Saves the model to disk. `modelPath` Location to save the model. `model` The Torch network model to save. ### loadDataParallel(modelPath, nGPU) Loads a model saved using the above methods. `modelPath` Location to load the model. `nGPU` Number of GPUs to load to. ================================================ FILE: doc/index.md ================================================ # Technical Documentation Below are a few classes that have been documented, explaining their purpose and functions available. ## Classes * [Network](Network.md) * [DeepSpeechModel](DeepSpeechModel.md) * [Mapper](Mapper.md) * [Evaluator](Evaluator.md) * [ModelEvaluator](ModelEvaluator.md) * [Utils](Utils.md) * [UtilsMultiGPU](UtilsMultiGPU.md) * [Loader](Loader.md) ================================================ FILE: mkdocs.yml ================================================ site_name: CTCSpeechRecognition theme : simplex repo_url : https://github.com/SeanNaren/CTCSpeechRecognition use_directory_urls : false markdown_extensions: [extra] docs_dir : doc pages: - [index.md, Home] - [Network.md, Network] - [DeepSpeechModel.md, DeepSpeechModel] - [Mapper.md, Mapper] - [SequenceError.md, SequenceError] - [ModelEvaluator.md, ModelEvaluator] - [UtilsMultiGPU.md, UtilsMultiGPU] - [Loader.md, Loader] ================================================ FILE: prepare_datasets/FormatAN4.lua ================================================ require 'torch' local cmd = torch.CmdLine() cmd:option('-rootPath', 'an4', 'Path to the an4 root') cmd:option('-newPath', 'an4_dataset', 'Path to the new data path') cmd:option('-audioExtension', 'sph', 'The extension of the audio files (wav/mp3/sph/etc)') cmd:option('-move', false, 'Moves the files over rather than copies, used to save space') local opt = cmd:parse(arg) local an4TestPath = opt.rootPath .. '/etc/an4_test.' 
-- strips down the transcripts into pure text
-- An AN4 transcription line looks like: "<s> HELLO WORLD </s> (an4-file-id)".
-- Fix: the first two gsub calls were empty no-ops, gsub('', ''); they
-- originally removed the <s> and </s> sentence markers (the tags were lost in
-- a text-extraction step), leaving the markers in the output. Restored here.
local function processText(line)
    local text = line:gsub('<s>', ''):gsub('</s>', '')
        :gsub('^%s', ''):gsub('%(.*%)', ''):gsub('%s*$', '')
    return text
end

-- Creates a flat dataset directory: for each utterance, writes a .txt file
-- with the clean transcript and copies (or moves, with -move) the matching
-- audio file next to it.
-- `pathToAN4`    Prefix of the an4 etc/ files ('fileids' / 'transcription' appended).
-- `an4AudioPath` Root of the an4 audio tree.
-- `newPath`      Output directory (created here).
local function createDataset(pathToAN4, an4AudioPath, newPath)
    sys.execute("mkdir " .. newPath)
    local fileids = pathToAN4 .. 'fileids'
    local transcripts = pathToAN4 .. 'transcription'
    -- fileids and transcription files are line-aligned: line N of one
    -- describes the same utterance as line N of the other.
    local filePaths = {}
    for filePath in io.lines(fileids) do
        table.insert(filePaths, filePath)
    end
    local counter = 1
    for line in io.lines(transcripts) do
        local text = processText(line)
        local filePath = filePaths[counter]
        -- new filename extracted from an4 file id
        local fileName = sys.split(filePath, '/')[3] -- last part is the filename
        -- create new text file with clean transcript
        local textPath = newPath .. '/' .. fileName .. '.txt'
        local file = io.open(textPath, "w")
        file:write(text)
        file:close()
        -- move audio to correct place
        local audioPath = an4AudioPath .. '/' .. filePath .. '.' .. opt.audioExtension
        -- Renamed from 'newPath': the original local shadowed the function
        -- parameter of the same name inside the loop.
        local newAudioPath = newPath .. '/' .. fileName .. '.' .. opt.audioExtension
        local command
        if opt.move then
            command = "mv "
        else
            command = "cp "
        end
        sys.execute(command .. audioPath .. ' ' .. newAudioPath)
        counter = counter + 1
    end
end
-- strips down the transcripts into pure text
-- Removes every character that is not an ASCII letter or a space.
local function processText(line)
    -- The extra parentheses truncate gsub's (string, count) result pair to
    -- just the cleaned string.
    return (line:gsub('[^a-zA-Z ]', ''))
end
'" -maxdepth 1 -mindepth 1 -type d') for dir in p:lines() do local transcripts = io.popen("find -L " .. dir .. " -type f -name '*.txt'") for transcript in transcripts:lines() do for line in io.lines(transcript) do threads:addjob(function() formatData(line, dir) end, function() counter = counter + 1 xlua.progress(counter, size) end) end end end end sys.execute("mkdir " .. opt.newPath) createDataset(libriTrainPath, opt.newPath .. '/train/') createDataset(libriTestPath, opt.newPath .. '/test/') ================================================ FILE: tests/test.lua ================================================ require 'nn' local test = torch.TestSuite() local mytester require '../SequenceError' require '../Mapper' local sequenceError = SequenceError() function test.evaluator() -- Calculates WER, (nbOfInsertions + nbOfDeletions + nbOfSubstitutions) / nbOfWords local target = "test a sentence" local prediction = "a sentence" local deletion = sequenceError:calculateWER(target, prediction) local prediction = "test a sentence inserted" local insertion = sequenceError:calculateWER(target, prediction) local prediction = "test substituted sentence" local substitution = sequenceError:calculateWER(target, prediction) local oneMistakeWER = 1 / 3 -- One insertion/deletion/substitution / number of words mytester:eq(deletion, oneMistakeWER, 'WER with deletion was incorrect') mytester:eq(insertion, oneMistakeWER, 'WER with insertion was incorrect') mytester:eq(substitution, oneMistakeWER, 'WER with substitution was incorrect') local prediction = "a" local deletion = sequenceError:calculateWER(target, prediction) local prediction = "a wrong" local deletionAndSubstitution = sequenceError:calculateWER(target, prediction) local prediction = "wrong a sentence inserted" local substitionAndInsertion = sequenceError:calculateWER(target, prediction) local twoMistakeWER = 2 / 3 -- Two errors of insertion/deletion/substitution / number of words mytester:eq(deletion, twoMistakeWER, 'masking of 
-- Verifies that Mapper builds the expected character -> token mapping from
-- the test dictionary (token indices are zero-based, following the dictionary
-- line order).
function test.mapper()
    local mapper = Mapper('test_dictionary')
    local alphabet = {
        '$', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
        'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' ', '\''
    }
    local expectedMapping = {}
    for index, letter in ipairs(alphabet) do
        expectedMapping[letter] = index - 1
    end
    mytester:eq(mapper.alphabet2token, expectedMapping)
end

-- Decodes a small hand-built prediction matrix and checks both the token
-- sequence and the text produced via the test dictionary.
function test.mapperDecode()
    local mapper = Mapper('test_dictionary')
    local predictions = torch.Tensor({ { 1, 2, 3 }, { 2, 3, 1 }, { 1, 2, 3 } })
    local tokens = mapper:decodeOutput(predictions)
    mytester:eq(tokens, { 2, 1, 2 })
    mytester:eq(mapper:tokensToText(tokens), 'bab')
end

mytester = torch.Tester()
mytester:add(test)
mytester:run()