Repository: SeanNaren/deepspeech.torch
Branch: master
Commit: 26d24fa5a805
Files: 31
Total size: 58.2 KB
Directory structure:
gitextract_kmme0l_o/
├── .gitignore
├── .travis.yml
├── BatchBRNN.lua
├── BatchBRNNReLU.lua
├── DeepSpeechModel.lua
├── LICENSE.md
├── Loader.lua
├── MakeLMDB.lua
├── Mapper.lua
├── ModelEvaluator.lua
├── Network.lua
├── Predict.lua
├── README.md
├── SequenceError.lua
├── Test.lua
├── Train.lua
├── UtilsMultiGPU.lua
├── dictionary
├── doc/
│ ├── DeepSpeechModel.md
│ ├── Loader.md
│ ├── Mapper.md
│ ├── ModelEvaluator.md
│ ├── Network.md
│ ├── SequenceError.md
│ ├── UtilsMultiGPU.md
│ └── index.md
├── mkdocs.yml
├── prepare_datasets/
│ ├── FormatAN4.lua
│ └── FormatLibriSpeech.lua
└── tests/
├── test.lua
└── test_dictionary
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
.idea/
Audio
systemtests/
systemtests
CTC.iml
CTCSpeechRecognition.iml
*.log
*.log.eps
*.t7
Seq2Seq/
================================================
FILE: .travis.yml
================================================
language: c
notifications:
email: false
compiler:
- gcc
- clang
cache:
directories:
- $HOME/OpenBlasInstall
sudo: false
env:
- TORCH_LUA_VERSION=LUAJIT21
- TORCH_LUA_VERSION=LUA51
- TORCH_LUA_VERSION=LUA52
addons:
apt:
packages:
- cmake
- gfortran
- gcc-multilib
- gfortran-multilib
- liblapack-dev
- build-essential
- gcc
- g++
- curl
- cmake
- libreadline-dev
- git-core
- libqt4-core
- libqt4-gui
- libqt4-dev
- libjpeg-dev
- libpng-dev
- ncurses-dev
- imagemagick
- libzmq3-dev
- gfortran
- unzip
- gnuplot
- gnuplot-x11
before_script:
- export ROOT_TRAVIS_DIR=$(pwd)
- export INSTALL_PREFIX=~/torch/install
- ls $HOME/OpenBlasInstall/lib || (cd /tmp/ && git clone https://github.com/xianyi/OpenBLAS.git -b master && cd OpenBLAS && (make NO_AFFINITY=1 -j$(getconf _NPROCESSORS_ONLN) 2>/dev/null >/dev/null) && make PREFIX=$HOME/OpenBlasInstall install)
- git clone https://github.com/torch/distro.git ~/torch --recursive
- cd ~/torch && git submodule update --init --recursive
- mkdir build && cd build
- export CMAKE_LIBRARY_PATH=$HOME/OpenBlasInstall/include:$HOME/OpenBlasInstall/lib:$CMAKE_LIBRARY_PATH
- cmake .. -DCMAKE_INSTALL_PREFIX="${INSTALL_PREFIX}" -DCMAKE_BUILD_TYPE=Release -DWITH_${TORCH_LUA_VERSION}=ON
- make && make install
- cd $ROOT_TRAVIS_DIR
- export LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib:$LD_LIBRARY_PATH
- ${INSTALL_PREFIX}/bin/luarocks install nn
- ${INSTALL_PREFIX}/bin/luarocks install dpnn
script:
- cd ${ROOT_TRAVIS_DIR}
- git clone https://github.com/SeanNaren/CTCSpeechRecognition.git
- cd CTCSpeechRecognition/tests
- rsync -av --progress ../* ${INSTALL_PREFIX}/share/lua/5.1/ --exclude ../tests --exclude ../prepare_an4/
- rsync -av --progress ../* ${INSTALL_PREFIX}/share/lua/5.2/ --exclude ../tests --exclude ../prepare_an4/
- export PATH=${INSTALL_PREFIX}/bin:$PATH
- export TESTLUA=$(which luajit lua | head -n 1)
- echo ${TESTLUA}
- ${TESTLUA} test.lua
================================================
FILE: BatchBRNN.lua
================================================
------------------------------------------------------------------------
--[[ BatchBRNN ]] --
-- Adds sequence-wise batch normalization to cudnn RNN modules.
-- For a simple RNN: ht = ReLU(B(Wixt) + Riht-1 + bRi) where B
-- is the batch normalization.
-- Expects size seqLength x minibatch x inputDim.
-- Returns seqLength x minibatch x outputDim.
-- Can specify an rnnModule such as cudnn.LSTM (defaults to RNNReLU).
------------------------------------------------------------------------
-- BatchBRNN: bidirectional RNN with sequence-wise batch normalization,
-- assembled as an nn.Sequential pipeline (see header comment above).
local BatchBRNN, parent = torch.class('cudnn.BatchBRNN', 'nn.Sequential')

--- Build the Linear -> BN -> bidirectional-RNN pipeline.
-- @param inputDim  feature size of each timestep fed to the module
-- @param outputDim hidden size of the RNN (also the module's output width)
function BatchBRNN:__init(inputDim, outputDim)
    parent.__init(self)
    -- The view sizes given here are placeholders; they are resized to the
    -- actual (seqLength, batch) shape on every forward in updateOutput.
    self.view_in = nn.View(1, 1, -1):setNumInputDims(3)
    self.view_out = nn.View(1, -1):setNumInputDims(2)
    -- SKIP_INPUT: the RNN applies no input projection of its own, so the
    -- nn.Linear below must already produce outputDim features.
    self.rnn = cudnn.RNN(outputDim, outputDim, 1)
    local rnn = self.rnn
    rnn.inputMode = 'CUDNN_SKIP_INPUT'
    rnn.bidirectional = 'CUDNN_BIDIRECTIONAL'
    rnn.numDirections = 2
    rnn:reset() -- re-allocate weights now that the mode flags changed
    self:add(self.view_in)                          -- flatten to (T*N) x inputDim
    self:add(nn.Linear(inputDim, outputDim, false)) -- no bias; BN supplies the shift
    self:add(nn.BatchNormalization(outputDim))      -- sequence-wise BN over all T*N rows
    self:add(self.view_out)                         -- back to T x N x outputDim
    self:add(rnn)                                   -- yields T x N x (2*outputDim)
    -- Separate the two directions and sum them so the output stays outputDim wide.
    self:add(nn.View(-1, 2, outputDim):setNumInputDims(2))
    self:add(nn.Sum(3))
end
--- Forward pass: resize the internal views to the current batch shape, then
-- delegate to nn.Sequential.
-- @param input tensor of shape seqLength x miniBatch x inputDim
-- @return tensor of shape seqLength x miniBatch x outputDim
function BatchBRNN:updateOutput(input)
    local T, N = input:size(1), input:size(2)
    self.view_in:resetSize(T * N, -1)  -- merge time and batch for Linear/BN
    self.view_out:resetSize(T, N, -1)  -- restore time-major layout for the RNN
    return parent.updateOutput(self, input)
end
--- Human-readable description of the wrapped module chain, with nested
-- module descriptions indented one level.
function BatchBRNN:__tostring__()
    local tab = ' '
    local line = '\n'
    local arrow = ' -> ' -- renamed from `next` to avoid shadowing the builtin
    local parts = { 'BatchBRNN', ' {', line, tab, '[input' }
    for idx = 1, #self.modules do
        parts[#parts + 1] = arrow .. '(' .. idx .. ')'
    end
    parts[#parts + 1] = arrow .. 'output]'
    for idx = 1, #self.modules do
        -- parentheses truncate gsub to its first return value (the string)
        parts[#parts + 1] = line .. tab .. '(' .. idx .. '): '
            .. (tostring(self.modules[idx]):gsub(line, line .. tab))
    end
    parts[#parts + 1] = line .. '}'
    return table.concat(parts)
end
================================================
FILE: BatchBRNNReLU.lua
================================================
require 'BatchBRNN'
------------------------------------------------------------------------
--[[ BatchBRNNReLU ]] --
-- Based On BatchBRNN. Adds ClippedReLU non-linearity to Vanilla BRNN.
------------------------------------------------------------------------
local BatchBRNNReLU, parent = torch.class('cudnn.BatchBRNNReLU', 'cudnn.BatchBRNN')

--- Same pipeline as BatchBRNN but with a plain ReLU RNN cell and a clipped
-- ReLU (clamp to [0, 20], as in DeepSpeech2) inserted after the RNN.
-- @param inputDim  feature size of each timestep
-- @param outputDim hidden size of the RNN
function BatchBRNNReLU:__init(inputDim, outputDim)
    parent.__init(self, inputDim, outputDim)
    local rnn = self.rnn
    rnn.mode = 'CUDNN_RNN_RELU'
    rnn:reset() -- re-initialise weights for the new cell type
    -- Position 6 places the clamp directly after the cudnn.RNN module in the
    -- parent's Sequential (view_in, Linear, BN, view_out, rnn, <here>, ...).
    self:insert(nn.Clamp(0, 20), 6)
end
================================================
FILE: DeepSpeechModel.lua
================================================
require 'UtilsMultiGPU'
--- Construct one recurrent layer appropriate for the configured device.
-- GPU: either a cudnn BLSTM (forward/backward activations summed) or the
-- batch-normalised ReLU BRNN. CPU: falls back to the rnn package's SeqBRNN.
-- All variants consume and produce seqLength x batch x features tensors.
-- @param inputDim  input feature size
-- @param hiddenDim hidden (and output) feature size
-- @param opt       options table; reads opt.nGPU and opt.LSTM
local function RNNModule(inputDim, hiddenDim, opt)
    if opt.nGPU > 0 then
        if opt.LSTM then
            local blstm = nn.Sequential()
            blstm:add(cudnn.BLSTM(inputDim, hiddenDim, 1))
            -- Split the two directions and sum them so the layer's output
            -- width stays hiddenDim.
            blstm:add(nn.View(-1, 2, hiddenDim):setNumInputDims(2)) -- have to sum activations
            blstm:add(nn.Sum(3))
            return blstm
        else
            require 'BatchBRNNReLU'
            return cudnn.BatchBRNNReLU(inputDim, hiddenDim)
        end
    else
        require 'rnn'
        return nn.SeqBRNN(inputDim, hiddenDim)
    end
end
-- Creates the convnet+rnn structure.
--- Assemble the full DeepSpeech2-style network:
-- two strided convolutions -> stack of bidirectional RNNs -> linear classifier.
-- @param opt options table; reads hiddenSize, nbOfHiddenLayers, nGPU, LSTM
-- @return model taking batch x 1 x freq x time spectrograms and returning
--         batch x seqLength x 29 class activations (multi-GPU wrapped)
local function deepSpeech(opt)
    local conv = nn.Sequential()
    -- (nInputPlane, nOutputPlane, kW, kH, [dW], [dH], [padW], [padH]) conv layers.
    conv:add(nn.SpatialConvolution(1, 32, 11, 41, 2, 2))
    conv:add(nn.SpatialBatchNormalization(32))
    conv:add(nn.Clamp(0, 20)) -- clipped ReLU, as in DeepSpeech2
    conv:add(nn.SpatialConvolution(32, 32, 11, 21, 2, 1))
    conv:add(nn.SpatialBatchNormalization(32))
    conv:add(nn.Clamp(0, 20))
    local rnnInputsize = 32 * 41 -- based on the above convolutions and 16khz audio.
    local rnnHiddenSize = opt.hiddenSize -- size of rnn hidden layers
    local nbOfHiddenLayers = opt.nbOfHiddenLayers
    conv:add(nn.View(rnnInputsize, -1):setNumInputDims(3)) -- batch x features x seqLength
    conv:add(nn.Transpose({ 2, 3 }, { 1, 2 })) -- seqLength x batch x features
    local rnns = nn.Sequential()
    -- First layer adapts conv features to the hidden size; the remaining
    -- layers are independent clones of one hidden-to-hidden module, each
    -- preceded by batch normalization applied via nn.Bottle over the
    -- flattened time*batch rows.
    local rnnModule = RNNModule(rnnInputsize, rnnHiddenSize, opt)
    rnns:add(rnnModule:clone())
    rnnModule = RNNModule(rnnHiddenSize, rnnHiddenSize, opt)
    for i = 1, nbOfHiddenLayers - 1 do
        rnns:add(nn.Bottle(nn.BatchNormalization(rnnHiddenSize), 2))
        rnns:add(rnnModule:clone())
    end
    -- 29 output classes -- presumably letters + space + apostrophe + CTC
    -- blank; TODO confirm against the dictionary file.
    local fullyConnected = nn.Sequential()
    fullyConnected:add(nn.BatchNormalization(rnnHiddenSize))
    fullyConnected:add(nn.Linear(rnnHiddenSize, 29))
    local model = nn.Sequential()
    model:add(conv)
    model:add(rnns)
    model:add(nn.Bottle(fullyConnected, 2))
    model:add(nn.Transpose({1, 2})) -- batch x seqLength x features
    model = makeDataParallel(model, opt.nGPU)
    return model
end
--- Map raw spectrogram frame counts to the sequence lengths produced by the
-- two convolutions in deepSpeech (kernel width 11, stride 2, no padding).
-- @param sizes tensor of per-utterance frame counts
-- @return tensor of post-convolution sequence lengths
local function calculateInputSizes(sizes)
    -- out = floor((in - kernel) / stride + 1), applied once per conv layer
    for _ = 1, 2 do
        sizes = torch.floor((sizes - 11) / 2 + 1)
    end
    return sizes
end

-- Expose the builder and the size-mapping helper as a pair.
return { deepSpeech, calculateInputSizes }
================================================
FILE: LICENSE.md
================================================
The MIT License (MIT)
Copyright (c) 2016 Sean Naren
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: Loader.lua
================================================
require 'nn'
require 'torch'
require 'lmdb'
require 'xlua'
require 'paths'
require 'Mapper'
local tds = require 'tds'
torch.setdefaulttensortype('torch.FloatTensor')
-- indexer: hands out batches of LMDB record indices in (optionally permuted)
-- batch order, cycling forever.
local indexer = torch.class('indexer')

--- Open the dataset LMDBs just long enough to read their sizes, then
-- precompute the per-batch index ranges.
-- @param dirPath   directory containing the 'spect' and 'trans' LMDBs
-- @param batchSize number of samples per batch
function indexer:__init(dirPath, batchSize)
    local dbSpect = lmdb.env { Path = dirPath .. '/spect', Name = 'spect' }
    local dbTrans = lmdb.env { Path = dirPath .. '/trans', Name = 'trans' }
    self.batchSize = batchSize
    self.count = 1 -- position within batchIndices; wraps in nextIndices()
    -- get the size of lmdb
    dbSpect:open()
    dbTrans:open()
    local audioLMDBSize = dbSpect:stat()['entries']
    local transcriptLMDBSize = dbTrans:stat()['entries']
    self.size = audioLMDBSize
    dbSpect:close()
    dbTrans:close()
    self.nbOfBatches = math.ceil(self.size / self.batchSize)
    assert(audioLMDBSize == transcriptLMDBSize, 'Audio and transcript LMDBs had different lengths!')
    assert(self.size > self.batchSize, 'batchSize larger than lmdb size!')
    -- inds[k] holds the record indices of batch k; the final batch may be
    -- smaller than batchSize.
    self.inds = torch.range(1, self.size):split(batchSize)
    self.batchIndices = torch.range(1, self.nbOfBatches)
end
--- Return the record indices of the next batch, wrapping around after the
-- last one. Honours any ordering installed by permuteBatchOrder().
function indexer:nextIndices()
    if self.count > #self.inds then
        self.count = 1
    end
    local batch = self.inds[self.batchIndices[self.count]]
    self.count = self.count + 1
    return batch
end
--- Shuffle the order in which batches are served (batch contents unchanged).
function indexer:permuteBatchOrder()
    self.batchIndices = torch.randperm(self.nbOfBatches)
end
-- Loader: reads batches of spectrograms and transcripts out of the LMDBs.
local Loader = torch.class('Loader')

--- @param dirPath directory containing the 'spect' and 'trans' LMDBs
-- @param mapper   Mapper used to encode transcripts into token labels
function Loader:__init(dirPath, mapper)
    self.dbSpect = lmdb.env { Path = dirPath .. '/spect', Name = 'spect' }
    self.dbTrans = lmdb.env { Path = dirPath .. '/trans', Name = 'trans' }
    -- open briefly just to record the number of entries
    self.dbSpect:open()
    self.size = self.dbSpect:stat()['entries']
    self.dbSpect:close()
    self.mapper = mapper
end
--- Load one batch from the LMDBs, zero-padding every spectrogram to the
-- longest utterance in the batch. Samples are drawn in a fresh random order
-- on each call.
-- @param indices 1D tensor of record indices to load
-- @return inputs      batch x 1 x freq x maxLength padded spectrogram tensor
-- @return targets     list of encoded label tables (one per sample)
-- @return sizes       tensor of original (unpadded) frame counts per sample
-- @return transcripts list of raw transcript strings
function Loader:nextBatch(indices)
    local tensors = tds.Vec()
    local targets = {}
    local transcripts = {}
    local maxLength = 0
    local freq = 0 -- spectrogram height; taken from the last sample read
    self.dbSpect:open(); local readerSpect = self.dbSpect:txn(true) -- readonly
    self.dbTrans:open(); local readerTrans = self.dbTrans:txn(true)
    local size = indices:size(1)
    -- NOTE(review): relies on #indices of a torch tensor matching the batch
    -- length; indices:size(1) would be the unambiguous spelling -- confirm.
    local sizes = torch.Tensor(#indices)
    local permutedIndices = torch.randperm(size) -- batch tensor has different order each time
    -- reads out a batch and store in lists
    for x = 1, size do
        local ind = indices[permutedIndices[x]]
        local tensor = readerSpect:get(ind):float()
        local transcript = readerTrans:get(ind)
        freq = tensor:size(1)
        sizes[x] = tensor:size(2)
        if maxLength < tensor:size(2) then maxLength = tensor:size(2) end -- find the max len in this batch
        tensors:insert(tensor)
        table.insert(targets, self.mapper:encodeString(transcript))
        table.insert(transcripts, transcript)
    end
    -- copy each spectrogram into a zero-padded, fixed-size batch tensor
    local inputs = torch.Tensor(size, 1, freq, maxLength):zero()
    for ind, tensor in ipairs(tensors) do
        inputs[ind][1]:narrow(2, 1, tensor:size(2)):copy(tensor)
    end
    readerSpect:abort(); self.dbSpect:close()
    readerTrans:abort(); self.dbTrans:close()
    return inputs, targets, sizes, transcripts
end
================================================
FILE: MakeLMDB.lua
================================================
-- Expects data in the format of <root><train/test><datasetname><filename.wav/filename.txt>
-- Creates an LMDB of everything in these folders into a train and test set.
require 'lfs'
require 'audio'
require 'xlua'
require 'lmdb'
require 'torch'
require 'parallel'
local tds = require 'tds'
-- Command-line configuration for the LMDB build.
local cmd = torch.CmdLine()
cmd:option('-rootPath', 'prepare_datasets/an4_dataset', 'Path to the data')
cmd:option('-lmdbPath', 'prepare_datasets/an4_lmdb', 'Path to save LMDBs to')
cmd:option('-windowSize', 0.02, 'Window size for audio data')
cmd:option('-stride', 0.01, 'Stride for audio data')
cmd:option('-sampleRate', 16000, 'Sample rate of audio data (Default 16khz)')
cmd:option('-audioExtension', 'sph', 'The extension of the audio files (wav/mp3/sph/etc)')
cmd:option('-processes', 8, 'Number of processes used to create LMDB')
local opt = cmd:parse(arg)
local dataPath = opt.rootPath
local lmdbPath = opt.lmdbPath
local extension = '.' .. opt.audioExtension
-- Fork the worker pool up-front; children idle until parent() sends jobs.
parallel.nfork(opt.processes)
--- Open an LMDB environment at `path` and begin a write transaction.
-- @return the environment and its open transaction
local function startWriter(path, name)
    local env = lmdb.env { Path = path, Name = name }
    env:open()
    return env, env:txn()
end
--- Commit the outstanding transaction, then close its LMDB environment.
local function closeWriter(env, transaction)
    transaction:commit()
    env:close()
end
--- Scan `dataPath` for audio files, sort them by spectrogram length (so that
-- later batches of neighbouring records need minimal padding), then write
-- spectrogram/transcript pairs into two LMDBs under `lmdbPath`. All heavy
-- work is farmed out to the `parallel` child processes started in parent().
-- @param dataPath directory tree of audio files with sibling .txt transcripts
-- @param lmdbPath output directory; creates '/spect' and '/trans' LMDBs
local function createLMDB(dataPath, lmdbPath)
    local vecs = tds.Vec()
    local size = tonumber(sys.execute("find " .. dataPath .. " -type f -name '*'" .. extension .. " | wc -l "))
    vecs:resize(size)
    local files = io.popen("find -L " .. dataPath .. " -type f -name '*" .. extension .. "'")
    local counter = 1
    print("Retrieving sizes for sorting...")
    local buffer = tds.Vec()
    buffer:resize(size)
    for file in files:lines() do
        buffer[counter] = file
        counter = counter + 1
    end
    -- Runs inside a child process: returns the audio path, the matching
    -- transcript path and the spectrogram frame count used as sort key.
    local function getSize(opts)
        local audioFilePath = opts.file
        local transcriptFilePath = opts.file:gsub(opts.extension, ".txt")
        local opt = opts.opt
        local audioFile = audio.load(audioFilePath)
        local length = audio.spectrogram(audioFile, opt.windowSize * opt.sampleRate, 'hamming', opt.stride * opt.sampleRate):size(2)
        return { audioFilePath, transcriptFilePath, length }
    end
    -- Prime every worker with one job...
    for x = 1, opt.processes do
        local opts = { extension = extension, file = buffer[x], opt = opt }
        parallel.children[x]:send({ opts, getSize })
    end
    -- ...then collect results round-robin, handing each worker its next file
    -- as soon as it reports back.
    local processCounter = 1
    for x = 1, size do
        local result = parallel.children[processCounter]:receive()
        vecs[x] = tds.Vec(unpack(result))
        xlua.progress(x, size)
        if x % 1000 == 0 then collectgarbage() end
        -- send next index to retrieve
        if x + opt.processes <= size then
            local opts = { extension = extension, file = buffer[x + opt.processes], opt = opt }
            parallel.children[processCounter]:send({ opts, getSize })
        end
        if processCounter == opt.processes then
            processCounter = 1
        else
            processCounter = processCounter + 1
        end
    end
    print("Sorting...")
    -- sort the files by length
    local function comp(a, b) return a[3] < b[3] end
    vecs:sort(comp)
    local size = #vecs -- re-read after sorting; shadows the count above
    print("Creating LMDB dataset to: " .. lmdbPath)
    -- start writing
    local dbSpect, readerSpect = startWriter(lmdbPath .. '/spect', 'spect')
    local dbTrans, readerTrans = startWriter(lmdbPath .. '/trans', 'trans')
    -- Runs inside a child process: loads one audio file, computes a
    -- mean/std-normalised spectrogram and reads the transcript file
    -- (keeping only its last line).
    local function getData(opts)
        local opt = opts.opt
        local audioFile = audio.load(opts.audioFilePath)
        local spect = audio.spectrogram(audioFile, opt.windowSize * opt.sampleRate, 'hamming', opt.stride * opt.sampleRate) -- freq-by-frames tensor
        -- put into lmdb
        spect = spect:float()
        -- normalize the data
        local mean = spect:mean()
        local std = spect:std()
        spect:add(-mean)
        spect:div(std)
        local transcript
        for line in io.lines(opts.transcriptFilePath) do
            transcript = line
        end
        return { spect, transcript }
    end
    for x = 1, opt.processes do
        local vec = vecs[x]
        local opts = { audioFilePath = vec[1], transcriptFilePath = vec[2], opt = opt }
        parallel.children[x]:send({ opts, getData })
    end
    local processCounter = 1
    for x = 1, size do
        local result = parallel.children[processCounter]:receive()
        local spect, transcript = unpack(result)
        readerSpect:put(x, spect)
        readerTrans:put(x, transcript)
        -- commit buffer every 500 records to bound transaction size
        if x % 500 == 0 then
            readerSpect:commit(); readerSpect = dbSpect:txn()
            readerTrans:commit(); readerTrans = dbTrans:txn()
            collectgarbage()
        end
        if x + opt.processes <= size then
            local vec = vecs[x + opt.processes]
            local opts = { audioFilePath = vec[1], transcriptFilePath = vec[2], opt = opt }
            parallel.children[processCounter]:send({ opts, getData })
        end
        if processCounter == opt.processes then
            processCounter = 1
        else
            processCounter = processCounter + 1
        end
        xlua.progress(x, size)
    end
    closeWriter(dbSpect, readerSpect)
    closeWriter(dbTrans, readerTrans)
end
--- Script entry point: installs the worker loop in every child process, then
-- builds the train and test LMDBs.
-- NOTE(review): deliberately declared global (no `local`) so pcall(parent)
-- below can reach it -- confirm nothing else relies on the global name.
function parent()
    -- Each child repeatedly receives an { opts, fn } pair, runs fn(opts)
    -- locally and sends the result back to this process.
    local function looper()
        require 'torch'
        require 'audio'
        while true do
            local object = parallel.parent:receive()
            local opts, code = unpack(object)
            local result = code(opts)
            parallel.parent:send(result)
            collectgarbage()
        end
    end
    parallel.children:exec(looper)
    createLMDB(dataPath .. '/train', lmdbPath .. '/train')
    createLMDB(dataPath .. '/test', lmdbPath .. '/test')
    parallel.close()
end
-- Run the pipeline under pcall so worker processes are always reaped, even
-- when LMDB creation fails part-way through.
local ok, err = pcall(parent)
if not ok then
    print(err)
    parallel.close()
end
================================================
FILE: Mapper.lua
================================================
require 'torch'
-- construct an object to deal with the mapping
local mapper = torch.class('Mapper')

--- Build bidirectional character<->token maps from a dictionary file.
-- Tokens are assigned 0-based in file order (line N receives token N-1).
-- @param dictPath path to a file listing one alphabet symbol per line
function mapper:__init(dictPath)
    assert(paths.filep(dictPath), dictPath .. ' not found')
    self.alphabet2token = {}
    self.token2alphabet = {}
    local token = 0
    for symbol in io.lines(dictPath) do
        self.alphabet2token[symbol] = token
        self.token2alphabet[token] = symbol
        token = token + 1
    end
end
--- Encode a transcript into a list of 0-based tokens, lower-casing first.
-- Characters absent from the dictionary look up to nil and are effectively
-- dropped, exactly as in the original index loop.
function mapper:encodeString(line)
    local label = {}
    for character in string.lower(line):gmatch('.') do
        table.insert(label, self.alphabet2token[character])
    end
    return label
end
--- Turn a prediction tensor into a list of the most likely tokens,
-- collapsing CTC repeats and dropping blanks.
-- NOTE(review): the original comment claimed beginning/ending spaces are
-- stripped for WER, but no stripping happens here -- confirm upstream.
-- @param predictions seqLength x nClasses tensor of per-frame likelihoods
-- @return list of 0-based tokens
function mapper:decodeOutput(predictions)
    local tokens = {}
    local blankToken = self.alphabet2token['$'] -- '$' denotes the CTC blank
    local preToken = blankToken
    -- The prediction is a sequence of likelihood vectors
    local _, maxIndices = torch.max(predictions, 2)
    maxIndices = maxIndices:float():squeeze()
    for i = 1, maxIndices:size(1) do
        local token = maxIndices[i] - 1 -- CTC indexes start from 1, while token starts from 0
        -- add token if it's not blank, and is not the same as pre_token
        if token ~= blankToken and token ~= preToken then
            table.insert(tokens, token)
        end
        preToken = token
    end
    return tokens
end
--- Convert a token list back into a string via the token->alphabet map.
-- @param tokens list of 0-based tokens (as produced by decodeOutput)
-- @return the decoded text
function mapper:tokensToText(tokens)
    -- Build via a buffer + table.concat rather than O(n^2) repeated string
    -- concatenation. An unmapped token still raises an error, as before.
    local chars = {}
    for i, token in ipairs(tokens) do
        chars[i] = self.token2alphabet[token]
    end
    return table.concat(chars)
end
================================================
FILE: ModelEvaluator.lua
================================================
require 'Loader'
require 'Mapper'
require 'torch'
require 'xlua'
local threads = require 'threads'
require 'SequenceError'
local ModelEvaluator = torch.class('ModelEvaluator')
-- file-scope upvalue captured by the prefetch closures in runEvaluation
local loader

--- @param isGPU         when true, inputs are copied to CUDA before forward
-- @param datasetPath   LMDB directory of the evaluation set
-- @param mapper        Mapper used to encode/decode transcripts
-- @param testBatchSize batch size used during evaluation
-- @param logsPath      directory where WER/CER logs are appended
function ModelEvaluator:__init(isGPU, datasetPath, mapper, testBatchSize, logsPath)
    loader = Loader(datasetPath, mapper)
    self.testBatchSize = testBatchSize
    self.nbOfTestIterations = math.ceil(loader.size / testBatchSize)
    self.indexer = indexer(datasetPath, testBatchSize)
    -- single background thread prefetches the next batch during scoring
    self.pool = threads.Threads(1, function() require 'Loader' end)
    self.mapper = mapper
    self.logsPath = logsPath
    self.suffix = '_' .. os.date('%Y%m%d_%H%M%S') -- distinguishes runs in log names
    self.sequenceError = SequenceError()
    self.input = torch.Tensor() -- preallocated; resized per batch
    self.isGPU = isGPU
    if isGPU then
        self.input = self.input:cuda()
    end
end
--- Run the entire evaluation set through `model`, computing per-sample
-- WER/CER, logging sorted per-sample results and returning the averages.
-- The next batch is always prefetched on the background thread while the
-- current one is being scored (double buffering).
-- @param model   network taking batch x 1 x freq x time, returning
--                batch x seqLength x classes
-- @param verbose when true, per-sample predictions are written to the logs
-- @param epoch   epoch number recorded in the WER log header
-- @return averageWER, averageCER over all evaluated samples
function ModelEvaluator:runEvaluation(model, verbose, epoch)
    local spect_buf, label_buf, sizes_buf
    -- get first batch
    local inds = self.indexer:nextIndices()
    self.pool:addjob(function()
        return loader:nextBatch(inds)
    end,
    function(spect, label, sizes)
        spect_buf = spect
        label_buf = label
        sizes_buf = sizes
    end)
    if verbose then
        local f = assert(io.open(self.logsPath .. 'WER_Test' .. self.suffix .. '.log', 'a'), "Could not create validation test logs, does the folder "
            .. self.logsPath .. " exist?")
        f:write('======================== BEGIN WER TEST EPOCH: ' .. epoch .. ' =========================\n')
        f:close()
    end
    local evaluationPredictions = {} -- stores the predictions to order for log.
    local cumCER = 0
    local cumWER = 0
    local numberOfSamples = 0
    -- ======================= for every test iteration ==========================
    for i = 1, self.nbOfTestIterations do
        -- get buf and fetch next one
        self.pool:synchronize() -- wait for the prefetch to finish
        -- sizes_array is captured but unused below
        local inputsCPU, targets, sizes_array = spect_buf, label_buf, sizes_buf
        inds = self.indexer:nextIndices()
        self.pool:addjob(function()
            return loader:nextBatch(inds)
        end,
        function(spect, label, sizes)
            spect_buf = spect
            label_buf = label
            sizes_buf = sizes
        end)
        self.input:resize(inputsCPU:size()):copy(inputsCPU) -- CPU -> device copy
        local predictions = model:forward(self.input)
        if self.isGPU then cutorch.synchronize() end
        local size = predictions:size(1)
        -- score every sample in the batch against its reference transcript
        for j = 1, size do
            local prediction = predictions[j]
            local predict_tokens = self.mapper:decodeOutput(prediction)
            local targetTranscript = self.mapper:tokensToText(targets[j])
            local predictTranscript = self.mapper:tokensToText(predict_tokens)
            local CER = self.sequenceError:calculateCER(targetTranscript, predictTranscript)
            local WER = self.sequenceError:calculateWER(targetTranscript, predictTranscript)
            cumCER = cumCER + CER
            cumWER = cumWER + WER
            table.insert(evaluationPredictions, { wer = WER * 100, cer = CER * 100, target = targetTranscript, prediction = predictTranscript })
        end
        numberOfSamples = numberOfSamples + size
    end
    -- log per-sample results ordered from best to worst WER
    local function comp(a, b) return a.wer < b.wer end
    table.sort(evaluationPredictions, comp)
    if verbose then
        for index, eval in ipairs(evaluationPredictions) do
            local f = assert(io.open(self.logsPath .. 'Evaluation_Test' .. self.suffix .. '.log', 'a'))
            f:write(string.format("WER = %.2f | CER = %.2f | Text = \"%s\" | Predict = \"%s\"\n",
                eval.wer, eval.cer, eval.target, eval.prediction))
            f:close()
        end
    end
    local averageWER = cumWER / numberOfSamples
    local averageCER = cumCER / numberOfSamples
    local f = assert(io.open(self.logsPath .. 'Evaluation_Test' .. self.suffix .. '.log', 'a'))
    f:write(string.format("Average WER = %.2f | CER = %.2f", averageWER * 100, averageCER * 100))
    f:close()
    self.pool:synchronize() -- end the last loading
    return averageWER, averageCER
end
================================================
FILE: Network.lua
================================================
require 'optim'
require 'nnx'
require 'gnuplot'
require 'lfs'
require 'xlua'
require 'UtilsMultiGPU'
require 'Loader'
require 'nngraph'
require 'Mapper'
require 'ModelEvaluator'
-- Timestamp suffix shared by all log and model filenames of this run.
local suffix = '_' .. os.date('%Y%m%d_%H%M%S')
local threads = require 'threads'
local Network = {}
--Training parameters
-- Fix: `seed` was an accidental global; as a file-scope local it remains
-- visible to Network:init below (cutorch.manualSeedAll uses it).
local seed = 10
torch.setdefaulttensortype('torch.FloatTensor')
torch.manualSeed(seed)
--- Configure the network from the option table: device selection, data
-- paths, logging, model construction (fresh or loaded) and the
-- training-batch indexer with its background loader thread.
-- @param opt parsed command-line options (see Train.lua / Test.lua)
function Network:init(opt)
    self.fileName = opt.saveFileName
    self.nGPU = opt.nGPU
    self.gpu = self.nGPU > 0
    if not self.gpu then
        require 'rnn'
    else
        require 'cutorch'
        require 'cunn'
        require 'cudnn'
        require 'BatchBRNNReLU'
        cutorch.manualSeedAll(seed)
    end
    self.trainingSetLMDBPath = opt.trainingSetLMDBPath
    self.validationSetLMDBPath = opt.validationSetLMDBPath
    -- NOTE(review): `or nil` is a no-op on the next three lines
    self.logsTrainPath = opt.logsTrainPath or nil
    self.logsValidationPath = opt.logsValidationPath or nil
    self.modelTrainingPath = opt.modelTrainingPath or nil
    self.permuteBatch = opt.permuteBatch or false
    self:makeDirectories({ self.logsTrainPath, self.logsValidationPath, self.modelTrainingPath })
    self.mapper = Mapper(opt.dictionaryPath)
    self.tester = ModelEvaluator(self.gpu, self.validationSetLMDBPath, self.mapper,
        opt.validationBatchSize, self.logsValidationPath)
    self.loadModel = opt.loadModel
    self.epochSave = opt.epochSave or false -- Saves model every number of iterations.
    self.maxNorm = opt.maxNorm or 400 -- value chosen by Baidu for english speech.
    -- setting model saving/loading
    if self.loadModel then
        assert(opt.loadPath, "loadPath hasn't been given to load model.")
        self:loadNetwork(opt.loadPath, opt.modelName)
    else
        assert(opt.modelName, "Must have given a model to train.")
        self:prepSpeechModel(opt.modelName, opt)
    end
    -- setting online loading
    self.indexer = indexer(opt.trainingSetLMDBPath, opt.batchSize)
    self.pool = threads.Threads(1, function() require 'Loader' end)
    self.logger = optim.Logger(self.logsTrainPath .. 'train' .. suffix .. '.log')
    self.logger:setNames { 'loss', 'WER', 'CER' }
    self.logger:style { '-', '-', '-' }
end
--- Instantiate the architecture module by name. The required module returns
-- a pair { buildFn, sizeFn }: buildFn(opt) constructs the network and sizeFn
-- maps raw frame counts to post-convolution sequence lengths.
function Network:prepSpeechModel(modelName, opt)
    local spec = require(modelName)
    self.model = spec[1](opt)
    self.calSize = spec[2]
end
--- Run the validation set through the model and return its error rates.
-- Leaves the model back in training mode with zeroed gradients.
-- @param epoch epoch number used for logging (defaults to 1)
-- @return wer, cer averaged over the validation set
function Network:testNetwork(epoch)
    self.model:evaluate() -- switch BN (and similar modules) to inference mode
    local wer, cer = self.tester:runEvaluation(self.model, true, epoch or 1) -- details in log
    self.model:zeroGradParameters()
    self.model:training()
    return wer, cer
end
--- Train the model with SGD + CTC loss, evaluating on the validation set
-- after every epoch. Batches are double-buffered: the next batch is loaded
-- on a background thread while the current one trains.
-- @param epochs          number of epochs to run
-- @param optimizerParams optim.sgd parameter table; learningRate is divided
--                        by learningRateAnnealing (when set) after each epoch
-- @return lossHistory, validationHistory (WER %), minutesTaken
function Network:trainNetwork(epochs, optimizerParams)
    self.model:training()
    local lossHistory = {}
    local validationHistory = {}
    local criterion = nn.CTCCriterion(true)
    local x, gradParameters = self.model:getParameters()
    print("Number of parameters: ", gradParameters:size(1))
    -- preallocated input buffer, reused across iterations
    -- (the previous `sizes` preallocation was removed: it was shadowed by a
    -- local inside feval and therefore never used)
    local inputs = torch.Tensor()
    if self.gpu then
        criterion = criterion:cuda()
        inputs = inputs:cuda()
    end
    -- def loading buf and loader
    local loader = Loader(self.trainingSetLMDBPath, self.mapper)
    local specBuf, labelBuf, sizesBuf
    -- load first batch
    local inds = self.indexer:nextIndices()
    self.pool:addjob(function()
        return loader:nextBatch(inds)
    end,
    function(spect, label, sizes)
        specBuf = spect
        labelBuf = label
        sizesBuf = sizes
    end)
    -- feval for optim.sgd; x_new is ignored because `x` shares storage with
    -- the model parameters
    local function feval(x_new)
        self.pool:synchronize() -- wait previous loading
        local inputsCPU, sizes, targets = specBuf, sizesBuf, labelBuf -- move buf to training data
        inds = self.indexer:nextIndices() -- load next batch whilst training
        self.pool:addjob(function()
            return loader:nextBatch(inds)
        end,
        function(spect, label, sizes)
            specBuf = spect
            labelBuf = label
            sizesBuf = sizes
        end)
        inputs:resize(inputsCPU:size()):copy(inputsCPU) -- transfer over to GPU
        sizes = self.calSize(sizes) -- the conv layers shorten each sequence
        local predictions = self.model:forward(inputs)
        local loss = criterion:forward(predictions, targets, sizes)
        -- Guard against degenerate costs; `loss ~= loss` is the NaN check
        -- (the original only caught +/-inf). Typo fix: "Recieved".
        if loss == math.huge or loss == -math.huge or loss ~= loss then
            loss = 0
            print("Received an inf cost!")
        end
        self.model:zeroGradParameters()
        local gradOutput = criterion:backward(predictions, targets)
        self.model:backward(inputs, gradOutput)
        -- clip gradients by global norm (maxNorm defaults to Baidu's 400)
        local norm = gradParameters:norm()
        if norm > self.maxNorm then
            gradParameters:mul(self.maxNorm / norm)
        end
        return loss, gradParameters
    end
    -- training
    local currentLoss
    local startTime = os.time()
    for i = 1, epochs do
        local averageLoss = 0
        for j = 1, self.indexer.nbOfBatches do
            currentLoss = 0
            local _, fs = optim.sgd(feval, x, optimizerParams)
            if self.gpu then cutorch.synchronize() end
            currentLoss = currentLoss + fs[1]
            xlua.progress(j, self.indexer.nbOfBatches)
            averageLoss = averageLoss + currentLoss
        end
        if self.permuteBatch then self.indexer:permuteBatchOrder() end
        averageLoss = averageLoss / self.indexer.nbOfBatches -- Calculate the average loss at this epoch.
        -- anneal learningRate
        optimizerParams.learningRate = optimizerParams.learningRate / (optimizerParams.learningRateAnnealing or 1)
        -- Update validation error rates
        local wer, cer = self:testNetwork(i)
        print(string.format("Training Epoch: %d Average Loss: %f Average Validation WER: %.2f Average Validation CER: %.2f",
            i, averageLoss, 100 * wer, 100 * cer))
        table.insert(lossHistory, averageLoss) -- Add the average loss value to the logger.
        table.insert(validationHistory, 100 * wer)
        self.logger:add { averageLoss, 100 * wer, 100 * cer }
        -- periodically save the model
        if self.epochSave then
            print("Saving model..")
            self:saveNetwork(self.modelTrainingPath .. 'model_epoch_' .. i .. suffix .. '_' .. self.fileName)
        end
    end
    local endTime = os.time()
    local minutesTaken = (endTime - startTime) / 60
    print("Minutes taken to train: ", minutesTaken)
    print("Saving model..")
    self:saveNetwork(self.modelTrainingPath .. 'final_model_' .. suffix .. '_' .. self.fileName)
    return lossHistory, validationHistory, minutesTaken
end
--- Plot the logged loss/WER/CER curves via optim.Logger.
function Network:createLossGraph()
    self.logger:plot()
end
--- Serialise the model to disk via UtilsMultiGPU's saveDataParallel.
function Network:saveNetwork(saveName)
    self.model:clearState() -- drop intermediate buffers to shrink the file
    saveDataParallel(saveName, self.model)
end
--Loads the model into Network.
--- Load a serialised model (re-wrapped for nGPU devices) plus the size
-- calculation function from the named architecture module; the architecture
-- itself is not rebuilt.
function Network:loadNetwork(saveName, modelName)
    self.model = loadDataParallel(saveName, self.nGPU)
    local model = require(modelName)
    self.calSize = model[2]
end
--- Create each of the given directories (and any missing parents).
-- @param folderPaths list of directory paths; may be empty
function Network:makeDirectories(folderPaths)
    -- The previous `folderPath ~= nil` guard was dead code: ipairs never
    -- yields a nil value.
    for _, folderPath in ipairs(folderPaths) do
        -- NOTE(review): folderPath is interpolated into a shell command
        -- unquoted; paths containing spaces or shell metacharacters break.
        os.execute("mkdir -p " .. folderPath)
    end
end
return Network
================================================
FILE: Predict.lua
================================================
require 'nn'
require 'audio'
require 'Mapper'
require 'UtilsMultiGPU'
-- Stand-alone prediction: load a trained model, run a single audio file
-- through it and print the decoded transcript.
local cmd = torch.CmdLine()
cmd:option('-modelPath', 'deepspeech.t7', 'Path of model to load')
cmd:option('-audioPath', '', 'Path to the input audio to predict on')
cmd:option('-dictionaryPath', './dictionary', 'File containing the dictionary to use')
cmd:option('-windowSize', 0.02, 'Window Size of audio')
cmd:option('-stride', 0.01, 'Stride of audio')
cmd:option('-sampleRate', 16000, 'Rate of audio (default 16khz)')
cmd:option('-nGPU', 1)
local opt = cmd:parse(arg)
if opt.nGPU > 0 then
    require 'cunn'
    require 'cudnn'
    require 'BatchBRNNReLU'
end
local model = loadDataParallel(opt.modelPath, opt.nGPU)
local mapper = Mapper(opt.dictionaryPath)
local wave = audio.load(opt.audioPath)
local spect = audio.spectrogram(wave, opt.windowSize * opt.sampleRate, 'hamming', opt.stride * opt.sampleRate):float() -- freq-by-frames tensor
-- normalize the data (same per-utterance mean/std scheme as MakeLMDB.lua)
local mean = spect:mean()
local std = spect:std()
spect:add(-mean)
spect:div(std)
-- add batch and channel dimensions: 1 x 1 x freq x frames
spect = spect:view(1, 1, spect:size(1), spect:size(2))
if opt.nGPU > 0 then
    spect = spect:cuda()
    model = model:cuda()
end
model:evaluate() -- inference mode (affects batch normalization)
local predictions = model:forward(spect)
local tokens = mapper:decodeOutput(predictions[1])
local text = mapper:tokensToText(tokens)
print(text)
================================================
FILE: README.md
================================================
# deepspeech.torch
[](https://travis-ci.org/SeanNaren/deepspeech.torch)
[](http://ctcspeechrecognition.readthedocs.io/en/latest/?badge=latest)
Implementation of [Baidu Warp-CTC](https://github.com/baidu-research/warp-ctc) using torch7.
Creates a network based on the [DeepSpeech2](http://arxiv.org/pdf/1512.02595v1.pdf) architecture using the Torch7 library, trained with the CTC activation function.
## Features
* Train large models with large datasets via online loading using [LMDB](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database) and multi-GPU support.
* Supports variable length batches via padding.
* Implements the [AN4 Audio database](http://www.speech.cs.cmu.edu/databases/an4/) (50 mins of data).
Has also been extended to train using the [LibriSpeech](http://www.openslr.org/12/) dataset (1000 hours of data). Custom dataset preparation is explained in documentation.
## Branches
There are currently two branches, Master and Phonemes:
* Master: This branch trains DeepSpeech2. Also included is an evaluation script which calculates the WER/CER, as well as a prediction script.
This branch is useful for understanding how the DeepSpeech and CTC works and is easy to run after installation. Highly recommended to checkout this branch.
* Phonemes: This branch is experimental and uses phonemes rather than character based predictions. This is fully credited and extended by [CCorfield](https://github.com/CCorfield) and his awesome work in porting to use phonemes. In addition to this
I'd like to also thank [Shane Walker](https://github.com/walkers-mv) for his awesome recent conversion to use phonemes as well.
## Installation/Data Preparation/Documentation
Follow Instructions/Data Preparation/Documentation found in the wiki [here](https://github.com/SeanNaren/deepspeech.torch/wiki/Installation) to set up and run the code.
Technical documentation can be found [here](http://ctcspeechrecognition.readthedocs.io/en/latest/).
## Pre-trained Networks
Pre-trained networks are available for AN4 as well as LibriSpeech for CUDA only (since they use cudnn RNNs). Download Links and accuracies are below. DeepSpeech-light is a smaller model which is less intensive to train (based on LSTMs rather than RNNs).
### AN4
**an4Test**
|Network | WER | CER |Link |
|-----------------|:--------:|:--------:|:--------:|
|DeepSpeech-light| N/A | N/A | N/A |
|DeepSpeech | 12 | 3.07 | [Download](https://github.com/SeanNaren/deepspeech.torch/releases/download/v1.0/an4_deepspeech.t7) |
### LibriSpeech
**Librispeech-test-clean**
|Network | WER | CER |Link |
|-----------------|:--------:|:--------:|:--------:|
|DeepSpeech-light| 15 | 1.34 | [Download](https://github.com/SeanNaren/deepspeech.torch/releases/download/v1.0/libri_deepspeech-light.t7) |
|DeepSpeech | 12 | 1.55 | [Download](https://github.com/SeanNaren/deepspeech.torch/releases/download/v1.0/libri_deepspeech.t7) |
**Librispeech-test-other**
|Network | WER | CER |Link |
|-----------------|:--------:|:--------:|:--------:|
|DeepSpeech-light| 36 | 3.80 | (Download Above) |
|DeepSpeech | 33 | 3.24 | (Download Above) |
Once you're set up, you can start training from these nets by using the below parameters (you might need to change the other parameters described in the wiki):
```lua
th Train.lua -loadModel -loadPath /path/to/model.t7
```
## Acknowledgements
Lots of people helped/contributed to this project that deserve recognition:
* Soumith Chintala for his support on Torch7 and the vast open source projects he has contributed that made this project possible!
* Charles Corfield for his work on the Phoneme Dataset and his overall contribution and aid throughout.
* Will Frey for his thorough communication and aid in the development process.
* Ding Ling, Yuan Yang and Yan Xia for their significant contribution to online training, multi-gpu support and many other important features.
* Erich Elsen and the team from Baidu for their contribution of Warp-CTC that made this possible, and the encouraging words and support given throughout the project.
* Maciej Korzepa for his huge help in training a model on Librispeech!
================================================
FILE: SequenceError.lua
================================================
local SequenceError = torch.class("SequenceError")
-- Calculates a sequence error rate (aka Levenshtein edit distance)

--- Levenshtein edit distance between two sequences, normalised by the
-- target length (so the result can exceed 1.0, and is inf for an empty
-- target with a non-empty prediction, matching the previous behaviour).
-- NOTE(review): arguments must be positionally indexable (tables); a raw
-- Lua string indexes to nil at every position.
-- @param target     array-like reference sequence (e.g. table of words)
-- @param prediction array-like hypothesis sequence
-- @return edit distance divided by #target
function SequenceError:sequenceErrorRate(target, prediction)
    local m, n = #target, #prediction
    -- Plain-Lua DP table replaces the torch.Tensor matrix, and math.min
    -- replaces building a 3-element tensor per cell just to take a minimum.
    local d = {}
    for i = 1, m + 1 do
        d[i] = {}
        d[i][1] = i - 1 -- cost of deleting the first i-1 target elements
    end
    for j = 1, n + 1 do
        d[1][j] = j - 1 -- cost of inserting the first j-1 prediction elements
    end
    for i = 2, m + 1 do
        for j = 2, n + 1 do
            if target[i - 1] == prediction[j - 1] then
                d[i][j] = d[i - 1][j - 1]
            else
                -- substitution, insertion, deletion
                d[i][j] = math.min(d[i - 1][j - 1], d[i][j - 1], d[i - 1][j]) + 1
            end
        end
    end
    return d[m + 1][n + 1] / m
end
-- Character Error Rate between two transcript strings.
-- Bug fix: sequenceErrorRate indexes its arguments with `[i]`, but Lua
-- strings return nil for numeric indexing, so passing raw strings made every
-- character comparison `nil == nil` (always equal) and the CER degenerated
-- to a length difference. Split each transcript into a table of single
-- characters before computing the edit distance.
function SequenceError:calculateCER(targetTranscript, predictTranscript)
    local targetChars = {}
    for character in targetTranscript:gmatch(".") do
        table.insert(targetChars, character)
    end
    local predictedChars = {}
    for character in predictTranscript:gmatch(".") do
        table.insert(predictedChars, character)
    end
    return self:sequenceErrorRate(targetChars, predictedChars)
end
-- Word Error Rate between two transcript strings: tokenises both on
-- whitespace, then computes the word-level edit distance over target length.
function SequenceError:calculateWER(targetTranscript, predictTranscript)
    local function splitWords(transcript)
        local words = {}
        for word in transcript:gmatch("%S+") do
            words[#words + 1] = word
        end
        return words
    end
    return self:sequenceErrorRate(splitWords(targetTranscript), splitWords(predictTranscript))
end
================================================
FILE: Test.lua
================================================
local Network = require 'Network'
-- Evaluation entry point: loads a previously trained model and reports the
-- averaged WER/CER on the validation LMDB set. Every option below can be
-- overridden on the command line, e.g. `th Test.lua -loadPath mymodel.t7`.
local cmd = torch.CmdLine()
cmd:option('-loadModel', true, 'Load previously saved model')
cmd:option('-saveModel', false, 'Save model after training/testing')
cmd:option('-loadPath', 'deepspeech.t7', 'Path of final model to save/load')
cmd:option('-modelName', 'DeepSpeechModel', 'Name of class containing architecture')
cmd:option('-nGPU', 1, 'Number of GPUs, set -1 to use CPU')
cmd:option('-trainingSetLMDBPath', './prepare_datasets/an4_lmdb/train/', 'Path to LMDB training dataset')
cmd:option('-validationSetLMDBPath', './prepare_datasets/an4_lmdb/test/', 'Path to LMDB test dataset')
cmd:option('-logsTrainPath', './logs/TrainingLoss/', ' Path to save Training logs')
cmd:option('-logsValidationPath', './logs/ValidationScores/', ' Path to save Validation logs')
cmd:option('-dictionaryPath', './dictionary', ' File containing the dictionary to use')
cmd:option('-batchSize', 20, 'Batch size in training')
cmd:option('-validationBatchSize', 32, 'Batch size for validation')
local opt = cmd:parse(arg)
Network:init(opt)
print("Testing network...")
local wer, cer = Network:testNetwork()
-- was '%2.f' (field width 2, precision 0), which printed the WER rounded to
-- an integer; '%.2f' matches the CER formatting.
print(string.format('Avg WER: %.2f Avg CER: %.2f', 100 * wer, 100 * cer))
print(string.format('More information written to log file at %s', opt.logsValidationPath))
================================================
FILE: Train.lua
================================================
local Network = require 'Network'
-- Training entry point. Every option below can be overridden on the command
-- line, e.g. `th Train.lua -batchSize 32 -nGPU 2`.
local cmd = torch.CmdLine()
cmd:option('-loadModel', false, 'Load previously saved model')
cmd:option('-loadPath', 'deepspeech.t7', 'Path to model to load')
cmd:option('-modelName', 'DeepSpeechModel', 'Name of class containing architecture')
cmd:option('-nGPU', 1, 'Number of GPUs, set -1 to use CPU')
cmd:option('-trainingSetLMDBPath', './prepare_datasets/an4_lmdb/train/', 'Path to LMDB training dataset')
cmd:option('-validationSetLMDBPath', './prepare_datasets/an4_lmdb/test/', 'Path to LMDB test dataset')
cmd:option('-logsTrainPath', './logs/TrainingLoss/', ' Path to save Training logs')
cmd:option('-logsValidationPath', './logs/ValidationScores/', ' Path to save Validation logs')
cmd:option('-epochSave', false, 'save model every epoch')
cmd:option('-modelTrainingPath', './models/', ' Path to save periodic training models')
cmd:option('-saveFileName', 'deepspeech.t7', 'Name of model to save as')
cmd:option('-dictionaryPath', './dictionary', ' File containing the dictionary to use')
cmd:option('-epochs', 70, 'Number of epochs for training')
cmd:option('-learningRate', 3e-4, ' Training learning rate')
cmd:option('-learningRateAnnealing', 1.1, 'Factor to anneal lr every epoch')
cmd:option('-maxNorm', 400, 'Max norm used to normalize gradients')
cmd:option('-momentum', 0.90, 'Momentum for SGD')
cmd:option('-batchSize', 20, 'Batch size in training')
cmd:option('-permuteBatch', false, 'Set to true if you want to permute batches AFTER the first epoch')
cmd:option('-validationBatchSize', 20, 'Batch size for validation')
cmd:option('-LSTM', false, 'Use LSTMs rather than RNNs')
cmd:option('-hiddenSize', 1760, 'RNN hidden sizes')
cmd:option('-nbOfHiddenLayers', 7, 'Number of rnn layers')
local options = cmd:parse(arg)

-- SGD configuration handed to the optim library during training.
local sgdParams = {
    learningRate = options.learningRate,
    learningRateAnnealing = options.learningRateAnnealing,
    momentum = options.momentum,
    dampening = 0,
    nesterov = true
}

-- Build the network, train it for the requested epochs, then plot the loss.
Network:init(options)
Network:trainNetwork(options.epochs, sgdParams)
Network:createLossGraph()
================================================
FILE: UtilsMultiGPU.lua
================================================
require 'rnn'
require 'nngraph'
-- Prepares `model` for GPU execution: converts supported layers to cudnn
-- (leaving BatchNormalization on the nn implementation) and, for nGPU > 1,
-- wraps the model in a DataParallelTable split on the batch dimension.
-- `nGPU <= 0` returns the model untouched (CPU mode).
function makeDataParallel(model, nGPU)
    if nGPU > 0 then
        cudnn.fastest = true
        -- Predicate handed to cudnn.convert: modules matching this are skipped.
        local function isBatchNorm(module)
            return torch.type(module):find('BatchNormalization')
        end
        model = cudnn.convert(model, cudnn, isBatchNorm)
        if nGPU > 1 then
            -- was: `gpus` and `dpt` leaked as accidental globals (no `local`)
            local gpus = torch.range(1, nGPU):totable()
            local dpt = nn.DataParallelTable(1):add(model, gpus):threads(function()
                require 'nngraph'
                require 'cudnn'
                cudnn.fastest = true
                require 'BatchBRNNReLU'
            end)
            dpt.gradInput = nil
            model = dpt
        end
        model:cuda()
    end
    return model
end
-- Collapses a DataParallelTable down to a single-replica DPT for saving.
-- Every replica inside a DPT built by makeDataParallel is a clone of the
-- same network, so keeping only the first one loses nothing.
local function cleanDPT(module, device)
    local targetDevice = device or 1
    local singleDPT = nn.DataParallelTable(1)
    cutorch.setDevice(targetDevice)
    singleDPT:add(module:get(1), targetDevice)
    return singleDPT
end
-- Serialises `model` to `modelPath`, first stripping any DataParallelTable
-- down to one replica so the saved file is independent of the GPU count.
-- Raises an error for unsupported container types.
function saveDataParallel(modelPath, model)
    local kind = torch.type(model)
    if kind == 'nn.DataParallelTable' then
        torch.save(modelPath, cleanDPT(model))
    elseif kind == 'nn.Sequential' then
        -- Rebuild the container, replacing any nested DPTs with cleaned ones.
        local stripped = nn.Sequential()
        for _, submodule in ipairs(model.modules) do
            if torch.type(submodule) == 'nn.DataParallelTable' then
                stripped:add(cleanDPT(submodule))
            else
                stripped:add(submodule)
            end
        end
        torch.save(modelPath, stripped)
    elseif kind == 'nn.gModule' then
        torch.save(modelPath, model)
    else
        error('This saving function only works with Sequential or DataParallelTable modules.')
    end
end
-- Loads a model saved via saveDataParallel and re-wraps it for `nGPU` GPUs
-- using makeDataParallel. Raises an error for unsupported container types.
function loadDataParallel(modelPath, nGPU)
    if nGPU > 1 then
        require 'cudnn'
        require 'BatchBRNNReLU'
    end
    local model = torch.load(modelPath)
    local kind = torch.type(model)
    if kind == 'nn.DataParallelTable' then
        -- Take the single saved replica and redistribute it.
        return makeDataParallel(model:get(1):float(), nGPU)
    elseif kind == 'nn.Sequential' then
        for index, submodule in ipairs(model.modules) do
            if torch.type(submodule) == 'nn.DataParallelTable' then
                model.modules[index] = makeDataParallel(submodule:get(1):float(), nGPU)
            end
        end
        return model
    elseif kind == 'nn.gModule' then
        return makeDataParallel(model, nGPU)
    else
        error('The loaded model is not a Sequential or DataParallelTable module.')
    end
end
================================================
FILE: dictionary
================================================
$
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
'
================================================
FILE: doc/DeepSpeechModel.md
================================================
# DeepSpeechModel
Defines the deep speech 2 conv+rnn architecture.
### deepSpeech(opt)
Defines the torch architecture for Deep Speech 2 as a function that can be called. Returns the final model
`opt` Defines the options we use including using GPUS, hidden size and number of layers for the RNNs.
### calculateInputSizes(sizes)
A function that calculates the sequence sizes after the convolutional layers. Used in the loss calculations in CTC, so the network isn't
penalised for the padded sequences. Returns a same sized tensor.
`sizes` Real size of each sentence in the training sample as a 1D tensor.
================================================
FILE: doc/Loader.md
================================================
# Loader
Defines the indexer class and the loader class, handling batching of the dataset to train the network.
## Indexer
Handles returning the next indices of the batch to load into memory, to train the network with.
### indexer:__init(_dir, batchSize)
`dirPath` Directory containing the LMDB data folders for spectrogram, labels and transcripts.
`batchSize` The sizes of each batch to create.
### indexer:nextIndices()
Retrieves the next indices that need to be loaded by the loader from the LMDB dataset.
### indexer:permuteBatchOrder()
Permutes the batch order randomly. This is for the net to not train in sequence order every time.
## Loader
Loads batches of data from LMDB files used in training/testing.
### Loader:__init(dirPath)
`dirPath` Directory containing the LMDB data folders for spectrogram, labels and transcripts.
### Loader:nextBatch(indices)
Returns the next batch of the dataset based on the given indices.
`indices` The indices of the test samples that need to be retrieved. This is handled by the Indexer class above.
================================================
FILE: doc/Mapper.md
================================================
# Mapper
Defines how numeric indices are mapped to tokens and vice versa.
### Mapper:__init(dictPath)
Creates mappings based on the given dictionary file. The AN4 dictionary file can be seen [here](https://github.com/SeanNaren/deepspeech.torch/blob/master/dictionary).
### Mapper:encodeString(string)
Converts string into a set of tokens to be used as a label in training.
`string` string to be converted.
### Mapper:decodeOutput(predictions)
Converts predictions of the neural network into a sequence of tokens (characters) via a mapper.
`predictions` is a tensor of sequence likelihood vectors given by the neural network.
### Mapper:tokensToText(tokens)
Using the mapper converts the tokens into readable text.
`tokens` A set of numeric tokens to convert into readable text.
================================================
FILE: doc/ModelEvaluator.md
================================================
# ModelEvaluator
Handles calculation of word error rate using an LMDB dataset. For more information on the calculation, see [Evaluator](https://github.com/SeanNaren/CTCSpeechRecognition/blob/master/doc/Evaluator.md).
### ModelEvaluator:__init(isGPU, datasetPath, mapper, testBatchSize, logsPath)
`isGPU` Whether to use the GPU (CUDA) or CPU.
`datasetPath` the path to the LMDB test dataset to use in evaluation.
`mapper` Maps predicted numeric values to characters, see [Mapper](https://github.com/SeanNaren/CTCSpeechRecognition/doc/Mapper.md) for more details.
`testBatchSize` The size of the batches to pass the network.
`logsPath` File path to put the details of evaluations into.
### ModelEvaluator:runEvaluation(model, verbose, epoch)
Calculates the word error rate and character error rate averaged over the test iterations. Uses the same threading as the training process does to load batches from the dataset.
`model` The Torch model to evaluate.
`verbose` If set to true, will store details of WER calculations into the log files.
`epoch` Determines the epoch number that is written in the log files for this calculation.
================================================
FILE: doc/Network.md
================================================
# Network
Handles interactions with the neural network for training and testing. Configured by network parameters given in
constructor.
### Network:init(networkParams)
Constructor of the Network class. Below defines each parameter that can be taken as input.
```lua
local networkParams = {
loadModel = false, -- Set to true if loading a model into the Network class rather than training.
saveModel = true, -- Set to true if saving the model after training.
modelName = 'DeepSpeechModel', -- The name of the lua class containing the network architecture
nGPU = 1, -- Number of GPUs, set -1 to use CPU
trainingSetLMDBPath = './prepare_an4/train/', -- online loading path from the LMDB dataset for training.
validationSetLMDBPath = './prepare_an4/test/', -- online loading path from the LMDB dataset for testing.
logsTrainPath = './logs/TrainingLoss/', -- Where training logs will be stored.
logsValidationPath = './logs/ValidationScores/', -- Where testing score logs will be stored.
modelTrainingPath = './models/', -- Where models will be stored on saving.
modelPath = 'CTCNetwork.t7',
dictionaryPath = './dictionary', -- Contains the alphabet/characters that we are to predict on.
batchSize = 20, -- The sizes of batches that we are passing into the network in training.
validationBatchSize = 1, -- Validation batch sizes (should be kept at 1, since we pass 1 sample at a time).
validationIterations = 20, -- Number of validation iterations (kept small, because we only want to run a few tests per epoch).
saveModelInTraining = false, -- saves model periodically through training
saveModelIterations = 50 -- If saveModelInTraining set to true, we save every 50 epochs.
}
```
### Network:prepSpeechModel(modelName, opt)
Used to create the model via the defined modelName and options.
### Network:testNetwork(epoch)
Tests the current stored model via the word error rate.
`epoch` can be used to detail the epoch number in the logs when testing scores are stored.
### Network:trainNetwork(epochs, sgd_params)
Trains a network stored in the `Network` class. Uses multiple threads in an online loading fashion to load the data from the disk.
`epochs` defines the number of iterations of training that will occur across the entire dataset (each epochs trains on the entire dataset).
`sgd_params` defines the SGD parameters for the optim library such as below.
```lua
local sgdParams = {
learningRate = 5e-4,
learningRateDecay = 1e-9,
weightDecay = 0,
momentum = 0.9,
dampening = 0,
nesterov = true
}
```
### Network:createLossGraph()
After training, when called will use gnuplot (through wrapper in the optim library) to generate a graph of the loss and word error rate over epochs.
### Network:saveNetwork(saveName)
Will save the model currently stored in the network class to disk, at the pre-defined save location with the given `saveName`.
### Network:loadNetwork(saveName, modelName)
Loads the network from the save location, stored using the pre-defined save name.
`saveName` The name as to which the network was saved as
`modelName` The name of the class that stores the model or architecture.
================================================
FILE: doc/SequenceError.md
================================================
# SequenceError
Calculates word error rates and handles conversion of CTC predictions to numeric tokens.
### SequenceError.sequenceErrorRate(target, prediction)
Calculates the error rates based on the target and the predicted inputs.
`target` and `prediction` are inputs of strings or tables.
### SequenceError:calculateCER(targetTranscript, predictTranscript)
`targetTranscript` and `predictTranscript` are two strings, returns the Character Error Rate.
### SequenceError:calculateWER(targetTranscript, predictTranscript)
`targetTranscript` and `predictTranscript` are two strings, returns the Word Error Rate.
================================================
FILE: doc/UtilsMultiGPU.md
================================================
# UtilsMultiGPU
Handles multi-gpu setups of the architecture.
### makeDataParallel(model, nGPU)
Converts the model into a multi-gpu set up if necessary using DataParallelTable.
`model` The Torch network model to modify for configured GPUs.
`nGPU` Number of GPUs.
### saveDataParallel(modelPath, model)
Saves the model to disk.
`modelPath` Location to save the model.
`model` The Torch network model to save.
### loadDataParallel(modelPath, nGPU)
Loads a model saved using the above methods.
`modelPath` Location to load the model.
`nGPU` Number of GPUs to load to.
================================================
FILE: doc/index.md
================================================
# Technical Documentation
Below are a few classes that have been documented, explaining their purpose and functions available.
## Classes
* [Network](Network.md)
* [DeepSpeechModel](DeepSpeechModel.md)
* [Mapper](Mapper.md)
* [Evaluator](Evaluator.md)
* [ModelEvaluator](ModelEvaluator.md)
* [Utils](Utils.md)
* [UtilsMultiGPU](UtilsMultiGPU.md)
* [Loader](Loader.md)
================================================
FILE: mkdocs.yml
================================================
site_name: CTCSpeechRecognition
theme : simplex
repo_url : https://github.com/SeanNaren/CTCSpeechRecognition
use_directory_urls : false
markdown_extensions: [extra]
docs_dir : doc
pages:
- [index.md, Home]
- [Network.md, Network]
- [DeepSpeechModel.md, DeepSpeechModel]
- [Mapper.md, Mapper]
- [SequenceError.md, SequenceError]
- [ModelEvaluator.md, ModelEvaluator]
- [UtilsMultiGPU.md, UtilsMultiGPU]
- [Loader.md, Loader]
================================================
FILE: prepare_datasets/FormatAN4.lua
================================================
require 'torch'
-- Command-line options for converting the raw AN4 corpus layout into the
-- flat <name>.<audioExtension> / <name>.txt structure consumed downstream.
local cmd = torch.CmdLine()
cmd:option('-rootPath', 'an4', 'Path to the an4 root')
cmd:option('-newPath', 'an4_dataset', 'Path to the new data path')
cmd:option('-audioExtension', 'sph', 'The extension of the audio files (wav/mp3/sph/etc)')
cmd:option('-move', false, 'Moves the files over rather than copies, used to save space')
local opt = cmd:parse(arg)
-- AN4 index/transcript files live under etc/; the '.fileids' and
-- '.transcription' suffixes are appended in createDataset below.
local an4TestPath = opt.rootPath .. '/etc/an4_test.'
local an4TrainPath = opt.rootPath .. '/etc/an4_train.'
local an4AudioPath = opt.rootPath .. '/wav'
-- strips down the transcripts into pure text
local function processText(line)
local text = line:gsub('<s>', ''):gsub('</s>', ''):gsub('^%s', ''):gsub('%(.*%)', ''):gsub('%s*$', '')
return text
end
-- Writes one <fileName>.txt transcript per utterance into `newPath` and
-- copies (or moves, with -move) the matching audio file next to it.
-- `pathToAN4` is the etc/ prefix ('fileids'/'transcription' are appended);
-- `an4AudioPath` is the wav/ root. The fileids and transcription files are
-- assumed to be aligned line-for-line.
local function createDataset(pathToAN4, an4AudioPath, newPath)
    sys.execute("mkdir " .. newPath)
    local fileids = pathToAN4 .. 'fileids'
    local transcripts = pathToAN4 .. 'transcription'
    local filePaths = {}
    for filePath in io.lines(fileids) do
        table.insert(filePaths, filePath)
    end
    local counter = 1
    for line in io.lines(transcripts) do
        local text = processText(line)
        local filePath = filePaths[counter]
        -- last component of the AN4 file id becomes the new base name
        local fileName = sys.split(filePath, '/')[3]
        -- write the cleaned transcript (fail fast on an unwritable path
        -- instead of erroring later on a nil file handle)
        local textPath = newPath .. '/' .. fileName .. '.txt'
        local file = assert(io.open(textPath, "w"), "could not open " .. textPath)
        file:write(text)
        file:close()
        -- copy/move the audio alongside the transcript
        -- (was: an inner `local newPath` shadowed the `newPath` parameter)
        local audioPath = an4AudioPath .. '/' .. filePath .. '.' .. opt.audioExtension
        local newAudioPath = newPath .. '/' .. fileName .. '.' .. opt.audioExtension
        local command
        if opt.move then command = "mv " else command = "cp " end
        sys.execute(command .. audioPath .. ' ' .. newAudioPath)
        counter = counter + 1
    end
end
-- Build the train/ and test/ splits under the new dataset root.
sys.execute("mkdir " .. opt.newPath)
createDataset(an4TrainPath, an4AudioPath, opt.newPath .. '/train/')
createDataset(an4TestPath, an4AudioPath, opt.newPath .. '/test/')
================================================
FILE: prepare_datasets/FormatLibriSpeech.lua
================================================
require 'torch'
local threads = require 'threads'
-- Command-line options for flattening the LibriSpeech layout into the
-- <id>.<audioExtension> / <id>.txt structure consumed downstream.
local cmd = torch.CmdLine()
cmd:option('-rootPath', 'LibriSpeech', 'Path to the librispeech root')
cmd:option('-newPath', 'libri_dataset', 'Path to the new data path')
cmd:option('-audioExtension', 'flac', 'The extension of the audio files (wav/mp3/sph/etc)')
cmd:option('-move', false, 'Moves the files over rather than copies, used to save space')
cmd:option('-threads', 8, 'Number of threads to use')
local opt = cmd:parse(arg)
local extension = '.' .. opt.audioExtension
local libriTestPath = opt.rootPath .. '/test/'
local libriTrainPath = opt.rootPath .. '/train/'
-- Worker pool for the per-file copy jobs; this reuses (shadows) the
-- `threads` module local above. NOTE(review): the progress callbacks below
-- call xlua.progress, but 'xlua' is never required in this file —
-- presumably loaded transitively; verify before relying on it.
local threads = threads.Threads(opt.threads, function(idx) require 'torch' require 'sys' end)
-- strips down the transcripts into pure text
local function processText(line)
local text = line:gsub('[^a-zA-Z ]', '')
return text
end
-- Walks every speaker directory under `libriPath`; for each transcript line
-- it writes <id>.txt and copies/moves the matching audio into `newDirPath`.
-- File operations are dispatched to the worker thread pool; the progress
-- callback runs on the main thread.
local function createDataset(libriPath, newDirPath)
    sys.execute("mkdir " .. newDirPath)
    -- total number of audio files, used only for the progress bar
    local size = tonumber(sys.execute("find " .. libriPath .. " -type f -name '*'" .. extension .. " | wc -l "))
    -- Handles one transcript line: "<id> <TEXT...>".
    local function formatData(line, dir)
        local text = processText(line)
        local id = line:match("([^ ]*) ") -- leading utterance id, e.g. 1089-134686-0000
        local audioFolders = sys.split(id, '-') -- speaker / chapter folder names
        -- write the cleaned transcript (fail fast on an unwritable path)
        local textPath = newDirPath .. '/' .. id .. '.txt'
        local file = assert(io.open(textPath, "w"), "could not open " .. textPath)
        file:write(text)
        file:close()
        -- audio lives at <dir>/<speaker>/<chapter>/<id><extension>
        local audioPath = dir .. '/' .. audioFolders[1] .. '/' .. audioFolders[2] .. '/' .. id .. extension
        local newPath = newDirPath .. '/' .. id .. extension
        local command
        if opt.move then command = "mv " else command = "cp " end
        sys.execute(command .. audioPath .. ' ' .. newPath)
    end
    -- was: a dead `local counter = 1` above, immediately shadowed by this one
    local counter = 0
    local p = io.popen('find "' .. libriPath .. '" -maxdepth 1 -mindepth 1 -type d')
    for dir in p:lines() do
        local transcripts = io.popen("find -L " .. dir .. " -type f -name '*.txt'")
        for transcript in transcripts:lines() do
            for line in io.lines(transcript) do
                threads:addjob(function()
                    formatData(line, dir)
                end,
                function()
                    counter = counter + 1
                    xlua.progress(counter, size)
                end)
            end
        end
    end
end
-- Build the train/ and test/ splits under the new dataset root.
-- NOTE(review): no threads:synchronize() is issued after these calls, so
-- completion of queued copy jobs relies on the pool being torn down at
-- script exit — verify all jobs finish before consuming the output.
sys.execute("mkdir " .. opt.newPath)
createDataset(libriTrainPath, opt.newPath .. '/train/')
createDataset(libriTestPath, opt.newPath .. '/test/')
================================================
FILE: tests/test.lua
================================================
require 'nn'
-- Unit tests for SequenceError (WER maths) and Mapper (token mapping),
-- driven by torch.Tester. Run from the tests/ directory so the relative
-- requires and the 'test_dictionary' path resolve.
local test = torch.TestSuite()
local mytester
require '../SequenceError'
require '../Mapper'
-- shared SequenceError instance used by the WER tests below
local sequenceError = SequenceError()
-- WER = (insertions + deletions + substitutions) / number of target words.
function test.evaluator()
    local target = "test a sentence" -- three words
    local oneMistakeWER = 1 / 3
    local twoMistakeWER = 2 / 3

    local deletion = sequenceError:calculateWER(target, "a sentence")
    local insertion = sequenceError:calculateWER(target, "test a sentence inserted")
    local substitution = sequenceError:calculateWER(target, "test substituted sentence")
    mytester:eq(deletion, oneMistakeWER, 'WER with deletion was incorrect')
    mytester:eq(insertion, oneMistakeWER, 'WER with insertion was incorrect')
    mytester:eq(substitution, oneMistakeWER, 'WER with substitution was incorrect')

    local twoDeletions = sequenceError:calculateWER(target, "a")
    local deletionAndSubstitution = sequenceError:calculateWER(target, "a wrong")
    local substitutionAndInsertion = sequenceError:calculateWER(target, "wrong a sentence inserted")
    mytester:eq(twoDeletions, twoMistakeWER, 'masking of outputs was incorrect')
    mytester:eq(deletionAndSubstitution, twoMistakeWER, 'WER with substitution and deletion was incorrect')
    mytester:eq(substitutionAndInsertion, twoMistakeWER, 'WER with substitution and insertion was incorrect')
end
-- Verifies the Mapper builds the expected character -> token table from the
-- test dictionary (tokens are zero-based, following dictionary line order).
function test.mapper()
    local mapper = Mapper('test_dictionary')
    local alphabet = {
        '$', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r',
        's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' ', '\''
    }
    local expected = {}
    for position, character in ipairs(alphabet) do
        expected[character] = position - 1
    end
    mytester:eq(mapper.alphabet2token, expected)
end
-- Decodes a 3x3 likelihood matrix through the Mapper: the expected token
-- sequence is {2, 1, 2}, which maps back to the text 'bab'.
function test.mapperDecode()
    local mapper = Mapper('test_dictionary')
    local predictions = torch.Tensor({ { 1, 2, 3 }, { 2, 3, 1 }, { 1, 2, 3 } })
    local decoded = mapper:decodeOutput(predictions)
    mytester:eq(decoded, { 2, 1, 2 })
    mytester:eq(mapper:tokensToText(decoded), 'bab')
end
-- Register the suite and run every test.
mytester = torch.Tester()
mytester:add(test)
mytester:run()
================================================
FILE: tests/test_dictionary
================================================
$
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
'
gitextract_kmme0l_o/
├── .gitignore
├── .travis.yml
├── BatchBRNN.lua
├── BatchBRNNReLU.lua
├── DeepSpeechModel.lua
├── LICENSE.md
├── Loader.lua
├── MakeLMDB.lua
├── Mapper.lua
├── ModelEvaluator.lua
├── Network.lua
├── Predict.lua
├── README.md
├── SequenceError.lua
├── Test.lua
├── Train.lua
├── UtilsMultiGPU.lua
├── dictionary
├── doc/
│ ├── DeepSpeechModel.md
│ ├── Loader.md
│ ├── Mapper.md
│ ├── ModelEvaluator.md
│ ├── Network.md
│ ├── SequenceError.md
│ ├── UtilsMultiGPU.md
│ └── index.md
├── mkdocs.yml
├── prepare_datasets/
│ ├── FormatAN4.lua
│ └── FormatLibriSpeech.lua
└── tests/
├── test.lua
└── test_dictionary
Condensed preview — 31 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (63K chars).
[
{
"path": ".gitignore",
"chars": 101,
"preview": ".idea/\nAudio\nsystemtests/\nsystemtests\nCTC.iml\nCTCSpeechRecognition.iml\n*.log\n*.log.eps\n*.t7\nSeq2Seq/\n"
},
{
"path": ".travis.yml",
"chars": 2014,
"preview": "language: c\nnotifications:\n email: false\ncompiler:\n - gcc\n - clang\ncache:\n directories:\n - $HOME/OpenBlasInstall\nsu"
},
{
"path": "BatchBRNN.lua",
"chars": 1907,
"preview": "------------------------------------------------------------------------\n--[[ BatchBRNN ]] --\n-- Adds sequence-wise batc"
},
{
"path": "BatchBRNNReLU.lua",
"chars": 555,
"preview": "require 'BatchBRNN'\n------------------------------------------------------------------------\n--[[ BatchBRNNReLU ]] --\n--"
},
{
"path": "DeepSpeechModel.lua",
"chars": 2441,
"preview": "require 'UtilsMultiGPU'\n\nlocal function RNNModule(inputDim, hiddenDim, opt)\n if opt.nGPU > 0 then\n if opt.LSTM"
},
{
"path": "LICENSE.md",
"chars": 1077,
"preview": "The MIT License (MIT)\n\nCopyright (c) 2016 Sean Naren\n\nPermission is hereby granted, free of charge, to any person obtain"
},
{
"path": "Loader.lua",
"chars": 3082,
"preview": "require 'nn'\nrequire 'torch'\nrequire 'lmdb'\nrequire 'xlua'\nrequire 'paths'\nrequire 'Mapper'\nlocal tds = require 'tds'\n\nt"
},
{
"path": "MakeLMDB.lua",
"chars": 5434,
"preview": "-- Expects data in the format of <root><train/test><datasetname><filename.wav/filename.txt>\n-- Creates an LMDB of everyt"
},
{
"path": "Mapper.lua",
"chars": 1702,
"preview": "require 'torch'\n\n-- construct an object to deal with the mapping\nlocal mapper = torch.class('Mapper')\n\nfunction mapper:_"
},
{
"path": "ModelEvaluator.lua",
"chars": 4154,
"preview": "require 'Loader'\nrequire 'Mapper'\nrequire 'torch'\nrequire 'xlua'\nlocal threads = require 'threads'\nrequire 'SequenceErro"
},
{
"path": "Network.lua",
"chars": 7230,
"preview": "require 'optim'\nrequire 'nnx'\nrequire 'gnuplot'\nrequire 'lfs'\nrequire 'xlua'\nrequire 'UtilsMultiGPU'\nrequire 'Loader'\nre"
},
{
"path": "Predict.lua",
"chars": 1319,
"preview": "require 'nn'\nrequire 'audio'\nrequire 'Mapper'\nrequire 'UtilsMultiGPU'\nlocal cmd = torch.CmdLine()\ncmd:option('-modelPath"
},
{
"path": "README.md",
"chars": 4461,
"preview": "# deepspeech.torch\n\n[](https://travis"
},
{
"path": "SequenceError.lua",
"chars": 1595,
"preview": "local SequenceError = torch.class(\"SequenceError\")\n\n-- Calculates a sequence error rate (aka Levenshtein edit distance)\n"
},
{
"path": "Test.lua",
"chars": 1383,
"preview": "local Network = require 'Network'\n\n-- Load the network from the saved model. Options can be overrided on command line ru"
},
{
"path": "Train.lua",
"chars": 2270,
"preview": "local Network = require 'Network'\n\n-- Options can be overrided on command line run.\nlocal cmd = torch.CmdLine()\ncmd:opti"
},
{
"path": "UtilsMultiGPU.lua",
"chars": 2665,
"preview": "require 'rnn'\nrequire 'nngraph'\nfunction makeDataParallel(model, nGPU)\n if nGPU > 0 then\n cudnn.fastest = true"
},
{
"path": "dictionary",
"chars": 57,
"preview": "$\na\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\nq\nr\ns\nt\nu\nv\nw\nx\ny\nz\n \n'"
},
{
"path": "doc/DeepSpeechModel.md",
"chars": 609,
"preview": "# DeepSpeechModel\n\nDefines the deep speech 2 conv+rnn architecture.\n\n### deepSpeech(opt)\n\nDefines the torch architecture"
},
{
"path": "doc/Loader.md",
"chars": 1059,
"preview": "# Loader\n\nDefines the indexer class and the loader class, handling batching of the dataset to train the network.\n\n## Ind"
},
{
"path": "doc/Mapper.md",
"chars": 790,
"preview": "# Mapper\n\nDefines how numeric indices are mapped to tokens and vice versa.\n\n### Mapper:__init(dictPath)\n\nCreates mapping"
},
{
"path": "doc/ModelEvaluator.md",
"chars": 1131,
"preview": "# ModelEvaluator\n\nHandles calculation of word error rate using an LMDB dataset. For more information on the calculation,"
},
{
"path": "doc/Network.md",
"chars": 3205,
"preview": "# Network\n\nHandles interactions with the neural network for training and testing. Configured by network parameters given"
},
{
"path": "doc/SequenceError.md",
"chars": 619,
"preview": "# SequenceError\n\nCalculates word error rates and handles conversion of CTC predictions to numeric tokens.\n\n### SequenceE"
},
{
"path": "doc/UtilsMultiGPU.md",
"chars": 577,
"preview": "# UtilsMultiGPU\n\nHandles multi-gpu setups of the architecture.\n\n### makeDataParallel(model, nGPU)\n\nConverts the model in"
},
{
"path": "doc/index.md",
"chars": 387,
"preview": "# Technical Documentation\n\nBelow are a few classes that have been documented, explaining their purpose and functions ava"
},
{
"path": "mkdocs.yml",
"chars": 423,
"preview": "site_name: CTCSpeechRecognition\ntheme : simplex\nrepo_url : https://github.com/SeanNaren/CTCSpeechRecognition\nuse_directo"
},
{
"path": "prepare_datasets/FormatAN4.lua",
"chars": 2101,
"preview": "require 'torch'\nlocal cmd = torch.CmdLine()\ncmd:option('-rootPath', 'an4', 'Path to the an4 root')\ncmd:option('-newPath'"
},
{
"path": "prepare_datasets/FormatLibriSpeech.lua",
"chars": 2604,
"preview": "require 'torch'\nlocal threads = require 'threads'\n\nlocal cmd = torch.CmdLine()\ncmd:option('-rootPath', 'LibriSpeech', 'P"
},
{
"path": "tests/test.lua",
"chars": 2603,
"preview": "require 'nn'\n\nlocal test = torch.TestSuite()\nlocal mytester\nrequire '../SequenceError'\nrequire '../Mapper'\n\nlocal sequen"
},
{
"path": "tests/test_dictionary",
"chars": 57,
"preview": "$\na\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\nq\nr\ns\nt\nu\nv\nw\nx\ny\nz\n \n'"
}
]
About this extraction
This page contains the full source code of the SeanNaren/deepspeech.torch GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 31 files (58.2 KB), approximately 15.8k tokens. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.