[
  {
    "path": ".gitignore",
    "content": ".idea/\nAudio\nsystemtests/\nsystemtests\nCTC.iml\nCTCSpeechRecognition.iml\n*.log\n*.log.eps\n*.t7\nSeq2Seq/\n"
  },
  {
    "path": ".travis.yml",
    "content": "language: c\nnotifications:\n  email: false\ncompiler:\n  - gcc\n  - clang\ncache:\n  directories:\n  - $HOME/OpenBlasInstall\nsudo: false\nenv:\n  - TORCH_LUA_VERSION=LUAJIT21\n  - TORCH_LUA_VERSION=LUA51\n  - TORCH_LUA_VERSION=LUA52\naddons:\n  apt:\n    packages:\n    - cmake\n    - gfortran\n    - gcc-multilib\n    - gfortran-multilib\n    - liblapack-dev\n    - build-essential\n    - gcc\n    - g++\n    - curl\n    - cmake\n    - libreadline-dev\n    - git-core\n    - libqt4-core\n    - libqt4-gui\n    - libqt4-dev\n    - libjpeg-dev\n    - libpng-dev\n    - ncurses-dev\n    - imagemagick\n    - libzmq3-dev\n    - gfortran\n    - unzip\n    - gnuplot\n    - gnuplot-x11\nbefore_script:\n- export ROOT_TRAVIS_DIR=$(pwd)\n- export INSTALL_PREFIX=~/torch/install\n-  ls $HOME/OpenBlasInstall/lib || (cd /tmp/ && git clone https://github.com/xianyi/OpenBLAS.git -b master && cd OpenBLAS && (make NO_AFFINITY=1 -j$(getconf _NPROCESSORS_ONLN) 2>/dev/null >/dev/null) && make PREFIX=$HOME/OpenBlasInstall install)\n- git clone https://github.com/torch/distro.git ~/torch --recursive\n- cd ~/torch && git submodule update --init --recursive\n- mkdir build && cd build\n- export CMAKE_LIBRARY_PATH=$HOME/OpenBlasInstall/include:$HOME/OpenBlasInstall/lib:$CMAKE_LIBRARY_PATH\n- cmake .. 
-DCMAKE_INSTALL_PREFIX=\"${INSTALL_PREFIX}\" -DCMAKE_BUILD_TYPE=Release -DWITH_${TORCH_LUA_VERSION}=ON\n- make && make install\n- cd $ROOT_TRAVIS_DIR\n- export LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib:$LD_LIBRARY_PATH\n- ${INSTALL_PREFIX}/bin/luarocks install nn\n- ${INSTALL_PREFIX}/bin/luarocks install dpnn\nscript:\n- cd ${ROOT_TRAVIS_DIR}\n- git clone https://github.com/SeanNaren/CTCSpeechRecognition.git\n- cd CTCSpeechRecognition/tests\n- rsync -av --progress ../* ${INSTALL_PREFIX}/share/lua/5.1/ --exclude ../tests --exclude ../prepare_an4/\n- rsync -av --progress ../* ${INSTALL_PREFIX}/share/lua/5.2/ --exclude ../tests --exclude ../prepare_an4/\n- export PATH=${INSTALL_PREFIX}/bin:$PATH\n- export TESTLUA=$(which luajit lua | head -n 1)\n- echo ${TESTLUA}\n- ${TESTLUA} test.lua\n"
  },
  {
    "path": "BatchBRNN.lua",
    "content": "------------------------------------------------------------------------\n--[[ BatchBRNN ]] --\n-- Adds sequence-wise batch normalization to cudnn RNN modules.\n-- For a simple RNN: ht = ReLU(B(Wixt) + Riht-1 + bRi) where B\n-- is the batch normalization.\n-- Expects size seqLength x minibatch x inputDim.\n-- Returns seqLength x minibatch x outputDim.\n-- Can specify an rnnModule such as cudnn.LSTM (defaults to RNNReLU).\n------------------------------------------------------------------------\nlocal BatchBRNN, parent = torch.class('cudnn.BatchBRNN', 'nn.Sequential')\n\nfunction BatchBRNN:__init(inputDim, outputDim)\n    parent.__init(self)\n\n    self.view_in = nn.View(1, 1, -1):setNumInputDims(3)\n    self.view_out = nn.View(1, -1):setNumInputDims(2)\n\n    self.rnn = cudnn.RNN(outputDim, outputDim, 1)\n    local rnn = self.rnn\n    rnn.inputMode = 'CUDNN_SKIP_INPUT'\n    rnn.bidirectional = 'CUDNN_BIDIRECTIONAL'\n    rnn.numDirections = 2\n    rnn:reset()\n    self:add(self.view_in)\n    self:add(nn.Linear(inputDim, outputDim, false))\n    self:add(nn.BatchNormalization(outputDim))\n    self:add(self.view_out)\n    self:add(rnn)\n    self:add(nn.View(-1, 2, outputDim):setNumInputDims(2))\n    self:add(nn.Sum(3))\nend\n\nfunction BatchBRNN:updateOutput(input)\n    local T, N = input:size(1), input:size(2)\n    self.view_in:resetSize(T * N, -1)\n    self.view_out:resetSize(T, N, -1)\n    return parent.updateOutput(self, input)\nend\n\nfunction BatchBRNN:__tostring__()\n    local tab = '  '\n    local line = '\\n'\n    local next = ' -> '\n    local str = 'BatchBRNN'\n    str = str .. ' {' .. line .. tab .. '[input'\n    for i=1,#self.modules do\n        str = str .. next .. '(' .. i .. ')'\n    end\n    str = str .. next .. 'output]'\n    for i=1,#self.modules do\n        str = str .. line .. tab .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab)\n    end\n    str = str .. line .. '}'\n    return str\nend"
  },
  {
    "path": "BatchBRNNReLU.lua",
    "content": "require 'BatchBRNN'\n------------------------------------------------------------------------\n--[[ BatchBRNNReLU ]] --\n-- Based On BatchBRNN. Adds ClippedReLU non-linearity to Vanilla BRNN.\n------------------------------------------------------------------------\nlocal BatchBRNNReLU, parent = torch.class('cudnn.BatchBRNNReLU', 'cudnn.BatchBRNN')\n\nfunction BatchBRNNReLU:__init(inputDim, outputDim)\n    parent.__init(self, inputDim, outputDim)\n    local rnn = self.rnn\n    rnn.mode = 'CUDNN_RNN_RELU'\n    rnn:reset()\n    self:insert(nn.Clamp(0, 20), 6)\nend"
  },
  {
    "path": "DeepSpeechModel.lua",
    "content": "require 'UtilsMultiGPU'\n\nlocal function RNNModule(inputDim, hiddenDim, opt)\n    if opt.nGPU > 0 then\n        if opt.LSTM then\n            local blstm = nn.Sequential()\n            blstm:add(cudnn.BLSTM(inputDim, hiddenDim, 1))\n            blstm:add(nn.View(-1, 2, hiddenDim):setNumInputDims(2)) -- have to sum activations\n            blstm:add(nn.Sum(3))\n            return blstm\n        else\n            require 'BatchBRNNReLU'\n            return cudnn.BatchBRNNReLU(inputDim, hiddenDim)\n        end\n    else\n        require 'rnn'\n        return nn.SeqBRNN(inputDim, hiddenDim)\n    end\nend\n\n-- Creates the convnet+rnn structure.\nlocal function deepSpeech(opt)\n    local conv = nn.Sequential()\n    -- (nInputPlane, nOutputPlane, kW, kH, [dW], [dH], [padW], [padH]) conv layers.\n    conv:add(nn.SpatialConvolution(1, 32, 11, 41, 2, 2))\n    conv:add(nn.SpatialBatchNormalization(32))\n    conv:add(nn.Clamp(0, 20))\n    conv:add(nn.SpatialConvolution(32, 32, 11, 21, 2, 1))\n    conv:add(nn.SpatialBatchNormalization(32))\n    conv:add(nn.Clamp(0, 20))\n    local rnnInputsize = 32 * 41 -- based on the above convolutions and 16khz audio.\n    local rnnHiddenSize = opt.hiddenSize -- size of rnn hidden layers\n    local nbOfHiddenLayers = opt.nbOfHiddenLayers\n\n    conv:add(nn.View(rnnInputsize, -1):setNumInputDims(3)) -- batch x features x seqLength\n    conv:add(nn.Transpose({ 2, 3 }, { 1, 2 })) -- seqLength x batch x features\n\n    local rnns = nn.Sequential()\n    local rnnModule = RNNModule(rnnInputsize, rnnHiddenSize, opt)\n    rnns:add(rnnModule:clone())\n    rnnModule = RNNModule(rnnHiddenSize, rnnHiddenSize, opt)\n\n    for i = 1, nbOfHiddenLayers - 1 do\n        rnns:add(nn.Bottle(nn.BatchNormalization(rnnHiddenSize), 2))\n        rnns:add(rnnModule:clone())\n    end\n\n    local fullyConnected = nn.Sequential()\n    fullyConnected:add(nn.BatchNormalization(rnnHiddenSize))\n    fullyConnected:add(nn.Linear(rnnHiddenSize, 29))\n\n    
local model = nn.Sequential()\n    model:add(conv)\n    model:add(rnns)\n    model:add(nn.Bottle(fullyConnected, 2))\n    model:add(nn.Transpose({1, 2})) -- batch x seqLength x features\n    model = makeDataParallel(model, opt.nGPU)\n    return model\nend\n\n-- Based on convolution kernel and strides.\nlocal function calculateInputSizes(sizes)\n    sizes = torch.floor((sizes - 11) / 2 + 1) -- conv1\n    sizes = torch.floor((sizes - 11) / 2 + 1) -- conv2\n    return sizes\nend\n\nreturn { deepSpeech, calculateInputSizes }"
  },
  {
    "path": "LICENSE.md",
    "content": "The MIT License (MIT)\n\nCopyright (c) 2016 Sean Naren\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "Loader.lua",
    "content": "require 'nn'\nrequire 'torch'\nrequire 'lmdb'\nrequire 'xlua'\nrequire 'paths'\nrequire 'Mapper'\nlocal tds = require 'tds'\n\ntorch.setdefaulttensortype('torch.FloatTensor')\n\nlocal indexer = torch.class('indexer')\n\nfunction indexer:__init(dirPath, batchSize)\n\n    local dbSpect = lmdb.env { Path = dirPath .. '/spect', Name = 'spect' }\n    local dbTrans = lmdb.env { Path = dirPath .. '/trans', Name = 'trans' }\n\n    self.batchSize = batchSize\n    self.count = 1\n    -- get the size of lmdb\n    dbSpect:open()\n    dbTrans:open()\n    local audioLMDBSize = dbSpect:stat()['entries']\n    local transcriptLMDBSize = dbTrans:stat()['entries']\n    self.size = audioLMDBSize\n    dbSpect:close()\n    dbTrans:close()\n    self.nbOfBatches = math.ceil(self.size / self.batchSize)\n    assert(audioLMDBSize == transcriptLMDBSize, 'Audio and transcript LMDBs had different lengths!')\n    assert(self.size > self.batchSize, 'batchSize larger than lmdb size!')\n\n    self.inds = torch.range(1, self.size):split(batchSize)\n    self.batchIndices = torch.range(1, self.nbOfBatches)\nend\n\nfunction indexer:nextIndices()\n    if self.count > #self.inds then self.count = 1 end\n    local index = self.batchIndices[self.count]\n    local inds = self.inds[index]\n    self.count = self.count + 1\n    return inds\nend\n\nfunction indexer:permuteBatchOrder()\n    self.batchIndices = torch.randperm(self.nbOfBatches)\nend\n\nlocal Loader = torch.class('Loader')\n\nfunction Loader:__init(dirPath, mapper)\n    self.dbSpect = lmdb.env { Path = dirPath .. '/spect', Name = 'spect' }\n    self.dbTrans = lmdb.env { Path = dirPath .. 
'/trans', Name = 'trans' }\n    self.dbSpect:open()\n    self.size = self.dbSpect:stat()['entries']\n    self.dbSpect:close()\n    self.mapper = mapper\nend\n\nfunction Loader:nextBatch(indices)\n    local tensors = tds.Vec()\n    local targets = {}\n    local transcripts = {}\n\n    local maxLength = 0\n    local freq = 0\n\n    self.dbSpect:open(); local readerSpect = self.dbSpect:txn(true) -- readonly\n    self.dbTrans:open(); local readerTrans = self.dbTrans:txn(true)\n\n    local size = indices:size(1)\n\n    local sizes = torch.Tensor(#indices)\n\n    local permutedIndices = torch.randperm(size) -- batch tensor has different order each time\n    -- reads out a batch and store in lists\n    for x = 1, size do\n        local ind = indices[permutedIndices[x]]\n        local tensor = readerSpect:get(ind):float()\n        local transcript = readerTrans:get(ind)\n\n        freq = tensor:size(1)\n        sizes[x] = tensor:size(2)\n        if maxLength < tensor:size(2) then maxLength = tensor:size(2) end -- find the max len in this batch\n\n        tensors:insert(tensor)\n        table.insert(targets, self.mapper:encodeString(transcript))\n        table.insert(transcripts, transcript)\n    end\n\n    local inputs = torch.Tensor(size, 1, freq, maxLength):zero()\n    for ind, tensor in ipairs(tensors) do\n        inputs[ind][1]:narrow(2, 1, tensor:size(2)):copy(tensor)\n    end\n\n    readerSpect:abort(); self.dbSpect:close()\n    readerTrans:abort(); self.dbTrans:close()\n\n    return inputs, targets, sizes, transcripts\nend\n"
  },
  {
    "path": "MakeLMDB.lua",
    "content": "-- Expects data in the format of <root><train/test><datasetname><filename.wav/filename.txt>\n-- Creates an LMDB of everything in these folders into a train and test set.\n\nrequire 'lfs'\nrequire 'audio'\nrequire 'xlua'\nrequire 'lmdb'\nrequire 'torch'\nrequire 'parallel'\n\nlocal tds = require 'tds'\n\nlocal cmd = torch.CmdLine()\ncmd:option('-rootPath', 'prepare_datasets/an4_dataset', 'Path to the data')\ncmd:option('-lmdbPath', 'prepare_datasets/an4_lmdb', 'Path to save LMDBs to')\ncmd:option('-windowSize', 0.02, 'Window size for audio data')\ncmd:option('-stride', 0.01, 'Stride for audio data')\ncmd:option('-sampleRate', 16000, 'Sample rate of audio data (Default 16khz)')\ncmd:option('-audioExtension', 'sph', 'The extension of the audio files (wav/mp3/sph/etc)')\ncmd:option('-processes', 8, 'Number of processes used to create LMDB')\n\nlocal opt = cmd:parse(arg)\nlocal dataPath = opt.rootPath\nlocal lmdbPath = opt.lmdbPath\nlocal extension = '.' .. opt.audioExtension\nparallel.nfork(opt.processes)\n\nlocal function startWriter(path, name)\n  local db = lmdb.env {\n    Path = path,\n    Name = name\n  }\n  db:open()\n  local txn = db:txn()\n  return db, txn\nend\n\nlocal function closeWriter(db, txn)\n  txn:commit()\n  db:close()\nend\n\nlocal function createLMDB(dataPath, lmdbPath)\n  local vecs = tds.Vec()\n  local size = tonumber(sys.execute(\"find \" .. dataPath .. \" -type f -name '*'\" .. extension .. \" | wc -l \"))\n  vecs:resize(size)\n\n  local files = io.popen(\"find -L \" .. dataPath .. \" -type f -name '*\" .. extension .. 
\"'\")\n  local counter = 1\n  print(\"Retrieving sizes for sorting...\")\n  local buffer = tds.Vec()\n  buffer:resize(size)\n\n  for file in files:lines() do\n    buffer[counter] = file\n    counter = counter + 1\n  end\n\n  local function getSize(opts)\n    local audioFilePath = opts.file\n    local transcriptFilePath = opts.file:gsub(opts.extension, \".txt\")\n    local opt = opts.opt\n    local audioFile = audio.load(audioFilePath)\n    local length = audio.spectrogram(audioFile, opt.windowSize * opt.sampleRate, 'hamming', opt.stride * opt.sampleRate):size(2)\n    return { audioFilePath, transcriptFilePath, length }\n  end\n\n  for x = 1, opt.processes do\n    local opts = { extension = extension, file = buffer[x], opt = opt }\n    parallel.children[x]:send({ opts, getSize })\n  end\n\n  local processCounter = 1\n  for x = 1, size do\n    local result = parallel.children[processCounter]:receive()\n    vecs[x] = tds.Vec(unpack(result))\n    xlua.progress(x, size)\n    if x % 1000 == 0 then collectgarbage() end\n    -- send next index to retrieve\n    if x + opt.processes <= size then\n      local opts = { extension = extension, file = buffer[x + opt.processes], opt = opt }\n      parallel.children[processCounter]:send({ opts, getSize })\n    end\n    if processCounter == opt.processes then\n      processCounter = 1\n    else\n      processCounter = processCounter + 1\n    end\n  end\n  print(\"Sorting...\")\n  -- sort the files by length\n  local function comp(a, b) return a[3] < b[3] end\n\n  vecs:sort(comp)\n  local size = #vecs\n\n  print(\"Creating LMDB dataset to: \" .. lmdbPath)\n  -- start writing\n  local dbSpect, readerSpect = startWriter(lmdbPath .. '/spect', 'spect')\n  local dbTrans, readerTrans = startWriter(lmdbPath .. 
'/trans', 'trans')\n\n  local function getData(opts)\n    local opt = opts.opt\n    local audioFile = audio.load(opts.audioFilePath)\n    local spect = audio.spectrogram(audioFile, opt.windowSize * opt.sampleRate, 'hamming', opt.stride * opt.sampleRate) -- freq-by-frames tensor\n\n    -- put into lmdb\n    spect = spect:float()\n\n    -- normalize the data\n    local mean = spect:mean()\n    local std = spect:std()\n    spect:add(-mean)\n    spect:div(std)\n\n    local transcript\n    for line in io.lines(opts.transcriptFilePath) do\n      transcript = line\n    end\n    return { spect, transcript }\n  end\n\n  for x = 1, opt.processes do\n    local vec = vecs[x]\n    local opts = { audioFilePath = vec[1], transcriptFilePath = vec[2], opt = opt }\n    parallel.children[x]:send({ opts, getData })\n  end\n\n  local processCounter = 1\n  for x = 1, size do\n    local result = parallel.children[processCounter]:receive()\n    local spect, transcript = unpack(result)\n\n    readerSpect:put(x, spect)\n    readerTrans:put(x, transcript)\n\n    -- commit buffer\n    if x % 500 == 0 then\n      readerSpect:commit(); readerSpect = dbSpect:txn()\n      readerTrans:commit(); readerTrans = dbTrans:txn()\n      collectgarbage()\n    end\n\n    if x + opt.processes <= size then\n      local vec = vecs[x + opt.processes]\n      local opts = { audioFilePath = vec[1], transcriptFilePath = vec[2], opt = opt }\n      parallel.children[processCounter]:send({ opts, getData })\n    end\n    if processCounter == opt.processes then\n      processCounter = 1\n    else\n      processCounter = processCounter + 1\n    end\n    xlua.progress(x, size)\n  end\n\n  closeWriter(dbSpect, readerSpect)\n  closeWriter(dbTrans, readerTrans)\nend\n\nfunction parent()\n  local function looper()\n    require 'torch'\n    require 'audio'\n    while true do\n      local object = parallel.parent:receive()\n      local opts, code = unpack(object)\n      local result = code(opts)\n      
parallel.parent:send(result)\n      collectgarbage()\n    end\n  end\n\n  parallel.children:exec(looper)\n\n  createLMDB(dataPath .. '/train', lmdbPath .. '/train')\n  createLMDB(dataPath .. '/test', lmdbPath .. '/test')\n  parallel.close()\nend\n\nlocal ok, err = pcall(parent)\nif not ok then\n  print(err)\n  parallel.close()\nend"
  },
  {
    "path": "Mapper.lua",
    "content": "require 'torch'\n\n-- construct an object to deal with the mapping\nlocal mapper = torch.class('Mapper')\n\nfunction mapper:__init(dictPath)\n    assert(paths.filep(dictPath), dictPath ..' not found')\n\n    self.alphabet2token = {}\n    self.token2alphabet = {}\n\n    -- make maps\n    local cnt = 0\n    for line in io.lines(dictPath) do\n        self.alphabet2token[line] = cnt\n        self.token2alphabet[cnt] = line\n        cnt = cnt + 1\n    end\nend\n\nfunction mapper:encodeString(line)\n    line = string.lower(line)\n    local label = {}\n    for i = 1, #line do\n        local character = line:sub(i, i)\n        table.insert(label, self.alphabet2token[character])\n    end\n    return label\nend\n\nfunction mapper:decodeOutput(predictions)\n    --[[\n        Turns the predictions tensor into a list of the most likely tokens\n\n        NOTE:\n            to compute WER we strip the beginning and ending spaces\n    --]]\n    local tokens = {}\n    local blankToken = self.alphabet2token['$']\n    local preToken = blankToken\n    -- The prediction is a sequence of likelihood vectors\n    local _, maxIndices = torch.max(predictions, 2)\n    maxIndices = maxIndices:float():squeeze()\n\n    for i=1, maxIndices:size(1) do\n        local token = maxIndices[i] - 1 -- CTC indexes start from 1, while token starts from 0\n        -- add token if it's not blank, and is not the same as pre_token\n        if token ~= blankToken and token ~= preToken then\n            table.insert(tokens, token)\n        end\n        preToken = token\n    end\n    return tokens\nend\n\nfunction mapper:tokensToText(tokens)\n    local text = \"\"\n    for i, t in ipairs(tokens) do\n        text = text .. self.token2alphabet[tokens[i]]\n    end\n    return text\nend\n"
  },
  {
    "path": "ModelEvaluator.lua",
    "content": "require 'Loader'\nrequire 'Mapper'\nrequire 'torch'\nrequire 'xlua'\nlocal threads = require 'threads'\nrequire 'SequenceError'\n\nlocal ModelEvaluator = torch.class('ModelEvaluator')\n\nlocal loader\n\nfunction ModelEvaluator:__init(isGPU, datasetPath, mapper, testBatchSize, logsPath)\n    loader = Loader(datasetPath, mapper)\n    self.testBatchSize = testBatchSize\n    self.nbOfTestIterations = math.ceil(loader.size / testBatchSize)\n    self.indexer = indexer(datasetPath, testBatchSize)\n    self.pool = threads.Threads(1, function() require 'Loader' end)\n    self.mapper = mapper\n    self.logsPath = logsPath\n    self.suffix = '_' .. os.date('%Y%m%d_%H%M%S')\n    self.sequenceError = SequenceError()\n    self.input = torch.Tensor()\n    self.isGPU = isGPU\n    if isGPU then\n        self.input = self.input:cuda()\n    end\nend\n\nfunction ModelEvaluator:runEvaluation(model, verbose, epoch)\n    local spect_buf, label_buf, sizes_buf\n\n    -- get first batch\n    local inds = self.indexer:nextIndices()\n    self.pool:addjob(function()\n        return loader:nextBatch(inds)\n    end,\n        function(spect, label, sizes)\n            spect_buf = spect\n            label_buf = label\n            sizes_buf = sizes\n        end)\n\n    if verbose then\n        local f = assert(io.open(self.logsPath .. 'WER_Test' .. self.suffix .. '.log', 'a'), \"Could not create validation test logs, does the folder \"\n                .. self.logsPath .. \" exist?\")\n        f:write('======================== BEGIN WER TEST EPOCH: ' .. epoch .. 
' =========================\\n')\n        f:close()\n    end\n\n    local evaluationPredictions = {} -- stores the predictions to order for log.\n    local cumCER = 0\n    local cumWER = 0\n    local numberOfSamples = 0\n    -- ======================= for every test iteration ==========================\n    for i = 1, self.nbOfTestIterations do\n        -- get buf and fetch next one\n        self.pool:synchronize()\n        local inputsCPU, targets, sizes_array = spect_buf, label_buf, sizes_buf\n        inds = self.indexer:nextIndices()\n        self.pool:addjob(function()\n            return loader:nextBatch(inds)\n        end,\n            function(spect, label, sizes)\n                spect_buf = spect\n                label_buf = label\n                sizes_buf = sizes\n            end)\n\n        self.input:resize(inputsCPU:size()):copy(inputsCPU)\n        local predictions = model:forward(self.input)\n        if self.isGPU then cutorch.synchronize() end\n\n        local size = predictions:size(1)\n        for j = 1, size do\n            local prediction = predictions[j]\n            local predict_tokens = self.mapper:decodeOutput(prediction)\n            local targetTranscript = self.mapper:tokensToText(targets[j])\n            local predictTranscript = self.mapper:tokensToText(predict_tokens)\n\n            local CER = self.sequenceError:calculateCER(targetTranscript, predictTranscript)\n            local WER = self.sequenceError:calculateWER(targetTranscript, predictTranscript)\n\n            cumCER = cumCER + CER\n            cumWER = cumWER + WER\n\n            table.insert(evaluationPredictions, { wer = WER * 100, cer = CER * 100, target = targetTranscript, prediction = predictTranscript })\n        end\n        numberOfSamples = numberOfSamples + size\n    end\n\n    local function comp(a, b) return a.wer < b.wer end\n\n    table.sort(evaluationPredictions, comp)\n\n    if verbose then\n        for index, eval in ipairs(evaluationPredictions) do\n      
      local f = assert(io.open(self.logsPath .. 'Evaluation_Test' .. self.suffix .. '.log', 'a'))\n            f:write(string.format(\"WER = %.2f | CER = %.2f | Text = \\\"%s\\\" | Predict = \\\"%s\\\"\\n\",\n                eval.wer, eval.cer, eval.target, eval.prediction))\n            f:close()\n        end\n    end\n    local averageWER = cumWER / numberOfSamples\n    local averageCER = cumCER / numberOfSamples\n\n    local f = assert(io.open(self.logsPath .. 'Evaluation_Test' .. self.suffix .. '.log', 'a'))\n    f:write(string.format(\"Average WER = %.2f | CER = %.2f\", averageWER * 100, averageCER * 100))\n    f:close()\n\n    self.pool:synchronize() -- end the last loading\n    return averageWER, averageCER\nend"
  },
  {
    "path": "Network.lua",
    "content": "require 'optim'\nrequire 'nnx'\nrequire 'gnuplot'\nrequire 'lfs'\nrequire 'xlua'\nrequire 'UtilsMultiGPU'\nrequire 'Loader'\nrequire 'nngraph'\nrequire 'Mapper'\nrequire 'ModelEvaluator'\n\nlocal suffix = '_' .. os.date('%Y%m%d_%H%M%S')\nlocal threads = require 'threads'\nlocal Network = {}\n\n--Training parameters\nseed = 10\ntorch.setdefaulttensortype('torch.FloatTensor')\ntorch.manualSeed(seed)\n\nfunction Network:init(opt)\n    self.fileName = opt.saveFileName\n    self.nGPU = opt.nGPU\n    self.gpu = self.nGPU > 0\n\n    if not self.gpu then\n        require 'rnn'\n    else\n        require 'cutorch'\n        require 'cunn'\n        require 'cudnn'\n        require 'BatchBRNNReLU'\n        cutorch.manualSeedAll(seed)\n    end\n    self.trainingSetLMDBPath = opt.trainingSetLMDBPath\n    self.validationSetLMDBPath = opt.validationSetLMDBPath\n    self.logsTrainPath = opt.logsTrainPath or nil\n    self.logsValidationPath = opt.logsValidationPath or nil\n    self.modelTrainingPath = opt.modelTrainingPath or nil\n    self.permuteBatch = opt.permuteBatch or false\n\n    self:makeDirectories({ self.logsTrainPath, self.logsValidationPath, self.modelTrainingPath })\n\n    self.mapper = Mapper(opt.dictionaryPath)\n    self.tester = ModelEvaluator(self.gpu, self.validationSetLMDBPath, self.mapper,\n        opt.validationBatchSize, self.logsValidationPath)\n    self.loadModel = opt.loadModel\n    self.epochSave = opt.epochSave or false -- Saves model every number of iterations.\n    self.maxNorm = opt.maxNorm or 400 -- value chosen by Baidu for english speech.\n    -- setting model saving/loading\n    if self.loadModel then\n        assert(opt.loadPath, \"loadPath hasn't been given to load model.\")\n        self:loadNetwork(opt.loadPath, opt.modelName)\n    else\n        assert(opt.modelName, \"Must have given a model to train.\")\n        self:prepSpeechModel(opt.modelName, opt)\n    end\n    -- setting online loading\n    self.indexer = 
indexer(opt.trainingSetLMDBPath, opt.batchSize)\n    self.pool = threads.Threads(1, function() require 'Loader' end)\n\n    self.logger = optim.Logger(self.logsTrainPath .. 'train' .. suffix .. '.log')\n    self.logger:setNames { 'loss', 'WER', 'CER' }\n    self.logger:style { '-', '-', '-' }\nend\n\nfunction Network:prepSpeechModel(modelName, opt)\n    local model = require(modelName)\n    self.model = model[1](opt)\n    self.calSize = model[2]\nend\n\nfunction Network:testNetwork(epoch)\n    self.model:evaluate()\n    local wer, cer = self.tester:runEvaluation(self.model, true, epoch or 1) -- details in log\n    self.model:zeroGradParameters()\n    self.model:training()\n    return wer, cer\nend\n\nfunction Network:trainNetwork(epochs, optimizerParams)\n    self.model:training()\n\n    local lossHistory = {}\n    local validationHistory = {}\n    local criterion = nn.CTCCriterion(true)\n    local x, gradParameters = self.model:getParameters()\n\n    print(\"Number of parameters: \", gradParameters:size(1))\n\n    -- inputs (preallocate)\n    local inputs = torch.Tensor()\n    local sizes = torch.Tensor()\n    if self.gpu then\n        criterion = criterion:cuda()\n        inputs = inputs:cuda()\n        sizes = sizes:cuda()\n    end\n\n    -- def loading buf and loader\n    local loader = Loader(self.trainingSetLMDBPath, self.mapper)\n    local specBuf, labelBuf, sizesBuf\n\n    -- load first batch\n    local inds = self.indexer:nextIndices()\n    self.pool:addjob(function()\n        return loader:nextBatch(inds)\n    end,\n        function(spect, label, sizes)\n            specBuf = spect\n            labelBuf = label\n            sizesBuf = sizes\n        end)\n\n    -- define the feval\n    local function feval(x_new)\n        self.pool:synchronize() -- wait previous loading\n        local inputsCPU, sizes, targets = specBuf, sizesBuf, labelBuf -- move buf to training data\n        inds = self.indexer:nextIndices() -- load next batch whilst training\n        
self.pool:addjob(function()\n            return loader:nextBatch(inds)\n        end,\n            function(spect, label, sizes)\n                specBuf = spect\n                labelBuf = label\n                sizesBuf = sizes\n            end)\n\n        inputs:resize(inputsCPU:size()):copy(inputsCPU) -- transfer over to GPU\n        sizes = self.calSize(sizes)\n        local predictions = self.model:forward(inputs)\n        local loss = criterion:forward(predictions, targets, sizes)\n        if loss == math.huge or loss == -math.huge then loss = 0 print(\"Recieved an inf cost!\") end\n        self.model:zeroGradParameters()\n        local gradOutput = criterion:backward(predictions, targets)\n        self.model:backward(inputs, gradOutput)\n        local norm = gradParameters:norm()\n        if norm > self.maxNorm then\n            gradParameters:mul(self.maxNorm / norm)\n        end\n        return loss, gradParameters\n    end\n\n    -- training\n    local currentLoss\n    local startTime = os.time()\n\n    for i = 1, epochs do\n        local averageLoss = 0\n\n        for j = 1, self.indexer.nbOfBatches do\n            currentLoss = 0\n            local _, fs = optim.sgd(feval, x, optimizerParams)\n            if self.gpu then cutorch.synchronize() end\n            currentLoss = currentLoss + fs[1]\n            xlua.progress(j, self.indexer.nbOfBatches)\n            averageLoss = averageLoss + currentLoss\n        end\n\n        if self.permuteBatch then self.indexer:permuteBatchOrder() end\n\n        averageLoss = averageLoss / self.indexer.nbOfBatches -- Calculate the average loss at this epoch.\n\n        -- anneal learningRate\n        optimizerParams.learningRate = optimizerParams.learningRate / (optimizerParams.learningRateAnnealing or 1)\n\n        -- Update validation error rates\n        local wer, cer = self:testNetwork(i)\n\n        print(string.format(\"Training Epoch: %d Average Loss: %f Average Validation WER: %.2f Average Validation CER: 
%.2f\",\n            i, averageLoss, 100 * wer, 100 * cer))\n\n        table.insert(lossHistory, averageLoss) -- Add the average loss value to the logger.\n        table.insert(validationHistory, 100 * wer)\n        self.logger:add { averageLoss, 100 * wer, 100 * cer }\n\n        -- periodically save the model\n        if self.epochSave then\n            print(\"Saving model..\")\n            self:saveNetwork(self.modelTrainingPath .. 'model_epoch_' .. i .. suffix .. '_' .. self.fileName)\n        end\n    end\n\n    local endTime = os.time()\n    local secondsTaken = endTime - startTime\n    local minutesTaken = secondsTaken / 60\n    print(\"Minutes taken to train: \", minutesTaken)\n\n    print(\"Saving model..\")\n    self:saveNetwork(self.modelTrainingPath .. 'final_model_' .. suffix .. '_' .. self.fileName)\n\n    return lossHistory, validationHistory, minutesTaken\nend\n\nfunction Network:createLossGraph()\n    self.logger:plot()\nend\n\nfunction Network:saveNetwork(saveName)\n    self.model:clearState()\n    saveDataParallel(saveName, self.model)\nend\n\n--Loads the model into Network.\nfunction Network:loadNetwork(saveName, modelName)\n    self.model = loadDataParallel(saveName, self.nGPU)\n    local model = require(modelName)\n    self.calSize = model[2]\nend\n\nfunction Network:makeDirectories(folderPaths)\n    for index, folderPath in ipairs(folderPaths) do\n        if (folderPath ~= nil) then os.execute(\"mkdir -p \" .. folderPath) end\n    end\nend\n\nreturn Network\n"
  },
  {
    "path": "Predict.lua",
    "content": "require 'nn'\nrequire 'audio'\nrequire 'Mapper'\nrequire 'UtilsMultiGPU'\nlocal cmd = torch.CmdLine()\ncmd:option('-modelPath', 'deepspeech.t7', 'Path of model to load')\ncmd:option('-audioPath', '', 'Path to the input audio to predict on')\ncmd:option('-dictionaryPath', './dictionary', 'File containing the dictionary to use')\ncmd:option('-windowSize', 0.02, 'Window Size of audio')\ncmd:option('-stride', 0.01, 'Stride of audio')\ncmd:option('-sampleRate', 16000, 'Rate of audio (default 16khz)')\ncmd:option('-nGPU', 1)\n\nlocal opt = cmd:parse(arg)\n\nif opt.nGPU > 0 then\n    require 'cunn'\n    require 'cudnn'\n    require 'BatchBRNNReLU'\nend\n\nlocal model =  loadDataParallel(opt.modelPath, opt.nGPU)\nlocal mapper = Mapper(opt.dictionaryPath)\n\nlocal wave = audio.load(opt.audioPath)\nlocal spect = audio.spectrogram(wave, opt.windowSize * opt.sampleRate, 'hamming', opt.stride * opt.sampleRate):float() -- freq-by-frames tensor\n\n-- normalize the data\nlocal mean = spect:mean()\nlocal std = spect:std()\nspect:add(-mean)\nspect:div(std)\n\nspect = spect:view(1, 1, spect:size(1), spect:size(2))\n\nif opt.nGPU > 0 then\n    spect = spect:cuda()\n    model = model:cuda()\nend\n\nmodel:evaluate()\nlocal predictions = model:forward(spect)\nlocal tokens = mapper:decodeOutput(predictions[1])\nlocal text = mapper:tokensToText(tokens)\n\nprint(text)"
  },
  {
    "path": "README.md",
    "content": "# deepspeech.torch\n\n[![Build Status](https://travis-ci.org/SeanNaren/deepspeech.torch.svg?branch=master)](https://travis-ci.org/SeanNaren/deepspeech.torch)\n[![Documentation Status](https://readthedocs.org/projects/ctcspeechrecognition/badge/?version=latest)](http://ctcspeechrecognition.readthedocs.io/en/latest/?badge=latest)\n\n\nImplementation of [Baidu Warp-CTC](https://github.com/baidu-research/warp-ctc) using torch7.\nCreates a network based on the [DeepSpeech2](http://arxiv.org/pdf/1512.02595v1.pdf) architecture using the Torch7 library, trained with the CTC activation function.\n\n## Features\n* Train large models with large datasets via online loading using [LMDB](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database) and multi-GPU support.\n* Supports variable length batches via padding.\n* Implements the [AN4 Audio database](http://www.speech.cs.cmu.edu/databases/an4/) (50 mins of data).\nHas also been extended to train using the [LibriSpeech](http://www.openslr.org/12/) dataset (1000 hours of data). Custom dataset preparation is explained in documentation.\n\n## Branches\n\nThere are currently two branches, Master and Phoneme:\n* Master: This branch trains DeepSpeech2. Also included is an evaluation script which calculates the WER/CER, as well as a prediction script.\nThis branch is useful for understanding how the DeepSpeech and CTC works and is easy to run after installation. Highly recommended to checkout this branch.\n* Phonemes: This branch is experimental and uses phonemes rather than character based predictions. This is fully credited and extended by [CCorfield](https://github.com/CCorfield) and his awesome work in porting to use phonemes. 
In addition to this\nI'd like to also thank [Shane Walker](https://github.com/walkers-mv) for his awesome recent conversion to use phonemes as well.\n\n## Installation/Data Preparation/Documentation\n\nFollow Instructions/Data Preparation/Documentation found in the wiki [here](https://github.com/SeanNaren/deepspeech.torch/wiki/Installation) to set up and run the code.\n\nTechnical documentation can be found [here](http://ctcspeechrecognition.readthedocs.io/en/latest/).\n\n## Pre-trained Networks\n\nPre-trained networks are available for AN4 as well as LibriSpeech for CUDA only (since they use cudnn RNNs). Download Links and accuracies are below. DeepSpeech-light is a smaller model which is less intensive to train (based on LSTMs rather than RNNs).\n\n### AN4\n\n**an4Test**\n\n|Network                | WER       | CER       |Link       |\n|-----------------|:--------:|:--------:|:--------:|\n|DeepSpeech-light| N/A     | N/A | N/A |\n|DeepSpeech | 12    | 3.07 | [Download](https://github.com/SeanNaren/deepspeech.torch/releases/download/v1.0/an4_deepspeech.t7) |\n\n### LibriSpeech\n\n**Librispeech-test-clean**\n\n|Network                | WER       | CER       |Link       |\n|-----------------|:--------:|:--------:|:--------:|\n|DeepSpeech-light| 15     | 1.34 | [Download](https://github.com/SeanNaren/deepspeech.torch/releases/download/v1.0/libri_deepspeech-light.t7) |\n|DeepSpeech | 12    | 1.55 | [Download](https://github.com/SeanNaren/deepspeech.torch/releases/download/v1.0/libri_deepspeech.t7) |\n\n**Librispeech-test-other**\n\n|Network                | WER       | CER       |Link       |\n|-----------------|:--------:|:--------:|:--------:|\n|DeepSpeech-light| 36    | 3.80 | (Download Above) |\n|DeepSpeech | 33    | 3.24 | (Download Above) |\n\nOnce you're set up, you can start training from these nets by using the below parameters (you might need to change the other parameters described in the wiki) after setting the project up:\n\n```lua\nth Train.lua 
-loadModel -loadPath /path/to/model.t7\n```\n\n## Acknowledgements\n\nLots of people helped/contributed to this project that deserve recognition:\n* Soumith Chintala for his support on Torch7 and the vast open source projects he has contributed that made this project possible!\n* Charles Corfield for his work on the Phoneme Dataset and his overall contribution and aid throughout.\n* Will Frey for his thorough communication and aid in the development process.\n* Ding Ling, Yuan Yang and Yan Xia for their significant contribution to online training, multi-gpu support and many other important features.\n* Erich Elsen and the team from Baidu for their contribution of Warp-CTC that made this possible, and the encouraging words and support given throughout the project.\n* Maciej Korzepa for his huge help in training a model on Librispeech!\n"
  },
  {
    "path": "SequenceError.lua",
    "content": "local SequenceError = torch.class(\"SequenceError\")\n\n-- Calculates a sequence error rate (aka Levenshtein edit distance)\nfunction SequenceError:sequenceErrorRate(target, prediction)\n    local d = torch.Tensor(#target + 1, #prediction + 1):zero()\n    for i = 1, #target + 1 do\n        for j = 1, #prediction + 1 do\n            if (i == 1) then\n                d[1][j] = j - 1\n            elseif (j == 1) then\n                d[i][1] = i - 1\n            end\n        end\n    end\n\n    for i = 2, #target + 1 do\n        for j = 2, #prediction + 1 do\n            if (target[i - 1] == prediction[j - 1]) then\n                d[i][j] = d[i - 1][j - 1]\n            else\n                local substitution = d[i - 1][j - 1] + 1\n                local insertion = d[i][j - 1] + 1\n                local deletion = d[i - 1][j] + 1\n                d[i][j] = torch.min(torch.Tensor({ substitution, insertion, deletion }))\n            end\n        end\n    end\n    local errorRate = d[#target + 1][#prediction + 1] / #target\n    return errorRate\nend\n\nfunction SequenceError:calculateCER(targetTranscript, predictTranscript)\n    return self:sequenceErrorRate(targetTranscript, predictTranscript)\nend\n\nfunction SequenceError:calculateWER(targetTranscript, predictTranscript)\n    -- convert to words before calculation\n    local targetWords = {}\n    for word in targetTranscript:gmatch(\"%S+\") do table.insert(targetWords, word) end\n    local predictedWords = {}\n    for word in predictTranscript:gmatch(\"%S+\") do table.insert(predictedWords, word) end\n    return self:sequenceErrorRate(targetWords, predictedWords)\nend"
  },
  {
    "path": "Test.lua",
    "content": "local Network = require 'Network'\n\n-- Load the network from the saved model. Options can be overrided on command line run.\nlocal cmd = torch.CmdLine()\ncmd:option('-loadModel', true, 'Load previously saved model')\ncmd:option('-saveModel', false, 'Save model after training/testing')\ncmd:option('-loadPath', 'deepspeech.t7', 'Path of final model to save/load')\ncmd:option('-modelName', 'DeepSpeechModel', 'Name of class containing architecture')\ncmd:option('-nGPU', 1, 'Number of GPUs, set -1 to use CPU')\ncmd:option('-trainingSetLMDBPath', './prepare_datasets/an4_lmdb/train/', 'Path to LMDB training dataset')\ncmd:option('-validationSetLMDBPath', './prepare_datasets/an4_lmdb/test/', 'Path to LMDB test dataset')\ncmd:option('-logsTrainPath', './logs/TrainingLoss/', ' Path to save Training logs')\ncmd:option('-logsValidationPath', './logs/ValidationScores/', ' Path to save Validation logs')\ncmd:option('-dictionaryPath', './dictionary', ' File containing the dictionary to use')\ncmd:option('-batchSize', 20, 'Batch size in training')\ncmd:option('-validationBatchSize', 32, 'Batch size for validation')\n\nlocal opt = cmd:parse(arg)\n\nNetwork:init(opt)\n\nprint(\"Testing network...\")\nlocal wer, cer = Network:testNetwork()\nprint(string.format('Avg WER: %2.f  Avg CER: %.2f', 100 * wer, 100 * cer))\nprint(string.format('More information written to log file at %s', opt.logsValidationPath))"
  },
  {
    "path": "Train.lua",
    "content": "local Network = require 'Network'\n\n-- Options can be overrided on command line run.\nlocal cmd = torch.CmdLine()\ncmd:option('-loadModel', false, 'Load previously saved model')\ncmd:option('-loadPath', 'deepspeech.t7', 'Path to model to load')\ncmd:option('-modelName', 'DeepSpeechModel', 'Name of class containing architecture')\ncmd:option('-nGPU', 1, 'Number of GPUs, set -1 to use CPU')\ncmd:option('-trainingSetLMDBPath', './prepare_datasets/an4_lmdb/train/', 'Path to LMDB training dataset')\ncmd:option('-validationSetLMDBPath', './prepare_datasets/an4_lmdb/test/', 'Path to LMDB test dataset')\ncmd:option('-logsTrainPath', './logs/TrainingLoss/', ' Path to save Training logs')\ncmd:option('-logsValidationPath', './logs/ValidationScores/', ' Path to save Validation logs')\ncmd:option('-epochSave', false, 'save model every epoch')\ncmd:option('-modelTrainingPath', './models/', ' Path to save periodic training models')\ncmd:option('-saveFileName', 'deepspeech.t7', 'Name of model to save as')\ncmd:option('-dictionaryPath', './dictionary', ' File containing the dictionary to use')\ncmd:option('-epochs', 70, 'Number of epochs for training')\ncmd:option('-learningRate', 3e-4, ' Training learning rate')\ncmd:option('-learningRateAnnealing', 1.1, 'Factor to anneal lr every epoch')\ncmd:option('-maxNorm', 400, 'Max norm used to normalize gradients')\ncmd:option('-momentum', 0.90, 'Momentum for SGD')\ncmd:option('-batchSize', 20, 'Batch size in training')\ncmd:option('-permuteBatch', false, 'Set to true if you want to permute batches AFTER the first epoch')\ncmd:option('-validationBatchSize', 20, 'Batch size for validation')\ncmd:option('-LSTM', false, 'Use LSTMs rather than RNNs')\ncmd:option('-hiddenSize', 1760, 'RNN hidden sizes')\ncmd:option('-nbOfHiddenLayers', 7, 'Number of rnn layers')\n\nlocal opt = cmd:parse(arg)\n\n--Parameters for the stochastic gradient descent (using the optim library).\nlocal optimParams = {\n    learningRate = 
opt.learningRate,\n    learningRateAnnealing = opt.learningRateAnnealing,\n    momentum = opt.momentum,\n    dampening = 0,\n    nesterov = true\n}\n\n--Create and train the network based on the parameters and training data.\nNetwork:init(opt)\n\nNetwork:trainNetwork(opt.epochs, optimParams)\n\n--Creates the loss plot.\nNetwork:createLossGraph()"
  },
  {
    "path": "UtilsMultiGPU.lua",
    "content": "require 'rnn'\nrequire 'nngraph'\nfunction makeDataParallel(model, nGPU)\n    if nGPU > 0 then\n        cudnn.fastest = true\n        local function BatchNorm(module)\n            return torch.type(module):find('BatchNormalization')\n        end\n        model = cudnn.convert(model, cudnn, BatchNorm)\n        if nGPU > 1 then\n            gpus = torch.range(1, nGPU):totable()\n            dpt = nn.DataParallelTable(1):add(model, gpus):threads(function()\n                require 'nngraph'\n                require 'cudnn'\n                cudnn.fastest = true\n                require 'BatchBRNNReLU'\n            end)\n            dpt.gradInput = nil\n            model = dpt\n        end\n        model:cuda()\n    end\n    return model\nend\n\nlocal function cleanDPT(module, device)\n    -- This assumes this DPT was created by the function above: all the\n    -- module.modules are clones of the same network on different GPUs\n    -- hence we only need to keep one when saving the model to the disk.\n    local newDPT = nn.DataParallelTable(1)\n    cutorch.setDevice(device or 1)\n    newDPT:add(module:get(1), device or 1)\n    return newDPT\nend\n\nfunction saveDataParallel(modelPath, model)\n    if torch.type(model) == 'nn.DataParallelTable' then\n        torch.save(modelPath, cleanDPT(model))\n    elseif torch.type(model) == 'nn.Sequential' then\n        local temp_model = nn.Sequential()\n        for i, module in ipairs(model.modules) do\n            if torch.type(module) == 'nn.DataParallelTable' then\n                temp_model:add(cleanDPT(module))\n            else\n                temp_model:add(module)\n            end\n        end\n        torch.save(modelPath, temp_model)\n    elseif torch.type(model) == 'nn.gModule' then\n        torch.save(modelPath, model)\n    else\n        error('This saving function only works with Sequential or DataParallelTable modules.')\n    end\nend\n\nfunction loadDataParallel(modelPath, nGPU)\n    if nGPU > 1 
then\n        require 'cudnn'\n        require 'BatchBRNNReLU'\n    end\n    local model = torch.load(modelPath)\n    if torch.type(model) == 'nn.DataParallelTable' then\n        return makeDataParallel(model:get(1):float(), nGPU)\n    elseif torch.type(model) == 'nn.Sequential' then\n        for i, module in ipairs(model.modules) do\n            if torch.type(module) == 'nn.DataParallelTable' then\n                model.modules[i] = makeDataParallel(module:get(1):float(), nGPU)\n            end\n        end\n        return model\n    elseif torch.type(model) == 'nn.gModule' then\n        model = makeDataParallel(model, nGPU)\n        return model\n    else\n        error('The loaded model is not a Sequential or DataParallelTable module.')\n    end\nend"
  },
  {
    "path": "dictionary",
    "content": "$\na\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\nq\nr\ns\nt\nu\nv\nw\nx\ny\nz\n \n'"
  },
  {
    "path": "doc/DeepSpeechModel.md",
    "content": "# DeepSpeechModel\n\nDefines the deep speech 2 conv+rnn architecture.\n\n### deepSpeech(opt)\n\nDefines the torch architecture for Deep Speech 2 as a function that can be called. Returns the final model\n\n`opt` Defines the options we use including using GPUS, hidden size and number of layers for the RNNs.\n\n### calculateInputSizes(sizes)\n\nA function that calculates the sequence sizes after the convolutional layers. Used in the loss calculations in CTC, so the network isn't\npenalised for the padded sequences. Returns a same sized tensor.\n\n`sizes` Real size of each sentence in the training sample as a 1D tensor."
  },
  {
    "path": "doc/Loader.md",
    "content": "# Loader\n\nDefines the indexer class and the loader class, handling batching of the dataset to train the network.\n\n## Indexer\n\nHandles returning the next indices of the batch to load into memory, to train the network with.\n\n### indexer:__init(_dir, batchSize)\n\n`dirPath` Directory containing the LMDB data folders for spectrogram, labels and transcripts.\n\n`batchSize` The sizes of each batch to create.\n\n### indexer:nextIndices()\n\nRetrieves the next indices that need to be loaded by the loader from the LMDB dataset.\n\n### indexer:permuteBatchOrder()\n\nPermutes the batch order randomly. This is for the net to not train in sequence order every time.\n\n## Loader\n\nLoads batches of data from LMDB files used in training/testing.\n\n### Loader:__init(dirPath)\n\n`dirPath` Directory containing the LMDB data folders for spectrogram, labels and transcripts.\n\n### Loader:nextBatch(indices)\n\nReturns the next batch of the dataset based on the given indices.\n\n`indices` The indices of the test samples that need to be retrieved. This is handled by the Indexer class above."
  },
  {
    "path": "doc/Mapper.md",
    "content": "# Mapper\n\nDefines how numeric indices are mapped to tokens and vice versa.\n\n### Mapper:__init(dictPath)\n\nCreates mappings based on the given dictionary file. The AN4 dictionary file can be seen [here](https://github.com/SeanNaren/deepspeech.torch/blob/master/dictionary).\n\n### Mapper:encodeString(string)\n\nConverts string into a set of tokens to be used as a label in training.\n\n`string` string to be converted.\n\n### Mapper:decodeOutput(predictions)\n\nConverts predictions of the neural network into a sequence of tokens (characters) via a mapper.\n\n`predictions` is a tensor of sequence likelihood vectors given by the neural network.\n\n### Mapper:tokensToText(tokens)\n\nUsing the mapper converts the tokens into readable text.\n\n`tokens` A set of numeric tokens to convert into readable text.\n"
  },
  {
    "path": "doc/ModelEvaluator.md",
    "content": "# ModelEvaluator\n\nHandles calculation of word error rate using an LMDB dataset. For more information on the calculation, see [Evaluator](https://github.com/SeanNaren/CTCSpeechRecognition/doc/Evaluator.md).\n\n### ModelEvaluator:__init(isGPU, datasetPath, mapper, testBatchSize, logsPath)\n\n'isGPU' Whether to use the GPU (CUDA) or CPU.\n\n`datasetPath` the path to the LMDB test dataset to use in evaluation.\n\n`mapper` Maps predicted numeric values to characters, see [Mapper](https://github.com/SeanNaren/CTCSpeechRecognition/doc/Mapper.md) for more details.\n\n`testBatchSize` The size of the batches to pass the network.\n\n`logsPath` File path to put the details of evaluations into.\n\n\n### ModelEvaluator:runEvaluation(model, verbose, epoch)\n\nCalculates the word error rate and character error rate averaged over the test iterations. Uses the same threading as the training process does to load batches from the dataset.\n\n`model` The Torch model to evaluate.\n\n`verbose` If set to true, will store details of WER calculations into the log files.\n\n`epoch` Determines the epoch number that is written in the log files for this calculation."
  },
  {
    "path": "doc/Network.md",
    "content": "# Network\n\nHandles interactions with the neural network for training and testing. Configured by network parameters given in\nconstructor.\n\n### Network:init(networkParams)\n\nConstructor of the Network class. Below defines each parameter that can be taken as input.\n\n```lua\nlocal networkParams = {\n    loadModel = false, -- Set to true if loading a model into the Network class rather than training.\n    saveModel = true, -- Set to true if saving the model after training.\n    modelName = 'DeepSpeechModel', -- The name of the lua class containing the network architecture\n    nGPU = 1, -- Number of GPUs, set -1 to use CPU\n    trainingSetLMDBPath = './prepare_an4/train/', -- online loading path from the LMDB dataset for training.\n    validationSetLMDBPath = './prepare_an4/test/', -- online loading path from the LMDB dataset for testing.\n    logsTrainPath = './logs/TrainingLoss/', -- Where training logs will be stored.\n    logsValidationPath = './logs/ValidationScores/', -- Where testing score logs will be stored.\n    modelTrainingPath = './models/', -- Where models will be stored on saving.\n    modelPath = 'CTCNetwork.t7',\n    dictionaryPath = './dictionary', -- Contains the alphabet/characters that we are to predict on.\n    batchSize = 20, -- The sizes of batches that we are passing into the network in training.\n    validationBatchSize = 1, -- Validation batch sizes (should be kept at 1, since we pass 1 sample at a time).\n    validationIterations = 20, -- Number of validation iterations (kept small, because we only want to run a few tests per epoch).\n    saveModelInTraining = false, -- saves model periodically through training\n    saveModelIterations = 50 -- If saveModelInTraining set to true, we save every 50 epochs.\n}\n```\n\n### Network:prepSpeechModel(modelName, opt)\n\nUsed to create the model via the defined modelName and options.\n\n### Network:testNetwork(epoch)\n\nTests the current stored model via the word error 
rate.\n\n`epoch` can be used to detail the epoch number in the logs when testing scores are stored.\n\n### Network:trainNetwork(epochs, sgd_params)\n\nTrains a network stored in the `Network` class. Uses multiple threads in an online loading fashion to load the data from the disk.\n\n`epochs` defines the number of iterations of training that will occur across the entire dataset (each epochs trains on the entire dataset).\n\n`sgd_params` defines the SGD parameters for the optim library such as below.\n\n```lua\nlocal sgdParams = {\n    learningRate = 5e-4,\n    learningRateDecay = 1e-9,\n    weightDecay = 0,\n    momentum = 0.9,\n    dampening = 0,\n    nesterov = true\n}\n```\n\n### Network:createLossGraph()\n\nAfter training, when called will use gnuplot (through wrapper in the optim library) to generate a graph of the loss and word error rate over epochs.\n\n### Network:saveNetwork(saveName)\n\nWill save the model currently stored in the network class to disk, at the pre-defined save location with the given `saveName`.\n\n### Network:loadNetwork(saveName, modelName)\n\nLoads the network from the save location, stored using the pre-defined save name.\n\n`saveName` The name as to which the network was saved as\n\n`modelName` The name of the class that stores the model or architecture."
  },
  {
    "path": "doc/SequenceError.md",
    "content": "# SequenceError\n\nCalculates word error rates and handles conversion of CTC predictions to numeric tokens.\n\n### SequenceError.sequenceErrorRate(target, prediction)\n\nCalculates the error rates based on the target and the predicted inputs.\n\n`target` and `prediction` are inputs of strings or tables.\n\n### SequenceError:calculateCER(targetTranscript, predictTranscript)\n\n`targetTranscript` and `predictTranscript` are two strings, returns the Character Error Rate.\n\n### SequenceError:calculateWER(targetTranscript, predictTranscript)\n\n`targetTranscript` and `predictTranscript` are two strings, returns the Word Error Rate."
  },
  {
    "path": "doc/UtilsMultiGPU.md",
    "content": "# UtilsMultiGPU\n\nHandles multi-gpu setups of the architecture.\n\n### makeDataParallel(model, nGPU)\n\nConverts the model into a multi-gpu set up if necessary using DataParallelTable.\n\n`model` The Torch network model to modify for configured GPUs.\n\n`nGPU` Number of GPUs.\n\n### saveDataParallel(modelPath, model)\n\nSaves the model to disk.\n\n`modelPath` Location to save the model.\n\n`model` The Torch network model to save.\n\n### loadDataParallel(modelPath, nGPU)\n\nLoads a model saved using the above methods.\n\n`modelPath` Location to load the model.\n\n`nGPU` Number of GPUs to load to."
  },
  {
    "path": "doc/index.md",
    "content": "# Technical Documentation\n\nBelow are a few classes that have been documented, explaining their purpose and functions available.\n\n## Classes\n\n  * [Network](Network.md)\n  * [DeepSpeechModel](DeepSpeechModel.md)\n  * [Mapper](Mapper.md)\n  * [Evaluator](Evaluator.md)\n  * [ModelEvaluator](ModelEvaluator.md)\n  * [Utils](Utils.md)\n  * [UtilsMultiGPU](UtilsMultiGPU.md)\n  * [Loader](Loader.md)\n"
  },
  {
    "path": "mkdocs.yml",
    "content": "site_name: CTCSpeechRecognition\ntheme : simplex\nrepo_url : https://github.com/SeanNaren/CTCSpeechRecognition\nuse_directory_urls : false\nmarkdown_extensions: [extra]\ndocs_dir : doc\npages:\n- [index.md, Home]\n- [Network.md, Network]\n- [DeepSpeechModel.md, DeepSpeechModel]\n- [Mapper.md, Mapper]\n- [SequenceError.md, SequenceError]\n- [ModelEvaluator.md, ModelEvaluator]\n- [UtilsMultiGPU.md, UtilsMultiGPU]\n- [Loader.md, Loader]"
  },
  {
    "path": "prepare_datasets/FormatAN4.lua",
    "content": "require 'torch'\nlocal cmd = torch.CmdLine()\ncmd:option('-rootPath', 'an4', 'Path to the an4 root')\ncmd:option('-newPath', 'an4_dataset', 'Path to the new data path')\ncmd:option('-audioExtension', 'sph', 'The extension of the audio files (wav/mp3/sph/etc)')\ncmd:option('-move', false, 'Moves the files over rather than copies, used to save space')\n\nlocal opt = cmd:parse(arg)\n\nlocal an4TestPath = opt.rootPath .. '/etc/an4_test.'\nlocal an4TrainPath = opt.rootPath .. '/etc/an4_train.'\nlocal an4AudioPath = opt.rootPath .. '/wav'\n\n-- strips down the transcripts into pure text\nlocal function processText(line)\n    local text = line:gsub('<s>', ''):gsub('</s>', ''):gsub('^%s', ''):gsub('%(.*%)', ''):gsub('%s*$', '')\n    return text\nend\n\nlocal function createDataset(pathToAN4, an4AudioPath, newPath)\n    sys.execute(\"mkdir \" .. newPath)\n    local fileids = pathToAN4 .. 'fileids'\n    local transcripts = pathToAN4 .. 'transcription'\n    local filePaths = {}\n    for filePath in io.lines(fileids) do\n        table.insert(filePaths, filePath)\n    end\n    local counter = 1\n    for line in io.lines(transcripts) do\n        local text = processText(line)\n        local filePath = filePaths[counter]\n        -- new filename extracted from an4 file id\n        local fileName = sys.split(filePath, '/')[3] -- last part is the filename\n        -- create new text file with clean transcript\n        local textPath = newPath .. '/' .. fileName .. '.txt'\n        local file = io.open(textPath, \"w\")\n        file:write(text)\n        file:close()\n        -- move audio to correct place\n        local audioPath = an4AudioPath .. '/' .. filePath .. '.' .. opt.audioExtension\n        local newPath = newPath .. '/' .. fileName .. '.' .. opt.audioExtension\n        local command\n        if opt.move then command = \"mv \" else command = \"cp \" end\n        sys.execute(command .. audioPath .. ' ' .. 
newPath)\n        counter = counter + 1\n    end\nend\n\nsys.execute(\"mkdir \" .. opt.newPath)\ncreateDataset(an4TrainPath, an4AudioPath, opt.newPath .. '/train/')\ncreateDataset(an4TestPath, an4AudioPath, opt.newPath .. '/test/')\n"
  },
  {
    "path": "prepare_datasets/FormatLibriSpeech.lua",
    "content": "require 'torch'\nlocal threads = require 'threads'\n\nlocal cmd = torch.CmdLine()\ncmd:option('-rootPath', 'LibriSpeech', 'Path to the librispeech root')\ncmd:option('-newPath', 'libri_dataset', 'Path to the new data path')\ncmd:option('-audioExtension', 'flac', 'The extension of the audio files (wav/mp3/sph/etc)')\ncmd:option('-move', false, 'Moves the files over rather than copies, used to save space')\ncmd:option('-threads', 8, 'Number of threads to use')\n\nlocal opt = cmd:parse(arg)\nlocal extension = '.' .. opt.audioExtension\n\nlocal libriTestPath = opt.rootPath .. '/test/'\nlocal libriTrainPath = opt.rootPath .. '/train/'\nlocal threads = threads.Threads(opt.threads, function(idx) require 'torch' require 'sys' end)\n\n-- strips down the transcripts into pure text\nlocal function processText(line)\n    local text = line:gsub('[^a-zA-Z ]', '')\n    return text\nend\n\nlocal function createDataset(libriPath, newDirPath)\n    sys.execute(\"mkdir \" .. newDirPath)\n    local size = tonumber(sys.execute(\"find \" .. libriPath .. \" -type f -name '*'\" .. extension .. \" | wc -l \"))\n\n    local counter = 1\n\n    local function formatData(line, dir)\n        local text = processText(line)\n        local id = line:match(\"([^ ]*) \") -- first part of transcript, used for audio file path and ID\n        local audioFolders = sys.split(id, '-')\n\n        local textPath = newDirPath .. '/' .. id .. '.txt'\n        local file = io.open(textPath, \"w\")\n        file:write(text)\n        file:close()\n        -- move audio to correct place\n        local audioPath = dir .. '/' .. audioFolders[1] .. '/' .. audioFolders[2] .. '/' .. id .. extension\n        local newPath = newDirPath .. '/' .. id .. extension\n        local command\n        if opt.move then command = \"mv \" else command = \"cp \" end\n        sys.execute(command .. audioPath .. ' ' .. newPath)\n    end\n\n    local counter = 0\n\n    local p = io.popen('find \"' .. libriPath .. 
'\" -maxdepth 1 -mindepth 1 -type d')\n    for dir in p:lines() do\n        local transcripts = io.popen(\"find -L \" .. dir .. \" -type f -name '*.txt'\")\n        for transcript in transcripts:lines() do\n            for line in io.lines(transcript) do\n                threads:addjob(function()\n                    formatData(line, dir)\n                end,\n                    function()\n                        counter = counter + 1\n                        xlua.progress(counter, size)\n                    end)\n            end\n        end\n    end\nend\n\nsys.execute(\"mkdir \" .. opt.newPath)\ncreateDataset(libriTrainPath, opt.newPath .. '/train/')\ncreateDataset(libriTestPath, opt.newPath .. '/test/')\n"
  },
  {
    "path": "tests/test.lua",
    "content": "require 'nn'\n\nlocal test = torch.TestSuite()\nlocal mytester\nrequire '../SequenceError'\nrequire '../Mapper'\n\nlocal sequenceError = SequenceError()\n\nfunction test.evaluator()\n    -- Calculates WER, (nbOfInsertions + nbOfDeletions + nbOfSubstitutions) / nbOfWords\n    local target = \"test a sentence\"\n\n    local prediction = \"a sentence\"\n    local deletion = sequenceError:calculateWER(target, prediction)\n    local prediction = \"test a sentence inserted\"\n    local insertion = sequenceError:calculateWER(target, prediction)\n    local prediction = \"test substituted sentence\"\n    local substitution = sequenceError:calculateWER(target, prediction)\n    local oneMistakeWER = 1 / 3 -- One insertion/deletion/substitution / number of words\n    mytester:eq(deletion, oneMistakeWER, 'WER with deletion was incorrect')\n    mytester:eq(insertion, oneMistakeWER, 'WER with insertion was incorrect')\n    mytester:eq(substitution, oneMistakeWER, 'WER with substitution was incorrect')\n\n    local prediction = \"a\"\n    local deletion = sequenceError:calculateWER(target, prediction)\n    local prediction = \"a wrong\"\n    local deletionAndSubstitution = sequenceError:calculateWER(target, prediction)\n    local prediction = \"wrong a sentence inserted\"\n    local substitionAndInsertion = sequenceError:calculateWER(target, prediction)\n    local twoMistakeWER = 2 / 3 -- Two errors of insertion/deletion/substitution / number of words\n    mytester:eq(deletion, twoMistakeWER, 'masking of outputs was incorrect')\n    mytester:eq(deletionAndSubstitution, twoMistakeWER, 'WER with substitution and deletion was incorrect')\n    mytester:eq(substitionAndInsertion, twoMistakeWER, 'WER with substitution and insertion was incorrect')\nend\n\nfunction test.mapper()\n    local dir_path = 'test_dictionary'\n    local mapper = Mapper(dir_path)\n    local alphabet = {\n        '$', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 
'q', 'r',\n        's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' ', '\\''\n    }\n    local expectedMapping = {}\n    for index, letter in ipairs(alphabet) do\n        expectedMapping[letter] = index - 1\n    end\n    mytester:eq(mapper.alphabet2token, expectedMapping)\nend\n\nfunction test.mapperDecode()\n    local dir_path = 'test_dictionary'\n    local mapper = Mapper(dir_path)\n    local predictions = torch.Tensor({ { 1, 2, 3 }, { 2, 3, 1 }, { 1, 2, 3 } })\n    local tokens = mapper:decodeOutput(predictions)\n    local text = mapper:tokensToText(tokens)\n    mytester:eq(tokens, { 2, 1, 2 })\n    mytester:eq(text, 'bab')\nend\n\nmytester = torch.Tester()\nmytester:add(test)\nmytester:run()\n"
  },
  {
    "path": "tests/test_dictionary",
    "content": "$\na\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\nq\nr\ns\nt\nu\nv\nw\nx\ny\nz\n \n'"
  }
]