[
  {
    "path": ".gitattributes",
    "content": "# Auto detect text files and perform LF normalization\n* text=auto\n\n# Custom for Visual Studio\n*.cs     diff=csharp\n\n# Standard to msysgit\n*.doc\t diff=astextplain\n*.DOC\t diff=astextplain\n*.docx diff=astextplain\n*.DOCX diff=astextplain\n*.dot  diff=astextplain\n*.DOT  diff=astextplain\n*.pdf  diff=astextplain\n*.PDF\t diff=astextplain\n*.rtf\t diff=astextplain\n*.RTF\t diff=astextplain\n"
  },
  {
    "path": "DetectMalware_CNN.lua",
    "content": "require 'nn'\nrequire 'optim'\nrequire 'nngraph'\n\nrequire 'readMalwareData'\nrequire 'splitMalwareData'\nrequire 'buildNetwork'\nrequire 'trainModel'\n\nlocal cmd = torch.CmdLine()\ncmd:option('-seed',1,'seed the random number generator')\ncmd:option('-nEmbeddingDims',8,'number of dims in lookupTable for projecting instructions to network')\ncmd:option('-nConvFilters',64,'number of convolutional filters')\ncmd:option('-kernelLength',8,'seed the random number generator')\ncmd:option('-useHiddenLayer',true,'use hidden layer between the conv layers and classifier')\ncmd:option('-nHiddenNodes',16,'seed the random number generator')\ncmd:option('-weightClasses',false,'seed the random number generator')\ncmd:option('-nSamplingEpochs',10,'how often to sample the validation set - slow')\ncmd:option('-useDropout',false,'use dropout between the conv and hidden layers')\ncmd:option('-dropoutFrac',0.5,'dropout strength')\ncmd:option('-randomize',false,'randomly select the network parameters')\ncmd:option('-numDAShuffles',1,'number of function order shuffled versions of each program to keep')\ncmd:option('-useOneHot',false,'Represent programs using one-hot / otherwise use look-up-table')\ncmd:option('-learningRate',1e-3,'learning rate')\ncmd:option('-nEpochs',20,'training epochs')\ncmd:option('-nConvLayers',1,'number of extra convolutional layers')\ncmd:option('-nFCLayers',1,'number of extra convolutional layers')\ncmd:option('-batchSize',1,'size of batch used in training')\ncmd:option('-usemom',false,'use momentum during SGD optimisation')\ncmd:option('-useRMSProp',false,'use alternative optimizer rather than SGD')\ncmd:option('-useCUDA',false,'use CUDA optimisation')\ncmd:option('-gpuid',1,'which GPU to use')\ncmd:option('-usePreTrainedEmbedding',false,'initialise network with pre-trained embedding')\ncmd:option('-fixEmbedding',false,'prevent the embedding from being updated during learning')\n\ncmd:option('-programLen',8,'how many instructions to read')\n\ncmd:option('-debug',false,'enter debug mode')\n\ncmd:option('-dataAugProb',0.1,'probability of changing an instruction during data augmentation')\ncmd:option('-dataAugMethod',1,'1 - substitue the semantically most similar instruction, 2 - substitue random instruction')\n\ncmd:option('-trainingSetSize',2,'restrict the size of the training-set for evaluation purposes')\ncmd:option('-markFunctionEnds',false,'place a marker at the end of each method which may help classification work better')\n\ncmd:option('-saveModel',false,'save the model and data split')\ncmd:option('-saveFileName','detect_malware_cnn','filename to save the network')\n\ncmd:option('-decayLearningRate',false,'reduce learning rate by factor of 10 every so often')\ncmd:option('-weightDecay',0,'weight decay for L2 regularisation')\ncmd:option('-weightDecayFrac',0.1,'amount to reduce learning rate by, 0.1 or 0.5 are good values')\n\n-- try using dropout in various places of the network\ncmd:option('-useSpatialDropout',false,'drop instructions after the embedding layer')\ncmd:option('-useDropoutAfterEmbedding',false,'drop instructions after the embedding layer')\ncmd:option('-useDropoutAfterConv',false,'drop instructions after the embedding layer')\n\ncmd:option('-dataDir','./malwareDataset/','directory with the android programs to classify')\ncmd:option('-metaDataFile','./config/metaData.th7','file containing indicies of test/train/val split')\ncmd:option('-setupMode',false,'Only run in this mode once. Splits the data into the train/test sets. 
Saved into ./config/metaData.th7')\n\ncmd:option('-maxSequenceLength',1000000,'if program is longer than this length, crop sequence before passing to GPU')\n\ncmd:option('-dataAugTesting',false,'Use data augmentation during testing i.e. average score over random samples from program')\n\nopt = cmd:parse(arg)\n\nif opt.useCUDA then\n\trequire 'cunn'\n\trequire 'cutorch'\nend\n\ntorch.setdefaulttensortype(\"torch.DoubleTensor\")\ntorch.manualSeed(opt.seed)\nif opt.useCUDA then \n\tcutorch.setDevice(opt.gpuid)\n\tcutorch.manualSeedAll(opt.seed)\nend\n\nif opt.dataAugTesting then\n\trequire 'testModel_dataAug'\nelse\n\trequire 'testModel'\nend\n\nprint(opt)\n\nfunction isnan(z)\n\treturn z ~= z\nend\n\n\nif opt.setupMode then\n\n\t-- READ-ME\n\t-- Given a new dataset we need to split into training / testing sets.\n\t-- We only run this chunk once to generate the new train / test split and save it to disk\n\t-- Later, when training the network, the training-set is randomly split into train / validation for a given run\n\t-- This allows us to perform cross-validation on the training-set. After we have finished\n\t-- doing all development we can test a pre-trained network on the testing-set.\n\t------------------------------------------------------------------------------------------\n\t------------------------------------------------------------------------------------------\n\n\t-- read the data from the root dir\n\t-- decide which files should be included in the dataset\n\tprint('reading dataset')\n\tlocal datasetInfo = readMalwareData_setup(opt.dataDir)\n\n\tprint('splitting dataset into train/test sets')\n\tlocal trainPercentage = 0.9 -- use 90% for training and validation sets, and 10% for held-out testing-set\n\tlocal trainInds,testInds,posNegRatio = splitMalwareDataTrainTest(datasetInfo.label,trainPercentage,1 - trainPercentage)\n\tlocal metaData = {\n\t\ttrainInds = trainInds,\n\t\ttestInds = testInds,\n\t\tposNegRatio = posNegRatio,\n\t\ttrainPercentage = trainPercentage,\n\t\t--\n\t\tfilesList = datasetInfo.filesList,\n\t\tfamily = datasetInfo.family,\n\t\tlabel = datasetInfo.label,\n\t\tbenignFamily = datasetInfo.benignFamily,\n\t\tfamilyName = datasetInfo.familyName,\n\t}\n\tprint('saving dataset metadata to file ',opt.metaDataFile)\n\ttorch.save(opt.metaDataFile,metaData)\n\n\t-- ------------------------------------------------------------------------------------------\n\t-- ------------------------------------------------------------------------------------------\n\nelse\n\t-- train the network and save version with lowest validation error to disk\n\n\tprint(opt.metaDataFile)\n\tlocal metaData = torch.load(opt.metaDataFile)\n\n\tprint('reading data from disk')\n\tlocal allData = readMalwareData(opt.dataDir,metaData)\n\tprint('reading data from disk - complete')\n\tprint('program lens ',torch.min(allData.programLengths),torch.max(allData.programLengths),torch.mean(allData.programLengths))\n\n\t-- take the saved split of train/test and further split the train-set into train/val\n\tprint('splitting data into train/val/test sets')\t\n\tlocal testPercentage = (1 - metaData.trainPercentage)\n\tlocal valPercentage = (1 - metaData.trainPercentage)\n\tlocal trainPercentage = 1 - (testPercentage + valPercentage)\n\n\tprint('t,v,t')\n\tprint(testPercentage,valPercentage,trainPercentage)\n\n\tlocal trainInds,valInds,testInds,posNegRatio = splitMalwareDataTrainValTest(allData.label,metaData,trainPercentage)\n\tlocal dataSplit = {\n\t\t\ttrainInds = trainInds,\n\t\t\tvalInds = 
valInds,\n\t\t\ttestInds = testInds,\n\t\t\tposNegRatio = posNegRatio,\n\t\t}\n\n\tprint('new network')\n\tlocal model,criterion = buildNetwork(metaData.posNegRatio)\n\n\tprint('starting training')\n\tlocal trainedModel = trainModel(model,criterion,allData,dataSplit.trainInds,dataSplit.valInds,dataSplit,metaData)\nend"
  },
  {
    "path": "buildNetwork.lua",
    "content": "function buildNetwork(posNegRatio)\n\n\tlocal nIndex = 256\n\tlocal nOutputSamples = opt.nConvFilters    -- number of conv-filters\n\tlocal kernelStride = 1      -- stride of kernel\n\tlocal nClasses = 2\n\tlocal nHidden = opt.nHiddenNodes\n\n\tlocal model = nn.Sequential()\n\n\t-- project from one-hot to low-dim embedding space\n\tif opt.constrainEmbeddingNorm then\t\n\t\tmodel:add(nn.LookupTable(nIndex,opt.nEmbeddingDims,0,1,2))\n\telse\n\t\tmodel:add(nn.LookupTable(nIndex,opt.nEmbeddingDims))\n\tend\n\n\t-- we can add this here to prevent the network from updating the projection layer\n\t-- maybe the projection does not matter much?\n\t-- model:add(nn.GradBlocker())\n\n\t-- 1st conv layer\n\t--model:add(nn.Reshape(1,opt.programLen,opt.nEmbeddingDims,true))\n\tmodel:add(nn.Reshape(1,-1,opt.nEmbeddingDims,true))\n\n\tif opt.useSpatialDropout then\n\t\t-- should be batchx1xproglenxembeddingdim\n\t\tmodel:add(nn.Reshape(opt.programLen,opt.nEmbeddingDims,1,true))\n\t\tmodel:add(nn.SpatialDropout(opt.dropoutFrac))\n\t\tmodel:add(nn.Reshape(1,opt.programLen,opt.nEmbeddingDims,true))\n\tend\n\n\t--model:add(nn.SpatialZeroPadding(0,0,opt.kernelLength,opt.kernelLength))\n\n\tif opt.useDropoutAfterEmbedding then\n\t\tmodel:add(nn.Dropout(opt.dropoutFrac))\n\tend\n\n\tmodel:add(nn.SpatialConvolutionMM(1,opt.nConvFilters,opt.nEmbeddingDims,opt.kernelLength,kernelStride))\n\tmodel:add(nn.ReLU())\n\n\t-- if opt.nConvLayers > 1 then\n\t-- \tfor layernum = 1,(opt.nConvLayers-1) do\n\n\t-- \t\tmodel:add(nn.Reshape(opt.nConvFilters,-1,true))\t\n\t-- \t\tmodel:add(nn.Transpose({2,3}))\n\t-- \t\t--model:add(nn.TemporalMaxPooling(opt.kernelLength/2,opt.kernelLength/2))\n\t-- \t\tmodel:add(nn.TemporalMaxPooling(2,2))\n\t-- \t\tmodel:add(nn.Reshape(1,-1,opt.nConvFilters,true))\n\n\t-- \t\tmodel:add(nn.SpatialZeroPadding(0,0,opt.kernelLength,opt.kernelLength))\n\t-- \t\tmodel:add(nn.SpatialConvolutionMM(1,opt.nConvFilters,opt.nConvFilters,opt.kernelLength,kernelStride))\n\n\t-- \t\tmodel:add(nn.ReLU())\t\t\t\n\n\t-- \tend\n\t-- end\n\n\n\tmodel:add(nn.Reshape(opt.nConvFilters,-1,true))\t\n\n\tif opt.useDropoutAfterConv then\n\t\tmodel:add(nn.Dropout(opt.dropoutFrac))\n\tend\n\n\tmodel:add(nn.Max(3)) -- produces a vector of fixed size\n\t\n\tif opt.useHiddenLayer then\n\t\tmodel:add(nn.Linear(nOutputSamples,nHidden))\n\t\tmodel:add(nn.ReLU())\n\t\tmodel:add(nn.Linear(nHidden,nClasses))\n\telse\n\t\tmodel:add(nn.Linear(nOutputSamples,nClasses))\n\tend\n\tmodel:add(nn.LogSoftMax())\n\t\n\tlocal criterion = 0\n\tif opt.weightClasses then\n\n\t\tlocal weights = torch.zeros(nClasses)\n\t\tif posNegRatio < 0.5 then\n\t\t\tweights[1] = 1 - posNegRatio\n\t\t\tweights[2] = posNegRatio\n\t\telse\n\t\t\tweights[2] = 1 - posNegRatio\n\t\t\tweights[1] = posNegRatio\n\t\tend\n\t\tcriterion = nn.ClassNLLCriterion(weights)\n\telse\n\t\tcriterion = nn.ClassNLLCriterion()\n\tend\n\n\tif opt.useCUDA then\n\t\tmodel:cuda()\n\t\tcriterion:cuda()\n\tend\n\n\tprint(model)\n\n\treturn model,criterion\nend"
  },
  {
    "path": "dataset/Benign/example.opseq",
    "content": "5b700e\n700e\n1f6e0c"
  },
  {
    "path": "dataset/Malware/example.opseq",
    "content": "5b700e\n700e\n1f6e0c"
  },
  {
    "path": "opcodeseq_creator/DalvikOpcodes.txt",
    "content": "nop 00\nmove 01\nmove/from16 02\nmove/16 03\nmove-wide 04\nmove-wide/from16 05\nmove-wide/16 06\nmove-object 07\nmove-object/from16 08\nmove-object/16 09\nmove-result 0a\nmove-result-wide 0b\nmove-result-object 0c\nmove-exception 0d\nreturn-void 0e\nreturn 0f\nreturn-wide 10\nreturn-object 11\nconst/4 12\nconst/16 13\nconst 14\nconst/high16 15\nconst-wide/16 16\nconst-wide/32 17\nconst-wide 18\nconst-wide/high16 19\nconst-string 1a\nconst-string/jumbo 1b\nconst-class 1c\nmonitor-enter 1d\nmonitor-exit 1e\ncheck-cast 1f\ninstance-of 20\narray-length 21\nnew-instance 22\nnew-array 23\nfilled-new-array 24\nfilled-new-array/range 25\nfill-array-data 26\nthrow 27\ngoto 28\ngoto/16 29\ngoto/32 2a\npacked-switch 2b\nsparse-switch 2c\ncmpl-float 2d\ncmpg-float 2e\ncmpl-double 2f\ncmpg-double 30\ncmp-long 31\nif-eq 32\nif-ne 33\nif-lt 34\nif-ge 35\nif-gt 36\nif-le 37\nif-eqz 38\nif-nez 39\nif-ltz 3a\nif-gez 3b\nif-gtz 3c\nif-lez 3d\naget 44\naget-wide 45\naget-object 46\naget-boolean 47\naget-byte 48\naget-char 49\naget-short 4a\naput 4b\naput-wide 4c\naput-object 4d\naput-boolean 4e\naput-byte 4f\naput-char 50\naput-short 51\niget 52\niget-wide 53\niget-object 54\niget-boolean 55\niget-byte 56\niget-char 57\niget-short 58\niput 59\niput-wide 5a\niput-object 5b\niput-boolean 5c\niput-byte 5d\niput-char 5e\niput-short 5f\nsget 60\nsget-wide 61\nsget-object 62\nsget-boolean 63\nsget-byte 64\nsget-char 65\nsget-short 66\nsput 67\nsput-wide 68\nsput-object 69\nsput-boolean 6a\nsput-byte 6b\nsput-char 6c\nsput-short 6d\ninvoke-virtual 6e\ninvoke-super 6f\ninvoke-direct 70\ninvoke-static 71\ninvoke-interface 72\ninvoke-virtual/range 74\ninvoke-super/range 75\ninvoke-direct/range 76\ninvoke-static/range 77\ninvoke-interface/range 78\nneg-int 7b\nnot-int 7c\nneg-long 7d\nnot-long 7e\nneg-float 7f\nneg-double 80\nint-to-long 81\nint-to-float 82\nint-to-double 83\nlong-to-int 84\nlong-to-float 85\nlong-to-double 86\nfloat-to-int 87\nfloat-to-long 88\nfloat-to-double 89\ndouble-to-int 8a\ndouble-to-long 8b\ndouble-to-float 8c\nint-to-byte 8d\nint-to-char 8e\nint-to-short 8f\nadd-int 90\nsub-int 91\nmul-int 92\ndiv-int 93\nrem-int 94\nand-int 95\nor-int 96\nxor-int 97\nshl-int 98\nshr-int 99\nushr-int 9a\nadd-long 9b\nsub-long 9c\nmul-long 9d\ndiv-long 9e\nrem-long 9f\nand-long a0\nor-long a1\nxor-long a2\nshl-long a3\nshr-long a4\nushr-long a5\nadd-float a6\nsub-float a7\nmul-float a8\ndiv-float a9\nrem-float aa\nadd-double ab\nsub-double ac\nmul-double ad\ndiv-double ae\nrem-double af\nadd-int/2addr b0\nsub-int/2addr b1\nmul-int/2addr b2\ndiv-int/2addr b3\nrem-int/2addr b4\nand-int/2addr b5\nor-int/2addr b6\nxor-int/2addr b7\nshl-int/2addr b8\nshr-int/2addr b9\nushr-int/2addr ba\nadd-long/2addr bb\nsub-long/2addr bc\nmul-long/2addr bd\ndiv-long/2addr be\nrem-long/2addr bf\nand-long/2addr c0\nor-long/2addr c1\nxor-long/2addr c2\nshl-long/2addr c3\nshr-long/2addr c4\nushr-long/2addr c5\nadd-float/2addr c6\nsub-float/2addr c7\nmul-float/2addr c8\ndiv-float/2addr c9\nrem-float/2addr ca\nadd-double/2addr cb\nsub-double/2addr cc\nmul-double/2addr cd\ndiv-double/2addr ce\nrem-double/2addr cf\nadd-int/lit16 d0\nrsub-int d1\nmul-int/lit16 d2\ndiv-int/lit16 d3\nrem-int/lit16 d4\nand-int/lit16 d5\nor-int/lit16 d6\nxor-int/lit16 d7\nadd-int/lit8 d8\nrsub-int/lit8 d9\nmul-int/lit8 da\ndiv-int/lit8 db\nrem-int/lit8 dc\nand-int/lit8 dd\nor-int/lit8 de\nxor-int/lit8 df\nshl-int/lit8 e0\nshr-int/lit8 e1\nushr-int/lit8 e2\n"
  },
  {
    "path": "opcodeseq_creator/README.txt",
    "content": "\nThe zip file contains:\n\n\t1- A csv file containing Davlik opcodes\n\n\t2- Sample directory structure containing\n\t\t-apk folder with one sample apk\n\t\t-tmp folder to hold the decoded apps\n\t\t-opseq folder to store the opcode sequece files\n\n\t5- a python file run_opcode_seq_creation.py which takes the following arguments:\n\t\t\n\t\tPython script arguments:\n\n\t\t\t<apk file directory>   \n\t\t\t1. Pathname to the directory containing apk file \n\n\t\t\t<temp directory>\n\t\t\t2. Pathname of a temporary folder to keep the decoded files during the analysis \n\n\t\t\t<opseq directory>\n\t\t\t3. Pathname to an arbitrary directory to store the opcode sequence files\n\n\t\t\t<include support libraries>\n\t\t\t4. (optional) \"incl\" (without quotes) to include android support library files\n           \t   \t   Note: default behavior is NOT to include those libraries\n\nSteps to run the script:\n\t\n\n\t1) Apktool installation:\n\n\t \t-Make sure you have java install by running \"java --version\"\n\t\t you can install jre by running \"apt-get install default-jre\"\n\n \t \t-Follow the installation below to install apktool on Linux\n  \t\t https://ibotpeaches.github.io/Apktool/install/\n  \t\t (folowing the instructions will place apktool files in /usr/local/bin)\n\t\t Note: Make sure that they are executable\n\n\n\t2) Extract the zip file to a folder (extracted_folder) and run the following command:\n\n \t\textracted_folder$ ./run_opcode_seq_creation.py ./apk ./tmp ./opseq incl \n\n\t\n\n"
  },
  {
    "path": "opcodeseq_creator/run_opcode_seq_creation.py",
    "content": "#!/usr/bin/env python\nimport sys\nimport os\nimport shutil\nimport datetime\nimport logging\n\nsys.path.insert(1, os.path.join(sys.path[0], '../..'))\n\ndef main():\n\n    if len(sys.argv) < 4:\n\n        print \"Usage\", sys.argv[0], \"<apk file directory> <temp directory> <opseq directory> <include support libraries>\"\n        return\n\n    # Reads the location of apk files that need decoding\n    apk_file_directory = sys.argv[1]\n    print \"Reading apks from\", apk_file_directory\n\n    # Temporary folder to store the decoded app\n    tmp_file_directory = sys.argv[2]\n    print \"Decoding folder\", tmp_file_directory\n\n    # Reads the location that we want to store our opseq files in\n    opseq_file_directory = sys.argv[3]\n    print \"opseq folder\", opseq_file_directory\n\n    # Default is not to include smali files in android support libraries unless 4th parameter is provided\n    include_libs = False\n    if len(sys.argv) == 5:\n        include_libs = ((sys.argv[4]) == \"incl\")\n        print \"Include Android support library smali files\", include_libs\n\n    print \"Keep Android support libaray files: \"+ str(include_libs)\n\n    # Created a log file in the temp directory\n    logging.basicConfig(filename=tmp_file_directory+'/opseq.log', level=logging.DEBUG)\n\n    apks = []\n\n    for name in os.listdir(apk_file_directory):\n        if os.path.isfile(os.path.join(apk_file_directory, name)):\n            apks.append(name)\n\n    logging.info('Total apks to be decoded {0}'.format(len(apks)))\n    print \"Total apks to be decoded\",len(apks)\n\n    num_local = 0\n    before=datetime.datetime.now()\n    logging.info('Starting at: {0}'.format(before))\n    print \"Starting at: {0}\",before\n\n    # Looping through all apks\n    for apk_hash in apks:\n        apk_file_location = os.path.join(apk_file_directory, apk_hash)\n        num_local += 1\n        logging.info('Decoding apk: {0} apk #: {1}'.format(apk_file_location,num_local))\n        print \"apk #: \", num_local\n        print \"apk location: \", apk_file_location\n\n        decoded_location = None\n        # Decoding apk into the tmp_file_directory\n        decoded_location = decode_application(apk_file_location,tmp_file_directory,apk_hash,include_libs)\n\n        if (not os.path.exists(decoded_location) or not os.listdir(decoded_location)):\n            print \"smali directory does not exist continue....\"\n            logging.error('NOT decoded directory: {0}'.format(apk_file_location))\n            print \"NOT decoded directory:\", apk_file_location\n            continue\n\n        result =create_opcode_seq(decoded_location,opseq_file_directory,apk_hash)\n\n        if result:\n            print \"opseq file for apk #\",num_local,\" is created\"\n            logging.info('opseq file for apk # {0} is created'.format(num_local))\n        else:\n            logging.error('opseq file creation was not successful')\n            print \"opseq file creation was not successful\"\n\n        if os.path.exists(decoded_location):\n            shutil.rmtree(decoded_location)\n\n\n    after=datetime.datetime.now()\n    print \"Finished by: {0} \",after\n    logging.info('Total time taken:  {0}'.format(after-before))\n    print \"Total time taken:\", after-before\n\ndef create_opcode_seq(decoded_dir,opseq_file_directory,apk_hash):\n    # Returns true if creating opcode sequence file was successful,\n    # searches all files in smali folder,\n    # writes the coresponding opcode sequence to a .opseq file\n    # and 
depending on the include_lib value,\n    # it includes or excludes the support library files\n\n    dalvik_opcodes = {}\n    # Reading Dalvik opcodes into a dictionary\n    with open(\"DalvikOpcodes.txt\") as fop:\n        for line in fop:\n            (key, val) = line.split()\n            dalvik_opcodes[key] = val\n    try:\n        smali_dir = os.path.join(decoded_dir, \"smali\")\n        opseq_fname = os.path.join(opseq_file_directory,apk_hash+\".opseq\")\n\n        with open(opseq_fname, \"a\") as opseq_file:\n            for root, dirs, fnames in os.walk(smali_dir):\n                for fname in fnames:\n                    full_path = os.path.join(root, fname)\n                    opseq_file.write(get_opcode_seq(full_path, dalvik_opcodes))\n\n        return True\n    except Exception as e:\n        print \"Exception occurred during opseq creation of apk\", apk_hash\n        logging.error('Exception occurred during opseq creation {0}'.format(str(e)))\n        return False\n\ndef get_opcode_seq(smali_fname, dalvik_opcodes):\n    # Returns opcode sequence created from smali file 'smali_fname'.\n\n    opcode_seq = ''\n\n    with open(smali_fname, mode=\"r\") as bigfile:\n        reader = bigfile.read()\n        for i, part in enumerate(reader.split(\".method\")):\n            add_newline = False\n            if i != 0:\n                method_part = part.split(\".end method\")[0]\n                method_body = method_part.strip().split('\\n')\n                for line in method_body:\n                    if not line.strip().startswith('.') and not line.strip().startswith('#') and line.strip():\n                        method_line = line.strip().split()\n                        if method_line[0] in dalvik_opcodes:\n                            add_newline = True\n                            opcode_seq += dalvik_opcodes[method_line[0]]\n                if add_newline:\n                    opcode_seq += '\\n'\n    return opcode_seq\n\ndef decode_application(apk_file_location,tmp_file_directory,apk_hash,include_libs):\n    # Decodes the apk at apk_file_location and\n    # stores the decoded folders in tmp_file_directory\n\n    out_file_location = os.path.join(tmp_file_directory, apk_hash + \".smali\")\n    try:\n        apktool_decode_apk(apk_file_location, out_file_location, include_libs)\n    except ApkToolException:\n        print \"ApktoolException on decoding\"\n        logging.error(\"ApktoolException on decoding apk  {0} \".format(apk_file_location))\n    return out_file_location\n\ndef apktool_decode_apk(apk_file, out_file, include_libs):\n    # Runs the apktool on a given apk\n\n    apktooldir = \"/usr/local/bin\"\n\n    apktoolcmd = \"{0}/apktool d -f {1} -o {2}\".format(apktooldir, apk_file, out_file)\n    res = os.system(apktoolcmd)\n    if res != 0: raise ApkToolException(apktoolcmd)\n\n    # Checks if we should keep the smali files belonging to the android support libraries\n    if not include_libs:\n        # Don't keep the smali/android folder\n        android_folder = os.path.join(out_file, \"smali/android\")\n        if os.path.exists(android_folder):\n            rm_cmd = \"rm -r %s\" % (android_folder)\n            os.system(rm_cmd)\n\n# Exception class to signify an Apktool Exception\nclass ApkToolException(Exception):\n    def __init__(self, command):\n        self.command = command\n\n    def __str__(self):\n        return repr(self.command)\n\nif __name__ == '__main__':\n    main()\n
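\n# Worked example (opcode values from DalvikOpcodes.txt): a .method block whose body\n# contains the instructions iput-object ..., invoke-direct ..., return-void is emitted\n# by get_opcode_seq as the single opseq line \"5b700e\".\n"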
  },
  {
    "path": "readMalwareData.lua",
    "content": "\n-- read the malware data\n--\t in setup mode \n--\t- read all the files\n--  - decide if it should be in dataset\n--  - save a list of all files\n--  - \t\t\t\n\n-- read the whole program into a tensor\nfunction readfileFunc_tensor(filename)\n\n\tlocal contents = {}\n\n\tlocal f = torch.DiskFile(filename)\n\tf.quiet(f)\n\tlocal c = 'a'\n\tlocal count = 0\n\tlocal func = {}\n\n\tfor i = 1,opt.kernelLength do\n\t\ttable.insert(func,1)\n\t\tcount = count + 1\n\tend\n\n\tlocal nFuncs = 0\n\twhile c ~= '' do --and count <= opt.programLen do -- potential bug...\n\t\tc = f.readString(f,'*l')\n\t\tlocal len = #c\n\t\tif len > 0 then\n\n\t\t\tfor k = 1,len,2 do\n\t\t\t\tlocal num = string.sub(c,k,k+1)\n\t\t\t\tlocal n = tonumber(num,16)\n\t\t\t\ttable.insert(func,n + 2) -- plus 2 so that our lowest symbol is '2' i.e. no_op is '2'\n\t\t\t\tcount = count + 1\n\t\t\tend\n\t\t\tnFuncs = nFuncs + 1\n\n\t\t\tfor i = 1,opt.kernelLength do\n\t\t\t\ttable.insert(func,1)\n\t\t\t\tcount = count + 1\n\t\t\tend\n\n\t\t\tif opt.markFunctionEnds then\n\t\t\t\ttable.insert(func,255) -- mark the end of each function\n\t\t\tend\n\t\tend\n\tend\n\treturn torch.ByteTensor(func),nFuncs,count\nend\n\n-- get an upper bound on the number of malware files\n-- we will discard some files that are too short etc\nfunction upperBoundNumberOfFiles(rootDir)\n\tlocal numberOfFilesBound = 0\n\tlocal malwareDirs = paths.dir(rootDir)\t\n\tfor i = 1,#malwareDirs do\n\t\tlocal dir = malwareDirs[i]\n\t\tif dir ~= '.' and dir ~= '..' and paths.dirp(paths.concat(rootDir,dir)) then\n\t\t\tlocal malwarefiles = paths.dir(paths.concat(rootDir,dir))\t\n\t\t\t-- number of files minus '.' and '..'\n\t\t\tnumberOfFilesBound = numberOfFilesBound + #malwarefiles - 2\n\t\tend\n\tend\n\tprint('upper bound number of programs ',numberOfFilesBound)\n\treturn numberOfFilesBound\nend\n\n-- this function gets called once when processing a new dataset\n-- we read all the programs and decide which ones should be included\n-- we just use an arbitrary rule that excludes very short programs\n-- the list of included programs is returned and saved for later use\nfunction readMalwareData_setup(rootDir)\n\n\t-- read all the directories\n\t-- check each file to see if it meets some criterion\n\t-- save list of filenames\n\t-- split into train / test sets\n\n\tlocal datasetInfo = {\n\t\tfilesList = {},\n\t\tfamily = {},\n\t\tfamilyName = {},\n\t\tlabel = {},\n\t\tbenignFamily = -1,\n\t}\n\n\tlocal programCount = 0\n\tlocal familyNumber = 1\n\n\tlocal malwareDirs = paths.dir(rootDir)\n\n\tfor i = 1,#malwareDirs do\n\t\tlocal dir = malwareDirs[i]\n\t\tif dir ~= '.' and dir ~= '..' and paths.dirp(paths.concat(rootDir,dir)) then\n\t\t\tlocal malwarefiles = paths.dir(paths.concat(rootDir,dir))\t\t\t\n\t\t\tfor f = 1,#malwarefiles do\n\t\t\t\tlocal file = malwarefiles[f]\n\t\t\t\tif file ~= '.' and file ~= '..' then\n\t\t\t\t\tlocal contents,nFuncs = readfileFunc_tensor(paths.concat(rootDir,dir,malwarefiles[f]))\t\t\t\t\t\n\t\t\t\t\tif nFuncs >= 8 then -- a bit arbitrary... basically we want to ignore very short files\n\n\t\t\t\t\t\tprogramCount = programCount + 1\n\t\t\t\t\t\tif programCount % 100 == 0 then\n\t\t\t\t\t\t\tprint('programs read ',programCount,collectgarbage(\"count\"))\n\t\t\t\t\t\t\tcollectgarbage()\n\t\t\t\t\t\tend\n\t\t\t\t\t\t\n\t\t\t\t\t\t-- local includeFile = dir .. '/' .. 
malwarefiles[f]\t\t\t\t\t\t\n\t\t\t\t\t\ttable.insert(datasetInfo.filesList,malwarefiles[f])\n\t\t\t\t\t\ttable.insert(datasetInfo.family,familyNumber)\n\n\t\t\t\t\t\tif dir == 'Benign' then\n\t\t\t\t\t\t\tdatasetInfo.benignFamily = familyNumber\n\t\t\t\t\t\t\ttable.insert(datasetInfo.label,1)\n\t\t\t\t\t\telse                                \n\t\t\t\t\t\t\ttable.insert(datasetInfo.label,2)\n\t\t\t\t\t\tend\n\n\t\t\t\t\tend\n\t\t\t\tend\n\t\t\tend\n\t\t\tfamilyNumber = familyNumber + 1\n\t\t\ttable.insert(datasetInfo.familyName,dir)\n\t\tend\n\tend\n\n\tdatasetInfo.family = torch.Tensor(datasetInfo.family)\n\tdatasetInfo.label = torch.Tensor(datasetInfo.label)\n\n\treturn datasetInfo\nend\n\n-- reads the malware data into a tensor\n-- We read all the opcodes into a single block of memory\n-- this is because each program can be a different length\n-- so storing in a 2D array will waste lots of space\n-- We also can't use a Lua list as they are limited to 2GB\n--\n-- allData.program          - tensor (i.e. 1D array of bytes) containing all opcodes\n-- allData.programStartPtrs - pointers to start of each program in allData.program\n-- allData.programLengths   - the length of each opcode sequence\n--\n-- For example, to access program 3 do\n--\n-- local ptr = allData.programStartPtrs[3]\n-- local len = allData.programLengths[3]\n-- local prog = allData.program[{{ptr,ptr + len - 1}}]\n--\nfunction readMalwareData(rootDir,metaData)\n\n\tprint('reading files with version 2')\n\n\tlocal malwareDirs = paths.dir(rootDir)\t\n\tlocal upperBoundNumFiles = upperBoundNumberOfFiles(rootDir)\n\n\tlocal meanProgramLen = 50000\n\n\tlocal allData = {\n\t\tprogram = torch.ones(upperBoundNumFiles * meanProgramLen):byte(),\n\t\tprogramStartPtrs = {},\n\t\tprogramLengths = {},\n\t}\n\n\tlocal programLen = {}\n\n\tlocal progPtr = 1\n\tlocal programCount = 0\n\n\tfor i = 1,#metaData.filesList do\n\n\t\tlocal file = metaData.filesList[i]\n\t\tlocal familyDir = metaData.familyName[metaData.family[i]]\n\t\tlocal fullFile = paths.concat(rootDir,familyDir,file)\n\n\t\tif paths.filep(fullFile) then\t\t\t\n\n\t\t\tlocal contents = readfileFunc_tensor(fullFile)\n\n\t\t\tprogramCount = programCount + 1\n\t\t\tif programCount % 100 == 0 then\n\t\t\t\tprint('programs read ',programCount,collectgarbage(\"count\"))\n\t\t\t\tcollectgarbage()\n\t\t\tend\n\n\t\t\tlocal programLength = contents:size(1)\n\n\t\t\t-- if needed - increase the size of the storage\n\t\t\tif (progPtr + programLength - 1) > allData.program:size(1) then\n\t\t\t\tlocal currSize = allData.program:size(1)\n\t\t\t\t-- grow by 5%, and always by at least enough to fit the current program\n\t\t\t\tallData.program = allData.program:resize(math.max(math.ceil(currSize * 1.05),progPtr + programLength - 1))\n\t\t\tend\n\n\t\t\ttable.insert(allData.programStartPtrs,progPtr)\n\t\t\ttable.insert(allData.programLengths,programLength)\n\n\t\t\t-- insert the program into the memory\n\t\t\tallData.program[{{progPtr,progPtr + programLength - 1}}] = contents\n\t\t\tprogPtr = progPtr + programLength\n\t\telse\n\t\t\t-- we should stop if this happens!\n\t\t\terror('ERROR : Missing file in dataset : ' .. fullFile)\n\t\tend\n\tend\n\n\tallData.program = allData.program:resize(progPtr - 1) -- discard the unused tail\n\tallData.programStartPtrs = torch.Tensor(allData.programStartPtrs)\n\tallData.programLengths = torch.Tensor(allData.programLengths)\n\tallData.label = metaData.label\n\n\treturn allData,programLen\nend
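\n-- Usage sketch (mirrors the calls in DetectMalware_CNN.lua):\n-- local datasetInfo = readMalwareData_setup('./dataset/') -- setup mode, run once per dataset\n-- local allData = readMalwareData('./dataset/',metaData)  -- normal training runs\n"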
  },
  {
    "path": "readme.md",
    "content": "# Deep Android Malware Detection\n\nThis repository contains the code for the paper \"Deep Android Malware Detection\" ([pdf download](https://pure.qub.ac.uk/portal/files/122380314/sig_camera_ready.pdf)) | ([citation](http://dl.acm.org/citation.cfm?id=3029823))\n\nWe use a convolutional neural network (CNN) for android malware classification. Malware classification is performed based on static analysis of the raw opcode sequence from a disassembled android apk. Features indicative of malware are automatically learned from the raw opcode sequence thus removing the need for hand-engineered malware features. The network runs on GPU, allowing a very large number of files to be quickly scanned.\n\n<p><img src='malware_network_diagram.png'></p>\n\nIf you use this code please cite the following paper:\n\n```\n@inproceedings{mclaughlin2017codaspy,\ntitle = \"Deep Android Malware Detection\",\nauthor = \"Niall McLaughlin and {Martinez del Rincon}, Jesus and BooJoong Kang and Suleiman Yerima and Paul Miller and Sakir Sezer and Yeganeh Safaeisemnani and Erik Trickel and Ziming Zhao and Adam Doupé and {Joon Ahn}, Gail\",\nyear = \"2016\",\nmonth = \"12\",\nbooktitle = \"Proceeding of the ACM Conference on Data and Applications Security and Privacy (CODASPY) 2017\",\npublisher = \"Association for Computing Machinery (ACM)\",\n}\n```\n\n## How to run the code\n\nGiven an existing dataset directory (see below for details), the run.sh file will do the following:\n\n1. Partition the dataset into training-set and held-out test-set\n2. Train a neural network\n3. Test the trained network on the test-set\n\n## Prerequisites\n\n### Dataset structure\n\nAn example dataset with the required directory structure is provided in ./dataset \n\nThe neural network requires opcode sequence files in the correct format, and a dataset directory with sub-directories containing malware and benign opcode sequence files.\n\nAn example dataset directory is provided in ./dataset. The dataset directory must have the following structure:\n\n1. There must be a directory called 'Benign', and contains non-malware opcode sequences files\n2. The other directory can have any name ,and contains malware opcode sequence files\n\n### Opcode Sequence files\n\nOpcode sequence files can be created from android APK files using the opcode sequence creation tool. This tool is located in ./opcodeseq_creator Please see the readme file in this directory for more information.\n\n### Setup\n\nThe neural network code is implemented using Torch. It is recommended to use a GPU to achieve acceleration of testing and training. For details on installing Torch please see http://torch.ch\n\nThe opcode sequence creator tool requires APKTool https://ibotpeaches.github.io/Apktool/\n"
  },
  {
    "path": "results/exampleOutput.txt",
    "content": "{\n  useOneHot : false\n  nConvLayers : 1\n  usemom : false\n  dataAugProb : 0.1\n  batchSize : 1\n  nSamplingEpochs : 5\n  nFCLayers : 1\n  nEmbeddingDims : 8\n  kernelLength : 8\n  useDropoutAfterEmbedding : false\n  numDAShuffles : 1\n  metaDataFile : \"./config/metaData_small_test.th7\"\n  useSpatialDropout : false\n  useHiddenLayer : true\n  weightDecay : 0\n  nConvFilters : 64\n  dropoutFrac : 0.5\n  useRMSProp : false\n  programLen : 8192\n  gpuid : 1\n  nHiddenNodes : 16\n  dataAugTesting : false\n  dataDir : \"/home/nmclaughlin02/Documents/cyberdata/malware/\"\n  seed : 1\n  maxSequenceLength : 8192\n  markFunctionEnds : false\n  debug : false\n  useDropoutAfterConv : false\n  useDropout : false\n  weightClasses : false\n  saveFileName : \"model_tmp\"\n  fixEmbedding : false\n  trainingSetSize : 2\n  randomize : false\n  weightDecayFrac : 0.1\n  useCUDA : true\n  usePreTrainedEmbedding : false\n  nEpochs : 75\n  decayLearningRate : false\n  setupMode : true\n  dataAugMethod : 1\n  saveModel : true\n  learningRate : 0.001\n}\nreading dataset\t\nprograms read \t100\t5064.8681640625\t\nprograms read \t200\t5527.64453125\t\nprograms read \t300\t6014.9560546875\t\nprograms read \t400\t3911.5263671875\t\nprograms read \t500\t7196.4423828125\t\nprograms read \t600\t8327.2734375\t\nprograms read \t700\t10306.740234375\t\nprograms read \t800\t6509.2666015625\t\nprograms read \t900\t7206.0546875\t\nprograms read \t1000\t6228.0478515625\t\nprograms read \t1100\t6535.55078125\t\nprograms read \t1200\t6618.107421875\t\nprograms read \t1300\t4311.482421875\t\nprograms read \t1400\t8571.1533203125\t\nprograms read \t1500\t8814.9814453125\t\nprograms read \t1600\t6065.205078125\t\nprograms read \t1700\t5644.7822265625\t\nprograms read \t1800\t4623.0302734375\t\nprograms read \t1900\t6804.72265625\t\nprograms read \t2000\t4155.318359375\t\nprograms read \t2100\t3895.193359375\t\nsplitting dataset into train/test sets\t\n846\t1259\t\nsplitting dataset\t\nnPosTrain\t761\tnNegTrain\t1133\tpos/neg \t0.40179514255544\t\nnPosTest\t85\tnNegTest\t126\tpos/neg \t0.40284360189573\t\nsaving dataset metadata to file \t./config/metaData_small_test.th7\t\n{\n  useOneHot : false\n  nConvLayers : 1\n  usemom : false\n  dataAugProb : 0.1\n  batchSize : 1\n  nSamplingEpochs : 5\n  nFCLayers : 1\n  nEmbeddingDims : 8\n  kernelLength : 8\n  useDropoutAfterEmbedding : false\n  numDAShuffles : 1\n  metaDataFile : \"./config/metaData_small_test.th7\"\n  useSpatialDropout : false\n  useHiddenLayer : true\n  weightDecay : 0\n  nConvFilters : 64\n  dropoutFrac : 0.5\n  useRMSProp : false\n  programLen : 8192\n  gpuid : 1\n  nHiddenNodes : 16\n  dataAugTesting : false\n  dataDir : \"/home/nmclaughlin02/Documents/cyberdata/malware/\"\n  seed : 1\n  maxSequenceLength : 8192\n  markFunctionEnds : false\n  debug : false\n  useDropoutAfterConv : false\n  useDropout : false\n  weightClasses : false\n  saveFileName : \"model_tmp\"\n  fixEmbedding : false\n  trainingSetSize : 2\n  randomize : false\n  weightDecayFrac : 0.1\n  useCUDA : true\n  usePreTrainedEmbedding : false\n  nEpochs : 75\n  decayLearningRate : false\n  setupMode : false\n  dataAugMethod : 1\n  saveModel : true\n  learningRate : 0.001\n}\n./config/metaData_small_test.th7\t\nreading data from disk\t\nreading files with version 2\t\nupper bound number of programs \t2125\t\nprograms read \t100\t5121.4873046875\t\nprograms read \t200\t17134.224609375\t\nprograms read \t300\t8701.2607421875\t\nprograms read \t400\t7367.3076171875\t\nprograms read 
\t500\t7284.8056640625\t\nprograms read \t600\t8411.69921875\t\nprograms read \t700\t10391.264648438\t\nprograms read \t800\t6593.51953125\t\nprograms read \t900\t7580.0087890625\t\nprograms read \t1000\t6310.1181640625\t\nprograms read \t1100\t6471.4033203125\t\nprograms read \t1200\t5646.609375\t\nprograms read \t1300\t6056.0703125\t\nprograms read \t1400\t5953.17578125\t\nprograms read \t1500\t5674.7333984375\t\nprograms read \t1600\t6110.0087890625\t\nprograms read \t1700\t5555.7314453125\t\nprograms read \t1800\t6911.7939453125\t\nprograms read \t1900\t6337.0595703125\t\nprograms read \t2000\t8575.1025390625\t\nprograms read \t2100\t3910.28515625\t\nreading data from disk - complete\t\nprogram lens \t88\t1083463\t66743.73064133\t\nsplitting data into train/val/test sets\t\nt,v,t\t\n0.1\t0.1\t0.8\t\nnPrograms \t2105\t\n846\t1259\t\n761\t1133\t\nsplitting dataset\t\nnPosTrain\t676\tnNegTrain\t1007\t\nnPosVal\t85\tnNegVal\t126\t\nnPosTest\t85\tnNegTest\t126\t\ntrain/val/test check\t1\t1\t2105\t2105\t\nnew network\t\nnn.Sequential {\n  [input -> (1) -> (2) -> (3) -> (4) -> (5) -> (6) -> (7) -> (8) -> (9) -> (10) -> output]\n  (1): nn.LookupTable\n  (2): nn.Reshape(1x-1x8)\n  (3): nn.SpatialConvolutionMM(1 -> 64, 8x8)\n  (4): nn.ReLU\n  (5): nn.Reshape(64x-1)\n  (6): nn.Max\n  (7): nn.Linear(64 -> 16)\n  (8): nn.ReLU\n  (9): nn.Linear(16 -> 2)\n  (10): nn.LogSoftMax\n}\nstarting training\t\nNumber of Model Parameters \t7282\t\nUsing CUDA\t\nNumber of training examples \t1683\t\nNumber of validation examples \t211\t\nallocating batch memory\t\nmemory allocated\t\nCUDA memory usage\t\nfree \t3702857728\ttotal \t4294246400\tratio \t0.86228347958794\t\ntraining time\t 30.722\t nPrograms in training \t1683\t\nnValPrograms\t211\tnTrainingPrograms\t1683\t\ntesting corrected verison 2\t\nTest Stats : nMalware \t126\t nBenign \t85\t positiveLabel \t1\t\n5\tval   \t0.40789575868608\t0.38057567440503\t0.87677725118483\t0.83908045977011\t0.85882352941176\t0.84883720930233\t\ntesting time - val  \t  0.157\t nValPrograms\t211\t\n  73   14\n  12  112\n[torch.DoubleTensor of size 2x2]\n\ntesting corrected verison 2\t\nTest Stats : nMalware \t1007\t nBenign \t676\t positiveLabel \t1\t\n5\ttrain \t0.40789575868608\t0.33726407361753\t0.89304812834225\t0.88036809815951\t0.8491124260355\t0.8644578313253\t\ntesting time - train\t  1.179\t nTrainingPrograms\t1683\t\n 574   78\n 102  929\n[torch.DoubleTensor of size 2x2]\n\n--\t\ntraining time\t 32.220\t nPrograms in training \t1683\t\nnValPrograms\t211\tnTrainingPrograms\t1683\t\ntesting corrected verison 2\t\nTest Stats : nMalware \t126\t nBenign \t85\t positiveLabel \t1\t\n10\tval   \t0.16407678506945\t0.17450008092898\t0.9478672985782\t0.92045454545455\t0.95294117647059\t0.9364161849711\t\ntesting time - val  \t  0.149\t nValPrograms\t211\t\n  81    7\n   4  119\n[torch.DoubleTensor of size 2x2]\n\ntesting corrected verison 2\t\nTest Stats : nMalware \t1007\t nBenign \t676\t positiveLabel \t1\t\n10\ttrain \t0.16407678506945\t0.12949798309725\t0.96375519904932\t0.93741109530583\t0.97485207100592\t0.95576504713561\t\ntesting time - train\t  1.157\t nTrainingPrograms\t1683\t\n 659   44\n  17  963\n[torch.DoubleTensor of size 2x2]\n\n--\t\ntraining time\t 32.102\t nPrograms in training \t1683\t\nnValPrograms\t211\tnTrainingPrograms\t1683\t\ntesting corrected verison 2\t\nTest Stats : nMalware \t126\t nBenign \t85\t positiveLabel \t1\t\n15\tval   \t0.084003127938711\t0.13772791624069\t0.9478672985782\t0.94047619047619\t0.92941176470588\t0.93491124260355\t\ntesting 
time - val  \t  0.149\t nValPrograms\t211\t\n  79    5\n   6  121\n[torch.DoubleTensor of size 2x2]\n\ntesting corrected verison 2\t\nTest Stats : nMalware \t1007\t nBenign \t676\t positiveLabel \t1\t\n15\ttrain \t0.084003127938711\t0.059644215920109\t0.98871063576946\t0.98811292719168\t0.98372781065089\t0.98591549295775\t\ntesting time - train\t  1.178\t nTrainingPrograms\t1683\t\n 665    8\n  11  999\n[torch.DoubleTensor of size 2x2]\n\n--\t\ntraining time\t 31.980\t nPrograms in training \t1683\t\nnValPrograms\t211\tnTrainingPrograms\t1683\t\ntesting corrected verison 2\t\nTest Stats : nMalware \t126\t nBenign \t85\t positiveLabel \t1\t\n20\tval   \t0.045003364997043\t0.12233256954717\t0.9478672985782\t0.92045454545455\t0.95294117647059\t0.9364161849711\t\ntesting time - val  \t  0.153\t nValPrograms\t211\t\n  81    7\n   4  119\n[torch.DoubleTensor of size 2x2]\n\ntesting corrected verison 2\t\nTest Stats : nMalware \t1007\t nBenign \t676\t positiveLabel \t1\t\n20\ttrain \t0.045003364997043\t0.031773728943268\t0.99524658348188\t0.99408284023669\t0.99408284023669\t0.99408284023669\t\ntesting time - train\t  1.180\t nTrainingPrograms\t1683\t\n  672     4\n    4  1003\n[torch.DoubleTensor of size 2x2]\n\n--\t\ntraining time\t 32.320\t nPrograms in training \t1683\t\nnValPrograms\t211\tnTrainingPrograms\t1683\t\ntesting corrected verison 2\t\nTest Stats : nMalware \t126\t nBenign \t85\t positiveLabel \t1\t\n25\tval   \t0.026457868370355\t0.11776774217732\t0.94312796208531\t0.92941176470588\t0.92941176470588\t0.92941176470588\t\ntesting time - val  \t  0.152\t nValPrograms\t211\t\n  79    6\n   6  120\n[torch.DoubleTensor of size 2x2]\n\ntesting corrected verison 2\t\nTest Stats : nMalware \t1007\t nBenign \t676\t positiveLabel \t1\t\n25\ttrain \t0.026457868370355\t0.015975192693891\t0.99762329174094\t0.99851632047478\t0.99556213017751\t0.99703703703704\t\ntesting time - train\t  1.181\t nTrainingPrograms\t1683\t\n  673     1\n    3  1006\n[torch.DoubleTensor of size 2x2]\n\n--\t\ntraining time\t 31.997\t nPrograms in training \t1683\t\nnValPrograms\t211\tnTrainingPrograms\t1683\t\ntesting corrected verison 2\t\nTest Stats : nMalware \t126\t nBenign \t85\t positiveLabel \t1\t\n30\tval   \t0.019170329284611\t0.11645289704698\t0.94312796208531\t0.92941176470588\t0.92941176470588\t0.92941176470588\t\ntesting time - val  \t  0.154\t nValPrograms\t211\t\n  79    6\n   6  120\n[torch.DoubleTensor of size 2x2]\n\ntesting corrected verison 2\t\nTest Stats : nMalware \t1007\t nBenign \t676\t positiveLabel \t1\t\n30\ttrain \t0.019170329284611\t0.012826394978125\t0.9982174688057\t0.99704579025111\t0.99852071005917\t0.99778270509978\t\ntesting time - train\t  1.186\t nTrainingPrograms\t1683\t\n  675     2\n    1  1005\n[torch.DoubleTensor of size 2x2]\n\n--\t\ntraining time\t 32.005\t nPrograms in training \t1683\t\nnValPrograms\t211\tnTrainingPrograms\t1683\t\ntesting corrected verison 2\t\nTest Stats : nMalware \t126\t nBenign \t85\t positiveLabel \t1\t\n35\tval   \t0.021271346299619\t0.12037801799051\t0.9478672985782\t0.92045454545455\t0.95294117647059\t0.9364161849711\t\ntesting time - val  \t  0.156\t nValPrograms\t211\t\n  81    7\n   4  119\n[torch.DoubleTensor of size 2x2]\n\ntesting corrected verison 2\t\nTest Stats : nMalware \t1007\t nBenign \t676\t positiveLabel \t1\t\n35\ttrain \t0.021271346299619\t0.014115614049575\t0.99702911467617\t0.99410898379971\t0.99852071005917\t0.99630996309963\t\ntesting time - train\t  1.183\t nTrainingPrograms\t1683\t\n  675     4\n    1  
1003\n[torch.DoubleTensor of size 2x2]\n\n--\t\ntraining time\t 31.991\t nPrograms in training \t1683\t\nnValPrograms\t211\tnTrainingPrograms\t1683\t\ntesting corrected verison 2\t\nTest Stats : nMalware \t126\t nBenign \t85\t positiveLabel \t1\t\n40\tval   \t0.017399860965448\t0.12343576564608\t0.93364928909953\t0.92771084337349\t0.90588235294118\t0.91666666666667\t\ntesting time - val  \t  0.154\t nValPrograms\t211\t\n  77    6\n   8  120\n[torch.DoubleTensor of size 2x2]\n\ntesting corrected verison 2\t\nTest Stats : nMalware \t1007\t nBenign \t676\t positiveLabel \t1\t\n40\ttrain \t0.017399860965448\t0.0081807981271228\t0.9982174688057\t0.99851851851852\t0.99704142011834\t0.99777942264989\t\ntesting time - train\t  1.189\t nTrainingPrograms\t1683\t\n  674     1\n    2  1006\n[torch.DoubleTensor of size 2x2]\n\n--\t\ntraining time\t 31.986\t nPrograms in training \t1683\t\nnValPrograms\t211\tnTrainingPrograms\t1683\t\ntesting corrected verison 2\t\nTest Stats : nMalware \t126\t nBenign \t85\t positiveLabel \t1\t\n45\tval   \t0.012330326521177\t0.13431220088525\t0.94312796208531\t0.92941176470588\t0.92941176470588\t0.92941176470588\t\ntesting time - val  \t  0.152\t nValPrograms\t211\t\n  79    6\n   6  120\n[torch.DoubleTensor of size 2x2]\n\ntesting corrected verison 2\t\nTest Stats : nMalware \t1007\t nBenign \t676\t positiveLabel \t1\t\n45\ttrain \t0.012330326521177\t0.0074799091279046\t0.9982174688057\t0.99851851851852\t0.99704142011834\t0.99777942264989\t\ntesting time - train\t  1.184\t nTrainingPrograms\t1683\t\n  674     1\n    2  1006\n[torch.DoubleTensor of size 2x2]\n\n--\t\ntraining time\t 31.966\t nPrograms in training \t1683\t\nnValPrograms\t211\tnTrainingPrograms\t1683\t\ntesting corrected verison 2\t\nTest Stats : nMalware \t126\t nBenign \t85\t positiveLabel \t1\t\n50\tval   \t0.015055712885752\t0.13219990125765\t0.9478672985782\t0.93023255813953\t0.94117647058824\t0.93567251461988\t\ntesting time - val  \t  0.153\t nValPrograms\t211\t\n  80    6\n   5  120\n[torch.DoubleTensor of size 2x2]\n\ntesting corrected verison 2\t\nTest Stats : nMalware \t1007\t nBenign \t676\t positiveLabel \t1\t\n50\ttrain \t0.015055712885752\t0.0077246054254397\t0.99762329174094\t0.99704142011834\t0.99704142011834\t0.99704142011834\t\ntesting time - train\t  1.184\t nTrainingPrograms\t1683\t\n  674     2\n    2  1005\n[torch.DoubleTensor of size 2x2]\n\n--\t\ntraining time\t 31.966\t nPrograms in training \t1683\t\nnValPrograms\t211\tnTrainingPrograms\t1683\t\ntesting corrected verison 2\t\nTest Stats : nMalware \t126\t nBenign \t85\t positiveLabel \t1\t\n55\tval   \t0.012149294217428\t0.12793228326816\t0.93364928909953\t0.92771084337349\t0.90588235294118\t0.91666666666667\t\ntesting time - val  \t  0.154\t nValPrograms\t211\t\n  77    6\n   8  120\n[torch.DoubleTensor of size 2x2]\n\ntesting corrected verison 2\t\nTest Stats : nMalware \t1007\t nBenign \t676\t positiveLabel \t1\t\n55\ttrain \t0.012149294217428\t0.006638735727547\t0.99881164587047\t0.99852071005917\t0.99852071005917\t0.99852071005917\t\ntesting time - train\t  1.184\t nTrainingPrograms\t1683\t\n  675     1\n    1  1006\n[torch.DoubleTensor of size 2x2]\n\n--\t\ntraining time\t 31.977\t nPrograms in training \t1683\t\nnValPrograms\t211\tnTrainingPrograms\t1683\t\ntesting corrected verison 2\t\nTest Stats : nMalware \t126\t nBenign \t85\t positiveLabel \t1\t\n60\tval   \t0.012296135696144\t0.13272679530049\t0.9478672985782\t0.93023255813953\t0.94117647058824\t0.93567251461988\t\ntesting time - val  \t  0.153\t 
nValPrograms\t211\t\n  80    6\n   5  120\n[torch.DoubleTensor of size 2x2]\n\ntesting corrected verison 2\t\nTest Stats : nMalware \t1007\t nBenign \t676\t positiveLabel \t1\t\n60\ttrain \t0.012296135696144\t0.0069904121273129\t0.99881164587047\t0.99852071005917\t0.99852071005917\t0.99852071005917\t\ntesting time - train\t  1.187\t nTrainingPrograms\t1683\t\n  675     1\n    1  1006\n[torch.DoubleTensor of size 2x2]\n\n--\t\ntraining time\t 31.982\t nPrograms in training \t1683\t\nnValPrograms\t211\tnTrainingPrograms\t1683\t\ntesting corrected verison 2\t\nTest Stats : nMalware \t126\t nBenign \t85\t positiveLabel \t1\t\n65\tval   \t0.013662806098403\t0.13083266985925\t0.93838862559242\t0.92857142857143\t0.91764705882353\t0.92307692307692\t\ntesting time - val  \t  0.154\t nValPrograms\t211\t\n  78    6\n   7  120\n[torch.DoubleTensor of size 2x2]\n\ntesting corrected verison 2\t\nTest Stats : nMalware \t1007\t nBenign \t676\t positiveLabel \t1\t\n65\ttrain \t0.013662806098403\t0.0061591699303294\t0.99881164587047\t0.99852071005917\t0.99852071005917\t0.99852071005917\t\ntesting time - train\t  1.187\t nTrainingPrograms\t1683\t\n  675     1\n    1  1006\n[torch.DoubleTensor of size 2x2]\n\n--\t\ntraining time\t 31.974\t nPrograms in training \t1683\t\nnValPrograms\t211\tnTrainingPrograms\t1683\t\ntesting corrected verison 2\t\nTest Stats : nMalware \t126\t nBenign \t85\t positiveLabel \t1\t\n70\tval   \t0.014961927119848\t0.14605692608097\t0.95734597156398\t0.93181818181818\t0.96470588235294\t0.94797687861272\t\ntesting time - val  \t  0.153\t nValPrograms\t211\t\n  82    6\n   3  120\n[torch.DoubleTensor of size 2x2]\n\ntesting corrected verison 2\t\nTest Stats : nMalware \t1007\t nBenign \t676\t positiveLabel \t1\t\n70\ttrain \t0.014961927119848\t0.0064191930150957\t0.99881164587047\t0.99852071005917\t0.99852071005917\t0.99852071005917\t\ntesting time - train\t  1.152\t nTrainingPrograms\t1683\t\n  675     1\n    1  1006\n[torch.DoubleTensor of size 2x2]\n\n--\t\ntraining time\t 32.109\t nPrograms in training \t1683\t\nnValPrograms\t211\tnTrainingPrograms\t1683\t\ntesting corrected verison 2\t\nTest Stats : nMalware \t126\t nBenign \t85\t positiveLabel \t1\t\n75\tval   \t0.011070825411887\t0.14359631668335\t0.95260663507109\t0.92134831460674\t0.96470588235294\t0.94252873563218\t\ntesting time - val  \t  0.152\t nValPrograms\t211\t\n  82    7\n   3  119\n[torch.DoubleTensor of size 2x2]\n\ntesting corrected verison 2\t\nTest Stats : nMalware \t1007\t nBenign \t676\t positiveLabel \t1\t\n75\ttrain \t0.011070825411887\t0.0070228511495245\t0.9982174688057\t0.99704579025111\t0.99852071005917\t0.99778270509978\t\ntesting time - train\t  1.183\t nTrainingPrograms\t1683\t\n  675     2\n    1  1005\n[torch.DoubleTensor of size 2x2]\n\n--\t\nBest Result \t0.014961927119848\t0.14605692608097\t0.95734597156398\t0.93181818181818\t0.96470588235294\t0.94797687861272\t\n"
  },
  {
    "path": "run.sh",
    "content": "#\n#  First we must run the program with the -setupMode flag\n#  The program should be run with this flag ONLY ONCE for each dataset\n#  This reads the dataset, splits it into training and testing-sets\n#  and saves the dataset metadata to a file\n#\nth  DetectMalware_CNN.lua -useCUDA -gpuid 1 -programLen 8192 -nConvFilters 64   -nEpochs 75 -nSamplingEpochs 5 -nConvLayers 1 -seed 1 -learningRate 1e-3 -nEmbeddingDims 8 -kernelLength 8 -saveModel -saveFileName model_tmp -dataDir ./dataset/ -metaDataFile ./config/metaData_small_test.th7 -maxSequenceLength 8192 -setupMode\n#\n#\n#  Below is the code to train a network\n#  This uses the metadata file above so that we can reproduce our results\n#\nth  DetectMalware_CNN.lua -useCUDA -gpuid 1 -programLen 8192 -nConvFilters 64   -nEpochs 75 -nSamplingEpochs 5 -nConvLayers 1 -seed 1 -learningRate 1e-3 -nEmbeddingDims 8 -kernelLength 8 -saveModel -saveFileName model_tmp -dataDir ./dataset/ -metaDataFile ./config/metaData_small_test.th7 -maxSequenceLength 8192\n#\n#\n#  Below is the code to test a pre-trained network\n#  This should only be run ONCE after setting hyper-parameters using the validation-set\n#\nth testWithPreTrainedNetwork.lua -useCUDA -dataDir ./dataset -modelPath ./trainedNets/model_tmp.th7\n"
  },
  {
    "path": "splitMalwareData.lua",
    "content": "-- run this program once given a new dataset\n-- saves the test / train split to disk\n-- later sub-divide the train-set into train / validation sets\n-- return indicies for the training and testing sets\n-- we will later sub-divide the training-set into train & val sets\nfunction splitMalwareDataTrainTest(labels,pTrain,pTest)\n\n\tlocal pos = {}\n\tlocal neg = {}\n\tlocal nPrograms = labels:size(1)--allData.program:size(1)\n\n\t-- record the incidies of all the pos/neg i.e. malware/benign examples\n\tfor i = 1,nPrograms do\n\t\tif labels[i] == 1 then\n\t\t\ttable.insert(pos,i)\n\t\telse\n\t\t\ttable.insert(neg,i)\n\t\tend\n\tend\n\n\tprint(#pos,#neg)\n\n\t-- record all the positive and negative indicies\n\t-- shuffle the data\n\t-- take the first X% of pos and first x% of pos for training\n\n\tlocal trainInds = {}\n\tlocal testInds = {}\n\n\tlocal indsPos = torch.randperm(#pos)\n\tlocal indsNeg = torch.randperm(#neg)\n\n\tlocal nPosTrain = torch.floor(#pos * pTrain)\n\tlocal nNegTrain = torch.floor(#neg * pTrain)\n\n\tlocal nPosTest = #pos - nPosTrain\n\tlocal nNegTest = #neg - nNegTrain\n\n\tprint('splitting dataset')\n\tprint('nPosTrain',nPosTrain,'nNegTrain',nNegTrain,'pos/neg ',nPosTrain / (nPosTrain+nNegTrain))\n\tprint('nPosTest',nPosTest,'nNegTest',nNegTest,'pos/neg ',nPosTest / (nPosTest+nNegTest))\n\n\tfor i = 1,nPosTrain do\n\t\ttable.insert(trainInds,pos[indsPos[i]])\n\tend\t\n\tfor i = 1,nNegTrain do\n\t\ttable.insert(trainInds,neg[indsNeg[i]])\n\tend\n\n\tfor i = 1,nPosTest do\n\t\ttable.insert(testInds,pos[indsPos[nPosTrain + i]])\n\tend\t\n\tfor i = 1,nNegTest do\n\t\ttable.insert(testInds,neg[indsNeg[nNegTrain + i]])\n\tend\n\n\t-- ratio used to weight the classes during training. Deals with\n\t-- the unbalanced number of examples for each class\n\tlocal posNegRatio = nPosTrain / (nPosTrain + nNegTrain)\n\n\treturn trainInds,testInds,posNegRatio\nend\n\n-- return indicies for the train,val and testing sets\nfunction splitMalwareDataTrainValTest(labels,metaData)\n\n\tlocal pTrain = 0.8\n\tlocal pVal = 0.1\n\tlocal pTest = 0.1\n\n\tlocal testInds = metaData.testInds\n\n\tlocal pos = {}\n\tlocal neg = {}\n\tlocal nPrograms = labels:size(1)--allData.program:size(1)\t\n\tprint('nPrograms ',nPrograms)\n\n\t-- record the incidies of all the pos/neg i.e. malware/benign examples\n\tfor i = 1,nPrograms do\n\t\tif labels[i] == 1 then\n\t\t\ttable.insert(pos,i)\n\t\telse\n\t\t\ttable.insert(neg,i)\n\t\tend\n\tend\n\n\tlocal posTrainVal = {}\n\tlocal negTrainVal = {}\n\t-- record the incidies of all the pos/neg i.e. 
malware/benign examples in the training-set\n\tfor i = 1,#metaData.trainInds do\n\t\tif labels[metaData.trainInds[i]] == 1 then\n\t\t\ttable.insert(posTrainVal,metaData.trainInds[i])\n\t\telse\n\t\t\ttable.insert(negTrainVal,metaData.trainInds[i])\n\t\tend\n\tend\n\n\tprint(#pos,#neg)\n\tprint(#posTrainVal,#negTrainVal)\n\n\t-- record all the positive and negative indices\n\t-- shuffle the data\n\t-- take the first X% of pos and the first X% of neg for training\n\n\tlocal trainInds = {}\n\tlocal valInds = {}\n\n\tlocal indsPos = torch.randperm(#posTrainVal)\n\tlocal indsNeg = torch.randperm(#negTrainVal)\n\n\tlocal nPosTrain = torch.floor(#pos * pTrain)\n\tlocal nNegTrain = torch.floor(#neg * pTrain)\n\tlocal nPosVal = #posTrainVal - nPosTrain\n\tlocal nNegVal = #negTrainVal - nNegTrain\n\tlocal nPosTest = #pos - (nPosTrain + nPosVal)\n\tlocal nNegTest = #neg - (nNegTrain + nNegVal)\n\n\tprint('splitting dataset')\n\tprint('nPosTrain',nPosTrain,'nNegTrain',nNegTrain)\n\tprint('nPosVal',nPosVal,'nNegVal',nNegVal)\n\tprint('nPosTest',nPosTest,'nNegTest',nNegTest)\n\n\tfor i = 1,nPosTrain do\n\t\ttable.insert(trainInds,posTrainVal[indsPos[i]])\n\tend\t\n\tfor i = 1,nNegTrain do\n\t\ttable.insert(trainInds,negTrainVal[indsNeg[i]])\n\tend\n\tfor i = 1,nPosVal do\n\t\ttable.insert(valInds,posTrainVal[indsPos[nPosTrain + i]])\n\tend\t\n\tfor i = 1,nNegVal do\n\t\ttable.insert(valInds,negTrainVal[indsNeg[nNegTrain + i]])\n\tend\n\t-- for i = 1,nPosTest do\n\t-- \ttable.insert(testInds,pos[indsPos[nPosTrain + nPosVal + i]])\n\t-- end\t\n\t-- for i = 1,nNegTest do\n\t-- \ttable.insert(testInds,neg[indsNeg[nNegTrain + nNegVal + i]])\n\t-- end\n\n\t-- ratio used to weight the classes during training. Deals with\n\t-- the unbalanced number of examples for each class\n\tlocal posNegRatio = nPosTrain / (nPosTrain + nNegTrain)\n\n\t-- check there is no overlap between train / val / test sets\n\tlocal sanity = torch.zeros(nPrograms)\n\tfor i = 1,#trainInds do\n\t\tsanity[trainInds[i]] = sanity[trainInds[i]] + 1\n\tend\n\tfor i = 1,#testInds do\n\t\tsanity[testInds[i]] = sanity[testInds[i]] + 1\n\tend\n\tfor i = 1,#valInds do\n\t\tsanity[valInds[i]] = sanity[valInds[i]] + 1\n\tend\n\n\tprint('train/val/test check',torch.min(sanity),torch.max(sanity),torch.sum(sanity),nPrograms)\n\tif not (torch.min(sanity) == 1) or not (torch.max(sanity) == 1) or not (torch.sum(sanity) == nPrograms) then\n\t\t-- stop if this happens\n\t\terror('overlap between training / validation and testing sets')\n\tend\n\n\treturn trainInds,valInds,testInds,posNegRatio\nend
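\n-- Usage sketch (mirrors the calls in DetectMalware_CNN.lua):\n-- local trainInds,testInds,posNegRatio = splitMalwareDataTrainTest(labels,0.9,0.1)\n-- local trainInds,valInds,testInds,posNegRatio = splitMalwareDataTrainValTest(labels,metaData)\n"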
  },
  {
    "path": "testModel.lua",
    "content": "function testModel(allData,model,valInds,epochError)\n\n\tprint('testing corrected verison 2')\n\n\tlocal timerTest = torch.Timer()\n\n\tlocal dtype = 'torch.DoubleTensor'\n\tif opt.useCUDA then\n\t\tdtype = 'torch.CudaTensor'\n\tend\n\n\tlocal criterion = nn.ClassNLLCriterion():type(dtype)\n\n\tmodel:evaluate()\n\t\n\t-- push the validation data through the network\n\tlocal nValPrograms = #valInds\n\tlocal valError = 0\n\tlocal correct = 0\n\tlocal confmat = torch.zeros(2,2)\n\tlocal lens = torch.zeros(nValPrograms)\n\n\t-- We need to make sure the rare-class is regarded as positive\n\t-- This means the f-score etc will be corectly calculated\n\t-- When reading the data benign is labelled as 1 and malware as 2\n\tlocal nBenign = 0\n\tlocal nMalware = 0\n\tfor k = 1,nValPrograms do\n\t\tif allData.label[valInds[k]] == 1 then\n\t\t\tnBenign = nBenign + 1\n\t\telse\n\t\t\tnMalware = nMalware + 1\n\t\tend\n\tend\n\tlocal positiveLabel = 1\n\tif nMalware < nBenign then\n\t\tpositiveLabel = 2\n\tend\n\n\tprint('Test Stats : nMalware ',nMalware, ' nBenign ',nBenign, ' positiveLabel ',positiveLabel)\n\n\t--local valBatch = torch.zeros(1,opt.programLen):type(dtype)\n\tlocal valLabel = torch.zeros(1):type(dtype)\n\n\tfor k = 1,nValPrograms do\n\t\tvalLabel[{1}] = allData.label[valInds[k]]\n\t\t--valBatch[{{1},{}}] = allData.program[valInds[k]]\n\n\t\tlocal currProgramPtr = allData.programStartPtrs[valInds[k]]\n\t\tlocal currProgramLen = allData.programLengths[valInds[k]]\n\n\t\tif currProgramLen > opt.maxSequenceLength then\n\t\t\tcurrProgramLen = opt.maxSequenceLength\n\t\tend\t\t\t\n\n\t\tlocal valBatch = torch.zeros(1,currProgramLen):type(dtype)\n\t\tvalBatch[{{1},{}}] = allData.program[{{currProgramPtr,currProgramPtr + currProgramLen - 1}}]\n\n\t\tlocal netOutput = model:forward(valBatch)\n\n\t\tvalError = valError + criterion:forward(netOutput,valLabel)\t \t\t\n\t\tlocal netOutputProb = nn.Exp():forward(netOutput:double())\n\n\t\tlocal v,i = torch.max(netOutputProb,2)\n\t\tlocal pred = i[{1,1}]\n\t\tlocal gt = allData.label[valInds[k]]\n\t\tif pred == gt then\n\t\t\tcorrect = correct + 1;\t\t\t\t\t\t\n\t\tend\n\t\tconfmat[pred][gt] = confmat[pred][gt] + 1\n\tend\n\tvalError = valError / nValPrograms\n\n\tlocal tp = 0\n\tlocal fp = 0\n\tlocal fn = 0\n\n\tif positiveLabel == 1 then\n\t\ttp = confmat[1][1]\n\t\tfp = confmat[1][2]\n\t\tfn = confmat[2][1]\n\telse\n\t\ttp = confmat[2][2]\n\t\tfp = confmat[2][1]\n\t\tfn = confmat[1][2]\n\tend\n\n\tlocal testResult = {\n\t\t-- tp = tp,\n\t\t-- fp = fp,\n\t\t-- fn = fn,\n\t\tprec = tp / (tp + fp),\n\t\trecall = tp / (tp + fn),\n\t\tfscore = (2 * tp) / ((2 * tp) + fp + fn),\n\t\taccuracy = correct/nValPrograms,\n\t\ttestError = valError,\t\t\n\t}\n\n\tlocal time = timerTest:time().real\t\n\n\tmodel:training()\n\n\t-- clean up\n\tvalLabel = nil\n\tcollectgarbage()\n\n\treturn testResult,confmat,time\nend"
  },
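  {
    "path": "examples/confusionMetricsSketch.lua",
    "content": "-- A worked sketch (added for illustration, not part of the original pipeline)\n-- of how testModel.lua turns its confusion matrix into precision / recall /\n-- F-score. The file name and the example counts are hypothetical.\n\nrequire 'torch'\n\n-- confmat[pred][gt], accumulated exactly as in testModel.lua\nlocal confmat = torch.Tensor{{50, 4},{6, 40}}\n\n-- the rarer class is treated as positive; under the repo's labelling\n-- (benign = 1, malware = 2) this is usually the malware class\nlocal positiveLabel = 2\n\nlocal tp, fp, fn\nif positiveLabel == 1 then\n\ttp, fp, fn = confmat[1][1], confmat[1][2], confmat[2][1]\nelse\n\ttp, fp, fn = confmat[2][2], confmat[2][1], confmat[1][2]\nend\n\nlocal prec = tp / (tp + fp)\nlocal recall = tp / (tp + fn)\n-- same tp/fp/fn form as testModel.lua; algebraically equal to\n-- 2 * prec * recall / (prec + recall)\nlocal fscore = (2 * tp) / ((2 * tp) + fp + fn)\nprint(string.format('prec %.3f recall %.3f f-score %.3f', prec, recall, fscore))\n"
  },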
  {
    "path": "testModel_dataAug.lua",
    "content": "function testModel(allData,model,valInds,epochError)\n\n\tprint('testing corrected verison 3')\n\n\tlocal timerTest = torch.Timer()\n\n\tlocal dtype = 'torch.DoubleTensor'\n\tif opt.useCUDA then\n\t\tdtype = 'torch.CudaTensor'\n\tend\n\n\tlocal criterion = nn.ClassNLLCriterion():type(dtype)\n\n\tmodel:evaluate()\n\t\n\t-- push the validation data through the network\n\tlocal nValPrograms = #valInds\n\tlocal valError = 0\n\tlocal correct = 0\n\tlocal confmat = torch.zeros(2,2)\n\tlocal lens = torch.zeros(nValPrograms)\n\n\t-- We need to make sure the rare-class is regarded as positive\n\t-- This means the f-score etc will be corectly calculated\n\t-- When reading the data benign is labelled as 1 and malware as 2\n\tlocal nBenign = 0\n\tlocal nMalware = 0\n\tfor k = 1,nValPrograms do\n\t\tif allData.label[valInds[k]] == 1 then\n\t\t\tnBenign = nBenign + 1\n\t\telse\n\t\t\tnMalware = nMalware + 1\n\t\tend\n\tend\n\tlocal positiveLabel = 1\n\tif nMalware < nBenign then\n\t\tpositiveLabel = 2\n\tend\n\n\tprint('Test Stats : nMalware ',nMalware, ' nBenign ',nBenign, ' positiveLabel ',positiveLabel)\n\n\t--local valBatch = torch.zeros(1,opt.programLen):type(dtype)\n\tlocal valLabel = torch.zeros(1):type(dtype)\n\n\tfor k = 1,nValPrograms do\n\t\tvalLabel[{1}] = allData.label[valInds[k]]\n\t\t--valBatch[{{1},{}}] = allData.program[valInds[k]]\n\n\t\tlocal currProgramPtr = allData.programStartPtrs[valInds[k]]\n\t\tlocal currProgramLen = allData.programLengths[valInds[k]]\n\n\t\tlocal netOutputProb = torch.zeros(1,2)\n\t\tlocal nDataAug = 10\n\t\tfor j = 1,nDataAug do\n\n\t\t\tlocal valBatch\n\t\t\tif currProgramLen > opt.maxSequenceLength then\n\t\t\t\tvalBatch = torch.zeros(1,opt.maxSequenceLength):type(dtype)\n\t\t\t\tlocal rndPtr = torch.floor(torch.rand(1)[1] * (currProgramLen - opt.maxSequenceLength - 1))\n\t\t\t\tvalBatch[{{1},{}}] = allData.program[{{currProgramPtr + rndPtr,currProgramPtr + rndPtr + opt.maxSequenceLength - 1}}]\n\t\t\telse\n\t\t\t\tvalBatch = torch.zeros(1,currProgramLen):type(dtype)\n\t\t\t\tvalBatch[{{1},{}}] = allData.program[{{currProgramPtr,currProgramPtr + currProgramLen - 1}}]\n\t\t\tend\n\n\t\t\t-- if currProgramLen > opt.maxSequenceLength then\n\t\t\t-- \tcurrProgramLen = opt.maxSequenceLength\n\t\t\t-- end\t\t\t\n\t\t\t-- local valBatch = torch.zeros(1,currProgramLen):type(dtype)\n\t\t\t-- valBatch[{{1},{}}] = allData.program[{{currProgramPtr,currProgramPtr + currProgramLen - 1}}]\n\n\t\t\tlocal netOutput = model:forward(valBatch)\n\t\t\tvalError = valError + criterion:forward(netOutput,valLabel)\t \t\t\n\t\t\tnetOutputProb = netOutputProb + nn.Exp():forward(netOutput:double())\n\t\tend\n\n\t\tlocal v,i = torch.max(netOutputProb,2)\n\t\tlocal pred = i[{1,1}]\n\t\tlocal gt = allData.label[valInds[k]]\n\t\tif pred == gt then\n\t\t\tcorrect = correct + 1;\t\t\t\t\t\t\n\t\tend\n\t\tconfmat[pred][gt] = confmat[pred][gt] + 1\n\tend\n\tvalError = valError / nValPrograms\n\n\tlocal tp = 0\n\tlocal fp = 0\n\tlocal fn = 0\n\n\tif positiveLabel == 1 then\n\t\ttp = confmat[1][1]\n\t\tfp = confmat[1][2]\n\t\tfn = confmat[2][1]\n\telse\n\t\ttp = confmat[2][2]\n\t\tfp = confmat[2][1]\n\t\tfn = confmat[1][2]\n\tend\n\n\tlocal testResult = {\n\t\t-- tp = tp,\n\t\t-- fp = fp,\n\t\t-- fn = fn,\n\t\tprec = tp / (tp + fp),\n\t\trecall = tp / (tp + fn),\n\t\tfscore = (2 * tp) / ((2 * tp) + fp + fn),\n\t\taccuracy = correct/nValPrograms,\n\t\ttestError = valError,\t\t\n\t}\n\n\tlocal time = timerTest:time().real\t\n\n\tmodel:training()\n\n\t-- clean up\n\tvalBatch = 
nil\n\tvalLabel = nil\n\tcollectgarbage()\n\n\treturn testResult,confmat,time\nend"
  },
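  {
    "path": "examples/ttaSketch.lua",
    "content": "-- A runnable sketch (added for illustration, not part of the original\n-- pipeline) of the test-time augmentation in testModel_dataAug.lua: sum the\n-- class probabilities over several random crops of a long program, then take\n-- the argmax. The stand-in model, vocabulary size and lengths are hypothetical.\n\nrequire 'nn'\n\ntorch.manualSeed(1)\n\n-- stand-in for the trained CNN: embed instruction ids, pool, classify;\n-- like the real network it ends in LogSoftMax, so it outputs log-probabilities\nlocal vocab, dims = 100, 8\nlocal model = nn.Sequential()\n\t:add(nn.LookupTable(vocab, dims))\n\t:add(nn.Mean(2))\n\t:add(nn.Linear(dims, 2))\n\t:add(nn.LogSoftMax())\n\nlocal program = torch.floor(torch.rand(500) * vocab) + 1 -- toy instruction ids\nlocal maxLen, nCrops = 64, 10\n\nlocal probs = torch.zeros(1, 2)\nfor j = 1, nCrops do\n\t-- random crop of maxLen instructions, as in testModel_dataAug.lua\n\tlocal ptr = 1 + math.floor(torch.uniform() * (program:size(1) - maxLen))\n\tlocal crop = program[{{ptr, ptr + maxLen - 1}}]:view(1, maxLen)\n\t-- exponentiate the log-probabilities and accumulate\n\tprobs = probs + torch.exp(model:forward(crop))\nend\n\nlocal _, pred = torch.max(probs, 2)\nprint('predicted class', pred[{1,1}])\n"
  },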
  {
    "path": "testWithPreTrainedNetwork.lua",
    "content": "-- Example of how to test using a pre-trained network\n-- Expects a directory containing two or more directories\n-- One directory contains all the malware\n-- The other directory contains all the benign software\n\n-- given a model that has already been trained\n-- and a directory containing programs - classify into malware / benign\n\nrequire 'nn'\nrequire 'optim'\nrequire 'nngraph'\nrequire 'cunn'\nrequire 'cutorch'\n\nrequire 'readMalwareData'\nrequire 'testModel'\n\ncmd = torch.CmdLine()\ncmd:option('-useCUDA',false,'use CUDA optimisation')\ncmd:option('-dataDir','./malwareDataset/','directory with the android programs to classify')\ncmd:option('-modelPath','./trainedNets/model.th7','path to model to use for testing')\nopt = cmd:parse(arg)\n\nprint('loading model from disk')\nsavedModel = torch.load(opt.modelPath)\nprint('loaded model')\nprint(savedModel.trainedModel)\n\n-- we need these values to correctly prepare the files when reading from disk\nopt.programLen = savedModel.opt.programLen\nopt.kernelLength = savedModel.opt.kernelLength\nopt.maxSequenceLength = savedModel.opt.maxSequenceLength\n\nprint('reading data from disk')\nallData = readMalwareData(opt.dataDir,savedModel.metaData)\n\nif opt.useCUDA then\n\tsavedModel.trainedModel:cuda()\nend\nsavedModel.trainedModel:evaluate()\n\nprint('starting test')\ntestResult,confmat,time = testModel(allData,savedModel.trainedModel,savedModel.metaData.testInds,0)\n\nprint('Results')\nprint('f-score   ',testResult.fscore)\nprint('precision ',testResult.prec)\nprint('recall    ',testResult.recall)\nprint('accuracy  ',testResult.accuracy)\nprint('--')\nprint('Confusion Matrix')\nprint(confmat)\nprint('--')\nprint('time to complete test (s) :',time)\n"
  },
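  {
    "path": "examples/saveLoadSketch.lua",
    "content": "-- A small sketch (added for illustration, not part of the original pipeline)\n-- of the experimentData table that trainModel.lua writes with torch.save and\n-- that testWithPreTrainedNetwork.lua reads back. The toy model, option values\n-- and the /tmp path are hypothetical.\n\nrequire 'nn'\n\nlocal toyModel = nn.Sequential():add(nn.Linear(8, 2)):add(nn.LogSoftMax())\n\n-- same field names as the table saved in trainModel.lua\nlocal experimentData = {\n\topt = { programLen = 8, kernelLength = 8, maxSequenceLength = 64 },\n\ttrainedModel = toyModel:double(),\n\tdataSplit = { posNegRatio = 0.25 },\n\tmetaData = { testInds = {1, 2, 3} },\n}\ntorch.save('/tmp/model_sketch.th7', experimentData)\n\n-- testWithPreTrainedNetwork.lua reads these fields back in the same way\nlocal savedModel = torch.load('/tmp/model_sketch.th7')\nassert(savedModel.opt.maxSequenceLength == 64)\nprint(savedModel.trainedModel)\n"
  },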
  {
    "path": "trainModel.lua",
    "content": "-- use the GPU to process the whole batch in parallel\nfunction trainModel(model,criterion,allData,trainInds,valInds,dataSplit,metaData)\n\n\tlocal parameters,gradParameters = model:getParameters()\n\tprint('Number of Model Parameters ',parameters:size(1))\n\n\tlocal dtype = 'torch.DoubleTensor'\n\tif opt.useCUDA then\n\t\tprint('Using CUDA')\n\t\tdtype = 'torch.CudaTensor'\n\telse\n\t\tprint('Running on CPU - CUDA disabled')\n\tend\n\n\tlocal config = {\n\t\tlearningRate = opt.learningRate,\n\t\tweightDecay = opt.weightDecay,\n\t}\n\n\tlocal bestfscore = 0\n\tlocal bestResult = torch.zeros(6)\n\n\tlocal timer = torch.Timer()\n\tlocal nPrograms = #trainInds\n\tprint('Number of training examples ',#trainInds)\n\tprint('Number of validation examples ',#valInds)\n\n\t-- pre-allocate memory for the batch\n\tprint('allocating batch memory')\n\t--local batchProg = torch.zeros(opt.batchSize,opt.programLen):type(dtype)\n\tlocal batchLabel = torch.zeros(opt.batchSize):type(dtype)\n\tprint('memory allocated')\n\t--print(#batchProg)\n\tif opt.useCUDA then\n\t\tlocal freeMemory, totalMemory = cutorch.getMemoryUsage(opt.gpuid)\n\t\tprint('CUDA memory usage')\n\t\tprint('free ',freeMemory,'total ',totalMemory,'ratio ',freeMemory/totalMemory)\n\tend\n\n\tlocal gradMultiplier = torch.zeros(2):type(dtype)\n\tif dataSplit.posNegRatio < 0.5 then\n\t\tgradMultiplier[1] = 1 - dataSplit.posNegRatio\n\t\tgradMultiplier[2] = dataSplit.posNegRatio\n\telse\n\t\tgradMultiplier[1] = dataSplit.posNegRatio\n\t\tgradMultiplier[2] = 1 - dataSplit.posNegRatio\n\tend\n\n\tfor e = 1,opt.nEpochs do\n\n\t\t--batchProg:mul(0)\n\t\tbatchLabel:mul(0)\n\n\t\tlocal nBatches = 0\n\t\tlocal nSamples = 0\n\t\tlocal epochError = 0\n\t\tlocal order = torch.randperm(nPrograms)\n\n\t\tfor i = 1,(nPrograms - (nPrograms%opt.batchSize)),opt.batchSize do\n\n\t\t\tnSamples = nSamples + opt.batchSize\n\t\t\tnBatches = nBatches + 1\n\n\t\t\t-- build the batch here\n\t\t\tfor k = 0,(opt.batchSize-1) do\n\t\t\t\t--batchProg[{{k+1},{}}] = allData.program[trainInds[order[i + k]]]\n\t\t\t\tbatchLabel[{k+1}] = allData.label[trainInds[order[i + k]]]\t\t\t\t\n\t\t\tend\n\n\t\t\tlocal currProgramPtr = allData.programStartPtrs[trainInds[order[i]]]\n\t\t\tlocal currProgramLen = allData.programLengths[trainInds[order[i]]]\n\n\t\t\tlocal batchProg\n\t\t\tif currProgramLen > opt.maxSequenceLength then\n\t\t\t\tbatchProg = torch.zeros(1,opt.maxSequenceLength):type(dtype)\t\n\t\t\t\tlocal rndPtr = 0\n\t\t\t\tif opt.dataAugTesting then\t\t\t\n\t\t\t\t\trndPtr = torch.floor(torch.rand(1)[1] * (currProgramLen - opt.maxSequenceLength - 1))\n\t\t\t\tend\n\t\t\t\tbatchProg[{{1},{}}] = allData.program[{{currProgramPtr + rndPtr,currProgramPtr + rndPtr + opt.maxSequenceLength - 1}}]\n\t\t\telse\n\t\t\t\tbatchProg = torch.zeros(1,currProgramLen):type(dtype)\n\t\t\t\tbatchProg[{{1},{}}] = allData.program[{{currProgramPtr,currProgramPtr + currProgramLen - 1}}]\n\t\t\tend\n\n\t\t\t--print(#batchProg)\n\t\t\t--print(currProgramPtr,currProgramLen)\n\n\t\t\tlocal feval = function(x)\n\n\t\t\t\tlocal batchError = 0\n\t\t\t\tif x ~= parameters then\n\t\t\t\t\tparameters:copy(x)\n\t\t\t\tend\n\t\t\t\tgradParameters:zero()\n\t\t\t\t\n\t\t\t\tlocal output = model:forward(batchProg)\n\t\t\t\tlocal netError = criterion:forward(output,batchLabel)\n\t\t\t\tbatchError = batchError + netError\n\t\t\t\tepochError = epochError + netError\n\t\t\t\tlocal gradCriterion = criterion:backward(output,batchLabel)\n\n\t\t\t\tif opt.weightClasses then\t\t\t\t\t\n\t\t\t\t\t-- seems 
to be a bug in Torch with ClassNLLCriterion as it should \n\t\t\t\t\t-- do this automatically ... \n\t\t\t\t\t-- manually weight the classes to deal with imbalanced pos / neg samples\n\t\t\t\t\tgradCriterion = gradCriterion:cmul(gradMultiplier)\n\t\t\t\tend\n\n\t\t\t\tmodel:backward(batchProg,gradCriterion)\t\n\n\t\t\t\treturn batchError,gradParameters\t\t\t\t\t\n\t\t\tend\n\t\t\tif opt.useRMSProp then\n\t\t\t\toptim.rmsprop(feval, parameters, config)\n         \telse\n         \t\toptim.sgd(feval, parameters, config)\n         \tend\n\n\t\t\tif isnan(epochError) then\n\t\t\t\tprint('training fail - Nan')\n\t\t\t\treturn 0\n\t\t\tend\t\t\n\t\t\tif epochError > 1e9 then\n\t\t\t\tprint('training fail - gradient exploded')\n\t\t\t\treturn 0\n\t\t\tend\n\t\tend\t\t\n\n\t\tif (e == 50 or e == 75) and opt.decayLearningRate then\n\t\t\tconfig.learningRate = config.learningRate * opt.weightDecayFrac\n\t\tend\n\n\t\t-- check the cross validation error\n\t    if e % opt.nSamplingEpochs == 0 or e == opt.nEpochs then  \n\n\t\t\tlocal time = timer:time().real\n\t\t\tprint('training time',string.format(\"%7.3f\",time),' nPrograms in training ',nSamples)\n\t\t\ttimer:reset()\n\n\t\t\tlocal nValPrograms = #valInds\n\t\t\tlocal nTrainPrograms = #trainInds\n\n\t\t\tprint('nValPrograms',nValPrograms,'nTrainingPrograms',nTrainPrograms)\n\n\t \t\tlocal valResult,valConfMat,valTime = testModel(allData,model,valInds,bestfscore)\n\n \t\t\tif valResult.fscore > bestfscore then\n\t\t\t\tbestfscore = valResult.fscore\n\t\t\t\tbestResult[1] = valResult.accuracy\n\t\t\t\tbestResult[2] = valResult.prec\n\t\t\t\tbestResult[3] = valResult.recall\n\t\t\t\tbestResult[4] = valResult.fscore\n\t\t\t\tbestResult[5] = epochError/nBatches\n\t\t\t\tbestResult[6] = valResult.testError\n\n\t\t\t\t-- save the best model so far and the data split etc\n\t\t\t\tif opt.saveModel then\n\t\t\t\t\tlocal experimentData = {\n\t\t\t\t\t\topt = opt,\n\t\t\t\t\t\ttrainedModel = model:double(),\n\t\t\t\t\t\tdataSplit = dataSplit,\n\t\t\t\t\t\tmetaData = metaData,\t\t\t\t\t\t\n\t\t\t\t\t}\n\t\t\t\t\ttorch.save('./trainedNets/' .. opt.saveFileName .. '.th7',experimentData)\n\t\t\t\t\tmodel:type(dtype)\n\t\t\t\t\tparameters, gradParameters = model:getParameters()\n    \t\t\t\tcollectgarbage()\n\t\t\t\tend\n\t\t\tend\n\n\t\t\tprint(e,'val   ',epochError/nBatches,valResult.testError,valResult.accuracy,valResult.prec,valResult.recall,valResult.fscore)\n\t\t\tprint('testing time - val  ',string.format(\"%7.3f\",valTime),' nValPrograms',nValPrograms)\n\t\t\tprint(valConfMat)\n\n\t \t\tlocal testResult,testConfMat,testTime = testModel(allData,model,trainInds,1)\n\t \t\tprint(e,'train ',epochError/nBatches,testResult.testError,testResult.accuracy,testResult.prec,testResult.recall,testResult.fscore)\n\t \t\tprint('testing time - train',string.format(\"%7.3f\",testTime),' nTrainingPrograms',nTrainPrograms)\n\t \t\tprint(testConfMat)\n\t \t\tprint('--')\n\n\t \t\tepochError = 0\n\t\t\tnSamples = 0\n\t\t\tnBatches = 0\n\t\t\tcollectgarbage()\n\t \tend\n\tend\n\tprint('Best Result ',bestResult[5],bestResult[6],bestResult[1],bestResult[2],bestResult[3],bestResult[4])\n\treturn model\nend"
  }
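  ,
  {
    "path": "examples/classWeightSketch.lua",
    "content": "-- A sketch (added for illustration, not part of the original pipeline) of the\n-- manual class weighting in trainModel.lua: scale the criterion gradient so\n-- the rarer class contributes more. The ratio and inputs are hypothetical.\n-- (Newer Torch versions also accept a weights tensor in nn.ClassNLLCriterion.)\n\nrequire 'nn'\n\ntorch.manualSeed(1)\n\nlocal criterion = nn.ClassNLLCriterion()\nlocal logProbs = nn.LogSoftMax():forward(torch.randn(1, 2))\nlocal target = torch.Tensor{2}\n\n-- fraction of class-1 examples in the training set, as computed in\n-- splitMalwareData.lua; the rarer class ends up with the larger multiplier\nlocal posNegRatio = 0.2\nlocal gradMultiplier = torch.Tensor(2)\nif posNegRatio < 0.5 then\n\tgradMultiplier[1] = 1 - posNegRatio\n\tgradMultiplier[2] = posNegRatio\nelse\n\tgradMultiplier[1] = posNegRatio\n\tgradMultiplier[2] = 1 - posNegRatio\nend\n\ncriterion:forward(logProbs, target)\nlocal gradCriterion = criterion:backward(logProbs, target)\n-- elementwise multiply, exactly as in trainModel.lua's feval\ngradCriterion = gradCriterion:cmul(gradMultiplier)\nprint(gradCriterion)\n"
  }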
]