Repository: niallmcl/Deep-Android-Malware-Detection Branch: master Commit: 7b9ed3def0ea Files: 17 Total size: 64.4 KB Directory structure: gitextract_s38v2mnj/ ├── .gitattributes ├── DetectMalware_CNN.lua ├── buildNetwork.lua ├── dataset/ │ ├── Benign/ │ │ └── example.opseq │ └── Malware/ │ └── example.opseq ├── opcodeseq_creator/ │ ├── DalvikOpcodes.txt │ ├── README.txt │ └── run_opcode_seq_creation.py ├── readMalwareData.lua ├── readme.md ├── results/ │ └── exampleOutput.txt ├── run.sh ├── splitMalwareData.lua ├── testModel.lua ├── testModel_dataAug.lua ├── testWithPreTrainedNetwork.lua └── trainModel.lua ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitattributes ================================================ # Auto detect text files and perform LF normalization * text=auto # Custom for Visual Studio *.cs diff=csharp # Standard to msysgit *.doc diff=astextplain *.DOC diff=astextplain *.docx diff=astextplain *.DOCX diff=astextplain *.dot diff=astextplain *.DOT diff=astextplain *.pdf diff=astextplain *.PDF diff=astextplain *.rtf diff=astextplain *.RTF diff=astextplain ================================================ FILE: DetectMalware_CNN.lua ================================================ require 'nn' require 'optim' require 'nngraph' require 'readMalwareData' require 'splitMalwareData' require 'buildNetwork' require 'trainModel' local cmd = torch.CmdLine() cmd:option('-seed',1,'seed the random number generator') cmd:option('-nEmbeddingDims',8,'number of dims in lookupTable for projecting instructions to network') cmd:option('-nConvFilters',64,'number of convolutional filters') cmd:option('-kernelLength',8,'seed the random number generator') cmd:option('-useHiddenLayer',true,'use hidden layer between the conv layers and classifier') cmd:option('-nHiddenNodes',16,'seed the random number generator') cmd:option('-weightClasses',false,'seed 
the random number generator') cmd:option('-nSamplingEpochs',10,'how often to sample the validation set - slow') cmd:option('-useDropout',false,'use dropout between the conv and hidden layers') cmd:option('-dropoutFrac',0.5,'dropout strength') cmd:option('-randomize',false,'randomly select the network parameters') cmd:option('-numDAShuffles',1,'number of function order shuffled versions of each program to keep') cmd:option('-useOneHot',false,'Represent programs using one-hot / otherwise use look-up-table') cmd:option('-learningRate',1e-3,'learning rate') cmd:option('-nEpochs',20,'training epochs') cmd:option('-nConvLayers',1,'number of extra convolutional layers') cmd:option('-nFCLayers',1,'number of extra convolutional layers') cmd:option('-batchSize',1,'size of batch used in training') cmd:option('-usemom',false,'use momentum during SGD optimisation') cmd:option('-useRMSProp',false,'use alternative optimizer rather than SGD') cmd:option('-useCUDA',false,'use CUDA optimisation') cmd:option('-gpuid',1,'which GPU to use') cmd:option('-usePreTrainedEmbedding',false,'initialise network with pre-trained embedding') cmd:option('-fixEmbedding',false,'prevent the embedding from being updated during learning') cmd:option('-programLen',8,'how many instructions to read') cmd:option('-debug',false,'enter debug mode') cmd:option('-dataAugProb',0.1,'probability of changing an instruction during data augmentation') cmd:option('-dataAugMethod',1,'1 - substitue the semantically most similar instruction, 2 - substitue random instruction') cmd:option('-trainingSetSize',2,'restrict the size of the training-set for evaluation purposes') cmd:option('-markFunctionEnds',false,'place a marker at the end of each method which may help classification work better') cmd:option('-saveModel',false,'save the model and data split') cmd:option('-saveFileName','detect_malware_cnn','filename to save the network') cmd:option('-decayLearningRate',false,'reduce learning rate by factor of 10 every so 
often') cmd:option('-weightDecay',0,'weight decay for L2 regularisation') cmd:option('-weightDecayFrac',0.1,'amount to reduce learning rate by, 0.1 or 0.5 are good values') -- try using dropout in various places of the network cmd:option('-useSpatialDropout',false,'drop instructions after the embedding layer') cmd:option('-useDropoutAfterEmbedding',false,'drop instructions after the embedding layer') cmd:option('-useDropoutAfterConv',false,'drop instructions after the embedding layer') cmd:option('-dataDir','./malwareDataset/','directory with the android programs to classify') cmd:option('-metaDataFile','./config/metaData.th7','file containing indicies of test/train/val split') cmd:option('-setupMode',false,'Only run in this mode once. Splits the data into the train/test sets. Saved into ./config/metaData.th7') cmd:option('-maxSequenceLength',1000000,'if program is longer than this length, crop sequence before passing to GPU') cmd:option('-dataAugTesting',false,'Use data augmentation during testing i.e average score over random samples from program') opt = cmd:parse(arg) if opt.useCUDA then require 'cunn' require 'cutorch' end torch.setdefaulttensortype("torch.DoubleTensor") torch.manualSeed(opt.seed) if opt.useCUDA then cutorch.setDevice(opt.gpuid) cutorch.manualSeedAll(opt.seed) end if opt.dataAugTesting then require 'testModel_dataAug' else require 'testModel' end print(opt) function isnan(z) return z ~= z end if opt.setupMode then -- READ-ME -- Given a new dataset we need to split into training / testing sets. -- We only run this chunk once to generate the new train / test split and save it to disk -- Later, when training the network, the training-set is randomly spit into train / validation for a given run -- This allows us to perform cross-validation on the training-set. After we have finished -- doing all development we can test a pre-trained network on the testing-set. 
------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------ -- read the data from the root dir -- decide which files should be included in the dataset print('reading dataset') local datasetInfo = readMalwareData_setup(opt.dataDir) print('splitting dataset into train/test sets') local trainPercentage = 0.9 -- use 90% for training and validation sets, and 10% for held-out testing-set local trainInds,testInds,posNegRatio = splitMalwareDataTrainTest(datasetInfo.label,trainPercentage,1 - trainPercentage) local metaData = { trainInds = trainInds, testInds = testInds, posNegRatio = posNegRatio, trainPercentage = trainPercentage, -- filesList = datasetInfo.filesList, family = datasetInfo.family, label = datasetInfo.label, benignFamily = datasetInfo.benignFamily, familyName = datasetInfo.familyName, } print('saving dataset metadata to file ',opt.metaDataFile) torch.save(opt.metaDataFile,metaData) -- ------------------------------------------------------------------------------------------ -- ------------------------------------------------------------------------------------------ else -- train the network and save version with lowest validation error to disk print(opt.metaDataFile) local metaData = torch.load(opt.metaDataFile) print('reading data from disk') local allData = readMalwareData(opt.dataDir,metaData) print('reading data from disk - complete') print('program lens ',torch.min(allData.programLengths),torch.max(allData.programLengths),torch.mean(allData.programLengths)) --take the saved split of train/test and further split the train-set into train/val print('splitting data into train/val/test sets') local testPercentage = (1 - metaData.trainPercentage) local valPercentage = (1 - metaData.trainPercentage) local trainPercentage = 1 - (testPercentage + valPercentage) print('t,v,t') print(testPercentage,valPercentage,trainPercentage) local 
trainInds,valInds,testInds,posNegRatio = splitMalwareDataTrainValTest(allData.label,metaData,trainPercentage) local dataSplit = { trainInds = trainInds, valInds = valInds, testInds = testInds, posNegRatio = posNegRatio, } print('new network') local model,criterion = buildNetwork(metaData.posNegRatio) print('starting training') local trainedModel = trainModel(model,criterion,allData,dataSplit.trainInds,dataSplit.valInds,dataSplit,metaData) end ================================================ FILE: buildNetwork.lua ================================================ function buildNetwork(posNegRatio) local nIndex = 256 local nOutputSamples = opt.nConvFilters -- number of conv-filters local kernelStride = 1 -- stride of kernel local nClasses = 2 local nHidden = opt.nHiddenNodes local model = nn.Sequential() -- project from one-hot to low-dim embedding space if opt.constrainEmbeddingNorm then model:add(nn.LookupTable(nIndex,opt.nEmbeddingDims,0,1,2)) else model:add(nn.LookupTable(nIndex,opt.nEmbeddingDims)) end -- we can add this here to prevent the network from updating the projection layer -- maybe the projection does not matter much? 
-- model:add(nn.GradBlocker()) -- 1st conv layer --model:add(nn.Reshape(1,opt.programLen,opt.nEmbeddingDims,true)) model:add(nn.Reshape(1,-1,opt.nEmbeddingDims,true)) if opt.useSpatialDropout then -- should be batchx1xproglenxembeddingdim model:add(nn.Reshape(opt.programLen,opt.nEmbeddingDims,1,true)) model:add(nn.SpatialDropout(opt.dropoutFrac)) model:add(nn.Reshape(1,opt.programLen,opt.nEmbeddingDims,true)) end --model:add(nn.SpatialZeroPadding(0,0,opt.kernelLength,opt.kernelLength)) if opt.useDropoutAfterEmbedding then model:add(nn.Dropout(opt.dropoutFrac)) end model:add(nn.SpatialConvolutionMM(1,opt.nConvFilters,opt.nEmbeddingDims,opt.kernelLength,kernelStride)) model:add(nn.ReLU()) -- if opt.nConvLayers > 1 then -- for layernum = 1,(opt.nConvLayers-1) do -- model:add(nn.Reshape(opt.nConvFilters,-1,true)) -- model:add(nn.Transpose({2,3})) -- --model:add(nn.TemporalMaxPooling(opt.kernelLength/2,opt.kernelLength/2)) -- model:add(nn.TemporalMaxPooling(2,2)) -- model:add(nn.Reshape(1,-1,opt.nConvFilters,true)) -- model:add(nn.SpatialZeroPadding(0,0,opt.kernelLength,opt.kernelLength)) -- model:add(nn.SpatialConvolutionMM(1,opt.nConvFilters,opt.nConvFilters,opt.kernelLength,kernelStride)) -- model:add(nn.ReLU()) -- end -- end model:add(nn.Reshape(opt.nConvFilters,-1,true)) if opt.useDropoutAfterConv then model:add(nn.Dropout(opt.dropoutFrac)) end model:add(nn.Max(3)) -- produces a vector of fixed size if opt.useHiddenLayer then model:add(nn.Linear(nOutputSamples,nHidden)) model:add(nn.ReLU()) model:add(nn.Linear(nHidden,nClasses)) else model:add(nn.Linear(nOutputSamples,nClasses)) end model:add(nn.LogSoftMax()) local criterion = 0 if opt.weightClasses then local weights = torch.zeros(nClasses) if posNegRatio < 0.5 then weights[1] = 1 - posNegRatio weights[2] = posNegRatio else weights[2] = 1 - posNegRatio weights[1] = posNegRatio end criterion = nn.ClassNLLCriterion(weights) else criterion = nn.ClassNLLCriterion() end if opt.useCUDA then model:cuda() criterion:cuda() 
end print(model) return model,criterion end ================================================ FILE: dataset/Benign/example.opseq ================================================ 5b700e 700e 1f6e0c ================================================ FILE: dataset/Malware/example.opseq ================================================ 5b700e 700e 1f6e0c ================================================ FILE: opcodeseq_creator/DalvikOpcodes.txt ================================================ nop 00 move 01 move/from16 02 move/16 03 move-wide 04 move-wide/from16 05 move-wide/16 06 move-object 07 move-object/from16 08 move-object/16 09 move-result 0a move-result-wide 0b move-result-object 0c move-exception 0d return-void 0e return 0f return-wide 10 return-object 11 const/4 12 const/16 13 const 14 const/high16 15 const-wide/16 16 const-wide/32 17 const-wide 18 const-wide/high16 19 const-string 1a const-string/jumbo 1b const-class 1c monitor-enter 1d monitor-exit 1e check-cast 1f instance-of 20 array-length 21 new-instance 22 new-array 23 filled-new-array 24 filled-new-array/range 25 fill-array-data 26 throw 27 goto 28 goto/16 29 goto/32 2a packed-switch 2b sparse-switch 2c cmpl-float 2d cmpg-float 2e cmpl-double 2f cmpg-double 30 cmp-long 31 if-eq 32 if-ne 33 if-lt 34 if-ge 35 if-gt 36 if-le 37 if-eqz 38 if-nez 39 if-ltz 3a if-gez 3b if-gtz 3c if-lez 3d aget 44 aget-wide 45 aget-object 46 aget-boolean 47 aget-byte 48 aget-char 49 aget-short 4a aput 4b aput-wide 4c aput-object 4d aput-boolean 4e aput-byte 4f aput-char 50 aput-short 51 iget 52 iget-wide 53 iget-object 54 iget-boolean 55 iget-byte 56 iget-char 57 iget-short 58 iput 59 iput-wide 5a iput-object 5b iput-boolean 5c iput-byte 5d iput-char 5e iput-short 5f sget 60 sget-wide 61 sget-object 62 sget-boolean 63 sget-byte 64 sget-char 65 sget-short 66 sput 67 sput-wide 68 sput-object 69 sput-boolean 6a sput-byte 6b sput-char 6c sput-short 6d invoke-virtual 6e invoke-super 6f invoke-direct 70 invoke-static 71 
invoke-interface 72 invoke-virtual/range 74 invoke-super/range 75 invoke-direct/range 76 invoke-static/range 77 invoke-interface/range 78 neg-int 7b not-int 7c neg-long 7d not-long 7e neg-float 7f neg-double 80 int-to-long 81 int-to-float 82 int-to-double 83 long-to-int 84 long-to-float 85 long-to-double 86 float-to-int 87 float-to-long 88 float-to-double 89 double-to-int 8a double-to-long 8b double-to-float 8c int-to-byte 8d int-to-char 8e int-to-short 8f add-int 90 sub-int 91 mul-int 92 div-int 93 rem-int 94 and-int 95 or-int 96 xor-int 97 shl-int 98 shr-int 99 ushr-int 9a add-long 9b sub-long 9c mul-long 9d div-long 9e rem-long 9f and-long a0 or-long a1 xor-long a2 shl-long a3 shr-long a4 ushr-long a5 add-float a6 sub-float a7 mul-float a8 div-float a9 rem-float aa add-double ab sub-double ac mul-double ad div-double ae rem-double af add-int/2addr b0 sub-int/2addr b1 mul-int/2addr b2 div-int/2addr b3 rem-int/2addr b4 and-int/2addr b5 or-int/2addr b6 xor-int/2addr b7 shl-int/2addr b8 shr-int/2addr b9 ushr-int/2addr ba add-long/2addr bb sub-long/2addr bc mul-long/2addr bd div-long/2addr be rem-long/2addr bf and-long/2addr c0 or-long/2addr c1 xor-long/2addr c2 shl-long/2addr c3 shr-long/2addr c4 ushr-long/2addr c5 add-float/2addr c6 sub-float/2addr c7 mul-float/2addr c8 div-float/2addr c9 rem-float/2addr ca add-double/2addr cb sub-double/2addr cc mul-double/2addr cd div-double/2addr ce rem-double/2addr cf add-int/lit16 d0 rsub-int d1 mul-int/lit16 d2 div-int/lit16 d3 rem-int/lit16 d4 and-int/lit16 d5 or-int/lit16 d6 xor-int/lit16 d7 add-int/lit8 d8 rsub-int/lit8 d9 mul-int/lit8 da div-int/lit8 db rem-int/lit8 dc and-int/lit8 dd or-int/lit8 de xor-int/lit8 df shl-int/lit8 e0 shr-int/lit8 e1 ushr-int/lit8 e2 ================================================ FILE: opcodeseq_creator/README.txt ================================================ The zip file contains: 1- A csv file containing Davlik opcodes 2- Sample directory structure containing -apk folder with one 
sample apk -tmp folder to hold the decoded apps -opseq folder to store the opcode sequece files 5- a python file run_opcode_seq_creation.py which takes the following arguments: Python script arguments: 1. Pathname to the directory containing apk file 2. Pathname of a temporary folder to keep the decoded files during the analysis 3. Pathname to an arbitrary directory to store the opcode sequence files 4. (optional) "incl" (without quotes) to include android support library files Note: default behavior is NOT to include those libraries Steps to run the script: 1) Apktool installation: -Make sure you have java install by running "java --version" you can install jre by running "apt-get install default-jre" -Follow the installation below to install apktool on Linux https://ibotpeaches.github.io/Apktool/install/ (folowing the instructions will place apktool files in /usr/local/bin) Note: Make sure that they are executable 2) Extract the zip file to a folder (extracted_folder) and run the following command: extracted_folder$ ./run_opcode_seq_creation.py ./apk ./tmp ./opseq incl ================================================ FILE: opcodeseq_creator/run_opcode_seq_creation.py ================================================ #!/usr/bin/env python import sys import os import shutil import datetime import logging sys.path.insert(1, os.path.join(sys.path[0], '../..')) def main(): if len(sys.argv) < 4: print "Usage", sys.argv[0], " " return # Reads the location of apk files that need decoding apk_file_directory = sys.argv[1] print "Reading apks from", apk_file_directory # Temporary folder to store the decoded app tmp_file_directory = sys.argv[2] print "Decoding folder", tmp_file_directory # Reads the location that we want to store our opseq files in opseq_file_directory = sys.argv[3] print "opseq folder", opseq_file_directory # Default is not to include smali files in android support libraries unless 4th parameter is provided include_libs = False if len(sys.argv) == 5: 
include_libs = ((sys.argv[4]) == "incl") print "Include Android support library smali files", include_libs print "Keep Android support libaray files: "+ str(include_libs) # Created a log file in the temp directory logging.basicConfig(filename=tmp_file_directory+'/opseq.log', level=logging.DEBUG) apks = [] for name in os.listdir(apk_file_directory): if os.path.isfile(os.path.join(apk_file_directory, name)): apks.append(name) logging.info('Total apks to be decoded {0}'.format(len(apks))) print "Total apks to be decoded",len(apks) num_local = 0 before=datetime.datetime.now() logging.info('Starting at: {0}'.format(before)) print "Starting at: {0}",before # Looping through all apks for apk_hash in apks: apk_file_location = os.path.join(apk_file_directory, apk_hash) num_local += 1 logging.info('Decoding apk: {0} apk #: {1}'.format(apk_file_location,num_local)) print "apk #: ", num_local print "apk location: ", apk_file_location decoded_location = None # Decoding apk into the tmp_file_directory decoded_location = decode_application(apk_file_location,tmp_file_directory,apk_hash,include_libs) if (not os.path.exists(decoded_location) or not os.listdir(decoded_location)): print "smali directory does not exist continue...." 
logging.error('NOT decoded directory: {0}'.format(apk_file_location)) print "NOT decoded directory:", apk_file_location continue result =create_opcode_seq(decoded_location,opseq_file_directory,apk_hash) if result: print "opseq file for apk #",num_local," is created" logging.info('opseq file for apk # {0} is created'.format(num_local)) else: logging.error('opseq file creation was not successful') print "opseq file creation was not successful" if os.path.exists(decoded_location): shutil.rmtree(decoded_location) after=datetime.datetime.now() print "Finished by: {0} ",after logging.info('Total time taken: {0}'.format(after-before)) print "Total time taken:", after-before def create_opcode_seq(decoded_dir,opseq_file_directory,apk_hash): # Returns true if creating opcode sequence file was successful, # searches all files in smali folder, # writes the coresponding opcode sequence to a .opseq file # and depending on the include_lib value, # it includes or excludes the support library files dalvik_opcodes = {} # Reading Davlik opcodes into a dictionary with open("DalvikOpcodes.txt") as fop: for linee in fop: (key, val) = linee.split() dalvik_opcodes[key] = val try: smali_dir = os.path.join(decoded_dir, "smali") opseq_fname=os.path.join(opseq_file_directory,apk_hash+".opseq") with open(opseq_fname, "a") as opseq_file: for root, dirs, fnames in os.walk(smali_dir): for fname in fnames: full_path = os.path.join(root, fname) opseq_file.write(get_opcode_seq(full_path, dalvik_opcodes)) opseq_file.close() return True except Exception as e: print "Exception occured during opseq creation of apk " ,apk_hash logging.error('Exception occured during opseq creation {0}'.format(str(e))) return False def get_opcode_seq(smali_fname, dalvik_opcodes): # Returns opcode sequence created from smali file 'smali_fname'. 
opcode_seq='' with open(smali_fname, mode="r") as bigfile: reader = bigfile.read() for i, part in enumerate(reader.split(".method")): add_newline = False if i!=0: method_part=part.split(".end method")[0] method_body = method_part.strip().split('\n') for line in method_body: if not line.strip().startswith('.') and not line.strip().startswith('#') and line.strip(): method_line = line.strip().split() if method_line[0] in dalvik_opcodes: add_newline = True opcode_seq += dalvik_opcodes[method_line[0]] if add_newline: opcode_seq += '\n' return opcode_seq def decode_application (apk_file_location,tmp_file_directory,hash,include_libs): # Decodes the apk at apk_file_location and # stores the decoded folders in tmp_file_directory out_file_location = os.path.join(tmp_file_directory, hash+ ".smali") try: apktool_decode_apk( apk_file_location, out_file_location,include_libs ) except ApkToolException: print "ApktoolException on decoding" logging.error("ApktoolException on decoding apk {0} ".format(apk_file_location)) pass return out_file_location def apktool_decode_apk(apk_file, out_file,include_libs): # Runs the apktool on a given apk apktooldir="/usr/local/bin" apktoolcmd = "{0}/apktool d -f {1} -o {2}".format(apktooldir, apk_file, out_file) res = os.system(apktoolcmd) if res != 0: raise ApkToolException(apktoolcmd) # Checks if we should keep the smali files belonging to the android support libraries if not include_libs: # Don't keep the smali/android folder android_folder = os.path.join(out_file, "smali/android") if os.path.exists(android_folder): rm_cmd = "rm -r %s" %(android_folder) os.system(rm_cmd) # Exception class to signify an Apktool Exception class ApkToolException(Exception): def __init__(self, command): self.command = command def __str__(self): return repr(self.command) if __name__ == '__main__': main() ================================================ FILE: readMalwareData.lua ================================================ -- read the malware data -- in setup 
mode -- - read all the files -- - decide if it should be in dataset -- - save a list of all files -- - -- read the whole program into a tensor function readfileFunc_tensor(filename) local contents = {} local f = torch.DiskFile(filename) f.quiet(f) local c = 'a' local count = 0 local func = {} for i = 1,opt.kernelLength do table.insert(func,1) count = count + 1 end local nFuncs = 0 while c ~= '' do --and count <= opt.programLen do -- potential bug... c = f.readString(f,'*l') local len = #c if len > 0 then for k = 1,len,2 do local num = string.sub(c,k,k+1) local n = tonumber(num,16) table.insert(func,n + 2) -- plus 2 so that our lowest symbol is '2' i.e. no_op is '2' count = count + 1 end nFuncs = nFuncs + 1 for i = 1,opt.kernelLength do table.insert(func,1) count = count + 1 end if opt.markFunctionEnds then table.insert(func,255) -- mark the end of each function end end end return torch.ByteTensor(func),nFuncs,count end -- get an upper bound on the number of malware files -- we will discard some files that are too short etc function upperBoundNumberOfFiles(rootDir) local numberOfFilesBound = 0 local malwareDirs = paths.dir(rootDir) for i = 1,#malwareDirs do local dir = malwareDirs[i] if dir ~= '.' and dir ~= '..' and paths.dirp(paths.concat(rootDir,dir)) then local malwarefiles = paths.dir(paths.concat(rootDir,dir)) -- number of files minus '.' and '..' 
numberOfFilesBound = numberOfFilesBound + #malwarefiles - 2 end end print('upper bound number of programs ',numberOfFilesBound) return numberOfFilesBound end -- this function gets called once when processing a new dataset -- we read all the programs and decide which ones should be included -- we just use an arbitrary rule that excludes very short programs -- the list of included programs is returned and saved for later use function readMalwareData_setup(rootDir) -- read all the directories -- check each file to see if it meets some criterion -- save list of filenames -- split into train / test sets local datasetInfo = { filesList = {}, family = {}, familyName = {}, label = {}, benignFamily = -1, } local programCount = 0 local familyNumber = 1 local malwareDirs = paths.dir(rootDir) for i = 1,#malwareDirs do local dir = malwareDirs[i] if dir ~= '.' and dir ~= '..' and paths.dirp(paths.concat(rootDir,dir)) then local malwarefiles = paths.dir(paths.concat(rootDir,dir)) for f = 1,#malwarefiles do local file = malwarefiles[f] if file ~= '.' and file ~= '..' then local contents,nFuncs = readfileFunc_tensor(paths.concat(rootDir,dir,malwarefiles[f])) if nFuncs >= 8 then -- a bit arbitrary... basically we want to ignore very short files programCount = programCount + 1 if programCount % 100 == 0 then print('programs read ',programCount,collectgarbage("count")) collectgarbage() end -- local includeFile = dir .. '/' .. 
malwarefiles[f] table.insert(datasetInfo.filesList,malwarefiles[f]) table.insert(datasetInfo.family,familyNumber) if dir == 'Benign' then datasetInfo.benignFamily = familyNumber table.insert(datasetInfo.label,1) else table.insert(datasetInfo.label,2) end end end end familyNumber = familyNumber + 1 table.insert(datasetInfo.familyName,dir) end end datasetInfo.family = torch.Tensor(datasetInfo.family) datasetInfo.label = torch.Tensor(datasetInfo.label) return datasetInfo end -- reads the malware data into a tensor -- We read all the opcodes into a single block of memory -- this is because each program can be a different length -- so storing in a 2D array will waste lots space -- We also can't use a Lua list as they are limited to 2GB -- -- allData.program - tensor (i.e. 1D array of bytes) containing all opcodes -- allData.programStartPtrs - pointers to start of each program in allData.program -- allData.programLengths - the length of each opcode sequence -- -- For example, to access program 3 do -- -- local ptr = allData.programStartPrts[3] -- local len = allData.programLengths[3] -- local prog = allData.program[{{ptr,ptr + len - 1}}] -- function readMalwareData(rootDir,metaData) print('reading files with version 2') local malwareDirs = paths.dir(rootDir) local upperBoundNumFiles = upperBoundNumberOfFiles(rootDir) local meanProgramLen = 50000 local allData = { program = torch.ones(upperBoundNumFiles * meanProgramLen):byte(), programStartPtrs = {}, programLengths = {}, } local programLen = {} local progPtr = 1 local programCount = 0 for i = 1,#metaData.filesList do local file = metaData.filesList[i] local familyDir = metaData.familyName[metaData.family[i]] local fullFile = paths.concat(rootDir,familyDir,file) if paths.filep(fullFile) then local contents = readfileFunc_tensor(fullFile) programCount = programCount + 1 if programCount % 100 == 0 then print('programs read ',programCount,collectgarbage("count")) collectgarbage() end local programLength = contents:size(1) -- 
if needed - increase the size of the storage if (progPtr + programLength - 1) > allData.program:size(1) then local currSize = allData.program:size(1) allData.program = allData.program:resize(currSize * 1.05) end table.insert(allData.programStartPtrs,progPtr) table.insert(allData.programLengths,programLength) -- insert the program into the memory allData.program[{{progPtr,progPtr + programLength - 1}}] = contents progPtr = progPtr + programLength else -- we should stop if this happens! error('ERROR : Missing file in dataset : ' .. fullFile) end end allData.program = allData.program:resize(progPtr) -- discard redundant rows allData.programStartPtrs = torch.Tensor(allData.programStartPtrs) allData.programLengths = torch.Tensor(allData.programLengths) allData.label = metaData.label return allData,programLen end ================================================ FILE: readme.md ================================================ # Deep Android Malware Detection This repository contains the code for the paper "Deep Android Malware Detection" ([pdf download](https://pure.qub.ac.uk/portal/files/122380314/sig_camera_ready.pdf)) | ([citation](http://dl.acm.org/citation.cfm?id=3029823)) We use a convolutional neural network (CNN) for android malware classification. Malware classification is performed based on static analysis of the raw opcode sequence from a disassembled android apk. Features indicative of malware are automatically learned from the raw opcode sequence thus removing the need for hand-engineered malware features. The network runs on GPU, allowing a very large number of files to be quickly scanned.

If you use this code please cite the following paper: ``` @inproceedings{mclaughlin2017codaspy, title = "Deep Android Malware Detection", author = "Niall McLaughlin and {Martinez del Rincon}, Jesus and BooJoong Kang and Suleiman Yerima and Paul Miller and Sakir Sezer and Yeganeh Safaeisemnani and Erik Trickel and Ziming Zhao and Adam Doupé and {Joon Ahn}, Gail", year = "2016", month = "12", booktitle = "Proceeding of the ACM Conference on Data and Applications Security and Privacy (CODASPY) 2017", publisher = "Association for Computing Machinery (ACM)", } ``` ## How to run the code Given an existing dataset directory (see below for details), the run.sh file will do the following: 1. Partition the dataset into training-set and held-out test-set 2. Train a neural network 3. Test the trained network on the test-set ## Prerequisites ### Dataset structure An example dataset with the required directory structure is provided in ./dataset The neural network requires opcode sequence files in the correct format, and a dataset directory with sub-directories containing malware and benign opcode sequence files. An example dataset directory is provided in ./dataset. The dataset directory must have the following structure: 1. There must be a directory called 'Benign', and contains non-malware opcode sequences files 2. The other directory can have any name ,and contains malware opcode sequence files ### Opcode Sequence files Opcode sequence files can be created from android APK files using the opcode sequence creation tool. This tool is located in ./opcodeseq_creator Please see the readme file in this directory for more information. ### Setup The neural network code is implemented using Torch. It is recommended to use a GPU to achieve acceleration of testing and training. 
For details on installing Torch please see http://torch.ch The opcode sequence creator tool requires APKTool https://ibotpeaches.github.io/Apktool/ ================================================ FILE: results/exampleOutput.txt ================================================ { useOneHot : false nConvLayers : 1 usemom : false dataAugProb : 0.1 batchSize : 1 nSamplingEpochs : 5 nFCLayers : 1 nEmbeddingDims : 8 kernelLength : 8 useDropoutAfterEmbedding : false numDAShuffles : 1 metaDataFile : "./config/metaData_small_test.th7" useSpatialDropout : false useHiddenLayer : true weightDecay : 0 nConvFilters : 64 dropoutFrac : 0.5 useRMSProp : false programLen : 8192 gpuid : 1 nHiddenNodes : 16 dataAugTesting : false dataDir : "/home/nmclaughlin02/Documents/cyberdata/malware/" seed : 1 maxSequenceLength : 8192 markFunctionEnds : false debug : false useDropoutAfterConv : false useDropout : false weightClasses : false saveFileName : "model_tmp" fixEmbedding : false trainingSetSize : 2 randomize : false weightDecayFrac : 0.1 useCUDA : true usePreTrainedEmbedding : false nEpochs : 75 decayLearningRate : false setupMode : true dataAugMethod : 1 saveModel : true learningRate : 0.001 } reading dataset programs read 100 5064.8681640625 programs read 200 5527.64453125 programs read 300 6014.9560546875 programs read 400 3911.5263671875 programs read 500 7196.4423828125 programs read 600 8327.2734375 programs read 700 10306.740234375 programs read 800 6509.2666015625 programs read 900 7206.0546875 programs read 1000 6228.0478515625 programs read 1100 6535.55078125 programs read 1200 6618.107421875 programs read 1300 4311.482421875 programs read 1400 8571.1533203125 programs read 1500 8814.9814453125 programs read 1600 6065.205078125 programs read 1700 5644.7822265625 programs read 1800 4623.0302734375 programs read 1900 6804.72265625 programs read 2000 4155.318359375 programs read 2100 3895.193359375 splitting dataset into train/test sets 846 1259 splitting dataset nPosTrain 761 
nNegTrain 1133 pos/neg 0.40179514255544 nPosTest 85 nNegTest 126 pos/neg 0.40284360189573 saving dataset metadata to file ./config/metaData_small_test.th7 { useOneHot : false nConvLayers : 1 usemom : false dataAugProb : 0.1 batchSize : 1 nSamplingEpochs : 5 nFCLayers : 1 nEmbeddingDims : 8 kernelLength : 8 useDropoutAfterEmbedding : false numDAShuffles : 1 metaDataFile : "./config/metaData_small_test.th7" useSpatialDropout : false useHiddenLayer : true weightDecay : 0 nConvFilters : 64 dropoutFrac : 0.5 useRMSProp : false programLen : 8192 gpuid : 1 nHiddenNodes : 16 dataAugTesting : false dataDir : "/home/nmclaughlin02/Documents/cyberdata/malware/" seed : 1 maxSequenceLength : 8192 markFunctionEnds : false debug : false useDropoutAfterConv : false useDropout : false weightClasses : false saveFileName : "model_tmp" fixEmbedding : false trainingSetSize : 2 randomize : false weightDecayFrac : 0.1 useCUDA : true usePreTrainedEmbedding : false nEpochs : 75 decayLearningRate : false setupMode : false dataAugMethod : 1 saveModel : true learningRate : 0.001 } ./config/metaData_small_test.th7 reading data from disk reading files with version 2 upper bound number of programs 2125 programs read 100 5121.4873046875 programs read 200 17134.224609375 programs read 300 8701.2607421875 programs read 400 7367.3076171875 programs read 500 7284.8056640625 programs read 600 8411.69921875 programs read 700 10391.264648438 programs read 800 6593.51953125 programs read 900 7580.0087890625 programs read 1000 6310.1181640625 programs read 1100 6471.4033203125 programs read 1200 5646.609375 programs read 1300 6056.0703125 programs read 1400 5953.17578125 programs read 1500 5674.7333984375 programs read 1600 6110.0087890625 programs read 1700 5555.7314453125 programs read 1800 6911.7939453125 programs read 1900 6337.0595703125 programs read 2000 8575.1025390625 programs read 2100 3910.28515625 reading data from disk - complete program lens 88 1083463 66743.73064133 splitting data into 
train/val/test sets t,v,t 0.1 0.1 0.8 nPrograms 2105 846 1259 761 1133 splitting dataset nPosTrain 676 nNegTrain 1007 nPosVal 85 nNegVal 126 nPosTest 85 nNegTest 126 train/val/test check 1 1 2105 2105 new network nn.Sequential { [input -> (1) -> (2) -> (3) -> (4) -> (5) -> (6) -> (7) -> (8) -> (9) -> (10) -> output] (1): nn.LookupTable (2): nn.Reshape(1x-1x8) (3): nn.SpatialConvolutionMM(1 -> 64, 8x8) (4): nn.ReLU (5): nn.Reshape(64x-1) (6): nn.Max (7): nn.Linear(64 -> 16) (8): nn.ReLU (9): nn.Linear(16 -> 2) (10): nn.LogSoftMax } starting training Number of Model Parameters 7282 Using CUDA Number of training examples 1683 Number of validation examples 211 allocating batch memory memory allocated CUDA memory usage free 3702857728 total 4294246400 ratio 0.86228347958794 training time 30.722 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 5 val 0.40789575868608 0.38057567440503 0.87677725118483 0.83908045977011 0.85882352941176 0.84883720930233 testing time - val 0.157 nValPrograms 211 73 14 12 112 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 5 train 0.40789575868608 0.33726407361753 0.89304812834225 0.88036809815951 0.8491124260355 0.8644578313253 testing time - train 1.179 nTrainingPrograms 1683 574 78 102 929 [torch.DoubleTensor of size 2x2] -- training time 32.220 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 10 val 0.16407678506945 0.17450008092898 0.9478672985782 0.92045454545455 0.95294117647059 0.9364161849711 testing time - val 0.149 nValPrograms 211 81 7 4 119 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 10 train 0.16407678506945 0.12949798309725 0.96375519904932 0.93741109530583 0.97485207100592 0.95576504713561 
testing time - train 1.157 nTrainingPrograms 1683 659 44 17 963 [torch.DoubleTensor of size 2x2] -- training time 32.102 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 15 val 0.084003127938711 0.13772791624069 0.9478672985782 0.94047619047619 0.92941176470588 0.93491124260355 testing time - val 0.149 nValPrograms 211 79 5 6 121 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 15 train 0.084003127938711 0.059644215920109 0.98871063576946 0.98811292719168 0.98372781065089 0.98591549295775 testing time - train 1.178 nTrainingPrograms 1683 665 8 11 999 [torch.DoubleTensor of size 2x2] -- training time 31.980 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 20 val 0.045003364997043 0.12233256954717 0.9478672985782 0.92045454545455 0.95294117647059 0.9364161849711 testing time - val 0.153 nValPrograms 211 81 7 4 119 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 20 train 0.045003364997043 0.031773728943268 0.99524658348188 0.99408284023669 0.99408284023669 0.99408284023669 testing time - train 1.180 nTrainingPrograms 1683 672 4 4 1003 [torch.DoubleTensor of size 2x2] -- training time 32.320 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 25 val 0.026457868370355 0.11776774217732 0.94312796208531 0.92941176470588 0.92941176470588 0.92941176470588 testing time - val 0.152 nValPrograms 211 79 6 6 120 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 25 train 0.026457868370355 0.015975192693891 0.99762329174094 0.99851632047478 0.99556213017751 0.99703703703704 testing time - train 
1.181 nTrainingPrograms 1683 673 1 3 1006 [torch.DoubleTensor of size 2x2] -- training time 31.997 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 30 val 0.019170329284611 0.11645289704698 0.94312796208531 0.92941176470588 0.92941176470588 0.92941176470588 testing time - val 0.154 nValPrograms 211 79 6 6 120 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 30 train 0.019170329284611 0.012826394978125 0.9982174688057 0.99704579025111 0.99852071005917 0.99778270509978 testing time - train 1.186 nTrainingPrograms 1683 675 2 1 1005 [torch.DoubleTensor of size 2x2] -- training time 32.005 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 35 val 0.021271346299619 0.12037801799051 0.9478672985782 0.92045454545455 0.95294117647059 0.9364161849711 testing time - val 0.156 nValPrograms 211 81 7 4 119 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 35 train 0.021271346299619 0.014115614049575 0.99702911467617 0.99410898379971 0.99852071005917 0.99630996309963 testing time - train 1.183 nTrainingPrograms 1683 675 4 1 1003 [torch.DoubleTensor of size 2x2] -- training time 31.991 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 40 val 0.017399860965448 0.12343576564608 0.93364928909953 0.92771084337349 0.90588235294118 0.91666666666667 testing time - val 0.154 nValPrograms 211 77 6 8 120 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 40 train 0.017399860965448 0.0081807981271228 0.9982174688057 0.99851851851852 0.99704142011834 0.99777942264989 testing time - train 1.189 
nTrainingPrograms 1683 674 1 2 1006 [torch.DoubleTensor of size 2x2] -- training time 31.986 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 45 val 0.012330326521177 0.13431220088525 0.94312796208531 0.92941176470588 0.92941176470588 0.92941176470588 testing time - val 0.152 nValPrograms 211 79 6 6 120 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 45 train 0.012330326521177 0.0074799091279046 0.9982174688057 0.99851851851852 0.99704142011834 0.99777942264989 testing time - train 1.184 nTrainingPrograms 1683 674 1 2 1006 [torch.DoubleTensor of size 2x2] -- training time 31.966 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 50 val 0.015055712885752 0.13219990125765 0.9478672985782 0.93023255813953 0.94117647058824 0.93567251461988 testing time - val 0.153 nValPrograms 211 80 6 5 120 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 50 train 0.015055712885752 0.0077246054254397 0.99762329174094 0.99704142011834 0.99704142011834 0.99704142011834 testing time - train 1.184 nTrainingPrograms 1683 674 2 2 1005 [torch.DoubleTensor of size 2x2] -- training time 31.966 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 55 val 0.012149294217428 0.12793228326816 0.93364928909953 0.92771084337349 0.90588235294118 0.91666666666667 testing time - val 0.154 nValPrograms 211 77 6 8 120 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 55 train 0.012149294217428 0.006638735727547 0.99881164587047 0.99852071005917 0.99852071005917 0.99852071005917 testing time - train 1.184 nTrainingPrograms 
1683 675 1 1 1006 [torch.DoubleTensor of size 2x2] -- training time 31.977 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 60 val 0.012296135696144 0.13272679530049 0.9478672985782 0.93023255813953 0.94117647058824 0.93567251461988 testing time - val 0.153 nValPrograms 211 80 6 5 120 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 60 train 0.012296135696144 0.0069904121273129 0.99881164587047 0.99852071005917 0.99852071005917 0.99852071005917 testing time - train 1.187 nTrainingPrograms 1683 675 1 1 1006 [torch.DoubleTensor of size 2x2] -- training time 31.982 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 65 val 0.013662806098403 0.13083266985925 0.93838862559242 0.92857142857143 0.91764705882353 0.92307692307692 testing time - val 0.154 nValPrograms 211 78 6 7 120 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 65 train 0.013662806098403 0.0061591699303294 0.99881164587047 0.99852071005917 0.99852071005917 0.99852071005917 testing time - train 1.187 nTrainingPrograms 1683 675 1 1 1006 [torch.DoubleTensor of size 2x2] -- training time 31.974 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 70 val 0.014961927119848 0.14605692608097 0.95734597156398 0.93181818181818 0.96470588235294 0.94797687861272 testing time - val 0.153 nValPrograms 211 82 6 3 120 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 70 train 0.014961927119848 0.0064191930150957 0.99881164587047 0.99852071005917 0.99852071005917 0.99852071005917 testing time - train 1.152 nTrainingPrograms 1683 675 1 1 1006 
[torch.DoubleTensor of size 2x2] -- training time 32.109 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 75 val 0.011070825411887 0.14359631668335 0.95260663507109 0.92134831460674 0.96470588235294 0.94252873563218 testing time - val 0.152 nValPrograms 211 82 7 3 119 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 75 train 0.011070825411887 0.0070228511495245 0.9982174688057 0.99704579025111 0.99852071005917 0.99778270509978 testing time - train 1.183 nTrainingPrograms 1683 675 2 1 1005 [torch.DoubleTensor of size 2x2] -- Best Result 0.014961927119848 0.14605692608097 0.95734597156398 0.93181818181818 0.96470588235294 0.94797687861272 ================================================ FILE: run.sh ================================================ # # First we must run the program with the -setupMode flag # The program should be run with this flag ONLY ONCE for each dataset # This reads the dataset, splits it into training and testing-sets # and saves the dataset metadata to a file # th DetectMalware_CNN.lua -useCUDA -gpuid 1 -programLen 8192 -nConvFilters 64 -nEpochs 75 -nSamplingEpochs 5 -nConvLayers 1 -seed 1 -learningRate 1e-3 -nEmbeddingDims 8 -kernelLength 8 -saveModel -saveFileName model_tmp -dataDir ./dataset/ -metaDataFile ./config/metaData_small_test.th7 -maxSequenceLength 8192 -setupMode # # # Below is the code to train a network # This uses the metadata file above so that we can reproduce our results # th DetectMalware_CNN.lua -useCUDA -gpuid 1 -programLen 8192 -nConvFilters 64 -nEpochs 75 -nSamplingEpochs 5 -nConvLayers 1 -seed 1 -learningRate 1e-3 -nEmbeddingDims 8 -kernelLength 8 -saveModel -saveFileName model_tmp -dataDir ./dataset/ -metaDataFile ./config/metaData_small_test.th7 -maxSequenceLength 8192 # # # Below is the code to test a pre-trained network # This should only be run ONCE 
after setting hyper-parameters using the validation-set
#
th testWithPreTrainedNetwork.lua -useCUDA -dataDir ./dataset -modelPath ./trainedNets/model_tmp.th7
================================================ FILE: splitMalwareData.lua ================================================
-- run this program once given a new dataset
-- saves the test / train split to disk
-- later sub-divide the train-set into train / validation sets

--- Split a labelled dataset into train and test index sets, stratified by class.
-- labels : 1D tensor of class labels; label 1 is collected as 'pos', anything
--          else as 'neg' (per testModel.lua, benign = 1 and malware = 2)
-- pTrain : fraction of each class assigned to the training set
-- pTest  : unused here; the test share is simply what remains after pTrain
-- Returns trainInds, testInds (Lua arrays of dataset indices) and posNegRatio
-- (fraction of 'pos' examples in the training set, used for class weighting).
-- return indicies for the training and testing sets
-- we will later sub-divide the training-set into train & val sets
function splitMalwareDataTrainTest(labels,pTrain,pTest)
	local pos = {}
	local neg = {}
	local nPrograms = labels:size(1)--allData.program:size(1)

	-- record the indices of all the pos/neg i.e. malware/benign examples
	for i = 1,nPrograms do
		if labels[i] == 1 then
			table.insert(pos,i)
		else
			table.insert(neg,i)
		end
	end

	print(#pos,#neg)

	-- record all the positive and negative indices
	-- shuffle the data
	-- take the first X% of pos and first x% of pos for training
	local trainInds = {}
	local testInds = {}

	local indsPos = torch.randperm(#pos)
	local indsNeg = torch.randperm(#neg)

	local nPosTrain = torch.floor(#pos * pTrain)
	local nNegTrain = torch.floor(#neg * pTrain)

	local nPosTest = #pos - nPosTrain
	local nNegTest = #neg - nNegTrain

	print('splitting dataset')
	print('nPosTrain',nPosTrain,'nNegTrain',nNegTrain,'pos/neg ',nPosTrain / (nPosTrain+nNegTrain))
	print('nPosTest',nPosTest,'nNegTest',nNegTest,'pos/neg ',nPosTest / (nPosTest+nNegTest))

	-- the first nPosTrain/nNegTrain shuffled indices of each class go to training ...
	for i = 1,nPosTrain do
		table.insert(trainInds,pos[indsPos[i]])
	end
	for i = 1,nNegTrain do
		table.insert(trainInds,neg[indsNeg[i]])
	end

	-- ... and the remainder go to the test set
	for i = 1,nPosTest do
		table.insert(testInds,pos[indsPos[nPosTrain + i]])
	end
	for i = 1,nNegTest do
		table.insert(testInds,neg[indsNeg[nNegTrain + i]])
	end

	-- ratio used to weight the classes during training. Deals with
	-- the unbalanced number of examples for each class
	local posNegRatio = nPosTrain / (nPosTrain + nNegTrain)

	return trainInds,testInds,posNegRatio
end

--- Split into train/val sets given a fixed, previously-saved test split.
-- metaData.trainInds / metaData.testInds come from splitMalwareDataTrainTest;
-- the saved test set is kept as-is and the saved training indices are
-- re-split into train and validation parts. A sanity check at the end
-- verifies the three sets partition the whole dataset with no overlap.
-- return indicies for the train,val and testing sets
function splitMalwareDataTrainValTest(labels,metaData)
	-- NOTE(review): the split fractions are hard-coded here, not read from opt
	local pTrain = 0.8
	local pVal = 0.1
	local pTest = 0.1

	local testInds = metaData.testInds

	local pos = {}
	local neg = {}
	local nPrograms = labels:size(1)--allData.program:size(1)

	print('nPrograms ',nPrograms)

	-- record the indices of all the pos/neg i.e. malware/benign examples
	for i = 1,nPrograms do
		if labels[i] == 1 then
			table.insert(pos,i)
		else
			table.insert(neg,i)
		end
	end

	local posTrainVal = {}
	local negTrainVal = {}

	-- record the indices of all the pos/neg i.e. malware/benign examples in the training-set
	for i = 1,#metaData.trainInds do
		if labels[metaData.trainInds[i]] == 1 then
			table.insert(posTrainVal,metaData.trainInds[i])
		else
			table.insert(negTrainVal,metaData.trainInds[i])
		end
	end

	print(#pos,#neg)
	print(#posTrainVal,#negTrainVal)

	-- record all the positive and negative indices
	-- shuffle the data
	-- take the first X% of pos and first x% of pos for training
	local trainInds = {}
	local valInds = {}

	local indsPos = torch.randperm(#posTrainVal)
	local indsNeg = torch.randperm(#negTrainVal)

	-- note: the train counts are fractions of the FULL class totals (#pos/#neg),
	-- so pTrain of everything goes to train and the rest of the saved
	-- training indices becomes the validation set
	local nPosTrain = torch.floor(#pos * pTrain)
	local nNegTrain = torch.floor(#neg * pTrain)

	local nPosVal = #posTrainVal - nPosTrain
	local nNegVal = #negTrainVal - nNegTrain

	local nPosTest = #pos - (nPosTrain + nPosVal)
	local nNegTest = #neg - (nNegTrain + nNegVal)

	print('splitting dataset')
	print('nPosTrain',nPosTrain,'nNegTrain',nNegTrain)
	print('nPosVal',nPosVal,'nNegVal',nNegVal)
	print('nPosTest',nPosTest,'nNegTest',nNegTest)

	for i = 1,nPosTrain do
		table.insert(trainInds,posTrainVal[indsPos[i]])
	end
	for i = 1,nNegTrain do
		table.insert(trainInds,negTrainVal[indsNeg[i]])
	end

	for i = 1,nPosVal do
		table.insert(valInds,posTrainVal[indsPos[nPosTrain + i]])
	end
	for i = 1,nNegVal do
		table.insert(valInds,negTrainVal[indsNeg[nNegTrain + i]])
	end

	-- for i = 1,nPosTest do
	-- 	table.insert(testInds,pos[indsPos[nPosTrain + nPosVal + i]])
	-- end
	-- for i = 1,nNegTest do
	-- 	table.insert(testInds,neg[indsNeg[nNegTrain + nNegVal + i]])
	-- end

	-- ratio used to weight the classes during training. Deals with
	-- the unbalanced number of examples for each class
	local posNegRatio = nPosTrain / (nPosTrain + nNegTrain)

	-- check there is no overlap between train / val / test sets
	local sanity = torch.zeros(nPrograms)
	for i = 1,#trainInds do
		sanity[trainInds[i]] = sanity[trainInds[i]] + 1
	end
	for i = 1,#testInds do
		sanity[testInds[i]] = sanity[testInds[i]] + 1
	end
	for i = 1,#valInds do
		sanity[valInds[i]] = sanity[valInds[i]] + 1
	end

	print('train/val/test check',torch.min(sanity),torch.max(sanity),torch.sum(sanity),nPrograms)

	if not (torch.min(sanity) == 1) or not (torch.max(sanity) == 1) or not (torch.sum(sanity) == nPrograms) then
		-- stop if this happens
		error('overlap between training / validation and testing sets')
	end

	return trainInds,valInds,testInds,posNegRatio
end
================================================ FILE: testModel.lua ================================================
--- Evaluate a trained model on the examples indexed by valInds.
-- Computes mean NLL error, accuracy, precision, recall, f-score and a 2x2
-- confusion matrix. The rarer class is treated as the positive class so the
-- f-score is meaningful on unbalanced data.
-- Relies on the global opt (command-line options) and the nn package.
function testModel(allData,model,valInds,epochError)
	print('testing corrected verison 2')

	local timerTest = torch.Timer()

	local dtype = 'torch.DoubleTensor'
	if opt.useCUDA then
		dtype = 'torch.CudaTensor'
	end

	local criterion = nn.ClassNLLCriterion():type(dtype)

	-- switch off train-only behaviour (e.g. dropout) for evaluation
	model:evaluate()

	-- push the validation data through the network
	local nValPrograms = #valInds
	local valError = 0
	local correct = 0
	local confmat = torch.zeros(2,2)
	local lens = torch.zeros(nValPrograms)

	-- We need to make sure the rare-class is regarded as positive
	-- This means the f-score etc will be correctly calculated
	-- When reading the data benign is labelled as 1 and malware as 2
	local nBenign = 0
	local nMalware = 0
	for k = 1,nValPrograms do
		if allData.label[valInds[k]] == 1 then
			nBenign = nBenign + 1
		else
			nMalware =
			nMalware + 1
		end
	end

	-- the rarer class becomes the positive class for precision/recall/f-score
	local positiveLabel = 1
	if nMalware < nBenign then
		positiveLabel = 2
	end

	print('Test Stats : nMalware ',nMalware, ' nBenign ',nBenign, ' positiveLabel ',positiveLabel)

	--local valBatch = torch.zeros(1,opt.programLen):type(dtype)
	local valLabel = torch.zeros(1):type(dtype)

	for k = 1,nValPrograms do
		valLabel[{1}] = allData.label[valInds[k]]
		--valBatch[{{1},{}}] = allData.program[valInds[k]]

		-- each program is stored as a slice of one long 1D opcode tensor
		local currProgramPtr = allData.programStartPtrs[valInds[k]]
		local currProgramLen = allData.programLengths[valInds[k]]

		-- truncate over-length programs to the maximum sequence length
		if currProgramLen > opt.maxSequenceLength then
			currProgramLen = opt.maxSequenceLength
		end

		local valBatch = torch.zeros(1,currProgramLen):type(dtype)
		valBatch[{{1},{}}] = allData.program[{{currProgramPtr,currProgramPtr + currProgramLen - 1}}]

		local netOutput = model:forward(valBatch)
		valError = valError + criterion:forward(netOutput,valLabel)

		-- convert log-probabilities back to probabilities, then take the argmax
		local netOutputProb = nn.Exp():forward(netOutput:double())

		local v,i = torch.max(netOutputProb,2)
		local pred = i[{1,1}]
		local gt = allData.label[valInds[k]]

		if pred == gt then correct = correct + 1; end
		confmat[pred][gt] = confmat[pred][gt] + 1
	end

	valError = valError / nValPrograms

	-- pick tp/fp/fn out of the confusion matrix according to the positive class
	local tp = 0
	local fp = 0
	local fn = 0
	if positiveLabel == 1 then
		tp = confmat[1][1]
		fp = confmat[1][2]
		fn = confmat[2][1]
	else
		tp = confmat[2][2]
		fp = confmat[2][1]
		fn = confmat[1][2]
	end

	local testResult = {
		-- tp = tp,
		-- fp = fp,
		-- fn = fn,
		prec = tp / (tp + fp),
		recall = tp / (tp + fn),
		fscore = (2 * tp) / ((2 * tp) + fp + fn),
		accuracy = correct/nValPrograms,
		testError = valError,
	}

	local time = timerTest:time().real

	-- restore train-mode behaviour before returning
	model:training()

	-- clean up
	valLabel = nil
	collectgarbage()

	return testResult,confmat,time
end
================================================ FILE: testModel_dataAug.lua ================================================
--- Variant of testModel with test-time augmentation: each over-length program
-- is evaluated on nDataAug random crops of maxSequenceLength opcodes and the
-- class probabilities are summed before taking the argmax.
-- Relies on the global opt (command-line options) and the nn package.
function testModel(allData,model,valInds,epochError)
	print('testing corrected verison 3')

	local timerTest = torch.Timer()

	local dtype = 'torch.DoubleTensor'
	if opt.useCUDA then
		dtype = 'torch.CudaTensor'
	end

	local criterion = nn.ClassNLLCriterion():type(dtype)

	model:evaluate()

	-- push the validation data through the network
	local nValPrograms = #valInds
	local valError = 0
	local correct = 0
	local confmat = torch.zeros(2,2)
	local lens = torch.zeros(nValPrograms)

	-- We need to make sure the rare-class is regarded as positive
	-- This means the f-score etc will be correctly calculated
	-- When reading the data benign is labelled as 1 and malware as 2
	local nBenign = 0
	local nMalware = 0
	for k = 1,nValPrograms do
		if allData.label[valInds[k]] == 1 then
			nBenign = nBenign + 1
		else
			nMalware = nMalware + 1
		end
	end

	local positiveLabel = 1
	if nMalware < nBenign then
		positiveLabel = 2
	end

	print('Test Stats : nMalware ',nMalware, ' nBenign ',nBenign, ' positiveLabel ',positiveLabel)

	--local valBatch = torch.zeros(1,opt.programLen):type(dtype)
	local valLabel = torch.zeros(1):type(dtype)

	for k = 1,nValPrograms do
		valLabel[{1}] = allData.label[valInds[k]]
		--valBatch[{{1},{}}] = allData.program[valInds[k]]

		local currProgramPtr = allData.programStartPtrs[valInds[k]]
		local currProgramLen = allData.programLengths[valInds[k]]

		-- accumulate class probabilities over the augmented views
		local netOutputProb = torch.zeros(1,2)

		local nDataAug = 10
		for j = 1,nDataAug do
			local valBatch
			if currProgramLen > opt.maxSequenceLength then
				-- over-length program: take a random crop of maxSequenceLength opcodes
				valBatch = torch.zeros(1,opt.maxSequenceLength):type(dtype)
				local rndPtr = torch.floor(torch.rand(1)[1] * (currProgramLen - opt.maxSequenceLength - 1))
				valBatch[{{1},{}}] = allData.program[{{currProgramPtr + rndPtr,currProgramPtr + rndPtr + opt.maxSequenceLength - 1}}]
			else
				-- short program: always used whole (the nDataAug passes are identical)
				valBatch = torch.zeros(1,currProgramLen):type(dtype)
				valBatch[{{1},{}}] = allData.program[{{currProgramPtr,currProgramPtr + currProgramLen - 1}}]
			end

			-- if currProgramLen > opt.maxSequenceLength then
			-- 	currProgramLen = opt.maxSequenceLength
			-- end
			-- local valBatch = torch.zeros(1,currProgramLen):type(dtype)
			-- valBatch[{{1},{}}] = allData.program[{{currProgramPtr,currProgramPtr + currProgramLen - 1}}]

			local netOutput = model:forward(valBatch)
			valError = valError + criterion:forward(netOutput,valLabel)
			netOutputProb = netOutputProb + nn.Exp():forward(netOutput:double())
		end

		local v,i = torch.max(netOutputProb,2)
		local pred = i[{1,1}]
		local gt = allData.label[valInds[k]]

		if pred == gt then correct = correct + 1; end
		confmat[pred][gt] = confmat[pred][gt] + 1
	end

	-- NOTE(review): valError sums nDataAug criterion evaluations per program
	-- but is only divided by nValPrograms, so it is ~nDataAug times the
	-- per-view loss - confirm before comparing against testModel.lua
	valError = valError / nValPrograms

	local tp = 0
	local fp = 0
	local fn = 0
	if positiveLabel == 1 then
		tp = confmat[1][1]
		fp = confmat[1][2]
		fn = confmat[2][1]
	else
		tp = confmat[2][2]
		fp = confmat[2][1]
		fn = confmat[1][2]
	end

	local testResult = {
		-- tp = tp,
		-- fp = fp,
		-- fn = fn,
		prec = tp / (tp + fp),
		recall = tp / (tp + fn),
		fscore = (2 * tp) / ((2 * tp) + fp + fn),
		accuracy = correct/nValPrograms,
		testError = valError,
	}

	local time = timerTest:time().real

	model:training()

	-- clean up
	valBatch = nil
	valLabel = nil
	collectgarbage()

	return testResult,confmat,time
end
================================================ FILE: testWithPreTrainedNetwork.lua ================================================
-- Example of how to test using a pre-trained network
-- Expects a directory containing two or more directories
-- One directory contains all the malware
-- The other directory contains all the benign software

-- given a model that has already been trained
-- and a directory containing programs - classify into malware / benign

require 'nn'
require 'optim'
require 'nngraph'
require 'cunn'
require 'cutorch'

require 'readMalwareData'
require 'testModel'

cmd = torch.CmdLine()
cmd:option('-useCUDA',false,'use CUDA optimisation')
cmd:option('-dataDir','./malwareDataset/','directory with the android programs to classify')
cmd:option('-modelPath','./trainedNets/model.th7','path to model to use for testing')
opt = cmd:parse(arg)

print('loading model from disk')
savedModel = torch.load(opt.modelPath)
print('loaded model')
print(savedModel.trainedModel)

-- we need these values to correctly prepare the files when reading from disk
opt.programLen = savedModel.opt.programLen
opt.kernelLength = savedModel.opt.kernelLength
opt.maxSequenceLength = savedModel.opt.maxSequenceLength

print('reading data from disk')
allData = readMalwareData(opt.dataDir,savedModel.metaData)

if opt.useCUDA then
	savedModel.trainedModel:cuda()
end

savedModel.trainedModel:evaluate()

print('starting test')

-- evaluate on the held-out test split that was saved alongside the model
testResult,confmat,time = testModel(allData,savedModel.trainedModel,savedModel.metaData.testInds,0)

print('Results')
print('f-score ',testResult.fscore)
print('precision ',testResult.prec)
print('recall ',testResult.recall)
print('accuracy ',testResult.accuracy)
print('--')
print('Confusion Matrix')
print(confmat)
print('--')
print('time to complete test (s) :',time)
================================================ FILE: trainModel.lua ================================================
-- use the GPU to process the whole batch in parallel

--- Train the model with SGD (or RMSProp), periodically evaluating on the
-- validation set and keeping the best model by validation f-score.
-- model/criterion : network and loss (used with class-index label targets)
-- allData          : dataset tensors (program, label, programStartPtrs, programLengths)
-- trainInds/valInds: index sets produced by the dataset split
-- dataSplit        : carries posNegRatio for optional class weighting
-- metaData         : saved alongside the model for reproducibility
-- Returns the trained model, or 0 if training diverged (NaN / exploding loss).
-- Relies on the globals opt, optim, testModel and isnan.
function trainModel(model,criterion,allData,trainInds,valInds,dataSplit,metaData)
	local parameters,gradParameters = model:getParameters()
	print('Number of Model Parameters ',parameters:size(1))

	local dtype = 'torch.DoubleTensor'
	if opt.useCUDA then
		print('Using CUDA')
		dtype = 'torch.CudaTensor'
	else
		print('Running on CPU - CUDA disabled')
	end

	local config = {
		learningRate = opt.learningRate,
		weightDecay = opt.weightDecay,
	}

	local bestfscore = 0
	local bestResult = torch.zeros(6)

	local timer = torch.Timer()

	local nPrograms = #trainInds

	print('Number of training examples ',#trainInds)
	print('Number of validation examples ',#valInds)

	-- pre-allocate memory for the batch
	print('allocating batch memory')
	--local batchProg = torch.zeros(opt.batchSize,opt.programLen):type(dtype)
	local batchLabel = torch.zeros(opt.batchSize):type(dtype)
	print('memory allocated')
	--print(#batchProg)

	if opt.useCUDA then
		local freeMemory, totalMemory = cutorch.getMemoryUsage(opt.gpuid)
		print('CUDA memory usage')
		print('free ',freeMemory,'total ',totalMemory,'ratio ',freeMemory/totalMemory)
	end

	-- per-class gradient weights: the rarer class gets the larger weight
	local gradMultiplier = torch.zeros(2):type(dtype)
	if dataSplit.posNegRatio < 0.5 then
		gradMultiplier[1] = 1 - dataSplit.posNegRatio
		gradMultiplier[2] = dataSplit.posNegRatio
	else
		gradMultiplier[1] = dataSplit.posNegRatio
		gradMultiplier[2] = 1 - dataSplit.posNegRatio
	end

	for e = 1,opt.nEpochs do
		--batchProg:mul(0)
		batchLabel:mul(0)

		local nBatches = 0
		local nSamples = 0
		local epochError = 0

		-- visit the training programs in a fresh random order each epoch
		local order = torch.randperm(nPrograms)

		for i = 1,(nPrograms - (nPrograms%opt.batchSize)),opt.batchSize do
			nSamples = nSamples + opt.batchSize
			nBatches = nBatches + 1

			-- build the batch here
			for k = 0,(opt.batchSize-1) do
				--batchProg[{{k+1},{}}] = allData.program[trainInds[order[i + k]]]
				batchLabel[{k+1}] = allData.label[trainInds[order[i + k]]]
			end

			-- NOTE(review): only the first program of the batch (order[i]) is
			-- loaded below into a 1-row tensor, so batches are effectively
			-- size 1 - confirm before running with opt.batchSize > 1
			local currProgramPtr = allData.programStartPtrs[trainInds[order[i]]]
			local currProgramLen = allData.programLengths[trainInds[order[i]]]

			local batchProg
			if currProgramLen > opt.maxSequenceLength then
				batchProg = torch.zeros(1,opt.maxSequenceLength):type(dtype)
				local rndPtr = 0
				if opt.dataAugTesting then
					-- data augmentation: random crop of the over-length program
					rndPtr = torch.floor(torch.rand(1)[1] * (currProgramLen - opt.maxSequenceLength - 1))
				end
				batchProg[{{1},{}}] = allData.program[{{currProgramPtr + rndPtr,currProgramPtr + rndPtr + opt.maxSequenceLength - 1}}]
			else
				batchProg = torch.zeros(1,currProgramLen):type(dtype)
				batchProg[{{1},{}}] = allData.program[{{currProgramPtr,currProgramPtr + currProgramLen - 1}}]
			end

			--print(#batchProg)
			--print(currProgramPtr,currProgramLen)

			-- closure evaluated by the optimizer: one forward/backward pass
			local feval = function(x)
				local batchError = 0

				if x ~= parameters then
					parameters:copy(x)
				end

				gradParameters:zero()

				local output = model:forward(batchProg)
				local netError = criterion:forward(output,batchLabel)

				batchError = batchError + netError
				epochError = epochError + netError

				local gradCriterion = criterion:backward(output,batchLabel)

				if opt.weightClasses then
					-- seems to be a bug in Torch with ClassNLLCriterion as it should
					-- do this automatically ...
					-- manually weight the classes to deal with imbalanced pos / neg samples
					gradCriterion = gradCriterion:cmul(gradMultiplier)
				end

				model:backward(batchProg,gradCriterion)

				return batchError,gradParameters
			end

			if opt.useRMSProp then
				optim.rmsprop(feval, parameters, config)
			else
				optim.sgd(feval, parameters, config)
			end

			-- abort early if training has diverged
			if isnan(epochError) then
				print('training fail - Nan')
				return 0
			end
			if epochError > 1e9 then
				print('training fail - gradient exploded')
				return 0
			end
		end

		-- optional step-wise learning-rate decay at fixed epochs
		if (e == 50 or e == 75) and opt.decayLearningRate then
			config.learningRate = config.learningRate * opt.weightDecayFrac
		end

		-- check the cross validation error
		if e % opt.nSamplingEpochs == 0 or e == opt.nEpochs then
			local time = timer:time().real
			print('training time',string.format("%7.3f",time),' nPrograms in training ',nSamples)
			timer:reset()

			local nValPrograms = #valInds
			local nTrainPrograms = #trainInds
			print('nValPrograms',nValPrograms,'nTrainingPrograms',nTrainPrograms)

			local valResult,valConfMat,valTime = testModel(allData,model,valInds,bestfscore)

			if valResult.fscore > bestfscore then
				bestfscore = valResult.fscore
				bestResult[1] = valResult.accuracy
				bestResult[2] = valResult.prec
				bestResult[3] = valResult.recall
				bestResult[4] = valResult.fscore
				bestResult[5] = epochError/nBatches
				bestResult[6] = valResult.testError

				-- save the best model so far and the data split etc
				if opt.saveModel then
					local experimentData = {
						opt = opt,
						trainedModel = model:double(),
						dataSplit = dataSplit,
						metaData = metaData,
					}
					torch.save('./trainedNets/' .. opt.saveFileName .. '.th7',experimentData)

					-- model:double() changed the model's type for saving;
					-- restore the training dtype and re-flatten the parameters
					model:type(dtype)
					parameters, gradParameters = model:getParameters()
					collectgarbage()
				end
			end

			print(e,'val ',epochError/nBatches,valResult.testError,valResult.accuracy,valResult.prec,valResult.recall,valResult.fscore)
			print('testing time - val ',string.format("%7.3f",valTime),' nValPrograms',nValPrograms)
			print(valConfMat)

			-- also report performance on the training set itself
			local testResult,testConfMat,testTime = testModel(allData,model,trainInds,1)
			print(e,'train ',epochError/nBatches,testResult.testError,testResult.accuracy,testResult.prec,testResult.recall,testResult.fscore)
			print('testing time - train',string.format("%7.3f",testTime),' nTrainingPrograms',nTrainPrograms)
			print(testConfMat)
			print('--')

			epochError = 0
			nSamples = 0
			nBatches = 0

			collectgarbage()
		end
	end

	print('Best Result ',bestResult[5],bestResult[6],bestResult[1],bestResult[2],bestResult[3],bestResult[4])

	return model
end