Repository: niallmcl/Deep-Android-Malware-Detection Branch: master Commit: 7b9ed3def0ea Files: 17 Total size: 64.4 KB Directory structure: gitextract_s38v2mnj/ ├── .gitattributes ├── DetectMalware_CNN.lua ├── buildNetwork.lua ├── dataset/ │ ├── Benign/ │ │ └── example.opseq │ └── Malware/ │ └── example.opseq ├── opcodeseq_creator/ │ ├── DalvikOpcodes.txt │ ├── README.txt │ └── run_opcode_seq_creation.py ├── readMalwareData.lua ├── readme.md ├── results/ │ └── exampleOutput.txt ├── run.sh ├── splitMalwareData.lua ├── testModel.lua ├── testModel_dataAug.lua ├── testWithPreTrainedNetwork.lua └── trainModel.lua ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitattributes ================================================ # Auto detect text files and perform LF normalization * text=auto # Custom for Visual Studio *.cs diff=csharp # Standard to msysgit *.doc diff=astextplain *.DOC diff=astextplain *.docx diff=astextplain *.DOCX diff=astextplain *.dot diff=astextplain *.DOT diff=astextplain *.pdf diff=astextplain *.PDF diff=astextplain *.rtf diff=astextplain *.RTF diff=astextplain ================================================ FILE: DetectMalware_CNN.lua ================================================ require 'nn' require 'optim' require 'nngraph' require 'readMalwareData' require 'splitMalwareData' require 'buildNetwork' require 'trainModel' local cmd = torch.CmdLine() cmd:option('-seed',1,'seed the random number generator') cmd:option('-nEmbeddingDims',8,'number of dims in lookupTable for projecting instructions to network') cmd:option('-nConvFilters',64,'number of convolutional filters') cmd:option('-kernelLength',8,'seed the random number generator') cmd:option('-useHiddenLayer',true,'use hidden layer between the conv layers and classifier') cmd:option('-nHiddenNodes',16,'seed the random number generator') cmd:option('-weightClasses',false,'seed 
the random number generator') cmd:option('-nSamplingEpochs',10,'how often to sample the validation set - slow') cmd:option('-useDropout',false,'use dropout between the conv and hidden layers') cmd:option('-dropoutFrac',0.5,'dropout strength') cmd:option('-randomize',false,'randomly select the network parameters') cmd:option('-numDAShuffles',1,'number of function order shuffled versions of each program to keep') cmd:option('-useOneHot',false,'Represent programs using one-hot / otherwise use look-up-table') cmd:option('-learningRate',1e-3,'learning rate') cmd:option('-nEpochs',20,'training epochs') cmd:option('-nConvLayers',1,'number of extra convolutional layers') cmd:option('-nFCLayers',1,'number of extra convolutional layers') cmd:option('-batchSize',1,'size of batch used in training') cmd:option('-usemom',false,'use momentum during SGD optimisation') cmd:option('-useRMSProp',false,'use alternative optimizer rather than SGD') cmd:option('-useCUDA',false,'use CUDA optimisation') cmd:option('-gpuid',1,'which GPU to use') cmd:option('-usePreTrainedEmbedding',false,'initialise network with pre-trained embedding') cmd:option('-fixEmbedding',false,'prevent the embedding from being updated during learning') cmd:option('-programLen',8,'how many instructions to read') cmd:option('-debug',false,'enter debug mode') cmd:option('-dataAugProb',0.1,'probability of changing an instruction during data augmentation') cmd:option('-dataAugMethod',1,'1 - substitue the semantically most similar instruction, 2 - substitue random instruction') cmd:option('-trainingSetSize',2,'restrict the size of the training-set for evaluation purposes') cmd:option('-markFunctionEnds',false,'place a marker at the end of each method which may help classification work better') cmd:option('-saveModel',false,'save the model and data split') cmd:option('-saveFileName','detect_malware_cnn','filename to save the network') cmd:option('-decayLearningRate',false,'reduce learning rate by factor of 10 every so 
often') cmd:option('-weightDecay',0,'weight decay for L2 regularisation') cmd:option('-weightDecayFrac',0.1,'amount to reduce learning rate by, 0.1 or 0.5 are good values') -- try using dropout in various places of the network cmd:option('-useSpatialDropout',false,'drop instructions after the embedding layer') cmd:option('-useDropoutAfterEmbedding',false,'drop instructions after the embedding layer') cmd:option('-useDropoutAfterConv',false,'drop instructions after the embedding layer') cmd:option('-dataDir','./malwareDataset/','directory with the android programs to classify') cmd:option('-metaDataFile','./config/metaData.th7','file containing indicies of test/train/val split') cmd:option('-setupMode',false,'Only run in this mode once. Splits the data into the train/test sets. Saved into ./config/metaData.th7') cmd:option('-maxSequenceLength',1000000,'if program is longer than this length, crop sequence before passing to GPU') cmd:option('-dataAugTesting',false,'Use data augmentation during testing i.e average score over random samples from program') opt = cmd:parse(arg) if opt.useCUDA then require 'cunn' require 'cutorch' end torch.setdefaulttensortype("torch.DoubleTensor") torch.manualSeed(opt.seed) if opt.useCUDA then cutorch.setDevice(opt.gpuid) cutorch.manualSeedAll(opt.seed) end if opt.dataAugTesting then require 'testModel_dataAug' else require 'testModel' end print(opt) function isnan(z) return z ~= z end if opt.setupMode then -- READ-ME -- Given a new dataset we need to split into training / testing sets. -- We only run this chunk once to generate the new train / test split and save it to disk -- Later, when training the network, the training-set is randomly spit into train / validation for a given run -- This allows us to perform cross-validation on the training-set. After we have finished -- doing all development we can test a pre-trained network on the testing-set. 
------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------ -- read the data from the root dir -- decide which files should be included in the dataset print('reading dataset') local datasetInfo = readMalwareData_setup(opt.dataDir) print('splitting dataset into train/test sets') local trainPercentage = 0.9 -- use 90% for training and validation sets, and 10% for held-out testing-set local trainInds,testInds,posNegRatio = splitMalwareDataTrainTest(datasetInfo.label,trainPercentage,1 - trainPercentage) local metaData = { trainInds = trainInds, testInds = testInds, posNegRatio = posNegRatio, trainPercentage = trainPercentage, -- filesList = datasetInfo.filesList, family = datasetInfo.family, label = datasetInfo.label, benignFamily = datasetInfo.benignFamily, familyName = datasetInfo.familyName, } print('saving dataset metadata to file ',opt.metaDataFile) torch.save(opt.metaDataFile,metaData) -- ------------------------------------------------------------------------------------------ -- ------------------------------------------------------------------------------------------ else -- train the network and save version with lowest validation error to disk print(opt.metaDataFile) local metaData = torch.load(opt.metaDataFile) print('reading data from disk') local allData = readMalwareData(opt.dataDir,metaData) print('reading data from disk - complete') print('program lens ',torch.min(allData.programLengths),torch.max(allData.programLengths),torch.mean(allData.programLengths)) --take the saved split of train/test and further split the train-set into train/val print('splitting data into train/val/test sets') local testPercentage = (1 - metaData.trainPercentage) local valPercentage = (1 - metaData.trainPercentage) local trainPercentage = 1 - (testPercentage + valPercentage) print('t,v,t') print(testPercentage,valPercentage,trainPercentage) local 
trainInds,valInds,testInds,posNegRatio = splitMalwareDataTrainValTest(allData.label,metaData,trainPercentage) local dataSplit = { trainInds = trainInds, valInds = valInds, testInds = testInds, posNegRatio = posNegRatio, } print('new network') local model,criterion = buildNetwork(metaData.posNegRatio) print('starting training') local trainedModel = trainModel(model,criterion,allData,dataSplit.trainInds,dataSplit.valInds,dataSplit,metaData) end ================================================ FILE: buildNetwork.lua ================================================ function buildNetwork(posNegRatio) local nIndex = 256 local nOutputSamples = opt.nConvFilters -- number of conv-filters local kernelStride = 1 -- stride of kernel local nClasses = 2 local nHidden = opt.nHiddenNodes local model = nn.Sequential() -- project from one-hot to low-dim embedding space if opt.constrainEmbeddingNorm then model:add(nn.LookupTable(nIndex,opt.nEmbeddingDims,0,1,2)) else model:add(nn.LookupTable(nIndex,opt.nEmbeddingDims)) end -- we can add this here to prevent the network from updating the projection layer -- maybe the projection does not matter much? 
-- model:add(nn.GradBlocker()) -- 1st conv layer --model:add(nn.Reshape(1,opt.programLen,opt.nEmbeddingDims,true)) model:add(nn.Reshape(1,-1,opt.nEmbeddingDims,true)) if opt.useSpatialDropout then -- should be batchx1xproglenxembeddingdim model:add(nn.Reshape(opt.programLen,opt.nEmbeddingDims,1,true)) model:add(nn.SpatialDropout(opt.dropoutFrac)) model:add(nn.Reshape(1,opt.programLen,opt.nEmbeddingDims,true)) end --model:add(nn.SpatialZeroPadding(0,0,opt.kernelLength,opt.kernelLength)) if opt.useDropoutAfterEmbedding then model:add(nn.Dropout(opt.dropoutFrac)) end model:add(nn.SpatialConvolutionMM(1,opt.nConvFilters,opt.nEmbeddingDims,opt.kernelLength,kernelStride)) model:add(nn.ReLU()) -- if opt.nConvLayers > 1 then -- for layernum = 1,(opt.nConvLayers-1) do -- model:add(nn.Reshape(opt.nConvFilters,-1,true)) -- model:add(nn.Transpose({2,3})) -- --model:add(nn.TemporalMaxPooling(opt.kernelLength/2,opt.kernelLength/2)) -- model:add(nn.TemporalMaxPooling(2,2)) -- model:add(nn.Reshape(1,-1,opt.nConvFilters,true)) -- model:add(nn.SpatialZeroPadding(0,0,opt.kernelLength,opt.kernelLength)) -- model:add(nn.SpatialConvolutionMM(1,opt.nConvFilters,opt.nConvFilters,opt.kernelLength,kernelStride)) -- model:add(nn.ReLU()) -- end -- end model:add(nn.Reshape(opt.nConvFilters,-1,true)) if opt.useDropoutAfterConv then model:add(nn.Dropout(opt.dropoutFrac)) end model:add(nn.Max(3)) -- produces a vector of fixed size if opt.useHiddenLayer then model:add(nn.Linear(nOutputSamples,nHidden)) model:add(nn.ReLU()) model:add(nn.Linear(nHidden,nClasses)) else model:add(nn.Linear(nOutputSamples,nClasses)) end model:add(nn.LogSoftMax()) local criterion = 0 if opt.weightClasses then local weights = torch.zeros(nClasses) if posNegRatio < 0.5 then weights[1] = 1 - posNegRatio weights[2] = posNegRatio else weights[2] = 1 - posNegRatio weights[1] = posNegRatio end criterion = nn.ClassNLLCriterion(weights) else criterion = nn.ClassNLLCriterion() end if opt.useCUDA then model:cuda() criterion:cuda() 
end print(model) return model,criterion end ================================================ FILE: dataset/Benign/example.opseq ================================================ 5b700e 700e 1f6e0c ================================================ FILE: dataset/Malware/example.opseq ================================================ 5b700e 700e 1f6e0c ================================================ FILE: opcodeseq_creator/DalvikOpcodes.txt ================================================ nop 00 move 01 move/from16 02 move/16 03 move-wide 04 move-wide/from16 05 move-wide/16 06 move-object 07 move-object/from16 08 move-object/16 09 move-result 0a move-result-wide 0b move-result-object 0c move-exception 0d return-void 0e return 0f return-wide 10 return-object 11 const/4 12 const/16 13 const 14 const/high16 15 const-wide/16 16 const-wide/32 17 const-wide 18 const-wide/high16 19 const-string 1a const-string/jumbo 1b const-class 1c monitor-enter 1d monitor-exit 1e check-cast 1f instance-of 20 array-length 21 new-instance 22 new-array 23 filled-new-array 24 filled-new-array/range 25 fill-array-data 26 throw 27 goto 28 goto/16 29 goto/32 2a packed-switch 2b sparse-switch 2c cmpl-float 2d cmpg-float 2e cmpl-double 2f cmpg-double 30 cmp-long 31 if-eq 32 if-ne 33 if-lt 34 if-ge 35 if-gt 36 if-le 37 if-eqz 38 if-nez 39 if-ltz 3a if-gez 3b if-gtz 3c if-lez 3d aget 44 aget-wide 45 aget-object 46 aget-boolean 47 aget-byte 48 aget-char 49 aget-short 4a aput 4b aput-wide 4c aput-object 4d aput-boolean 4e aput-byte 4f aput-char 50 aput-short 51 iget 52 iget-wide 53 iget-object 54 iget-boolean 55 iget-byte 56 iget-char 57 iget-short 58 iput 59 iput-wide 5a iput-object 5b iput-boolean 5c iput-byte 5d iput-char 5e iput-short 5f sget 60 sget-wide 61 sget-object 62 sget-boolean 63 sget-byte 64 sget-char 65 sget-short 66 sput 67 sput-wide 68 sput-object 69 sput-boolean 6a sput-byte 6b sput-char 6c sput-short 6d invoke-virtual 6e invoke-super 6f invoke-direct 70 invoke-static 71 
invoke-interface 72 invoke-virtual/range 74 invoke-super/range 75 invoke-direct/range 76 invoke-static/range 77 invoke-interface/range 78 neg-int 7b not-int 7c neg-long 7d not-long 7e neg-float 7f neg-double 80 int-to-long 81 int-to-float 82 int-to-double 83 long-to-int 84 long-to-float 85 long-to-double 86 float-to-int 87 float-to-long 88 float-to-double 89 double-to-int 8a double-to-long 8b double-to-float 8c int-to-byte 8d int-to-char 8e int-to-short 8f add-int 90 sub-int 91 mul-int 92 div-int 93 rem-int 94 and-int 95 or-int 96 xor-int 97 shl-int 98 shr-int 99 ushr-int 9a add-long 9b sub-long 9c mul-long 9d div-long 9e rem-long 9f and-long a0 or-long a1 xor-long a2 shl-long a3 shr-long a4 ushr-long a5 add-float a6 sub-float a7 mul-float a8 div-float a9 rem-float aa add-double ab sub-double ac mul-double ad div-double ae rem-double af add-int/2addr b0 sub-int/2addr b1 mul-int/2addr b2 div-int/2addr b3 rem-int/2addr b4 and-int/2addr b5 or-int/2addr b6 xor-int/2addr b7 shl-int/2addr b8 shr-int/2addr b9 ushr-int/2addr ba add-long/2addr bb sub-long/2addr bc mul-long/2addr bd div-long/2addr be rem-long/2addr bf and-long/2addr c0 or-long/2addr c1 xor-long/2addr c2 shl-long/2addr c3 shr-long/2addr c4 ushr-long/2addr c5 add-float/2addr c6 sub-float/2addr c7 mul-float/2addr c8 div-float/2addr c9 rem-float/2addr ca add-double/2addr cb sub-double/2addr cc mul-double/2addr cd div-double/2addr ce rem-double/2addr cf add-int/lit16 d0 rsub-int d1 mul-int/lit16 d2 div-int/lit16 d3 rem-int/lit16 d4 and-int/lit16 d5 or-int/lit16 d6 xor-int/lit16 d7 add-int/lit8 d8 rsub-int/lit8 d9 mul-int/lit8 da div-int/lit8 db rem-int/lit8 dc and-int/lit8 dd or-int/lit8 de xor-int/lit8 df shl-int/lit8 e0 shr-int/lit8 e1 ushr-int/lit8 e2 ================================================ FILE: opcodeseq_creator/README.txt ================================================ The zip file contains: 1- A csv file containing Davlik opcodes 2- Sample directory structure containing -apk folder with one 
sample apk -tmp folder to hold the decoded apps -opseq folder to store the opcode sequece files 5- a python file run_opcode_seq_creation.py which takes the following arguments: Python script arguments: 1. Pathname to the directory containing apk file 2. Pathname of a temporary folder to keep the decoded files during the analysis 3. Pathname to an arbitrary directory to store the opcode sequence files 4. (optional) "incl" (without quotes) to include android support library files Note: default behavior is NOT to include those libraries Steps to run the script: 1) Apktool installation: -Make sure you have java install by running "java --version" you can install jre by running "apt-get install default-jre" -Follow the installation below to install apktool on Linux https://ibotpeaches.github.io/Apktool/install/ (folowing the instructions will place apktool files in /usr/local/bin) Note: Make sure that they are executable 2) Extract the zip file to a folder (extracted_folder) and run the following command: extracted_folder$ ./run_opcode_seq_creation.py ./apk ./tmp ./opseq incl ================================================ FILE: opcodeseq_creator/run_opcode_seq_creation.py ================================================ #!/usr/bin/env python import sys import os import shutil import datetime import logging sys.path.insert(1, os.path.join(sys.path[0], '../..')) def main(): if len(sys.argv) < 4: print "Usage", sys.argv[0], " " return # Reads the location of apk files that need decoding apk_file_directory = sys.argv[1] print "Reading apks from", apk_file_directory # Temporary folder to store the decoded app tmp_file_directory = sys.argv[2] print "Decoding folder", tmp_file_directory # Reads the location that we want to store our opseq files in opseq_file_directory = sys.argv[3] print "opseq folder", opseq_file_directory # Default is not to include smali files in android support libraries unless 4th parameter is provided include_libs = False if len(sys.argv) == 5: 
include_libs = ((sys.argv[4]) == "incl") print "Include Android support library smali files", include_libs print "Keep Android support libaray files: "+ str(include_libs) # Created a log file in the temp directory logging.basicConfig(filename=tmp_file_directory+'/opseq.log', level=logging.DEBUG) apks = [] for name in os.listdir(apk_file_directory): if os.path.isfile(os.path.join(apk_file_directory, name)): apks.append(name) logging.info('Total apks to be decoded {0}'.format(len(apks))) print "Total apks to be decoded",len(apks) num_local = 0 before=datetime.datetime.now() logging.info('Starting at: {0}'.format(before)) print "Starting at: {0}",before # Looping through all apks for apk_hash in apks: apk_file_location = os.path.join(apk_file_directory, apk_hash) num_local += 1 logging.info('Decoding apk: {0} apk #: {1}'.format(apk_file_location,num_local)) print "apk #: ", num_local print "apk location: ", apk_file_location decoded_location = None # Decoding apk into the tmp_file_directory decoded_location = decode_application(apk_file_location,tmp_file_directory,apk_hash,include_libs) if (not os.path.exists(decoded_location) or not os.listdir(decoded_location)): print "smali directory does not exist continue...." 
logging.error('NOT decoded directory: {0}'.format(apk_file_location)) print "NOT decoded directory:", apk_file_location continue result =create_opcode_seq(decoded_location,opseq_file_directory,apk_hash) if result: print "opseq file for apk #",num_local," is created" logging.info('opseq file for apk # {0} is created'.format(num_local)) else: logging.error('opseq file creation was not successful') print "opseq file creation was not successful" if os.path.exists(decoded_location): shutil.rmtree(decoded_location) after=datetime.datetime.now() print "Finished by: {0} ",after logging.info('Total time taken: {0}'.format(after-before)) print "Total time taken:", after-before def create_opcode_seq(decoded_dir,opseq_file_directory,apk_hash): # Returns true if creating opcode sequence file was successful, # searches all files in smali folder, # writes the coresponding opcode sequence to a .opseq file # and depending on the include_lib value, # it includes or excludes the support library files dalvik_opcodes = {} # Reading Davlik opcodes into a dictionary with open("DalvikOpcodes.txt") as fop: for linee in fop: (key, val) = linee.split() dalvik_opcodes[key] = val try: smali_dir = os.path.join(decoded_dir, "smali") opseq_fname=os.path.join(opseq_file_directory,apk_hash+".opseq") with open(opseq_fname, "a") as opseq_file: for root, dirs, fnames in os.walk(smali_dir): for fname in fnames: full_path = os.path.join(root, fname) opseq_file.write(get_opcode_seq(full_path, dalvik_opcodes)) opseq_file.close() return True except Exception as e: print "Exception occured during opseq creation of apk " ,apk_hash logging.error('Exception occured during opseq creation {0}'.format(str(e))) return False def get_opcode_seq(smali_fname, dalvik_opcodes): # Returns opcode sequence created from smali file 'smali_fname'. 
opcode_seq='' with open(smali_fname, mode="r") as bigfile: reader = bigfile.read() for i, part in enumerate(reader.split(".method")): add_newline = False if i!=0: method_part=part.split(".end method")[0] method_body = method_part.strip().split('\n') for line in method_body: if not line.strip().startswith('.') and not line.strip().startswith('#') and line.strip(): method_line = line.strip().split() if method_line[0] in dalvik_opcodes: add_newline = True opcode_seq += dalvik_opcodes[method_line[0]] if add_newline: opcode_seq += '\n' return opcode_seq def decode_application (apk_file_location,tmp_file_directory,hash,include_libs): # Decodes the apk at apk_file_location and # stores the decoded folders in tmp_file_directory out_file_location = os.path.join(tmp_file_directory, hash+ ".smali") try: apktool_decode_apk( apk_file_location, out_file_location,include_libs ) except ApkToolException: print "ApktoolException on decoding" logging.error("ApktoolException on decoding apk {0} ".format(apk_file_location)) pass return out_file_location def apktool_decode_apk(apk_file, out_file,include_libs): # Runs the apktool on a given apk apktooldir="/usr/local/bin" apktoolcmd = "{0}/apktool d -f {1} -o {2}".format(apktooldir, apk_file, out_file) res = os.system(apktoolcmd) if res != 0: raise ApkToolException(apktoolcmd) # Checks if we should keep the smali files belonging to the android support libraries if not include_libs: # Don't keep the smali/android folder android_folder = os.path.join(out_file, "smali/android") if os.path.exists(android_folder): rm_cmd = "rm -r %s" %(android_folder) os.system(rm_cmd) # Exception class to signify an Apktool Exception class ApkToolException(Exception): def __init__(self, command): self.command = command def __str__(self): return repr(self.command) if __name__ == '__main__': main() ================================================ FILE: readMalwareData.lua ================================================ -- read the malware data -- in setup 
mode -- - read all the files -- - decide if it should be in dataset -- - save a list of all files -- - -- read the whole program into a tensor function readfileFunc_tensor(filename) local contents = {} local f = torch.DiskFile(filename) f.quiet(f) local c = 'a' local count = 0 local func = {} for i = 1,opt.kernelLength do table.insert(func,1) count = count + 1 end local nFuncs = 0 while c ~= '' do --and count <= opt.programLen do -- potential bug... c = f.readString(f,'*l') local len = #c if len > 0 then for k = 1,len,2 do local num = string.sub(c,k,k+1) local n = tonumber(num,16) table.insert(func,n + 2) -- plus 2 so that our lowest symbol is '2' i.e. no_op is '2' count = count + 1 end nFuncs = nFuncs + 1 for i = 1,opt.kernelLength do table.insert(func,1) count = count + 1 end if opt.markFunctionEnds then table.insert(func,255) -- mark the end of each function end end end return torch.ByteTensor(func),nFuncs,count end -- get an upper bound on the number of malware files -- we will discard some files that are too short etc function upperBoundNumberOfFiles(rootDir) local numberOfFilesBound = 0 local malwareDirs = paths.dir(rootDir) for i = 1,#malwareDirs do local dir = malwareDirs[i] if dir ~= '.' and dir ~= '..' and paths.dirp(paths.concat(rootDir,dir)) then local malwarefiles = paths.dir(paths.concat(rootDir,dir)) -- number of files minus '.' and '..' 
numberOfFilesBound = numberOfFilesBound + #malwarefiles - 2 end end print('upper bound number of programs ',numberOfFilesBound) return numberOfFilesBound end -- this function gets called once when processing a new dataset -- we read all the programs and decide which ones should be included -- we just use an arbitrary rule that excludes very short programs -- the list of included programs is returned and saved for later use function readMalwareData_setup(rootDir) -- read all the directories -- check each file to see if it meets some criterion -- save list of filenames -- split into train / test sets local datasetInfo = { filesList = {}, family = {}, familyName = {}, label = {}, benignFamily = -1, } local programCount = 0 local familyNumber = 1 local malwareDirs = paths.dir(rootDir) for i = 1,#malwareDirs do local dir = malwareDirs[i] if dir ~= '.' and dir ~= '..' and paths.dirp(paths.concat(rootDir,dir)) then local malwarefiles = paths.dir(paths.concat(rootDir,dir)) for f = 1,#malwarefiles do local file = malwarefiles[f] if file ~= '.' and file ~= '..' then local contents,nFuncs = readfileFunc_tensor(paths.concat(rootDir,dir,malwarefiles[f])) if nFuncs >= 8 then -- a bit arbitrary... basically we want to ignore very short files programCount = programCount + 1 if programCount % 100 == 0 then print('programs read ',programCount,collectgarbage("count")) collectgarbage() end -- local includeFile = dir .. '/' .. 
malwarefiles[f] table.insert(datasetInfo.filesList,malwarefiles[f]) table.insert(datasetInfo.family,familyNumber) if dir == 'Benign' then datasetInfo.benignFamily = familyNumber table.insert(datasetInfo.label,1) else table.insert(datasetInfo.label,2) end end end end familyNumber = familyNumber + 1 table.insert(datasetInfo.familyName,dir) end end datasetInfo.family = torch.Tensor(datasetInfo.family) datasetInfo.label = torch.Tensor(datasetInfo.label) return datasetInfo end -- reads the malware data into a tensor -- We read all the opcodes into a single block of memory -- this is because each program can be a different length -- so storing in a 2D array will waste lots space -- We also can't use a Lua list as they are limited to 2GB -- -- allData.program - tensor (i.e. 1D array of bytes) containing all opcodes -- allData.programStartPtrs - pointers to start of each program in allData.program -- allData.programLengths - the length of each opcode sequence -- -- For example, to access program 3 do -- -- local ptr = allData.programStartPrts[3] -- local len = allData.programLengths[3] -- local prog = allData.program[{{ptr,ptr + len - 1}}] -- function readMalwareData(rootDir,metaData) print('reading files with version 2') local malwareDirs = paths.dir(rootDir) local upperBoundNumFiles = upperBoundNumberOfFiles(rootDir) local meanProgramLen = 50000 local allData = { program = torch.ones(upperBoundNumFiles * meanProgramLen):byte(), programStartPtrs = {}, programLengths = {}, } local programLen = {} local progPtr = 1 local programCount = 0 for i = 1,#metaData.filesList do local file = metaData.filesList[i] local familyDir = metaData.familyName[metaData.family[i]] local fullFile = paths.concat(rootDir,familyDir,file) if paths.filep(fullFile) then local contents = readfileFunc_tensor(fullFile) programCount = programCount + 1 if programCount % 100 == 0 then print('programs read ',programCount,collectgarbage("count")) collectgarbage() end local programLength = contents:size(1) -- 
if needed - increase the size of the storage if (progPtr + programLength - 1) > allData.program:size(1) then local currSize = allData.program:size(1) allData.program = allData.program:resize(currSize * 1.05) end table.insert(allData.programStartPtrs,progPtr) table.insert(allData.programLengths,programLength) -- insert the program into the memory allData.program[{{progPtr,progPtr + programLength - 1}}] = contents progPtr = progPtr + programLength else -- we should stop if this happens! error('ERROR : Missing file in dataset : ' .. fullFile) end end allData.program = allData.program:resize(progPtr) -- discard redundant rows allData.programStartPtrs = torch.Tensor(allData.programStartPtrs) allData.programLengths = torch.Tensor(allData.programLengths) allData.label = metaData.label return allData,programLen end ================================================ FILE: readme.md ================================================ # Deep Android Malware Detection This repository contains the code for the paper "Deep Android Malware Detection" ([pdf download](https://pure.qub.ac.uk/portal/files/122380314/sig_camera_ready.pdf)) | ([citation](http://dl.acm.org/citation.cfm?id=3029823)) We use a convolutional neural network (CNN) for android malware classification. Malware classification is performed based on static analysis of the raw opcode sequence from a disassembled android apk. Features indicative of malware are automatically learned from the raw opcode sequence thus removing the need for hand-engineered malware features. The network runs on GPU, allowing a very large number of files to be quickly scanned.

If you use this code please cite the following paper: ``` @inproceedings{mclaughlin2017codaspy, title = "Deep Android Malware Detection", author = "Niall McLaughlin and {Martinez del Rincon}, Jesus and BooJoong Kang and Suleiman Yerima and Paul Miller and Sakir Sezer and Yeganeh Safaeisemnani and Erik Trickel and Ziming Zhao and Adam Doupé and {Joon Ahn}, Gail", year = "2016", month = "12", booktitle = "Proceeding of the ACM Conference on Data and Applications Security and Privacy (CODASPY) 2017", publisher = "Association for Computing Machinery (ACM)", } ``` ## How to run the code Given an existing dataset directory (see below for details), the run.sh file will do the following: 1. Partition the dataset into training-set and held-out test-set 2. Train a neural network 3. Test the trained network on the test-set ## Prerequisites ### Dataset structure An example dataset with the required directory structure is provided in ./dataset The neural network requires opcode sequence files in the correct format, and a dataset directory with sub-directories containing malware and benign opcode sequence files. An example dataset directory is provided in ./dataset. The dataset directory must have the following structure: 1. There must be a directory called 'Benign', and contains non-malware opcode sequences files 2. The other directory can have any name ,and contains malware opcode sequence files ### Opcode Sequence files Opcode sequence files can be created from android APK files using the opcode sequence creation tool. This tool is located in ./opcodeseq_creator Please see the readme file in this directory for more information. ### Setup The neural network code is implemented using Torch. It is recommended to use a GPU to achieve acceleration of testing and training. 
For details on installing Torch please see http://torch.ch The opcode sequence creator tool requires APKTool https://ibotpeaches.github.io/Apktool/ ================================================ FILE: results/exampleOutput.txt ================================================ { useOneHot : false nConvLayers : 1 usemom : false dataAugProb : 0.1 batchSize : 1 nSamplingEpochs : 5 nFCLayers : 1 nEmbeddingDims : 8 kernelLength : 8 useDropoutAfterEmbedding : false numDAShuffles : 1 metaDataFile : "./config/metaData_small_test.th7" useSpatialDropout : false useHiddenLayer : true weightDecay : 0 nConvFilters : 64 dropoutFrac : 0.5 useRMSProp : false programLen : 8192 gpuid : 1 nHiddenNodes : 16 dataAugTesting : false dataDir : "/home/nmclaughlin02/Documents/cyberdata/malware/" seed : 1 maxSequenceLength : 8192 markFunctionEnds : false debug : false useDropoutAfterConv : false useDropout : false weightClasses : false saveFileName : "model_tmp" fixEmbedding : false trainingSetSize : 2 randomize : false weightDecayFrac : 0.1 useCUDA : true usePreTrainedEmbedding : false nEpochs : 75 decayLearningRate : false setupMode : true dataAugMethod : 1 saveModel : true learningRate : 0.001 } reading dataset programs read 100 5064.8681640625 programs read 200 5527.64453125 programs read 300 6014.9560546875 programs read 400 3911.5263671875 programs read 500 7196.4423828125 programs read 600 8327.2734375 programs read 700 10306.740234375 programs read 800 6509.2666015625 programs read 900 7206.0546875 programs read 1000 6228.0478515625 programs read 1100 6535.55078125 programs read 1200 6618.107421875 programs read 1300 4311.482421875 programs read 1400 8571.1533203125 programs read 1500 8814.9814453125 programs read 1600 6065.205078125 programs read 1700 5644.7822265625 programs read 1800 4623.0302734375 programs read 1900 6804.72265625 programs read 2000 4155.318359375 programs read 2100 3895.193359375 splitting dataset into train/test sets 846 1259 splitting dataset nPosTrain 761 
nNegTrain 1133 pos/neg 0.40179514255544 nPosTest 85 nNegTest 126 pos/neg 0.40284360189573 saving dataset metadata to file ./config/metaData_small_test.th7 { useOneHot : false nConvLayers : 1 usemom : false dataAugProb : 0.1 batchSize : 1 nSamplingEpochs : 5 nFCLayers : 1 nEmbeddingDims : 8 kernelLength : 8 useDropoutAfterEmbedding : false numDAShuffles : 1 metaDataFile : "./config/metaData_small_test.th7" useSpatialDropout : false useHiddenLayer : true weightDecay : 0 nConvFilters : 64 dropoutFrac : 0.5 useRMSProp : false programLen : 8192 gpuid : 1 nHiddenNodes : 16 dataAugTesting : false dataDir : "/home/nmclaughlin02/Documents/cyberdata/malware/" seed : 1 maxSequenceLength : 8192 markFunctionEnds : false debug : false useDropoutAfterConv : false useDropout : false weightClasses : false saveFileName : "model_tmp" fixEmbedding : false trainingSetSize : 2 randomize : false weightDecayFrac : 0.1 useCUDA : true usePreTrainedEmbedding : false nEpochs : 75 decayLearningRate : false setupMode : false dataAugMethod : 1 saveModel : true learningRate : 0.001 } ./config/metaData_small_test.th7 reading data from disk reading files with version 2 upper bound number of programs 2125 programs read 100 5121.4873046875 programs read 200 17134.224609375 programs read 300 8701.2607421875 programs read 400 7367.3076171875 programs read 500 7284.8056640625 programs read 600 8411.69921875 programs read 700 10391.264648438 programs read 800 6593.51953125 programs read 900 7580.0087890625 programs read 1000 6310.1181640625 programs read 1100 6471.4033203125 programs read 1200 5646.609375 programs read 1300 6056.0703125 programs read 1400 5953.17578125 programs read 1500 5674.7333984375 programs read 1600 6110.0087890625 programs read 1700 5555.7314453125 programs read 1800 6911.7939453125 programs read 1900 6337.0595703125 programs read 2000 8575.1025390625 programs read 2100 3910.28515625 reading data from disk - complete program lens 88 1083463 66743.73064133 splitting data into 
train/val/test sets t,v,t 0.1 0.1 0.8 nPrograms 2105 846 1259 761 1133 splitting dataset nPosTrain 676 nNegTrain 1007 nPosVal 85 nNegVal 126 nPosTest 85 nNegTest 126 train/val/test check 1 1 2105 2105 new network nn.Sequential { [input -> (1) -> (2) -> (3) -> (4) -> (5) -> (6) -> (7) -> (8) -> (9) -> (10) -> output] (1): nn.LookupTable (2): nn.Reshape(1x-1x8) (3): nn.SpatialConvolutionMM(1 -> 64, 8x8) (4): nn.ReLU (5): nn.Reshape(64x-1) (6): nn.Max (7): nn.Linear(64 -> 16) (8): nn.ReLU (9): nn.Linear(16 -> 2) (10): nn.LogSoftMax } starting training Number of Model Parameters 7282 Using CUDA Number of training examples 1683 Number of validation examples 211 allocating batch memory memory allocated CUDA memory usage free 3702857728 total 4294246400 ratio 0.86228347958794 training time 30.722 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 5 val 0.40789575868608 0.38057567440503 0.87677725118483 0.83908045977011 0.85882352941176 0.84883720930233 testing time - val 0.157 nValPrograms 211 73 14 12 112 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 5 train 0.40789575868608 0.33726407361753 0.89304812834225 0.88036809815951 0.8491124260355 0.8644578313253 testing time - train 1.179 nTrainingPrograms 1683 574 78 102 929 [torch.DoubleTensor of size 2x2] -- training time 32.220 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 10 val 0.16407678506945 0.17450008092898 0.9478672985782 0.92045454545455 0.95294117647059 0.9364161849711 testing time - val 0.149 nValPrograms 211 81 7 4 119 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 10 train 0.16407678506945 0.12949798309725 0.96375519904932 0.93741109530583 0.97485207100592 0.95576504713561 
testing time - train 1.157 nTrainingPrograms 1683 659 44 17 963 [torch.DoubleTensor of size 2x2] -- training time 32.102 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 15 val 0.084003127938711 0.13772791624069 0.9478672985782 0.94047619047619 0.92941176470588 0.93491124260355 testing time - val 0.149 nValPrograms 211 79 5 6 121 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 15 train 0.084003127938711 0.059644215920109 0.98871063576946 0.98811292719168 0.98372781065089 0.98591549295775 testing time - train 1.178 nTrainingPrograms 1683 665 8 11 999 [torch.DoubleTensor of size 2x2] -- training time 31.980 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 20 val 0.045003364997043 0.12233256954717 0.9478672985782 0.92045454545455 0.95294117647059 0.9364161849711 testing time - val 0.153 nValPrograms 211 81 7 4 119 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 20 train 0.045003364997043 0.031773728943268 0.99524658348188 0.99408284023669 0.99408284023669 0.99408284023669 testing time - train 1.180 nTrainingPrograms 1683 672 4 4 1003 [torch.DoubleTensor of size 2x2] -- training time 32.320 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 25 val 0.026457868370355 0.11776774217732 0.94312796208531 0.92941176470588 0.92941176470588 0.92941176470588 testing time - val 0.152 nValPrograms 211 79 6 6 120 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 25 train 0.026457868370355 0.015975192693891 0.99762329174094 0.99851632047478 0.99556213017751 0.99703703703704 testing time - train 
1.181 nTrainingPrograms 1683 673 1 3 1006 [torch.DoubleTensor of size 2x2] -- training time 31.997 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 30 val 0.019170329284611 0.11645289704698 0.94312796208531 0.92941176470588 0.92941176470588 0.92941176470588 testing time - val 0.154 nValPrograms 211 79 6 6 120 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 30 train 0.019170329284611 0.012826394978125 0.9982174688057 0.99704579025111 0.99852071005917 0.99778270509978 testing time - train 1.186 nTrainingPrograms 1683 675 2 1 1005 [torch.DoubleTensor of size 2x2] -- training time 32.005 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 35 val 0.021271346299619 0.12037801799051 0.9478672985782 0.92045454545455 0.95294117647059 0.9364161849711 testing time - val 0.156 nValPrograms 211 81 7 4 119 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 35 train 0.021271346299619 0.014115614049575 0.99702911467617 0.99410898379971 0.99852071005917 0.99630996309963 testing time - train 1.183 nTrainingPrograms 1683 675 4 1 1003 [torch.DoubleTensor of size 2x2] -- training time 31.991 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 40 val 0.017399860965448 0.12343576564608 0.93364928909953 0.92771084337349 0.90588235294118 0.91666666666667 testing time - val 0.154 nValPrograms 211 77 6 8 120 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 40 train 0.017399860965448 0.0081807981271228 0.9982174688057 0.99851851851852 0.99704142011834 0.99777942264989 testing time - train 1.189 
nTrainingPrograms 1683 674 1 2 1006 [torch.DoubleTensor of size 2x2] -- training time 31.986 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 45 val 0.012330326521177 0.13431220088525 0.94312796208531 0.92941176470588 0.92941176470588 0.92941176470588 testing time - val 0.152 nValPrograms 211 79 6 6 120 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 45 train 0.012330326521177 0.0074799091279046 0.9982174688057 0.99851851851852 0.99704142011834 0.99777942264989 testing time - train 1.184 nTrainingPrograms 1683 674 1 2 1006 [torch.DoubleTensor of size 2x2] -- training time 31.966 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 50 val 0.015055712885752 0.13219990125765 0.9478672985782 0.93023255813953 0.94117647058824 0.93567251461988 testing time - val 0.153 nValPrograms 211 80 6 5 120 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 50 train 0.015055712885752 0.0077246054254397 0.99762329174094 0.99704142011834 0.99704142011834 0.99704142011834 testing time - train 1.184 nTrainingPrograms 1683 674 2 2 1005 [torch.DoubleTensor of size 2x2] -- training time 31.966 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 55 val 0.012149294217428 0.12793228326816 0.93364928909953 0.92771084337349 0.90588235294118 0.91666666666667 testing time - val 0.154 nValPrograms 211 77 6 8 120 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 55 train 0.012149294217428 0.006638735727547 0.99881164587047 0.99852071005917 0.99852071005917 0.99852071005917 testing time - train 1.184 nTrainingPrograms 
1683 675 1 1 1006 [torch.DoubleTensor of size 2x2] -- training time 31.977 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 60 val 0.012296135696144 0.13272679530049 0.9478672985782 0.93023255813953 0.94117647058824 0.93567251461988 testing time - val 0.153 nValPrograms 211 80 6 5 120 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 60 train 0.012296135696144 0.0069904121273129 0.99881164587047 0.99852071005917 0.99852071005917 0.99852071005917 testing time - train 1.187 nTrainingPrograms 1683 675 1 1 1006 [torch.DoubleTensor of size 2x2] -- training time 31.982 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 65 val 0.013662806098403 0.13083266985925 0.93838862559242 0.92857142857143 0.91764705882353 0.92307692307692 testing time - val 0.154 nValPrograms 211 78 6 7 120 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 65 train 0.013662806098403 0.0061591699303294 0.99881164587047 0.99852071005917 0.99852071005917 0.99852071005917 testing time - train 1.187 nTrainingPrograms 1683 675 1 1 1006 [torch.DoubleTensor of size 2x2] -- training time 31.974 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 70 val 0.014961927119848 0.14605692608097 0.95734597156398 0.93181818181818 0.96470588235294 0.94797687861272 testing time - val 0.153 nValPrograms 211 82 6 3 120 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 70 train 0.014961927119848 0.0064191930150957 0.99881164587047 0.99852071005917 0.99852071005917 0.99852071005917 testing time - train 1.152 nTrainingPrograms 1683 675 1 1 1006 
[torch.DoubleTensor of size 2x2] -- training time 32.109 nPrograms in training 1683 nValPrograms 211 nTrainingPrograms 1683 testing corrected verison 2 Test Stats : nMalware 126 nBenign 85 positiveLabel 1 75 val 0.011070825411887 0.14359631668335 0.95260663507109 0.92134831460674 0.96470588235294 0.94252873563218 testing time - val 0.152 nValPrograms 211 82 7 3 119 [torch.DoubleTensor of size 2x2] testing corrected verison 2 Test Stats : nMalware 1007 nBenign 676 positiveLabel 1 75 train 0.011070825411887 0.0070228511495245 0.9982174688057 0.99704579025111 0.99852071005917 0.99778270509978 testing time - train 1.183 nTrainingPrograms 1683 675 2 1 1005 [torch.DoubleTensor of size 2x2] -- Best Result 0.014961927119848 0.14605692608097 0.95734597156398 0.93181818181818 0.96470588235294 0.94797687861272 ================================================ FILE: run.sh ================================================ # # First we must run the program with the -setupMode flag # The program should be run with this flag ONLY ONCE for each dataset # This reads the dataset, splits it into training and testing-sets # and saves the dataset metadata to a file # th DetectMalware_CNN.lua -useCUDA -gpuid 1 -programLen 8192 -nConvFilters 64 -nEpochs 75 -nSamplingEpochs 5 -nConvLayers 1 -seed 1 -learningRate 1e-3 -nEmbeddingDims 8 -kernelLength 8 -saveModel -saveFileName model_tmp -dataDir ./dataset/ -metaDataFile ./config/metaData_small_test.th7 -maxSequenceLength 8192 -setupMode # # # Below is the code to train a network # This uses the metadata file above so that we can reproduce our results # th DetectMalware_CNN.lua -useCUDA -gpuid 1 -programLen 8192 -nConvFilters 64 -nEpochs 75 -nSamplingEpochs 5 -nConvLayers 1 -seed 1 -learningRate 1e-3 -nEmbeddingDims 8 -kernelLength 8 -saveModel -saveFileName model_tmp -dataDir ./dataset/ -metaDataFile ./config/metaData_small_test.th7 -maxSequenceLength 8192 # # # Below is the code to test a pre-trained network # This should only be run ONCE 
after setting hyper-parameters using the validation-set
#
th testWithPreTrainedNetwork.lua -useCUDA -dataDir ./dataset -modelPath ./trainedNets/model_tmp.th7
================================================ FILE: splitMalwareData.lua ================================================
-- run this program once given a new dataset
-- saves the test / train split to disk
-- later sub-divide the train-set into train / validation sets

--- Split a labelled dataset into train and test index sets, stratified by class.
-- labels : 1D tensor of class labels; label 1 is collected as 'pos', anything
--          else as 'neg' (per testModel.lua, benign = 1 and malware = 2)
-- pTrain : fraction of each class assigned to the training set
-- pTest  : unused here; the test share is simply what remains after pTrain
-- Returns trainInds, testInds (Lua arrays of dataset indices) and posNegRatio
-- (fraction of 'pos' examples in the training set, used for class weighting).
-- return indicies for the training and testing sets
-- we will later sub-divide the training-set into train & val sets
function splitMalwareDataTrainTest(labels,pTrain,pTest)
	local pos = {}
	local neg = {}
	local nPrograms = labels:size(1)--allData.program:size(1)

	-- record the indices of all the pos/neg i.e. malware/benign examples
	for i = 1,nPrograms do
		if labels[i] == 1 then
			table.insert(pos,i)
		else
			table.insert(neg,i)
		end
	end

	print(#pos,#neg)

	-- record all the positive and negative indices
	-- shuffle the data
	-- take the first X% of pos and first x% of pos for training
	local trainInds = {}
	local testInds = {}

	local indsPos = torch.randperm(#pos)
	local indsNeg = torch.randperm(#neg)

	local nPosTrain = torch.floor(#pos * pTrain)
	local nNegTrain = torch.floor(#neg * pTrain)

	local nPosTest = #pos - nPosTrain
	local nNegTest = #neg - nNegTrain

	print('splitting dataset')
	print('nPosTrain',nPosTrain,'nNegTrain',nNegTrain,'pos/neg ',nPosTrain / (nPosTrain+nNegTrain))
	print('nPosTest',nPosTest,'nNegTest',nNegTest,'pos/neg ',nPosTest / (nPosTest+nNegTest))

	-- the first nPosTrain/nNegTrain shuffled indices of each class go to training ...
	for i = 1,nPosTrain do
		table.insert(trainInds,pos[indsPos[i]])
	end
	for i = 1,nNegTrain do
		table.insert(trainInds,neg[indsNeg[i]])
	end

	-- ... and the remainder go to the test set
	for i = 1,nPosTest do
		table.insert(testInds,pos[indsPos[nPosTrain + i]])
	end
	for i = 1,nNegTest do
		table.insert(testInds,neg[indsNeg[nNegTrain + i]])
	end

	-- ratio used to weight the classes during training. Deals with
	-- the unbalanced number of examples for each class
	local posNegRatio = nPosTrain / (nPosTrain + nNegTrain)

	return trainInds,testInds,posNegRatio
end

--- Split into train/val sets given a fixed, previously-saved test split.
-- metaData.trainInds / metaData.testInds come from splitMalwareDataTrainTest;
-- the saved test set is kept as-is and the saved training indices are
-- re-split into train and validation parts. A sanity check at the end
-- verifies the three sets partition the whole dataset with no overlap.
-- return indicies for the train,val and testing sets
function splitMalwareDataTrainValTest(labels,metaData)
	-- NOTE(review): the split fractions are hard-coded here, not read from opt
	local pTrain = 0.8
	local pVal = 0.1
	local pTest = 0.1

	local testInds = metaData.testInds

	local pos = {}
	local neg = {}
	local nPrograms = labels:size(1)--allData.program:size(1)

	print('nPrograms ',nPrograms)

	-- record the indices of all the pos/neg i.e. malware/benign examples
	for i = 1,nPrograms do
		if labels[i] == 1 then
			table.insert(pos,i)
		else
			table.insert(neg,i)
		end
	end

	local posTrainVal = {}
	local negTrainVal = {}

	-- record the indices of all the pos/neg i.e. malware/benign examples in the training-set
	for i = 1,#metaData.trainInds do
		if labels[metaData.trainInds[i]] == 1 then
			table.insert(posTrainVal,metaData.trainInds[i])
		else
			table.insert(negTrainVal,metaData.trainInds[i])
		end
	end

	print(#pos,#neg)
	print(#posTrainVal,#negTrainVal)

	-- record all the positive and negative indices
	-- shuffle the data
	-- take the first X% of pos and first x% of pos for training
	local trainInds = {}
	local valInds = {}

	local indsPos = torch.randperm(#posTrainVal)
	local indsNeg = torch.randperm(#negTrainVal)

	-- note: the train counts are fractions of the FULL class totals (#pos/#neg),
	-- so pTrain of everything goes to train and the rest of the saved
	-- training indices becomes the validation set
	local nPosTrain = torch.floor(#pos * pTrain)
	local nNegTrain = torch.floor(#neg * pTrain)

	local nPosVal = #posTrainVal - nPosTrain
	local nNegVal = #negTrainVal - nNegTrain

	local nPosTest = #pos - (nPosTrain + nPosVal)
	local nNegTest = #neg - (nNegTrain + nNegVal)

	print('splitting dataset')
	print('nPosTrain',nPosTrain,'nNegTrain',nNegTrain)
	print('nPosVal',nPosVal,'nNegVal',nNegVal)
	print('nPosTest',nPosTest,'nNegTest',nNegTest)

	for i = 1,nPosTrain do
		table.insert(trainInds,posTrainVal[indsPos[i]])
	end
	for i = 1,nNegTrain do
		table.insert(trainInds,negTrainVal[indsNeg[i]])
	end

	for i = 1,nPosVal do
		table.insert(valInds,posTrainVal[indsPos[nPosTrain + i]])
	end
	for i = 1,nNegVal do
		table.insert(valInds,negTrainVal[indsNeg[nNegTrain + i]])
	end

	-- for i = 1,nPosTest do
	-- 	table.insert(testInds,pos[indsPos[nPosTrain + nPosVal + i]])
	-- end
	-- for i = 1,nNegTest do
	-- 	table.insert(testInds,neg[indsNeg[nNegTrain + nNegVal + i]])
	-- end

	-- ratio used to weight the classes during training. Deals with
	-- the unbalanced number of examples for each class
	local posNegRatio = nPosTrain / (nPosTrain + nNegTrain)

	-- check there is no overlap between train / val / test sets
	local sanity = torch.zeros(nPrograms)
	for i = 1,#trainInds do
		sanity[trainInds[i]] = sanity[trainInds[i]] + 1
	end
	for i = 1,#testInds do
		sanity[testInds[i]] = sanity[testInds[i]] + 1
	end
	for i = 1,#valInds do
		sanity[valInds[i]] = sanity[valInds[i]] + 1
	end

	print('train/val/test check',torch.min(sanity),torch.max(sanity),torch.sum(sanity),nPrograms)

	if not (torch.min(sanity) == 1) or not (torch.max(sanity) == 1) or not (torch.sum(sanity) == nPrograms) then
		-- stop if this happens
		error('overlap between training / validation and testing sets')
	end

	return trainInds,valInds,testInds,posNegRatio
end
================================================ FILE: testModel.lua ================================================
--- Evaluate a trained model on the examples indexed by valInds.
-- Computes mean NLL error, accuracy, precision, recall, f-score and a 2x2
-- confusion matrix. The rarer class is treated as the positive class so the
-- f-score is meaningful on unbalanced data.
-- Relies on the global opt (command-line options) and the nn package.
function testModel(allData,model,valInds,epochError)
	print('testing corrected verison 2')

	local timerTest = torch.Timer()

	local dtype = 'torch.DoubleTensor'
	if opt.useCUDA then
		dtype = 'torch.CudaTensor'
	end

	local criterion = nn.ClassNLLCriterion():type(dtype)

	-- switch off train-only behaviour (e.g. dropout) for evaluation
	model:evaluate()

	-- push the validation data through the network
	local nValPrograms = #valInds
	local valError = 0
	local correct = 0
	local confmat = torch.zeros(2,2)
	local lens = torch.zeros(nValPrograms)

	-- We need to make sure the rare-class is regarded as positive
	-- This means the f-score etc will be correctly calculated
	-- When reading the data benign is labelled as 1 and malware as 2
	local nBenign = 0
	local nMalware = 0
	for k = 1,nValPrograms do
		if allData.label[valInds[k]] == 1 then
			nBenign = nBenign + 1
		else
			nMalware =
			nMalware + 1
		end
	end

	-- the rarer class becomes the positive class for precision/recall/f-score
	local positiveLabel = 1
	if nMalware < nBenign then
		positiveLabel = 2
	end

	print('Test Stats : nMalware ',nMalware, ' nBenign ',nBenign, ' positiveLabel ',positiveLabel)

	--local valBatch = torch.zeros(1,opt.programLen):type(dtype)
	local valLabel = torch.zeros(1):type(dtype)

	for k = 1,nValPrograms do
		valLabel[{1}] = allData.label[valInds[k]]
		--valBatch[{{1},{}}] = allData.program[valInds[k]]

		-- each program is stored as a slice of one long 1D opcode tensor
		local currProgramPtr = allData.programStartPtrs[valInds[k]]
		local currProgramLen = allData.programLengths[valInds[k]]

		-- truncate over-length programs to the maximum sequence length
		if currProgramLen > opt.maxSequenceLength then
			currProgramLen = opt.maxSequenceLength
		end

		local valBatch = torch.zeros(1,currProgramLen):type(dtype)
		valBatch[{{1},{}}] = allData.program[{{currProgramPtr,currProgramPtr + currProgramLen - 1}}]

		local netOutput = model:forward(valBatch)
		valError = valError + criterion:forward(netOutput,valLabel)

		-- convert log-probabilities back to probabilities, then take the argmax
		local netOutputProb = nn.Exp():forward(netOutput:double())

		local v,i = torch.max(netOutputProb,2)
		local pred = i[{1,1}]
		local gt = allData.label[valInds[k]]

		if pred == gt then correct = correct + 1; end
		confmat[pred][gt] = confmat[pred][gt] + 1
	end

	valError = valError / nValPrograms

	-- pick tp/fp/fn out of the confusion matrix according to the positive class
	local tp = 0
	local fp = 0
	local fn = 0
	if positiveLabel == 1 then
		tp = confmat[1][1]
		fp = confmat[1][2]
		fn = confmat[2][1]
	else
		tp = confmat[2][2]
		fp = confmat[2][1]
		fn = confmat[1][2]
	end

	local testResult = {
		-- tp = tp,
		-- fp = fp,
		-- fn = fn,
		prec = tp / (tp + fp),
		recall = tp / (tp + fn),
		fscore = (2 * tp) / ((2 * tp) + fp + fn),
		accuracy = correct/nValPrograms,
		testError = valError,
	}

	local time = timerTest:time().real

	-- restore train-mode behaviour before returning
	model:training()

	-- clean up
	valLabel = nil
	collectgarbage()

	return testResult,confmat,time
end
================================================ FILE: testModel_dataAug.lua ================================================
--- Variant of testModel with test-time augmentation: each over-length program
-- is evaluated on nDataAug random crops of maxSequenceLength opcodes and the
-- class probabilities are summed before taking the argmax.
-- Relies on the global opt (command-line options) and the nn package.
function testModel(allData,model,valInds,epochError)
	print('testing corrected verison 3')

	local timerTest = torch.Timer()

	local dtype = 'torch.DoubleTensor'
	if opt.useCUDA then
		dtype = 'torch.CudaTensor'
	end

	local criterion = nn.ClassNLLCriterion():type(dtype)

	model:evaluate()

	-- push the validation data through the network
	local nValPrograms = #valInds
	local valError = 0
	local correct = 0
	local confmat = torch.zeros(2,2)
	local lens = torch.zeros(nValPrograms)

	-- We need to make sure the rare-class is regarded as positive
	-- This means the f-score etc will be correctly calculated
	-- When reading the data benign is labelled as 1 and malware as 2
	local nBenign = 0
	local nMalware = 0
	for k = 1,nValPrograms do
		if allData.label[valInds[k]] == 1 then
			nBenign = nBenign + 1
		else
			nMalware = nMalware + 1
		end
	end

	local positiveLabel = 1
	if nMalware < nBenign then
		positiveLabel = 2
	end

	print('Test Stats : nMalware ',nMalware, ' nBenign ',nBenign, ' positiveLabel ',positiveLabel)

	--local valBatch = torch.zeros(1,opt.programLen):type(dtype)
	local valLabel = torch.zeros(1):type(dtype)

	for k = 1,nValPrograms do
		valLabel[{1}] = allData.label[valInds[k]]
		--valBatch[{{1},{}}] = allData.program[valInds[k]]

		local currProgramPtr = allData.programStartPtrs[valInds[k]]
		local currProgramLen = allData.programLengths[valInds[k]]

		-- accumulate class probabilities over the augmented views
		local netOutputProb = torch.zeros(1,2)

		local nDataAug = 10
		for j = 1,nDataAug do
			local valBatch
			if currProgramLen > opt.maxSequenceLength then
				-- over-length program: take a random crop of maxSequenceLength opcodes
				valBatch = torch.zeros(1,opt.maxSequenceLength):type(dtype)
				local rndPtr = torch.floor(torch.rand(1)[1] * (currProgramLen - opt.maxSequenceLength - 1))
				valBatch[{{1},{}}] = allData.program[{{currProgramPtr + rndPtr,currProgramPtr + rndPtr + opt.maxSequenceLength - 1}}]
			else
				-- short program: always used whole (the nDataAug passes are identical)
				valBatch = torch.zeros(1,currProgramLen):type(dtype)
				valBatch[{{1},{}}] = allData.program[{{currProgramPtr,currProgramPtr + currProgramLen - 1}}]
			end

			-- if currProgramLen > opt.maxSequenceLength then
			-- 	currProgramLen = opt.maxSequenceLength
			-- end
			-- local valBatch = torch.zeros(1,currProgramLen):type(dtype)
			-- valBatch[{{1},{}}] = allData.program[{{currProgramPtr,currProgramPtr + currProgramLen - 1}}]

			local netOutput = model:forward(valBatch)
			valError = valError + criterion:forward(netOutput,valLabel)
			netOutputProb = netOutputProb + nn.Exp():forward(netOutput:double())
		end

		local v,i = torch.max(netOutputProb,2)
		local pred = i[{1,1}]
		local gt = allData.label[valInds[k]]

		if pred == gt then correct = correct + 1; end
		confmat[pred][gt] = confmat[pred][gt] + 1
	end

	-- NOTE(review): valError sums nDataAug criterion evaluations per program
	-- but is only divided by nValPrograms, so it is ~nDataAug times the
	-- per-view loss - confirm before comparing against testModel.lua
	valError = valError / nValPrograms

	local tp = 0
	local fp = 0
	local fn = 0
	if positiveLabel == 1 then
		tp = confmat[1][1]
		fp = confmat[1][2]
		fn = confmat[2][1]
	else
		tp = confmat[2][2]
		fp = confmat[2][1]
		fn = confmat[1][2]
	end

	local testResult = {
		-- tp = tp,
		-- fp = fp,
		-- fn = fn,
		prec = tp / (tp + fp),
		recall = tp / (tp + fn),
		fscore = (2 * tp) / ((2 * tp) + fp + fn),
		accuracy = correct/nValPrograms,
		testError = valError,
	}

	local time = timerTest:time().real

	model:training()

	-- clean up
	valBatch = nil
	valLabel = nil
	collectgarbage()

	return testResult,confmat,time
end
================================================ FILE: testWithPreTrainedNetwork.lua ================================================
-- Example of how to test using a pre-trained network
-- Expects a directory containing two or more directories
-- One directory contains all the malware
-- The other directory contains all the benign software

-- given a model that has already been trained
-- and a directory containing programs - classify into malware / benign

require 'nn'
require 'optim'
require 'nngraph'
require 'cunn'
require 'cutorch'

require 'readMalwareData'
require 'testModel'

cmd = torch.CmdLine()
cmd:option('-useCUDA',false,'use CUDA optimisation')
cmd:option('-dataDir','./malwareDataset/','directory with the android programs to classify')
cmd:option('-modelPath','./trainedNets/model.th7','path to model to use for testing')
opt = cmd:parse(arg)

print('loading model from disk')
savedModel = torch.load(opt.modelPath)
print('loaded model')
print(savedModel.trainedModel)

-- we need these values to correctly prepare the files when reading from disk
opt.programLen = savedModel.opt.programLen
opt.kernelLength = savedModel.opt.kernelLength
opt.maxSequenceLength = savedModel.opt.maxSequenceLength

print('reading data from disk')
allData = readMalwareData(opt.dataDir,savedModel.metaData)

if opt.useCUDA then
	savedModel.trainedModel:cuda()
end

savedModel.trainedModel:evaluate()

print('starting test')

-- evaluate on the held-out test split that was saved alongside the model
testResult,confmat,time = testModel(allData,savedModel.trainedModel,savedModel.metaData.testInds,0)

print('Results')
print('f-score ',testResult.fscore)
print('precision ',testResult.prec)
print('recall ',testResult.recall)
print('accuracy ',testResult.accuracy)
print('--')
print('Confusion Matrix')
print(confmat)
print('--')
print('time to complete test (s) :',time)
================================================ FILE: trainModel.lua ================================================
-- use the GPU to process the whole batch in parallel

--- Train the model with SGD (or RMSProp), periodically evaluating on the
-- validation set and keeping the best model by validation f-score.
-- model/criterion : network and loss (used with class-index label targets)
-- allData          : dataset tensors (program, label, programStartPtrs, programLengths)
-- trainInds/valInds: index sets produced by the dataset split
-- dataSplit        : carries posNegRatio for optional class weighting
-- metaData         : saved alongside the model for reproducibility
-- Returns the trained model, or 0 if training diverged (NaN / exploding loss).
-- Relies on the globals opt, optim, testModel and isnan.
function trainModel(model,criterion,allData,trainInds,valInds,dataSplit,metaData)
	local parameters,gradParameters = model:getParameters()
	print('Number of Model Parameters ',parameters:size(1))

	local dtype = 'torch.DoubleTensor'
	if opt.useCUDA then
		print('Using CUDA')
		dtype = 'torch.CudaTensor'
	else
		print('Running on CPU - CUDA disabled')
	end

	local config = {
		learningRate = opt.learningRate,
		weightDecay = opt.weightDecay,
	}

	local bestfscore = 0
	local bestResult = torch.zeros(6)

	local timer = torch.Timer()

	local nPrograms = #trainInds

	print('Number of training examples ',#trainInds)
	print('Number of validation examples ',#valInds)

	-- pre-allocate memory for the batch
	print('allocating batch memory')
	--local batchProg = torch.zeros(opt.batchSize,opt.programLen):type(dtype)
	local batchLabel = torch.zeros(opt.batchSize):type(dtype)
	print('memory allocated')
	--print(#batchProg)

	if opt.useCUDA then
		local freeMemory, totalMemory = cutorch.getMemoryUsage(opt.gpuid)
		print('CUDA memory usage')
		print('free ',freeMemory,'total ',totalMemory,'ratio ',freeMemory/totalMemory)
	end

	-- per-class gradient weights: the rarer class gets the larger weight
	local gradMultiplier = torch.zeros(2):type(dtype)
	if dataSplit.posNegRatio < 0.5 then
		gradMultiplier[1] = 1 - dataSplit.posNegRatio
		gradMultiplier[2] = dataSplit.posNegRatio
	else
		gradMultiplier[1] = dataSplit.posNegRatio
		gradMultiplier[2] = 1 - dataSplit.posNegRatio
	end

	for e = 1,opt.nEpochs do
		--batchProg:mul(0)
		batchLabel:mul(0)

		local nBatches = 0
		local nSamples = 0
		local epochError = 0

		-- visit the training programs in a fresh random order each epoch
		local order = torch.randperm(nPrograms)

		for i = 1,(nPrograms - (nPrograms%opt.batchSize)),opt.batchSize do
			nSamples = nSamples + opt.batchSize
			nBatches = nBatches + 1

			-- build the batch here
			for k = 0,(opt.batchSize-1) do
				--batchProg[{{k+1},{}}] = allData.program[trainInds[order[i + k]]]
				batchLabel[{k+1}] = allData.label[trainInds[order[i + k]]]
			end

			-- NOTE(review): only the first program of the batch (order[i]) is
			-- loaded below into a 1-row tensor, so batches are effectively
			-- size 1 - confirm before running with opt.batchSize > 1
			local currProgramPtr = allData.programStartPtrs[trainInds[order[i]]]
			local currProgramLen = allData.programLengths[trainInds[order[i]]]

			local batchProg
			if currProgramLen > opt.maxSequenceLength then
				batchProg = torch.zeros(1,opt.maxSequenceLength):type(dtype)
				local rndPtr = 0
				if opt.dataAugTesting then
					-- data augmentation: random crop of the over-length program
					rndPtr = torch.floor(torch.rand(1)[1] * (currProgramLen - opt.maxSequenceLength - 1))
				end
				batchProg[{{1},{}}] = allData.program[{{currProgramPtr + rndPtr,currProgramPtr + rndPtr + opt.maxSequenceLength - 1}}]
			else
				batchProg = torch.zeros(1,currProgramLen):type(dtype)
				batchProg[{{1},{}}] = allData.program[{{currProgramPtr,currProgramPtr + currProgramLen - 1}}]
			end

			--print(#batchProg)
			--print(currProgramPtr,currProgramLen)

			-- closure evaluated by the optimizer: one forward/backward pass
			local feval = function(x)
				local batchError = 0

				if x ~= parameters then
					parameters:copy(x)
				end

				gradParameters:zero()

				local output = model:forward(batchProg)
				local netError = criterion:forward(output,batchLabel)

				batchError = batchError + netError
				epochError = epochError + netError

				local gradCriterion = criterion:backward(output,batchLabel)

				if opt.weightClasses then
					-- seems to be a bug in Torch with ClassNLLCriterion as it should
					-- do this automatically ...
					-- manually weight the classes to deal with imbalanced pos / neg samples
					gradCriterion = gradCriterion:cmul(gradMultiplier)
				end

				model:backward(batchProg,gradCriterion)

				return batchError,gradParameters
			end

			if opt.useRMSProp then
				optim.rmsprop(feval, parameters, config)
			else
				optim.sgd(feval, parameters, config)
			end

			-- abort early if training has diverged
			if isnan(epochError) then
				print('training fail - Nan')
				return 0
			end
			if epochError > 1e9 then
				print('training fail - gradient exploded')
				return 0
			end
		end

		-- optional step-wise learning-rate decay at fixed epochs
		if (e == 50 or e == 75) and opt.decayLearningRate then
			config.learningRate = config.learningRate * opt.weightDecayFrac
		end

		-- check the cross validation error
		if e % opt.nSamplingEpochs == 0 or e == opt.nEpochs then
			local time = timer:time().real
			print('training time',string.format("%7.3f",time),' nPrograms in training ',nSamples)
			timer:reset()

			local nValPrograms = #valInds
			local nTrainPrograms = #trainInds
			print('nValPrograms',nValPrograms,'nTrainingPrograms',nTrainPrograms)

			local valResult,valConfMat,valTime = testModel(allData,model,valInds,bestfscore)

			if valResult.fscore > bestfscore then
				bestfscore = valResult.fscore
				bestResult[1] = valResult.accuracy
				bestResult[2] = valResult.prec
				bestResult[3] = valResult.recall
				bestResult[4] = valResult.fscore
				bestResult[5] = epochError/nBatches
				bestResult[6] = valResult.testError

				-- save the best model so far and the data split etc
				if opt.saveModel then
					local experimentData = {
						opt = opt,
						trainedModel = model:double(),
						dataSplit = dataSplit,
						metaData = metaData,
					}
					torch.save('./trainedNets/' .. opt.saveFileName .. '.th7',experimentData)

					-- model:double() changed the model's type for saving;
					-- restore the training dtype and re-flatten the parameters
					model:type(dtype)
					parameters, gradParameters = model:getParameters()
					collectgarbage()
				end
			end

			print(e,'val ',epochError/nBatches,valResult.testError,valResult.accuracy,valResult.prec,valResult.recall,valResult.fscore)
			print('testing time - val ',string.format("%7.3f",valTime),' nValPrograms',nValPrograms)
			print(valConfMat)

			-- also report performance on the training set itself
			local testResult,testConfMat,testTime = testModel(allData,model,trainInds,1)
			print(e,'train ',epochError/nBatches,testResult.testError,testResult.accuracy,testResult.prec,testResult.recall,testResult.fscore)
			print('testing time - train',string.format("%7.3f",testTime),' nTrainingPrograms',nTrainPrograms)
			print(testConfMat)
			print('--')

			epochError = 0
			nSamples = 0
			nBatches = 0

			collectgarbage()
		end
	end

	print('Best Result ',bestResult[5],bestResult[6],bestResult[1],bestResult[2],bestResult[3],bestResult[4])

	return model
end