Repository: niallmcl/Deep-Android-Malware-Detection
Branch: master
Commit: 7b9ed3def0ea
Files: 17
Total size: 64.4 KB
Directory structure:
gitextract_s38v2mnj/
├── .gitattributes
├── DetectMalware_CNN.lua
├── buildNetwork.lua
├── dataset/
│ ├── Benign/
│ │ └── example.opseq
│ └── Malware/
│ └── example.opseq
├── opcodeseq_creator/
│ ├── DalvikOpcodes.txt
│ ├── README.txt
│ └── run_opcode_seq_creation.py
├── readMalwareData.lua
├── readme.md
├── results/
│ └── exampleOutput.txt
├── run.sh
├── splitMalwareData.lua
├── testModel.lua
├── testModel_dataAug.lua
├── testWithPreTrainedNetwork.lua
└── trainModel.lua
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitattributes
================================================
# Auto detect text files and perform LF normalization
* text=auto
# Custom for Visual Studio
*.cs diff=csharp
# Standard to msysgit
*.doc diff=astextplain
*.DOC diff=astextplain
*.docx diff=astextplain
*.DOCX diff=astextplain
*.dot diff=astextplain
*.DOT diff=astextplain
*.pdf diff=astextplain
*.PDF diff=astextplain
*.rtf diff=astextplain
*.RTF diff=astextplain
================================================
FILE: DetectMalware_CNN.lua
================================================
require 'nn'
require 'optim'
require 'nngraph'
require 'readMalwareData'
require 'splitMalwareData'
require 'buildNetwork'
require 'trainModel'
local cmd = torch.CmdLine()
cmd:option('-seed',1,'seed the random number generator')
cmd:option('-nEmbeddingDims',8,'number of dims in lookupTable for projecting instructions to network')
cmd:option('-nConvFilters',64,'number of convolutional filters')
cmd:option('-kernelLength',8,'length of the convolutional kernel')
cmd:option('-useHiddenLayer',true,'use hidden layer between the conv layers and classifier')
cmd:option('-nHiddenNodes',16,'number of nodes in the hidden layer')
cmd:option('-weightClasses',false,'weight the classes to account for class imbalance')
cmd:option('-nSamplingEpochs',10,'how often to sample the validation set - slow')
cmd:option('-useDropout',false,'use dropout between the conv and hidden layers')
cmd:option('-dropoutFrac',0.5,'dropout strength')
cmd:option('-randomize',false,'randomly select the network parameters')
cmd:option('-numDAShuffles',1,'number of function-order-shuffled versions of each program to keep')
cmd:option('-useOneHot',false,'Represent programs using one-hot / otherwise use look-up-table')
cmd:option('-learningRate',1e-3,'learning rate')
cmd:option('-nEpochs',20,'training epochs')
cmd:option('-nConvLayers',1,'number of extra convolutional layers')
cmd:option('-nFCLayers',1,'number of extra fully-connected layers')
cmd:option('-batchSize',1,'size of batch used in training')
cmd:option('-usemom',false,'use momentum during SGD optimisation')
cmd:option('-useRMSProp',false,'use alternative optimizer rather than SGD')
cmd:option('-useCUDA',false,'use CUDA optimisation')
cmd:option('-gpuid',1,'which GPU to use')
cmd:option('-usePreTrainedEmbedding',false,'initialise network with pre-trained embedding')
cmd:option('-fixEmbedding',false,'prevent the embedding from being updated during learning')
cmd:option('-programLen',8,'how many instructions to read')
cmd:option('-debug',false,'enter debug mode')
cmd:option('-dataAugProb',0.1,'probability of changing an instruction during data augmentation')
cmd:option('-dataAugMethod',1,'1 - substitute the semantically most similar instruction, 2 - substitute a random instruction')
cmd:option('-trainingSetSize',2,'restrict the size of the training-set for evaluation purposes')
cmd:option('-markFunctionEnds',false,'place a marker at the end of each method which may help classification work better')
cmd:option('-saveModel',false,'save the model and data split')
cmd:option('-saveFileName','detect_malware_cnn','filename to save the network')
cmd:option('-decayLearningRate',false,'reduce learning rate by factor of 10 every so often')
cmd:option('-weightDecay',0,'weight decay for L2 regularisation')
cmd:option('-weightDecayFrac',0.1,'amount to reduce learning rate by, 0.1 or 0.5 are good values')
-- try using dropout in various places of the network
cmd:option('-useSpatialDropout',false,'drop instructions after the embedding layer')
cmd:option('-useDropoutAfterEmbedding',false,'drop instructions after the embedding layer')
cmd:option('-useDropoutAfterConv',false,'use dropout after the convolutional layer')
cmd:option('-dataDir','./malwareDataset/','directory with the android programs to classify')
cmd:option('-metaDataFile','./config/metaData.th7','file containing indices of the test/train/val split')
cmd:option('-setupMode',false,'Only run in this mode once. Splits the data into the train/test sets. Saved into ./config/metaData.th7')
cmd:option('-maxSequenceLength',1000000,'if program is longer than this length, crop sequence before passing to GPU')
cmd:option('-dataAugTesting',false,'Use data augmentation during testing i.e. average score over random samples from program')
opt = cmd:parse(arg)
if opt.useCUDA then
require 'cunn'
require 'cutorch'
end
torch.setdefaulttensortype("torch.DoubleTensor")
torch.manualSeed(opt.seed)
if opt.useCUDA then
cutorch.setDevice(opt.gpuid)
cutorch.manualSeedAll(opt.seed)
end
if opt.dataAugTesting then
require 'testModel_dataAug'
else
require 'testModel'
end
print(opt)
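-- NaN is the only value that is not equal to itself, so z ~= z is true exactly when z is NaN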
function isnan(z)
return z ~= z
end
if opt.setupMode then
-- READ-ME
-- Given a new dataset we need to split into training / testing sets.
-- We only run this chunk once to generate the new train / test split and save it to disk
-- Later, when training the network, the training-set is randomly split into train / validation for a given run
-- This allows us to perform cross-validation on the training-set. After we have finished
-- doing all development we can test a pre-trained network on the testing-set.
------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------
-- read the data from the root dir
-- decide which files should be included in the dataset
print('reading dataset')
local datasetInfo = readMalwareData_setup(opt.dataDir)
print('splitting dataset into train/test sets')
local trainPercentage = 0.9 -- use 90% for training and validation sets, and 10% for held-out testing-set
local trainInds,testInds,posNegRatio = splitMalwareDataTrainTest(datasetInfo.label,trainPercentage,1 - trainPercentage)
local metaData = {
trainInds = trainInds,
testInds = testInds,
posNegRatio = posNegRatio,
trainPercentage = trainPercentage,
--
filesList = datasetInfo.filesList,
family = datasetInfo.family,
label = datasetInfo.label,
benignFamily = datasetInfo.benignFamily,
familyName = datasetInfo.familyName,
}
print('saving dataset metadata to file ',opt.metaDataFile)
torch.save(opt.metaDataFile,metaData)
-- ------------------------------------------------------------------------------------------
-- ------------------------------------------------------------------------------------------
else
-- train the network and save version with lowest validation error to disk
print(opt.metaDataFile)
local metaData = torch.load(opt.metaDataFile)
print('reading data from disk')
local allData = readMalwareData(opt.dataDir,metaData)
print('reading data from disk - complete')
print('program lens ',torch.min(allData.programLengths),torch.max(allData.programLengths),torch.mean(allData.programLengths))
--take the saved split of train/test and further split the train-set into train/val
print('splitting data into train/val/test sets')
local testPercentage = (1 - metaData.trainPercentage)
local valPercentage = (1 - metaData.trainPercentage)
local trainPercentage = 1 - (testPercentage + valPercentage)
print('t,v,t')
print(testPercentage,valPercentage,trainPercentage)
local trainInds,valInds,testInds,posNegRatio = splitMalwareDataTrainValTest(allData.label,metaData,trainPercentage)
local dataSplit = {
trainInds = trainInds,
valInds = valInds,
testInds = testInds,
posNegRatio = posNegRatio,
}
print('new network')
local model,criterion = buildNetwork(metaData.posNegRatio)
print('starting training')
local trainedModel = trainModel(model,criterion,allData,dataSplit.trainInds,dataSplit.valInds,dataSplit,metaData)
end
================================================
FILE: buildNetwork.lua
================================================
function buildNetwork(posNegRatio)
local nIndex = 256
local nOutputSamples = opt.nConvFilters -- number of conv-filters
local kernelStride = 1 -- stride of kernel
local nClasses = 2
local nHidden = opt.nHiddenNodes
local model = nn.Sequential()
-- project from one-hot to low-dim embedding space
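-- NB: -constrainEmbeddingNorm is not registered as a command-line option in DetectMalware_CNN.lua,
-- so unless opt.constrainEmbeddingNorm is set elsewhere this branch is never taken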
if opt.constrainEmbeddingNorm then
model:add(nn.LookupTable(nIndex,opt.nEmbeddingDims,0,1,2))
else
model:add(nn.LookupTable(nIndex,opt.nEmbeddingDims))
end
-- we can add this here to prevent the network from updating the projection layer
-- maybe the projection does not matter much?
-- model:add(nn.GradBlocker())
-- 1st conv layer
--model:add(nn.Reshape(1,opt.programLen,opt.nEmbeddingDims,true))
model:add(nn.Reshape(1,-1,opt.nEmbeddingDims,true))
if opt.useSpatialDropout then
-- should be batchx1xproglenxembeddingdim
model:add(nn.Reshape(opt.programLen,opt.nEmbeddingDims,1,true))
model:add(nn.SpatialDropout(opt.dropoutFrac))
model:add(nn.Reshape(1,opt.programLen,opt.nEmbeddingDims,true))
end
--model:add(nn.SpatialZeroPadding(0,0,opt.kernelLength,opt.kernelLength))
if opt.useDropoutAfterEmbedding then
model:add(nn.Dropout(opt.dropoutFrac))
end
model:add(nn.SpatialConvolutionMM(1,opt.nConvFilters,opt.nEmbeddingDims,opt.kernelLength,kernelStride))
model:add(nn.ReLU())
-- if opt.nConvLayers > 1 then
-- for layernum = 1,(opt.nConvLayers-1) do
-- model:add(nn.Reshape(opt.nConvFilters,-1,true))
-- model:add(nn.Transpose({2,3}))
-- --model:add(nn.TemporalMaxPooling(opt.kernelLength/2,opt.kernelLength/2))
-- model:add(nn.TemporalMaxPooling(2,2))
-- model:add(nn.Reshape(1,-1,opt.nConvFilters,true))
-- model:add(nn.SpatialZeroPadding(0,0,opt.kernelLength,opt.kernelLength))
-- model:add(nn.SpatialConvolutionMM(1,opt.nConvFilters,opt.nConvFilters,opt.kernelLength,kernelStride))
-- model:add(nn.ReLU())
-- end
-- end
model:add(nn.Reshape(opt.nConvFilters,-1,true))
if opt.useDropoutAfterConv then
model:add(nn.Dropout(opt.dropoutFrac))
end
model:add(nn.Max(3)) -- produces a vector of fixed size
if opt.useHiddenLayer then
model:add(nn.Linear(nOutputSamples,nHidden))
model:add(nn.ReLU())
model:add(nn.Linear(nHidden,nClasses))
else
model:add(nn.Linear(nOutputSamples,nClasses))
end
model:add(nn.LogSoftMax())
local criterion = 0
if opt.weightClasses then
local weights = torch.zeros(nClasses)
if posNegRatio < 0.5 then
weights[1] = 1 - posNegRatio
weights[2] = posNegRatio
else
weights[2] = 1 - posNegRatio
weights[1] = posNegRatio
end
criterion = nn.ClassNLLCriterion(weights)
else
criterion = nn.ClassNLLCriterion()
end
if opt.useCUDA then
model:cuda()
criterion:cuda()
end
print(model)
return model,criterion
end
================================================
FILE: dataset/Benign/example.opseq
================================================
5b700e
700e
1f6e0c
================================================
FILE: dataset/Malware/example.opseq
================================================
5b700e
700e
1f6e0c
================================================
FILE: opcodeseq_creator/DalvikOpcodes.txt
================================================
nop 00
move 01
move/from16 02
move/16 03
move-wide 04
move-wide/from16 05
move-wide/16 06
move-object 07
move-object/from16 08
move-object/16 09
move-result 0a
move-result-wide 0b
move-result-object 0c
move-exception 0d
return-void 0e
return 0f
return-wide 10
return-object 11
const/4 12
const/16 13
const 14
const/high16 15
const-wide/16 16
const-wide/32 17
const-wide 18
const-wide/high16 19
const-string 1a
const-string/jumbo 1b
const-class 1c
monitor-enter 1d
monitor-exit 1e
check-cast 1f
instance-of 20
array-length 21
new-instance 22
new-array 23
filled-new-array 24
filled-new-array/range 25
fill-array-data 26
throw 27
goto 28
goto/16 29
goto/32 2a
packed-switch 2b
sparse-switch 2c
cmpl-float 2d
cmpg-float 2e
cmpl-double 2f
cmpg-double 30
cmp-long 31
if-eq 32
if-ne 33
if-lt 34
if-ge 35
if-gt 36
if-le 37
if-eqz 38
if-nez 39
if-ltz 3a
if-gez 3b
if-gtz 3c
if-lez 3d
aget 44
aget-wide 45
aget-object 46
aget-boolean 47
aget-byte 48
aget-char 49
aget-short 4a
aput 4b
aput-wide 4c
aput-object 4d
aput-boolean 4e
aput-byte 4f
aput-char 50
aput-short 51
iget 52
iget-wide 53
iget-object 54
iget-boolean 55
iget-byte 56
iget-char 57
iget-short 58
iput 59
iput-wide 5a
iput-object 5b
iput-boolean 5c
iput-byte 5d
iput-char 5e
iput-short 5f
sget 60
sget-wide 61
sget-object 62
sget-boolean 63
sget-byte 64
sget-char 65
sget-short 66
sput 67
sput-wide 68
sput-object 69
sput-boolean 6a
sput-byte 6b
sput-char 6c
sput-short 6d
invoke-virtual 6e
invoke-super 6f
invoke-direct 70
invoke-static 71
invoke-interface 72
invoke-virtual/range 74
invoke-super/range 75
invoke-direct/range 76
invoke-static/range 77
invoke-interface/range 78
neg-int 7b
not-int 7c
neg-long 7d
not-long 7e
neg-float 7f
neg-double 80
int-to-long 81
int-to-float 82
int-to-double 83
long-to-int 84
long-to-float 85
long-to-double 86
float-to-int 87
float-to-long 88
float-to-double 89
double-to-int 8a
double-to-long 8b
double-to-float 8c
int-to-byte 8d
int-to-char 8e
int-to-short 8f
add-int 90
sub-int 91
mul-int 92
div-int 93
rem-int 94
and-int 95
or-int 96
xor-int 97
shl-int 98
shr-int 99
ushr-int 9a
add-long 9b
sub-long 9c
mul-long 9d
div-long 9e
rem-long 9f
and-long a0
or-long a1
xor-long a2
shl-long a3
shr-long a4
ushr-long a5
add-float a6
sub-float a7
mul-float a8
div-float a9
rem-float aa
add-double ab
sub-double ac
mul-double ad
div-double ae
rem-double af
add-int/2addr b0
sub-int/2addr b1
mul-int/2addr b2
div-int/2addr b3
rem-int/2addr b4
and-int/2addr b5
or-int/2addr b6
xor-int/2addr b7
shl-int/2addr b8
shr-int/2addr b9
ushr-int/2addr ba
add-long/2addr bb
sub-long/2addr bc
mul-long/2addr bd
div-long/2addr be
rem-long/2addr bf
and-long/2addr c0
or-long/2addr c1
xor-long/2addr c2
shl-long/2addr c3
shr-long/2addr c4
ushr-long/2addr c5
add-float/2addr c6
sub-float/2addr c7
mul-float/2addr c8
div-float/2addr c9
rem-float/2addr ca
add-double/2addr cb
sub-double/2addr cc
mul-double/2addr cd
div-double/2addr ce
rem-double/2addr cf
add-int/lit16 d0
rsub-int d1
mul-int/lit16 d2
div-int/lit16 d3
rem-int/lit16 d4
and-int/lit16 d5
or-int/lit16 d6
xor-int/lit16 d7
add-int/lit8 d8
rsub-int/lit8 d9
mul-int/lit8 da
div-int/lit8 db
rem-int/lit8 dc
and-int/lit8 dd
or-int/lit8 de
xor-int/lit8 df
shl-int/lit8 e0
shr-int/lit8 e1
ushr-int/lit8 e2
================================================
FILE: opcodeseq_creator/README.txt
================================================
The zip file contains:
1- A csv file containing Dalvik opcodes
2- Sample directory structure containing
-apk folder with one sample apk
-tmp folder to hold the decoded apps
-opseq folder to store the opcode sequence files
3- a python file run_opcode_seq_creation.py which takes the following arguments:
Python script arguments:
<apk file directory>
1. Pathname of the directory containing the apk files
<temp directory>
2. Pathname of a temporary folder to keep the decoded files during the analysis
<opseq directory>
3. Pathname to an arbitrary directory to store the opcode sequence files
<include support libraries>
4. (optional) "incl" (without quotes) to include android support library files
Note: default behavior is NOT to include those libraries
Steps to run the script:
1) Apktool installation:
-Make sure you have Java installed by running "java --version"
you can install a JRE by running "apt-get install default-jre"
-Follow the installation below to install apktool on Linux
https://ibotpeaches.github.io/Apktool/install/
(following the instructions will place the apktool files in /usr/local/bin)
Note: Make sure that they are executable
2) Extract the zip file to a folder (extracted_folder) and run the following command:
extracted_folder$ ./run_opcode_seq_creation.py ./apk ./tmp ./opseq incl
================================================
FILE: opcodeseq_creator/run_opcode_seq_creation.py
================================================
#!/usr/bin/env python
import sys
import os
import shutil
import datetime
import logging
sys.path.insert(1, os.path.join(sys.path[0], '../..'))
def main():
if len(sys.argv) < 4:
print "Usage", sys.argv[0], "<apk file directory> <temp directory> <opseq directory> <include support libraries>"
return
# Reads the location of apk files that need decoding
apk_file_directory = sys.argv[1]
print "Reading apks from", apk_file_directory
# Temporary folder to store the decoded app
tmp_file_directory = sys.argv[2]
print "Decoding folder", tmp_file_directory
# Reads the location that we want to store our opseq files in
opseq_file_directory = sys.argv[3]
print "opseq folder", opseq_file_directory
# Default is not to include smali files in android support libraries unless 4th parameter is provided
include_libs = False
if len(sys.argv) == 5:
include_libs = ((sys.argv[4]) == "incl")
print "Include Android support library smali files", include_libs
print "Keep Android support libaray files: "+ str(include_libs)
# Create a log file in the temp directory
logging.basicConfig(filename=tmp_file_directory+'/opseq.log', level=logging.DEBUG)
apks = []
for name in os.listdir(apk_file_directory):
if os.path.isfile(os.path.join(apk_file_directory, name)):
apks.append(name)
logging.info('Total apks to be decoded {0}'.format(len(apks)))
print "Total apks to be decoded",len(apks)
num_local = 0
before=datetime.datetime.now()
logging.info('Starting at: {0}'.format(before))
print "Starting at: {0}",before
# Looping through all apks
for apk_hash in apks:
apk_file_location = os.path.join(apk_file_directory, apk_hash)
num_local += 1
logging.info('Decoding apk: {0} apk #: {1}'.format(apk_file_location,num_local))
print "apk #: ", num_local
print "apk location: ", apk_file_location
decoded_location = None
# Decoding apk into the tmp_file_directory
decoded_location = decode_application(apk_file_location,tmp_file_directory,apk_hash,include_libs)
if (not os.path.exists(decoded_location) or not os.listdir(decoded_location)):
print "smali directory does not exist continue...."
logging.error('NOT decoded directory: {0}'.format(apk_file_location))
print "NOT decoded directory:", apk_file_location
continue
result =create_opcode_seq(decoded_location,opseq_file_directory,apk_hash)
if result:
print "opseq file for apk #",num_local," is created"
logging.info('opseq file for apk # {0} is created'.format(num_local))
else:
logging.error('opseq file creation was not successful')
print "opseq file creation was not successful"
if os.path.exists(decoded_location):
shutil.rmtree(decoded_location)
after=datetime.datetime.now()
print "Finished by: {0} ",after
logging.info('Total time taken: {0}'.format(after-before))
print "Total time taken:", after-before
def create_opcode_seq(decoded_dir,opseq_file_directory,apk_hash):
# Returns true if creating opcode sequence file was successful,
# searches all files in smali folder,
# writes the corresponding opcode sequence to a .opseq file
# and depending on the include_lib value,
# it includes or excludes the support library files
dalvik_opcodes = {}
# Reading Dalvik opcodes into a dictionary
with open("DalvikOpcodes.txt") as fop:
for linee in fop:
(key, val) = linee.split()
dalvik_opcodes[key] = val
try:
smali_dir = os.path.join(decoded_dir, "smali")
opseq_fname=os.path.join(opseq_file_directory,apk_hash+".opseq")
with open(opseq_fname, "a") as opseq_file:
for root, dirs, fnames in os.walk(smali_dir):
for fname in fnames:
full_path = os.path.join(root, fname)
opseq_file.write(get_opcode_seq(full_path, dalvik_opcodes))
return True
except Exception as e:
print "Exception occured during opseq creation of apk " ,apk_hash
logging.error('Exception occured during opseq creation {0}'.format(str(e)))
return False
def get_opcode_seq(smali_fname, dalvik_opcodes):
# Returns opcode sequence created from smali file 'smali_fname'.
opcode_seq=''
with open(smali_fname, mode="r") as bigfile:
reader = bigfile.read()
for i, part in enumerate(reader.split(".method")):
add_newline = False
if i!=0:
method_part=part.split(".end method")[0]
method_body = method_part.strip().split('\n')
for line in method_body:
if not line.strip().startswith('.') and not line.strip().startswith('#') and line.strip():
method_line = line.strip().split()
if method_line[0] in dalvik_opcodes:
add_newline = True
opcode_seq += dalvik_opcodes[method_line[0]]
if add_newline:
opcode_seq += '\n'
return opcode_seq
def decode_application (apk_file_location,tmp_file_directory,hash,include_libs):
# Decodes the apk at apk_file_location and
# stores the decoded folders in tmp_file_directory
out_file_location = os.path.join(tmp_file_directory, hash+ ".smali")
try:
apktool_decode_apk( apk_file_location, out_file_location,include_libs )
except ApkToolException:
print "ApktoolException on decoding"
logging.error("ApktoolException on decoding apk {0} ".format(apk_file_location))
pass
return out_file_location
def apktool_decode_apk(apk_file, out_file,include_libs):
# Runs the apktool on a given apk
apktooldir="/usr/local/bin"
apktoolcmd = "{0}/apktool d -f {1} -o {2}".format(apktooldir, apk_file, out_file)
res = os.system(apktoolcmd)
if res != 0: raise ApkToolException(apktoolcmd)
# Checks if we should keep the smali files belonging to the android support libraries
if not include_libs:
# Don't keep the smali/android folder
android_folder = os.path.join(out_file, "smali/android")
if os.path.exists(android_folder):
rm_cmd = "rm -r %s" %(android_folder)
os.system(rm_cmd)
# Exception class to signify an Apktool Exception
class ApkToolException(Exception):
def __init__(self, command):
self.command = command
def __str__(self):
return repr(self.command)
if __name__ == '__main__':
main()
================================================
FILE: readMalwareData.lua
================================================
-- read the malware data
-- in setup mode
-- - read all the files
-- - decide if it should be in dataset
-- - save a list of all files
-- -
-- read the whole program into a tensor
function readfileFunc_tensor(filename)
local contents = {}
local f = torch.DiskFile(filename)
f.quiet(f)
local c = 'a'
local count = 0
local func = {}
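-- pad with opt.kernelLength copies of the separator symbol (1) before the first function,
-- and again after each function below, likely so that a convolution of width
-- opt.kernelLength never spans opcodes from two different functions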
for i = 1,opt.kernelLength do
table.insert(func,1)
count = count + 1
end
local nFuncs = 0
while c ~= '' do --and count <= opt.programLen do -- potential bug...
c = f.readString(f,'*l')
local len = #c
if len > 0 then
for k = 1,len,2 do
local num = string.sub(c,k,k+1)
local n = tonumber(num,16)
table.insert(func,n + 2) -- plus 2 so that our lowest symbol is '2' i.e. no_op is '2'
count = count + 1
end
nFuncs = nFuncs + 1
for i = 1,opt.kernelLength do
table.insert(func,1)
count = count + 1
end
if opt.markFunctionEnds then
table.insert(func,255) -- mark the end of each function
end
end
end
return torch.ByteTensor(func),nFuncs,count
end
-- get an upper bound on the number of malware files
-- we will discard some files that are too short etc
function upperBoundNumberOfFiles(rootDir)
local numberOfFilesBound = 0
local malwareDirs = paths.dir(rootDir)
for i = 1,#malwareDirs do
local dir = malwareDirs[i]
if dir ~= '.' and dir ~= '..' and paths.dirp(paths.concat(rootDir,dir)) then
local malwarefiles = paths.dir(paths.concat(rootDir,dir))
-- number of files minus '.' and '..'
numberOfFilesBound = numberOfFilesBound + #malwarefiles - 2
end
end
print('upper bound number of programs ',numberOfFilesBound)
return numberOfFilesBound
end
-- this function gets called once when processing a new dataset
-- we read all the programs and decide which ones should be included
-- we just use an arbitrary rule that excludes very short programs
-- the list of included programs is returned and saved for later use
function readMalwareData_setup(rootDir)
-- read all the directories
-- check each file to see if it meets some criterion
-- save list of filenames
-- split into train / test sets
local datasetInfo = {
filesList = {},
family = {},
familyName = {},
label = {},
benignFamily = -1,
}
local programCount = 0
local familyNumber = 1
local malwareDirs = paths.dir(rootDir)
for i = 1,#malwareDirs do
local dir = malwareDirs[i]
if dir ~= '.' and dir ~= '..' and paths.dirp(paths.concat(rootDir,dir)) then
local malwarefiles = paths.dir(paths.concat(rootDir,dir))
for f = 1,#malwarefiles do
local file = malwarefiles[f]
if file ~= '.' and file ~= '..' then
local contents,nFuncs = readfileFunc_tensor(paths.concat(rootDir,dir,malwarefiles[f]))
if nFuncs >= 8 then -- a bit arbitrary... basically we want to ignore very short files
programCount = programCount + 1
if programCount % 100 == 0 then
print('programs read ',programCount,collectgarbage("count"))
collectgarbage()
end
-- local includeFile = dir .. '/' .. malwarefiles[f]
table.insert(datasetInfo.filesList,malwarefiles[f])
table.insert(datasetInfo.family,familyNumber)
if dir == 'Benign' then
datasetInfo.benignFamily = familyNumber
table.insert(datasetInfo.label,1)
else
table.insert(datasetInfo.label,2)
end
end
end
end
familyNumber = familyNumber + 1
table.insert(datasetInfo.familyName,dir)
end
end
datasetInfo.family = torch.Tensor(datasetInfo.family)
datasetInfo.label = torch.Tensor(datasetInfo.label)
return datasetInfo
end
-- reads the malware data into a tensor
-- We read all the opcodes into a single block of memory
-- this is because each program can be a different length
-- so storing in a 2D array would waste lots of space
-- We also can't use a Lua list as they are limited to 2GB
--
-- allData.program - tensor (i.e. 1D array of bytes) containing all opcodes
-- allData.programStartPtrs - pointers to start of each program in allData.program
-- allData.programLengths - the length of each opcode sequence
--
-- For example, to access program 3 do
--
-- local ptr = allData.programStartPtrs[3]
-- local len = allData.programLengths[3]
-- local prog = allData.program[{{ptr,ptr + len - 1}}]
--
function readMalwareData(rootDir,metaData)
print('reading files with version 2')
local malwareDirs = paths.dir(rootDir)
local upperBoundNumFiles = upperBoundNumberOfFiles(rootDir)
local meanProgramLen = 50000
local allData = {
program = torch.ones(upperBoundNumFiles * meanProgramLen):byte(),
programStartPtrs = {},
programLengths = {},
}
local programLen = {}
local progPtr = 1
local programCount = 0
for i = 1,#metaData.filesList do
local file = metaData.filesList[i]
local familyDir = metaData.familyName[metaData.family[i]]
local fullFile = paths.concat(rootDir,familyDir,file)
if paths.filep(fullFile) then
local contents = readfileFunc_tensor(fullFile)
programCount = programCount + 1
if programCount % 100 == 0 then
print('programs read ',programCount,collectgarbage("count"))
collectgarbage()
end
local programLength = contents:size(1)
-- if needed - increase the size of the storage: grow by 5%, or by enough to fit the current program
if (progPtr + programLength - 1) > allData.program:size(1) then
local currSize = allData.program:size(1)
allData.program = allData.program:resize(math.max(math.ceil(currSize * 1.05), progPtr + programLength))
end
table.insert(allData.programStartPtrs,progPtr)
table.insert(allData.programLengths,programLength)
-- insert the program into the memory
allData.program[{{progPtr,progPtr + programLength - 1}}] = contents
progPtr = progPtr + programLength
else
-- we should stop if this happens!
error('ERROR : Missing file in dataset : ' .. fullFile)
end
end
allData.program = allData.program:resize(progPtr) -- discard redundant rows
allData.programStartPtrs = torch.Tensor(allData.programStartPtrs)
allData.programLengths = torch.Tensor(allData.programLengths)
allData.label = metaData.label
return allData,programLen
end
================================================
FILE: readme.md
================================================
# Deep Android Malware Detection
This repository contains the code for the paper "Deep Android Malware Detection" ([pdf download](https://pure.qub.ac.uk/portal/files/122380314/sig_camera_ready.pdf)) | ([citation](http://dl.acm.org/citation.cfm?id=3029823))
We use a convolutional neural network (CNN) for Android malware classification. Classification is performed by static analysis of the raw opcode sequence from a disassembled Android APK. Features indicative of malware are learned automatically from the raw opcode sequence, removing the need for hand-engineered malware features. The network runs on a GPU, allowing a very large number of files to be scanned quickly.
<p><img src='malware_network_diagram.png'></p>
If you use this code please cite the following paper:
```
@inproceedings{mclaughlin2017codaspy,
title = "Deep Android Malware Detection",
author = "Niall McLaughlin and {Martinez del Rincon}, Jesus and BooJoong Kang and Suleiman Yerima and Paul Miller and Sakir Sezer and Yeganeh Safaeisemnani and Erik Trickel and Ziming Zhao and Adam Doupé and {Joon Ahn}, Gail",
year = "2016",
month = "12",
booktitle = "Proceeding of the ACM Conference on Data and Applications Security and Privacy (CODASPY) 2017",
publisher = "Association for Computing Machinery (ACM)",
}
```
## How to run the code
Given an existing dataset directory (see below for details), the run.sh file will do the following:
1. Partition the dataset into training-set and held-out test-set
2. Train a neural network
3. Test the trained network on the test-set
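For reference, run.sh executes the following commands (the hyper-parameters shown are those used for the small example dataset; adjust -dataDir and -metaDataFile for your own data):
```
# One-off: split the dataset into train/test sets and save the metadata
th DetectMalware_CNN.lua -useCUDA -gpuid 1 -programLen 8192 -nConvFilters 64 -nEpochs 75 -nSamplingEpochs 5 -nConvLayers 1 -seed 1 -learningRate 1e-3 -nEmbeddingDims 8 -kernelLength 8 -saveModel -saveFileName model_tmp -dataDir ./dataset/ -metaDataFile ./config/metaData_small_test.th7 -maxSequenceLength 8192 -setupMode
# Train the network (identical arguments, without -setupMode)
th DetectMalware_CNN.lua -useCUDA -gpuid 1 -programLen 8192 -nConvFilters 64 -nEpochs 75 -nSamplingEpochs 5 -nConvLayers 1 -seed 1 -learningRate 1e-3 -nEmbeddingDims 8 -kernelLength 8 -saveModel -saveFileName model_tmp -dataDir ./dataset/ -metaDataFile ./config/metaData_small_test.th7 -maxSequenceLength 8192
# Test the trained network on the held-out test-set (run only once, after all development is finished)
th testWithPreTrainedNetwork.lua -useCUDA -dataDir ./dataset -modelPath ./trainedNets/model_tmp.th7
```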
## Prerequisites
### Dataset structure
An example dataset with the required directory structure is provided in ./dataset
The neural network requires opcode sequence files in the correct format, arranged in a dataset directory whose sub-directories contain the malware and benign opcode sequence files. The dataset directory must have the following structure:
1. There must be a directory called 'Benign', which contains the non-malware opcode sequence files
2. The other directory can have any name, and contains the malware opcode sequence files (see the example layout below)
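The example dataset in ./dataset has this layout:
```
dataset/
├── Benign/
│   └── example.opseq
└── Malware/
    └── example.opseq
```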
### Opcode Sequence files
Opcode sequence files can be created from Android APK files using the opcode sequence creation tool, located in ./opcodeseq_creator. Please see the readme file in that directory for more information.
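Each line of an .opseq file holds the opcode sequence of one method, written as consecutive two-digit hex opcode values. For example, the provided dataset/Benign/example.opseq contains:
```
5b700e
700e
1f6e0c
```
Using the table in opcodeseq_creator/DalvikOpcodes.txt, the first line decodes to iput-object (5b), invoke-direct (70), return-void (0e).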
### Setup
The neural network code is implemented using Torch. A GPU is recommended to accelerate training and testing. For details on installing Torch please see http://torch.ch
The opcode sequence creator tool requires Apktool: https://ibotpeaches.github.io/Apktool/
================================================
FILE: results/exampleOutput.txt
================================================
{
useOneHot : false
nConvLayers : 1
usemom : false
dataAugProb : 0.1
batchSize : 1
nSamplingEpochs : 5
nFCLayers : 1
nEmbeddingDims : 8
kernelLength : 8
useDropoutAfterEmbedding : false
numDAShuffles : 1
metaDataFile : "./config/metaData_small_test.th7"
useSpatialDropout : false
useHiddenLayer : true
weightDecay : 0
nConvFilters : 64
dropoutFrac : 0.5
useRMSProp : false
programLen : 8192
gpuid : 1
nHiddenNodes : 16
dataAugTesting : false
dataDir : "/home/nmclaughlin02/Documents/cyberdata/malware/"
seed : 1
maxSequenceLength : 8192
markFunctionEnds : false
debug : false
useDropoutAfterConv : false
useDropout : false
weightClasses : false
saveFileName : "model_tmp"
fixEmbedding : false
trainingSetSize : 2
randomize : false
weightDecayFrac : 0.1
useCUDA : true
usePreTrainedEmbedding : false
nEpochs : 75
decayLearningRate : false
setupMode : true
dataAugMethod : 1
saveModel : true
learningRate : 0.001
}
reading dataset
programs read 100 5064.8681640625
programs read 200 5527.64453125
programs read 300 6014.9560546875
programs read 400 3911.5263671875
programs read 500 7196.4423828125
programs read 600 8327.2734375
programs read 700 10306.740234375
programs read 800 6509.2666015625
programs read 900 7206.0546875
programs read 1000 6228.0478515625
programs read 1100 6535.55078125
programs read 1200 6618.107421875
programs read 1300 4311.482421875
programs read 1400 8571.1533203125
programs read 1500 8814.9814453125
programs read 1600 6065.205078125
programs read 1700 5644.7822265625
programs read 1800 4623.0302734375
programs read 1900 6804.72265625
programs read 2000 4155.318359375
programs read 2100 3895.193359375
splitting dataset into train/test sets
846 1259
splitting dataset
nPosTrain 761 nNegTrain 1133 pos/neg 0.40179514255544
nPosTest 85 nNegTest 126 pos/neg 0.40284360189573
saving dataset metadata to file ./config/metaData_small_test.th7
{
useOneHot : false
nConvLayers : 1
usemom : false
dataAugProb : 0.1
batchSize : 1
nSamplingEpochs : 5
nFCLayers : 1
nEmbeddingDims : 8
kernelLength : 8
useDropoutAfterEmbedding : false
numDAShuffles : 1
metaDataFile : "./config/metaData_small_test.th7"
useSpatialDropout : false
useHiddenLayer : true
weightDecay : 0
nConvFilters : 64
dropoutFrac : 0.5
useRMSProp : false
programLen : 8192
gpuid : 1
nHiddenNodes : 16
dataAugTesting : false
dataDir : "/home/nmclaughlin02/Documents/cyberdata/malware/"
seed : 1
maxSequenceLength : 8192
markFunctionEnds : false
debug : false
useDropoutAfterConv : false
useDropout : false
weightClasses : false
saveFileName : "model_tmp"
fixEmbedding : false
trainingSetSize : 2
randomize : false
weightDecayFrac : 0.1
useCUDA : true
usePreTrainedEmbedding : false
nEpochs : 75
decayLearningRate : false
setupMode : false
dataAugMethod : 1
saveModel : true
learningRate : 0.001
}
./config/metaData_small_test.th7
reading data from disk
reading files with version 2
upper bound number of programs 2125
programs read 100 5121.4873046875
programs read 200 17134.224609375
programs read 300 8701.2607421875
programs read 400 7367.3076171875
programs read 500 7284.8056640625
programs read 600 8411.69921875
programs read 700 10391.264648438
programs read 800 6593.51953125
programs read 900 7580.0087890625
programs read 1000 6310.1181640625
programs read 1100 6471.4033203125
programs read 1200 5646.609375
programs read 1300 6056.0703125
programs read 1400 5953.17578125
programs read 1500 5674.7333984375
programs read 1600 6110.0087890625
programs read 1700 5555.7314453125
programs read 1800 6911.7939453125
programs read 1900 6337.0595703125
programs read 2000 8575.1025390625
programs read 2100 3910.28515625
reading data from disk - complete
program lens 88 1083463 66743.73064133
splitting data into train/val/test sets
t,v,t
0.1 0.1 0.8
nPrograms 2105
846 1259
761 1133
splitting dataset
nPosTrain 676 nNegTrain 1007
nPosVal 85 nNegVal 126
nPosTest 85 nNegTest 126
train/val/test check 1 1 2105 2105
new network
nn.Sequential {
[input -> (1) -> (2) -> (3) -> (4) -> (5) -> (6) -> (7) -> (8) -> (9) -> (10) -> output]
(1): nn.LookupTable
(2): nn.Reshape(1x-1x8)
(3): nn.SpatialConvolutionMM(1 -> 64, 8x8)
(4): nn.ReLU
(5): nn.Reshape(64x-1)
(6): nn.Max
(7): nn.Linear(64 -> 16)
(8): nn.ReLU
(9): nn.Linear(16 -> 2)
(10): nn.LogSoftMax
}
starting training
Number of Model Parameters 7282
Using CUDA
Number of training examples 1683
Number of validation examples 211
allocating batch memory
memory allocated
CUDA memory usage
free 3702857728 total 4294246400 ratio 0.86228347958794
training time 30.722 nPrograms in training 1683
nValPrograms 211 nTrainingPrograms 1683
testing corrected verison 2
Test Stats : nMalware 126 nBenign 85 positiveLabel 1
5 val 0.40789575868608 0.38057567440503 0.87677725118483 0.83908045977011 0.85882352941176 0.84883720930233
testing time - val 0.157 nValPrograms 211
73 14
12 112
[torch.DoubleTensor of size 2x2]
testing corrected verison 2
Test Stats : nMalware 1007 nBenign 676 positiveLabel 1
5 train 0.40789575868608 0.33726407361753 0.89304812834225 0.88036809815951 0.8491124260355 0.8644578313253
testing time - train 1.179 nTrainingPrograms 1683
574 78
102 929
[torch.DoubleTensor of size 2x2]
--
training time 32.220 nPrograms in training 1683
nValPrograms 211 nTrainingPrograms 1683
testing corrected verison 2
Test Stats : nMalware 126 nBenign 85 positiveLabel 1
10 val 0.16407678506945 0.17450008092898 0.9478672985782 0.92045454545455 0.95294117647059 0.9364161849711
testing time - val 0.149 nValPrograms 211
81 7
4 119
[torch.DoubleTensor of size 2x2]
testing corrected verison 2
Test Stats : nMalware 1007 nBenign 676 positiveLabel 1
10 train 0.16407678506945 0.12949798309725 0.96375519904932 0.93741109530583 0.97485207100592 0.95576504713561
testing time - train 1.157 nTrainingPrograms 1683
659 44
17 963
[torch.DoubleTensor of size 2x2]
--
training time 32.102 nPrograms in training 1683
nValPrograms 211 nTrainingPrograms 1683
testing corrected verison 2
Test Stats : nMalware 126 nBenign 85 positiveLabel 1
15 val 0.084003127938711 0.13772791624069 0.9478672985782 0.94047619047619 0.92941176470588 0.93491124260355
testing time - val 0.149 nValPrograms 211
79 5
6 121
[torch.DoubleTensor of size 2x2]
testing corrected verison 2
Test Stats : nMalware 1007 nBenign 676 positiveLabel 1
15 train 0.084003127938711 0.059644215920109 0.98871063576946 0.98811292719168 0.98372781065089 0.98591549295775
testing time - train 1.178 nTrainingPrograms 1683
665 8
11 999
[torch.DoubleTensor of size 2x2]
--
training time 31.980 nPrograms in training 1683
nValPrograms 211 nTrainingPrograms 1683
testing corrected verison 2
Test Stats : nMalware 126 nBenign 85 positiveLabel 1
20 val 0.045003364997043 0.12233256954717 0.9478672985782 0.92045454545455 0.95294117647059 0.9364161849711
testing time - val 0.153 nValPrograms 211
81 7
4 119
[torch.DoubleTensor of size 2x2]
testing corrected verison 2
Test Stats : nMalware 1007 nBenign 676 positiveLabel 1
20 train 0.045003364997043 0.031773728943268 0.99524658348188 0.99408284023669 0.99408284023669 0.99408284023669
testing time - train 1.180 nTrainingPrograms 1683
672 4
4 1003
[torch.DoubleTensor of size 2x2]
--
training time 32.320 nPrograms in training 1683
nValPrograms 211 nTrainingPrograms 1683
testing corrected verison 2
Test Stats : nMalware 126 nBenign 85 positiveLabel 1
25 val 0.026457868370355 0.11776774217732 0.94312796208531 0.92941176470588 0.92941176470588 0.92941176470588
testing time - val 0.152 nValPrograms 211
79 6
6 120
[torch.DoubleTensor of size 2x2]
testing corrected verison 2
Test Stats : nMalware 1007 nBenign 676 positiveLabel 1
25 train 0.026457868370355 0.015975192693891 0.99762329174094 0.99851632047478 0.99556213017751 0.99703703703704
testing time - train 1.181 nTrainingPrograms 1683
673 1
3 1006
[torch.DoubleTensor of size 2x2]
--
training time 31.997 nPrograms in training 1683
nValPrograms 211 nTrainingPrograms 1683
testing corrected verison 2
Test Stats : nMalware 126 nBenign 85 positiveLabel 1
30 val 0.019170329284611 0.11645289704698 0.94312796208531 0.92941176470588 0.92941176470588 0.92941176470588
testing time - val 0.154 nValPrograms 211
79 6
6 120
[torch.DoubleTensor of size 2x2]
testing corrected verison 2
Test Stats : nMalware 1007 nBenign 676 positiveLabel 1
30 train 0.019170329284611 0.012826394978125 0.9982174688057 0.99704579025111 0.99852071005917 0.99778270509978
testing time - train 1.186 nTrainingPrograms 1683
675 2
1 1005
[torch.DoubleTensor of size 2x2]
--
training time 32.005 nPrograms in training 1683
nValPrograms 211 nTrainingPrograms 1683
testing corrected verison 2
Test Stats : nMalware 126 nBenign 85 positiveLabel 1
35 val 0.021271346299619 0.12037801799051 0.9478672985782 0.92045454545455 0.95294117647059 0.9364161849711
testing time - val 0.156 nValPrograms 211
81 7
4 119
[torch.DoubleTensor of size 2x2]
testing corrected verison 2
Test Stats : nMalware 1007 nBenign 676 positiveLabel 1
35 train 0.021271346299619 0.014115614049575 0.99702911467617 0.99410898379971 0.99852071005917 0.99630996309963
testing time - train 1.183 nTrainingPrograms 1683
675 4
1 1003
[torch.DoubleTensor of size 2x2]
--
training time 31.991 nPrograms in training 1683
nValPrograms 211 nTrainingPrograms 1683
testing corrected verison 2
Test Stats : nMalware 126 nBenign 85 positiveLabel 1
40 val 0.017399860965448 0.12343576564608 0.93364928909953 0.92771084337349 0.90588235294118 0.91666666666667
testing time - val 0.154 nValPrograms 211
77 6
8 120
[torch.DoubleTensor of size 2x2]
testing corrected verison 2
Test Stats : nMalware 1007 nBenign 676 positiveLabel 1
40 train 0.017399860965448 0.0081807981271228 0.9982174688057 0.99851851851852 0.99704142011834 0.99777942264989
testing time - train 1.189 nTrainingPrograms 1683
674 1
2 1006
[torch.DoubleTensor of size 2x2]
--
training time 31.986 nPrograms in training 1683
nValPrograms 211 nTrainingPrograms 1683
testing corrected verison 2
Test Stats : nMalware 126 nBenign 85 positiveLabel 1
45 val 0.012330326521177 0.13431220088525 0.94312796208531 0.92941176470588 0.92941176470588 0.92941176470588
testing time - val 0.152 nValPrograms 211
79 6
6 120
[torch.DoubleTensor of size 2x2]
testing corrected verison 2
Test Stats : nMalware 1007 nBenign 676 positiveLabel 1
45 train 0.012330326521177 0.0074799091279046 0.9982174688057 0.99851851851852 0.99704142011834 0.99777942264989
testing time - train 1.184 nTrainingPrograms 1683
674 1
2 1006
[torch.DoubleTensor of size 2x2]
--
training time 31.966 nPrograms in training 1683
nValPrograms 211 nTrainingPrograms 1683
testing corrected verison 2
Test Stats : nMalware 126 nBenign 85 positiveLabel 1
50 val 0.015055712885752 0.13219990125765 0.9478672985782 0.93023255813953 0.94117647058824 0.93567251461988
testing time - val 0.153 nValPrograms 211
80 6
5 120
[torch.DoubleTensor of size 2x2]
testing corrected verison 2
Test Stats : nMalware 1007 nBenign 676 positiveLabel 1
50 train 0.015055712885752 0.0077246054254397 0.99762329174094 0.99704142011834 0.99704142011834 0.99704142011834
testing time - train 1.184 nTrainingPrograms 1683
674 2
2 1005
[torch.DoubleTensor of size 2x2]
--
training time 31.966 nPrograms in training 1683
nValPrograms 211 nTrainingPrograms 1683
testing corrected verison 2
Test Stats : nMalware 126 nBenign 85 positiveLabel 1
55 val 0.012149294217428 0.12793228326816 0.93364928909953 0.92771084337349 0.90588235294118 0.91666666666667
testing time - val 0.154 nValPrograms 211
77 6
8 120
[torch.DoubleTensor of size 2x2]
testing corrected verison 2
Test Stats : nMalware 1007 nBenign 676 positiveLabel 1
55 train 0.012149294217428 0.006638735727547 0.99881164587047 0.99852071005917 0.99852071005917 0.99852071005917
testing time - train 1.184 nTrainingPrograms 1683
675 1
1 1006
[torch.DoubleTensor of size 2x2]
--
training time 31.977 nPrograms in training 1683
nValPrograms 211 nTrainingPrograms 1683
testing corrected verison 2
Test Stats : nMalware 126 nBenign 85 positiveLabel 1
60 val 0.012296135696144 0.13272679530049 0.9478672985782 0.93023255813953 0.94117647058824 0.93567251461988
testing time - val 0.153 nValPrograms 211
80 6
5 120
[torch.DoubleTensor of size 2x2]
testing corrected verison 2
Test Stats : nMalware 1007 nBenign 676 positiveLabel 1
60 train 0.012296135696144 0.0069904121273129 0.99881164587047 0.99852071005917 0.99852071005917 0.99852071005917
testing time - train 1.187 nTrainingPrograms 1683
675 1
1 1006
[torch.DoubleTensor of size 2x2]
--
training time 31.982 nPrograms in training 1683
nValPrograms 211 nTrainingPrograms 1683
testing corrected verison 2
Test Stats : nMalware 126 nBenign 85 positiveLabel 1
65 val 0.013662806098403 0.13083266985925 0.93838862559242 0.92857142857143 0.91764705882353 0.92307692307692
testing time - val 0.154 nValPrograms 211
78 6
7 120
[torch.DoubleTensor of size 2x2]
testing corrected verison 2
Test Stats : nMalware 1007 nBenign 676 positiveLabel 1
65 train 0.013662806098403 0.0061591699303294 0.99881164587047 0.99852071005917 0.99852071005917 0.99852071005917
testing time - train 1.187 nTrainingPrograms 1683
675 1
1 1006
[torch.DoubleTensor of size 2x2]
--
training time 31.974 nPrograms in training 1683
nValPrograms 211 nTrainingPrograms 1683
testing corrected verison 2
Test Stats : nMalware 126 nBenign 85 positiveLabel 1
70 val 0.014961927119848 0.14605692608097 0.95734597156398 0.93181818181818 0.96470588235294 0.94797687861272
testing time - val 0.153 nValPrograms 211
82 6
3 120
[torch.DoubleTensor of size 2x2]
testing corrected verison 2
Test Stats : nMalware 1007 nBenign 676 positiveLabel 1
70 train 0.014961927119848 0.0064191930150957 0.99881164587047 0.99852071005917 0.99852071005917 0.99852071005917
testing time - train 1.152 nTrainingPrograms 1683
675 1
1 1006
[torch.DoubleTensor of size 2x2]
--
training time 32.109 nPrograms in training 1683
nValPrograms 211 nTrainingPrograms 1683
testing corrected verison 2
Test Stats : nMalware 126 nBenign 85 positiveLabel 1
75 val 0.011070825411887 0.14359631668335 0.95260663507109 0.92134831460674 0.96470588235294 0.94252873563218
testing time - val 0.152 nValPrograms 211
82 7
3 119
[torch.DoubleTensor of size 2x2]
testing corrected verison 2
Test Stats : nMalware 1007 nBenign 676 positiveLabel 1
75 train 0.011070825411887 0.0070228511495245 0.9982174688057 0.99704579025111 0.99852071005917 0.99778270509978
testing time - train 1.183 nTrainingPrograms 1683
675 2
1 1005
[torch.DoubleTensor of size 2x2]
--
Best Result 0.014961927119848 0.14605692608097 0.95734597156398 0.93181818181818 0.96470588235294 0.94797687861272
================================================
FILE: run.sh
================================================
#
# First we must run the program with the -setupMode flag
# The program should be run with this flag ONLY ONCE for each dataset
# This reads the dataset, splits it into training and testing-sets
# and saves the dataset metadata to a file
#
th DetectMalware_CNN.lua -useCUDA -gpuid 1 -programLen 8192 -nConvFilters 64 -nEpochs 75 -nSamplingEpochs 5 -nConvLayers 1 -seed 1 -learningRate 1e-3 -nEmbeddingDims 8 -kernelLength 8 -saveModel -saveFileName model_tmp -dataDir ./dataset/ -metaDataFile ./config/metaData_small_test.th7 -maxSequenceLength 8192 -setupMode
#
#
# Below is the code to train a network
# This uses the metadata file above so that we can reproduce our results
#
th DetectMalware_CNN.lua -useCUDA -gpuid 1 -programLen 8192 -nConvFilters 64 -nEpochs 75 -nSamplingEpochs 5 -nConvLayers 1 -seed 1 -learningRate 1e-3 -nEmbeddingDims 8 -kernelLength 8 -saveModel -saveFileName model_tmp -dataDir ./dataset/ -metaDataFile ./config/metaData_small_test.th7 -maxSequenceLength 8192
#
#
# Below is the code to test a pre-trained network
# This should only be run ONCE after setting hyper-parameters using the validation-set
#
th testWithPreTrainedNetwork.lua -useCUDA -dataDir ./dataset -modelPath ./trainedNets/model_tmp.th7
================================================
FILE: splitMalwareData.lua
================================================
-- run this program once given a new dataset
-- saves the test / train split to disk
-- later sub-divide the train-set into train / validation sets
-- return indices for the training and testing sets
-- we will later sub-divide the training-set into train & val sets
function splitMalwareDataTrainTest(labels,pTrain,pTest)
local pos = {}
local neg = {}
local nPrograms = labels:size(1)--allData.program:size(1)
-- record the indices of all the pos/neg (label 1 = benign, label 2 = malware) examples
for i = 1,nPrograms do
if labels[i] == 1 then
table.insert(pos,i)
else
table.insert(neg,i)
end
end
print(#pos,#neg)
-- record all the positive and negative indices
-- shuffle the data
-- take the first X% of pos and the first X% of neg for training
local trainInds = {}
local testInds = {}
local indsPos = torch.randperm(#pos)
local indsNeg = torch.randperm(#neg)
local nPosTrain = torch.floor(#pos * pTrain)
local nNegTrain = torch.floor(#neg * pTrain)
local nPosTest = #pos - nPosTrain
local nNegTest = #neg - nNegTrain
print('splitting dataset')
print('nPosTrain',nPosTrain,'nNegTrain',nNegTrain,'pos/neg ',nPosTrain / (nPosTrain+nNegTrain))
print('nPosTest',nPosTest,'nNegTest',nNegTest,'pos/neg ',nPosTest / (nPosTest+nNegTest))
for i = 1,nPosTrain do
table.insert(trainInds,pos[indsPos[i]])
end
for i = 1,nNegTrain do
table.insert(trainInds,neg[indsNeg[i]])
end
for i = 1,nPosTest do
table.insert(testInds,pos[indsPos[nPosTrain + i]])
end
for i = 1,nNegTest do
table.insert(testInds,neg[indsNeg[nNegTrain + i]])
end
-- ratio used to weight the classes during training. Deals with
-- the unbalanced number of examples for each class
local posNegRatio = nPosTrain / (nPosTrain + nNegTrain)
return trainInds,testInds,posNegRatio
end
-- return indices for the train, val and testing sets
function splitMalwareDataTrainValTest(labels,metaData)
local pTrain = 0.8
local pVal = 0.1
local pTest = 0.1
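-- NB: these fractions are hard-coded; the extra trainPercentage argument passed by the
-- caller in DetectMalware_CNN.lua is silently ignored (Lua discards surplus arguments)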
local testInds = metaData.testInds
local pos = {}
local neg = {}
local nPrograms = labels:size(1)--allData.program:size(1)
print('nPrograms ',nPrograms)
-- record the indices of all the pos/neg (label 1 = benign, label 2 = malware) examples
for i = 1,nPrograms do
if labels[i] == 1 then
table.insert(pos,i)
else
table.insert(neg,i)
end
end
local posTrainVal = {}
local negTrainVal = {}
-- record the indices of all the pos/neg (label 1 = benign, label 2 = malware) examples in the training-set
for i = 1,#metaData.trainInds do
if labels[metaData.trainInds[i]] == 1 then
table.insert(posTrainVal,metaData.trainInds[i])
else
table.insert(negTrainVal,metaData.trainInds[i])
end
end
print(#pos,#neg)
print(#posTrainVal,#negTrainVal)
-- record all the positive and negative indices
-- shuffle the data
-- take the first X% of pos and the first X% of neg for training
local trainInds = {}
local valInds = {}
local indsPos = torch.randperm(#posTrainVal)
local indsNeg = torch.randperm(#negTrainVal)
local nPosTrain = torch.floor(#pos * pTrain)
local nNegTrain = torch.floor(#neg * pTrain)
local nPosVal = #posTrainVal - nPosTrain
local nNegVal = #negTrainVal - nNegTrain
local nPosTest = #pos - (nPosTrain + nPosVal)
local nNegTest = #neg - (nNegTrain + nNegVal)
print('splitting dataset')
print('nPosTrain',nPosTrain,'nNegTrain',nNegTrain)
print('nPosVal',nPosVal,'nNegVal',nNegVal)
print('nPosTest',nPosTest,'nNegTest',nNegTest)
for i = 1,nPosTrain do
table.insert(trainInds,posTrainVal[indsPos[i]])
end
for i = 1,nNegTrain do
table.insert(trainInds,negTrainVal[indsNeg[i]])
end
for i = 1,nPosVal do
table.insert(valInds,posTrainVal[indsPos[nPosTrain + i]])
end
for i = 1,nNegVal do
table.insert(valInds,negTrainVal[indsNeg[nNegTrain + i]])
end
-- for i = 1,nPosTest do
-- table.insert(testInds,pos[indsPos[nPosTrain + nPosVal + i]])
-- end
-- for i = 1,nNegTest do
-- table.insert(testInds,neg[indsNeg[nNegTrain + nNegVal + i]])
-- end
-- ratio used to weight the classes during training. Deals with
-- the unbalanced number of examples for each class
local posNegRatio = nPosTrain / (nPosTrain + nNegTrain)
-- check there is no overlap between train / val / test sets
local sanity = torch.zeros(nPrograms)
for i = 1,#trainInds do
sanity[trainInds[i]] = sanity[trainInds[i]] + 1
end
for i = 1,#testInds do
sanity[testInds[i]] = sanity[testInds[i]] + 1
end
for i = 1,#valInds do
sanity[valInds[i]] = sanity[valInds[i]] + 1
end
print('train/val/test check',torch.min(sanity),torch.max(sanity),torch.sum(sanity),nPrograms)
if not (torch.min(sanity) == 1) or not (torch.max(sanity) == 1) or not (torch.sum(sanity) == nPrograms) then
-- stop if this happens
error('overlap between training / validation and testing sets')
end
return trainInds,valInds,testInds,posNegRatio
end
================================================
FILE: testModel.lua
================================================
function testModel(allData,model,valInds,epochError)
print('testing corrected verison 2')
local timerTest = torch.Timer()
local dtype = 'torch.DoubleTensor'
if opt.useCUDA then
dtype = 'torch.CudaTensor'
end
local criterion = nn.ClassNLLCriterion():type(dtype)
model:evaluate()
-- push the validation data through the network
local nValPrograms = #valInds
local valError = 0
local correct = 0
local confmat = torch.zeros(2,2)
local lens = torch.zeros(nValPrograms)
-- We need to make sure the rare-class is regarded as positive
-- This means the f-score etc. will be correctly calculated
-- When reading the data benign is labelled as 1 and malware as 2
local nBenign = 0
local nMalware = 0
for k = 1,nValPrograms do
if allData.label[valInds[k]] == 1 then
nBenign = nBenign + 1
else
nMalware = nMalware + 1
end
end
local positiveLabel = 1
if nMalware < nBenign then
positiveLabel = 2
end
print('Test Stats : nMalware ',nMalware, ' nBenign ',nBenign, ' positiveLabel ',positiveLabel)
--local valBatch = torch.zeros(1,opt.programLen):type(dtype)
local valLabel = torch.zeros(1):type(dtype)
for k = 1,nValPrograms do
valLabel[{1}] = allData.label[valInds[k]]
--valBatch[{{1},{}}] = allData.program[valInds[k]]
local currProgramPtr = allData.programStartPtrs[valInds[k]]
local currProgramLen = allData.programLengths[valInds[k]]
if currProgramLen > opt.maxSequenceLength then
currProgramLen = opt.maxSequenceLength
end
local valBatch = torch.zeros(1,currProgramLen):type(dtype)
valBatch[{{1},{}}] = allData.program[{{currProgramPtr,currProgramPtr + currProgramLen - 1}}]
local netOutput = model:forward(valBatch)
valError = valError + criterion:forward(netOutput,valLabel)
local netOutputProb = nn.Exp():forward(netOutput:double())
local v,i = torch.max(netOutputProb,2)
local pred = i[{1,1}]
local gt = allData.label[valInds[k]]
if pred == gt then
correct = correct + 1;
end
confmat[pred][gt] = confmat[pred][gt] + 1
end
valError = valError / nValPrograms
local tp = 0
local fp = 0
local fn = 0
if positiveLabel == 1 then
tp = confmat[1][1]
fp = confmat[1][2]
fn = confmat[2][1]
else
tp = confmat[2][2]
fp = confmat[2][1]
fn = confmat[1][2]
end
local testResult = {
-- tp = tp,
-- fp = fp,
-- fn = fn,
prec = tp / (tp + fp),
recall = tp / (tp + fn),
fscore = (2 * tp) / ((2 * tp) + fp + fn),
accuracy = correct/nValPrograms,
testError = valError,
}
local time = timerTest:time().real
model:training()
-- clean up
valLabel = nil
collectgarbage()
return testResult,confmat,time
end
================================================
FILE: testModel_dataAug.lua
================================================
function testModel(allData,model,valInds,epochError)
print('testing corrected verison 3')
local timerTest = torch.Timer()
local dtype = 'torch.DoubleTensor'
if opt.useCUDA then
dtype = 'torch.CudaTensor'
end
local criterion = nn.ClassNLLCriterion():type(dtype)
model:evaluate()
-- push the validation data through the network
local nValPrograms = #valInds
local valError = 0
local correct = 0
local confmat = torch.zeros(2,2)
local lens = torch.zeros(nValPrograms)
-- We need to make sure the rare-class is regarded as positive
-- This means the f-score etc. will be correctly calculated
-- When reading the data benign is labelled as 1 and malware as 2
local nBenign = 0
local nMalware = 0
for k = 1,nValPrograms do
if allData.label[valInds[k]] == 1 then
nBenign = nBenign + 1
else
nMalware = nMalware + 1
end
end
local positiveLabel = 1
if nMalware < nBenign then
positiveLabel = 2
end
print('Test Stats : nMalware ',nMalware, ' nBenign ',nBenign, ' positiveLabel ',positiveLabel)
--local valBatch = torch.zeros(1,opt.programLen):type(dtype)
local valLabel = torch.zeros(1):type(dtype)
for k = 1,nValPrograms do
valLabel[{1}] = allData.label[valInds[k]]
--valBatch[{{1},{}}] = allData.program[valInds[k]]
local currProgramPtr = allData.programStartPtrs[valInds[k]]
local currProgramLen = allData.programLengths[valInds[k]]
local netOutputProb = torch.zeros(1,2)
local nDataAug = 10
for j = 1,nDataAug do
local valBatch
if currProgramLen > opt.maxSequenceLength then
valBatch = torch.zeros(1,opt.maxSequenceLength):type(dtype)
local rndPtr = torch.floor(torch.rand(1)[1] * (currProgramLen - opt.maxSequenceLength - 1))
valBatch[{{1},{}}] = allData.program[{{currProgramPtr + rndPtr,currProgramPtr + rndPtr + opt.maxSequenceLength - 1}}]
else
valBatch = torch.zeros(1,currProgramLen):type(dtype)
valBatch[{{1},{}}] = allData.program[{{currProgramPtr,currProgramPtr + currProgramLen - 1}}]
end
-- if currProgramLen > opt.maxSequenceLength then
-- currProgramLen = opt.maxSequenceLength
-- end
-- local valBatch = torch.zeros(1,currProgramLen):type(dtype)
-- valBatch[{{1},{}}] = allData.program[{{currProgramPtr,currProgramPtr + currProgramLen - 1}}]
local netOutput = model:forward(valBatch)
valError = valError + criterion:forward(netOutput,valLabel)
netOutputProb = netOutputProb + nn.Exp():forward(netOutput:double())
end
local v,i = torch.max(netOutputProb,2)
local pred = i[{1,1}]
local gt = allData.label[valInds[k]]
if pred == gt then
correct = correct + 1;
end
confmat[pred][gt] = confmat[pred][gt] + 1
end
valError = valError / nValPrograms
local tp = 0
local fp = 0
local fn = 0
if positiveLabel == 1 then
tp = confmat[1][1]
fp = confmat[1][2]
fn = confmat[2][1]
else
tp = confmat[2][2]
fp = confmat[2][1]
fn = confmat[1][2]
end
local testResult = {
-- tp = tp,
-- fp = fp,
-- fn = fn,
prec = tp / (tp + fp),
recall = tp / (tp + fn),
fscore = (2 * tp) / ((2 * tp) + fp + fn),
accuracy = correct/nValPrograms,
testError = valError,
}
local time = timerTest:time().real
model:training()
-- clean up
valBatch = nil
valLabel = nil
collectgarbage()
return testResult,confmat,time
end
================================================
FILE: testWithPreTrainedNetwork.lua
================================================
-- Example of how to test using a pre-trained network
-- Expects a directory containing two or more directories
-- One directory contains all the malware
-- The other directory contains all the benign software
-- given a model that has already been trained
-- and a directory containing programs - classify into malware / benign
require 'nn'
require 'optim'
require 'nngraph'
require 'readMalwareData'
require 'testModel'
cmd = torch.CmdLine()
cmd:option('-useCUDA',false,'use CUDA optimisation')
cmd:option('-dataDir','./malwareDataset/','directory with the android programs to classify')
cmd:option('-modelPath','./trainedNets/model.th7','path to model to use for testing')
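-- Example invocation (the paths below are placeholders, not files shipped
-- with the repository):
--   th testWithPreTrainedNetwork.lua -dataDir ./malwareDataset/ -modelPath ./trainedNets/model.th7 -useCUDA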
opt = cmd:parse(arg)
print('loading model from disk')
savedModel = torch.load(opt.modelPath)
print('loaded model')
print(savedModel.trainedModel)
-- we need these values to correctly prepare the files when reading from disk
opt.programLen = savedModel.opt.programLen
opt.kernelLength = savedModel.opt.kernelLength
opt.maxSequenceLength = savedModel.opt.maxSequenceLength
print('reading data from disk')
allData = readMalwareData(opt.dataDir,savedModel.metaData)
if opt.useCUDA then
-- load the CUDA packages only when a GPU run is requested, so the script
-- still works on CPU-only machines with the default -useCUDA false
require 'cunn'
require 'cutorch'
savedModel.trainedModel:cuda()
end
savedModel.trainedModel:evaluate()
print('starting test')
testResult,confmat,time = testModel(allData,savedModel.trainedModel,savedModel.metaData.testInds,0)
print('Results')
print('f-score ',testResult.fscore)
print('precision ',testResult.prec)
print('recall ',testResult.recall)
print('accuracy ',testResult.accuracy)
print('--')
print('Confusion Matrix')
print(confmat)
print('--')
print('time to complete test (s) :',time)
================================================
FILE: trainModel.lua
================================================
-- use the GPU to process the whole batch in parallel
function trainModel(model,criterion,allData,trainInds,valInds,dataSplit,metaData)
local parameters,gradParameters = model:getParameters()
print('Number of Model Parameters ',parameters:size(1))
local dtype = 'torch.DoubleTensor'
if opt.useCUDA then
print('Using CUDA')
dtype = 'torch.CudaTensor'
else
print('Running on CPU - CUDA disabled')
end
local config = {
learningRate = opt.learningRate,
weightDecay = opt.weightDecay,
}
local bestfscore = 0
local bestResult = torch.zeros(6)
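-- bestResult slots: [1] accuracy, [2] precision, [3] recall, [4] f-score,
-- [5] mean training error, [6] validation error (filled in below)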
local timer = torch.Timer()
local nPrograms = #trainInds
print('Number of training examples ',#trainInds)
print('Number of validation examples ',#valInds)
-- pre-allocate memory for the batch
print('allocating batch memory')
--local batchProg = torch.zeros(opt.batchSize,opt.programLen):type(dtype)
local batchLabel = torch.zeros(opt.batchSize):type(dtype)
print('memory allocated')
--print(#batchProg)
if opt.useCUDA then
local freeMemory, totalMemory = cutorch.getMemoryUsage(opt.gpuid)
print('CUDA memory usage')
print('free ',freeMemory,'total ',totalMemory,'ratio ',freeMemory/totalMemory)
end
local gradMultiplier = torch.zeros(2):type(dtype)
if dataSplit.posNegRatio < 0.5 then
gradMultiplier[1] = 1 - dataSplit.posNegRatio
gradMultiplier[2] = dataSplit.posNegRatio
else
gradMultiplier[1] = dataSplit.posNegRatio
gradMultiplier[2] = 1 - dataSplit.posNegRatio
end
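-- Example (assuming dataSplit.posNegRatio is the positive fraction of the
-- training split): posNegRatio = 0.2 gives gradMultiplier = {0.8, 0.2};
-- these per-class factors rescale the criterion gradient when
-- opt.weightClasses is set (see feval below).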
for e = 1,opt.nEpochs do
--batchProg:mul(0)
batchLabel:mul(0)
local nBatches = 0
local nSamples = 0
local epochError = 0
local order = torch.randperm(nPrograms)
for i = 1,(nPrograms - (nPrograms%opt.batchSize)),opt.batchSize do
nSamples = nSamples + opt.batchSize
nBatches = nBatches + 1
-- build the batch here
for k = 0,(opt.batchSize-1) do
--batchProg[{{k+1},{}}] = allData.program[trainInds[order[i + k]]]
batchLabel[{k+1}] = allData.label[trainInds[order[i + k]]]
end
local currProgramPtr = allData.programStartPtrs[trainInds[order[i]]]
local currProgramLen = allData.programLengths[trainInds[order[i]]]
local batchProg
if currProgramLen > opt.maxSequenceLength then
batchProg = torch.zeros(1,opt.maxSequenceLength):type(dtype)
local rndPtr = 0
if opt.dataAugTesting then
rndPtr = torch.floor(torch.rand(1)[1] * (currProgramLen - opt.maxSequenceLength - 1))
end
batchProg[{{1},{}}] = allData.program[{{currProgramPtr + rndPtr,currProgramPtr + rndPtr + opt.maxSequenceLength - 1}}]
else
batchProg = torch.zeros(1,currProgramLen):type(dtype)
batchProg[{{1},{}}] = allData.program[{{currProgramPtr,currProgramPtr + currProgramLen - 1}}]
end
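-- Programs longer than opt.maxSequenceLength are truncated to a single
-- window: by default the first maxSequenceLength opcodes (rndPtr = 0);
-- with opt.dataAugTesting a random contiguous window is sampled instead.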
--print(#batchProg)
--print(currProgramPtr,currProgramLen)
local feval = function(x)
local batchError = 0
if x ~= parameters then
parameters:copy(x)
end
gradParameters:zero()
local output = model:forward(batchProg)
local netError = criterion:forward(output,batchLabel)
batchError = batchError + netError
epochError = epochError + netError
local gradCriterion = criterion:backward(output,batchLabel)
if opt.weightClasses then
-- seems to be a bug in Torch with ClassNLLCriterion as it should
-- do this automatically ...
-- manually weight the classes to deal with imbalanced pos / neg samples
gradCriterion = gradCriterion:cmul(gradMultiplier)
end
model:backward(batchProg,gradCriterion)
return batchError,gradParameters
end
if opt.useRMSProp then
optim.rmsprop(feval, parameters, config)
else
optim.sgd(feval, parameters, config)
end
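-- optim.sgd and optim.rmsprop share one interface: each calls
-- feval(parameters), expects (loss, gradParameters) back, and updates
-- parameters in place using the hyper-parameters in config.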
if epochError ~= epochError then -- NaN check: NaN is the only value not equal to itself
print('training fail - Nan')
return 0
end
if epochError > 1e9 then
print('training fail - gradient exploded')
return 0
end
end
if (e == 50 or e == 75) and opt.decayLearningRate then
config.learningRate = config.learningRate * opt.weightDecayFrac
end
-- check the cross validation error
if e % opt.nSamplingEpochs == 0 or e == opt.nEpochs then
local time = timer:time().real
print('training time',string.format("%7.3f",time),' nPrograms in training ',nSamples)
timer:reset()
local nValPrograms = #valInds
local nTrainPrograms = #trainInds
print('nValPrograms',nValPrograms,'nTrainingPrograms',nTrainPrograms)
local valResult,valConfMat,valTime = testModel(allData,model,valInds,bestfscore)
if valResult.fscore > bestfscore then
bestfscore = valResult.fscore
bestResult[1] = valResult.accuracy
bestResult[2] = valResult.prec
bestResult[3] = valResult.recall
bestResult[4] = valResult.fscore
bestResult[5] = epochError/nBatches
bestResult[6] = valResult.testError
-- save the best model so far and the data split etc
if opt.saveModel then
local experimentData = {
opt = opt,
trainedModel = model:double(),
dataSplit = dataSplit,
metaData = metaData,
}
torch.save('./trainedNets/' .. opt.saveFileName .. '.th7',experimentData)
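-- model:double() above retyped the network in place for a portable save,
-- so restore the working tensor type and re-flatten: getParameters()
-- must be called again whenever the underlying storage changes.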
model:type(dtype)
parameters, gradParameters = model:getParameters()
collectgarbage()
end
end
print(e,'val ',epochError/nBatches,valResult.testError,valResult.accuracy,valResult.prec,valResult.recall,valResult.fscore)
print('testing time - val ',string.format("%7.3f",valTime),' nValPrograms',nValPrograms)
print(valConfMat)
local testResult,testConfMat,testTime = testModel(allData,model,trainInds,1)
print(e,'train ',epochError/nBatches,testResult.testError,testResult.accuracy,testResult.prec,testResult.recall,testResult.fscore)
print('testing time - train',string.format("%7.3f",testTime),' nTrainingPrograms',nTrainPrograms)
print(testConfMat)
print('--')
epochError = 0
nSamples = 0
nBatches = 0
collectgarbage()
end
end
print('Best Result ',bestResult[5],bestResult[6],bestResult[1],bestResult[2],bestResult[3],bestResult[4])
return model
end